[catboost] catboost를 위한 베이지안 하이퍼 파라미터 최적화 코드

티스토리 뷰

데이터 사이언스 & 로봇/ML 및 DL 관련 이론

[catboost] catboost를 위한 베이지안 하이퍼 파라미터 최적화 코드

sikaro 2024. 3. 22. 10:35

from catboost import CatBoostRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

# Hyperopt search space for the CatBoost hyperparameters tuned below.
# - hp.quniform(label, low, high, 1) samples whole-number values (returned
#   as floats, which is why the objective casts them with int()).
# - hp.loguniform(label, low, high) takes its bounds as natural-log
#   exponents, i.e. the sampled value is exp(uniform(low, high)):
#   learning_rate is in roughly [0.0067, 0.61] and l2_leaf_reg in roughly
#   [1.1e-7, 0.37].
space = dict(
    depth=hp.quniform('depth', 3, 16, 1),
    learning_rate=hp.loguniform('learning_rate', -5, -0.5),
    l2_leaf_reg=hp.loguniform('l2_leaf_reg', -16, -1),
    max_bin=hp.quniform('max_bin', 16, 512, 1),
    one_hot_max_size=hp.quniform('one_hot_max_size', 0, 500, 1),
    random_strength=hp.uniform('random_strength', 0, 100),
    bagging_temperature=hp.uniform('bagging_temperature', 0, 1),
    min_data_in_leaf=hp.quniform('min_data_in_leaf', 1, 200, 1),
)

# Objective function minimized by hyperopt.
def objective(params):
    """Score one hyperparameter sample with 5-fold cross-validation.

    Args:
        params: dict sampled from ``space``; must contain the eight keys
            read below. Integer-like entries are cast with ``int()``
            before being handed to CatBoost.

    Returns:
        A hyperopt result dict: ``loss`` is the mean cross-validated MSE
        (lower is better) and ``status`` is ``STATUS_OK``.

    Note:
        Reads the module-level ``X`` and ``y``, which must be defined
        before the search starts.
    """
    integer_keys = ('depth', 'max_bin', 'one_hot_max_size', 'min_data_in_leaf')
    float_keys = (
        'learning_rate',
        'l2_leaf_reg',
        'random_strength',
        'bagging_temperature',
    )

    model_kwargs = {key: int(params[key]) for key in integer_keys}
    model_kwargs.update({key: params[key] for key in float_keys})
    regressor = CatBoostRegressor(verbose=0, **model_kwargs)

    # The scorer returns negated MSE per fold; flip the sign of the mean so
    # that the value hyperopt minimizes is the plain MSE.
    fold_scores = cross_val_score(
        regressor, X, y, cv=5, scoring='neg_mean_squared_error'
    )
    mean_mse = -fold_scores.mean()
    return {'loss': mean_mse, 'status': STATUS_OK}

# Run the TPE search for 100 evaluations; `trials` keeps the history of
# every evaluated sample.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

# Print the best sample found. Integer-like parameters are reported the same
# way the objective receives them, so cast them with int() before reuse.
print('Best Parameters: ', best)

찾은 최적 하이퍼파라미터로 모델을 다시 학습하려면 다음과 같이 하면 된다. (hyperopt의 quniform으로 탐색한 depth, max_bin 등은 float로 반환되므로 int로 변환해서 넣어야 한다.)

from catboost import CatBoostRegressor

# Best hyperparameter combination (example values written out by hand).
best_params = dict(
    depth=10,
    learning_rate=0.01,
    l2_leaf_reg=3,
    max_bin=256,
    one_hot_max_size=100,
    random_strength=50,
    bagging_temperature=0.3,
    min_data_in_leaf=20,
)

# Build the model from the chosen parameters and train it.
# X_train / y_train are not defined in this snippet; they are expected to be
# provided by the surrounding code.
model = CatBoostRegressor(verbose=0, **best_params)

model.fit(X_train, y_train)

Optuna로도 같은 방식의 하이퍼파라미터 최적화가 가능하다.

import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Objective function minimized by Optuna.
def objective(trial):
    """Train CatBoost with hyperparameters sampled from ``trial``.

    Args:
        trial: the Optuna trial object used to sample each hyperparameter.

    Returns:
        The RMSE of the fitted model on the validation split (lower is
        better).

    Note:
        Reads the module-level ``x_train``, ``y_train``, ``x_val`` and
        ``y_val``, which must exist before ``study.optimize`` is called.
    """
    # Search space. suggest_float replaces suggest_uniform/suggest_loguniform,
    # which have been deprecated since Optuna 3.0; log=True samples
    # uniformly in log space between the given (plain-value) bounds.
    params = {
        'depth': trial.suggest_int('depth', 5, 16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-16, 1e-1, log=True),
        'max_bin': trial.suggest_int('max_bin', 16, 512),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 0, 500),
        'random_strength': trial.suggest_float('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 200),
    }

    # Build and train the model on the training split.
    model = CatBoostRegressor(**params, verbose=0)
    model.fit(x_train, y_train)

    # Predict on the validation split.
    preds = model.predict(x_val)

    # RMSE as the square root of the MSE. The `squared=False` argument was
    # removed from mean_squared_error in scikit-learn 1.6, so take the root
    # explicitly to stay compatible with old and new versions alike.
    rmse = mean_squared_error(y_val, preds) ** 0.5

    return rmse

# Hold out 20% of the data as the validation split used by the objective.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Minimize the objective's return value over 100 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Report the best hyperparameters found by the study.
best_params = study.best_params
print('Best Parameters:', best_params)

# Rebuild a model from the best hyperparameters and train it on the
# training split.
best_model = CatBoostRegressor(verbose=0, **best_params)
best_model.fit(x_train, y_train)

# Predict on the test data.
# NOTE(review): x_test is not defined anywhere in this snippet; presumably
# it is a separate test set loaded by the surrounding code. Confirm.
test_preds = best_model.predict(x_test)

시카로의 공부방

티스토리 뷰

[catboost] catboost를 위한 베이지안 하이퍼 파라미터 최적화 코드

티스토리툴바