[Data Competition] A Kaggle GM Trick: Tree Model Initialization
Tree Model Initialization Trick
Adjust the learning rate according to performance on the validation set to get better results;
Find a reasonably good learning rate and train with it; if that does not work out, automatically switch to another learning rate (see the sketch below);
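
For the "switch the learning rate during training" idea, LightGBM provides a reset_parameter callback that can change the learning rate at each boosting round. A minimal sketch, where the decay schedule (start at 0.05, shrink by 1% per round) is an illustrative assumption rather than something from the original kernel:

import lightgbm as lgbm

# Hypothetical schedule: the learning rate starts at 0.05 and decays
# by 1% at every boosting round.
lr_schedule = lgbm.reset_parameter(
    learning_rate=lambda round_num: 0.05 * (0.99 ** round_num)
)
# It is passed to training via the callbacks argument, e.g.:
# model = lgbm.train(params, dtrain, valid_sets=[dvalid],
#                    callbacks=[lr_schedule])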

Find a learning rate that performs well in offline validation, train to completion, and you are done!
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

##### 1. Load the data
train = pd.read_csv("...")
test = pd.read_csv("...")

##### 2. N-fold training and prediction
cont_features = [col for col in train.columns if col.startswith("cont")]
len(cont_features)  # notebook-style check of the feature count

y = train["target"]
kf = KFold(n_splits=5, shuffle=True, random_state=1)
oof = np.zeros(len(train))  # out-of-fold predictions
score_list = []             # per-fold RMSE
fold = 1
test_preds = []             # per-fold test predictions
for train_index, test_index in kf.split(train):
    X_train, X_val = train.iloc[train_index], train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    y_pred_list = []
    for seed in [1]:
        dtrain = lgbm.Dataset(X_train[cont_features], y_train)
        dvalid = lgbm.Dataset(X_val[cont_features], y_val)
        print(seed)
        params = {"objective": "regression",
                  "metric": "rmse",
                  "verbosity": -1,
                  "boosting_type": "gbdt",
                  "feature_fraction": 0.5,
                  "num_leaves": 200,
                  "lambda_l1": 2,
                  "lambda_l2": 2,
                  "learning_rate": 0.01,
                  "min_child_samples": 50,
                  "bagging_fraction": 0.7,
                  "bagging_freq": 1}
        params["seed"] = seed
        # One offline-tuned learning rate, trained until early stopping
        model = lgbm.train(params,
                           dtrain,
                           valid_sets=[dtrain, dvalid],
                           verbose_eval=100,
                           num_boost_round=100000,
                           early_stopping_rounds=100)

        y_pred_list.append(model.predict(X_val[cont_features]))
        test_preds.append(model.predict(test[cont_features]))

    oof[test_index] = np.mean(y_pred_list, axis=0)
    score = np.sqrt(mean_squared_error(y_val, oof[test_index]))
    score_list.append(score)
    print(f"RMSE Fold-{fold} : {score}")
    fold += 1
np.mean(score_list)  # mean cross-validation RMSE
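
A compatibility note: the verbose_eval and early_stopping_rounds keyword arguments used above were removed from lgbm.train in LightGBM 4.x. A sketch of the equivalent call, assuming LightGBM >= 4.0:

# On LightGBM >= 4.0, logging and early stopping are configured
# through callbacks instead of keyword arguments.
model = lgbm.train(params,
                   dtrain,
                   valid_sets=[dtrain, dvalid],
                   num_boost_round=100000,
                   callbacks=[lgbm.log_evaluation(period=100),
                              lgbm.early_stopping(stopping_rounds=100)])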
The general idea:
Train with a relatively large learning rate to get an initial model (model 1); then continue training from model 1 with a smaller learning rate to get model 2; and so on.
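
In LightGBM this is done by passing the previous booster as init_model to the next lgbm.train call, so the new low-learning-rate trees are appended to the trees already built. A minimal two-stage sketch (the variable names are illustrative; the full per-fold version follows below):

# Stage 1: coarse fit with the larger learning rate.
params["learning_rate"] = 0.01
booster_stage1 = lgbm.train(params, dtrain,
                            valid_sets=[dvalid],
                            num_boost_round=1000)

# Stage 2: refine with a smaller learning rate, continuing
# from the trees built in stage 1.
params["learning_rate"] = 0.003
booster_stage2 = lgbm.train(params, dtrain,
                            valid_sets=[dvalid],
                            num_boost_round=1000,
                            init_model=booster_stage1)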
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

##### 1. Load the data
train = pd.read_csv("...")
test = pd.read_csv("...")

##### 2. N-fold training and prediction
cont_features = [col for col in train.columns if col.startswith("cont")]
len(cont_features)  # notebook-style check of the feature count

y = train["target"]
kf = KFold(n_splits=5, shuffle=True, random_state=1)
oof = np.zeros(len(train))
score_list = []
fold = 1
test_preds = []
for train_index, test_index in kf.split(train):
    X_train, X_val = train.iloc[train_index], train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    y_pred_list = []
    for seed in [1]:
        dtrain = lgbm.Dataset(X_train[cont_features], y_train)
        dvalid = lgbm.Dataset(X_val[cont_features], y_val)
        print(seed)
        # Stage 1: train with the larger learning rate (0.01)
        params = {"objective": "regression",
                  "metric": "rmse",
                  "verbosity": -1,
                  "boosting_type": "gbdt",
                  "feature_fraction": 0.5,
                  "num_leaves": 200,
                  "lambda_l1": 2,
                  "lambda_l2": 2,
                  "learning_rate": 0.01,
                  "min_child_samples": 50,
                  "bagging_fraction": 0.7,
                  "bagging_freq": 1}
        params["seed"] = seed
        model = lgbm.train(params,
                           dtrain,
                           valid_sets=[dtrain, dvalid],
                           verbose_eval=100,
                           num_boost_round=100000,
                           early_stopping_rounds=100)

        ##### 3. Stage 2: continue training with a smaller learning rate
        dtrain = lgbm.Dataset(X_train[cont_features], y_train)
        dvalid = lgbm.Dataset(X_val[cont_features], y_val)
        params = {"objective": "regression",
                  "metric": "rmse",
                  "verbosity": -1,
                  "boosting_type": "gbdt",
                  "feature_fraction": 0.5,
                  "num_leaves": 300,
                  "lambda_l1": 2,
                  "lambda_l2": 2,
                  "learning_rate": 0.003,
                  "min_child_samples": 50,
                  "bagging_fraction": 0.7,
                  "bagging_freq": 1}
        params["seed"] = seed
        # init_model resumes boosting from the trees built in stage 1
        model = lgbm.train(params,
                           dtrain,
                           valid_sets=[dtrain, dvalid],
                           verbose_eval=100,
                           num_boost_round=1000,
                           early_stopping_rounds=100,
                           init_model=model)

        y_pred_list.append(model.predict(X_val[cont_features]))
        test_preds.append(model.predict(test[cont_features]))

    oof[test_index] = np.mean(y_pred_list, axis=0)
    score = np.sqrt(mean_squared_error(y_val, oof[test_index]))
    score_list.append(score)
    print(f"RMSE Fold-{fold} : {score}")
    fold += 1
np.mean(score_list)  # mean cross-validation RMSE
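
Note that the loop only collects the per-fold test predictions in test_preds; producing a submission still requires averaging them. A sketch, assuming a sample_submission.csv file with a target column (both names are assumptions, not part of the original kernel):

# Average the fold models' test predictions into a single submission.
submission = pd.read_csv("sample_submission.csv")  # assumed file name
submission["target"] = np.mean(test_preds, axis=0)
submission.to_csv("submission.csv", index=False)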

Reference: https://www.kaggle.com/fatihozturk/lgbm-model-initialisation-trick