工業(yè)蒸汽預(yù)測(cè)案例(天池大賽)
這是天池大賽其中一個(gè)項(xiàng)目數(shù)據(jù),在拿這個(gè)項(xiàng)目練手幾個(gè)模型,推導(dǎo)之后,留下了大部分內(nèi)容。下面是具體案例。
競(jìng)賽資料相關(guān):
基本介紹:根據(jù)經(jīng)脫敏后的鍋爐傳感器采集的數(shù)據(jù)(采集頻率是分鐘級(jí)別),根據(jù)鍋爐的工況,預(yù)測(cè)產(chǎn)生的蒸汽量。
競(jìng)賽地址:https://tianchi.aliyun.com/competition/entrance/231693/introduction?spm=5176.12281973.1005.9.56bb1f54ICNwSI
案例代碼
導(dǎo)入模塊
import?warnings
warnings.filterwarnings("ignore")
import?matplotlib.pyplot?as?plt
import?seaborn?as?sns
import?pandas?as?pd
import?numpy?as?np
from?sklearn.model_selection?import?train_test_split
from?sklearn.metrics?import?mean_squared_error
from?sklearn.neighbors?import?KNeighborsRegressor
from?sklearn.ensemble?import?RandomForestRegressor
import?lightgbm?as?lgb??
from??sklearn.tree?import?DecisionTreeRegressor
from?sklearn.svm?import?SVR
導(dǎo)入數(shù)據(jù)集
train_data_file?=?"./zhengqi_train.txt"
test_data_file?=?"./zhengqi_test.txt"
data_train?=?pd.read_csv(train_data_file,sep='\t',encoding='utf-8')
data_test?=?pd.read_csv(test_data_file,sep='\t',encoding='utf-8')
數(shù)據(jù)探索
data_train.info()
'pandas.core.frame.DataFrame'>
RangeIndex:?2888?entries,?0?to?2887
Data?columns?(total?39?columns):
?#???Column??Non-Null?Count??Dtype??
---??------??--------------??-----??
?0???V0??????2888?non-null???float64
?1???V1??????2888?non-null???float64
?2???V2??????2888?non-null???float64
?3???V3??????2888?non-null???float64
?4???V4??????2888?non-null???float64
?5???V5??????2888?non-null???float64
?6???V6??????2888?non-null???float64
?7???V7??????2888?non-null???float64
?8???V8??????2888?non-null???float64
?9???V9??????2888?non-null???float64
?10??V10?????2888?non-null???float64
?11??V11?????2888?non-null???float64
?12??V12?????2888?non-null???float64
?13??V13?????2888?non-null???float64
?14??V14?????2888?non-null???float64
?15??V15?????2888?non-null???float64
?16??V16?????2888?non-null???float64
?17??V17?????2888?non-null???float64
?18??V18?????2888?non-null???float64
?19??V19?????2888?non-null???float64
?20??V20?????2888?non-null???float64
?21??V21?????2888?non-null???float64
?22??V22?????2888?non-null???float64
?23??V23?????2888?non-null???float64
?24??V24?????2888?non-null???float64
?25??V25?????2888?non-null???float64
?26??V26?????2888?non-null???float64
?27??V27?????2888?non-null???float64
?28??V28?????2888?non-null???float64
?29??V29?????2888?non-null???float64
?30??V30?????2888?non-null???float64
?31??V31?????2888?non-null???float64
?32??V32?????2888?non-null???float64
?33??V33?????2888?non-null???float64
?34??V34?????2888?non-null???float64
?35??V35?????2888?non-null???float64
?36??V36?????2888?non-null???float64
?37??V37?????2888?non-null???float64
?38??target??2888?non-null???float64
dtypes:?float64(39)
memory?usage:?880.1?KB
data_test.info()
'pandas.core.frame.DataFrame'>
RangeIndex:?1925?entries,?0?to?1924
Data?columns?(total?38?columns):
?#???Column??Non-Null?Count??Dtype??
---??------??--------------??-----??
?0???V0??????1925?non-null???float64
?1???V1??????1925?non-null???float64
?2???V2??????1925?non-null???float64
?3???V3??????1925?non-null???float64
?4???V4??????1925?non-null???float64
?5???V5??????1925?non-null???float64
?6???V6??????1925?non-null???float64
?7???V7??????1925?non-null???float64
?8???V8??????1925?non-null???float64
?9???V9??????1925?non-null???float64
?10??V10?????1925?non-null???float64
?11??V11?????1925?non-null???float64
?12??V12?????1925?non-null???float64
?13??V13?????1925?non-null???float64
?14??V14?????1925?non-null???float64
?15??V15?????1925?non-null???float64
?16??V16?????1925?non-null???float64
?17??V17?????1925?non-null???float64
?18??V18?????1925?non-null???float64
?19??V19?????1925?non-null???float64
?20??V20?????1925?non-null???float64
?21??V21?????1925?non-null???float64
?22??V22?????1925?non-null???float64
?23??V23?????1925?non-null???float64
?24??V24?????1925?non-null???float64
?25??V25?????1925?non-null???float64
?26??V26?????1925?non-null???float64
?27??V27?????1925?non-null???float64
?28??V28?????1925?non-null???float64
?29??V29?????1925?non-null???float64
?30??V30?????1925?non-null???float64
?31??V31?????1925?non-null???float64
?32??V32?????1925?non-null???float64
?33??V33?????1925?non-null???float64
?34??V34?????1925?non-null???float64
?35??V35?????1925?non-null???float64
?36??V36?????1925?non-null???float64
?37??V37?????1925?non-null???float64
dtypes:?float64(38)
memory?usage:?571.6?KB
可以看出訓(xùn)練集有和測(cè)試集有38特征,且均無空值。
#?核密度估計(jì)(Kde)是在概率論中用來估計(jì)未知的密度函數(shù),屬于非參數(shù)檢驗(yàn)方法之一。通過核密度估計(jì)圖可以比較直觀的看出
#?數(shù)據(jù)樣本本身的分布特征。
dist_cols?=?6
dist_rows?=?len(data_test.columns)
plt.figure(figsize=(4*dist_cols,4*dist_rows))
i=1
for?col?in?data_test.columns:
????ax=plt.subplot(dist_rows,dist_cols,i)
????ax?=?sns.kdeplot(data_train[col],?color="Red",?shade=True)
????ax?=?sns.kdeplot(data_test[col],?color="Blue",?shade=True)
????ax.set_xlabel(col)
????ax.set_ylabel("Frequency")
????ax?=?ax.legend(["train","test"])
????i+=1
plt.show()
由上圖的數(shù)據(jù)分布可以看到特征'V5','V9','V11','V17','V22','V23','V28' 訓(xùn)練集數(shù)據(jù)與測(cè)試集數(shù)據(jù)分布不一致,會(huì)導(dǎo)致模型泛化能力差,采用刪除此類特征方法。
data_train.drop(["V5","V9","V11","V17","V22","V23","V28"],axis=1,inplace=True)
data_test.drop(["V5","V9","V11","V17","V22","V23","V28"],axis=1,inplace=True)
相關(guān)性分析
#?找出相關(guān)程度
plt.figure(figsize=(20,?16))??#?指定繪圖對(duì)象寬度和高度
colnm?=?data_train.columns.tolist()??#?列表頭
mcorr?=?data_train[colnm].corr(method="spearman")??#?相關(guān)系數(shù)矩陣,即給出了任意兩個(gè)變量之間的相關(guān)系數(shù)
mask?=?np.zeros_like(mcorr,?dtype=np.bool)??#?構(gòu)造與mcorr同維數(shù)矩陣?為bool型
mask[np.triu_indices_from(mask)]?=?True??#?角分線右側(cè)為True
cmap?=?sns.diverging_palette(220,?10,?as_cmap=True)??#?返回matplotlib?colormap對(duì)象
g?=?sns.heatmap(mcorr,?mask=mask,?cmap=cmap,?square=True,?annot=True,?fmt='0.2f')??#?熱力圖(看兩兩相似度)
plt.show()

#?查找出特征變量和target變量相關(guān)系數(shù)大于0.3的特征變量
threshold?=?0.3
corrmat?=?data_train.corr()
data_train_corr?=?corrmat.index[abs(corrmat["target"])>threshold]
plt.figure(figsize=(10,10))
g?=?sns.heatmap(data_train[data_train_corr].corr(),annot=True,cmap="RdYlGn")
data_train_corr
Index(['V0',?'V1',?'V2',?'V3',?'V4',?'V6',?'V8',?'V10',?'V12',?'V16',?'V20',
???????'V27',?'V31',?'V36',?'V37',?'target'],
??????dtype='object')

建模
from?sklearn.model_selection?import?train_test_split
x?=?data_train.iloc[:,data_train.columns?!=?'target']
y?=?data_train.iloc[:,data_train.columns?==?'target']
Xtrain,?Xtest,?Ytrain,?Ytest?=?train_test_split(x,y,test_size=0.3)
#?恢復(fù)數(shù)據(jù)索引
for?i?in?[Xtrain?,Xtest,Ytrain,Ytest]:
????i.index?=?range(i.shape[0])
#?不同模型的名稱列表
model_names?=?['RandomForest',?'KNeighborsRegressor',?"DecisionTree",'SVR',"lgb"]?
model_RandomForest?=?RandomForestRegressor(n_estimators=200)?
model_KNN?=?KNeighborsRegressor(n_neighbors=8)?
model_tree?=?DecisionTreeRegressor(random_state=0)
model_svr?=?SVR(gamma='scale')??
model_lgb?=?lgb.LGBMRegressor(
????????learning_rate=0.01,
????????max_depth=-1,
????????n_estimators=5000,
????????boosting_type='gbdt',
????????random_state=2019,
????????objective='regression',
????)
#?不同回歸模型對(duì)象的集合
model_list?=?[model_RandomForest,model_KNN,model_tree,?model_svr,model_lgb]
#?各個(gè)回歸模型預(yù)測(cè)的y值列表
pre_y_list?=?[model.fit(Xtrain?,Ytrain).predict(Xtest)?for?model?in?model_list]
from?sklearn.metrics?import?explained_variance_score,?mean_absolute_error,?mean_squared_error,?r2_score?????????
#?總樣本量,總特征數(shù)
n_samples,?n_features?=?x.shape??
#?回歸評(píng)估指標(biāo)對(duì)象集
model_metrics_functions?=?[explained_variance_score,?mean_absolute_error,?mean_squared_error,r2_score]??
#?回歸評(píng)估指標(biāo)列表
model_metrics_list?=?[[m(Ytest,?pre_y_list[i])?for?m?in?model_metrics_functions]?for?i?in?range(len(model_list))]??
#?建立回歸指標(biāo)的數(shù)據(jù)dataframe
regresstion_score?=?pd.DataFrame(model_metrics_list,?index=model_names,
???????????????????columns=['explained_variance',?'mae',?'mse',?'r2'])??
#?打印輸出樣本量和特征數(shù)量
print('all?samples:?%d?\t?features:?%d'?%?(n_samples,?n_features),'\n','-'*60)?
#?打印輸出標(biāo)題
print('\n','regression?metrics:','\n','-'*60)??
#?打印輸出回歸指標(biāo)的數(shù)據(jù)框
print(regresstion_score)
all?samples:?2888???features:?31?
?------------------------------------------------------------
?regression?metrics:?
?------------------------------------------------------------
?????????????????????explained_variance???????mae???????mse????????r2
RandomForest???????????????????0.882672??0.246085??0.117992??0.882585
KNeighborsRegressor????????????0.827096??0.315223??0.180750??0.820134
DecisionTree???????????????????0.753935??0.361561??0.247357??0.753851
SVR????????????????????????????0.862009??0.253634??0.138674??0.862003
lgb????????????????????????????0.896023??0.228566??0.104491??0.896020
數(shù)據(jù)集直接在天池競(jìng)賽地址獲取就可以。
這份代碼已經(jīng)傳到云盤。關(guān)注公眾號(hào),在對(duì)話框回復(fù)“1128”獲取代碼。
另外,天池競(jìng)賽最近出了一本書,講了一些實(shí)際案例,感興趣可以買實(shí)體書看看(目前這本書不打算出電子版)。
評(píng)論
圖片
表情
