機器學習基礎:缺失值的處理技巧(附Python代碼)
1、缺失查看
import pandas as pd

# Count the missing values of every column; reset_index() turns the result
# into a DataFrame whose count column (named 0) is renamed to 'missNum'.
missing = data.isnull().sum().reset_index().rename(columns={0: 'missNum'})
# Missing rate = number of missing values / number of rows.
missing['missRate'] = missing['missNum'] / data.shape[0]
# Keep only the variables that actually have missing values, sorted by
# missing rate in descending order. miss_analy is the resulting DataFrame
# describing the missingness of each variable.
miss_analy = missing[missing.missRate > 0].sort_values(by='missRate', ascending=False)
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl

# Bar chart of the missing rate of every variable listed in miss_analy.
fig = plt.figure(figsize=(18, 6))
positions = np.arange(miss_analy.shape[0])
miss_rates = list(miss_analy.missRate.values)
plt.bar(positions, miss_rates, align='center',
        color=['red', 'green', 'yellow', 'steelblue'])
plt.title('Histogram of missing value of variables')
plt.xlabel('variables names')
plt.ylabel('missing rate')
# Label the x axis with the variable names and rotate the labels by 90 degrees.
plt.xticks(positions, list(miss_analy['index']))
pl.xticks(rotation=90)
# Annotate every bar with its missing rate formatted as a percentage.
for x, y in enumerate(miss_rates):
    plt.text(x, y + 0.12, '{:.2%}'.format(y), ha='center', rotation=90)
# Upper limit above 1.0 leaves room for the annotations drawn above the bars.
plt.ylim([0, 1.2])
plt.show()
2、缺失處理
方式1:刪除
# Signature, for reference:
#   df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

# 1. Drop the 'age' column.
df.drop('age', axis=1, inplace=True)
# 2. Drop every row that contains a missing value. Without inplace=True this
#    returns a new DataFrame and leaves df itself unchanged.
df.dropna()
# 3. Drop the rows that have a missing value in any of the listed columns.
df.dropna(axis=0, subset=['a', 'b'], inplace=True)
# Drop the variables (columns) whose missing rate is above 80%: thresh is the
# minimum number of non-missing values a column must have to be kept, so a
# column needs at least 20% of its rows populated.
data = data.dropna(thresh=len(data) * 0.2, axis=1)
方式2:常量填充
# The three statements below are alternative strategies for the same column;
# apply only the one you need.

# Fill with the mean.
data['col'] = data['col'].fillna(data['col'].mean())
# Fill with the median.
data['col'] = data['col'].fillna(data['col'].median())
# Fill with the mode. Series.mode() returns the most frequent value(s) in
# sorted order, so the first entry is taken when there is a tie.
data['col'] = data['col'].fillna(data['col'].mode().iloc[0])
from sklearn.impute import SimpleImputer

# Replace the missing entries of every column with that column's mean.
# SimpleImputer is the replacement for the removed
# sklearn.preprocessing.Imputer; it imputes column-wise and treats NaN as the
# missing-value marker by default.
imr = SimpleImputer(strategy='mean')
imputed_data = pd.DataFrame(imr.fit_transform(df.values), columns=df.columns)
imputed_data
方式3:插值填充
# interpolate() fills a gap from the values around it; with the default linear
# method an isolated missing value becomes the mean of its two neighbours.
# Missing values before the first valid entry have nothing to interpolate from
# and stay missing.
df['a'] = df['a'].interpolate()
# Forward fill: replace each missing value with the previous value. A missing
# value in the first row has nothing before it and therefore stays missing.
# (Returns a new DataFrame.)
df.ffill()
# Backward fill: replace each missing value with the next value. A missing
# value in the last row has nothing after it and therefore stays missing.
# (Returns a new DataFrame.)
df.bfill()
方式4:KNN填充
from fancyimpute import KNN

# Complete the missing entries with fancyimpute's KNN imputer using k=3.
fill_knn = KNN(k=3).fit_transform(data)
# Rebuild the DataFrame with the original row and column labels;
# pd.DataFrame(fill_knn) alone would fall back to default integer labels
# (presumably fit_transform returns a plain NumPy array - verify).
data = pd.DataFrame(fill_knn, index=data.index, columns=data.columns)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor


def knn_filled_func(x_train, y_train, test, k=3, dispersed=True):
    """Predict the missing values of a target column with k-nearest neighbours.

    Args:
        x_train: Rows whose target value is present, without the target column.
        y_train: The target column of those rows (contains no missing values).
        test: Rows whose target value is missing, without the target column.
            Must expose an ``index`` attribute.
        k: Number of neighbours used by the model.
        dispersed: If True the target is treated as discrete and a classifier
            is used; otherwise a regressor is used.

    Returns:
        A tuple ``(test.index, predictions)`` so the caller can write the
        predicted values back to the matching rows.
    """
    if dispersed:
        knn = KNeighborsClassifier(n_neighbors=k, weights="distance")
    else:
        knn = KNeighborsRegressor(n_neighbors=k, weights="distance")
    knn.fit(x_train, y_train)
    return test.index, knn.predict(test)
方式5:隨機森林填充
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


# NOTE(review): this function reuses the name of the KNN helper; if both are
# defined in the same module this definition shadows the KNN one. Consider
# renaming it (e.g. rf_filled_func) together with its callers.
def knn_filled_func(x_train, y_train, test, k=3, dispersed=True):
    """Predict the missing values of a target column with a random forest.

    Args:
        x_train: Rows whose target value is present, without the target column.
        y_train: The target column of those rows (contains no missing values).
        test: Rows whose target value is missing, without the target column.
            Must expose an ``index`` attribute.
        k: Unused; kept only so the signature stays compatible with the KNN
            variant.
        dispersed: If True the target is treated as discrete and a classifier
            is used; otherwise a regressor is used.

    Returns:
        A tuple ``(test.index, predictions)`` so the caller can write the
        predicted values back to the matching rows.
    """
    # A discrete target needs the classifier; the original had the two
    # branches swapped.
    if dispersed:
        rf = RandomForestClassifier()
    else:
        rf = RandomForestRegressor()
    rf.fit(x_train, y_train)
    return test.index, rf.predict(test)
3、缺失衍生
4、總結
評論
圖片
表情

