Math Derivation + Pure Python Implementation of Machine Learning Algorithms, Part 15: GBDT
After a hiatus of more than half a year, this series on deriving machine learning algorithms is finally being updated again. In the previous 14 installments, we covered essentially all of the major single-model algorithms for supervised learning. Over the next 10 or so installments, the plan is to work through ensemble learning models, represented by GBDT; probabilistic graphical models, represented by the EM algorithm, CRFs, and hidden Markov models; and unsupervised learning algorithms, represented by clustering and dimensionality reduction.
GBDT (Gradient Boosting Decision Tree) is an ensemble model that builds an additive sequence of regression trees: each new tree is fit to the negative gradient of the loss function evaluated at the current model's predictions, so each boosting round takes an approximate gradient-descent step in function space. Given a training set $T = \{(x_1, y_1), (x_2, y_2), \dots, (x_N, y_N)\}$ and a differentiable loss function $L(y, f(x))$, the algorithm for regression proceeds as follows.

(1) Initialize the learner with the constant that minimizes the loss:

$$f_0(x) = \arg\min_{c} \sum_{i=1}^{N} L(y_i, c)$$

(2) For $t = 1, 2, \dots, T$:

(a) For each sample $i = 1, 2, \dots, N$, compute the negative gradient, i.e. the residual:

$$r_{ti} = -\left[ \frac{\partial L\big(y_i, f(x_i)\big)}{\partial f(x_i)} \right]_{f(x) = f_{t-1}(x)}$$

(b) Treat the residuals obtained in the previous step as the samples' new target values, and use the data $(x_i, r_{ti})$, $i = 1, 2, \dots, N$, as the training data for the next tree, yielding a new regression tree whose leaf regions are $R_{tj}$, $j = 1, 2, \dots, J$, where $J$ is the number of leaf nodes of tree $t$. For each leaf region $j = 1, 2, \dots, J$, compute the best-fit value:

$$c_{tj} = \arg\min_{c} \sum_{x_i \in R_{tj}} L\big(y_i, f_{t-1}(x_i) + c\big)$$

(c) Update the strong learner:

$$f_t(x) = f_{t-1}(x) + \sum_{j=1}^{J} c_{tj} \, I\big(x \in R_{tj}\big)$$

(3) Obtain the final learner:

$$f(x) = f_T(x) = f_0(x) + \sum_{t=1}^{T} \sum_{j=1}^{J} c_{tj} \, I\big(x \in R_{tj}\big)$$
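To make the residual step concrete: for squared-error loss, differentiating shows that the negative gradient is exactly the ordinary residual, which is why step (a) is often described as "fitting the residuals":

$$L\big(y, f(x)\big) = \frac{1}{2}\big(y - f(x)\big)^2 \quad\Longrightarrow\quad r_{ti} = -\frac{\partial L}{\partial f(x_i)}\bigg|_{f = f_{t-1}} = y_i - f_{t-1}(x_i)$$

This is precisely what the SquareLoss.gradient method in the implementation below returns (with the opposite sign).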

The implementation below is pure Python plus numpy, and it follows the algorithm above. It starts from the tree data structures: a TreeNode holds either a split (feature index and threshold, with two branches) or a leaf value.

class TreeNode():
    def __init__(self, feature_i=None, threshold=None,
                 value=None, true_branch=None, false_branch=None):
        self.feature_i = feature_i        # Index of the feature the node splits on
        self.threshold = threshold        # Threshold the feature is compared against
        self.value = value                # Prediction stored at a leaf node
        self.true_branch = true_branch    # Subtree for samples satisfying the split
        self.false_branch = false_branch  # Subtree for the remaining samples
class Tree(object):
    def __init__(self, min_samples_split=2, min_impurity=1e-7,
                 max_depth=float("inf"), loss=None):
        self.root = None  # Root node of the decision tree
        # Minimum number of samples required to justify a split
        self.min_samples_split = min_samples_split
        # Minimum impurity reduction required to justify a split
        self.min_impurity = min_impurity
        # Maximum depth to grow the tree to
        self.max_depth = max_depth
        # Function that measures split quality
        # (classification => information gain / Gini, regression => variance reduction)
        self._impurity_calculation = None
        # Function that determines the prediction y at a leaf
        # (classification tree: majority class, regression tree: mean of the values)
        self._leaf_value_calculation = None
        # Whether y is one-hot encoded (multi-dim) or not (one-dim)
        self.one_dim = None
        # Loss function, used when the tree is part of a gradient boosting model
        self.loss = loss

    def fit(self, X, y, loss=None):
        """Build the decision tree."""
        pass

    def _build_tree(self, X, y, current_depth=0):
        """Recursively build the decision tree, splitting X and the corresponding y."""
        pass

    def predict_value(self, x, tree=None):
        """Recursively search down the tree and predict the data sample
        from the value of the leaf we end up at."""
        pass

    def predict(self, X):
        """Predict samples one by one and return the set of labels."""
        pass

    def print_tree(self, tree=None, indent=" "):
        pass
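The class above is shown only as a skeleton; the full implementation is omitted in this post. As one illustration of how the pieces fit together, here is a minimal sketch of what the recursive predict_value method might look like, assuming numeric features and the TreeNode conventions above (true_branch holds samples with feature >= threshold). It plugs into the skeleton; treat it as an assumption about the omitted code, not the author's exact implementation:

    def predict_value(self, x, tree=None):
        # Start from the root on the first call
        if tree is None:
            tree = self.root
        # A leaf stores a prediction: return it
        if tree.value is not None:
            return tree.value
        # Otherwise compare the sample's feature against the split threshold
        feature_value = x[tree.feature_i]
        branch = tree.false_branch
        if feature_value >= tree.threshold:
            branch = tree.true_branch
        # Recurse into the chosen subtree
        return self.predict_value(x, branch)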
class RegressionTree(Tree):
    # Measure split quality by variance reduction
    def _calculate_variance_reduction(self, y, y1, y2):
        # calculate_variance is a helper from the author's utilities:
        # it returns the variance of each column of its argument
        var_tot = calculate_variance(y)
        var_1 = calculate_variance(y1)
        var_2 = calculate_variance(y2)
        frac_1 = len(y1) / len(y)
        frac_2 = len(y2) / len(y)
        # Calculate the variance reduction
        variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)
        return sum(variance_reduction)

    # Use the mean of y as the leaf value
    def _mean_of_y(self, y):
        value = np.mean(y, axis=0)  # y is expected to be 2-D: (n_samples, n_outputs)
        return value if len(value) > 1 else value[0]

    # Fit the regression tree
    def fit(self, X, y):
        self._impurity_calculation = self._calculate_variance_reduction
        self._leaf_value_calculation = self._mean_of_y
        super(RegressionTree, self).fit(X, y)
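The helper calculate_variance is not shown in this post. A minimal numpy version consistent with how it is called, plus a quick check that a good split yields a large variance reduction, might look like this (the helper body and shapes are assumptions, not part of the original code):

import numpy as np

def calculate_variance(X):
    """Variance of each column of X (assumed behavior of the helper)."""
    mean = np.ones(np.shape(X)) * X.mean(0)
    n_samples = np.shape(X)[0]
    return (1 / n_samples) * np.diag((X - mean).T.dot(X - mean))

# Toy targets: two well-separated groups
y = np.array([[1.0], [1.1], [0.9], [5.0], [5.1], [4.9]])
y1, y2 = y[:3], y[3:]  # A "perfect" split between the groups
var_tot = calculate_variance(y)
reduction = var_tot - (len(y1) / len(y) * calculate_variance(y1)
                       + len(y2) / len(y) * calculate_variance(y2))
print(sum(reduction))  # ~4.0: the split removes nearly all of the variance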
class Loss(object):
    def loss(self, y_true, y_pred):
        raise NotImplementedError()

    def gradient(self, y, y_pred):
        raise NotImplementedError()

    def acc(self, y, y_pred):
        return 0

class SquareLoss(Loss):
    def __init__(self):
        pass

    def loss(self, y, y_pred):
        return 0.5 * np.power((y - y_pred), 2)

    def gradient(self, y, y_pred):
        # Gradient of the squared loss w.r.t. the prediction: the negative residual
        return -(y - y_pred)
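A quick numeric check that this matches the derivation above, i.e. that the negative gradient of SquareLoss is the ordinary residual:

import numpy as np

loss = SquareLoss()
y = np.array([3.0, -1.0, 2.0])
y_pred = np.array([2.5, 0.0, 2.0])
print(-loss.gradient(y, y_pred))  # [ 0.5 -1.   0. ] == y - y_pred, the residuals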
class GBDT(object):
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression):
        # Basic hyperparameters
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        self.regression = regression
        self.loss = SquareLoss()
        if not self.regression:
            # Classification can also use regression trees,
            # learning the class probabilities via the residuals
            self.loss = SoftMaxLoss()  # Softmax/cross-entropy loss, defined elsewhere
        self.estimators = []
        for i in range(self.n_estimators):
            self.estimators.append(RegressionTree(min_samples_split=self.min_samples_split,
                                                  min_impurity=self.min_impurity,
                                                  max_depth=self.max_depth))

    # Fitting
    def fit(self, X, y):
        # Let the first tree fit the raw targets
        self.estimators[0].fit(X, y)
        y_pred = self.estimators[0].predict(X)
        for i in range(1, self.n_estimators):
            # Each subsequent tree fits the gradient of the loss
            # at the current prediction
            gradient = self.loss.gradient(y, y_pred)
            self.estimators[i].fit(X, gradient)
            # Subtracting the predicted gradient moves y_pred a
            # learning-rate-sized step toward the targets
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))

    # Prediction
    def predict(self, X):
        y_pred = self.estimators[0].predict(X)
        for i in range(1, self.n_estimators):
            y_pred -= np.multiply(self.learning_rate, self.estimators[i].predict(X))
        if not self.regression:
            # Turn the scores into a probability distribution (softmax)
            y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
            # Set the label to the value that maximizes probability
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
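Since the RegressionTree here is only sketched, the boosting loop itself can be checked end to end by swapping in sklearn's DecisionTreeRegressor as the base learner. This is a minimal, self-contained illustration of the same fit logic, not the author's implementation:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(200, 1))
y = np.sin(X).ravel() + 0.1 * rng.randn(200)

learning_rate, n_estimators = 0.5, 50
trees = []

# The first tree fits the raw targets, as in GBDT.fit above
tree = DecisionTreeRegressor(max_depth=3).fit(X, y)
trees.append(tree)
y_pred = tree.predict(X)

for _ in range(n_estimators - 1):
    # Fit the next tree to the squared-loss gradient -(y - y_pred) ...
    gradient = -(y - y_pred)
    tree = DecisionTreeRegressor(max_depth=3).fit(X, gradient)
    trees.append(tree)
    # ... and step the prediction against that gradient
    y_pred -= learning_rate * tree.predict(X)

print("train MSE:", np.mean((y - y_pred) ** 2))  # Shrinks toward the noise level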
# Regression wrapper
class GBDTRegressor(GBDT):
    def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
                 min_var_red=1e-7, max_depth=4, debug=False):
        super(GBDTRegressor, self).__init__(n_estimators=n_estimators,
                                            learning_rate=learning_rate,
                                            min_samples_split=min_samples_split,
                                            min_impurity=min_var_red,
                                            max_depth=max_depth,
                                            regression=True)

# Classification wrapper
class GBDTClassifier(GBDT):
    def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
                 min_info_gain=1e-7, max_depth=2, debug=False):
        super(GBDTClassifier, self).__init__(n_estimators=n_estimators,
                                             learning_rate=learning_rate,
                                             min_samples_split=min_samples_split,
                                             min_impurity=min_info_gain,
                                             max_depth=max_depth,
                                             regression=False)

    def fit(self, X, y):
        # One-hot encode the labels so each class gets its own score column
        y = to_categorical(y)
        super(GBDTClassifier, self).fit(X, y)
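to_categorical (used by GBDTClassifier.fit) and shuffle_data (used in the test below) come from the author's utility module and are not shown in this post. Minimal versions consistent with how they are called might look like the following; treat these as assumptions about the helpers' behavior:

import numpy as np

def to_categorical(x, n_classes=None):
    """One-hot encode an integer label vector."""
    if n_classes is None:
        n_classes = np.amax(x) + 1
    one_hot = np.zeros((x.shape[0], n_classes))
    one_hot[np.arange(x.shape[0]), x] = 1
    return one_hot

def shuffle_data(X, y, seed=None):
    """Shuffle X and y with the same random permutation."""
    rng = np.random.RandomState(seed)
    idx = rng.permutation(X.shape[0])
    return X[idx], y[idx]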
Finally, a quick test of the regressor on the Boston housing data:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

boston = datasets.load_boston()
X, y = shuffle_data(boston.data, boston.target, seed=13)
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

model = GBDTRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Color map
cmap = plt.get_cmap('viridis')
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot the results
m1 = plt.scatter(range(X_test.shape[0]), y_test, color=cmap(0.5), s=10)
m2 = plt.scatter(range(X_test.shape[0]), y_pred, color='black', s=10)
plt.suptitle("Regression Tree")
plt.title("MSE: %.2f" % mse, fontsize=10)
plt.xlabel('sample')
plt.ylabel('house price')
plt.legend((m1, m2), ("Test data", "Prediction"), loc='lower right')
plt.show()
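Note: load_boston was removed from scikit-learn in version 1.2, so on a recent install the same script can be run by substituting another regression dataset, for example:

from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
X, y = shuffle_data(housing.data, housing.target, seed=13)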
