<kbd id="afajh"><form id="afajh"></form></kbd>
<strong id="afajh"><dl id="afajh"></dl></strong>
    <del id="afajh"><form id="afajh"></form></del>
        1. <th id="afajh"><progress id="afajh"></progress></th>
          <b id="afajh"><abbr id="afajh"></abbr></b>
          <th id="afajh"><progress id="afajh"></progress></th>

          在PySpark上使用XGBoost

          共 4463字,需瀏覽 9分鐘

           ·

          2021-04-26 09:25


          我這里提供一個pyspark的版本，參考了大家公開的版本。同時因為官網沒有查看特徵重要性的方法，所以自己寫了一個方法。本方法沒有保存模型，相信大家應該會。


          # Imports and Spark session setup for the XGBoost-on-PySpark demo.
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType, DoubleType, StringType, IntegerType
from pyspark.ml import Pipeline, PipelineModel
from sparkxgb import XGBoostClassifier, XGBoostRegressor  # fixed typo: was `xparkxgb`
import logging
from datetime import date, timedelta  # fixed typo: was `timedalta`
from pyspark.ml.feature import (
    StringIndexer,
    OneHotEncoder,
    VectorAssembler,
    MinMaxScaler,  # fixed typo: was `MinAMaxScaler`
    IndexToString,
)

conf = SparkConf() \
    .setExecutorEnv('', '123')  # NOTE(review): empty env-var name looks wrong — confirm intended key
spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .appName('pyspark demo') \
    .getOrCreate()  # fixed: missing line-continuation left .getOrCreate() dangling
sc = spark.sparkContext


          ?? 拉取數(shù)據(jù)

          # Pull one day of data, select feature/label columns, clean, and cast dtypes.
df = spark.sql("select *  from test_table where datadate='20200101'")
# Drop a column not used downstream.
df = df.drop("column2")
# Feature lists — num_*: numeric features, cat_*: categorical features.
num_features = ["num1", "num2"]
cat_features = ["cat1", "cat2"]
label_columns = ["is_true_flag"]
# fixed: original indexed with a tuple (df[num_features,cat_features+label_columns]);
# column selection needs a single concatenated list.
df = df[num_features + cat_features + label_columns]
df = df.dropna()
df = df.na.replace('', 'NA')
df = df.fillna(0)
# Cast categoricals to string, numerics to double, label to int.
for col in cat_features:
    df = df.withColumn(col, df[col].cast(StringType()))
for col in num_features:  # fixed: this loop was wrongly nested inside the previous one
    df = df.withColumn(col, df[col].cast(DoubleType()))
# fixed typo: source column was written `ist_true_flag`
df = df.withColumn('is_true_flag', df['is_true_flag'].cast(IntegerType()))


          ?? 轉(zhuǎn)onehot

          # One-hot encode categoricals, index the label, assemble the feature vector,
# then run the whole thing as a single Pipeline and split train/test.
stages = []
for col in cat_features:
    # String category -> integer index.
    string_index = StringIndexer(inputCol=col, outputCol=col + 'Index')
    # Integer index -> one-hot vector.
    encoder = OneHotEncoder(inputCol=string_index.getOutputCol(), outputCol=col + "_one_hot")
    stages += [string_index, encoder]

# Convert the label column to an indexed `label` column.
label_string_index = StringIndexer(inputCol='is_true_flag', outputCol='label')
stages += [label_string_index]

# Categorical one-hot columns + raw numeric columns -> single `features` vector.
assembler_cols = [c + "_one_hot" for c in cat_features] + num_features
assembler = VectorAssembler(inputCols=assembler_cols, outputCol="features")
stages += [assembler]

# Fit and apply all transformations via a Pipeline.
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)
train, test = df.randomSplit([0.7, 0.3], seed=2021)
print(train.count())
print(test.count())


          ?? 創(chuàng)建模型

          # Create and fit the XGBoost classifier, then score the held-out split.
xgb = XGBoostClassifier(
    featuresCol='features',
    labelCol='label',
    predictionCol='predict_val',
    missing=0.0,
    numRound=50,
    numWorkers=10,
)
# fixed: the random split above bound `train`/`test`, not trainData/testData
preModel = xgb.fit(train)
out1 = preModel.transform(test)


          ?? 查看訓(xùn)練效果

          # Training evaluation: AUC-PR / AUC-ROC plus confusion-matrix derived metrics.
import pyspark.mllib.evaluation as ev  # fixed typo (`eveluation`) and bound as `ev` as used below
lr_results = out1.select(['predict_val', 'label']).rdd.map(lambda row: (row[0], row[1] * 1.0))
lr_ev = ev.BinaryClassificationMetrics(lr_results)
print(":Area under PR:{}".format(lr_ev.areaUnderPR))
print(":Area under ROC:{}".format(lr_ev.areaUnderROC))
# fixed: original referenced undefined `out` and assigned `fn` twice while `fp`
# was never assigned (NameError at the accuracy line, and wrong metrics).
tp = out1[(out1.label == 1) & (out1.predict_val == 1)].count()  # true positives
tn = out1[(out1.label == 0) & (out1.predict_val == 0)].count()  # true negatives
fp = out1[(out1.label == 0) & (out1.predict_val == 1)].count()  # false positives
fn = out1[(out1.label == 1) & (out1.predict_val == 0)].count()  # false negatives
print('accuracy is : %f' % ((tp + tn) / (tp + tn + fp + fn)))  # accuracy
print('recall is : %f' % (tp / (tp + fn)))                      # recall
print('precision is : %f' % (tp / (tp + fp)))                   # precision


          ?? 特征解析

          # Feature-importance analysis: map the booster's gain scores (keyed f0, f1, ...)
# back to human-readable feature names taken from the assembled vector's metadata.
import pandas as pd  # fixed: pandas was used but never imported anywhere in the script

temp = df.schema["features"].metadata["ml_attr"]["attrs"]
# Collect (vector index, feature name) pairs for both numeric and one-hot (binary) slots.
# fixed: DataFrame.append is deprecated/removed — build from a list of dicts instead.
rows = []
for group in ("numeric", "binary"):
    for attr in temp.get(group, []):
        rows.append({"idx": attr["idx"], "name": attr["name"]})
df_importance = pd.DataFrame(rows, columns=["idx", "name"])

# Gain-based importance from the native booster, e.g. "Map(f3 -> 12.5, ...)".
feature_score_map = preModel.nativeBooster.getScore("", "gain")
file_path = "C://Users//Administrator//Desktop//importance.csv"
# fixed: use context managers instead of leaking file handles.
with open(file_path, "w+") as out_file:
    print(feature_score_map, file=out_file)
with open(file_path) as in_file:
    line = in_file.readline()
# Crude parse of the Scala Map repr into "idx,weight" CSV lines
# (stripping the 'f' prefix leaves the numeric vector index).
data = line.replace(',', '\n').replace('->', ',').replace('Map(', '').replace(')', '').replace('f', '')
with open(file_path, "w+") as out_file:
    print(data, file=out_file)

df_temp = pd.read_csv(file_path, header=None, names=["feature", "weight"])
# fixed: df_importance has no 'feature' column — join its vector index `idx`
# against the stripped f<idx> key parsed into `feature`.
df_importance = df_importance.merge(df_temp, left_on="idx", right_on="feature")
# fixed: the importance column is named 'weight'; 'feature_importance' never existed.
df_importance.sort_values(by=['weight'], ascending=False, inplace=True)
df_importance


          瀏覽 55
          點(diǎn)贊
          評(píng)論
          收藏
          分享

          手機(jī)掃一掃分享

          分享
          舉報(bào)
          評(píng)論
          圖片
          表情
          推薦
          點(diǎn)贊
          評(píng)論
          收藏
          分享

          手機(jī)掃一掃分享

          分享
          舉報(bào)
          <kbd id="afajh"><form id="afajh"></form></kbd>
          <strong id="afajh"><dl id="afajh"></dl></strong>
            <del id="afajh"><form id="afajh"></form></del>
                1. <th id="afajh"><progress id="afajh"></progress></th>
                  <b id="afajh"><abbr id="afajh"></abbr></b>
                  <th id="afajh"><progress id="afajh"></progress></th>
                  一级二级黄色视屏 | 特级特黄A片一级一片 | 黄色片勉费视频网站 | 日韩无码第一页 | 日本a在线 |