<kbd id="afajh"><form id="afajh"></form></kbd>
<strong id="afajh"><dl id="afajh"></dl></strong>
    <del id="afajh"><form id="afajh"></form></del>
        1. <th id="afajh"><progress id="afajh"></progress></th>
          <b id="afajh"><abbr id="afajh"></abbr></b>
          <th id="afajh"><progress id="afajh"></progress></th>

          在PySpark上使用XGBoost

          共 4463字,需瀏覽 9分鐘

           ·

          2021-04-26 09:25


          我這里提供一個pyspark的版本，參考了大家公開的版本。同時因為官網沒有查看特徵重要性的方法，所以自己寫了一個方法。本方法沒有保存模型，相信大家應該會。


          # Imports and Spark session setup for the XGBoost-on-PySpark demo.
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType, DoubleType, StringType, IntegerType
from pyspark.ml import Pipeline, PipelineModel
from sparkxgb import XGBoostClassifier, XGBoostRegressor  # fixed typo: was `xparkxgb`
import logging
from datetime import date, timedelta  # fixed typo: was `timedalta`
from pyspark.ml.feature import (
    StringIndexer,
    OneHotEncoder,
    VectorAssembler,
    MinMaxScaler,  # fixed typo: was `MinAMaxScaler`
    IndexToString,
)

conf = SparkConf() \
    .setExecutorEnv('', '123')  # NOTE(review): empty env-var name looks wrong — confirm intended key
spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .appName('pyspark demo') \
    .getOrCreate()  # fixed: missing line-continuation left .getOrCreate() dangling
sc = spark.sparkContext


          ?? 拉取數(shù)據(jù)

          # Pull one day of data, select feature/label columns, clean, and cast dtypes.
df = spark.sql("select *  from test_table where datadate='20200101'")
# Drop a column not used downstream.
df = df.drop("column2")
# Feature lists — num_*: numeric features, cat_*: categorical features.
num_features = ["num1", "num2"]
cat_features = ["cat1", "cat2"]
label_columns = ["is_true_flag"]
# fixed: original indexed with a tuple (df[num_features,cat_features+label_columns]);
# column selection needs a single concatenated list.
df = df[num_features + cat_features + label_columns]
df = df.dropna()
df = df.na.replace('', 'NA')
df = df.fillna(0)
# Cast categoricals to string, numerics to double, label to int.
for col in cat_features:
    df = df.withColumn(col, df[col].cast(StringType()))
for col in num_features:  # fixed: this loop was wrongly nested inside the previous one
    df = df.withColumn(col, df[col].cast(DoubleType()))
# fixed typo: source column was written `ist_true_flag`
df = df.withColumn('is_true_flag', df['is_true_flag'].cast(IntegerType()))


          ?? 轉(zhuǎn)onehot

          # One-hot encode categoricals, index the label, assemble the feature vector,
# then run the whole thing as a single Pipeline and split train/test.
stages = []
for col in cat_features:
    # String category -> integer index.
    string_index = StringIndexer(inputCol=col, outputCol=col + 'Index')
    # Integer index -> one-hot vector.
    encoder = OneHotEncoder(inputCol=string_index.getOutputCol(), outputCol=col + "_one_hot")
    stages += [string_index, encoder]

# Convert the label column to an indexed `label` column.
label_string_index = StringIndexer(inputCol='is_true_flag', outputCol='label')
stages += [label_string_index]

# Categorical one-hot columns + raw numeric columns -> single `features` vector.
assembler_cols = [c + "_one_hot" for c in cat_features] + num_features
assembler = VectorAssembler(inputCols=assembler_cols, outputCol="features")
stages += [assembler]

# Fit and apply all transformations via a Pipeline.
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)
train, test = df.randomSplit([0.7, 0.3], seed=2021)
print(train.count())
print(test.count())


          ?? 創(chuàng)建模型

          # Create and fit the XGBoost classifier, then score the held-out split.
xgb = XGBoostClassifier(
    featuresCol='features',
    labelCol='label',
    predictionCol='predict_val',
    missing=0.0,
    numRound=50,
    numWorkers=10,
)
# fixed: the random split above bound `train`/`test`, not trainData/testData
preModel = xgb.fit(train)
out1 = preModel.transform(test)


          ?? 查看訓(xùn)練效果

          # Training evaluation: AUC-PR / AUC-ROC plus confusion-matrix derived metrics.
import pyspark.mllib.evaluation as ev  # fixed typo (`eveluation`) and bound as `ev` as used below
lr_results = out1.select(['predict_val', 'label']).rdd.map(lambda row: (row[0], row[1] * 1.0))
lr_ev = ev.BinaryClassificationMetrics(lr_results)
print(":Area under PR:{}".format(lr_ev.areaUnderPR))
print(":Area under ROC:{}".format(lr_ev.areaUnderROC))
# fixed: original referenced undefined `out` and assigned `fn` twice while `fp`
# was never assigned (NameError at the accuracy line, and wrong metrics).
tp = out1[(out1.label == 1) & (out1.predict_val == 1)].count()  # true positives
tn = out1[(out1.label == 0) & (out1.predict_val == 0)].count()  # true negatives
fp = out1[(out1.label == 0) & (out1.predict_val == 1)].count()  # false positives
fn = out1[(out1.label == 1) & (out1.predict_val == 0)].count()  # false negatives
print('accuracy is : %f' % ((tp + tn) / (tp + tn + fp + fn)))  # accuracy
print('recall is : %f' % (tp / (tp + fn)))                      # recall
print('precision is : %f' % (tp / (tp + fp)))                   # precision


          ?? 特征解析

          # Feature-importance analysis: map the booster's gain scores (keyed f0, f1, ...)
# back to human-readable feature names taken from the assembled vector's metadata.
import pandas as pd  # fixed: pandas was used but never imported anywhere in the script

temp = df.schema["features"].metadata["ml_attr"]["attrs"]
# Collect (vector index, feature name) pairs for both numeric and one-hot (binary) slots.
# fixed: DataFrame.append is deprecated/removed — build from a list of dicts instead.
rows = []
for group in ("numeric", "binary"):
    for attr in temp.get(group, []):
        rows.append({"idx": attr["idx"], "name": attr["name"]})
df_importance = pd.DataFrame(rows, columns=["idx", "name"])

# Gain-based importance from the native booster, e.g. "Map(f3 -> 12.5, ...)".
feature_score_map = preModel.nativeBooster.getScore("", "gain")
file_path = "C://Users//Administrator//Desktop//importance.csv"
# fixed: use context managers instead of leaking file handles.
with open(file_path, "w+") as out_file:
    print(feature_score_map, file=out_file)
with open(file_path) as in_file:
    line = in_file.readline()
# Crude parse of the Scala Map repr into "idx,weight" CSV lines
# (stripping the 'f' prefix leaves the numeric vector index).
data = line.replace(',', '\n').replace('->', ',').replace('Map(', '').replace(')', '').replace('f', '')
with open(file_path, "w+") as out_file:
    print(data, file=out_file)

df_temp = pd.read_csv(file_path, header=None, names=["feature", "weight"])
# fixed: df_importance has no 'feature' column — join its vector index `idx`
# against the stripped f<idx> key parsed into `feature`.
df_importance = df_importance.merge(df_temp, left_on="idx", right_on="feature")
# fixed: the importance column is named 'weight'; 'feature_importance' never existed.
df_importance.sort_values(by=['weight'], ascending=False, inplace=True)
df_importance


          瀏覽 55
          點(diǎn)贊
          評(píng)論
          收藏
          分享

          手機(jī)掃一掃分享

          分享
          舉報(bào)
          評(píng)論
          圖片
          表情
          推薦
          點(diǎn)贊
          評(píng)論
          收藏
          分享

          手機(jī)掃一掃分享

          分享
          舉報(bào)
          <kbd id="afajh"><form id="afajh"></form></kbd>
          <strong id="afajh"><dl id="afajh"></dl></strong>
            <del id="afajh"><form id="afajh"></form></del>
                1. <th id="afajh"><progress id="afajh"></progress></th>
                  <b id="afajh"><abbr id="afajh"></abbr></b>
                  <th id="afajh"><progress id="afajh"></progress></th>
                  一级二级黄色视屏 | 特级特黄A片一级一片 | 黄色片勉费视频网站 | 日韩无码第一页 | 日本a在线 |