ML with XGBoost: Regression prediction on a dataset using the XGBoost algorithm (feature selection and GridSearchCV)
Output
['EnterCOD', 'EnterBOD', 'EnterAD', 'EnterZL', 'EnterZD', 'EnterPH', 'EnterSS', 'M4', 'N4', 'O4', 'P4', 'Q4', 'R4']

   EnterCOD  EnterBOD  EnterAD  EnterZL  EnterZD  EnterPH  EnterSS      M4      N4    O4    P4     Q4    R4
0     299.0       0.0     16.7     9.63     26.5        7    354.0  4609.0  2346.0  1.72  32.0  69.43  17.0
1     331.0       0.0     15.0     9.34     31.8        7    297.5  4834.0  2434.0  1.72  34.0  70.34  18.0
2     326.0       0.0     19.6    11.17     33.5        7    389.5  4928.0  2604.0  1.70  35.0  71.02  18.0
3     230.0       0.0     17.4     6.23     32.3        7    277.5  5073.0  2678.0  1.68  36.0  70.96  19.0
4     149.0       0.0     16.8     3.59     23.7        7    106.0  4856.0  2452.0  1.69  37.0  76.19  19.0

mlss R^2 score: 0.950752699205583
Features: Index(['EnterCOD', 'EnterBOD', 'EnterAD', 'EnterZL', 'EnterZD', 'EnterPH',
       'EnterSS', 'M4', 'N4', 'O4', 'P4', 'Q4', 'R4'],
      dtype='object')
Importance of each feature: [100.       21.307432 48.90534  37.218624 26.950356  2.081406
 31.82239  72.88005  49.49121  61.9334   19.071848 33.441257
 17.745914]

mlss R^2 score after selecting important features: 0.9485146037853682
Selected features: Index(['EnterCOD', 'M4', 'O4', 'N4', 'EnterAD', 'EnterZL', 'Q4', 'EnterSS',
       'EnterZD', 'EnterBOD', 'P4', 'R4'],
      dtype='object')
Importance of each selected feature: [100.       92.00673  75.79092  55.387436 36.038513 32.217636
 42.442307 28.243927 24.789852 12.685312 18.707016 19.150238]
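
The importance values above are not raw XGBoost scores: they are the model's feature_importances_ rescaled so that the strongest feature reads 100. A minimal sketch of that rescaling, where clf stands for the fitted regressor from the code below:

    raw = clf.feature_importances_        # raw importances of the fitted XGBRegressor
    scaled = 100.0 * (raw / raw.max())    # rescale so the top feature reads 100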

Implementation code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import TimeSeriesSplit

def GDBTTrain(X, y):
    """Train an XGBoost regressor and evaluate R^2 on a held-out test set."""
    # test_size: fraction of the samples held out for testing
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)
    test_preds = pd.DataFrame({"label": test_y})
    clf = XGBRegressor(
        learning_rate=0.1,   # default is 0.3
        n_estimators=400,    # number of trees
        max_depth=8,
    )
    clf.fit(train_x, train_y)
    test_preds['y_pred'] = clf.predict(test_x)
    stdm = metrics.r2_score(test_preds['label'], test_preds['y_pred'])

    # GridSearchCV with an empty parameter grid scores the same way as cross_val_score
    scores = cross_val_score(clf, X, y, scoring='r2')
    print(scores)
    gs = GridSearchCV(clf, {}, cv=3, verbose=3).fit(X, y)

    return stdm, clf
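
# Usage sketch (illustrative names): given a feature matrix X and target vector y,
#     stdm, clf = GDBTTrain(X, y)
# the fitted regressor can be persisted with pickle for later reuse
# (the file name 'xgb_model.pkl' is only an example):
#     import pickle
#     with open('xgb_model.pkl', 'wb') as f:
#         pickle.dump(clf, f)
#     with open('xgb_model.pkl', 'rb') as f:
#         clf = pickle.load(f)
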

def XGTSearch(X, y):
    """Grid-search XGBoost hyperparameters with time-series cross-validation."""
    print("Parameter optimization")
    n_estimators = [50, 100, 200, 400]
    max_depth = [2, 4, 6, 8]
    learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
    param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, learning_rate=learning_rate)
    # eval_metric is set on the estimator; GridSearchCV no longer accepts a fit_params argument
    xgb_model = XGBRegressor(eval_metric="rmse")
    # pass the splitter itself as cv; .get_n_splits() would only return the integer 2
    kfold = TimeSeriesSplit(n_splits=2)
    grid_search = GridSearchCV(xgb_model, param_grid, verbose=1, cv=kfold)
    grid_result = grid_search.fit(X, y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    return means, grid_result
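
# XGTSearch is defined but never called in this script; a hypothetical invocation,
# once `data` and `feature` are prepared below ('mlss' is the first target label):
#     tuned = delete_null_date(data, 'mlss')
#     means, grid_result = XGTSearch(tuned[feature], tuned['mlss'].to_numpy())
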

feature_string = 'EnterCOD EnterBOD EnterAD EnterZL EnterZD EnterPH EnterSS M4 N4 O4 P4 Q4 R4'  # selected input features
outputs_string = 'mlss mlvss sv30 OutCOD OutBOD OutAD OutZL OutZD OutPH OutSS'  # target labels to predict
feature = feature_string.split()
outputs = outputs_string.split()
print(feature)


def prep_water_data(data, columns):
    """Replace placeholder strings and NaN with 0 in the given columns."""
    for c in columns:
        data[c] = [0 if ((x in ['Not Available', 'Not Mapped', 'NULL']) or pd.isnull(x)) else x for x in data[c]]
    return data


def delete_null_date(data, date_name):
    data = data[data[date_name].notnull()]  # drop rows where the given column is null
    return data
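
# For illustration: prep_water_data maps placeholder strings and NaN to 0, e.g.
#     df = pd.DataFrame({'EnterCOD': [299.0, 'NULL', np.nan]})
#     prep_water_data(df, ['EnterCOD'])['EnterCOD'].tolist()  # -> [299.0, 0, 0]
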

data = pd.read_csv('water_a.csv', encoding="gb18030")
data = prep_water_data(data, feature)

print(data.iloc[:5][feature])


def predict(data, out):
    data = delete_null_date(data, out)  # keep only rows where the target is present
    y = data[out]
    y = y.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
    X = data[feature]
    stdm, clf = GDBTTrain(X, y)
    print(out + ' R^2 score:', stdm)

    # rescale importances so the strongest feature reads 100
    feature_importance = clf.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    print('Features:', X.columns)
    print('Importance of each feature:', feature_importance)

    sorted_idx = np.argsort(feature_importance)

    pos = np.arange(sorted_idx.shape[0])
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, X.columns[sorted_idx])
    plt.xlabel('Relative importance')  # barh puts the importance on the x axis
    plt.ylabel('Feature')
    plt.title('Variable Importance')
    plt.show()

    # -------- retrain and predict again using only the most important features --------
    X = data[X.columns[sorted_idx][::-1][:-1]]  # drop the least important feature
    stdm, clf = GDBTTrain(X, y)
    print(out + ' R^2 score after selecting important features:', stdm)

    feature_importance = clf.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    print('Selected features:', X.columns)
    print('Importance of each selected feature:', feature_importance)

    sorted_idx = np.argsort(feature_importance)

    pos = np.arange(sorted_idx.shape[0])
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, X.columns[sorted_idx])
    plt.xlabel('Relative importance')
    plt.ylabel('Feature')
    plt.title('Variable Importance (selected features)')
    plt.show()

    return sorted_idx  # returned so the loop below can assign it

for out in outputs[:1]:  # only the first target (mlss), as a demonstration
    sorted_idx = predict(data, out)