机器学习——数据处理流程梳理
数据处理流程梳理
⼀、数据预处理
以⼆分类问题为例,数据集形式为excle,label在最后⼀列
1.数据缺失处理
分别对⾏列缺失进⾏处理
def preTrainDataAndSelectFeature(self, file_path='', label_name='', col_miss_rate=0.5,row_miss_rate=0.8,
balance_prior=False,all_data_balance=0.33,train_data_balance=0.33,test_data_balance=None,
forest_n=None, kbest_n=None, lasso=False, pca_feature=4,):
"""
训练集数据处理
:param file_path:string ⽂件路径
:param label_name:标签列名
:param col_miss_rate:float 列缺失⽐率
:param row_miss_rate:float 标签为0的⾏缺失率
:param balance_prior:bool 数据集平衡优先
:param all_data_balance:balance_prior=True时,float 整体数据不均衡调整⽐例若=1,为0,1标签⽐值为1:1,=0.33为1:0.33
:param train_data_balance:balance_prior=False时,
:param test_data_balance:balance_prior=False时,
:param forest_n:int 使⽤forest选取的特征数
:param kbest_n:int 使⽤kbest选取的特征数
:param lasso:bool 是否使⽤lasso回归选取特征
:param pca_feature:int pca降维后的特征数(这是降维的最后⼀步,数据此后⼀直使⽤降维后的特征)
:return:可直接放⼊模型训练的最终数据
"""
data = pd.read_excel(file_path)
# 列处理
data = data.dropna(axis=1, thresh=data.shape[0]* col_miss_rate)
# 保存过滤缺失后的特征
self.drop_select =list(data)
self.final_feature = np.array(self.drop_select)
print("drop select:",len(self.final_feature),"\n", self.final_feature)
# ⾏处理:根据标签取值拆分样本,不同取值⽤各⾃的均值填充
data_group = upby(label_name)
data0 =dict([x for x in data_group])[0]
data0 = data0.dropna(axis=0, thresh=data0.shape[1]* row_miss_rate)
data0 = data0.an())
data1 =dict([x for x in data_group])[1]
data1 = data1.an())
data = pd.concat([data0, data1], axis=0)
2.划分训练集、测试集及数据不平衡处理唐嫣身高
这⾥存在两个问题:
1.是否⽤整体的数据集先提取特征,再应⽤特征,然后划分训练集测试集?
这样可以较好的考虑到数据集的整体特征,但同时,测试集的表现可能会⾼于真实场景,考虑到数据集较⼤,故没有使⽤该⽅法。
2.数据平衡处理、数据集划分顺序
两种做法:先整体数据集平衡处理,再划分训练集、测试集 或 先划分训练集、测试集,再单独对训练集进⾏平衡处理(,若测试集样本⽐例极不平衡,也需要平衡处理)。
同样,由于第⼀种⽅法测试集表现可能过优,采⽤后⼀种。
这⾥,将两种⽅法进⾏实现,balance_prior=True,为第⼀种,否则第⼆种。
# # 数据不平衡处理
X, Y = data.drop([label_name], axis=1), data[label_name]
if balance_prior:
#先数据平衡处理,再拆分训练集、测试集
smo = BorderlineSMOTE(kind='borderline-1',
sampling_strategy={0: Y.value_counts()[0],1:int(Y.value_counts()[0]* all_data_balance)}, random_state=2)
x, y = smo.fit_resample(X, Y)
print("data balance result:\n", pd.Series(y).value_counts())
# 打乱不平衡后的数据
data_select = pd.concat((x, y), axis=1)
data_select = shuffle(data_select,random_state=1)
X, Y = data_select.drop([label_name], axis=1), data_select[label_name]
#划分训练集、测试集
X_train, X_test, y_train, y_test = ain_test_split(X,Y, test_size=0.3, random_state=1234) else:
# 先拆分训练集、测试集,再分别对训练集、测试集平衡处理
X_train, X_test, y_train, y_test = ain_test_split(X, Y, test_size=0.3, random_state=1234) # # 数据不平衡处理
smo = BorderlineSMOTE(kind='borderline-1',
sampling_strategy={0: y_train.value_counts()[0],
1:int(y_train.value_counts()[0]* train_data_balance)},
random_state=2)
x_train, y_train = smo.fit_resample(X_train, y_train)
print("train data balance result:\n", pd.Series(y_train).value_counts())
if test_data_balance is not None:
smo = BorderlineSMOTE(kind='borderline-1',
sampling_strategy={0: y_test.value_counts()[0],
1:int(y_test.value_counts()[0]* test_data_balance)},
random_state=2)军婚重生的小说
X_test, y_test = smo.fit_resample(X_test, y_test)
print("test data balance result:\n", pd.Series(y_test).value_counts())
# # 打乱平衡处理后的数据
data_select = pd.concat((x_train, y_train), axis=1)
data_select = shuffle(data_select, random_state=1)
X_train, y_train = data_select.drop([label_name], axis=1), data_select[label_name]
print("data train:\n", pd.Series(y_train).value_counts())
print("data test:\n", pd.Series(y_test).value_counts())
3.数据标准化
# ⽤训练集的数据特征对测试集标准化
x_train= pd.der.fit_transform(X_train.values))
x_test =pd.ansform(X_test.values))
4.特征选择
在样本极其不平衡时,先将数据平衡处理再进⾏特征选择,选择的特征⽐较稳定。
1)随机森林
# 随机森林
if forest_n!=None:
forest = RandomForestClassifier(random_state=1)
forest.fit(x_train, y_train)
ra=pd.Series(forest.feature_importances_,index=lumns)
self.forest_select= ra.sort_values()[::-1][:forest_n].index夏季节气有哪些
print("forest importance:",forest.feature_importances_)
x_train = x_train.loc[:, self.forest_select]
x_test = x_test.loc[:, self.forest_select]
self.final_feature=self.final_feature[self.forest_select]
print("forest features:",len(self.final_feature),"\n", self.final_feature)
2)相关性检测
# 相关性检测
if kbest_n !=None:
pearsonr_way =lambda x, y: np.array(list(map(lambda x: pearsonr(np.squeeze(x), np.squeeze(y)), x.T))).T[0] self.pearsonr_select = SelectKBest(pearsonr_way, k=kbest_n).fit(x_train, y_train).get_support(indices=True)
x_train = x_train.loc[:, self.pearsonr_select]
x_test = x_test.loc[:, self.pearsonr_select]
self.final_feature = self.final_feature[self.pearsonr_select]
print("kbest feature:",len(self.final_feature ),"\n", self.final_feature )
3)lasso特征
# lasso特征
if lasso:
# 交叉验证出最好的lambdas
Lambdas = np.logspace(-5,2,200)
lasso_cv = LassoCV(alphas=Lambdas, normalize=True, cv=10, max_iter=10000)
lasso_cv.fit(x_train,y_train)
lasso = Lasso(alpha=lasso_cv.alpha_, normalize=True, max_iter=10000)
lasso.fit(x_train,y_train)
print("lasso importance:\n",f_,"\n","lasso mse:",mean_squared_error(y_train, lasso.predict(x_train))) tmp = pd.Series(index=list(), f_.tolist())
self.lasso_select = lumns[tmp.values !=0]
x_train = x_train.loc[:, self.lasso_select]
x_test= x_test.loc[:,self.lasso_select]
self.final_feature = self.final_feature[self.lasso_select]
print("lasso select:",len(self.final_feature),"\n", self.final_feature)
5.特征降维
使⽤PCA主成分分析
# 主成分分析降维
if pca_feature!=None:
self.pca = PCA(n_components=pca_feature)
self.pca.fit(x_train)
print("pca importance:", plained_variance_ratio_)
x_train = ansform(x_train)
x_test=ansform(x_test)
return x_train, y_train,x_test,y_test
⼆、训练模型
晓组织成员1.训练
def Train(self, file_path='', label_name="", col_miss_rate=0.5, row_miss_rate=0.8,
balance_prior=False, all_data_balance=0.33, train_data_balance=1, test_data_balance=None,
forest_n=20, kbest_n=None, lasso=False, pca_feature=None,):
x_train, y_train,x_test,y_test = self.preTrainDataAndSelectFeature(file_path=file_path, label_name=label_name,
col_miss_rate=col_miss_rate,row_miss_rate=row_miss_rate,
balance_prior=balance_prior, all_data_balance=all_data_balance,train_data_balance=train_data_balance, test_data_balance=test_data_bala nce,
forest_n=forest_n,kbest_n=kbest_n,lasso=lasso, pca_feature=pca_feature)
self.svm.fit(x_train, y_train)
2.查看ROC曲线
⽤predict_proba()进⾏预测,得到各类别概率值
self._showROC(self.svm.predict_proba(x_test)[:,1], y_test)
def_showROC(self, y_pred, y_true):
fpr, tpr, thresholds = roc_curve(y_true,y_pred)
roc_auc = auc(fpr, tpr)
print("AUC:",roc_auc)
plt.plot(fpr, tpr,'b', label='Val AUC = %0.3f'% roc_auc)
plt.legend(loc='upper right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
3.评价指标
y_pre=self.svm.predict(x_test)
print("recall_score:", recall_score(y_test, y_pre))
print("precision_score:", precision_score(y_test, y_pre))
print("f1_score:", f1_score(y_test, y_pre))
三、测试数据预处理
1.去除训练集缺失率⾼的列
根据训练集的处理步骤,使其在标准化前拥有相同的特征
def preEvalData(self,file_path="",label_name=""):
x= pd.read_excel(file_path)
#填充,若填充后仍有NAN,对x.mean()进⾏填充.fillna(0)
x = x.an().fillna(0))
#去缺失特征和标签
x=x.loc[:, self.drop_select]
x = x.drop([label_name], axis=1)
2.标准化
# ⽤训练集的均值和⽅差进⾏标准化
x= pd.ansform(x.values))
3.特征选择
if self.forest_select is not None:
x = x.loc[:, self.forest_select]
if self.lasso_select is not None:
x = x.loc[:, self.lasso_select]
if self.pearsonr_select is not None:
x = x.loc[:, self.pearsonr_select]
4.特征降维
if self.pca is not None:
x = ansform(x.values)
return x
四、测试集预测
紫萱的扮演者def predict(self, file_path='', label_name=""):
x=self.preEvalData(file_path,label_name=label_name)
pred=self.svm.predict_proba(x)
count_1 =0
for i in pred[:,1]:
#观察roc曲线得阈值为0.8
if i >0.8:
count_1 = count_1 +1
print("1:", count_1)
print("0:",len(pred)- count_1)黄花梨的鉴别方法
五、完整代码
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LassoCV,Lasso
semble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import svm,model_selection
ics import mean_squared_error,roc_curve, auc,recall_score, precision_score, f1_score, roc_curve,accuracy_score from scipy.stats import pearsonr
class ML():
def__init__(self):
#特征、数据处理参数,⽤preTrainDataAndSelectFeature()计算出实际值
#数据标准化模型,将训练集的均值⽅差应⽤到测试集
#依据列缺失特征选择
self.drop_select=None
#随机森林特征选择
self.forest_select=None
#lasso回归特征选择
self.lasso_select=None
#kbest相关性检测特征选择
self.pearsonr_select=None
#pca数据降维模型
self.pca=None
# svc⽀持向量机模型
self.svm = svm.SVC(C=0.8, kernel='rbf', gamma=20, decision_function_shape='ovr', probability=True) #合并特征,为中⽂信息、⾮索引
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论