JobPlus知识库 IT 工业智能4.0 文章
常用机器学习算法的python实现

KNN

参数指定版

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset; the 'class' column is the target, everything else a feature.
data = pd.read_csv('D:/column_2C_weka.csv')

# Features (x) and target (y).
x = data.loc[:, data.columns != 'class']
y = data.loc[:, 'class']

# Hold out 30% of the rows as a test set; fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# n_neighbors=3: each sample is classified by a majority vote of its
# 3 closest labelled neighbours.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)          # learn from the training split
prediction = knn.predict(x_test)   # predict the held-out rows

print('Prediction: {}'.format(prediction))
print('With KNN (K=3) accuracy is: ',knn.score(x_test,y_test)) # accuracy


调参版,其他算法的调参均可参照于此

# Tune K with a grid search; the same pattern applies to any estimator.
from sklearn.model_selection import GridSearchCV
import numpy as np  # bug fix: np was used below but never imported

# Candidate values of K: 1..49.
grid = {'n_neighbors': np.arange(1, 50)}

knn = KNeighborsClassifier()

# 3-fold cross-validated grid search over K.
knn_cv = GridSearchCV(knn, grid, cv=3)
knn_cv.fit(x, y)  # NOTE: x, y come from the data-loading snippet above

# Report the best K and its mean cross-validated accuracy.
print("Tuned hyperparameter k: {}".format(knn_cv.best_params_))
# bug fix: this line had a stray leading space -> IndentationError.
print("Best score: {}".format(knn_cv.best_score_))


数据切分

将数据集划分为训练集与测试集

# train/test split

# train/test split
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the 'class' target column.
x = data.loc[:, data.columns != 'class']
y = data.loc[:, 'class']

# test_size=0.3 keeps 30% of the rows for testing (train_size = 1 - test_size).
# random_state pins the shuffle seed, so every run produces the exact same split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)          # fit on the training portion only
prediction = knn.predict(x_test)   # evaluate on the unseen portion
print('With KNN (K=3) accuracy is: ',knn.score(x_test,y_test)) # accuracy


LR-线性回归(本节还包含 ROC 曲线绘制;ROC 需要分类器输出的预测概率,线性回归本身不提供)

from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
import numpy as np               # bug fix: np was used but never imported
import matplotlib.pyplot as plt  # bug fix: plt was used but never imported

# --- Linear regression --------------------------------------------------
# NOTE(review): this part assumes x is a single numeric feature column --
# a linspace over the min/max of a multi-column frame would not line up
# with the model's expected input shape. Confirm against the data used.
reg = LinearRegression()

# Evenly spaced points spanning the feature range, shaped (n, 1) for sklearn.
predict_space = np.linspace(np.asarray(x).min(), np.asarray(x).max()).reshape(-1, 1)

reg.fit(x, y)
predicted = reg.predict(predict_space)

# R^2 on the training data.
print('R^2 score: ',reg.score(x, y))

# --- ROC curve ----------------------------------------------------------
# Bug fix: LinearRegression has no predict_proba (AttributeError). An ROC
# curve needs class probabilities, so fit a LogisticRegression instead.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred_prob = logreg.predict_proba(x_test)[:, 1]  # P(positive class)
# NOTE(review): roc_curve needs binary labels; if y is string-valued,
# pass pos_label explicitly -- confirm against the dataset.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve (dashed diagonal = random-guess baseline).
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.show()  # bug fix: was fused onto the plt.title line -> SyntaxError



K fold CROSS VALIDATION-k fold CV-K折交叉验证

参数指定版

# k-fold cross-validation with a fixed k.
from sklearn.model_selection import cross_val_score

reg = LinearRegression()  # for regressors the default scorer is R^2
k = 5
cv_result = cross_val_score(reg, x, y, cv=k)  # one R^2 score per fold

print('CV Scores: ',cv_result)
# Bug fix: the original np.sum(cv_result)/k used np, which was never
# imported in this file; ndarray.mean() is equivalent and self-contained.
print('CV scores average: ',cv_result.mean())


调参版

# grid search cross validation with 2 hyperparameter

# 1. hyperparameter is C:logistic regression regularization parameter

# 2. penalty l1 or l2

# Hyperparameter grid

# Grid-search CV over two logistic-regression hyperparameters:
#   1. C       - inverse regularization strength
#   2. penalty - 'l1' or 'l2'
import numpy as np  # bug fix: np was used but never imported
from sklearn.linear_model import LogisticRegression  # bug fix: missing import
from sklearn.model_selection import train_test_split, GridSearchCV

# C on a log scale from 1e-3 to 1e3.
param_grid = {'C': np.logspace(-3, 3, 7), 'penalty': ['l1', 'l2']}

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=12)

# Bug fix: the default 'lbfgs' solver supports only l2 regularization and
# raises on penalty='l1'; 'liblinear' handles both penalties in the grid.
logreg = LogisticRegression(solver='liblinear')
logreg_cv = GridSearchCV(logreg, param_grid, cv=3)
logreg_cv.fit(x_train, y_train)

# Print the optimal parameters and best score
print("Tuned hyperparameters : {}".format(logreg_cv.best_params_))
print("Best Accuracy: {}".format(logreg_cv.best_score_))


正则化回归(Regularized Regression)——在损失函数中分别加入 L2(Ridge)与 L1(Lasso)正则项

损失函数正则化,分别采用L2和L1正则化

# Ridge-L2

# Ridge regression (L2 penalty).
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.3)

# alpha is the regularization strength.
# Bug fix: Ridge(normalize=True) was removed in scikit-learn 1.2 (TypeError
# today). Scale the features in a pipeline instead. NOTE(review): the old
# normalize divided each feature by its l2 norm, so StandardScaler is the
# recommended replacement but not numerically identical.
ridge = make_pipeline(StandardScaler(), Ridge(alpha=0.1))
ridge.fit(x_train, y_train)
ridge_predict = ridge.predict(x_test)
print('Ridge score: ',ridge.score(x_test,y_test))


# Lasso-L1

# Lasso regression (L1 penalty) - also performs feature selection by
# driving some coefficients exactly to zero.
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np  # bug fix: np was used but never imported

# Bug fix: `data1` was undefined - the dataset loaded above is named `data`.
x = np.array(data.loc[:, ['pelvic_incidence', 'pelvic_tilt numeric',
                          'lumbar_lordosis_angle', 'pelvic_radius']])
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=3, test_size=0.3)

# Bug fix: Lasso(normalize=True) was removed in scikit-learn 1.2; scale the
# features in a pipeline instead (not numerically identical to the old
# normalize, which divided by the l2 norm).
lasso = make_pipeline(StandardScaler(), Lasso(alpha=0.1))
lasso.fit(x_train, y_train)
lasso_predict = lasso.predict(x_test)  # bug fix: was misnamed ridge_predict
print('Lasso score: ',lasso.score(x_test,y_test))
# coef_ lives on the final step once the model is wrapped in a pipeline.
print('Lasso coefficients: ',lasso[-1].coef_)


Random forest

# Confusion matrix with random forest

# Confusion matrix with a random forest classifier.
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Features are every column except the 'class' target.
x = data.loc[:, data.columns != 'class']
y = data.loc[:, 'class']

# 70/30 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Seed the forest as well, so repeated runs give the same trees.
rf = RandomForestClassifier(random_state=4)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Per-class error breakdown, then precision/recall/F1 summary.
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix: \n',cm)
print('Classification report: \n',classification_report(y_test,y_pred))



如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

¥ 打赏支持
11人赞 举报
分享到
用户评价(0)

暂无评价,你也可以发布评价哦:)

扫码APP

扫描使用APP

扫码使用

扫描使用小程序