sklearn-Autograd
sklearn
Preprocessing
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, MultiLabelBinarizer
# features (OneHotEncoder) vs. labels (LabelBinarizer)
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
LabelEncoder().fit_transform(data)
# -> array([0, 0, 2, 0, 1, 1, 2, 0, 2, 1])
OneHotEncoder(sparse=False).fit_transform(np.array(data).reshape(-1,1))  # scikit-learn >= 1.2 renames sparse to sparse_output
# -> array([[1., 0., 0.],
# [1., 0., 0.],
# ...,
# [0., 1., 0.]])
LabelBinarizer().fit_transform(data)
# -> array([[1, 0, 0],
# [1, 0, 0],
# ...,
# [0, 1, 0]])
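# Small illustrative addition (not in the original notes): LabelEncoder keeps the
# learned classes in classes_, so encoded integers map back via inverse_transform.
le = LabelEncoder().fit(data)
le.classes_                      # -> array(['cold', 'hot', 'warm'], dtype='<U4')
le.inverse_transform([0, 2, 1])  # -> array(['cold', 'warm', 'hot'], dtype='<U4')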
# one categorical value per column (OneHotEncoder) vs. a set of labels per sample (MultiLabelBinarizer)
data = [["US", "M"], ["UK", "M"], ["FR", "F"]]
OneHotEncoder(sparse=False).fit_transform(data)
# -> array([[0., 0., 1., 0., 1.],
# [0., 1., 0., 0., 1.],
# [1., 0., 0., 1., 0.]])
MultiLabelBinarizer().fit_transform(data)
# -> array([[0, 0, 1, 0, 1],
# [0, 0, 1, 1, 0],
# [1, 1, 0, 0, 0]])
Feature Selection
Filter Methods
- Pearson's Correlation (a minimal sketch follows this list)
- Linear Discriminant Analysis (LDA)
- Analysis of Variance (ANOVA)
from sklearn.feature_selection import f_classif
importances = f_classif(features, labels)[0]  # ANOVA F-values, one per feature
importances = np.nan_to_num(importances, nan=0)
- Chi-Square: $$ \chi^2=\sum\frac{(A-E)^2}{E} $$
from sklearn.feature_selection import chi2
importances = chi2(features, labels)[0]  # chi-square statistics (features must be non-negative)
- Mutual Information: $$ I(X;Y)=\sum_{x\in X}\sum_{y\in Y}p(x,y)\log\frac{p(x,y)}{p(x)p(y)} $$
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(features, labels)
- Variance Threshold
from sklearn.feature_selection import VarianceThreshold
selected_features = VarianceThreshold(threshold=threshold).fit_transform(features)
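One way to implement the Pearson's-correlation filter mentioned above is with plain numpy; a minimal sketch, assuming features is a 2-D numeric array and labels is a numeric 1-D array (the same placeholder names used in the other snippets):
import numpy as np
# absolute Pearson correlation of each feature column with the target
importances = np.array([abs(np.corrcoef(features[:, i], labels)[0, 1])
                        for i in range(features.shape[1])])
importances = np.nan_to_num(importances, nan=0)  # constant columns yield nan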
Wrapper Methods
- Brute-force
- Greedy search (see the SequentialFeatureSelector sketch after this list)
  - forward sequential selection
  - backward sequential selection
- Recursive feature elimination: train a base model for several rounds; after each round, drop the features with the lowest weights according to the estimator's coef_ or feature_importances_, then retrain on the reduced feature set
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# recursive feature elimination; returns the data restricted to the selected features
# estimator: the base model
# n_features_to_select: number of features to keep
RFE(estimator=LogisticRegression(),
    n_features_to_select=2).fit_transform(iris.data, iris.target)
- Heuristic algorithms
  - Genetic algorithm
  - Ant colony algorithm
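The greedy forward/backward search above maps onto scikit-learn's SequentialFeatureSelector (available since 0.24); a minimal sketch on the same iris data used elsewhere in this section:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
sfs = SequentialFeatureSelector(LogisticRegression(),
                                n_features_to_select=2,
                                direction='forward')  # or 'backward'
selected_features = sfs.fit_transform(iris.data, iris.target)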
Embedded Methods
- Regularization-based
  - Lasso: adds an L1 penalty on the regression coefficients to prevent overfitting; it can drive some coefficients exactly to zero, so the resulting model simply drops those features
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# feature selection with an L1-penalized logistic regression as the base model
# (liblinear is specified because the default lbfgs solver does not support the L1 penalty)
SelectFromModel(
    LogisticRegression(penalty="l1", C=0.1,
                       solver="liblinear")).fit_transform(iris.data, iris.target)
  - Elastic Net
  - Ridge Regression
- Tree-based
  - Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='gini', max_depth=3)
clf.fit(features, labels)
top_n = 10
# names of the top_n most important features, most important first
print(np.array(feature_names)[clf.feature_importances_.argsort()[-top_n:]][::-1])
  - Random Forest
  - GBDT
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# feature selection with GBDT as the base model
SelectFromModel(
    GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
Multilabel Classifier
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
ovr = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train)
pred = ovr.predict(x_test)
# equivalent to
pred = np.zeros_like(y_test)
for i in range(y_test.shape[1]):
    clf = LogisticRegression().fit(x_train, y_train[:, i])
    pred[:, i] = clf.predict(x_test)
# evaluate
from sklearn.metrics import f1_score
score = f1_score(y_test, pred, average='macro')  # unweighted mean of per-label F1 scores
score = f1_score(y_test, pred, average='micro')  # F1 computed over all label decisions pooled together
Clustering
from sklearn.cluster import KMeans, DBSCAN
# K-means
kmeans = KMeans(n_clusters=10, n_init=20)  # n_jobs was removed from KMeans in newer scikit-learn releases
y_pred = kmeans.fit_predict(X)
# DBSCAN
dbscan = DBSCAN(eps=0.9, min_samples=2)
y_pred = dbscan.fit_predict(X)
# HDBSCAN
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, leaf_size=400).fit(X)
print(clusterer.labels_)
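DBSCAN and HDBSCAN assign the label -1 to noise points, so the number of clusters is not simply the number of distinct labels; a small sketch reusing the y_pred from the DBSCAN call above:
n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)
n_noise = int((y_pred == -1).sum())
print(n_clusters, n_noise)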
Visualization using t-SNE
from sklearn.manifold import TSNE
# training
n_components = 3  # 2 for a 2D embedding, 3 for 3D
tsne = TSNE(n_components=n_components)
X_tsne = tsne.fit_transform(features)
print(X_tsne.shape)
# plotting
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
scatter = ax.scatter(X_tsne[:,0], X_tsne[:,1], X_tsne[:,2], c=labels)
legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
ax.add_artist(legend)
Autograd
Gradients, Jacobian and Hessian
Compute the gradient, Jacobian, and Hessian of scalar->scalar, vector->scalar, and vector->vector functions.
import autograd.numpy as np
import autograd
# scalar in, scalar out
f1 = lambda x: x**2 + 3*x + 1
x = 2.
# f(x)
f1(x)
# gradients of f(x)
autograd.grad(f1)(x)
# f(x) and f'(x)
autograd.value_and_grad(f1)(x)
# vector in, scalar out
f2 = lambda x: np.sum(np.square(x))
x = np.arange(5).astype(np.float32)
# f(x)
f2(x)
# gradients of f(x), [df/dx1, df/dx2, ..., df/dxn]
autograd.grad(f2)(x)
# total derivative of f(x), =np.sum(autograd.grad(f2)(x))
autograd.deriv(f2)(x)
# Jacobian of f(x), same as the gradient here, [df/dx1, df/dx2, ..., df/dxn]
autograd.jacobian(f2)(x)
# Hessian of f(x), [[d2f/dx1^2, d2f/dx1dx2, ..., d2f/dx1dxn], ..., [d2f/dxndx1, d2f/dxndx2, ..., d2f/dxn^2]]
autograd.hessian(f2)(x)
# vector in vector out
f3 = lambda x: np.square(x)
x = np.arange(5).astype(np.float32)
# f(x)
f3(x)
# Jacobian of f(x), [[dy1/dx1, dy1/dx2, ..., dy1/dxn], ..., [dyn/dx1, dyn/dx2, ..., dyn/dxn]]
autograd.jacobian(f3)(x)
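As a sanity check (a small sketch reusing f2, f3, and x from above): f2(x)=sum(x**2) has the constant Hessian 2*I, and f3(x)=x**2 has the diagonal Jacobian diag(2*x).
# verify the autograd results against the analytic forms
assert np.allclose(autograd.hessian(f2)(x), 2 * np.eye(len(x)))
assert np.allclose(autograd.jacobian(f3)(x), np.diag(2 * x))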