sklearn-Autograd

sklearn

Preprocessing

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, MultiLabelBinarizer

# features (OneHotEncoder) vs. labels (LabelBinarizer)
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
LabelEncoder().fit_transform(data)
# -> array([0, 0, 2, 0, 1, 1, 2, 0, 2, 1])
OneHotEncoder(sparse=False).fit_transform(np.array(data).reshape(-1,1))  # use sparse_output=False in scikit-learn >= 1.2
# -> array([[1., 0., 0.],
#           [1., 0., 0.],
#           ...,
#           [0., 1., 0.]])
LabelBinarizer().fit_transform(data)
# -> array([[1, 0, 0],
#           [1, 0, 0],
#           ...,
#           [0, 1, 0]])

# single-label (OneHotEncoder) vs. multilabel (MultiLabelBinarizer)
data = [["US", "M"], ["UK", "M"], ["FR", "F"]]
OneHotEncoder(sparse=False).fit_transform(data)
# -> array([[0., 0., 1., 0., 1.],
#           [0., 1., 0., 0., 1.],
#           [1., 0., 0., 1., 0.]])
MultiLabelBinarizer().fit_transform(data)
# -> array([[0, 0, 1, 0, 1],
#           [0, 0, 1, 1, 0],
#           [1, 1, 0, 0, 0]])
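
Each of these encoders also provides inverse_transform to map encodings back to the original labels; a quick sketch with LabelBinarizer (the variables are illustrative):

lb = LabelBinarizer().fit(['cold', 'warm', 'hot'])
onehot = lb.transform(['cold', 'hot'])
# recover the original string labels from the one-hot rows
lb.inverse_transform(onehot)
# -> array(['cold', 'hot'], dtype='<U4')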

Feature Selection

Filter Methods

  • Pearson’s Correlation (see the sketch after this list)

  • Linear Discriminant Analysis (LDA)

  • Analysis of Variance (ANOVA)

  • Chi-Square: $$ \chi^2=\sum\frac{(A-E)^2}{E} $$ where A is the observed frequency and E the expected frequency

    from sklearn.feature_selection import f_classif, chi2

    # f_classif computes the ANOVA F-value; chi2 works the same way but requires non-negative features
    importances = f_classif(features, labels)[0]
    importances = np.nan_to_num(importances, nan=0)
    
  • Mutual Information: $$ I(X;Y)=\sum_{x\in X}\sum_{y\in Y}p(x,y)\log\frac{p(x,y)}{p(x)p(y)} $$

from sklearn.feature_selection import mutual_info_classif

importances = mutual_info_classif(features, labels)
  • Variance Threshold
from sklearn.feature_selection import VarianceThreshold

selected_features = VarianceThreshold(threshold=threshold).fit_transform(features)
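
A Pearson-correlation filter can be written directly with NumPy (features and labels follow the same convention as the snippets above; top_k is illustrative):

import numpy as np

# absolute Pearson correlation between each feature column and the labels
corrs = np.array([abs(np.corrcoef(features[:, i], labels)[0, 1])
                  for i in range(features.shape[1])])
top_k = 10
selected = corrs.argsort()[-top_k:][::-1]  # indices of the top_k most correlated features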

Wrapper Methods

  • Brute-force

  • Greedy search (see the SequentialFeatureSelector sketch after this list)

    • forward sequential selection
    • backward sequential selection
  • Recursive feature elimination: train a base model over multiple rounds; after each round, drop the lowest-weighted features according to the estimator's coef_ or feature_importances_, then retrain on the remaining feature set

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

iris = load_iris()
# recursive feature elimination: returns the data restricted to the selected features
# estimator: the base model
# n_features_to_select: number of features to keep
RFE(estimator=LogisticRegression(), 
    n_features_to_select=2).fit_transform(iris.data, 
                                          iris.target)
  • Heuristic algorithm
    • Genetic algorithm
    • Ant colony algorithm
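
A greedy forward/backward search can be sketched with SequentialFeatureSelector (available in scikit-learn >= 0.24; the parameter values here are illustrative):

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

iris = load_iris()
# greedily add one feature at a time until n_features_to_select are chosen
sfs = SequentialFeatureSelector(LogisticRegression(max_iter=1000),
                                n_features_to_select=2,
                                direction='forward')  # 'backward' starts from all features and removes them
X_selected = sfs.fit_transform(iris.data, iris.target)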

Embedded Methods

  • Regularization-based

    • Lasso: adds an L1 penalty to the regression coefficients to prevent overfitting; it can drive some coefficients exactly to zero, yielding a simpler model that drops the corresponding features
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression
    # feature selection with L1-penalized logistic regression as the base model
    # (the default lbfgs solver does not support L1, so use liblinear or saga)
    SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(iris.data, iris.target)
    
    • Elastic Net
    • Ridge Regression
  • Tree-based

    • Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    
    clf = DecisionTreeClassifier(criterion='gini', max_depth=3)
    clf.fit(features, labels)
    top_n = 10
    # names of the top_n most important features, most important first
    print(np.array(feature_names)[clf.feature_importances_.argsort()[-top_n:]][::-1])
    
    • Random Forest (see the SelectFromModel sketch after the GBDT snippet below)
    • GBDT
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# feature selection with GBDT as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
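
The same pattern works with a random forest as the base model; a minimal sketch (n_estimators is illustrative, and SelectFromModel's default threshold keeps features whose importance exceeds the mean):

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

selector = SelectFromModel(RandomForestClassifier(n_estimators=100))
X_selected = selector.fit_transform(iris.data, iris.target)
selector.get_support()  # boolean mask of the selected features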

Multilabel Classifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

ovr = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train)
pred = ovr.predict(x_test)

# equivalent to

pred = np.zeros_like(y_test)
for i in range(y_test.shape[1]):
    clf = LogisticRegression().fit(x_train, y_train[:,i])
    pred[:,i] = clf.predict(x_test)

# evaluate
from sklearn.metrics import f1_score
score = f1_score(y_test, pred, average='macro')
score = f1_score(y_test, pred, average='micro')
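
# macro averages the per-label F1 scores with equal weight; micro pools the
# true/false positives and negatives over all labels into a single F1.
# A toy illustration (y_true/y_hat are made-up arrays):
import numpy as np
y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1]])
y_hat  = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 1]])
f1_score(y_true, y_hat, average='macro')  # mean of the three per-label F1 scores
f1_score(y_true, y_hat, average='micro')  # F1 over the pooled label indicators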

Clustering

from sklearn.cluster import KMeans, DBSCAN
# K-means
kmeans = KMeans(n_clusters=10, n_init=20)  # n_jobs was removed from KMeans in scikit-learn 1.0
y_pred = kmeans.fit_predict(X)

# DBSCAN
dbscan = DBSCAN(eps=0.9, min_samples=2)
y_pred = dbscan.fit_predict(X)  # noise points are labelled -1

# HDBSCAN
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, leaf_size=400).fit(X)
print(clusterer.labels_)  # cluster label per sample, -1 for noise

Visualization using t-SNE

from sklearn.manifold import TSNE

# training
n_components = 3  # 2D or 3D embedding
tsne = TSNE(n_components=n_components)
X_tsne = tsne.fit_transform(features)
print(X_tsne.shape)

# plotting
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
scatter = ax.scatter(X_tsne[:,0], X_tsne[:,1], X_tsne[:,2], c=labels)
legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
ax.add_artist(legend)

Autograd

Gradients, Jacobian and Hessian

Compute the gradient, Jacobian, and Hessian of scalar->scalar, vector->scalar, and vector->vector functions.

import autograd.numpy as np
import autograd

# scalar in, scalar out
f1 = lambda x: x**2 + 3*x +1
x = 2.
# f(x)
f1(x)
# gradients of f(x)
autograd.grad(f1)(x)
# f(x) and f'(x)
autograd.value_and_grad(f1)(x)
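
# multi-argument functions (a sketch; f_xy is illustrative):
# grad's second argument is the index of the positional argument to differentiate
f_xy = lambda x, y: x**2 * y + y
autograd.grad(f_xy, 0)(2., 3.)  # df/dx = 2*x*y -> 12.0
autograd.grad(f_xy, 1)(2., 3.)  # df/dy = x**2 + 1 -> 5.0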

# vector in, scalar out
f2 = lambda x: np.sum(np.square(x))
x = np.arange(5).astype(np.float32)
# f(x)
f2(x)
# gradients of f(x), [df/dx1, df/dx2, ..., df/dxn]
autograd.grad(f2)(x)
# directional derivative of f(x) along a vector of ones, = np.sum(autograd.grad(f2)(x))
autograd.deriv(f2)(x)
# Jacobian of f(x), same as grad for a scalar-valued f, [df/dx1, df/dx2, ..., df/dxn]
autograd.jacobian(f2)(x)
# Hessian of f(x), [[d2f/dx1^2, d2f/dx1dx2, ..., d2f/dx1dxn], ..., [d2f/dxndx1, d2f/dxndx2, ..., d2f/dxn^2]]
autograd.hessian(f2)(x)

# vector in vector out
f3 = lambda x: np.square(x)
x = np.arange(5).astype(np.float32)
# f(x)
f3(x)
# Jacobian of f(x), [[dy1/dx1, dy1/dx2, ..., dy1/dxn], ..., [dyn/dx1, dyn/dx2, ..., dyn/dxn]]
autograd.jacobian(f3)(x)
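
For an elementwise vector->vector function the Jacobian is diagonal; autograd.elementwise_grad returns just that diagonal (reusing f3 and x from above):

# per-component derivative d f3_i / d x_i, i.e. the diagonal of the Jacobian above
autograd.elementwise_grad(f3)(x)
# -> 2*x = array([0., 2., 4., 6., 8.])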