使用稀疏特征对文本文档进行分类
翻译者:@Loopy 校验者:@barrycg
这个示例展示了如何使用scikit-learn中的词袋(bag-of-words)方法,根据主题对文档进行分类。本例使用scipy.sparse中的稀疏矩阵来存储特征,并演示各种能够高效处理稀疏矩阵的分类器。
本例中使用的数据集是20个新闻组(20 newsgroups)数据集。scikit-learn会自动下载该数据集,并将其缓存到本地。
下面的条形图对比了各个分类器的准确率、训练时间(已归一化)和测试时间(已归一化)。
import logging
import sys
from optparse import OptionParser
from time import time

import matplotlib.pyplot as plt
import numpy as np

from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density
# Configure the root logger: INFO level, "timestamp level message" format.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
# Command-line options of the example. Boolean flags use "store_true" and
# therefore default to None when absent; --n_features defaults to 2 ** 16.
op = OptionParser()
op.add_option("--report",
              action="store_true", dest="print_report",
              help="Print a detailed classification report.")
op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")
op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")
op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class"
                   " for every classifier.")
op.add_option("--all_categories",
              action="store_true", dest="all_categories",
              help="Whether to use all categories or not.")
op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")
op.add_option("--n_features",
              action="store", type="int", default=2 ** 16,
              help="n_features when using the hashing vectorizer.")
op.add_option("--filtered",
              action="store_true",
              help="Remove newsgroup information that is easily overfit: "
                   "headers, signatures, and quoting.")
<Option at 0x7febca4f9320: --filtered>
def is_interactive():
    """Return True when the ``__main__`` module has no ``__file__`` attribute.

    The caller uses this to decide whether ``sys.argv`` should be ignored.
    """
    return not hasattr(sys.modules['__main__'], '__file__')
# Ignore sys.argv in an interactive session; otherwise parse the real
# command line (without the program name).
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if args:
    # Positional arguments are not accepted.
    op.error("this script takes no arguments.")
    # NOTE(review): optparse's error() is documented to exit on its own, so
    # this line should be unreachable; kept as a defensive fallback.
    sys.exit(1)

print(__doc__)
op.print_help()
print()
Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]
Options:
-h, --help show this help message and exit
--report Print a detailed classification report.
--chi2_select=SELECT_CHI2
Select some number of features using a chi-squared
test
--confusion_matrix Print the confusion matrix.
--top10 Print ten most discriminative terms per class for
every classifier.
--all_categories Whether to use all categories or not.
--use_hashing Use a hashing vectorizer.
--n_features=N_FEATURES
n_features when using the hashing vectorizer.
--filtered Remove newsgroup information that is easily overfit:
headers, signatures, and quoting.
# None is passed on to the dataset loader when --all_categories is given;
# otherwise only these four newsgroups are requested.
if opts.all_categories:
    categories = None
else:
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
# Parts of each post to strip when --filtered is given; empty tuple otherwise.
if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")
Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Load the train and test subsets with identical category selection,
# shuffling seed and removal settings.
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
# Class names, in the order used by the loaded training set.
target_names = data_train.target_names


def size_mb(docs):
    """Return the total UTF-8 encoded size of the strings in *docs*, in MB (1e6 bytes)."""
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6


data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(target_names))
2034 documents - 3.980MB (training set)
1353 documents - 2.867MB (test set)
4 categories
# Target labels for the training and test documents.
y_train, y_test = data_train.target, data_test.target
print("使用稀疏向量机从训练数据中提取特征")
t0 = time()
if opts.use_hashing:
    # The hashing vectorizer is only transformed, never fitted, here.
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    # The TF-IDF vectorizer is fitted on the training documents.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
使用稀疏向量机从训练数据中提取特征
done in 0.476004s at 8.360MB/s
n_samples: 2034, n_features: 33809
print("使用相同的矢量化器从测试数据中提取特征")
t0 = time()
# Only transform: the vectorizer created for the training data is reused.
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
使用相同的矢量化器从测试数据中提取特征
done in 0.311447s at 9.207MB/s
n_samples: 1353, n_features: 33809
# Mapping from column index to feature name; set to None in the hashing case.
if opts.use_hashing:
    feature_names = None
elif hasattr(vectorizer, "get_feature_names_out"):
    # Newer scikit-learn releases provide get_feature_names_out() and dropped
    # get_feature_names(). Convert to a list so that the plain truthiness
    # checks on feature_names used later keep working.
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# Optional feature selection: keep the opts.select_chi2 best features
# according to the chi-squared score, fitted on the training data only.
if opts.select_chi2:
    print("使用卡方检验提取 %d 个特征" % opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # Keep only the names of the selected columns.
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
# Convert to a NumPy array so the names can later be indexed with an
# array of indices.
if feature_names:
    feature_names = np.asarray(feature_names)
def trim(s):
    """Return *s* unchanged if it has at most 80 characters.

    Longer strings are cut to their first 77 characters followed by "...",
    giving a result of exactly 80 characters.
    """
    return s if len(s) <= 80 else s[:77] + "..."
# Benchmark classifiers

def benchmark(clf):
    """Fit *clf*, predict on the test set, print a report and return timings.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``opts``, ``feature_names`` and ``target_names``.

    Returns a tuple ``(clf_descr, score, train_time, test_time)`` where
    ``clf_descr`` is the part of ``str(clf)`` before the first ``(``,
    ``score`` is the accuracy on the test set and the two times are
    wall-clock seconds spent in ``fit`` and ``predict``.
    """
    print('_' * 80)
    print("训练: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("训练时间: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    # NOTE(review): this label reads "best time" although the value printed is
    # the prediction (test) time; left unchanged to match recorded output.
    print("最佳时间: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("准确率: %0.3f" % score)

    # Only estimators exposing coef_ get the dimensionality/density report.
    if hasattr(clf, 'coef_'):
        print("维数: %d" % clf.coef_.shape[1])
        print("密度: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("每个类的前十个词:")
            for i, label in enumerate(target_names):
                # Indices of the ten largest coefficients of row i.
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("分类报告:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))

    if opts.print_cm:
        print("混淆矩阵:")
        print(metrics.confusion_matrix(y_test, pred))

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
# Collected (description, score, train_time, test_time) tuples, one per run.
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "岭分类器"),
        (Perceptron(max_iter=50, tol=1e-3), "感知器"),
        (PassiveAggressiveClassifier(max_iter=50, tol=1e-3), "PAC分类器"),
        (KNeighborsClassifier(n_neighbors=10), "K近邻"),
        (RandomForestClassifier(n_estimators=100), "随机森林")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))
================================================================================
岭分类器
________________________________________________________________________________
训练:
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
max_iter=None, normalize=False, random_state=None, solver='sag',
tol=0.01)
训练时间: 0.202s
最佳时间: 0.002s
准确率: 0.897
维数: 33809
密度: 1.000000
================================================================================
感知器
________________________________________________________________________________
训练:
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
fit_intercept=True, max_iter=50, n_iter_no_change=5, n_jobs=None,
penalty=None, random_state=0, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
训练时间: 0.030s
最佳时间: 0.003s
准确率: 0.888
维数: 33809
密度: 0.255302
================================================================================
PAC分类器
________________________________________________________________________________
训练:
PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
early_stopping=False, fit_intercept=True,
loss='hinge', max_iter=50, n_iter_no_change=5,
n_jobs=None, random_state=None, shuffle=True,
tol=0.001, validation_fraction=0.1, verbose=0,
warm_start=False)
训练时间: 0.063s
最佳时间: 0.003s
准确率: 0.902
维数: 33809
密度: 0.700487
================================================================================
K近邻
________________________________________________________________________________
训练:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=10, p=2,
weights='uniform')
训练时间: 0.002s
最佳时间: 0.235s
准确率: 0.858
================================================================================
随机森林
________________________________________________________________________________
训练:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
训练时间: 1.752s
最佳时间: 0.084s
准确率: 0.822
# Benchmark a LinearSVC and an SGDClassifier once per penalty type.
for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s 罚项" % penalty.upper())
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
                                       tol=1e-3)))

    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                           penalty=penalty)))
================================================================================
L2 罚项
________________________________________________________________________________
训练:
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, loss='squared_hinge', max_iter=1000,
multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
verbose=0)
训练时间: 0.274s
最佳时间: 0.003s
准确率: 0.900
维数: 33809
密度: 1.000000
________________________________________________________________________________
训练:
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
random_state=None, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
训练时间: 0.050s
最佳时间: 0.003s
准确率: 0.899
维数: 33809
密度: 0.573353
================================================================================
L1 罚项
________________________________________________________________________________
训练:
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, loss='squared_hinge', max_iter=1000,
multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
verbose=0)
训练时间: 0.257s
最佳时间: 0.002s
准确率: 0.873
维数: 33809
密度: 0.005568
________________________________________________________________________________
训练:
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
random_state=None, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
训练时间: 0.187s
最佳时间: 0.003s
准确率: 0.882
维数: 33809
密度: 0.023049
# Benchmark SGDClassifier with the elastic-net penalty.
print('=' * 80)
print("弹性网络(Elastic Net)罚项")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                       penalty="elasticnet")))
================================================================================
弹性网络(Elastic Net)罚项
________________________________________________________________________________
训练:
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
power_t=0.5, random_state=None, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
训练时间: 0.295s
最佳时间: 0.003s
准确率: 0.897
维数: 33809
密度: 0.185956
# Benchmark NearestCentroid with its default parameters.
print('=' * 80)
print("不带阈值的Rocchio分类器")
results.append(benchmark(NearestCentroid()))
================================================================================
不带阈值的Rocchio分类器
________________________________________________________________________________
训练:
NearestCentroid(metric='euclidean', shrink_threshold=None)
训练时间: 0.007s
最佳时间: 0.002s
准确率: 0.855
# Benchmark the three naive Bayes variants.
print('=' * 80)
print("稀疏朴素贝叶斯分类器")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))
================================================================================
稀疏朴素贝叶斯分类器
________________________________________________________________________________
训练:
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
训练时间: 0.007s
最佳时间: 0.003s
准确率: 0.899
维数: 33809
密度: 1.000000
________________________________________________________________________________
训练:
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
训练时间: 0.010s
最佳时间: 0.008s
准确率: 0.884
维数: 33809
密度: 1.000000
________________________________________________________________________________
训练:
ComplementNB(alpha=0.1, class_prior=None, fit_prior=True, norm=False)
训练时间: 0.007s
最佳时间: 0.002s
准确率: 0.911
维数: 33809
密度: 1.000000
# Benchmark a two-step pipeline: an L1-penalised LinearSVC wrapped in
# SelectFromModel as the feature-selection step, followed by an
# L2-penalised LinearSVC as the classification step.
print('=' * 80)
print("基于l1的特征选择的LinearSVC")
results.append(benchmark(Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                    tol=1e-3))),
    ('classification', LinearSVC(penalty="l2"))])))
================================================================================
基于l1的特征选择的LinearSVC
________________________________________________________________________________
训练:
Pipeline(memory=None,
steps=[('feature_selection',
SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None,
dual=False,
fit_intercept=True,
intercept_scaling=1,
loss='squared_hinge',
max_iter=1000,
multi_class='ovr',
penalty='l1',
random_state=None,
tol=0.001, verbose=0),
max_features=None, norm_order=1, prefit=False,
threshold=None)),
('classification',
LinearSVC(C=1.0, class_weight=None, dual=True,
fit_intercept=True, intercept_scaling=1,
loss='squared_hinge', max_iter=1000,
multi_class='ovr', penalty='l2', random_state=None,
tol=0.0001, verbose=0))],
verbose=False)
训练时间: 0.277s
最佳时间: 0.002s
准确率: 0.880
# Display labels for the plot, keyed by estimator class name (the part of
# str(clf) before the first "(").
classifier_dic = {
    'RidgeClassifier': '岭分类器(Ridge)',
    'Perceptron': '感知器(Perceptron)',
    'PassiveAggressiveClassifier': 'PAC分类器',
    'KNeighborsClassifier': 'K近邻(KNN)',
    'RandomForestClassifier': '随机森林',
    'LinearSVC': '线性SVC',
    'SGDClassifier': 'SGD分类器',
    # Previously mislabelled with the LinearSVC label.
    'NearestCentroid': '最近质心(Rocchio)分类器',
    'MultinomialNB': '(多项式)稀疏朴素贝叶斯分类器',
    'BernoulliNB': '(伯努利)稀疏朴素贝叶斯分类器',
    'ComplementNB': '(补偿)稀疏朴素贝叶斯分类器',
    'Pipeline': '基于l1的特征选择的LinearSVC',
}
# One bar group per benchmark run.
indices = np.arange(len(results))

# Transpose the list of 4-tuples into four parallel lists.
results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
# Normalise both timings by their maximum so they share the score axis.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("模型对比")
plt.barh(indices, score, .2, label="得分(score)", color='navy')
plt.barh(indices + .3, training_time, .2, label="训练时间",
         color='c')
# Label corrected: this bar shows the (normalised) test time.
plt.barh(indices + .6, test_time, .2, label="测试时间", color='darkorange')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25, top=.95, bottom=.05)

for i, c in zip(indices, clf_names):
    # Fall back to the raw class name when no display label is registered.
    plt.text(-.3, i, classifier_dic.get(c, c))

plt.show()