朴素贝叶斯模型广泛应用于文本分类任务,包括互联网新闻分类、垃圾邮件筛选等。
导入数据:fetch_20newsgroups 在需要时会从互联网下载数据。
from sklearn.datasets import fetch_20newsgroups

# subset='all' requests both the training and the test portion of the corpus;
# the data is fetched from the internet when it is not already available.
news = fetch_20newsgroups(subset='all')

# Sanity check of the load: number of documents, then the first raw post.
# Call-style print with a single argument behaves the same on Python 2 and 3.
print(len(news.data))
print(news.data[0])
对20类新闻文本数据进行分割:随机抽取25%作为测试集,其余75%作为训练集。
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to
# the old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 25% of the documents for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)
将文本转化为特征向量
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: the vocabulary is learned from the training texts
# only, and the test texts are then encoded with that same fitted vocabulary.
# The right-hand side is evaluated left to right, so fitting happens before
# the test set is transformed.
vec = CountVectorizer()
X_train, X_test = vec.fit_transform(X_train), vec.transform(X_test)
利用朴素贝叶斯模型进行训练,并对测试集进行预测
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier with default settings on the
# training features; fit() returns the estimator itself, so construction and
# training can be chained.
mnb = MultinomialNB().fit(X_train, y_train)

# Predicted class labels for the held-out test documents.
y_predict = mnb.predict(X_test)
对模型的表现性能进行评估
from sklearn.metrics import classification_report

# Overall accuracy on the test set. The message is built as one string so the
# output is identical on Python 2 and 3; print('text', value) would print a
# tuple repr under Python 2.
print('The accuracy of Naive Bayes Classifier is %s' % mnb.score(X_test, y_test))

# Per-class report, labelled with the newsgroup names instead of the integer
# class ids.
print(classification_report(y_test, y_predict, target_names=news.target_names))