Text classification in practice: sklearn and NLTK implementations
import pandas as pd
raw = pd.read_table('../data/金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding='GBK')
# Helper columns for chapter-heading detection
def m_head(tmpstr):
    return tmpstr[:1]          # first character of the paragraph

def m_mid(tmpstr):
    return tmpstr.find('回 ')  # position of the '回 ' chapter marker, -1 if absent

raw['head'] = raw.txt.apply(m_head)
raw['mid'] = raw.txt.apply(m_mid)
raw['len'] = raw.txt.apply(len)
# Chapter detection: a heading line starts with '第', contains '回 ', and is short
chapnum = 0
for i in range(len(raw)):
    if raw['head'][i] == '第' and raw['mid'][i] > 0 and raw['len'][i] < 30:
        chapnum += 1
    if chapnum >= 40 and raw['txt'][i] == '附录一:成吉思汗家族':
        chapnum = 0
    raw.loc[i, 'chap'] = chapnum
# Drop the temporary helper columns
del raw['head']
del raw['mid']
del raw['len']
raw
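# Optional sanity check (not part of the original flow): count paragraphs per detected
# chapter to confirm the heading heuristic above worked as intended
raw['chap'].value_counts().sort_index()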
# Extract the paragraphs of the first two chapters from the raw corpus DataFrame
raw12 = raw[raw.chap.isin([1,2])]
raw12
# Keep only paragraphs longer than 50 characters
raw12ana = raw12[raw12.txt.apply(len) > 50].copy()  # .copy() avoids SettingWithCopyWarning when adding columns later
raw12ana.reset_index(drop=True, inplace=True)
print(len(raw12ana))
raw12ana.head()
# Word segmentation and preprocessing
import jieba
cuttxt = lambda x: " ".join(jieba.lcut(x))  # no extra filtering here, so that sentiment-bearing words are kept
raw12ana['cleantxt'] = raw12ana.txt.apply(cuttxt)
raw12ana.head()
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer()
wordmtx = countvec.fit_transform(raw12ana.cleantxt)
wordmtx
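# Optional: inspect the learned vocabulary. get_feature_names_out() exists in
# sklearn >= 1.0; on older versions use countvec.get_feature_names() instead.
print(wordmtx.shape)  # (n_paragraphs, n_terms)
print(countvec.get_feature_names_out()[:20])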
# Split the data into training and test sets (70% train / 30% test)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(wordmtx, raw12ana.chap, test_size=0.3, random_state=111)
x_train
# Fit a multinomial naive Bayes classifier
from sklearn import naive_bayes
NBmodel = naive_bayes.MultinomialNB()
# Fit the model on the training set
NBmodel.fit(x_train, y_train)
# Predict on the test set
x_test
NBmodel.predict(x_test)
# Model evaluation
print('Training set:', NBmodel.score(x_train, y_train), 'Test set:', NBmodel.score(x_test, y_test))
from sklearn.metrics import classification_report
print(classification_report(y_test, NBmodel.predict(x_test)))
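# Optional extension (not in the original): a confusion matrix shows directly which
# chapter gets mistaken for which
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, NBmodel.predict(x_test)))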
# Classification with a logistic regression model
from sklearn.linear_model import LogisticRegression
# Define the logistic regression model
logitmodel = LogisticRegression()
# Fit the model
logitmodel.fit(x_train, y_train)
print(classification_report(y_test, logitmodel.predict(x_test)))
# Prediction on new text
# Convert the text to be classified into exactly the same document-term matrix format
# used when the model was built, then predict as usual
# countvec.vocabulary_
# Sentence to classify:
string = "杨铁心和包惜弱收养穆念慈"
words = " ".join(jieba.lcut(string))
words_vecs = countvec.transform([words])  # transform() expects an iterable, hence the one-element list
words_vecs
# Predict the chapter label
NBmodel.predict(words_vecs)
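# Optional refactor (a sketch, not part of the original code): wrapping the vectorizer
# and the classifier in a sklearn Pipeline lets the model be fitted on, and predict
# from, segmented text directly, so new sentences no longer need a separate transform() step
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import jieba

nb_pipe = Pipeline([('vect', CountVectorizer()), ('nb', MultinomialNB())])
nb_pipe.fit(raw12ana.cleantxt, raw12ana.chap)  # fit on all labelled paragraphs
print(nb_pipe.predict([" ".join(jieba.lcut("杨铁心和包惜弱收养穆念慈"))]))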
# NLTK implementation: rebuild the chapter-labelled corpus, then classify with nltk
import pandas as pd
raw = pd.read_table('../data/金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding='GBK')
# Helper columns for chapter-heading detection
def m_head(tmpstr):
    return tmpstr[:1]          # first character of the paragraph

def m_mid(tmpstr):
    return tmpstr.find('回 ')  # position of the '回 ' chapter marker, -1 if absent

raw['head'] = raw.txt.apply(m_head)
raw['mid'] = raw.txt.apply(m_mid)
raw['len'] = raw.txt.apply(len)
# Chapter detection: a heading line starts with '第', contains '回 ', and is short
chapnum = 0
for i in range(len(raw)):
    if raw['head'][i] == '第' and raw['mid'][i] > 0 and raw['len'][i] < 30:
        chapnum += 1
    if chapnum >= 40 and raw['txt'][i] == '附录一:成吉思汗家族':
        chapnum = 0
    raw.loc[i, 'chap'] = chapnum
# Drop the temporary helper columns
del raw['head']
del raw['mid']
del raw['len']
rawgrp = raw.groupby('chap')
chapter = rawgrp.agg(sum)  # with only string columns left, sum concatenates the paragraphs of each chapter
chapter = chapter[1:]      # drop chapter 0 (front matter and appendix)
chapter
import jieba
# Define the word-segmentation and stop-word-removal function
stop_list = list(pd.read_csv('../data/停用词.txt', names=['w'], sep='aaa', encoding='utf-8').w)  # sep='aaa' never matches, so each line is read whole
stop_list
def m_cut(intxt):
    return [w for w in jieba.cut(intxt) if w not in stop_list and len(w) > 1]
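# Quick usage check (optional, not in the original): segment a short sentence and confirm
# that stop words and single-character tokens are dropped
print(m_cut('杨铁心和包惜弱收养穆念慈'))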
# For simplicity, each chapter is treated directly as one analysis unit here
import nltk
from nltk import FreqDist
# Build the full term-frequency dictionaries; this could also be done in a loop
fdist1 = FreqDist(m_cut(chapter.txt[1]))
fdist2 = FreqDist(m_cut(chapter.txt[2]))
fdist3 = FreqDist(m_cut(chapter.txt[3]))
fdist1
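# Optional: the most frequent terms per chapter give a quick feel for the features the
# classifier will rely on
fdist1.most_common(10)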
from nltk.classify import NaiveBayesClassifier
training_data = [(fdist1, 'chap1'), (fdist2, 'chap2'), (fdist3, 'chap3')]
training_data
# Train the classifier
NLTKmodel = NaiveBayesClassifier.train(training_data)
NLTKmodel
print(NLTKmodel.classify(FreqDist(m_cut('杨铁心收养穆念慈'))))
print(NLTKmodel.classify(FreqDist(m_cut('钱塘江 日日夜夜'))))
# Assess the model fit
nltk.classify.accuracy(NLTKmodel, training_data)  # accuracy, evaluated on the training data itself
NLTKmodel.show_most_informative_features(5)  # likelihood ratios: which features are most informative
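# Optional extension (a sketch, not in the original): evaluate on held-out paragraphs
# instead of the three training documents, using the paragraph-level corpus built above
import random

para = raw[raw.chap.isin([1, 2, 3])]
labeled = [(FreqDist(m_cut(t)), 'chap%d' % int(c))
           for t, c in zip(para.txt, para.chap) if len(t) > 50]
random.seed(111)
random.shuffle(labeled)
split = int(len(labeled) * 0.7)
para_model = NaiveBayesClassifier.train(labeled[:split])
print(nltk.classify.accuracy(para_model, labeled[split:]))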