# 将LSTM与word2vec结合实现中文自动写作
# (Combine LSTM with word2vec for Chinese automatic text generation)
# 载入所需的工具包
import jieba
from gensim.models.word2vec import Word2Vec
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
# Load the raw novel: one text line per row, in a single column "txt".
import pandas as pd  # duplicate of the top-of-file import; kept for cell independence
raw = pd.read_table('../data/金庸-射雕英雄传txt精校版.txt',
                    names=['txt'], encoding="GBK")
# Helpers used to flag chapter-heading lines.
def m_head(tmpstr):
    """Return the first character of *tmpstr* ('' when empty)."""
    return tmpstr[0] if tmpstr else ""
def m_mid(tmpstr):
    """Return the index of the substring "回 " in *tmpstr* (-1 if absent)."""
    return tmpstr.find("回 ")
# Derive the three heading-detection features in one assign call:
# first character, position of "回 ", and line length.
raw = raw.assign(head=raw['txt'].apply(m_head),
                 mid=raw['txt'].apply(m_mid),
                 len=raw['txt'].apply(len))
# --- Chapter numbering ---
# A line counts as a chapter heading when it starts with "第", contains
# "回 ", and is shorter than 30 characters.  Once the appendix marker is
# reached (after chapter 40) everything is re-labelled chapter 0.
chapnum = 0
for i in range(len(raw)):
    is_heading = (raw['head'][i] == "第"
                  and raw['mid'][i] > 0
                  and raw['len'][i] < 30)
    if is_heading:
        chapnum += 1
    if chapnum >= 40 and raw['txt'][i] == "附录一:成吉思汗家族":
        chapnum = 0
    raw.loc[i, 'chap'] = chapnum
# Remove the temporary helper columns.
raw.drop(columns=['head', 'mid', 'len'], inplace=True)
raw
# Whole sentences/paragraphs are the natural analysis unit here:
# tokenize every raw line with jieba.
corpus = [jieba.lcut(line) for line in raw['txt']]
print(len(corpus))
corpus[:3]
# Train word2vec embeddings on the tokenized corpus; larger vector sizes
# mean longer downstream LSTM training.  An external corpus could be used
# here for broader coverage.
# NOTE(review): `size=` is the pre-4.0 gensim keyword; gensim >= 4 renamed
# it to `vector_size=` — confirm the installed version before running.
w2c_model = Word2Vec(corpus, size=100, window=5, min_count=5)
w2c_model.wv['郭啸天']
# Flatten the tokenized corpus back into one long stream of words.
raw_input = []
for sentence in corpus:
    raw_input.extend(sentence)
print(len(raw_input))
raw_input[:10]
# Vocabulary the model actually kept (min_count=5 pruned rare words).
# NOTE(review): `wv.vocab` is the pre-4.0 gensim attribute; gensim >= 4
# exposes `wv.key_to_index` instead — confirm the installed version.
vocab = w2c_model.wv.vocab
vocab
# Drop the words that min_count already removed from the model, so every
# remaining token in the stream has an embedding.
text_stream = [word for word in raw_input if word in vocab]
print(len(text_stream))
text_stream[:10]
# Build the supervised pairs: each 10-word window predicts the word that
# follows it, with both sides expressed as word2vec vectors.
seq_length = 10  # context window length fed to the LSTM
x, y = [], []
for start in range(len(text_stream) - seq_length):
    window = text_stream[start:start + seq_length]
    target = text_stream[start + seq_length]
    x.append(np.array([w2c_model.wv[word] for word in window]))
    y.append(w2c_model.wv[target])
len(x)
x[0][0]
y[0]
# Reshape to the layout Keras LSTMs expect: [samples, timesteps, features],
# one 100-dim word2vec vector per timestep.
x = np.array(x).reshape(-1, seq_length, 100)
y = np.array(y).reshape(-1, 100)
# --- LSTM next-word-vector regressor ---
model = Sequential()
model.add(LSTM(128, input_shape=(seq_length, 100)))
model.add(Dropout(0.2))
# FIX: the regression target is a word2vec embedding whose components are
# unbounded and frequently negative; the original sigmoid squashed the
# output into (0, 1) and could never reach those targets under MSE.
# Dense's default activation is linear, which is the correct choice here.
model.add(Dense(100))
model.compile(loss='mse', optimizer='adam')
model.fit(x, y, epochs=5, batch_size=64)
model.summary()
model.save_weights('LSTM.h5')  # weights stored in HDF5 format
model.load_weights('LSTM.h5')
model.fit(x, y, epochs=10)  # continue training from the restored weights
# --- Text generation helpers ---
def predict_next(input_array):
    """Predict the next word's embedding from a window of word vectors.

    Reshapes *input_array* to (batch, seq_length, 100) and returns the
    model's predicted 100-dim vector(s).
    """
    batch = np.reshape(input_array, (-1, seq_length, 100))
    return model.predict(batch)
def string_to_index(raw_input):
    """Tokenize *raw_input*, keep in-vocabulary words, and return the
    word2vec vectors of the last seq_length surviving tokens.

    NOTE(review): if fewer than seq_length tokens survive the vocabulary
    filter, fewer vectors are returned and the downstream reshape will
    fail — confirm callers always supply long-enough seed text.
    """
    kept = [word for word in jieba.lcut(raw_input) if word in vocab]
    return [w2c_model.wv[word] for word in kept[-seq_length:]]
def y_to_word(y):
    """Map a predicted embedding back to its single nearest vocabulary word."""
    return w2c_model.wv.most_similar(positive=y, topn=1)
def generate_article(init, rounds=50):
    """Extend the seed text *init* by *rounds* predicted words.

    Each round re-tokenizes the growing text, predicts the next word's
    embedding, and appends the nearest vocabulary word.
    """
    text = init.lower()
    for _ in range(rounds):
        best = y_to_word(predict_next(string_to_index(text)))
        text += best[0][0]  # (word, similarity) pairs; keep the word only
    return text
# Seed the generator with an opening passage from the novel and print
# the 50-word continuation.
init = '郭啸天、杨铁心越听越怒。郭啸天道:“靖康年间徽钦二帝被金兵掳去这件大耻,我们'
article = generate_article(init)
print(article)