BiLstm将前向和后向输出拼接起来,然后reduce_mean接个softmax即可实现文本分类(中间也可以过渡个全连接)。
实践发现:
(1) 带正则的损失函数表现更好;
(2) 基于词的表现没有基于字的表现好(差0.5个百分点);
(3) 最后多加个全连接层,效果没有有效提升(实际调参也要尝试下,毕竟不同的数据集表现不同)。
基于tensorflow的实现代码
#!/usr/bin/python
# coding=utf8
import os
import numpy as np
from datetime import datetime
import tensorflow as tf
from sklearn import metrics
from nlp_utils import *
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
class BiLSTM():
    """Bi-directional LSTM text classifier (TensorFlow 1.x static graph).

    Forward and backward LSTM outputs are concatenated, mean-pooled over the
    time axis, and fed through a dense softmax layer.  Loss is cross-entropy
    plus L2 regularization on the classifier kernel.
    """

    def __init__(self, num_classes, max_seq_len, pretrained, embedding_pretrained,
                 vocab_size, embedding_dim, epochs, learning_rate):
        """
        :param num_classes: number of target classes
        :param max_seq_len: fixed (padded) sequence length
        :param pretrained: if truthy, initialize the embedding matrix from
            ``embedding_pretrained``
        :param embedding_pretrained: pretrained embedding matrix of shape
            (vocab_size, embedding_dim), or None
        :param vocab_size: vocabulary size
        :param embedding_dim: embedding dimension (also used as LSTM hidden size)
        :param epochs: number of training epochs
        :param learning_rate: Adam learning rate
        """
        self.num_classes = num_classes
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # BUG FIX: the original assigned None here, silently discarding the
        # caller's `pretrained` flag so pretrained embeddings could never load.
        self.pretrained = pretrained
        self.embedding_pretrained = embedding_pretrained
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.input_x = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        self.inference()

    def inference(self):
        """Build the graph: embedding -> BiLSTM -> mean pool -> dense softmax,
        plus the loss, optimizer and accuracy ops.
        """
        # Embedding lookup
        with tf.name_scope("embedding"):
            if self.pretrained:
                # BUG FIX: the original called embedding.assign(...) without
                # ever running the returned op, so the pretrained vectors were
                # never actually loaded.  Initializing the variable from a
                # constant is the reliable way in TF1.
                embedding = tf.get_variable(
                    "embedding",
                    initializer=tf.constant(self.embedding_pretrained, dtype=tf.float32))
            else:
                embedding = tf.get_variable("embedding", [self.vocab_size, self.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)  # batch_size * max_seq_len * embedding_dim
            embedding_inputs = tf.nn.dropout(embedding_inputs, self.keep_prob)
        with tf.name_scope("lstm"):
            lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
            (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(
                lstm_fw_cell,
                lstm_bw_cell,
                embedding_inputs,
                dtype=tf.float32,
                time_major=False,
                scope=None)
            # Concatenate forward/backward outputs, then mean-pool over time.
            bilstm_out = tf.concat([output_fw, output_bw], axis=2)
            bilstm_out = tf.reduce_mean(bilstm_out, axis=1)
        with tf.name_scope("dropout"):
            rnn_drop = tf.nn.dropout(bilstm_out, self.keep_prob)
        with tf.name_scope("score"):
            # BUG FIX: attach the L2 regularizer to the dense kernel.  The
            # original created self.regularizer but never used it, so
            # tf.losses.get_regularization_loss() was always 0 and the loss
            # was effectively unregularized.
            self.logits = tf.layers.dense(rnn_drop, self.num_classes,
                                          kernel_regularizer=self.regularizer,
                                          name='fc2')  # batch_size * num_classes
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1, name="pred")
        with tf.name_scope("loss"):
            # Cross-entropy loss plus accumulated L2 regularization.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            l2_loss = tf.losses.get_regularization_loss()
            self.loss = tf.reduce_mean(cross_entropy, name="loss")
            self.loss += l2_loss
            # optimizer
            self.optim = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        with tf.name_scope("accuracy"):
            # Fraction of predictions matching the one-hot labels.
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="acc")

    def batch_iter(self, x, y, batch_size=32, shuffle=True):
        """Yield mini-batches of (x, y).

        :param x: feature array (indexable by a numpy index array)
        :param y: label array, same length as x
        :param batch_size: size of each batch (last batch may be smaller)
        :param shuffle: whether to shuffle the data each call (once per epoch)
        :return: generator of (x_batch, y_batch) tuples
        """
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_len))
            x_shuffle = x[shuffle_indices]
            y_shuffle = y[shuffle_indices]
        else:
            x_shuffle = x
            y_shuffle = y
        for i in range(num_batch):
            start_index = i * batch_size
            end_index = min((i + 1) * batch_size, data_len)
            yield (x_shuffle[start_index:end_index], y_shuffle[start_index:end_index])

    def evaluate(self, sess, x_, y_):
        """Return (mean loss, mean accuracy) over the validation data,
        weighting each batch by its size so the last partial batch is
        counted correctly.  Dropout is disabled (keep_prob=1).
        """
        data_len = len(x_)
        batch_eval = self.batch_iter(x_, y_, 64)
        total_loss = 0.0
        total_acc = 0.0
        for x_batch, y_batch in batch_eval:
            batch_len = len(x_batch)
            feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                         self.keep_prob: 1}
            loss, acc = sess.run([self.loss, self.acc], feed_dict=feed_dict)
            total_loss += loss * batch_len
            total_acc += acc * batch_len
        return total_loss / data_len, total_acc / data_len

    def fit(self, train_x, train_y, val_x, val_y, batch_size, keep_prob):
        """Train the model, evaluating on the validation set every 1000 steps
        and checkpointing whenever validation accuracy improves.

        :param train_x, train_y: training features / one-hot labels
        :param val_x, val_y: validation features / one-hot labels
        :param batch_size: training batch size
        :param keep_prob: dropout keep probability during training
        """
        train_steps = 0
        best_acc_val = 0.0  # best validation accuracy seen so far
        saver = tf.train.Saver(max_to_keep=10)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())  # initialize variables
            """
            # Uncomment to resume training from an existing checkpoint.
            # NOTE(review): this restore path ('./model/') does not match the
            # save path below ('./modellstm/') — align them before enabling.
            ckpt = tf.train.get_checkpoint_state('./model/')
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                print("Model restored...")
            else:
                print('No Model')
            """
            for epoch in range(self.epochs):
                batch_train = self.batch_iter(train_x, train_y, batch_size)
                for x_batch, y_batch in batch_train:
                    train_steps += 1
                    feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                                 self.keep_prob: keep_prob}
                    _, train_loss, train_acc = sess.run([self.optim, self.loss,
                                                         self.acc], feed_dict=feed_dict)
                    if train_steps % 1000 == 0:
                        val_loss, val_acc = self.evaluate(sess, val_x, val_y)
                        if val_acc > best_acc_val:
                            # Keep only the best-performing checkpoints.
                            best_acc_val = val_acc
                            saver.save(sess, "./modellstm/lstm", global_step=train_steps)
                        msg = 'epoch:%d/%d,train_steps:%d,train_loss:%.4f,train_acc:%.4f,val_loss:%.4f,val_acc:%.4f'
                        print(msg % (epoch+1, self.epochs, train_steps, train_loss, train_acc, val_loss, val_acc))
if __name__ == "__main__":
train_file = "./train.txt"
val_file = "./val.txt"
data_set_train = load_data2(train_file) # 加载数据
data_set_val = load_data2(val_file) # 加载数据
word2id_dict, label2id_dict = build_dict(data_set_train) # 构建dict
save_dict(word2id_dict, "word2id_dict.txt")
save_dict(label2id_dict, "label2id_dict.txt")
batch_size = 64
max_seq_len = 64
num_classes = len(label2id_dict)
vocab_size = len(word2id_dict)
embedding_dim = 128
learning_rate = 0.001
epochs = 20
keep_prob = 0.5
train_x, train_y = convert_corpus_to_id_with_padding(data_set_train, word2id_dict, label2id_dict, max_seq_len, num_classes)
val_x, val_y = convert_corpus_to_id_with_padding(data_set_val, word2id_dict, label2id_dict, max_seq_len, num_classes)
lstm_model = BiLSTM(num_classes, max_seq_len, False, None, vocab_size, embedding_dim, epochs, learning_rate)
lstm_model.fit(train_x, train_y, val_x, val_y, batch_size, keep_prob)