BiLSTM文本分类实践

BiLstm将前向和后向输出拼接起来,然后reduce_mean接个softmax即可实现文本分类(中间也可以过渡个全连接)。

实践发现:

(1) 带正则的损失函数表现更好;

(2) 基于词的表现没有基于字的表现好(差0.5个百分点)。

(3) 最后多加个全连接层,效果没有有效提升(实际调参也要尝试下,毕竟不同的数据集表现不同)。

 

基于tensorflow的实现代码

#!/usr/bin/python
# coding=utf8

import os
import numpy as np
from datetime import datetime
import tensorflow as tf
from sklearn import metrics

from nlp_utils import *

#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

class BiLSTM():
    """
    BiLSTM text classifier (TensorFlow 1.x graph mode).

    The forward and backward LSTM outputs are concatenated, mean-pooled
    over the time axis, and passed through a single dense layer whose
    softmax gives the class prediction.
    """
    def __init__(self, num_classes, max_seq_len, pretrained, embedding_pretrained, vocab_size, embedding_dim, epochs, learning_rate):
        """
        :param num_classes: number of target classes
        :param max_seq_len: fixed (padded) input sequence length
        :param pretrained: truthy -> initialize the embedding table from
                           ``embedding_pretrained`` instead of randomly
        :param embedding_pretrained: pretrained embedding matrix of shape
                           (vocab_size, embedding_dim); only read when
                           ``pretrained`` is truthy
        :param vocab_size: vocabulary size
        :param embedding_dim: embedding dimension (also used as the LSTM hidden size)
        :param epochs: number of training epochs
        :param learning_rate: Adam learning rate
        """
        self.num_classes = num_classes
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # BUG FIX: was hard-coded to `None`, which silently disabled the
        # pretrained-embedding path regardless of what the caller passed.
        self.pretrained = pretrained
        self.embedding_pretrained = embedding_pretrained
        self.epochs = epochs
        self.learning_rate = learning_rate

        self.input_x = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.l2_loss = tf.constant(0.0)  # kept for backward compatibility; not used below
        self.regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        self.inference()

    def inference(self):
        """
        Build the graph: embedding -> dropout -> BiLSTM -> mean-pool ->
        dropout -> dense, plus loss/optimizer/accuracy ops.
        """
        # Word embedding lookup.
        with tf.name_scope("embedding"):
            if self.pretrained:
                # BUG FIX: the original created the variable and then called
                # `embedding.assign(...)` at graph-construction time. That
                # only builds an assign op which is never run, so the
                # pretrained weights were never actually loaded. Using an
                # initializer makes `global_variables_initializer` load them.
                embedding = tf.get_variable(
                    "embedding",
                    initializer=tf.constant(self.embedding_pretrained, dtype=tf.float32))
            else:
                embedding = tf.get_variable("embedding", [self.vocab_size, self.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x) # batch_size * max_seq_len * embedding_dim
            embedding_inputs = tf.nn.dropout(embedding_inputs, self.keep_prob)

        with tf.name_scope("lstm"):
            lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
            (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,
                                                                            lstm_bw_cell,
                                                                            embedding_inputs,
                                                                            dtype=tf.float32,
                                                                            time_major=False,
                                                                            scope=None)
            # Concatenate the two directions, then mean-pool over time.
            bilstm_out = tf.concat([output_fw, output_bw], axis=2)
            bilstm_out = tf.reduce_mean(bilstm_out, axis=1)

        with tf.name_scope("dropout"):
            rnn_drop = tf.nn.dropout(bilstm_out, self.keep_prob)

        with tf.name_scope("score"):
            #fc = tf.layers.dense(bilstm_out, self.embedding_dim, activation=tf.nn.relu, name='fc1') # batch_size * hidden_dim
            #fc_drop = tf.layers.dropout(fc, self.keep_prob)

            # Classifier. BUG FIX: attach the L2 regularizer to the dense
            # layer — previously `self.regularizer` was created but never
            # used by any layer, so `tf.losses.get_regularization_loss()`
            # below always returned 0 and the loss was effectively
            # unregularized.
            self.logits = tf.layers.dense(rnn_drop, self.num_classes,
                                          kernel_regularizer=self.regularizer,
                                          name='fc2') # batch_size * num_classes
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1, name="pred")

        with tf.name_scope("loss"):
            # Cross-entropy loss.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)

            # Collected from the dense layer's kernel_regularizer above.
            l2_loss = tf.losses.get_regularization_loss()
            self.loss = tf.reduce_mean(cross_entropy, name="loss")
            self.loss += l2_loss

            # Optimizer.
            self.optim = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy over the batch.
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="acc")


    def batch_iter(self, x, y, batch_size=32, shuffle=True):
        """
        Yield mini-batches of (x, y).

        :param x: feature array (must be indexable by a numpy index array)
        :param y: label array, aligned with ``x``
        :param batch_size: size of each batch (the last one may be smaller)
        :param shuffle: whether to shuffle the data for this pass
        :return: generator of (x_batch, y_batch) tuples
        """
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1

        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_len))
            x_shuffle = x[shuffle_indices]
            y_shuffle = y[shuffle_indices]
        else:
            x_shuffle = x
            y_shuffle = y
        for i in range(num_batch):
            start_index = i * batch_size
            end_index = min((i + 1) * batch_size, data_len)
            yield (x_shuffle[start_index:end_index], y_shuffle[start_index:end_index])

    def evaluate(self, sess, x_, y_):
        """
        Compute mean loss and accuracy over a validation set.

        Per-batch metrics are weighted by batch size so that the partial
        last batch does not skew the averages.
        """
        data_len = len(x_)
        batch_eval = self.batch_iter(x_, y_, 64)
        total_loss = 0.0
        total_acc = 0.0
        for x_batch, y_batch in batch_eval:
            batch_len = len(x_batch)
            # keep_prob=1: disable dropout during evaluation.
            feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                        self.keep_prob: 1}
            loss, acc = sess.run([self.loss, self.acc], feed_dict=feed_dict)
            total_loss += loss * batch_len
            total_acc += acc * batch_len

        return total_loss / data_len, total_acc / data_len

    def fit(self, train_x, train_y, val_x, val_y, batch_size, keep_prob):
        """
        Training loop: every 1000 steps, evaluate on the validation set and
        checkpoint the model whenever validation accuracy improves.
        """
        train_steps = 0
        best_acc_val = 0.0  # best validation accuracy seen so far

        saver = tf.train.Saver(max_to_keep=10)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())  # initialize variables
            """
            # To resume training from a checkpoint, uncomment this block.
            ckpt = tf.train.get_checkpoint_state('./model/')  # pass the directory containing the checkpoint file, not './model' without the slash
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)  # model_checkpoint_path points at the most recent save
                print("Model restored...")
            else:
                print('No Model')
            """
            for epoch in range(self.epochs):
                batch_train = self.batch_iter(train_x, train_y, batch_size)
                for x_batch, y_batch in batch_train:
                    train_steps += 1
                    feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                                self.keep_prob: keep_prob}
                    _, train_loss, train_acc = sess.run([self.optim, self.loss,
                                                        self.acc], feed_dict=feed_dict)
                    if train_steps % 1000 == 0:
                        val_loss, val_acc = self.evaluate(sess, val_x, val_y)
                        if val_acc > best_acc_val:
                            # New best — keep this checkpoint.
                            best_acc_val = val_acc
                            saver.save(sess, "./modellstm/lstm", global_step=train_steps)

                        msg = 'epoch:%d/%d,train_steps:%d,train_loss:%.4f,train_acc:%.4f,val_loss:%.4f,val_acc:%.4f'
                        print(msg % (epoch+1, self.epochs, train_steps, train_loss, train_acc, val_loss, val_acc))

if __name__ == "__main__":
    train_file = "./train.txt"
    val_file = "./val.txt"

    # Load the raw corpora.
    data_set_train = load_data2(train_file)
    data_set_val = load_data2(val_file)

    # Build the vocabulary/label dictionaries from the training data
    # and persist them for later inference.
    word2id_dict, label2id_dict = build_dict(data_set_train)
    save_dict(word2id_dict, "word2id_dict.txt")
    save_dict(label2id_dict, "label2id_dict.txt")

    # Hyperparameters.
    batch_size = 64
    max_seq_len = 64
    num_classes = len(label2id_dict)
    vocab_size = len(word2id_dict)
    embedding_dim = 128
    learning_rate = 0.001
    epochs = 20
    keep_prob = 0.5

    # Convert tokens and labels to padded id sequences.
    train_x, train_y = convert_corpus_to_id_with_padding(
        data_set_train, word2id_dict, label2id_dict, max_seq_len, num_classes)
    val_x, val_y = convert_corpus_to_id_with_padding(
        data_set_val, word2id_dict, label2id_dict, max_seq_len, num_classes)

    lstm_model = BiLSTM(num_classes, max_seq_len, False, None,
                        vocab_size, embedding_dim, epochs, learning_rate)
    lstm_model.fit(train_x, train_y, val_x, val_y, batch_size, keep_prob)

 

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页