輿情監控系統——step2.CNN-基於tensorFlow實現


中間經過了漫長的期末考試周,第二步拖了半個多月,終於把第二步做好了,使用了兩種方法,現在我先主要介紹基於深度學習的方法GitHub代碼點擊此處

數據集選擇

一開始數據集大概每類300條,准確率只有86%左右。文本分類要求數據量足夠,才能訓練出合適的模型。我選擇數據集的過程中經歷了很多波折,最后使用的是清華的THUCNews,我覺得是我能找到的最優的數據集了,關於數據集我專門寫了一個博文,請點這里
最后使用的數據格式如下,因為原始數據量太大了,只抽取了一部分。

這里寫圖片描述

抽取、整理數據

存放在text文件夾中,其中涉及兩個模塊:

  • copyData.py: 用於從每個分類拷貝1400個文件。
  • cnews_group.py:用於將多個文件整合到一個文件中。

從每個分類拷貝1400個文件

數據使用:

訓練集: 1100*7
驗證集: 100*7
測試集: 200*7

代碼如下
import os
import glob
import shutil
import random

# Root folder of the raw corpus; each entry inside it is treated as one category.
basePath = "/Users/alicelmx/Documents/實習/文本分類/基於深度學習/SogouData/ClassFile/"
# Destination root that receives the sampled files.
newPath = "/Users/alicelmx/Documents/實習/文本分類/基於深度學習/text/"

# One "<basePath><entry>/" path per entry of basePath, skipping entries whose
# name starts with "." (hidden files/folders).
listPath = [
    basePath + str(entry) + "/"
    for entry in os.listdir(basePath)
    if not str(entry).startswith(".")
]
# The string below records the intended split per category
# (train 1100, validation 100, test 200) across 7 categories.
""" 訓練集: 1100*7 驗證集: 100*7 測試集: 200*7 """
def copy(listPath,MAXCOUNT=1400):
    """Copy a random selection of ``*.txt`` files out of each source directory.

    For every directory in ``listPath`` a same-named directory is created
    under the module-level ``newPath`` and ``MAXCOUNT`` randomly drawn files
    are copied into it.

    Args:
        listPath: source directory paths, each ending with "/".
        MAXCOUNT: number of files to draw per directory.
    """
    for path in listPath:
        # The directory name is the path component just before the trailing "/".
        newdir = newPath + str(path).split("/")[-2]
        print("====================")
        print(newdir)
        print("====================")

        # makedirs creates missing parents and tolerates an existing directory.
        os.makedirs(newdir, exist_ok=True)
        files = glob.glob(path + "*.txt")

        if not files:
            # random.choice raises IndexError on an empty sequence, so skip
            # directories without any .txt file instead of crashing.
            print("no .txt files found in", path)
            continue

        if len(files) < MAXCOUNT:
            # Not enough files: draw with replacement.
            resultlist = [random.choice(files) for _ in range(MAXCOUNT)]
        else:
            resultlist = random.sample(files, MAXCOUNT)

        # Repeated picks would only overwrite the same destination file, so
        # each distinct file is copied once.
        for file in set(resultlist):
            shutil.copy(file, newdir)

if __name__ == "__main__":
    # Script entry point: sample files for every discovered category directory.
    copy(listPath)
    print("抽取成功!")

將多個文件整合到一個文件中

目標

創建sougou.train.txt(訓練集1100*7)、sougou.test.txt(測試集200*7)、sougou.val.txt(驗證集100*7),其中每一個文件包含每個類下的部分文件,存放於data文件夾中。
這里寫圖片描述

代碼實現
#!/usr/bin/python
# -*- coding: utf-8 -*-
""" 將文本整合到 train、test、val 三個文件中 """
import  os

# Input root passed to save_file(): each sub-directory is treated as one category.
basePath = "/Users/alicelmx/Documents/實習/文本分類/基於深度學習/text/"
# Output directory; save_file() writes sougou.train/test/val.txt directly under it.
trainPath = "/Users/alicelmx/Documents/實習/文本分類/基於深度學習/data/"

def _read_file(filename):
    """Return the file's text as a single line.

    Newlines, tabs and ideographic spaces (U+3000) are removed.
    """
    strip_table = str.maketrans('', '', '\n\t\u3000')
    with open(filename, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return text.translate(strip_table)

def save_file(dirname):
    """Merge the per-category text files into train/test/val files.

    Every sub-directory of ``dirname`` is treated as one category. Within a
    category the first 1100 files go to ``sougou.train.txt``, the next 200 to
    ``sougou.test.txt`` and the remainder to ``sougou.val.txt``; all three are
    written under the module-level ``trainPath``. Each output line has the
    form: category, a tab, the file content collapsed to one line.

    Args:
        dirname: root directory of the source data.
    """
    # Context managers guarantee the outputs are flushed and closed even if
    # reading one of the source files raises.
    with open(trainPath + "sougou.train.txt", 'w', encoding='utf-8') as f_train, \
            open(trainPath + "sougou.test.txt", 'w', encoding='utf-8') as f_test, \
            open(trainPath + "sougou.val.txt", 'w', encoding='utf-8') as f_val:

        # Sorted traversal makes the split reproducible; os.listdir order is
        # arbitrary.
        for category in sorted(os.listdir(dirname)):
            catdir = os.path.join(dirname, category)
            if not os.path.isdir(catdir):
                continue
            # Skip hidden entries (e.g. .DS_Store) so they are not read as samples.
            files = sorted(
                name for name in os.listdir(catdir) if not name.startswith(".")
            )
            print(len(files))

            for count, cur_file in enumerate(files):
                content = _read_file(os.path.join(catdir, cur_file))

                if count < 1100:
                    target = f_train
                elif count < 1300:
                    target = f_test
                else:
                    target = f_val
                target.write(category + "\t" + content + "\n")

            print("===============")
            print("finish:",category)
            print("===============")

if __name__ == '__main__':
    save_file(basePath)
    # Report the number of lines in each generated file; the handles are
    # closed explicitly instead of being left to the garbage collector.
    for merged_name in ("sougou.train.txt", "sougou.test.txt", "sougou.val.txt"):
        with open(trainPath + merged_name, 'r', encoding='utf-8') as merged:
            print(len(merged.readlines()))

數據預處理

data/cnews_loader.py為數據的預處理文件。

  1. read_file(): 讀取文件數據;
  2. build_vocab():構建詞匯表,使用字符級的表示,這一函數會將詞匯表存儲下來,避免每一次重復處理;
  3. read_vocab():讀取上一步存儲的詞匯表,轉換為{詞:id}表示;
  4. read_category(): 將分類目錄固定,轉換為{類別: id}表示;
  5. to_words(): 將一條由id表示的數據重新轉換為文字;
  6. process_file(): 將數據集從文字轉換為固定長度的id序列表示;
  7. batch_iter(): 為神經網絡的訓練准備經過shuffle的批次數據。

完整代碼

#!/usr/bin/python
# -*- coding: utf-8 -*-

from collections import Counter
import tensorflow.contrib.keras as kr
import numpy as np
import os

def open_file(filename, mode='r'):
    """Open ``filename`` as UTF-8 text, ignoring encoding errors.

    mode: 'r' to read or 'w' to write.
    """
    text_options = {'encoding': 'utf-8', 'errors': 'ignore'}
    return open(filename, mode, **text_options)

def read_file(filename):
    """Read a file whose lines have the form label, tab, content.

    Args:
        filename: path of the file to read (opened through ``open_file``).

    Returns:
        ``(contents, labels)`` where ``contents`` is a list of per-line
        character lists and ``labels`` the matching list of label strings.
        Lines that do not split into exactly two tab-separated fields after
        stripping are skipped.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            # An explicit check replaces the former bare ``except: pass``,
            # which also hid unrelated errors and KeyboardInterrupt.
            if len(fields) != 2:
                continue
            label, content = fields
            contents.append(list(content))
            labels.append(label)
    return contents, labels

def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character-level vocabulary from the training file and save it.

    Args:
        train_dir: path of the training file, read with ``read_file``.
        vocab_dir: path of the vocabulary file to write, one entry per line.
        vocab_size: total number of entries including the ``<PAD>`` entry.
    """
    data_train, _ = read_file(train_dir)

    counter = Counter()
    for content in data_train:
        counter.update(content)

    # vocab_size - 1 most frequent characters; the first slot is reserved for
    # <PAD>, which is used to pad all texts to the same length. The
    # comprehension also works when there are no pairs at all (empty corpus
    # or vocab_size == 1), where unpacking zip(*pairs) would raise.
    words = ['<PAD>'] + [word for word, _ in counter.most_common(vocab_size - 1)]

    # Close the file deterministically so the vocabulary is fully flushed.
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')

def read_vocab(vocab_dir):
    """Load the stored vocabulary.

    Args:
        vocab_dir: path of the vocabulary file, one entry per line.

    Returns:
        ``(words, word_to_id)``: the list of entries in file order and a dict
        mapping each entry to its position in that list.
    """
    # The with-block closes the handle instead of leaving it to the GC.
    with open_file(vocab_dir) as f:
        words = f.read().strip().split('\n')
    word_to_id = {word: idx for idx, word in enumerate(words)}

    return words, word_to_id

def read_category():
    """Return the fixed category list and its {category: index} mapping."""
    categories = ['財經', '房產', '股票', '家居', '科技', '時政', '娛樂']
    cat_to_id = {name: idx for idx, name in enumerate(categories)}
    return categories, cat_to_id

def to_words(content, words):
    """Turn a sequence of ids back into text by indexing ``words`` and joining."""
    pieces = [words[idx] for idx in content]
    return ''.join(pieces)

def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a labelled text file into padded id sequences and one-hot labels.

    Args:
        filename: path of the data file, read with ``read_file``.
        word_to_id: mapping from character to id; characters missing from it
            are dropped.
        cat_to_id: mapping from label string to class id.
        max_length: length every id sequence is padded/truncated to.

    Returns:
        ``(x_pad, y_pad)`` as produced by Keras ``pad_sequences`` and
        ``to_categorical``.

    Raises:
        KeyError: if a label in the file is not a key of ``cat_to_id``.
    """
    contents, labels = read_file(filename)

    data_id = [
        [word_to_id[ch] for ch in content if ch in word_to_id]
        for content in contents
    ]
    label_id = [cat_to_id[label] for label in labels]

    # Pad every text to a fixed length with Keras' pad_sequences.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # One-hot encode the labels. num_classes is passed explicitly so the width
    # does not depend on which classes happen to occur in this file.
    # NOTE(review): assumes cat_to_id ids are 0..len(cat_to_id)-1, as built by
    # read_category() — confirm for other callers.
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))

    return x_pad, y_pad

def batch_iter(x, y, batch_size=64):
    """Yield shuffled ``(x_batch, y_batch)`` pairs covering the data once.

    Args:
        x: array of samples, indexable by an integer index array.
        y: array of labels aligned with ``x``.
        batch_size: maximum number of samples per batch; the last batch may
            be smaller.

    Yields:
        Tuples of aligned slices of the shuffled ``x`` and ``y``. Nothing is
        yielded for empty input.
    """
    data_len = len(x)
    if data_len == 0:
        # Avoid handing callers a spurious empty batch.
        return

    # One shared permutation keeps samples and labels aligned.
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    # Stepping by batch_size avoids the float division in the batch count.
    for start_id in range(0, data_len, batch_size):
        end_id = min(start_id + batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

配置CNN卷積神經網絡模型

見cnnModel.py文件

#!/usr/bin/python
# -*- coding: utf-8 -*-

import tensorflow as tf

class TCNNConfig(object):
    """Hyper-parameters and run settings for the text CNN."""

    # --- model shape ---
    embedding_dim = 64        # embedding vector size
    seq_length = 600          # input sequence length
    num_classes = 7           # number of output classes
    num_filters = 256         # number of convolution filters
    kernel_size = 5           # convolution kernel width
    vocab_size = 5000         # vocabulary size
    hidden_dim = 128          # units in the fully connected layer

    # --- optimisation ---
    dropout_keep_prob = 0.5   # dropout keep probability
    learning_rate = 1e-3      # learning rate
    batch_size = 64           # samples per training batch
    num_epochs = 10           # number of passes over the training data

    # --- reporting (both counted in batches) ---
    print_per_batch = 100     # print metrics every this many batches
    save_per_batch = 10       # write a TensorBoard summary every this many batches
    save_per_batch = 10      # 每多少輪存入tensorboard


class TextCNN(object):
    """CNN text classifier built as a TensorFlow graph.

    Construction creates the input placeholders and then calls ``cnn()``,
    which adds the layers and exposes ``logits``, ``y_pred_cls``, ``loss``,
    ``optim`` and ``acc`` as attributes.
    """
    def __init__(self, config):
        self.config = config

        # The three inputs fed at run time: id sequences, labels with
        # num_classes columns, and the dropout keep probability.
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.cnn()

    def cnn(self):
        """Add the embedding, convolution, dense, loss and accuracy ops to the graph."""
        # Embedding lookup, pinned to the CPU device.
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # CNN layer
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling layer: max over axis 1
            # (presumably the sequence-position axis of the conv output — TODO confirm)
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and then ReLU.
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # Classifier
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class index

        with tf.name_scope("optimize"):
            # Loss: softmax cross-entropy, averaged over the batch.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy: fraction of samples whose predicted class equals the
            # argmax of the label row.
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

訓練和驗證

代碼實現

見runCNN.py
若之前進行過訓練,請把tensorboard/textcnn刪除,避免TensorBoard多次訓練結果重疊。

#!/usr/bin/python
# -*- coding: utf-8 -*-

from cnnModel import *
from data.sougouLoader1 import *
from sklearn import metrics
import sys

import time
from datetime import timedelta


# Directory holding the merged data files and the vocabulary file.
base_dir = '/Users/alicelmx/Documents/實習/文本分類/基於深度學習/data/'
train_dir = os.path.join(base_dir, 'sougou.train.txt')
test_dir = os.path.join(base_dir, 'sougou.test.txt')
val_dir = os.path.join(base_dir, 'sougou.val.txt')
vocab_dir = os.path.join(base_dir, 'sougou.vocab.txt')

# Directory where model checkpoints are written.
save_dir = '/Users/alicelmx/Documents/實習/文本分類/基於深度學習/測試結果/'
save_path = os.path.join(save_dir, 'best_validation')   # checkpoint path of the best validation result

def get_time_dif(start_time):
    """Return the time elapsed since ``start_time`` as a whole-second timedelta."""
    elapsed_seconds = time.time() - start_time
    return timedelta(seconds=int(round(elapsed_seconds)))

def feed_data(x_batch, y_batch, keep_prob):
    """Build the feed dict that binds one batch to the global model's placeholders."""
    placeholders = (model.input_x, model.input_y, model.keep_prob)
    values = (x_batch, y_batch, keep_prob)
    return dict(zip(placeholders, values))

def evaluate(sess, x_, y_):
    """Return ``(mean loss, mean accuracy)`` of the global model on ``x_``/``y_``.

    The data is processed in batches of 128 and each batch's metrics are
    weighted by the batch size before averaging.
    """
    loss_sum, acc_sum = 0.0, 0.0
    for batch_x, batch_y in batch_iter(x_, y_, 128):
        n_items = len(batch_x)
        # keep_prob of 1.0: no dropout while evaluating
        batch_loss, batch_acc = sess.run(
            [model.loss, model.acc],
            feed_dict=feed_data(batch_x, batch_y, 1.0),
        )
        loss_sum += batch_loss * n_items
        acc_sum += batch_acc * n_items

    n_total = len(x_)
    return loss_sum / n_total, acc_sum / n_total

def train():
    """Train the global ``model`` and checkpoint the best validation accuracy.

    Uses the module-level ``model``, ``config``, ``word_to_id`` and
    ``cat_to_id``. Scalar summaries are written to ``tensorboard/textcnn``
    every ``config.save_per_batch`` batches; every ``config.print_per_batch``
    batches the current batch and the validation set are evaluated and the
    model is saved to ``save_path`` when validation accuracy improves.
    Training stops early once more than 1000 batches pass without improvement.
    """
    print("Configuring TensorBoard and Saver...")
    # TensorBoard setup. Delete the tensorboard folder before retraining,
    # otherwise the curves of several runs overlap.
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Saver setup
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # Load the training and validation sets
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0              # batches processed so far
    best_acc_val = 0.0           # best validation accuracy seen
    last_improved = 0            # batch index of the last improvement
    require_improvement = 1000   # stop after this many batches without improvement

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # Periodically write the training scalars to TensorBoard
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # Periodically report performance on this batch and on the
                # validation set. A separate feed with keep_prob 1.0 is used
                # so the training feed below keeps its dropout setting;
                # mutating feed_dict here would disable dropout for the
                # optimizer step of this batch.
                eval_feed = feed_data(x_batch, y_batch, 1.0)
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=eval_feed)
                loss_val, acc_val = evaluate(session, x_val, y_val)   # todo

                if acc_val > best_acc_val:
                    # Save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'\
                    + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # optimizer step, with dropout
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time: stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # leave the batch loop
        if flag:  # leave the epoch loop as well
            break

    # Flush pending summaries and release the session's resources.
    writer.close()
    session.close()

def test():
    """Evaluate the saved best model on the test set.

    Restores the checkpoint at ``save_path`` into the global ``model``, prints
    test loss and accuracy, then the per-class precision/recall/F1 report and
    the confusion matrix, and finally the elapsed time.
    """
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # Load the stored model
    saver.restore(sess=session, save_path=save_path)

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    print('Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    # Holds the predicted class of every test sample
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
    # Predict batch by batch, with dropout disabled
    for batch_no in range(num_batch):
        lo = batch_no * batch_size
        hi = min(lo + batch_size, data_len)
        batch_feed = {
            model.input_x: x_test[lo:hi],
            model.keep_prob: 1.0,
        }
        y_pred_cls[lo:hi] = session.run(model.y_pred_cls, feed_dict=batch_feed)

    # Evaluation report
    print("Precision, Recall and F1-Score...")
    report = metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)
    print(report)

    # Confusion matrix
    print("Confusion Matrix...")
    print(metrics.confusion_matrix(y_test_cls, y_pred_cls))

    print("Time usage:", get_time_dif(start_time))


if __name__ == '__main__':
    # Exactly one command-line argument is expected: the run mode.
    valid_modes = ('train', 'test')
    if not (len(sys.argv) == 2 and sys.argv[1] in valid_modes):
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    # Build the vocabulary file first when it does not exist yet
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    # Size the embedding from the vocabulary actually loaded
    config.vocab_size = len(words)
    model = TextCNN(config)

    run_mode = train if sys.argv[1] == 'train' else test
    run_mode()

訓練結果

python runCNN.py train

這里寫圖片描述

驗證結果

python runCNN.py test
結果還行96.43%,至少達到我老板的需求了,可以交差了

這里寫代碼片

總結

具體實現機制還很不明白,畢竟深度學習還沒開始學,打算還是先把機器學習搞搞再弄深度學習吧,都是大塊頭,在這先挖坑:

  • python對文件讀寫
  • 深度學習(CNN)
  • tensorFlow框架
  • 學完再復習下這段代碼

注意!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。



 
粤ICP备14056181号  © 2014-2021 ITdaan.com