Chinese sentiment analysis with a TensorFlow neural network. This article uses the HIT LTP segmenter for text preprocessing and a feed-forward network with two hidden layers.
# -*- coding: utf-8 -*-
# @brief: Chinese sentiment analysis with TensorFlow
import numpy as np
import tensorflow as tf
import random
from sklearn.feature_extraction.text import CountVectorizer
import os
import traceback
# Directory of the current file
# e.g. D:\PycharmProjects2020\tensor1\gensiom_word2vec
real_dir_path = os.path.split(os.path.realpath(__file__))[0]
# Build the data paths, e.g. D:\PycharmProjects2020\tensor1\gensiom_word2vec\data/
pos_file = os.path.join(real_dir_path, 'data/')
neg_file = os.path.join(real_dir_path, 'data/')
# Use the HIT LTP segmenter and POS tagger
from pyltp import Segmentor, Postagger
# Load the segmentation model (cws.model from the ltp_data_v3.4 package)
seg = Segmentor()
seg.load(r'F:\modelmodel\ltp_data_v3.4\cws.model')
# Load the POS tagging model (pos.model from the same package)
poser = Postagger()
poser.load(r'F:\modelmodel\ltp_data_v3.4\pos.model')
# Directory of the current file
real_dir_path = os.path.split(os.path.realpath(__file__))[0]
# Path of the stop-word file
stop_words_file = os.path.join(real_dir_path, '../')
# Allowed POS tags. LTP uses the 863 POS tag set; see its documentation for the full list.
allow_pos_ltp = ('a', 'i', 'j', 'n', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz', 'v', 'ws')
# Segment, remove stop words and filter by POS.
# For example, s = "今天去钓鱼了你去么" is filtered down to ['今天', '钓鱼'].
def cut_stopword_pos(s):
    # ''.join(s.split()) strips whitespace and newlines from the line
    # so that segment() receives one continuous string
    words = seg.segment(''.join(s.split()))
    # Tagged as 今天|去|钓鱼|了|你|去|么
    #           nt |v |v  |u |r |v |u
    poses = poser.postag(words)
    # Stop-word dict, e.g. {',': None, '?': None, '、': None, '。': None, ...}
    stopwords = {}.fromkeys([line.rstrip() for line in open(stop_words_file, encoding='UTF-8')])
    sentence = []
    # enumerate(poses) yields index/tag pairs such as:
    # 0 nt
    # 1 wp
    # 2 n
    # 3 v
    for i, pos in enumerate(poses):
        if (pos in allow_pos_ltp) and (words[i] not in stopwords):
            sentence.append(words[i])
    return sentence
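# A minimal usage sketch (commented out so it does not run as part of the training script);
# the sentence and expected result follow the example in the comment above:
# print(cut_stopword_pos('今天去钓鱼了你去么'))  # roughly ['今天', '钓鱼']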
# Read the two files and build the vocabulary, which ends up looking like
# dict_keys(['心得', '勘误', '疑点', '兴趣', '朋友', '访问', '网站', '交流', '切磋', 'www', ...])
def create_vocab(pos_file, neg_file):
    def process_file(file_path):
        with open(file_path, 'r', encoding='UTF-8') as f:
            v = []
            lines = f.readlines()
            for line in lines:
                sentence = cut_stopword_pos(line)
                v.append(' '.join(sentence))
            return v

    sent = process_file(pos_file)
    sent += process_file(neg_file)
    # max_df=0.9 drops terms that appear in more than 90% of the documents; min_df=1 keeps the rest
    tf_v = CountVectorizer(max_df=0.9, min_df=1)
    tf = tf_v.fit_transform(sent)
    # print(tf_v.vocabulary_)
    return list(tf_v.vocabulary_.keys())  # a list, so that .index() works below
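# A quick standalone sketch (toy English corpus, not the review data) of what CountVectorizer
# builds: vocabulary_ maps each term to its column index in the document-term matrix, e.g.
# CountVectorizer().fit(['good movie', 'bad movie']).vocabulary_
#     -> {'bad': 0, 'good': 1, 'movie': 2}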
# Build the vocabulary
vocab = create_vocab(pos_file, neg_file)


# Turn each review into a vector according to the vocabulary
def normalize_dataset(vocab):
    dataset = []

    # vocab: vocabulary; review: one review; clf: its label, [0, 1] = negative, [1, 0] = positive
    def string_to_vector(vocab, review, clf):
        words = cut_stopword_pos(review)  # list of str
        # One slot per vocabulary term, all initialised to 0,
        # e.g. [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] for a 14-term vocabulary
        features = np.zeros(len(vocab))
        # For each word that survives cut_stopword_pos, set its slot to 1 if it is in vocab
        for w in words:
            if w in vocab:
                features[vocab.index(w)] = 1
        return [features, clf]

    with open(pos_file, 'r', encoding='UTF-8') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = string_to_vector(vocab, line, [1, 0])
            dataset.append(one_sample)
    with open(neg_file, 'r', encoding='UTF-8') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = string_to_vector(vocab, line, [0, 1])
            dataset.append(one_sample)
    return dataset
# Vectorise the whole corpus
dataset = normalize_dataset(vocab)
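# Sketch of what one element of dataset looks like (the length depends on the real vocabulary;
# the values below are only illustrative): a bag-of-words feature vector plus a one-hot label,
# e.g. dataset[0] == [array([0., 1., 0., ..., 1.]), [1, 0]]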
random.shuffle(dataset)  # shuffle the samples

# Hold out 10% of the samples as test data
# len(dataset) is the number of samples
test_size = int(len(dataset) * 0.1)
dataset = np.array(dataset)
train_dataset = dataset[:-test_size]
test_dataset = dataset[-test_size:]
print('test_size = {}'.format(test_size))
# print('size of train_dataset is {}'.format(len(train_dataset)))
# Feed-forward neural network
# Number of neurons in each layer
n_input_layer = len(vocab)  # one input neuron per term
n_layer_1 = 1000  # hidden layer
n_layer_2 = 1000  # hidden layer
n_output_layer = 2
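# Rough parameter count for this architecture (assuming, say, a 20,000-term vocabulary):
# 20000*1000 + 1000 weights/biases in layer 1, 1000*1000 + 1000 in layer 2 and
# 1000*2 + 2 in the output layer, i.e. about 21 million trainable parameters.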
# The network to be trained
def neural_netword(data):
    # Weights and biases of layer 1; random_normal draws initial values from a normal distribution
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])),
                   'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])),
                   'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])),
                        'b_': tf.Variable(tf.random_normal([n_output_layer]))}
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)  # ReLU activation
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])
    return layer_output
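# The same forward pass could be written more compactly with the TF 1.x layers API;
# this is only an alternative sketch, not what the script above uses:
# def neural_netword_dense(data):
#     h1 = tf.layers.dense(data, n_layer_1, activation=tf.nn.relu)
#     h2 = tf.layers.dense(h1, n_layer_2, activation=tf.nn.relu)
#     return tf.layers.dense(h2, n_output_layer)  # logits, no activation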
batch_size = 50
X = tf.placeholder('float', [None, n_input_layer])  # None: any number of samples; one dimension per term
Y = tf.placeholder('float')
# Train the network on the data
def train_neural_network(X, Y):
    predict = neural_netword(X)
    # The cost function is the mean cross entropy of the softmax over the output layer.
    # The softmax is applied here rather than inside the network for efficiency.
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y))
    # Optimiser
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)
    epochs = 13  # number of passes over the training data
    with tf.Session() as session:
        session.run(tf.initialize_all_variables())  # initialise all variables, including w and b
        random.shuffle(train_dataset)
        train_x = train_dataset[:, 0]  # features of each row
        train_y = train_dataset[:, 1]  # label of each row
        print('size of train_x is {}'.format(len(train_x)))
        for epoch in range(epochs):
            epoch_loss = 0  # loss accumulated in this epoch
            i = 0
            while i < len(train_x):
                start = i
                end = i + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
                # The first argument of run (fetches) may be a single op or a list, and the
                # return value mirrors it; cost_func is fetched here so the cost can be printed.
                _, c = session.run([optimizer, cost_func], feed_dict={X: list(batch_x), Y: list(batch_y)})
                epoch_loss += c
                i = end
            print(epoch, ' : ', epoch_loss)
        # Evaluate the model
        test_x = test_dataset[:, 0]
        test_y = test_dataset[:, 1]
        # argmax gives the index of the maximum along one dimension of a tensor (a list of indices here);
        # tf.equal compares element-wise and returns a list of booleans
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
        # tf.cast turns [True, False, True] into [1, 0, 1]
        # reduce_mean averages over one dimension, or over all elements if no dimension is given
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
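        # Worked micro-example of the pipeline above (made-up numbers): if argmax(predict, 1)
        # is [0, 1, 0] and argmax(Y, 1) is [0, 0, 0], tf.equal gives [True, False, True],
        # tf.cast gives [1., 0., 1.] and reduce_mean gives an accuracy of about 0.667.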
        print('accuracy: {}'.format(accuracy.eval({X: list(test_x), Y: list(test_y)})))
        # equivalent: print(session.run(accuracy, feed_dict={X: list(test_x), Y: list(test_y)}))
train_neural_network(X, Y)
Output: the script prints test_size, the per-epoch loss, and the final accuracy.
Scratch analysis code:
import numpy as np
import tensorflow as tf
import random
from sklearn.feature_extraction.text import CountVectorizer
import os
import traceback

# Directory of the current file
# e.g. D:\PycharmProjects2020\tensor1\gensiom_word2vec
real_dir_path = os.path.split(os.path.realpath(__file__))[0]
# Build the data paths, e.g. D:\PycharmProjects2020\tensor1\gensiom_word2vec\data/
pos_file = os.path.join(real_dir_path, 'data/')
neg_file = os.path.join(real_dir_path, 'data/')

# Use the HIT LTP segmenter and POS tagger
from pyltp import Segmentor, Postagger
# Load the segmentation model
seg = Segmentor()
seg.load(r'F:\modelmodel\ltp_data_v3.4\cws.model')
# Load the POS tagging model
poser = Postagger()
poser.load(r'F:\modelmodel\ltp_data_v3.4\pos.model')

# Directory of the current file
real_dir_path = os.path.split(os.path.realpath(__file__))[0]
# Path of the stop-word file
stop_words_file = os.path.join(real_dir_path, '../')
# Allowed POS tags. LTP uses the 863 POS tag set; see its documentation for the full list.
allow_pos_ltp = ('a', 'i', 'j', 'n', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz', 'v', 'ws')
# Step by step: segment, remove stop words, filter by POS
# word = seg.segment('今天去钓鱼了你去么')
word1 = "今天去钓鱼了你去么"
print(''.join(word1.split()))
a = seg.segment(''.join(word1.split()))
print('|'.join(a))
poses = poser.postag(a)
print('|'.join(poses))
stopwords = {}.fromkeys(
    [line.rstrip() for line in open(stop_words_file, encoding='UTF-8')]
)
# stopwords = {}.fromkeys([line.rstrip() for line in open(stop_words_file)])
print(stopwords)
# for i, pos in enumerate(poses):
#     print(i, pos)
sentence = []
for i, pos in enumerate(poses):
    if (pos in allow_pos_ltp) and (a[i] not in stopwords):
        sentence.append(a[i])
print(sentence)
# --------------------------------------------------------------------
def cut_stopword_pos(s):
    # ''.join(s.split()) strips whitespace and newlines so segment() gets one continuous string
    words = seg.segment(''.join(s.split()))
    # Tagged as 今天|去|钓鱼|了|你|去|么
    #           nt |v |v  |u |r |v |u
    poses = poser.postag(words)
    # e.g. {',': None, '?': None, '、': None, '。': None, ...}
    stopwords = {}.fromkeys([line.rstrip() for line in open(stop_words_file, encoding='UTF-8')])
    sentence = []
    # 0 nt
    # 1 wp
    # 2 n
    # 3 v
    for i, pos in enumerate(poses):
        if (pos in allow_pos_ltp) and (words[i] not in stopwords):
            sentence.append(words[i])
    return sentence
# ----------------------------------------------------
real_dir_path = os.path.split(os.path.realpath(__file__))[0]
# Build the file paths, e.g. D:\PycharmProjects2020\tensor1\gensiom_word2vec\data/
aaa = os.path.join(real_dir_path, '')
bbb = os.path.join(real_dir_path, '')
def process_file(file_path):
    with open(file_path, 'r', encoding='UTF-8') as f:
        v = []
        lines = f.readlines()
        for line in lines:
            sentence = cut_stopword_pos(line)
            v.append(' '.join(sentence))
        return v
# -------------------------------------------------------
def create_vocab(pos_file, neg_file):
    def process_file(file_path):
        with open(file_path, 'r', encoding='UTF-8') as f:
            v = []
            lines = f.readlines()
            for line in lines:
                sentence = cut_stopword_pos(line)
                v.append(' '.join(sentence))
            return v

    sent = process_file(pos_file)
    sent += process_file(neg_file)
    tf_v = CountVectorizer(max_df=0.9, min_df=1)
    tf = tf_v.fit_transform(sent)