keras—多层感知器MLP—IMDb情感分析

  1 import urllib.request
  2 import os
  3 import tarfile
  4 from keras.datasets import imdb
  5 from keras.preprocessing import sequence
  6 from keras.preprocessing.text import Tokenizer
  7 import re
  # Matches any "<...>" span (an opening, closing or self-closing tag).
  # Compiled once at import time instead of on every rm_tags() call.
  _TAG_RE = re.compile(r'<[^>]+>')


  def rm_tags(text):
      """Return *text* with every ``<...>`` tag removed.

      Tags are deleted outright (replaced by the empty string), so the
      characters on either side of a tag end up adjacent.  A lone ``<`` with
      no closing ``>`` is left untouched.

      Args:
          text: the raw string to clean.

      Returns:
          A new string without the matched tag spans.
      """
      return _TAG_RE.sub('', text)
 def read_files(filetype, path="C:/Users/admin/.keras/aclImdb/"):
     """Load every review of one dataset split together with its label.

     The split directory is expected to contain a ``pos`` and a ``neg``
     sub-directory holding one text file per review.

     Args:
         filetype: name of the split sub-directory; the script calls this
             with ``"train"`` and ``"test"``.
         path: root directory of the extracted dataset.  Defaults to the
             location the script originally hard-coded.

     Returns:
         A tuple ``(all_labels, all_texts)`` of equal length.  Labels are
         ``1`` for files under ``pos`` and ``0`` for files under ``neg``;
         texts have their tags stripped by ``rm_tags``.  All ``pos``
         entries come first, followed by all ``neg`` entries.

     Raises:
         OSError: if a class directory or a review file cannot be read.
     """
     file_list = []
     all_labels = []
     # Positive reviews first, then negative ones.  Labels are sized from the
     # actual directory listing rather than assuming a fixed count per class.
     for class_dir_name, label in (("pos", 1), ("neg", 0)):
         class_dir = os.path.join(path, filetype, class_dir_name)
         # os.listdir() order is unspecified; sorting keeps sample indices
         # reproducible between runs and machines.
         names = sorted(os.listdir(class_dir))
         file_list.extend(os.path.join(class_dir, name) for name in names)
         all_labels.extend([label] * len(names))
     print('read', filetype, 'files:', len(file_list))

     all_texts = []
     for file_path in file_list:
         with open(file_path, encoding='utf8') as file_input:
             # Lines are joined with a space, then tags are removed.
             all_texts.append(rm_tags(" ".join(file_input.readlines())))
     return all_labels, all_texts
 # Load the raw review texts and their 0/1 labels for both splits.
 y_train, train_text = read_files("train")
 y_test, test_text = read_files("test")
 print(train_text[0])
 print(y_train[0])

 # Size of the vocabulary kept by the tokenizer and the fixed sequence length
 # every review is padded/truncated to.  The model's Embedding layer must be
 # configured with the same two values.
 NUM_WORDS = 2000
 MAX_LEN = 100

 # Build the word -> index dictionary from the training texts only.
 token = Tokenizer(num_words=NUM_WORDS)
 token.fit_on_texts(train_text)
 print(token.document_count)
 print(token.word_index)  # full mapping; the output is large

 # Turn each review into a list of word indices.
 x_train_seq = token.texts_to_sequences(train_text)
 x_test_seq = token.texts_to_sequences(test_text)
 print(train_text[0])
 print(x_train_seq[0])

 # Bring every sequence to exactly MAX_LEN entries.
 x_train = sequence.pad_sequences(x_train_seq, maxlen=MAX_LEN)
 x_test = sequence.pad_sequences(x_test_seq, maxlen=MAX_LEN)

 # Show one sample before and after padding to illustrate the effect.
 print('before pad_sequences length=', len(x_train_seq[113]))
 print(x_train_seq[113])
 print('after pad_sequences length=', len(x_train[113]))
 print(x_train[113])
 import numpy as np
 from keras.models import Sequential
 from keras.layers import Dense, Dropout, Flatten, Activation, Embedding

 # Embedding -> Flatten -> Dense(256) -> Dense(1, sigmoid).
 # input_dim / input_length must equal the tokenizer's num_words (2000) and
 # the pad_sequences maxlen (100) used when x_train / x_test were built.
 model = Sequential()
 model.add(Embedding(output_dim=32,
                     input_dim=2000,
                     input_length=100))
 model.add(Dropout(0.2))
 #model.add(SimpleRNN(units=16))
 model.add(Flatten())
 model.add(Dense(units=256,
                 activation='relu'))
 model.add(Dropout(0.35))
 model.add(Dense(units=1,
                 activation='sigmoid'))
 # summary() prints by itself and returns None, so it is not wrapped in print().
 model.summary()
 model.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])

 # validation_split holds out the LAST 20% of the samples without shuffling
 # first.  The training data is ordered by class (all positives, then all
 # negatives), so an unshuffled split would validate on one class only.
 # Shuffle once with a fixed seed; x_train / y_train themselves stay untouched.
 shuffle_idx = np.random.RandomState(0).permutation(len(x_train))
 x_fit = np.asarray(x_train)[shuffle_idx]
 y_fit = np.asarray(y_train)[shuffle_idx]
 train_history = model.fit(x=x_fit, y=y_fit, batch_size=100,
                           epochs=10, verbose=2,
                           validation_split=0.2)

 # Labels are passed as arrays rather than plain Python lists.
 scores = model.evaluate(x_test, np.asarray(y_test), verbose=1)
 print('accuracy', scores[1])

 # Equivalent of the removed Sequential.predict_classes() for a single
 # sigmoid output: threshold the probability at 0.5.
 predict = (model.predict(x_test) > 0.5).astype('int32')
 print("prediction[:10]", predict[:10])
 # Flatten the (n, 1) predictions to a 1-D vector of class ids.
 predict_classes = predict.reshape(-1)
 print(predict_classes[:10])
 # Maps a class id to the label text shown to the user
 # ('正面的' = positive, '负面的' = negative).
 SentimentDict = {1: '正面的', 0: '负面的'}


 def display_test_Sentiment(i):
     """Print test review *i* with its true and its predicted sentiment.

     Reads the module-level ``test_text``, ``y_test`` and ``predict_classes``.

     Args:
         i: index of the review within the test split.
     """
     print(test_text[i])
     # int() turns a NumPy integer into a plain int before the dict lookup.
     # The printed prefixes mean "true label" and "prediction".
     print('label真实值:', SentimentDict[int(y_test[i])],
           '预测结果:', SentimentDict[int(predict_classes[i])])
 # Show one test-set review together with its true and predicted label.
 display_test_Sentiment(12502)

 # A hand-written review used to walk through the prediction steps one by one.
 # Written as concatenated literals; the value is the same text as the
 # original triple-quoted block, including its newlines.
 input_text = (
     "\n"
     "I saw this film with my 6-year-old a couple weeks ago. While there's "
     "plenty about which to gripe, here's one of \n"
     "my biggest problems: I can't stand this constant CGI-heavy "
     "everything-must-be-a-sequel-or- a- remake era of film\n"
     "making. It's making movie makers lazy.\n"
 )

 # Step 1: text -> list of word indices (printed so the values are visible
 # when run as a script, not only in an interactive session).
 input_seq = token.texts_to_sequences([input_text])
 print(len(input_seq[0]))
 print(input_seq[0])

 # Step 2: pad/truncate to the length the model was built for.
 pad_input_seq = sequence.pad_sequences(input_seq, maxlen=100)
 print(len(pad_input_seq[0]))
 print(pad_input_seq[0])

 # Step 3: threshold the sigmoid output at 0.5 to obtain the class id
 # (replacement for the removed Sequential.predict_classes()).
 predict_result = (model.predict(pad_input_seq) > 0.5).astype('int32')
 print(predict_result)
 print(predict_result[0][0])
 print(SentimentDict[int(predict_result[0][0])])
 def predict_review(input_text):
     """Classify one review text and print its sentiment label.

     Uses the module-level ``token`` (fitted tokenizer), ``model`` (trained
     network) and ``SentimentDict`` (class id -> label text).

     Args:
         input_text: the review as a single string.
     """
     input_seq = token.texts_to_sequences([input_text])
     # Same fixed length the training sequences were padded to.
     pad_input_seq = sequence.pad_sequences(input_seq, maxlen=100)
     # Threshold the sigmoid probability at 0.5; this replaces the removed
     # Sequential.predict_classes() and yields the same (1, 1) int array.
     predict_result = (model.predict(pad_input_seq) > 0.5).astype('int32')
     print(SentimentDict[int(predict_result[0][0])])
# Classify one more free-form review with the trained model.
predict_review('''
They poured on the whole "LeFou is gay" thing a bit thick for my taste. It was the only thing that added levity to the movie (despite how much fun it should have been already), but it seemed a bit cheap. I'm not going to apologize for wanting more for my LGBTQ characters than to be just the comic relief.
''')