1、代码
def clean_text(text, remove_stopwords=False):
    """Turn raw review text into a list of lowercase alphabetic tokens.

    Markup is removed with BeautifulSoup's ``html.parser``, every character
    that is not an ASCII letter is replaced by a space, and the result is
    lowercased and split on whitespace.

    Args:
        text: Raw review text, possibly containing HTML markup.
        remove_stopwords: When true, drop every token that is contained in
            the module-level ``eng_stopwords`` collection.

    Returns:
        A list of lowercase word strings (possibly empty).
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words


def to_review_vector(review, vector_size=300):
    """Return the average word vector of a review as a ``pd.Series``.

    The review is cleaned with ``clean_text`` (stopwords removed) and the
    vectors of all tokens found in the module-level ``model`` are averaged.
    Tokens missing from ``model`` are skipped.

    NOTE(review): ``model`` is presumably a trained word2vec-style embedding
    lookup supporting ``word in model`` and ``model[word]`` -- confirm
    against the code that loads it.

    Args:
        review: Raw review text.
        vector_size: Dimensionality of the vectors stored in ``model``.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        A ``pd.Series`` of length ``vector_size``. It is all zeros when no
        token of the review is present in ``model``.
    """
    words = clean_text(review, remove_stopwords=True)

    # Local accumulator: the previous ``global word_vec`` leaked per-call
    # state into module scope for no benefit.
    total = np.zeros(vector_size)
    matched = 0
    for word in words:
        if word in model:
            total += model[word]
            matched += 1

    # Divide by the number of matched words to get a real average. The old
    # code called ``.mean(axis=0)`` on a single-row (1, 300) array, which
    # only dropped the leading axis and therefore returned the *sum*.
    if matched:
        total /= matched
    return pd.Series(total)
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:深度学习之NLP获取词向量 - Python技术站