最近在工作中有一个需求:用训练好的模型对数据库中的所有数据进行预测,并将预测结果保存到另一张表中。数据库中的数据是一篇篇文章,我训练好的模型是对其中的四个段落分别进行分类,即我有四个模型:拿到文本后需要先提取出这四个段落,再用对应的模型分别预测这四个段落的类别,然后存入数据库中。模型是用keras训练的,backend为tensorflow。因为数据量比较大,自然想到用多进程。在Windows上运行一点问题都没有,但是在Linux服务器上运行时,发现每次都停在model.predict上不动了。

模型使用时大致如下:

# -*- coding: utf-8 -*-
import jieba
import numpy as np
import keras
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import load_model
from config import Config
import json


# Path of the settings file; the model location is looked up under
# ('cnn', 'model_path') through the project's Config helper.
config_file = 'data/config.ini'
model_path = Config(config_file).get_value_str('cnn', 'model_path')

# Load the model inside its own graph and session so that later calls can
# re-enter exactly the same graph/session pair before predicting.
# NOTE(review): this runs at import time, so the graph, session and model are
# created in whichever process imports this module. The accompanying text
# reports prediction hanging under multiprocessing on Linux; presumably the
# worker processes inherit this state from the parent — confirm.
graph = tf.Graph()
with graph.as_default():
    session = tf.Session()
    with session.as_default():
        model = load_model(model_path)

# Aliases for the graph and session above; predict() enters both of them
# before calling the model.
graph_var = graph
session_var = session


def sentence_process(sentence):
    """Segment *sentence* with jieba and map each token to an integer id.

    The vocabulary is read from ``data/words.json`` on every call, using its
    ``words``, ``word_to_id`` and ``max_length`` entries.

    Args:
        sentence: Text to segment.

    Returns:
        A ``(vector, max_length)`` tuple: ``vector`` is a list with one id per
        non-empty token, and ``max_length`` is the value stored in the JSON
        file.

    Raises:
        KeyError: If the JSON file lacks one of the expected entries, or a
            token is listed in ``words`` but missing from ``word_to_id``.
    """
    # NOTE(review): the vocabulary file is re-read for every sentence; if it
    # never changes at runtime, loading it once would avoid repeated I/O.
    with open('data/words.json', encoding='utf-8') as f:
        words_json = json.load(f)
    # Build the membership set once so each token lookup is a hash probe
    # instead of a scan over the whole vocabulary.
    known_words = set(words_json['words'])
    word_to_id = words_json['word_to_id']
    max_length = words_json['max_length']

    # Id used for tokens that are not listed in ``words``.
    # NOTE(review): presumably the out-of-vocabulary index the model was
    # trained with — confirm against the training code.
    unknown_id = 4999

    # A single pass drops empty tokens (the original filtered them twice).
    tokens = [seg for seg in jieba.lcut(sentence) if seg]
    vector = [
        word_to_id[token] if token in known_words else unknown_id
        for token in tokens
    ]
    return vector, max_length


def predict(sentence):
    """Classify *sentence* with the module-level model.

    The sentence is converted to ids by ``sentence_process``, padded to the
    returned ``max_length``, and passed to ``model.predict_proba`` inside the
    graph and session the model was loaded in.

    Args:
        sentence: Text to classify.

    Returns:
        ``1`` if the value at index 1 of the first output row is greater than
        0.5, otherwise ``0``.
    """
    ids, pad_to = sentence_process(sentence)
    batch = np.array([ids])
    padded = sequence.pad_sequences(batch, pad_to)
    # The model must run under the same graph/session it was loaded with.
    with graph_var.as_default(), session_var.as_default():
        probabilities = model.predict_proba(padded)
        label = 1 if probabilities[0][1] > 0.5 else 0
    return label

View Code