Training the Model

(Figure: the model training process)

The data-generation step produced three files: the dev validation set, the test set and the train training set, with data volumes in a ratio of roughly 1:2:14. Since the model cannot work with raw text, the text first has to be converted into numeric form.
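The conversion relies on the character and tag dictionaries stored in maps.pkl, which are built during training. As a rough, made-up illustration of how these mappings are used (the ids and entries below are invented, not taken from the real maps.pkl):

# Hypothetical excerpt of the mappings stored in maps.pkl (ids are made up)
char_to_id = {"<PAD>": 0, "<UNK>": 1, "胸": 2, "痛": 3}
tag_to_id = {"O": 0, "B-SYM": 1, "I-SYM": 2}

text = "胸痛"
# every character becomes an id; unknown characters fall back to <UNK>
ids = [char_to_id.get(ch, char_to_id["<UNK>"]) for ch in text]
print(ids)  # [2, 3]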

# Modules this excerpt relies on (the Model class is defined further below)
import json
import pickle
import sys
from itertools import chain

import jieba
import tensorflow as tf

# the project determines the interpreter version elsewhere; defined here so the excerpt is self-contained
pyversion = 'three' if sys.version_info[0] == 3 else 'two'

class Chunk(object):

    # Initialization
    def __init__(self):

        # Load the configuration file
        self.config_file = json.load(open("config_file", encoding="utf8"))
        self.tf_config = tf.ConfigProto()

        # Initialize the TensorFlow session
        self.sess = tf.Session(config=self.tf_config)
        self.maps = "maps.pkl"

        # Mappings between characters/tags and their numeric ids
        if pyversion == 'three':
            self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag = pickle.load(open(self.maps, "rb"))
        else:
            # pickle.load takes no protocol argument; the protocol only matters when dumping
            self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag = pickle.load(open(self.maps, "rb"))

        # Build the model
        self.model = Model(self.config_file)

        # Initialize the variables only after the graph has been built;
        # they are overwritten below if a checkpoint is restored
        self.sess.run(tf.global_variables_initializer())
        self.ckpt = tf.train.get_checkpoint_state("ckpt")

        # Restore the model parameters
        if self.ckpt and tf.train.checkpoint_exists(self.ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % self.ckpt.model_checkpoint_path)
            self.model.saver.restore(self.sess, self.ckpt.model_checkpoint_path)
        else:
            print("No model file")

    # Length features for each word after jieba segmentation:
    # a single character maps to 0; in longer words the first character maps to 1,
    # middle characters to 2 and the last character to 3
    # (length 1 -> [0], length 2 -> [1,3], length 3 -> [1,2,3], length 4 -> [1,2,2,3])
    def features(self, string):
        def _w2f(word):
            length = len(word)
            if length == 1:
                r = [0]
            else:
                r = [2] * length
                r[0] = 1
                r[-1] = 3
            return r
        return list(chain.from_iterable([_w2f(word) for word in jieba.cut(string) if len(word.strip()) > 0]))
      
    # Convert the characters of a sentence into their numeric ids
    def get_text_input(self, text):

        inputs = list()
        inputs.append([text])
        D = self.char_to_id["<UNK>"]

        # Map each character to its id, falling back to <UNK> for unseen characters
        # (.get is used instead of .setdefault so the mapping itself is not modified)
        inputs.append([[self.char_to_id.get(char, D)
                            for char in text if len(char.strip()) > 0]])
        inputs.append([self.features(text)])
        inputs.append([[]])        
        if len(text.strip())>1: 
            return self.model.evaluate_line(self.sess,inputs, self.id_to_tag)
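To make the 0/1/2/3 length encoding concrete, here is a small standalone trace of the same logic (how the sentence is split depends on jieba's dictionary, so the printed result is only illustrative):

import jieba
from itertools import chain

def _w2f(word):
    # same length-feature encoding as Chunk.features above
    if len(word) == 1:
        return [0]
    r = [2] * len(word)
    r[0] = 1
    r[-1] = 3
    return r

sentence = "休息或含服硝酸甘油可缓解"
words = [w for w in jieba.cut(sentence) if w.strip()]
print(words)
print(list(chain.from_iterable(_w2f(w) for w in words)))
# If jieba splits the sentence as 休息 / 或 / 含服 / 硝酸甘油 / 可 / 缓解,
# this prints [1, 3, 0, 1, 3, 1, 2, 2, 3, 0, 1, 3]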

The model configuration is stored in the config_file file; its contents are as follows:

{
    "model_type": "idcnn",
    "num_chars": 3538,
    "char_dim": 100,
    "num_tags": 51,
    "seg_dim": 20,
    "lstm_dim": 100,
    "batch_size": 20,
    "emb_file": "D:\\code\\NERuselocal4\\NERuselocal\\data\\vec.txt",
    "clip": 5,
    "dropout_keep": 0.5,
    "optimizer": "adam",
    "lr": 0.001,
    "tag_schema": "iobes",
    "pre_emb": true,
    "zeros": true,
    "lower": false
}
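num_chars and num_tags must agree with the dictionaries stored in maps.pkl, otherwise the embedding and projection layers get built with the wrong sizes; char_dim + seg_dim is the width of the vector fed into the IDCNN. A quick sanity check (assuming config_file and maps.pkl sit in the working directory):

import json
import pickle

config = json.load(open("config_file", encoding="utf8"))
with open("maps.pkl", "rb") as f:
    char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

assert config["num_chars"] == len(char_to_id), "vocabulary size mismatch"
assert config["num_tags"] == len(tag_to_id), "tag set size mismatch"

# width of the concatenated character + segmentation embedding: 100 + 20 = 120
print(config["char_dim"] + config["seg_dim"])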

These parameters are read when the model is initialized:

# Define the model configuration
def config_model(char_to_id, tag_to_id):
    config = OrderedDict()
    config["model_type"] = FLAGS.model_type
    config["num_chars"] = len(char_to_id)
    config["char_dim"] = FLAGS.char_dim
    config["num_tags"] = len(tag_to_id)
    config["seg_dim"] = FLAGS.seg_dim
    config["lstm_dim"] = FLAGS.lstm_dim
    config["batch_size"] = FLAGS.batch_size

    config["emb_file"] = FLAGS.emb_file
    config["clip"] = FLAGS.clip
    config["dropout_keep"] = 1.0 - FLAGS.dropout
    config["optimizer"] = FLAGS.optimizer
    config["lr"] = FLAGS.lr
    config["tag_schema"] = FLAGS.tag_schema
    config["pre_emb"] = FLAGS.pre_emb
    config["zeros"] = FLAGS.zeros
    config["lower"] = FLAGS.lower
    return config
class Model(object):
    
    # Read the model's initialization parameters from config_file and set them up
    def __init__(self, config):

        self.config = config
        
        self.lr = config["lr"]
        self.char_dim = config["char_dim"]
        self.lstm_dim = config["lstm_dim"]
        self.seg_dim = config["seg_dim"]

        self.num_tags = config["num_tags"]
        self.num_chars = config["num_chars"]  # vocabulary size: number of distinct characters in the samples
        self.num_segs = 4
        self.global_step = tf.Variable(0, trainable=False)
        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
        self.best_test_f1 = tf.Variable(0.0, trainable=False)
        self.initializer = initializers.xavier_initializer()
        self.char_inputs = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="ChatInputs")
        self.seg_inputs = tf.placeholder(dtype=tf.int32,
                                         shape=[None, None],
                                         name="SegInputs")

        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=[None, None],
                                      name="Targets")
        
        # dropout keep prob
        self.dropout = tf.placeholder(dtype=tf.float32,
                                      name="Dropout")

        used = tf.sign(tf.abs(self.char_inputs))
        length = tf.reduce_sum(used, reduction_indices=1)
        self.lengths = tf.cast(length, tf.int32)
        self.batch_size = tf.shape(self.char_inputs)[0]
        self.num_steps = tf.shape(self.char_inputs)[-1]
        self.model_type = config['model_type']
        self.layers = [
            {
                'dilation': 1
            },
            {
                'dilation': 1
            },
            {
                'dilation': 2
            },
        ]
        self.filter_width = 3
        self.num_filter = self.lstm_dim 
        self.embedding_dim = self.char_dim + self.seg_dim
        self.repeat_times = 4
        self.cnn_output_width = 0
        embedding = self.embedding_layer(self.char_inputs, self.seg_inputs, config)
        if self.model_type == 'idcnn':
            model_inputs = tf.nn.dropout(embedding, self.dropout)
            model_outputs = self.IDCNN_layer(model_inputs)
            self.logits = self.project_layer_idcnn(model_outputs)
        
        else:
            raise KeyError

        self.loss = self.loss_layer(self.logits, self.lengths)

        with tf.variable_scope("optimizer"):
            optimizer = self.config["optimizer"]
            if optimizer == "sgd":
                self.opt = tf.train.GradientDescentOptimizer(self.lr)
            elif optimizer == "adam":
                self.opt = tf.train.AdamOptimizer(self.lr)
            elif optimizer == "adagrad":
                self.opt = tf.train.AdagradOptimizer(self.lr)
            else:
                raise KeyError
            grads_vars = self.opt.compute_gradients(self.loss)
            capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v]
                                 for g, v in grads_vars]
            self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
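Model.__init__ calls embedding_layer, IDCNN_layer, project_layer_idcnn and loss_layer, which are not reproduced in this post. For orientation, here is a minimal sketch of what the dilated-convolution (IDCNN) layer can look like; it follows the shapes set up in __init__ (embedding_dim, filter_width, num_filter, repeat_times, layers) but is only an approximation, not the project's exact implementation:

    def IDCNN_layer(self, model_inputs, name=None):
        # model_inputs: [batch_size, num_steps, embedding_dim]
        # treat the sequence as a 1-pixel-high image so 2-D convolutions can be used
        model_inputs = tf.expand_dims(model_inputs, 1)
        with tf.variable_scope("idcnn" if not name else name):
            filter_weights = tf.get_variable(
                "idcnn_filter",
                shape=[1, self.filter_width, self.embedding_dim, self.num_filter],
                initializer=self.initializer)
            # an ordinary convolution first projects the embeddings to num_filter channels
            layer_input = tf.nn.conv2d(model_inputs, filter_weights,
                                       strides=[1, 1, 1, 1], padding="SAME")
            final_out_from_layers = []
            total_width = 0
            # the same block of dilated convolutions is applied repeat_times times
            for j in range(self.repeat_times):
                for i, layer in enumerate(self.layers):
                    dilation = layer['dilation']
                    is_last = (i == len(self.layers) - 1)
                    with tf.variable_scope("atrous-conv-layer-%d" % i, reuse=tf.AUTO_REUSE):
                        w = tf.get_variable(
                            "filterW",
                            shape=[1, self.filter_width, self.num_filter, self.num_filter],
                            initializer=self.initializer)
                        b = tf.get_variable("filterB", shape=[self.num_filter])
                        conv = tf.nn.atrous_conv2d(layer_input, w, rate=dilation, padding="SAME")
                        conv = tf.nn.relu(tf.nn.bias_add(conv, b))
                        if is_last:
                            # only the output of the last dilation in each repetition is kept
                            final_out_from_layers.append(conv)
                            total_width += self.num_filter
                        layer_input = conv
            final_out = tf.concat(final_out_from_layers, axis=3)
            final_out = tf.nn.dropout(final_out, self.dropout)
            final_out = tf.squeeze(final_out, [1])
            final_out = tf.reshape(final_out, [-1, total_width])
            self.cnn_output_width = total_width
            return final_out

project_layer_idcnn then maps these cnn_output_width-dimensional features to num_tags scores per character, and loss_layer computes the sequence loss (a CRF layer in many NER implementations of this kind).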

Next, some TensorFlow command-line flags are defined and initialized:

# Define command-line arguments with tf.app.flags
flags = tf.app.flags
flags.DEFINE_boolean("clean",       True,      "clean train folder")
flags.DEFINE_boolean("train",       False,      "Whether train the model")
# flags.DEFINE_boolean("train",       True,      "Whether train the model")
# configurations for the model
flags.DEFINE_integer("seg_dim",     20,         "Embedding size for segmentation, 0 if not used")
flags.DEFINE_integer("char_dim",    100,        "Embedding size for characters")
flags.DEFINE_integer("lstm_dim",    100,        "Num of hidden units in LSTM, or num of filters in IDCNN")
flags.DEFINE_string("tag_schema",   "iobes",    "tagging schema iobes or iob")

# configurations for training
flags.DEFINE_float("clip",          5,          "Gradient clip")
flags.DEFINE_float("dropout",       0.5,        "Dropout rate")
flags.DEFINE_float("batch_size",    60,         "batch size")
flags.DEFINE_float("lr",            0.001,      "Initial learning rate")
flags.DEFINE_string("optimizer",    "adam",     "Optimizer for training")
flags.DEFINE_boolean("pre_emb",     True,       "Wither use pre-trained embedding")
flags.DEFINE_boolean("zeros",       True,      "Wither replace digits with zero")
flags.DEFINE_boolean("lower",       False,       "Wither lower case")

flags.DEFINE_integer("max_epoch",   100,        "maximum training epochs")
flags.DEFINE_integer("steps_check", 100,        "steps per checkpoint")
flags.DEFINE_string("ckpt_path",    "ckpt",      "Path to save model")
flags.DEFINE_string("summary_path", "summary",      "Path to store summaries")
flags.DEFINE_string("log_file",     "train.log",    "File for log")
flags.DEFINE_string("map_file",     "maps.pkl",     "file for maps")
flags.DEFINE_string("vocab_file",   "vocab.json",   "File for vocab")
flags.DEFINE_string("config_file",  "config_file",  "File for config")
flags.DEFINE_string("script",       "conlleval",    "evaluation script")
flags.DEFINE_string("result_path",  "result",       "Path for results")
flags.DEFINE_string("emb_file",     os.path.join(root_path+"data", "vec.txt"),  "Path for pre_trained embedding")
flags.DEFINE_string("train_file",   os.path.join(root_path+"data", "example.train"),  "Path for train data")
flags.DEFINE_string("dev_file",     os.path.join(root_path+"data", "example.dev"),    "Path for dev data")
flags.DEFINE_string("test_file",    os.path.join(root_path+"data", "example.test"),   "Path for test data")

flags.DEFINE_string("model_type", "idcnn", "Model type, can be idcnn or bilstm")
# flags.DEFINE_string("model_type", "bilstm", "Model type, can be idcnn or bilstm")

# tf parses the command-line arguments
FLAGS = tf.app.flags.FLAGS

# Sanity checks on the flag values
assert FLAGS.clip < 5.1, "gradient clip should not be too large"
assert 0 <= FLAGS.dropout < 1, "dropout rate between 0 and 1"
assert FLAGS.lr > 0, "learning rate must larger than zero"
assert FLAGS.optimizer in ["adam", "sgd", "adagrad"]

After the characters have been digitized and the model and TF session initialized, training begins. First, the digitized training, test and validation sets are loaded. Then the original IOB tagging scheme is updated to IOBES, and every sentence is turned into a collection holding the original text, the id of every character and the jieba word-length features. Next, the data is fed into the model in batches and a directory is created to store logs and models. Finally, the machine's hardware resources are used for training, and the model and the tuned parameters are saved.
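As a reminder of what update_tag_scheme does, here is a minimal standalone sketch of the IOB-to-IOBES conversion for one tag sequence (the entity type "SYM" is only an example, and the project's own implementation additionally validates that the input really is IOB):

def iob_to_iobes(tags):
    # convert one sentence's IOB tags to IOBES
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        if tag == "O":
            new_tags.append(tag)
        elif tag.startswith("B-"):
            # B- stays B- if the entity continues, otherwise it becomes S- (single)
            new_tags.append(tag if nxt.startswith("I-") else tag.replace("B-", "S-"))
        elif tag.startswith("I-"):
            # I- stays I- inside the entity, otherwise it becomes E- (end)
            new_tags.append(tag if nxt.startswith("I-") else tag.replace("I-", "E-"))
        else:
            raise ValueError("not a valid IOB tag: %s" % tag)
    return new_tags

print(iob_to_iobes(["B-SYM", "I-SYM", "I-SYM", "O", "B-SYM"]))
# ['B-SYM', 'I-SYM', 'E-SYM', 'O', 'S-SYM']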

def train():
    
    # Load the dev validation set (1), test set (2) and train training set (14)
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Update the tagging scheme from IOB to IOBES
    # B (Begin):        first character of an entity
    # I (Intermediate): character inside an entity
    # E (End):          last character of an entity
    # S (Single):       single-character entity
    # O (Other):        character that does not belong to any entity
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    
    # Create maps.pkl if it does not exist yet
    if not os.path.isfile(FLAGS.map_file):
        
        # Build the character dictionary and id mappings
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Build the dictionary and id mappings for every tag
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        #with open('maps.txt','w',encoding='utf8') as f1:
            #f1.writelines(str(char_to_id)+" "+id_to_char+" "+str(tag_to_id)+" "+id_to_tag+'\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
            
    # Prepare the data: each sentence becomes a list of indexed items containing
    # the original text, the id of every character and the jieba word-length features
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, FLAGS.lower
    )
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), 0, len(test_data)))

    # Feed the data in batches
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    
    # Create directories for storing logs and models
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # Let GPU memory grow on demand instead of being fully pre-allocated
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        
        # Use the GPU if one is available; switch the device string to CPU otherwise
        with tf.device("/gpu:0"):
            
            # Feed the batched data into the model, epoch by epoch
            for i in range(FLAGS.max_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []
    
                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i%7==0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
            #evaluate(sess, model, "test", test_manager, id_to_tag, logger)

Good recognition results require both the samples and the model to do their part, so the machine-generated samples still need some manual correction. The accuracy of the samples can be judged from the scores on the validation and test sets: the higher the score, the more accurate the model's overall predictions.

(Figure: manually correcting the samples)
# Overall model evaluation score
def evaluate(sess, model, name, data, id_to_tag, logger):
    logger.info("evaluate:{}".format(name))
    ner_results = model.evaluate(sess, data, id_to_tag)
    eval_lines = test_ner(ner_results, FLAGS.result_path)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])

    if name == "dev":
        best_test_f1 = model.best_dev_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info("new best dev f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1
    elif name == "test":
        best_test_f1 = model.best_test_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_test_f1, f1).eval()
            logger.info("new best test f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1
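evaluate() relies on the conlleval script: the overall precision/recall line of its report (eval_lines[1]) ends with the F1 value, which is what gets compared against best_dev_f1 / best_test_f1. An illustrative parse (the numbers are made up):

# made-up conlleval summary line; the last field is the F1 score
eval_line = "accuracy:  98.40%; precision:  90.12%; recall:  88.30%; FB1:  89.20"
f1 = float(eval_line.strip().split()[-1])
print(f1)  # 89.2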

Finally, run the semantic recognition program. Its entry point is Chunk(): initialize Chunk(), pass in the sentence to be recognized, and inspect the result. Every recognized term is annotated with its start and end position within the text, and a letter code indicates the term's type:

if __name__ == "__main__":   

    # Initialize Chunk()
    c = Chunk()

    # Chunk() is loaded every time the program runs
    #for line in open('text.txt','r',encoding='utf8'):
    #   print(c.get_text_input(line.strip()))

    s="典型胸痛 因体力活动、情绪激动等诱发,突感心前区疼痛,多为发作性绞痛或压榨痛,也可为憋闷感。疼痛从胸骨后或心前区开始,向上放射至左肩、臂,甚至小指和无名指,休息或含服硝酸甘油可缓解。胸痛放散的部位也可涉及颈部、下颌、牙齿、腹部等。胸痛也可出现在安静状态下或夜间,由冠脉痉挛所致,也称变异型心绞痛。如胸痛性质发生变化,如新近出现的进行性胸痛,痛阈逐步下降,以至稍事体力活动或情绪激动甚至休息或熟睡时亦可发作。疼痛逐渐加剧、变频,持续时间延长,祛除诱因或含服硝酸甘油不能缓解,此时往往怀疑不稳定心绞痛。"
    print(c.get_text_input(s))
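The exact structure of the printed result depends on the project's evaluate_line helper, but conceptually every recognized term carries its text, its start and end offsets in the input and a tag, as in the screenshot below. A made-up fragment of what the output may look like (offsets and tag names are invented):

# purely illustrative output shape, not actual model output
example_result = {
    "string": "典型胸痛 因体力活动、情绪激动等诱发,...",
    "entities": [
        {"word": "胸痛", "start": 2, "end": 4, "type": "SYM"},
        {"word": "硝酸甘油", "start": 66, "end": 70, "type": "DRU"},
    ],
}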
(Figure: the semantic recognition output)