Two Ways to Prepare Input Data for BERT
The TFRecord format
TFRecord uses the Protocol Buffers binary encoding internally. A TFRecord file is a single binary blob that can be loaded in one pass, which makes it simple, fast, and especially friendly to large training sets. When the training data is very large, it can also be split across multiple TFRecord files to improve processing efficiency, as sketched below.
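For instance, tf.data.TFRecordDataset accepts a list of shard files directly; a minimal sketch, where the shard file names are made up for illustration:

import tensorflow as tf

# Hypothetical shard names; any list (or glob result) of TFRecord files works.
shard_files = ["train-00000.tfrecord", "train-00001.tfrecord"]
d = tf.data.TFRecordDataset(shard_files)
# Shuffle records across the shards before batching.
d = d.shuffle(buffer_size=100)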
Writing a TFRecord file
Writing uses the TFRecordWriter together with the tf.train.Example message.
writer = tf.python_io.TFRecordWriter(output_file)
tf_example = tf.train.Example(
    features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()
The writer above is the TFRecord writer; calling writer.write(tf_example.SerializeToString()) writes each serialized example into the TFRecord file. tf_example.SerializeToString() serializes the feature map inside the Example into a compact binary string, which saves space.
The Example protocol is defined as follows:
message Example {
  Features features = 1;
};
message Features {
  map<string, Feature> feature = 1;
};
tf.train.Features(feature=None)
Here feature is a dictionary: the key is the name to store the data under, and the value is the data to store, which must be a valid tf.train.Feature instance.
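Putting these pieces together, a minimal, self-contained write sketch might look like the following; the output path and feature values are made up for illustration:

import tensorflow as tf

output_file = "demo.tfrecord"  # hypothetical path
writer = tf.python_io.TFRecordWriter(output_file)

# The feature dict: key = name to store under, value = a tf.train.Feature.
features = {
    "input_ids": tf.train.Feature(
        int64_list=tf.train.Int64List(value=[101, 2023, 102])),
    "label_ids": tf.train.Feature(
        int64_list=tf.train.Int64List(value=[1])),
}
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()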
Reading a TFRecord file
- Create a TFRecordDataset from the TFRecord file.
- Parse each serialized tf.train.Example with tf.parse_single_example, passing a name_to_features schema, for example:

name_to_features = {
    "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
    "is_real_example": tf.FixedLenFeature([], tf.int64),
}
d = tf.data.TFRecordDataset(input_file)
example = tf.parse_single_example(record, name_to_features)
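As a rough, self-contained sketch of the read path, assuming the demo.tfrecord file written above (in the real code this logic lives inside file_based_input_fn_builder):

import tensorflow as tf

seq_length = 3  # must match what was written
name_to_features = {
    "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
}

def _parse(record):
    # Decode one serialized tf.train.Example into a dict of tensors.
    return tf.parse_single_example(record, name_to_features)

d = tf.data.TFRecordDataset("demo.tfrecord")  # hypothetical file from the write sketch
d = d.map(_parse).batch(2)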
Method 1: TFRecord files
This is the approach used in the training script run_classifier.py: the data files are first converted and saved as TFRecord files, and at run time the examples are read and decoded back from those TFRecord files.
- Convert the input text into InputExample objects.
Call: predict_examples = get_test_examples(test_file)
Function implementation:
def get_test_examples(data_file):
    """See base class."""
    # file_path = os.path.join(data_dir, 'test_1.csv')
    examples = []
    with open(data_file, encoding='utf-8') as f:
        reader = f.readlines()
        for i, line in enumerate(reader):
            guid = "train-%d" % (i)
            split_line = line.strip().split(",")
            text_a = tokenization.convert_to_unicode(split_line[1])
            text_b = None
            # text_b = tokenization.convert_to_unicode(split_line[2])
            # label = tokenization.convert_to_unicode(line[2])
            label = str(split_line[0])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
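From the parsing above, each line of the data file is assumed to look like "label,text". A small usage sketch (the file name and contents are hypothetical):

# test.csv (assumed format, one example per line):
# 1,this movie was great
# 0,terrible acting and plot
predict_examples = get_test_examples("test.csv")
print(predict_examples[0].guid, predict_examples[0].label)

Note that line.strip().split(",") keeps only the field after the first comma as text_a, so any text following a second comma in the line is dropped.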
- Convert the examples into features and write them to a TFRecord file.
The call is:
# Convert predict_examples into InputFeatures and write them to the TFRecord file
file_based_convert_examples_to_features(predict_examples, label_list,
                                         FLAGS.max_seq_length, tokenizer,
                                         predict_file)
Function definition:
# Convert all examples to InputFeatures, then write them to the TFRecord file
def file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file."""
    writer = tf.python_io.TFRecordWriter(output_file)
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(
                int64_list=tf.train.Int64List(value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature([feature.label_id])
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])
        tf_example = tf.train.Example(
            features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
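A quick way to sanity-check the written file is to count the serialized records, for example with tf.python_io.tf_record_iterator. A rough sketch, reusing predict_file and predict_examples from above:

# Count records in the TFRecord file and compare against the example count.
num_records = sum(1 for _ in tf.python_io.tf_record_iterator(predict_file))
print("wrote %d records" % num_records)
assert num_records == len(predict_examples)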
- Build an input_fn from the data to pass to the Estimator.
Purpose: read and decode the data back from the TFRecord file and return batch_size inputs to feed into the Estimator.
Call (predict_file is the TFRecord file produced in the previous step):

# Build an input_fn from predict_file to pass to the Estimator
predict_input_fn = file_based_input_fn_builder(
    input_file=predict_file,
    seq_length=FLAGS.max_seq_length,
    is_training=False,
    drop_remainder=predict_drop_remainder)
Implementation:
# Read/decode from the TFRecord (test) file and return batch_size inputs
def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
    """Creates an `input_fn` closure to be passed to TPUEstimator."""
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)
        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        return example

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]
        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))
        return d

    return input_fn
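The resulting closure is then handed to the Estimator, for example at prediction time. A minimal sketch, assuming the estimator has already been built as in run_classifier.py, whose predict mode emits per-class probabilities:

# Feed the input_fn to the Estimator and iterate over predictions.
result = estimator.predict(input_fn=predict_input_fn)
for prediction in result:
    probabilities = prediction["probabilities"]
    # ... post-process or write out the per-class probabilities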
Method 2: tf.data.Dataset
- Convert the examples into features
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        feature = convert_single_example(ex_index, example, label_list,
                                         max_seq_length, tokenizer)
        features.append(feature)
    return features
- Build an input_fn from the features, this time handled directly with tf.data.Dataset (a usage sketch follows the code).

def input_fn_builder(features, seq_length, is_training, drop_remainder):
    """Creates an `input_fn` closure to be passed to TPUEstimator."""
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    all_label_ids = []
    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_input_mask.append(feature.input_mask)
        all_segment_ids.append(feature.segment_ids)
        all_label_ids.append(feature.label_id)

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]
        num_examples = len(features)
        # This is for demo purposes and does NOT scale to large data sets. We do
        # not use Dataset.from_generator() because that uses tf.py_func which is
        # not TPU compatible. The right way to load data is with TFRecordReader.
        d = tf.data.Dataset.from_tensor_slices({
            "input_ids":
                tf.constant(all_input_ids, shape=[num_examples, seq_length],
                            dtype=tf.int32),
            "input_mask":
                tf.constant(all_input_mask, shape=[num_examples, seq_length],
                            dtype=tf.int32),
            "segment_ids":
                tf.constant(all_segment_ids, shape=[num_examples, seq_length],
                            dtype=tf.int32),
            "label_ids":
                tf.constant(all_label_ids, shape=[num_examples],
                            dtype=tf.int32),
        })
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)
        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
        return d

    return input_fn
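Usage mirrors the TFRecord path, except that the features stay in memory instead of being written to disk. A sketch, assuming predict_examples, label_list, tokenizer, and estimator from the previous section:

# Convert examples to in-memory features, then wrap them in an input_fn.
features = convert_examples_to_features(
    predict_examples, label_list, FLAGS.max_seq_length, tokenizer)
predict_input_fn = input_fn_builder(
    features=features,
    seq_length=FLAGS.max_seq_length,
    is_training=False,
    drop_remainder=False)
result = estimator.predict(input_fn=predict_input_fn)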
Converting a single example into InputFeatures (used by both methods)
# Convert a single example into InputFeatures
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
    if tokens_b:
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    label_id = label_map[example.label]
    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        is_real_example=True)
    return feature
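For a single-sentence example, the resulting layout is [CLS] tokens_a [SEP] followed by zero padding. A toy call might look like the following; the vocab path and text are made up, and the exact ids depend on the vocabulary:

# Hypothetical vocab path; tokenization.FullTokenizer is BERT's standard tokenizer.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
example = InputExample(guid="demo-0", text_a="hello world",
                       text_b=None, label="0")
feature = convert_single_example(0, example, label_list=["0", "1"],
                                 max_seq_length=8, tokenizer=tokenizer)
print(feature.input_ids)    # ids for [CLS] hello world [SEP] plus padding zeros
print(feature.input_mask)   # e.g. [1, 1, 1, 1, 0, 0, 0, 0]
print(feature.segment_ids)  # all zeros for a single sentence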