# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from functools import reduce
import spacy
from tqdm import tqdm
from cerebras.modelzoo.data_preparation.nlp.tokenizers.Tokenization import (
FullTokenizer,
)
from cerebras.modelzoo.data_preparation.utils import (
convert_to_unicode,
create_masked_lm_predictions,
pad_instance_to_max_seq_length,
text_to_tokenized_documents,
)


class MLMOnlyInstance:
"""
A single training MLMOnly instance.
:param list tokens: List of tokens for MLM example
:param list masked_lm_positions: List of masked lm positions for sentence
pair
:param list masked_lm_labels: List of masked lm labels for example
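
    Example (illustrative only; the tokens below are made up)::

        >>> instance = MLMOnlyInstance(
        ...     tokens=["[CLS]", "the", "[MASK]", "sat", "[SEP]"],
        ...     masked_lm_positions=[2],
        ...     masked_lm_labels=["cat"],
        ... )
        >>> print(instance)  # doctest: +SKIP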
"""
def __init__(
self,
tokens,
masked_lm_positions,
masked_lm_labels,
):
self.tokens = tokens
self.masked_lm_labels = masked_lm_labels
self.masked_lm_positions = masked_lm_positions
def __str__(self):
tokens = " ".join([convert_to_unicode(x) for x in self.tokens])
mlm_positions = " ".join([str(x) for x in self.masked_lm_positions])
mlm_labels = " ".join(
[convert_to_unicode(x) for x in self.masked_lm_labels]
)
s = f"MLMOnlyInstance: \n"
s += f"tokens: {tokens}\n"
s += f"masked_lm_positions: {mlm_positions}\n"
s += f"masked_lm_labels: {mlm_labels}\n"
s += "\n"
return s
def __repr__(self):
return self.__str__()


def data_generator(
metadata_files,
vocab_file,
do_lower,
disable_masking,
mask_whole_word,
max_seq_length,
max_predictions_per_seq,
masked_lm_prob,
dupe_factor,
output_type_shapes,
multiple_docs_in_single_file=False,
multiple_docs_separator="\n",
single_sentence_per_line=False,
buffer_size=1e6,
min_short_seq_length=None,
overlap_size=None,
short_seq_prob=0,
spacy_model="en_core_web_sm",
inverted_mask=False,
allow_cross_document_examples=True,
document_separator_token="[SEP]",
seed=None,
input_files_prefix="",
):
"""
Generator function used to create input dataset
for MLM only dataset.
1. Generate raw examples with tokens based on "overlap_size",
"max_sequence_length", "allow_cross_document_examples"
and "document_separator_token" and using a sliding window approach.
The exact steps are detailed in "_create_examples_from_document" function
2. Mask the raw examples based on "max_predictions_per_seq"
3. Pad the masked example to "max_sequence_length" if less that msl
:param str or list[str] metadata_files: A string or strings list each
pointing to a metadata file. A metadata file contains file paths for
flat text cleaned documents. It has one file path per line.
:param str vocab_file: Vocabulary file, to build tokenization from
:param bool do_lower: Boolean value indicating if words should be
converted to lowercase or not
:param bool disable_masking: whether masking should be disabled
:param bool mask_whole_word: If True, all subtokens corresponding to a word
will be masked.
:param int max_seq_length: Maximum length of the sequence to generate
    :param int max_predictions_per_seq: Maximum number of masked tokens
        per sequence.
:param float masked_lm_prob: Proportion of tokens to be masked
:param int dupe_factor: Number of times to duplicate the dataset
with different static masks
:param dict output_type_shapes: Dictionary indicating the shapes of
different outputs
:param bool multiple_docs_in_single_file: True, when a single text
file contains multiple documents separated by <multiple_docs_separator>
:param str multiple_docs_separator: String which separates multiple
documents in a single text file.
    :param bool single_sentence_per_line: True, when the document is already
        split into sentences, with one sentence per line, and there is no
        need for further sentence segmentation of a document.
:param int buffer_size: Number of tokens to be processed at a time
    :param int min_short_seq_length: When short_seq_prob > 0, this number
        indicates the minimum number of tokens that each example should have,
        i.e., the number of tokens (excluding pad) will be in the range
        [min_short_seq_length, MSL].
    :param int overlap_size: Number of tokens that overlap with the previous
        example when processing the buffer with a sliding window approach.
        If None, defaults to max_seq_length / 4.
    :param float short_seq_prob: Probability of a short sequence. Defaults
        to 0. Sometimes we want to use shorter sequences to minimize the
        mismatch between pre-training and fine-tuning.
    :param str spacy_model: spaCy model to load, i.e. shortcut link, package
        name or path. Used to segment text into sentences.
:param bool inverted_mask: If set to False, has 0's on padded positions and
1's elsewhere. Otherwise, "inverts" the mask, so that 1's are on padded
positions and 0's elsewhere.
:param bool allow_cross_document_examples: If True, the sequences can
contain tokens from the next document.
:param str document_separator_token: String to separate tokens from
one document and the next when sequences span documents
:param int seed: Random seed.
    :param str input_files_prefix: Prefix to be added to the paths of the
        input files.
:returns: yields training examples (feature, [])
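
    Example (illustrative sketch; the file names below and
    ``output_type_shapes`` are hypothetical placeholders)::

        gen = data_generator(
            metadata_files="metadata.txt",
            vocab_file="vocab.txt",
            do_lower=True,
            disable_masking=False,
            mask_whole_word=False,
            max_seq_length=128,
            max_predictions_per_seq=20,
            masked_lm_prob=0.15,
            dupe_factor=1,
            output_type_shapes=output_type_shapes,
            seed=0,
        )
        for feature, label in gen:
            ...  # consume one masked, padded example at a time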
"""
    # Set defaults if values passed are None
if overlap_size is None:
overlap_size = int(max_seq_length / 4)
        print(
            f"--- Setting overlap_size to {overlap_size} since a None value "
            f"was passed"
        )
if not allow_cross_document_examples and document_separator_token:
        print(
            f"--- Since examples cannot span documents "
            f"(allow_cross_document_examples: {allow_cross_document_examples}),"
            f" document_separator_token: {document_separator_token} will be ignored"
        )
if min_short_seq_length is None:
min_short_seq_length = 2 + overlap_size
elif (min_short_seq_length < (2 + overlap_size)) or (
min_short_seq_length > max_seq_length - 2
):
        raise ValueError(
            f"The min_short_seq_length param {min_short_seq_length} is invalid.\n"
            f"Allowed values are in the range "
            f"[{2 + overlap_size}, {max_seq_length - 2}]"
        )
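    # Illustrative check (hypothetical numbers): with max_seq_length=128 and
    # overlap_size=32, min_short_seq_length must lie in [34, 126].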
# define tokenizer
tokenizer = FullTokenizer(vocab_file, do_lower)
vocab_words = tokenizer.get_vocab_words()
if do_lower:
document_separator_token = document_separator_token.lower()
    assert (
        document_separator_token in vocab_words
    ), f"document_separator_token: {document_separator_token} not present in vocab file"
rng = random.Random(seed)
# get all text files by reading metadata files
if isinstance(metadata_files, str):
metadata_files = [metadata_files]
input_files = []
for _file in metadata_files:
with open(_file, "r") as _fin:
input_files.extend(_fin.readlines())
    # strip whitespace and drop blank lines from the metadata files
    input_files = [x.strip() for x in input_files if x.strip()]
num_input_files = len(input_files)
rng.shuffle(input_files)
def _generate_train_feature(example):
if disable_masking:
return example
else:
return create_masked_lm_features(
example,
vocab_words,
max_seq_length,
mask_whole_word,
max_predictions_per_seq,
masked_lm_prob,
document_separator_token,
rng,
tokenizer,
output_type_shapes,
inverted_mask,
)
current_buffer_length = 0
buffer_documents = []
    # To speed up processing, load the spaCy model once here and
    # disable the unused pipeline stages.
nlp = spacy.load(spacy_model, disable=['tagger', 'ner'])
for _ in tqdm(range(dupe_factor)):
# Reset buffers
prev_tokens = []
for _file_num, _file in enumerate(input_files):
_fin_path = os.path.abspath(os.path.join(input_files_prefix, _file))
with open(_fin_path, "r") as _fin:
_fin_data = _fin.read()
processed_doc, num_tokens = text_to_tokenized_documents(
_fin_data,
tokenizer,
multiple_docs_in_single_file,
multiple_docs_separator,
single_sentence_per_line,
nlp,
)
# Flatten one level
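            # e.g. a document tokenized as [[sent1_tokens], [sent2_tokens]]
            # becomes sent1_tokens + sent2_tokens (one flat token list
            # per document)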
buffer_documents.extend(
[
reduce(lambda x, y: x + y, doc_list)
for doc_list in processed_doc
]
)
current_buffer_length += num_tokens
# Continue if we don't have enough tokens
if (
current_buffer_length < buffer_size
and _file_num < num_input_files - 1
):
continue
rng.shuffle(buffer_documents)
# When enough tokens available, yield examples
for document_index, document in enumerate(buffer_documents):
_example_generator = _create_examples_from_document(
document,
allow_cross_document_examples,
document_separator_token,
overlap_size,
prev_tokens,
max_seq_length,
short_seq_prob,
min_short_seq_length,
rng,
)
for example, prev_tokens in _example_generator:
if example:
yield _generate_train_feature(example)
            # Reset the buffer and its token count
buffer_documents = []
current_buffer_length = 0
# Last few tokens remaining after processing all input_files
if prev_tokens:
yield _generate_train_feature(["[CLS]"] + prev_tokens + ["[SEP]"])


def create_masked_lm_features(
example,
vocab_words,
max_seq_length,
mask_whole_word,
max_predictions_per_seq,
masked_lm_prob,
document_separator_token,
rng,
tokenizer,
output_type_shapes,
inverted_mask,
):
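    """
    Create a masked LM feature from a raw tokenized example.

    Masks the example via ``create_masked_lm_predictions`` (the "[CLS]",
    "[SEP]" and ``document_separator_token`` tokens are excluded from
    masking), wraps the result in an ``MLMOnlyInstance``, and pads it to
    ``max_seq_length``.

    :returns: tuple ``(feature, label)`` for the masked, padded example.
    """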
exclude_from_masking = list(
set(["[CLS]", "[SEP]", document_separator_token])
)
(
masked_example_tokens,
masked_lm_positions,
masked_lm_labels,
) = create_masked_lm_predictions(
example,
vocab_words,
mask_whole_word,
max_predictions_per_seq,
masked_lm_prob,
rng,
exclude_from_masking,
)
masked_lm_instance = MLMOnlyInstance(
tokens=masked_example_tokens,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels,
)
# pad to MSL
feature, label = pad_instance_to_max_seq_length(
instance=masked_lm_instance,
mlm_only=True,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
max_predictions_per_seq=max_predictions_per_seq,
output_type_shapes=output_type_shapes,
inverted_mask=inverted_mask,
)
return feature, label


def _create_examples_from_document(
document,
allow_cross_document_examples,
document_separator_token,
overlap_size,
prev_tokens,
max_seq_length,
short_seq_prob,
min_short_seq_length,
rng,
):
    # Process for generating an example:
    # 1. The text from metadata files is read and accumulated in a buffer
    #    until the buffer_size limit is hit. Note that documents are always
    #    read entirely before the buffer_size limit is checked.
    # 2. Next, reading one document at a time from the buffer, we slide a
    #    window of size "max_sequence_length" over it and construct an
    #    example from each window.
    # 3. If "overlap_size" is set, then when generating the next example,
    #    the window slides back by "overlap_size" before the next example
    #    is constructed.
    # 4. If an example can span multiple documents, i.e.
    #    "allow_cross_document_examples" is set to True, then we use
    #    "document_separator_token" to separate the tokens of the two
    #    documents, i.e.
    #    [CLS] <tokens-doc1> <document_separator_token> <tokens-doc2> [SEP]
    # 5. The last remaining tokens are used to construct the final example;
    #    this example will usually have fewer tokens than
    #    "max_sequence_length" and will be padded to "max_sequence_length".
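    # Illustrative walk-through (hypothetical numbers): with max_seq_length=10
    # and overlap_size=2, max_num_tokens is 8, so the first window covers
    # document[0:8]; the next window starts at index 6 (= 7 - 2 + 1),
    # re-using the last two tokens of the previous example.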
max_num_tokens = max_seq_length - 2
start_idx = 0
# We usually want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally not
# needed. However, we sometimes (i.e., short_seq_prob = 0.1 == 10% of
# the time) want to use shorter sequences to minimize the mismatch
# between pre-training and fine-tuning. The `target_seq_len` is just a
# rough target however, whereas `max_seq_length` is a hard limit
target_seq_len = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_len = rng.randint(min_short_seq_length, max_num_tokens)
    assert (
        len(prev_tokens) <= max_seq_length
    ), "Number of leftover tokens, i.e. len(prev_tokens), exceeds max_seq_length"
    # NOTE: prev_tokens cannot be longer than MSL.
    # Basically, we use a sliding window to construct examples, and
    # "overlap_size" is the amount by which we push the window back.
    # "buffer_size", on the other hand, lets us process more than one
    # document at a time when individual documents contain fewer tokens.
    # So, at any one time, we process up to "buffer_size" tokens.
if prev_tokens:
document = prev_tokens + [document_separator_token] + document
prev_tokens = []
start_idx = 0 # inclusive of this element
end_idx = start_idx + target_seq_len - 1 # inclusive of this element
while end_idx < len(document):
example = document[
start_idx : end_idx + 1
] # All elements from start_idx to end_idx (inclusive)
        # add special tokens marking input start and end
        assert (
            len(example) > overlap_size
        ), f"Length of example {len(example)} not greater than overlap_size {overlap_size}"
        assert (
            len(example) <= max_num_tokens
        ), f"Length of example {len(example)} greater than max_num_tokens {max_num_tokens}"
example.insert(0, "[CLS]")
example.append("[SEP]")
yield example, prev_tokens
start_idx = end_idx - overlap_size + 1
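        # e.g. (hypothetical numbers) with overlap_size=4, if the previous
        # window ended at index 9, the next one starts at index 6, so the
        # new example re-uses the last four tokens (indices 6-9)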
        # Recalculate target_seq_len for the next window
if rng.random() < short_seq_prob:
target_seq_len = rng.randint(min_short_seq_length, max_num_tokens)
end_idx = start_idx + target_seq_len - 1
    assert (
        end_idx > 0
    ), f"When generating example, end_idx {end_idx} is not positive."
    assert (
        start_idx >= 0
    ), f"When generating example, start_idx {start_idx} is less than zero."
if allow_cross_document_examples:
example = []
prev_tokens = document[start_idx:]
else:
example = ["[CLS]"] + document[start_idx:] + ["[SEP]"]
prev_tokens = []
yield example, prev_tokens