# Source code for cerebras.modelzoo.data_preparation.nlp.bert.dynamic_processor

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random

import spacy
from tqdm import tqdm

from cerebras.modelzoo.data_preparation.nlp.tokenizers.Tokenization import (
    FullTokenizer,
)
from cerebras.modelzoo.data_preparation.utils import (
    convert_to_unicode,
    text_to_tokenized_documents,
)


class PreprocessInstance:
    """
    A single training (sentence-pair) instance.

    :param list tokens: List of tokens for sentence pair
    :param list segment_ids: List of segment ids for sentence pair
    :param bool is_random_next: Specifies whether the second element in the
        pair is random. Stored as the integer label 1 (truthy) or 0 (falsy).
    """

    def __init__(
        self,
        tokens,
        segment_ids,
        is_random_next,
    ):
        self.tokens = tokens
        self.segment_ids = segment_ids
        # Normalize any truthy/falsy value to an integer label.
        self.is_random_next = 1 if is_random_next else 0
        # Dictionary view built once at construction time. It references the
        # same `tokens` / `segment_ids` objects as the attributes above, but
        # rebinding an attribute afterwards does not update this dictionary.
        self.dict = {
            "tokens": self.tokens,
            "segment_ids": self.segment_ids,
            "is_random_next": self.is_random_next,
        }

    def __str__(self):
        """Return a multi-line, human readable dump of the instance."""
        tokens = " ".join(convert_to_unicode(x) for x in self.tokens)
        segment_ids = " ".join(str(x) for x in self.segment_ids)

        return (
            f"tokens: {tokens}\n"
            f"segment_ids: {segment_ids}\n"
            f"is_random_next: {self.is_random_next}\n"
            "\n"
        )

    def to_dict(self):
        """Return the dictionary built in ``__init__`` (not a copy)."""
        return self.dict

    def __repr__(self):
        return self.__str__()


def data_generator(
    metadata_files,
    vocab_file,
    do_lower,
    split_num,
    max_seq_length,
    short_seq_prob,
    mask_whole_word,
    max_predictions_per_seq,
    masked_lm_prob,
    dupe_factor,
    output_type_shapes,
    min_short_seq_length=None,
    multiple_docs_in_single_file=False,
    multiple_docs_separator="\n",
    single_sentence_per_line=False,
    inverted_mask=False,
    seed=None,
    spacy_model="en_core_web_sm",
    input_files_prefix="",
    sop_labels=False,
):
    """
    Generator function used to create input dataset
    for MLM + NSP dataset.

    1. Generate raw examples by concatenating two parts
    'tokens-a' and 'tokens-b' as follows:
    [CLS] <tokens-a> [SEP] <tokens-b> [SEP]
    where :
        tokens-a: list of tokens taken from the
        current document and of random length (less than msl).

        tokens-b: list of tokens chosen based on the
        randomly set "next_sentence_labels" and of
        length msl-len(<tokens-a>)- 3 (to account for [CLS] and [SEP] tokens)

    If "next_sentence_labels" is 1, (set to 1 with 0.5 probability),
        tokens-b are list of tokens from sentences chosen randomly
        from different document
    else,
        tokens-b are list of tokens taken from the same document
        and is a continuation of tokens-a in the document
    The number of raw tokens depends on "short_seq_prob" as well

    :param str or list[str] metadata_files: A string or strings list each
        pointing to a metadata file. A metadata file contains file paths for
        flat text cleaned documents. It has one file path per line. Blank
        lines are ignored.
    :param str vocab_file: Vocabulary file, to build tokenization from
    :param bool do_lower: Boolean value indicating if words should be
        converted to lowercase or not
    :param int split_num: Number of input files to read at a given
        time for processing. A value <= 0 means all input files are
        processed in a single pass.
    :param int max_seq_length: Maximum length of the sequence to generate
    :param float short_seq_prob: Probability of a short sequence.
        Sometimes we want to use shorter sequences to minimize the mismatch
        between pre-training and fine-tuning.
    :param bool mask_whole_word: If True, all subtokens corresponding to a word
        will be masked. Forwarded to
        `_create_sentence_instances_from_document`.
    :param int max_predictions_per_seq: Maximum number of Masked tokens
        in a sequence. Forwarded to
        `_create_sentence_instances_from_document`.
    :param float masked_lm_prob: Proportion of tokens to be masked. Forwarded
        to `_create_sentence_instances_from_document`.
    :param int dupe_factor: Number of times each batch of documents is
        re-processed into instances.
    :param dict output_type_shapes: Dictionary indicating the shapes of
        different outputs. Not referenced by this function.
    :param int min_short_seq_length: When short_seq_prob > 0, this number
        indicates the least number of tokens that each example should have i.e
        the num_tokens (excluding pad) would be in the range
        [min_short_seq_length, MSL]. Defaults to 2 when None. Must lie in
        [2, max_seq_length - 3] (both ends inclusive).
    :param bool multiple_docs_in_single_file: True, when a single text file
        contains multiple documents separated by <multiple_docs_separator>
    :param str multiple_docs_separator: String which separates
        multiple documents in a single text file.
    :param bool single_sentence_per_line: True, when the document is already
        split into sentences with one sentence in each line and there is
        no requirement for further sentence segmentation of a document
    :param bool inverted_mask: If set to False, has 0's on padded positions and
        1's elsewhere. Otherwise, "inverts" the mask, so that 1's are on padded
        positions and 0's elsewhere. Not referenced by this function.
    :param int seed: Random seed.
    :param spacy_model: spaCy model to load, i.e. shortcut
        link, package name or path. Used to segment text into sentences.
    :param str input_files_prefix: Prefix to be added to paths of the input
        files.
    :param bool sop_labels: If true, negative examples of the dataset will be two
        consecutive sentences in reversed order. Otherwise, uses regular (NSP)
        labels (where negative examples are from different documents).

    :returns: yields `PreprocessInstance` objects, each carrying the tokens,
        segment ids and the `is_random_next` label of one sentence pair.
    :raises ValueError: If `min_short_seq_length` is outside
        [2, max_seq_length - 3].
    """
    if min_short_seq_length is None:
        min_short_seq_length = 2

    elif (min_short_seq_length < 2) or (
        min_short_seq_length > max_seq_length - 3
    ):
        # The upper bound is inclusive: max_seq_length - 3 itself is accepted.
        raise ValueError(
            f"The min_short_seq_len param {min_short_seq_length} is invalid.\n"
            f"Allowed values are [2, {max_seq_length - 3}]"
        )

    # define tokenizer
    vocab_file = os.path.abspath(vocab_file)
    tokenizer = FullTokenizer(vocab_file, do_lower)
    vocab_words = tokenizer.get_vocab_words()

    rng = random.Random(seed)

    # get all text files by reading metadata files
    if isinstance(metadata_files, str):
        metadata_files = [metadata_files]

    input_files = []
    for _file in metadata_files:
        with open(_file, "r") as _fin:
            for _line in _fin:
                _path = _line.strip()
                # Skip blank lines: an empty path would otherwise resolve to
                # the prefix directory itself and fail when opened.
                if _path:
                    input_files.append(_path)
    rng.shuffle(input_files)

    if not input_files:
        # Nothing to process. Returning here also avoids `range()` being
        # called with a zero step when split_num <= 0.
        return

    split_num = len(input_files) if split_num <= 0 else split_num

    # for better performance load spacy model once here
    nlp = spacy.load(spacy_model)
    for i in range(0, len(input_files), split_num):
        current_input_files = input_files[i : i + split_num]

        all_documents = []
        for _file in tqdm(current_input_files):
            _fin_path = os.path.abspath(os.path.join(input_files_prefix, _file))
            with open(_fin_path, "r") as _fin:
                _fin_data = _fin.read()
            processed_doc, _ = text_to_tokenized_documents(
                _fin_data,
                tokenizer,
                multiple_docs_in_single_file,
                multiple_docs_separator,
                single_sentence_per_line,
                nlp,
            )
            all_documents.extend(processed_doc)

        rng.shuffle(all_documents)

        # create a set of instances to process further
        # repeat this process `dupe_factor` times
        # get a list of PreprocessInstance objects
        instances = []
        for _ in range(dupe_factor):
            for document_index in range(len(all_documents)):
                instances.extend(
                    _create_sentence_instances_from_document(
                        all_documents,
                        document_index,
                        vocab_words,
                        max_seq_length,
                        short_seq_prob,
                        min_short_seq_length,
                        mask_whole_word,
                        max_predictions_per_seq,
                        masked_lm_prob,
                        rng,
                        sop_labels,
                    )
                )
        rng.shuffle(instances)

        yield from instances


def _create_sentence_instances_from_document(
    all_documents,
    document_index,
    vocab_words,
    max_seq_length,
    short_seq_prob,
    min_short_seq_length,
    mask_whole_word,
    max_predictions_per_seq,
    masked_lm_prob,
    rng,
    sop_labels=False,
):
    """
    Build sentence-pair instances from one tokenized document.

    :param list all_documents: List of documents; each document is a list
        of lines and each line is a list of tokens.
    :param int document_index: Index (into `all_documents`) of the document
        to process.
    :param list vocab_words: Accepted for interface compatibility; not
        referenced in this function.
    :param int max_seq_length: Hard limit on the instance length, including
        the [CLS] and two [SEP] tokens.
    :param float short_seq_prob: Probability of drawing a shorter target
        length for this document.
    :param int min_short_seq_length: Lower bound of the randomly drawn
        shorter target length.
    :param mask_whole_word: Accepted for interface compatibility; not
        referenced in this function.
    :param max_predictions_per_seq: Accepted for interface compatibility;
        not referenced in this function.
    :param masked_lm_prob: Accepted for interface compatibility; not
        referenced in this function.
    :param rng: Random number generator providing `random()` and
        `randint(a, b)`.
    :param bool sop_labels: If true, negative examples of the dataset will be
        two consecutive sentences in reversed order. Otherwise, uses regular
        (NSP) labels (where negative examples are from different documents).
    :returns: List of PreprocessInstance objects
    """

    def _merge(segments):
        # Concatenate several token lists into one brand new flat list.
        return [token for segment in segments for token in segment]

    # A document looks like [[line1 tokens], [line2 tokens], ...].
    document = all_documents[document_index]
    last_line = len(document) - 1

    # Three positions are reserved for [CLS], [SEP], [SEP].
    token_budget = max_seq_length - 3

    # Most of the time the whole budget is the target, since everything is
    # padded to `max_seq_length` anyway. With probability `short_seq_prob`
    # a shorter target is drawn to reduce the mismatch between pre-training
    # and fine-tuning. The target is only a rough goal; `max_seq_length`
    # remains the hard limit.
    if rng.random() < short_seq_prob:
        target_seq_len = rng.randint(min_short_seq_length, token_budget)
    else:
        target_seq_len = token_budget

    # Lines are accumulated until the target is reached and then split into
    # parts `A` and `B` on line boundaries. Splitting at an arbitrary token
    # position would make the NSP task too easy.
    produced = []
    pending = []
    pending_tokens = 0
    cursor = 0

    while cursor < len(document):
        sentence = document[cursor]
        pending.append(sentence)
        pending_tokens += len(sentence)

        chunk_complete = (
            cursor == last_line or pending_tokens >= target_seq_len
        )
        if not chunk_complete:
            # keep accumulating lines
            cursor += 1
            continue

        # `split_at` is the number of pending lines that go into part `A`.
        num_segments = len(pending)
        if num_segments >= 2:
            split_at = rng.randint(1, num_segments - 1)
        else:
            split_at = 1
        first_part = _merge(pending[:split_at])

        # Decide how part `B` is obtained. A single pending line always
        # falls back to a random document; otherwise one coin flip picks
        # between a negative and a positive example.
        if num_segments == 1:
            pairing = "random_document"
        elif rng.random() < 0.5:
            pairing = "swapped" if sop_labels else "random_document"
        else:
            pairing = "actual_next"

        if pairing == "random_document":
            is_random_next = True
            wanted_b_length = target_seq_len - len(first_part)

            # For large corpora the first draw is almost always a different
            # document; retry a bounded number of times to avoid picking the
            # document currently being processed.
            for _ in range(10):
                other_index = rng.randint(0, len(all_documents) - 1)
                if other_index != document_index:
                    break

            other_document = all_documents[other_index]
            start_line = rng.randint(0, len(other_document) - 1)

            second_part = []
            for line_index in range(start_line, len(other_document)):
                second_part.extend(other_document[line_index])
                if len(second_part) >= wanted_b_length:
                    break

            # The pending lines after `split_at` were not consumed; rewind
            # the cursor so they are reused by the next chunk.
            cursor -= num_segments - split_at
            tokens_a, tokens_b = first_part, second_part
        elif pairing == "swapped":
            # From https://github.com/google-research/albert/blob/a36e095d3066934a30c7e2a816b2eeb3480e9b87/create_pretraining_data.py#L338
            # Negative example: the two consecutive parts in reversed order.
            is_random_next = True
            tokens_a, tokens_b = _merge(pending[split_at:]), first_part
        else:
            # Positive example: `B` is the actual continuation of `A`.
            is_random_next = False
            tokens_a, tokens_b = first_part, _merge(pending[split_at:])

        # Trim (in place) so both parts together fit into the budget.
        _truncate_seq_pair(tokens_a, tokens_b, token_budget, rng)

        assert len(tokens_a) >= 1
        assert len(tokens_b) >= 1

        # Layout: [CLS] A [SEP] B [SEP]; segment id 0 covers [CLS], A and
        # the first [SEP], segment id 1 covers B and the final [SEP].
        tokens = ["[CLS]", *tokens_a, "[SEP]", *tokens_b, "[SEP]"]
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

        produced.append(
            PreprocessInstance(
                tokens=tokens,
                segment_ids=segment_ids,
                is_random_next=is_random_next,
            )
        )

        # start a fresh chunk and advance to the next line
        pending = []
        pending_tokens = 0
        cursor += 1

    return produced


def _truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
    """
    Truncate a pair of tokens so that their total length is lesser than
    defined maximum number of tokens
    :param list tokens_a: First list of tokens in sequence pair
    :param list tokens_b: Second list of tokens in sequence pair
    :param int max_num_tokens: Maximum number of tokens for the length of
    sequence pair tokens
    """

    total_length = len(tokens_a) + len(tokens_b)
    while total_length > max_num_tokens:
        # find the correct list to truncate this iteration of the loop
        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert (len(trunc_tokens)) >= 1

        # check whether to remove from front or rear
        if rng.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()

        # recompute lengths again after deletion of token
        total_length = len(tokens_a) + len(tokens_b)