# Source code for cerebras.modelzoo.data_preparation.nlp.bert.parser_utils

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse


def add_common_parser_args(parser):
    """Register the command-line options shared by both pre-processing modes.

    The options are added to ``parser`` in place. ``--metadata_files`` and
    ``--vocab_file`` are required; every other option has a default.

    Args:
        parser: The ``argparse.ArgumentParser`` (or sub-parser) to extend.

    Returns:
        The same ``parser`` object that was passed in.
    """
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment of a multi-line help text must carry its own separating space.
    parser.add_argument(
        "--metadata_files",
        type=str,
        required=True,
        nargs='+',
        help="path to text file containing a list of file names "
        "corresponding to the raw input documents to be "
        "processed and stored; can handle multiple metadata files "
        "separated by space",
    )
    parser.add_argument(
        "--multiple_docs_in_single_file",
        action="store_true",
        help="Pass this flag when a single text file contains multiple"
        " documents separated by <multiple_docs_separator>",
    )
    parser.add_argument(
        "--multiple_docs_separator",
        type=str,
        default="\n",
        help="String which separates multiple documents in a single text file. "
        "If newline character, pass \\n. "
        "There can only be one separator string for all the documents.",
    )
    parser.add_argument(
        "--single_sentence_per_line",
        action="store_true",
        help="Pass this flag when the document is already split into sentences with "
        "one sentence in each line and there is no requirement for "
        "further sentence segmentation of a document ",
    )
    parser.add_argument(
        '--input_files_prefix',
        type=str,
        default="",
        help='prefix to be added to paths of the input files. '
        'For example, can be a directory where raw data is stored '
        'if the paths are relative',
    )
    parser.add_argument(
        "--vocab_file", type=str, required=True, help="path to vocabulary"
    )
    parser.add_argument(
        "--split_num",
        type=int,
        default=1000,
        help="number of input files to read at a given time for processing. "
        "Defaults to 1000.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="pass this flag to lower case the input text; should be "
        "True for uncased models and False for cased models",
    )
    parser.add_argument(
        "--max_seq_length",
        type=int,
        default=128,
        help="maximum sequence length",
    )
    parser.add_argument(
        "--short_seq_prob",
        type=float,
        default=0.1,
        help="probability of creating sequences which are shorter "
        "than the maximum sequence length",
    )
    parser.add_argument(
        "--min_short_seq_length",
        type=int,
        default=None,
        help="The minimum number of tokens to be present in an example "
        "if short sequence probability > 0. "
        "If None, defaults to 2. "
        "Allowed values are [2, max_seq_length - 3)",
    )
    parser.add_argument(
        "--masked_lm_prob",
        type=float,
        default=0.15,
        help="masked LM probability",
    )
    parser.add_argument(
        "--max_predictions_per_seq",
        type=int,
        default=20,
        help="maximum number of masked LM predictions per sequence",
    )
    parser.add_argument(
        "--spacy_model",
        type=str,
        default="en_core_web_sm",
        help="spaCy model to load, i.e. shortcut link, package name or path.",
    )
    parser.add_argument(
        "--mask_whole_word",
        action="store_true",
        help="whether to use whole word masking rather than per-WordPiece "
        "masking.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="directory where HDF5 files will be stored.",
    )
    parser.add_argument(
        "--num_output_files",
        type=int,
        default=10,
        help="number of output files in total i.e each process writes "
        "num_output_files//num_processes number of files. "
        "Defaults to 10.",
    )
    parser.add_argument(
        "--name",
        type=str,
        default="preprocessed_data",
        help="name of the dataset; i.e. prefix to use for hdf5 file names. "
        "Defaults to 'preprocessed_data'.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="random seed. Defaults to 0.",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=0,
        help="Number of parallel processes to use, defaults to cpu count",
    )
    return parser


def add_mlm_only_specific_args(parser):
    """Register the shared options plus those specific to the ``mlm_only`` mode.

    Calls ``add_common_parser_args`` first, then adds the buffering,
    overlap and cross-document options and the hidden ``--__mode`` marker
    (default ``"mlm_only"``).

    Args:
        parser: The ``argparse.ArgumentParser`` (or sub-parser) to extend.

    Returns:
        The parser returned by ``add_common_parser_args`` with the extra
        options added.
    """
    parser = add_common_parser_args(parser)
    parser.add_argument(
        "--overlap_size",
        type=int,
        default=None,
        help="overlap size for generating sequences from buffered data for "
        "mlm only sequences. "
        "Defaults to None, which sets the overlap to max_seq_len/4.",
    )
    parser.add_argument(
        "--buffer_size",
        type=int,
        # argparse applies `type` only to string defaults; a float literal such
        # as 1e6 would reach the caller unconverted, so use an int here.
        default=1000000,
        help="buffer_size number of elements to be processed at a time",
    )
    parser.add_argument(
        "--allow_cross_document_examples",
        action="store_true",
        help="Pass this flag when examples can cross document boundaries",
    )
    parser.add_argument(
        "--document_separator_token",
        type=str,
        default="[SEP]",
        help="If examples can span documents, "
        "use this separator to indicate separate tokens of current and next document",
    )
    # This is a suppressed argument that will not show in --help.
    # Users MUST NOT specify this arg. It is used to switch between the two modes
    # defined to generate HDF5 files.
    parser.add_argument(
        '--__mode', default="mlm_only", help=argparse.SUPPRESS, required=False
    )
    return parser


def add_mlm_nsp_specific_args(parser):
    """Register the shared options plus the hidden ``mlm_nsp`` mode marker.

    Args:
        parser: The ``argparse.ArgumentParser`` (or sub-parser) to extend.

    Returns:
        The parser returned by ``add_common_parser_args`` with the hidden
        ``--__mode`` option (default ``"mlm_nsp"``) added.
    """
    parser = add_common_parser_args(parser)
    # This is a suppressed argument that will not show in --help.
    # Users MUST NOT specify this arg. It is used to switch between the two modes
    # defined to generate HDF5 files.
    parser.add_argument(
        '--__mode', default="mlm_nsp", help=argparse.SUPPRESS, required=False
    )
    return parser


def create_arg_parser():
    """Build the top-level parser with ``mlm_only`` and ``mlm_nsp`` sub-commands.

    Each sub-command receives its options from the matching
    ``add_*_specific_args`` helper.

    NOTE: the sub-command is not marked as required, so argparse accepts a
    command line without one and then returns a namespace that has none of
    the options defined here.

    Returns:
        The configured top-level ``argparse.ArgumentParser``.
    """
    # Sub-parsers do not inherit the parent's formatter_class, and all options
    # live on the sub-parsers, so the formatter has to be passed to each of
    # them for defaults to show up in `<sub-command> --help`.
    formatter = argparse.ArgumentDefaultsHelpFormatter

    parser = argparse.ArgumentParser(formatter_class=formatter)
    subparsers = parser.add_subparsers()

    mlm_subparser = subparsers.add_parser("mlm_only", formatter_class=formatter)
    add_mlm_only_specific_args(mlm_subparser)

    mlm_nsp_subparser = subparsers.add_parser(
        "mlm_nsp", formatter_class=formatter
    )
    add_mlm_nsp_specific_args(mlm_nsp_subparser)

    return parser


def get_parser_args():
    """Parse the process's command-line arguments.

    Builds the parser with ``create_arg_parser`` and calls ``parse_args()``
    without an explicit argument list.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    return create_arg_parser().parse_args()