Source code for cerebras.modelzoo.common.run_eleuther_eval_harness

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Eval Harness run script."""

import argparse
import logging
import sys
from copy import deepcopy
from warnings import warn

# isort: off
import os

sys.path.append(os.path.join(os.path.dirname(__file__), "../../.."))
# isort: on
from cerebras.modelzoo.common.utils.run.cli_parser import get_params_from_args
from cerebras.modelzoo.common.utils.run.utils import DeviceType
from cerebras.modelzoo.trainer.extensions.eleuther.eval_harness_utils import (
    SUPPORTED_MODELS,
)
from cerebras.modelzoo.trainer.utils import (
    configure_trainer_from_params,
    convert_legacy_params_to_trainer_params,
    inject_cli_args_to_trainer_params,
    is_legacy_params,
)


def eeh_parser():
    parser = argparse.ArgumentParser(
        "Script for running Eleuther Eval Harness for GPT style models",
        add_help=False,
    )
    optional_arguments = parser.add_argument_group(
        "Eleuther Eval Harness Arguments"
    )
    # EEH-SPECIFIC ARGS
    # Ref: https://github.com/EleutherAI/lm-evaluation-harness/blob/c9bbec6e7de418b9082379da82797522eb173054/lm_eval/__main__.py#L26
    optional_arguments.add_argument(
        "--tasks",
        "-t",
        default=None,
        type=str,
        metavar="task1,task2",
        help="Comma-separated list of tasks to evaluate. To get the full "
        "list of tasks, use the command `lm-eval --tasks list`.",
    )
    optional_arguments.add_argument(
        "--num_fewshot",
        "-f",
        type=int,
        default=None,
        metavar="N",
        help="Number of examples in the few-shot context.",
    )
    optional_arguments.add_argument(
        "--output_path",
        default=None,
        type=str,
        metavar="DIR|DIR/file.json",
        help="Path where the result metrics will be saved. If the path is a "
        "directory and log_samples is true, the results will be saved in "
        "that directory. Otherwise, the parent directory will be used.",
    )
    optional_arguments.add_argument(
        "--limit",
        "-L",
        type=float,
        default=None,
        metavar="N|0<N<1",
        help="Limit the number of examples per task. "
        "If <1, the limit is a fraction of the total number of examples.",
    )
    optional_arguments.add_argument(
        "--use_cache",
        type=str,
        default=None,
        metavar="DIR",
        help="Path to a sqlite db file for caching model responses. "
        "`None` if not caching.",
    )
    optional_arguments.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset "
        "requests. `None` if not caching.",
    )
    optional_arguments.add_argument(
        "--check_integrity",
        action="store_true",
        default=None,
        help="Whether to run the relevant part of the test suite for the tasks.",
    )
    optional_arguments.add_argument(
        "--write_out",
        "-w",
        action="store_true",
        default=None,
        help="Print the prompt for the first few documents.",
    )
    optional_arguments.add_argument(
        "--log_samples",
        "-s",
        action="store_true",
        default=None,
        help="If True, write out all model outputs and documents for "
        "per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    optional_arguments.add_argument(
        "--show_config",
        action="store_true",
        default=None,
        help="If True, show the full config of all tasks at the end of "
        "the evaluation.",
    )
    optional_arguments.add_argument(
        "--include_path",
        type=str,
        default=None,
        metavar="DIR",
        help="Additional path to include if there are external tasks to include.",
    )
    optional_arguments.add_argument(
        "--predict_only",
        "-x",
        action="store_true",
        default=None,
        help="Use with --log_samples. Only model outputs will be saved; "
        "metrics will not be evaluated.",
    )
    optional_arguments.add_argument(
        "--seed",
        default=None,
        help=(
            "Set the seed for python's random, numpy and torch.\n"
            "Accepts a comma-separated list of 3 values for python's random, "
            "numpy, and torch seeds, respectively, or a single integer to "
            "set the same seed for all three.\n"
            "Each value is either an integer or 'None' to not set that seed. "
            "Default is `0,1234,1234` (for backward compatibility).\n"
            "E.g. `--seed 0,None,8` sets `random.seed(0)` and "
            "`torch.manual_seed(8)`; numpy's seed is not set since the "
            "second value is `None`.\n"
            "E.g. `--seed 42` sets all three seeds to 42."
        ),
    )
    optional_arguments.add_argument(
        "--trust_remote_code",
        default=None,
        action="store_true",
        help="Sets trust_remote_code to True, allowing code execution when "
        "creating HF Datasets from the Hub.",
    )
    # NONGREEDY SAMPLING ARGS FOR GENERATIVE TASKS
    optional_arguments.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Sampling temperature used for generation.",
    )
    optional_arguments.add_argument(
        "--top_p",
        type=float,
        default=None,
        help="Top-p parameter used for nucleus sampling.",
    )
    optional_arguments.add_argument(
        "--top_k",
        type=int,
        default=None,
        help="Top-k parameter used for generation.",
    )
    # CEREBRAS-SPECIFIC ARGS
    optional_arguments.add_argument(
        "--keep_data_dir",
        action="store_true",
        default=False,
        help=(
            "Specifies whether dumped data samples should be kept for reuse. "
            "Defaults to False, i.e. data samples are deleted after the run."
        ),
    )
    return parser
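
# A minimal usage sketch (comment only, not executed; the argv values below
# are hypothetical). In the real flow this parser is composed with the
# Cerebras CLI parser via `get_params_from_args`, but the Eleuther-specific
# flags can be exercised standalone like so:
#
#     parser = eeh_parser()
#     args, _ = parser.parse_known_args(
#         ["--tasks", "hellaswag,winogrande", "--num_fewshot", "0"]
#     )
#     assert args.tasks == "hellaswag,winogrande" and args.num_fewshot == 0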


def run_eval_harness():
    """Main run script."""
    parser_fn = lambda: [eeh_parser()]
    parser_args = {
        "parser_epilog": (
            "Please run 'python run_eleuther_eval_harness.py CSX -h'. \n \n"
            "Here is an example command for running on CSX: \n \n"
            "    python run_eleuther_eval_harness.py CSX --params /path/to/params "
            "--checkpoint_path /path/to/checkpoint --tasks 'hellaswag,winogrande' "
            "--num_fewshot 0 \n \n"
            "Note that Eval Harness is currently only supported for device CSX"
        ),
        "csx_parser_epilog": (
            "To see a complete list of all available arguments, \n"
            "please run 'python run_eleuther_eval_harness.py CSX -h'. \n\n"
            "Here is an example command for running with CSX: \n \n"
            "    python run_eleuther_eval_harness.py CSX --params /path/to/params "
            "--checkpoint_path /path/to/checkpoint --tasks 'hellaswag,winogrande' "
            "--num_fewshot 0 \n \n"
            "Eval Harness resides in the Cerebras Model Zoo. Please specify "
            "--python_paths and \n--mount_dirs here or in your params.yaml "
            "under the 'runconfig' section with \nthe path to the directory "
            "in which the Cerebras Model Zoo resides. \n"
        ),
        "modes": ["eval"],
    }

    # Parse args
    params = get_params_from_args(
        argv=sys.argv[1:],
        extra_args_parser_fn=parser_fn,
        device_type=DeviceType.CSX,
        **parser_args,
    )
    runconfig_params = params["runconfig"]

    # Partition the parsed CLI args into Eleuther-specific args and
    # Cerebras-specific args
    parser = parser_fn()[0]
    eeh_args = {}
    other_eeh_args = {}
    for arg in parser._action_groups[0]._actions:
        arg_name = arg.dest
        # Exclude Cerebras-specific args
        if arg_name in {"keep_data_dir"}:
            other_eeh_args[arg_name] = runconfig_params.pop(arg_name, None)
        elif arg_name in runconfig_params:
            arg_val = runconfig_params.pop(arg_name, None)
            if arg_val is not None:
                # Only consider explicitly specified CLI args
                eeh_args[arg_name] = arg_val

    if is_legacy_params(params):
        warn(
            "Detected that legacy params are being used. "
            "Automatically converting params to the new format."
        )
        params = convert_legacy_params_to_trainer_params(
            params,
            # Allow None objects inside the params
            obj_filter=lambda obj: obj is None,
        )

        # Convert ScopedValidateFlags to ScopedEleutherEvalHarnessFlags
        for callback in params["trainer"]["init"].get("callbacks", []):
            if "ScopedValidateFlags" in callback:
                callback["ScopedEleutherEvalHarnessFlags"] = callback.pop(
                    "ScopedValidateFlags"
                )

        # Add EleutherEvalHarness callback to the list of callbacks
        dataloader_args = (
            params["trainer"].get("validate_all", {}).pop("val_dataloaders", {})
        )
        dataloader_args["data_processor"] = "InferenceDataProcessor"
        params["trainer"]["init"]["callbacks"].append(
            {
                "EleutherEvalHarness": {
                    "eeh_args": eeh_args,
                    **deepcopy(other_eeh_args),
                    **deepcopy(dataloader_args),
                }
            }
        )

        # Remove fit/validate keys that are not used in the standalone flow
        for key in ("fit", "validate"):
            params["trainer"].pop(key, None)
    elif "runconfig" in params:
        params = inject_cli_args_to_trainer_params(
            params.pop("runconfig"), params
        )

    if "trainer" not in params:
        raise KeyError(
            "Trainer configuration not found in params. "
            "Please ensure that the params contain a 'trainer' key."
        )

    if isinstance(params["trainer"], (list, tuple)):
        raise ValueError(
            "The standalone Eleuther Eval Harness script only supports "
            "a single trainer instance, but found a list of trainers."
        )

    if "model_name" in params["trainer"]["init"]["model"]:
        model_name = params["trainer"]["init"]["model"].pop("model_name")
        if model_name not in SUPPORTED_MODELS:
            raise ValueError(
                f"Invalid model_name specified. Please choose a "
                f"valid model name from: {SUPPORTED_MODELS}"
            )
    else:
        raise RuntimeError(
            f"No model_name specified under config trainer.init.model. "
            f"Please choose a valid model name from: {SUPPORTED_MODELS}"
        )

    # Extract EleutherEvalHarness callbacks from the list of callbacks
    eeh_callbacks = [
        callback["EleutherEvalHarness"]
        for callback in params["trainer"]["init"].get("callbacks", [])
        if "EleutherEvalHarness" in callback
    ]
    if not eeh_callbacks:
        raise RuntimeError("Found no EleutherEvalHarness callback.")

    for eeh_callback in eeh_callbacks:
        # Override the callback's eeh_args with args specified on the CLI
        eeh_callback.setdefault("eeh_args", {}).update(deepcopy(eeh_args))
        # Set the data_processor key of the callback
        # for the dataloader config validation
        eeh_callback["data_processor"] = "InferenceDataProcessor"

    trainer = configure_trainer_from_params(params, model_name)

    validate_all = params["trainer"].get("validate_all") or {}
    if validate_all.get("val_dataloaders") is not None:
        logging.warning(
            "Found `validate_all.val_dataloaders` specified in the yaml, "
            "but no upstream validation will be run by the standalone "
            "Eleuther Eval Harness script."
        )
        validate_all["val_dataloaders"] = None

    trainer.validate_all(**validate_all)
if __name__ == "__main__": run_eval_harness()