cerebras.modelzoo.config_manager.config_classes.base.run_config.RunConfig#

class cerebras.modelzoo.config_manager.config_classes.base.run_config.RunConfig(steps_per_epoch: Optional[int] = None, max_steps: Optional[int] = None, mgmt_address: Optional[str] = None, mount_dirs: List[str] = <factory>, num_epochs: Optional[int] = None, python_paths: List[str] = <factory>, compile_dir: Optional[str] = None, checkpoint_path: Optional[str] = None, credentials_path: Optional[str] = None, debug_args_path: Optional[str] = None, retrace_every_iteration: bool = False, eval_steps: Optional[int] = None, init_method: str = 'env://', job_time_sec: Optional[int] = None, job_labels: List[str] = <factory>, job_priority: str = 'p2', seed: Optional[int] = None, mgmt_namespace: Optional[str] = None, load_checkpoint_states: str = 'all', target_device: Literal['CPU', 'GPU', 'CSX'] = 'CSX', mode: Literal['train', 'eval', 'eval_all', 'train_and_eval', 'inference'] = 'train', wsc_log_level: Optional[dict] = None, autoload_last_checkpoint: bool = True, check_loss_values: bool = True, disable_strict_checkpoint_loading: bool = False, dist_addr: str = 'localhost:8888', dist_backend: str = 'nccl', checkpoint_steps: Optional[int] = None, disable_version_check: bool = False, enable_distributed: bool = False, model_dir: str = './model_dir', save_initial_checkpoint: bool = False, precision_opt_level: int = 1, num_workers_per_csx: int = 1, validate_only: bool = False, logging: Union[str, int] = 'INFO', sync_batchnorm: bool = False, compile_only: bool = False, log_steps: int = 1, num_steps: Optional[int] = None, transfer_processes: int = 5, num_wgt_servers: int = 24, num_csx: int = 1, num_act_servers: int = 60, eval_frequency: Optional[int] = None, execute_crd_memory_gi: Optional[int] = None, compile_crd_memory_gi: Optional[int] = None, op_profiler_config: Optional[cerebras.modelzoo.config_manager.config_classes.base.run_config.CSTorchProfilerConfig] = None, dump_activations: bool = False, log_input_summaries: bool = False, main_process_id: int = 0, max_checkpoints: Optional[int] = None, summary_dir: Optional[str] = None, lazy_initialization: bool = True, wrk_memory_gi: Optional[int] = None, act_memory_gi: Optional[int] = None, cmd_memory_gi: Optional[int] = None, wgt_memory_gi: Optional[int] = None, experimental: dict = <factory>, ini: Optional[Dict[str, Union[bool, int, float, str]]] = None, debug_args: Optional[Dict[str, Union[bool, int, float, str]]] = None, legacy_event_dirs: bool = False)[source]#
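
The fields below can be passed directly as keyword arguments when constructing the dataclass. A minimal sketch, assuming the config is built in Python rather than loaded from a params file (values are illustrative only):

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

# Minimal run configuration: train on CSX for a fixed number of steps.
run_config = RunConfig(
    mode="train",
    target_device="CSX",
    num_steps=1000,           # stop after 1000 training steps
    model_dir="./model_dir",  # checkpoints and metadata are written here
)
```
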
steps_per_epoch: Optional[int] = None#

The number of steps per epoch.

max_steps: Optional[int] = None#

Specifies the maximum number of steps for training. max_steps is optional unless neither num_epochs nor num_steps is provided.

mgmt_address: Optional[str] = None#

The address of the management service used for coordinating the training job, in the form <host>:<port>.

mount_dirs: List[str]#

A list of paths to be mounted to the appliance containers. It should generally contain the paths to the directory containing the Cerebras Model Zoo and to the data directory.

num_epochs: Optional[int] = None#

The number of epochs to train for.

python_paths: List[str]#

A list of paths to be exported into PYTHONPATH for worker containers. It should generally contain the path to the directory containing the Cerebras Model Zoo.
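
A sketch combining mount_dirs and python_paths; the paths are hypothetical placeholders for a real Model Zoo checkout and data directory:

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    # Mounted into the appliance containers.
    mount_dirs=["/path/to/modelzoo_parent", "/path/to/data"],
    # Exported into PYTHONPATH in the worker containers.
    python_paths=["/path/to/modelzoo_parent"],
)
```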

compile_dir: Optional[str] = None#

The directory where compile artifacts will be written.

checkpoint_path: Optional[str] = None#

The path to load checkpoints from during training.

credentials_path: Optional[str] = None#

Credentials for cluster access. If None, the value from a pre-configured location will be used if available.

debug_args_path: Optional[str] = None#

Path to the debug args file.

retrace_every_iteration: bool = False#
eval_steps: Optional[int] = None#

Specifies the number of steps to run the model evaluation.

init_method: str = 'env://'#
job_time_sec: Optional[int] = None#
job_labels: List[str]#

A list of equal-sign-separated key-value pairs used as job labels.
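
For example (hypothetical label keys and values):

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    job_labels=["team=nlp", "experiment=baseline"],  # key=value pairs
    job_priority="p2",
)
```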

job_priority: str = 'p2'#

Priority of the job in the scheduling queue.

seed: Optional[int] = None#

The seed to use for random number generation for reproducibility.

mgmt_namespace: Optional[str] = None#
load_checkpoint_states: str = 'all'#

Comma-separated string of keys used in conjunction with checkpoint_path to explicitly specify which components' state should be loaded if present in a checkpoint. If this flag is used, any component whose key isn't specified will not load state from the checkpoint. For example, if load_checkpoint_states is 'model', only the model state is loaded, and optimizer states and training steps are reset after loading the checkpoint; i.e., matching weights are initialized from the checkpoint provided by checkpoint_path, training starts from step 0, and optimizer states present in the checkpoint are ignored. This is useful for fine-tuning runs on different tasks (e.g., classification, Q&A, etc.) where weights from a pre-trained model trained on language modeling (LM) tasks are loaded, or for fine-tuning on a different dataset for the same LM task. Any dataloader state in the checkpoint is also ignored; in this case, the dataloaders will yield samples from the beginning. However, if load_checkpoint_states is 'model,dataloader', then only the model and dataloader states will be loaded. Defaults to 'all', meaning that state is loaded for every component found in the checkpoint.
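
A fine-tuning sketch based on the description above; the checkpoint path is a hypothetical placeholder:

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    checkpoint_path="/path/to/pretrained_checkpoint.mdl",
    # Load only the model weights; optimizer state and the step counter
    # are reset, and any dataloader state in the checkpoint is ignored.
    load_checkpoint_states="model",
)
```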

target_device: Literal['CPU', 'GPU', 'CSX'] = 'CSX'#

The target device to run the training on. One of 'CPU', 'GPU', or 'CSX'. Required on the command line.

mode: Literal['train', 'eval', 'eval_all', 'train_and_eval', 'inference'] = 'train'#

The mode of the training job: one of 'train', 'eval', 'eval_all', 'train_and_eval', or 'inference'.

wsc_log_level: Optional[dict] = None#

Specifies the logging level for particular Wafer-Scale Cluster servers or tasks. Input can be either a single value setting a global log level (e.g. --wsc_log_level DEBUG) or a list of equal-sign-separated key-value pairs in the format <task or server>=<log level>. A task and server can be combined to specify a server only during a specific task (e.g. <execute>.<crd>). The log level can be either an int or a string (e.g. INFO, DEBUG, VERBOSE, 20, 10). See https://docs.python.org/3/library/logging.html#logging-levels for more on log levels.
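
A sketch of a per-task/server log-level dict; the exact keys are assumptions following the <task or server> format described above:

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    wsc_log_level={
        "execute.crd": "DEBUG",  # coordinator, only during the execute task
        "wrk": "INFO",           # assumed key for the worker servers
    },
)
```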

autoload_last_checkpoint: bool = True#

Flag to automatically load the last checkpoint in the model_dir.

check_loss_values: bool = True#

Flag to check whether loss values are NaN/inf. Defaults to True

disable_strict_checkpoint_loading: bool = False#

Flag used in conjunction with checkpoint_path to avoid enforcing strict model state loading. Defaults to False

dist_addr: str = 'localhost:8888'#

Address used to initialize master_addr and master_port for distributed training. Defaults to 'localhost:8888'

dist_backend: str = 'nccl'#

Distributed backend engine. Defaults to ‘nccl’
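
A multi-GPU sketch combining the distributed settings (enable_distributed and sync_batchnorm are documented further below):

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    target_device="GPU",
    enable_distributed=True,     # turn on distributed training on GPU
    dist_backend="nccl",
    dist_addr="localhost:8888",  # master_addr:master_port
    sync_batchnorm=True,
)
```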

checkpoint_steps: Optional[int] = None#

The number of steps between saving model checkpoints during training. A value of 0 means no checkpoints are saved. Defaults to None
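
A checkpointing sketch using this field together with the related flags documented elsewhere in this class:

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    checkpoint_steps=500,           # save every 500 steps
    max_checkpoints=5,              # keep at most 5 checkpoints
    save_initial_checkpoint=True,   # also save a step-0 checkpoint
    autoload_last_checkpoint=True,  # resume from the latest one in model_dir
)
```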

disable_version_check: bool = False#
model_dir: str = './model_dir'#

The directory where the model checkpoints and other metadata will be saved during training. Defaults to ‘./model_dir’

save_initial_checkpoint: bool = False#

Whether to save an initial checkpoint before training starts. Defaults to False

precision_opt_level: int = 1#

Setting to control the level of numerical precision used for training runs of large NLP models. See https://docs.cerebras.net/en/latest/general/performance-optimization.html#precision-optimization-level for more details. Defaults to 1

num_workers_per_csx: int = 1#

Number of input workers, per CSX, to use for streaming samples. This setting depends on whether the model is compute-bound or input-bound and how efficient the dataloader implementation is. For compute-bound models (e.g., LLMs), even 1 input worker per CSX is enough to saturate the input buffers on CSX systems, but for smaller models a larger number may be used. Defaults to 1 worker per CSX.

validate_only: bool = False#

Enables the validate-only workflow, which stops compilation at the kernel matching stage. Defaults to False

logging: Union[str, int] = 'INFO'#

Specifies the logging level during training. Defaults to 'INFO'

sync_batchnorm: bool = False#

Whether to use synchronized batch normalization in a multi-GPU setup. Defaults to False

compile_only: bool = False#

Enables the compile-only workflow. Defaults to False
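
Both pre-flight workflows as sketches (use one at a time; the compile_dir path is hypothetical):

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

# Stop at the kernel matching stage.
validate_cfg = RunConfig(validate_only=True)

# Run the full compile without executing on hardware.
compile_cfg = RunConfig(compile_only=True, compile_dir="/path/to/compile_dir")
```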

log_steps: int = 1#

Specifies the number of steps between logging during training. The same number controls the summary steps in TensorBoard.

num_steps: Optional[int] = None#

The number of steps to train for.

transfer_processes: int = 5#

Number of transfer processes used for weight transfer. Defaults to 5

num_wgt_servers: int = 24#

Upper bound on the number of MemoryX servers used for storing the model weights. Compilation may choose a smaller number depending on the model topology. A sensible upper bound (currently 24) is selected if a value is not provided.

num_csx: int = 1#

The number of CSX systems to use in the Cerebras WSE cluster. Defaults to 1

num_act_servers: int = 60#

Number of activation servers per CS-X dedicated to streaming samples to the WSE. Input workers stream data to these activation servers, and the activation servers hold the data and further stream it to the WSE. For LLMs, we generally choose 1 because they're compute-bound. For CV models, we choose a higher number; a crude rule of thumb is one activation server for every 4 workers (i.e., num_workers_per_csx // 4 if num_workers_per_csx > 4, else 1). It is suggested to keep the default value for this param when possible. Defaults to 60.
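
A cluster-sizing sketch for a two-system run; the counts are illustrative, and keeping the defaults is recommended where possible:

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    num_csx=2,
    num_workers_per_csx=1,  # 1 is usually enough for compute-bound LLMs
    num_act_servers=60,
    num_wgt_servers=24,     # upper bound; compilation may choose fewer
    transfer_processes=5,
)
```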

eval_frequency: Optional[int] = None#

Specifies the evaluation frequency during training. Only used in 'train_and_eval' mode.
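
A train_and_eval sketch: evaluate for 100 steps after every 1000 training steps (values illustrative):

```python
from cerebras.modelzoo.config_manager.config_classes.base.run_config import RunConfig

run_config = RunConfig(
    mode="train_and_eval",
    max_steps=10000,
    eval_frequency=1000,  # run eval every 1000 training steps
    eval_steps=100,       # number of eval steps per evaluation
)
```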

execute_crd_memory_gi: Optional[int] = None#

Optional parameter to specify the memory, in Gi, used for execution. Defaults to None.

compile_crd_memory_gi: Optional[int] = None#

Optional parameter to specify the memory, in Gi, used for compile. Defaults to None.

op_profiler_config: Optional[cerebras.modelzoo.config_manager.config_classes.base.run_config.CSTorchProfilerConfig] = None#
dump_activations: bool = False#
enable_distributed: bool = False#

Flag to enable distributed training on GPU. Defaults to False

log_input_summaries: bool = False#
main_process_id: int = 0#
max_checkpoints: Optional[int] = None#
summary_dir: Optional[str] = None#
lazy_initialization: bool = True#
wrk_memory_gi: Optional[int] = None#
act_memory_gi: Optional[int] = None#
cmd_memory_gi: Optional[int] = None#
wgt_memory_gi: Optional[int] = None#
experimental: dict#
ini: Optional[Dict[str, Union[bool, int, float, str]]] = None#

Internal debug flags for Wafer Scale Cluster compiler and runtime.

debug_args: Optional[Dict[str, Union[bool, int, float, str]]] = None#

Internal debug flags for Wafer Scale Cluster compiler and runtime.

legacy_event_dirs: bool = False#