# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Config classes of Model Configs
"""
from dataclasses import dataclass
from typing import List, Literal, Optional, Union
from cerebras.modelzoo.config_manager.config_classes.base.base_config import (
    BaseConfig,
)


@dataclass
class InitializerConfig(BaseConfig):
    name: Literal[
        "constant",
        "ones",
        "zeros",
        "eye",
        "uniform",
        "normal",
        "xavier_normal",
        "glorot_normal",  # alias for `xavier_normal`
        "xavier_uniform",
        "glorot_uniform",  # alias for `xavier_uniform`
        "truncated_normal",
        "variance_scaling",
        "lecun_normal",
        "lecun_uniform",
        "kaiming_normal",
        "kaiming_uniform",
    ]
    "Name of the initializer to apply"
    mean: Optional[float] = None
    "Mean of the distribution (normal-family initializers)"
    std: Optional[float] = None
    "Standard deviation of the distribution (normal-family initializers)"
    a: Optional[float] = None
    "Lower bound of the sampling range (uniform/truncated initializers)"
    b: Optional[float] = None
    "Upper bound of the sampling range (uniform/truncated initializers)"
    nonlinearity: Optional[
        Literal[
            "linear",
            "conv1d",
            "conv2d",
            "conv3d",
            "conv_transpose1d",
            "conv_transpose2d",
            "conv_transpose3d",
            "sigmoid",
            "tanh",
            "relu",
            "leaky_relu",
        ]
    ] = None
    "Nonlinearity used to compute the gain (kaiming initializers)"
    mode: Optional[str] = None
    "Either `fan_in` or `fan_out` (kaiming and variance-scaling initializers)"
    scale: Optional[float] = None
    "Scale factor (variance-scaling initializer)"
    distribution: Optional[str] = None
    "Sampling distribution (variance-scaling initializer)"
    gain: Optional[float] = None
    "Scaling gain (xavier/glorot initializers)"
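
# Example usage (illustrative sketch, not executed at import time): a
# truncated-normal initializer config. The field values below are
# assumptions chosen for demonstration only.
#
#     init_cfg = InitializerConfig(
#         name="truncated_normal",
#         mean=0.0,
#         std=0.02,
#         a=-0.04,
#         b=0.04,
#     )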


@dataclass
class NormKWArgsConfig(BaseConfig):
    """Keyword arguments passed through to the normalization layer."""


@dataclass
class LoraConfig:
    r: int = 0
    "Rank of the LoRA matrix projections"
    alpha: int = 1
    "Scaling factor (see the LoRA paper for additional details)"
    dropout: float = 0.0
    "Dropout to apply to the LoRA updates"
    fan_in_fan_out: bool = False
    "Set to True if the layer being adapted stores its weight as (fan_in, fan_out)"
    merge_weights: bool = True
    """Determines whether the LoRA weights should be merged/folded
    into the underlying layers"""
    target_modules: Optional[list] = None
    """A list of module names that must all exist in layers
    that will be converted to LoRA. For example, setting target_modules
    to ["TransformerDecoderLayer", "Linear"] would mean that all linear
    layers that were children of a TransformerDecoderLayer would be
    converted to LoRA."""
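
# Example usage (illustrative sketch, not executed at import time): apply
# rank-8 LoRA adapters to linear layers inside decoder blocks, following
# the target_modules example documented above. The rank, alpha, and dropout
# values are assumptions for demonstration only.
#
#     lora_cfg = LoraConfig(
#         r=8,
#         alpha=16,
#         dropout=0.05,
#         target_modules=["TransformerDecoderLayer", "Linear"],
#     )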


@dataclass
class CompressionConfig(BaseConfig):
    format: Literal["mx8-e4m3", "mx8-e3m4"]
    "MX8 compression format to use"
    param_filter: Union[str, List[str]]
    """A glob expression, or list of glob expressions, matched against
    parameter names to select the parameters that are compressed with
    the given format"""
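
# Example usage (illustrative sketch, not executed at import time): compress
# a subset of weights with the mx8-e4m3 format. The glob pattern below is an
# assumption; actual parameter names depend on the model definition.
#
#     compression_cfg = CompressionConfig(
#         format="mx8-e4m3",
#         param_filter="*.ffn.*.weight",
#     )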


@dataclass
class SelectiveGradConfig(BaseConfig):
    param_filter: Optional[Union[str, List[str]]] = None
    """A glob expression, or list of glob expressions, matched against
    parameter names to select the parameters that have the selective
    gradient mask applied"""
    init_method: str = "outlier"
    "Initialization method that produces the mask to apply"


@dataclass
class ModelConfig(BaseConfig):
    mixed_precision: bool = False
    "Enable to run the model in mixed-precision mode"
    fp16_type: Optional[Literal["bfloat16", "float16", "cbfloat16"]] = None
    "Type of 16-bit floating-point precision used"
    boundary_casting: Optional[bool] = False
    lora_params: Optional[Union[LoraConfig, List[LoraConfig]]] = None
    "LoRA configuration as a single dictionary or a list of dictionaries"
    compression: Optional[Union[CompressionConfig, List[CompressionConfig]]] = (
        None
    )
    "Weight compression configuration as a single dictionary or a list of dictionaries"
    selective_grad: Optional[
        Union[SelectiveGradConfig, List[SelectiveGradConfig]]
    ] = None
    "Selective gradient configuration as a single dictionary or a list of dictionaries"