# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Tuple
import torch
from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
BaseCheckpointConverter_HF_CS,
BaseConfigConverter,
BaseConfigConverter_HF_CS,
ConfigConversionError,
ConversionRule,
EquivalentSubkey,
FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.llama import (
Converter_LlamaAttention_HF_CS,
)
#########################################################
# Gemma2 HF <> CS2.3.1
#########################################################
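# Weight and config converters between Hugging Face's Gemma2ForCausalLM and
# the Cerebras GPT2LMHeadModel configured as Gemma2 (CS 2.3).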
class Converter_Gemma2ForCausalLM_HF_CS23(BaseCheckpointConverter_HF_CS):
def __init__(self):
super().__init__()
self.rules = [
ConversionRule(
[
EquivalentSubkey(
"model.embed_tokens", "embedding_layer.word_embeddings"
),
r"\.weight",
],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
r"\.\d+\.self_attn\.",
Converter_LlamaAttention_HF_CS(),
],
action=None,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
r"\.\d+\.",
EquivalentSubkey(
"mlp.gate_proj", "ffn.ffn.0.linear_layer_for_glu"
),
r"\.weight",
],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
r"\.\d+\.",
EquivalentSubkey("mlp.up_proj", "ffn.ffn.0.linear_layer"),
r"\.weight",
],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
r"\.\d+\.",
EquivalentSubkey("mlp.down_proj", "ffn.ffn.1.linear_layer"),
r"\.weight",
],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
r"\.\d+\.",
EquivalentSubkey("input_layernorm", "norm1"),
r"\.weight",
],
action=self.convert_layer_norm,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
"\.\d+\.",
EquivalentSubkey("post_attention_layernorm", "norm1_post"),
r"\.weight",
],
action=self.convert_layer_norm,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
"\.\d+\.",
EquivalentSubkey("pre_feedforward_layernorm", "norm3"),
r"\.weight",
],
action=self.convert_layer_norm,
),
ConversionRule(
[
EquivalentSubkey(
"model.layers", "transformer_decoder.layers"
),
"\.\d+\.",
EquivalentSubkey(
"post_feedforward_layernorm", "norm3_post"
),
r"\.weight",
],
action=self.convert_layer_norm,
),
ConversionRule(
[
EquivalentSubkey("model.norm", "transformer_decoder.norm"),
r"\.weight",
],
action=self.convert_layer_norm,
),
ConversionRule(
[
r"lm_head\.weight",
],
action=self.replaceKey,
),
ConversionRule([r"ln_f\.weight"]),
]
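        # Note: the bare "ln_f.weight" rule above has no action; when going
        # HF -> CS, convert_layer_norm below produces that duplicate copy
        # from "transformer_decoder.norm", and when going CS -> HF the key
        # is matched so it can simply be dropped.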
def convert_layer_norm(
self,
old_key,
new_key,
old_state_dict,
new_state_dict,
from_index,
action_fn_args,
):
weight = old_state_dict[old_key]
        # The HF Gemma2 implementation applies a constant offset of 1 to the
        # norm weight, unlike other implementations. See https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma2/modeling_gemma2.py#L70
        # for details. The constant offset can be folded/unfolded into the
        # weight itself so that no model implementation change is needed.
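        # For example, an HF norm weight of 0.0 (an effective scale of
        # 1 + 0 = 1) is stored as 1.0 on the CS side, and vice versa.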
if from_index == 0:
weight = weight + torch.ones_like(weight)
else:
weight = weight - torch.ones_like(weight)
new_state_dict[new_key] = weight
        # Since CS 1.7, our model implementations store the final layernorm
        # twice ("ln_f" and "transformer_decoder.norm"). As a result, we also
        # need to populate "ln_f".
if from_index == 0 and new_key.find("layers") == -1:
ln_f_key = re.sub(r"transformer_decoder\.norm\.", "ln_f.", new_key)
new_state_dict[ln_f_key] = weight
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-2.3"))
@classmethod
def converter_note(cls) -> str:
return (
"{} Gemma2ForCausalLM <-> {} GPT2LMHeadModel (configured as Gemma2)"
).format(cls.formats()[0], cls.formats()[1])
@staticmethod
def get_config_converter_class() -> BaseConfigConverter:
return ConfigConverter_Gemma2_HF_CS23
class ConfigConverter_Gemma2_HF_CS23(BaseConfigConverter_HF_CS):
def __init__(self):
super().__init__()
self.rules = [
ConversionRule(
["model_type"],
action=self.assert_factory_fn(0, "gemma2"),
),
# Parameters that are in both HF and CS:
ConversionRule(
[EquivalentSubkey("vocab_size", "vocab_size")],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("hidden_size", "hidden_size")],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("num_hidden_layers", "num_hidden_layers")],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("num_attention_heads", "num_heads")],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"num_key_value_heads", "extra_attention_params"
)
],
action=self.convert_gqa,
),
ConversionRule(
[
EquivalentSubkey(
"max_position_embeddings", "max_position_embeddings"
)
],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("rms_norm_eps", "layer_norm_epsilon")],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("rope_theta", "rope_theta")],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"final_logit_softcapping", "final_logit_softcapping"
)
],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"attn_logit_softcapping", "attention_logit_softcapping"
)
],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"sliding_window", "attention_sliding_window_length"
)
],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("hidden_activation", "nonlinearity")],
action=self.convert_nonlinearity,
),
ConversionRule(
[
EquivalentSubkey(
"tie_word_embeddings", "share_embedding_weights"
)
],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("intermediate_size", "filter_size")],
action=self.replaceKey,
),
ConversionRule(
[EquivalentSubkey("head_dim", "attention_inner_dim")],
action=self.convert_head_dim,
),
ConversionRule(
[
EquivalentSubkey(
"attention_dropout", "attention_dropout_rate"
)
],
action=self.replaceKey,
),
ConversionRule(
[
EquivalentSubkey(
"attention_bias", "use_projection_bias_in_attention"
)
],
exists="left",
action=self.convert_attention_bias,
),
# Parameters that are only in HF:
ConversionRule(
["eos_token_id"],
exists="left",
action=None,
),
ConversionRule(
["bos_token_id"],
exists="left",
action=None,
),
ConversionRule(
["cache_implementation"],
exists="left",
action=None,
),
ConversionRule(
["use_cache"],
exists="left",
action=None,
),
ConversionRule(
["pad_token_id"],
exists="left",
action=None,
),
ConversionRule(
["initializer_range"],
exists="left",
action=None,
),
# Parameters that are only in CS:
ConversionRule(
["position_embedding_type"],
exists="right",
action=self.assert_factory_fn(1, "rotary"),
),
ConversionRule(
["rotary_dim"],
exists="right",
action=self.assert_rotary_dim,
),
ConversionRule(
["dropout_rate"],
exists="right",
action=self.assert_factory_fn(1, 0.0),
),
ConversionRule(
["sliding_window_every_other_decoder_layer"],
exists="right",
action=self.assert_factory_fn(1, True),
),
ConversionRule(
["norm_first"],
exists="right",
action=self.assert_factory_fn(1, True),
),
ConversionRule(
["norm_first_sandwich"],
exists="right",
action=self.assert_factory_fn(1, True),
),
ConversionRule(
["use_bias_in_output"],
exists="right",
action=self.assert_factory_fn(1, False),
),
ConversionRule(
["embedding_dropout_rate"],
exists="right",
action=self.assert_factory_fn(1, [None, 0.0]),
),
ConversionRule(
["use_projection_bias_in_attention"],
exists="right",
action=self.assert_factory_fn(1, True), # Verify this!
),
ConversionRule(
["position_embedding_offset"],
exists="right",
action=self.assert_factory_fn(1, 0),
),
ConversionRule(
["use_ffn_bias"],
exists="right",
action=self.assert_factory_fn(1, False),
),
ConversionRule(
["embedding_layer_norm"],
exists="right",
action=self.assert_factory_fn(1, False),
),
ConversionRule(
["attention_type"],
exists="right",
action=self.assert_factory_fn(1, 'scaled_dot_product'),
),
ConversionRule(
["norm_type"],
exists="right",
action=self.assert_factory_fn(1, 'rmsnorm'),
),
ConversionRule(
["use_ff_layer1_dropout"],
exists="right",
action=self.assert_factory_fn(1, False), # Verify this!
),
]
# HF config class defaults:
self.pre_convert_defaults[0].update(
{
"vocab_size": 256000,
"hidden_size": 3072,
"intermediate_size": 24576,
"num_hidden_layers": 28,
"num_attention_heads": 16,
"num_key_value_heads": 16,
"head_dim": 256,
"hidden_activation": "gelu_pytorch_tanh",
"max_position_embeddings": 8192,
"initializer_range": 0.02,
"rms_norm_eps": 1e-06,
"use_cache": True,
"pad_token_id": 0,
"eos_token_id": 1,
"bos_token_id": 2,
"tie_word_embeddings": True,
"rope_theta": 10000.0,
"attention_bias": False,
"attention_dropout": 0.0,
"final_logit_softcapping": 30.0,
"attn_logit_softcapping": 50.0,
"query_pre_attn_scalar": 224,
"sliding_window": 4096,
}
)
self.post_convert_defaults[0].update({"model_type": "gemma2"})
# CS config class defaults:
self.pre_convert_defaults[1].update(
{
"embeddings_scale": 1.0,
"embedding_layer_norm": False,
"embedding_dropout_rate": None,
"share_embedding_weights": True,
"position_embedding_type": 'learned',
"max_position_embeddings": 1024,
"position_embedding_offset": 0,
"num_relative_attention_buckets": 32,
"rotary_dim": None,
"rope_theta": 10000,
"alibi_trainable_slopes": False,
"pos_scaling_factor": 1.0,
"hidden_size": 768,
"num_hidden_layers": 12,
"dropout_rate": 0.1,
"norm_type": 'layernorm',
"layer_norm_epsilon": 1e-5,
"norm_first": True,
"norm_first_sandwich": False,
"num_heads": 12,
"attention_module": 'aiayn_attention',
"extra_attention_params": {},
"attention_type": 'scaled_dot_product',
"attention_dropout_rate": None,
"use_projection_bias_in_attention": True,
"use_ffn_bias_in_attention": True,
"attention_sliding_window_length": None,
"sliding_window_every_other_decoder_layer": False,
"attention_sink_tokens": None,
"attention_qk_norm_layer": None,
"attention_qk_norm_eps": 1e-5,
"attention_inner_dim": None,
"scale_qk_dot_by_layer_idx": False,
"attention_logit_softcapping": None,
"filter_size": 3072,
"nonlinearity": 'gelu',
"use_ffn_bias": True,
"use_bias_in_output": False,
"use_ff_layer1_dropout": False,
"final_logit_softcapping": None,
"attention_logits_alpha": 1,
}
)
self.post_convert_defaults[1].update(
{
"norm_first": True,
"norm_first_sandwich": True,
"sliding_window_every_other_decoder_layer": True,
"position_embedding_type": "rotary",
"norm_type": "rmsnorm",
"use_ffn_bias": False,
"dropout_rate": 0.0,
}
)
@staticmethod
def formats() -> Tuple[FormatVersions, FormatVersions]:
return (FormatVersions("hf"), FormatVersions("cs-2.3"))
def convert_gqa(
self,
old_key,
new_key,
old_state_dict,
new_state_dict,
from_index,
action_fn_args,
):
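        """Convert between HF ``num_key_value_heads`` and CS GQA settings.

        For example, HF ``num_attention_heads=16`` with
        ``num_key_value_heads=8`` becomes CS
        ``extra_attention_params={"num_kv_groups": 8}`` and
        ``attention_module="multiquery_attention"``; equal head counts map
        to plain multi-head attention (``aiayn_attention``).
        """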
if from_index == 0:
# check mha or gqa
if old_state_dict[old_key] == old_state_dict["num_attention_heads"]:
new_state_dict["attention_module"] = "aiayn_attention"
else:
assert (
old_state_dict["num_attention_heads"]
% old_state_dict[old_key]
== 0
), (
f"number of attention heads should be divisible by num_key_value_heads but "
f"got {old_state_dict['num_attention_heads']} and {old_state_dict[old_key]},"
)
extra = {"num_kv_groups": old_state_dict[old_key]}
new_state_dict[new_key] = extra
new_state_dict["attention_module"] = "multiquery_attention"
elif from_index == 1:
if (
old_state_dict.get("attention_module", "aiayn_attention")
== "aiayn_attention"
):
assert (
old_key not in old_state_dict
or "num_kv_groups" not in old_state_dict[old_key]
), "Conflict between use of multi-query and multi-head attention"
new_state_dict[new_key] = old_state_dict["num_heads"]
elif old_state_dict["attention_module"] == "multiquery_attention":
num_heads = old_state_dict["num_heads"]
num_kv_groups = old_state_dict[old_key]["num_kv_groups"]
assert num_heads % num_kv_groups == 0, (
f"number of attention heads should be divisible by num_key_value_heads but "
f"got {num_heads} and {num_kv_groups}."
)
new_state_dict[new_key] = old_state_dict[old_key][
"num_kv_groups"
]
else:
assert False, (
f"attention_module {old_state_dict['attention_module']} is not supported for "
f"gemma2"
)
def assert_rotary_dim(
self,
old_key,
new_key,
old_state_dict,
new_state_dict,
from_index,
action_fn_args,
):
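        """Check that the CS ``rotary_dim`` spans the full head dimension.

        HF Gemma2 applies rotary embeddings to the whole head, so e.g.
        ``attention_inner_dim=4096`` with ``num_heads=16`` requires
        ``rotary_dim=256``.
        """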
assert from_index == 1, "{} should only exist in CS config".format(
old_key
)
if (
old_state_dict[old_key]
!= old_state_dict["attention_inner_dim"]
// old_state_dict["num_heads"]
):
raise ConfigConversionError(
"rotary_dim must be attention_inner_dim // num_heads in order to be compatible with HF"
)
def convert_head_dim(
self,
old_key,
new_key,
old_state_dict,
new_state_dict,
from_index,
action_fn_args,
):
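        """Convert between HF ``head_dim`` and CS ``attention_inner_dim``.

        CS stores the total attention inner dimension, so e.g. HF
        ``head_dim=256`` with ``num_attention_heads=16`` maps to
        ``attention_inner_dim=4096``; the reverse direction divides by
        ``num_heads``.
        """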
if from_index == 0:
# attention_inner_dim = head_dim * num_attention_heads
new_state_dict[new_key] = (
old_state_dict[old_key] * old_state_dict["num_attention_heads"]
)
else:
# head_dim = attention_inner_dim // num_attention_heads
attention_inner_dim = old_state_dict[old_key]
if attention_inner_dim is None:
attention_inner_dim = old_state_dict["hidden_size"]
if attention_inner_dim % old_state_dict["num_heads"] != 0:
raise ConfigConversionError(
"attention_inner_dim must be divisible by num_heads"
)
            new_state_dict[new_key] = (
                attention_inner_dim // old_state_dict["num_heads"]
            )
def convert_attention_bias(
self,
old_key,
new_key,
old_state_dict,
new_state_dict,
from_index,
action_fn_args,
):
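        """Fan the single HF ``attention_bias`` flag out to both CS flags.

        One HF flag governs all attention projection biases, so
        ``use_projection_bias_in_attention`` and
        ``use_ffn_bias_in_attention`` are both set from it and must agree
        when converting back to HF.
        """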
if from_index == 0:
# attention_bias -> use_projection_bias_in_attention, use_ffn_bias_in_attention
new_state_dict["use_projection_bias_in_attention"] = old_state_dict[
old_key
]
new_state_dict["use_ffn_bias_in_attention"] = old_state_dict[
old_key
]
else:
# use_projection_bias_in_attention, use_ffn_bias_in_attention -> attention_bias
assert (
old_state_dict["use_ffn_bias_in_attention"]
== old_state_dict["use_projection_bias_in_attention"]
)
new_state_dict[new_key] = old_state_dict[old_key]
def convert_nonlinearity(
self,
old_key,
new_key,
old_state_dict,
new_state_dict,
from_index,
action_fn_args,
):
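        """Translate HF activations to their gated (GLU) CS equivalents.

        Gemma2 uses a gated MLP, so e.g. HF ``gelu_pytorch_tanh`` maps to
        CS ``geglu``, and CS ``geglu`` maps back to plain ``gelu``.
        """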
activation = old_state_dict[old_key]
if from_index == 0:
if activation.startswith("gelu_"):
activation = "gelu"
            gated_hf2cs = {
                "silu": "swiglu",
                "relu": "reglu",
                "gelu": "geglu",
            }
if activation not in gated_hf2cs:
raise ConfigConversionError(
"{} is not a GLU-able activation in CS".format(activation)
)
activation = gated_hf2cs[activation]
elif from_index == 1:
gated_cs2hf = {"swiglu": "silu", "reglu": "relu", "geglu": "gelu"}
if activation not in gated_cs2hf:
raise ConfigConversionError(
"{} is not a supported GLU activation in HF".format(
activation
)
)
activation = gated_cs2hf[activation]
new_state_dict[new_key] = activation
def pre_config_convert(
self,
config,
converter_indices,
):
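        """Validate and normalize the source config before rule conversion.

        When converting CS -> HF, this checks that ``embeddings_scale``
        equals ``hidden_size ** 0.5`` (e.g. ``hidden_size=2304`` requires
        roughly 48.0), defaults ``attention_inner_dim`` to ``hidden_size``,
        requires ``rotary_dim``, and falls back to ``dropout_rate`` for a
        missing ``attention_dropout_rate``.
        """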
config = super().pre_config_convert(config, converter_indices)
if converter_indices.direction == 1:
exception = None
try:
torch.testing.assert_close(
config["embeddings_scale"],
config["hidden_size"] ** 0.5,
rtol=2.0e-7,
atol=1e-6,
)
except Exception as e:
exception = e
# Reraise the exception as a config conversion error
if exception:
raise ConfigConversionError(
"embeddings_scale must be equal to hidden_size**0.5\n"
+ str(exception)
)
if converter_indices.direction == 1 and (
"attention_inner_dim" not in config
or config["attention_inner_dim"] is None
):
config["attention_inner_dim"] = config["hidden_size"]
if converter_indices.direction == 1 and (
"rotary_dim" not in config or config["rotary_dim"] is None
):
raise ConfigConversionError("rotary_dim must be specified")
if (
converter_indices.direction == 1
and config["attention_dropout_rate"] is None
):
config["attention_dropout_rate"] = config["dropout_rate"]
return config
def post_config_convert(
self,
original_config,
old_config,
new_config,
converter_indices,
drop_unmatched_keys,
):
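        """Fill in derived fields after the rule-based conversion.

        ``rotary_dim``, ``embeddings_scale``, and the attention logit
        scaling (``attention_logits_alpha`` vs. ``query_pre_attn_scalar``)
        each depend on several source keys, so they are computed here.
        """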
if converter_indices.direction == 0:
new_config["rotary_dim"] = old_config["head_dim"]
new_config["embeddings_scale"] = new_config["hidden_size"] ** 0.5
attention_head_size = (
new_config["attention_inner_dim"] / new_config["num_heads"]
)
if attention_head_size != old_config["query_pre_attn_scalar"]:
new_config["attention_logits_alpha"] = (
attention_head_size / old_config["query_pre_attn_scalar"]
) ** 0.5
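                # e.g. with the HF class defaults above (head_dim=256,
                # query_pre_attn_scalar=224), attention_logits_alpha is
                # (256 / 224) ** 0.5, roughly 1.07.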
else:
attention_head_size = (
old_config["attention_inner_dim"] / old_config["num_heads"]
)
new_config["query_pre_attn_scalar"] = (
attention_head_size / old_config["attention_logits_alpha"] ** 2
)
return super().post_config_convert(
original_config,
old_config,
new_config,
converter_indices,
drop_unmatched_keys,
)