# Source code for cerebras.modelzoo.tools.checkpoint_converters.mpt

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import math
import re
from typing import Tuple

import torch

from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    BaseConfigConverter_HF_CS,
    ConfigConversionError,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.gpt2_hf_cs import (
    Converter_GPT2LMHeadModel_CS20_CS21,
)
from cerebras.modelzoo.tools.checkpoint_converters.helper import (
    Build_HF_CS_Converter_WithOptionalModel,
)


class Converter_MPTAttention_HF_CS(BaseCheckpointConverter_HF_CS):
    """Convert MPT attention weights between the HF and CS layouts.

    HF keeps Q, K and V packed in a single ``Wqkv`` tensor, while CS stores
    them as separate ``proj_{q,k,v}_dense_layer`` tensors. The output
    projection only needs a key rename.
    """

    def __init__(self):
        super().__init__()
        self.rules = [
            # The Q rule does the real work: it unpacks (HF -> CS) or packs
            # (CS -> HF) all three projections at once.
            ConversionRule(
                [
                    EquivalentSubkey("Wqkv", "proj_q_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.convert_qkv,
            ),
            # The K and V rules only verify that the Q rule already ran.
            ConversionRule(
                [
                    EquivalentSubkey("Wqkv", "proj_k_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.assert_already_converted,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("Wqkv", "proj_v_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.assert_already_converted,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("out_proj", "proj_output_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: this sub-converter has no config converter."""
        return None

    def convert_qkv(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Dispatch to the unpack (HF -> CS) or pack (CS -> HF) routine.

        ``from_index == 0`` means the source checkpoint is in HF format.
        """
        if from_index == 0:
            self.convert_qkv_hf_to_cs(
                old_key, new_key, old_state_dict, new_state_dict, action_fn_args
            )
        else:
            self.convert_qkv_cs_to_hf(
                old_key, new_key, old_state_dict, new_state_dict, action_fn_args
            )

    def convert_qkv_hf_to_cs(
        self, old_key, new_key, old_state_dict, new_state_dict, action_fn_args
    ):
        """Split the packed HF ``Wqkv`` tensor into separate Q, K, V entries.

        ``new_key`` is the Q key; the K and V keys are derived from it. The
        packed tensor is split into three chunks along dim 0.
        """
        # HF represents Q, K, and V in a packed format. We need to unpack the
        # weight and bias tensor for CS 1.7 format.
        q_key = new_key
        k_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_k_dense_layer.", q_key)
        v_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_v_dense_layer.", q_key)

        (
            new_state_dict[q_key],
            new_state_dict[k_key],
            new_state_dict[v_key],
        ) = torch.chunk(old_state_dict[old_key], 3, dim=0)

    def convert_qkv_cs_to_hf(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        action_fn_args,
    ):
        """Concatenate the CS Q, K, V tensors into the packed HF tensor.

        ``old_key`` is the Q key; the K and V keys are derived from it.

        Raises:
            AssertionError: if the K or V key is missing from
                ``old_state_dict``.
        """
        # HF represents Q, K, and V in a packed format, so the three CS
        # tensors are concatenated (in Q, K, V order) along dim 0.
        q_key = old_key
        k_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_k_dense_layer.", q_key)
        v_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_v_dense_layer.", q_key)

        # Raise explicitly rather than via `assert` so the check still runs
        # when Python is started with -O.
        for required_key in (k_key, v_key):
            if required_key not in old_state_dict:
                raise AssertionError(
                    "Expected the following key to exist! {}".format(
                        required_key
                    )
                )

        new_state_dict[new_key] = torch.cat(
            (
                old_state_dict[q_key],
                old_state_dict[k_key],
                old_state_dict[v_key],
            ),
            dim=0,
        )

    def assert_already_converted(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Verify that a K/V key was already handled by the Q rule.

        Raises:
            AssertionError: always when converting from HF (the key should
                have matched the Q rule), or when converting from CS and the
                packed key is not yet in ``new_state_dict``.
        """
        if from_index == 0:
            # We should never hit this case as this key should have been matched
            # already
            raise AssertionError("Invalid key: {}".format(old_key))

        # When we convert from CS -> HF, the proj_q_dense_layer should also handle
        # conversion of proj_k_dense_layer and proj_v_dense_layer since HF
        # represents these three layers in a packed format. We simply need
        # to test that the key containing the packed format has already
        # been converted.
        if new_key not in new_state_dict:
            raise AssertionError(
                "Key should've been already converted: {} -> {}".format(
                    old_key, new_key
                )
            )
class Converter_MPTModel_HF_CS(BaseCheckpointConverter_HF_CS):
    """Convert between the headless HF MPTModel and the CS model weights.

    Besides renaming keys, converting HF -> CS also fills in the CS-only
    ``lm_head`` and ``ln_f`` entries (copied from the word embeddings and the
    final norm) and, for ALiBi models, the slopes buffer.
    """

    def __init__(self):
        super().__init__()
        # Key (relative to the model prefix) of the CS ALiBi slopes buffer;
        # used in post_model_convert.
        self.cs_slopes_key = "relative_pe_helper.slopes"
        self.rules = [
            # word embeddings
            ConversionRule(
                [
                    EquivalentSubkey("wte", "embedding_layer.word_embeddings"),
                    r"\.(?:weight|bias)",
                ],
                action=self.convert_word_embeddings,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "wpe", "embedding_layer.position_embeddings"
                    ),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            # final layer norm
            ConversionRule(
                [
                    EquivalentSubkey("norm_f", "transformer_decoder.norm"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replace_final_norm,
            ),
            # attention
            ConversionRule(
                [
                    EquivalentSubkey("blocks", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("attn", "self_attn"),
                    r"\.",
                    Converter_MPTAttention_HF_CS(),
                ],
                action=None,
            ),
            # attention norm
            ConversionRule(
                [
                    EquivalentSubkey("blocks", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("norm_1", "norm1"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("blocks", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("norm_2", "norm3"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            # intermediate ffn
            ConversionRule(
                [
                    EquivalentSubkey("blocks", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("ffn.up_proj", "ffn.ffn.0.linear_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("blocks", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("ffn.down_proj", "ffn.ffn.1.linear_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            # Keys that only exist on the CS side.
            ConversionRule([r"lm_head\.(?:weight|bias)"], exists="right"),
            ConversionRule([r"ln_f\.(?:weight|bias)"], exists="right"),
            ConversionRule([r"relative_pe_helper\.slopes"], exists="right"),
        ]

    def convert_word_embeddings(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the word embeddings and, for HF -> CS, also seed ``lm_head``."""
        new_state_dict[new_key] = old_state_dict[old_key]
        if from_index == 0:
            # The HF model is headless, so the CS lm_head is initialized from
            # the word embeddings.
            lm_head_key = re.sub(
                r"embedding_layer\.word_embeddings", "lm_head", new_key
            )
            new_state_dict[lm_head_key] = old_state_dict[old_key]

    def replace_final_norm(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the final norm and, for HF -> CS, duplicate it under ``ln_f``."""
        new_state_dict[new_key] = old_state_dict[old_key]
        # CS 1.7 has both "ln_f" and "transformer_decoder.norm"
        # we need to copy the original ("ln_f") too:
        if from_index == 0:
            ln_f_key = re.sub(r"transformer_decoder\.norm\.", "ln_f.", new_key)
            new_state_dict[ln_f_key] = old_state_dict[old_key]

    @staticmethod
    def get_alibi_slopes(n):
        """Return the ALiBi slopes for ``n`` heads as a tensor of shape (n, 1).

        Args:
            n: number of attention heads; must be positive (``math.log2``
                raises ``ValueError`` otherwise).
        """

        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio**i for i in range(n)]

        # In the paper, we only train models that have 2^a heads for some a. This function has
        # some good properties that only occur when the input is a power of 2. To maintain that even
        # when the number of heads is not a power of 2, we use this workaround.
        if math.log2(n).is_integer():
            slopes_list = get_slopes_power_of_2(n)
        else:
            closest_power_of_2 = 2 ** math.floor(math.log2(n))
            # Take every other slope of the next power of two to cover the
            # remaining heads. This must use the list-returning helper: a
            # recursive call to get_alibi_slopes returns an (m, 1) tensor,
            # which cannot be concatenated onto the Python list below.
            extra_slopes = get_slopes_power_of_2(2 * closest_power_of_2)[0::2][
                : n - closest_power_of_2
            ]
            slopes_list = (
                get_slopes_power_of_2(closest_power_of_2) + extra_slopes
            )
        return torch.tensor(slopes_list).unsqueeze(-1)

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Finish an HF -> CS conversion, then defer to the base class.

        For HF -> CS this warns about the synthesized lm_head and, when the
        CS config uses ALiBi position embeddings, writes the slopes buffer
        under ``key_prefix + self.cs_slopes_key``.
        """
        if converter_indices.direction == 0:
            # We are converting from HF MPTModel (which is headless) ->
            # CS GPT2LMHeadModel configured as MPT (which has a head)
            #
            # convert_word_embeddings action_fn already initialized lm_head
            # we just need to warn the user.
            logging.warning(
                "{} has a language model head (lm_head) "
                "while {} does not. Initializing to same as word embeddings "
                "(tied embeddings)".format(self.formats()[1], self.formats()[0])
            )

            # Need to initialize alibi slopes:
            cs_config = configs[1]
            if cs_config["model"]["position_embedding_type"] == "alibi":
                new_state_dict[key_prefix + self.cs_slopes_key] = (
                    Converter_MPTModel_HF_CS.get_alibi_slopes(
                        cs_config["model"]["num_heads"]
                    )
                )

        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: this converter has no config converter."""
        return None
class Converter_MPTModel_HF_CS20(Converter_MPTModel_HF_CS):
    """HF MPTModel <-> CS 2.0 checkpoint converter.

    Wraps ``Converter_MPTModel_HF_CS`` so that CS keys are accepted both with
    and without a leading ``model.`` prefix.
    """

    def __init__(self):
        super().__init__()
        self.rules = [
            # Catch checkpoints from Pytorch 2.0 API
            ConversionRule(
                [
                    Converter_MPTModel_HF_CS(),
                ],
                action=None,
            ),
            # Catch checkpoints from deprecated PyTorchBaseModel
            ConversionRule(
                [
                    EquivalentSubkey("", "model."),
                    Converter_MPTModel_HF_CS(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))

    @classmethod
    def converter_note(cls) -> str:
        """Return the user-facing description of this converter."""
        # The lm_head is copied from the word embeddings by
        # Converter_MPTModel_HF_CS.convert_word_embeddings, so the note must
        # not claim it is randomly initialized.
        return (
            "{} MPTModel <-> {} GPT2LMHeadModel (configured as MPT)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to the word embedding "
            "weights (tied embeddings). When converting to HF, the language "
            "model head will be dropped."
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this converter."""
        return ConfigConverter_MPT_HF_CS20
class Converter_MPTForCausalLM_HF_CS(BaseCheckpointConverter_HF_CS):
    """Convert between HF MPTForCausalLM and the CS model weights.

    The HF keys carry a ``transformer.`` prefix in front of the MPTModel keys;
    everything after that prefix is delegated to ``Converter_MPTModel_HF_CS``.
    """

    def __init__(self):
        super().__init__()
        # Location of the CS ALiBi slopes buffer, read by post_model_convert.
        self.cs_slopes_key = "relative_pe_helper.slopes"
        strip_transformer_prefix = ConversionRule(
            [
                EquivalentSubkey("transformer.", ""),
                Converter_MPTModel_HF_CS(),
            ],
            action=None,
        )
        self.rules = [strip_transformer_prefix]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs")
        return (hf_format, cs_format)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: this converter has no config converter."""
        return None

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Add the ALiBi slopes on HF -> CS, then defer to the base class."""
        converting_hf_to_cs = converter_indices.direction == 0
        if converting_hf_to_cs:
            # The slopes buffer exists only on the CS side, so it has to be
            # generated whenever the CS config asks for ALiBi.
            cs_model_config = configs[1]["model"]
            if cs_model_config["position_embedding_type"] == "alibi":
                slopes = Converter_MPTModel_HF_CS.get_alibi_slopes(
                    cs_model_config["num_heads"]
                )
                new_state_dict[key_prefix + self.cs_slopes_key] = slopes

        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )
class Converter_MPTForCausalLM_HF_CS20(Converter_MPTForCausalLM_HF_CS):
    """HF MPTForCausalLM <-> CS 2.0 checkpoint converter.

    Wraps ``Converter_MPTForCausalLM_HF_CS`` so that CS keys are accepted both
    with and without a leading ``model.`` prefix.
    """

    def __init__(self):
        super().__init__()
        # Catch checkpoints from Pytorch 2.0 API
        unprefixed_rule = ConversionRule(
            [Converter_MPTForCausalLM_HF_CS()],
            action=None,
        )
        # Catch checkpoints from deprecated PyTorchBaseModel
        model_prefixed_rule = ConversionRule(
            [
                EquivalentSubkey("", "model."),
                Converter_MPTForCausalLM_HF_CS(),
            ],
            action=None,
        )
        self.rules = [unprefixed_rule, model_prefixed_rule]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-2.0")
        return (hf_format, cs_format)

    @classmethod
    def converter_note(cls) -> str:
        """Return the user-facing description of this converter."""
        note_template = (
            "{} MPTForCausalLM <-> {} GPT2LMHeadModel (configured as MPT)"
        )
        return note_template.format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this converter."""
        return ConfigConverter_MPT_HF_CS20
class ConfigConverter_MPT_HF_CS20(BaseConfigConverter_HF_CS):
    """Convert model configs between HF MPT and CS 2.0.

    Most keys are plain renames. The HF ``attn_config`` dict, ``no_bias``,
    ``expansion_ratio`` and ``norm_type`` need custom actions because they map
    onto several CS keys or onto derived values.
    """

    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                ["model_type"],
                action=BaseConfigConverter.assert_factory_fn(0, "mpt"),
            ),
            # Embedding
            ConversionRule(["vocab_size"], action=self.replaceKey),
            ConversionRule(
                ["use_position_embedding"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                [EquivalentSubkey("emb_pdrop", "embedding_dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "tie_word_embeddings", "share_embedding_weights"
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["embedding_layer_norm"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            # Decoder Block
            ConversionRule(
                [EquivalentSubkey("d_model", "hidden_size")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("n_heads", "num_heads")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("n_layers", "num_hidden_layers")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("max_seq_len", "max_position_embeddings")],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["attn_config"],
                exists="left",
                action=self.convert_attention_config,
            ),
            ConversionRule(
                ["attention_module"],
                exists="right",
                action=self.convert_attention_config,
            ),
            ConversionRule(
                ["attention_type"],
                exists="right",
                action=self.convert_attention_config,
            ),
            ConversionRule(
                ["attention_dropout_rate"],
                exists="right",
                action=self.convert_attention_config,
            ),
            ConversionRule(
                ["position_embedding_type"],
                exists="right",
                action=self.convert_attention_config,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "no_bias", "use_projection_bias_in_attention"
                    )
                ],
                action=self.convert_no_bias,
            ),
            ConversionRule(
                [EquivalentSubkey("no_bias", "use_ffn_bias_in_attention")],
                action=self.convert_no_bias,
            ),
            ConversionRule(
                [EquivalentSubkey("no_bias", "use_ffn_bias")],
                action=self.convert_no_bias,
            ),
            ConversionRule(
                [EquivalentSubkey("expansion_ratio", "filter_size")],
                action=self.convert_expansion_ratio,
            ),
            ConversionRule(
                ["nonlinearity"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, "gelu"),
            ),
            ConversionRule(
                [EquivalentSubkey("resid_pdrop", "dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["layer_norm_epsilon"],
                action=BaseConfigConverter.assert_factory_fn(1, 1.0e-5),
            ),
            ConversionRule(
                ["use_bias_in_output"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(["initializer_range"], action=self.replaceKey),
            ConversionRule(
                ["fixed_sparse_attention"],
                action=BaseConfigConverter.assert_factory_fn(1, None),
            ),
            ConversionRule(
                ["norm_first"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["use_ff_layer1_dropout"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(
                ["norm_type"],
                action=self.convert_norm_type,
            ),
            ConversionRule(
                ["embedding_fraction"],
                action=BaseConfigConverter.assert_factory_fn(0, 1.0),
            ),
            ConversionRule(
                ["logit_scale"],
                action=BaseConfigConverter.assert_factory_fn(0, None),
            ),
        ]

        # Defaults applied to an incoming HF config before conversion.
        self.pre_convert_defaults[0].update(
            {
                "d_model": 2048,
                "n_heads": 16,
                "n_layers": 24,
                "expansion_ratio": 4,
                "max_seq_len": 2048,
                "vocab_size": 50368,
                "resid_pdrop": 0.0,
                "emb_pdrop": 0.0,
                "learned_pos_emb": True,
                "attn_config": {
                    "attn_type": "multihead_attention",
                    "attn_pdrop": 0.0,
                    "attn_impl": "triton",
                    "qk_ln": False,
                    "clip_qkv": None,
                    "softmax_scale": None,
                    "prefix_lm": False,
                    "attn_uses_sequence_id": False,
                    "alibi": False,
                    "alibi_bias_max": 8,
                },
                "logit_scale": None,
                "no_bias": False,
                "embedding_fraction": 1.0,
                "norm_type": "low_precision_layernorm",
                "use_cache": False,
            }
        )
        # Defaults applied to an incoming CS config before conversion.
        self.pre_convert_defaults[1].update(
            {
                "share_embedding_weights": True,
                "norm_type": "layernorm",
                "max_position_embeddings": 1024,
                "position_embedding_type": "learned",
                "layer_norm_epsilon": 1.0e-5,
                "use_projection_bias_in_attention": True,
                "use_ffn_bias_in_attention": True,
                "nonlinearity": "gelu",
                "use_ffn_bias": True,
                "use_bias_in_output": False,
                "norm_first": True,
            },
        )

        # Defaults applied to the produced HF config after conversion.
        self.post_convert_defaults[0].update(
            {
                "model_type": "mpt",
                "attn_config": {
                    "attn_type": "multihead_attention",
                    "attn_pdrop": 0.0,
                    "attn_impl": "torch",
                    "qk_ln": False,
                    "clip_qkv": None,
                    "softmax_scale": None,
                    "prefix_lm": False,
                    "attn_uses_sequence_id": False,
                    "alibi": False,
                    "alibi_bias_max": 8,
                },
            }
        )
        # Defaults applied to the produced CS config after conversion.
        self.post_convert_defaults[1].update(
            {
                "use_position_embedding": True,
                "position_embedding_type": "learned",
                "embedding_dropout_rate": 0.0,
                "embedding_layer_norm": False,
                "attention_type": "scaled_dot_product",
                "use_projection_bias_in_attention": True,
                "use_ffn_bias_in_attention": True,
                "use_ffn_bias": True,
                "attention_dropout_rate": 0.0,
                "dropout_rate": 0.0,
                "use_bias_in_output": False,
                "norm_first": True,
                "use_ff_layer1_dropout": False,
                "share_embedding_weights": True,
                "nonlinearity": "gelu",
            },
        )

    def convert_attention_config(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Translate attention settings between HF ``attn_config`` and CS keys.

        HF -> CS (``from_index == 0``): reads the ``attn_config`` dict and
        writes ``attention_module``, ``attention_dropout_rate``,
        ``attention_type`` and ``position_embedding_type``. Entries missing
        from ``attn_config`` fall back to the same values this converter
        registers for ``attn_config`` in ``pre_convert_defaults[0]``.

        CS -> HF: called once per CS key; each call fills in the matching
        entry of ``new_state_dict["attn_config"]``.

        Raises:
            ConfigConversionError: if a value has no equivalent on the other
                side.
        """
        if from_index == 0:
            attn_config = old_state_dict[old_key]

            # Use .get throughout: a partial attn_config (which the CS -> HF
            # branch below can itself produce) must not raise KeyError.
            attention_module = attn_config.get(
                "attn_type", "multihead_attention"
            )
            if attention_module == "multihead_attention":
                attention_module = "aiayn_attention"
            new_state_dict["attention_module"] = attention_module

            new_state_dict["attention_dropout_rate"] = attn_config.get(
                "attn_pdrop", 0.0
            )

            softmax_scale = attn_config.get("softmax_scale")
            if softmax_scale is None:
                new_state_dict["attention_type"] = "scaled_dot_product"
            elif softmax_scale == 1.0:
                new_state_dict["attention_type"] = "dot_product"
            else:
                raise ConfigConversionError(
                    "CS model only supports softmax_scale of 1.0 or None"
                )

            if attn_config.get("alibi", False):
                new_state_dict["position_embedding_type"] = "alibi"
            else:
                new_state_dict["position_embedding_type"] = "learned"

            if attn_config.get("alibi_bias_max", 8) != 8:
                raise ConfigConversionError("CS only supports alibi_bias_max=8")
        else:
            if "attn_config" not in new_state_dict:
                new_state_dict["attn_config"] = {}

            if old_key == "attention_module":
                attention_module = old_state_dict[old_key]
                if attention_module == "aiayn_attention":
                    attention_module = "multihead_attention"
                elif attention_module == "multiquery_attention":
                    pass
                else:
                    raise ConfigConversionError(
                        "MPT model does not support attention_module={}".format(
                            attention_module
                        )
                    )
                new_state_dict["attn_config"]["attn_type"] = attention_module
            elif old_key == "attention_dropout_rate":
                new_state_dict["attn_config"]["attn_pdrop"] = old_state_dict[
                    old_key
                ]
            elif old_key == "attention_type":
                attention_type = old_state_dict[old_key]
                if attention_type == "scaled_dot_product":
                    softmax_scale = None
                elif attention_type == "dot_product":
                    softmax_scale = 1.0
                else:
                    raise ConfigConversionError(
                        "attention_type {} isn't supported in MPT models".format(
                            attention_type
                        )
                    )
                new_state_dict["attn_config"]["softmax_scale"] = softmax_scale
            elif old_key == "position_embedding_type":
                position_embedding_type = old_state_dict[old_key]
                if position_embedding_type == "alibi":
                    new_state_dict["attn_config"]["alibi"] = True
                elif position_embedding_type == "learned":
                    new_state_dict["attn_config"]["alibi"] = False
                else:
                    raise ConfigConversionError(
                        "MPT model only supports alibi or learned position embeddings"
                    )

    def convert_expansion_ratio(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Convert between HF ``expansion_ratio`` and CS ``filter_size``.

        HF -> CS multiplies the ratio by ``d_model``. CS -> HF divides
        ``filter_size`` by ``hidden_size``.

        Raises:
            ConfigConversionError: on CS -> HF when the quotient is not an
                integer.
        """
        if from_index == 0:
            new_state_dict[new_key] = (
                old_state_dict[old_key] * old_state_dict["d_model"]
            )
        else:
            expansion_ratio = (
                old_state_dict[old_key] / old_state_dict["hidden_size"]
            )
            if not expansion_ratio.is_integer():
                raise ConfigConversionError(
                    "expansion_ratio (filter_size / hidden_size) must be an integer"
                )
            new_state_dict[new_key] = int(expansion_ratio)

    def convert_no_bias(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Convert between HF ``no_bias`` and the three CS bias flags.

        HF -> CS sets all three CS flags to ``not no_bias``. CS -> HF is
        called once per CS flag and writes ``no_bias`` as the negation of
        that flag.

        Raises:
            ConfigConversionError: on CS -> HF when a flag disagrees with the
                ``no_bias`` value already written by an earlier flag.
        """
        if from_index == 0:
            new_state_dict["use_projection_bias_in_attention"] = (
                not old_state_dict[old_key]
            )
            new_state_dict["use_ffn_bias_in_attention"] = not old_state_dict[
                old_key
            ]
            new_state_dict["use_ffn_bias"] = not old_state_dict[old_key]
        else:
            # `no_bias` is the negation of each CS flag, so an already written
            # `no_bias` that EQUALS the current flag means the flags disagree.
            if (
                new_key in new_state_dict
                and new_state_dict[new_key] == old_state_dict[old_key]
            ):
                raise ConfigConversionError(
                    "use_projection_bias_in_attention, use_ffn_bias_in_attention, and "
                    "use_ffn_bias must all be the same in MPT models."
                )
            new_state_dict[new_key] = not old_state_dict[old_key]

    def convert_norm_type(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Convert ``norm_type`` between the HF and CS spellings.

        HF -> CS strips a ``low_precision_`` prefix (with a warning) and maps
        ``layernorm`` to ``biasless-layernorm`` when ``no_bias`` is set.
        CS -> HF maps ``biasless-layernorm`` back to ``layernorm`` and sets
        ``no_bias`` to ``True``.

        Raises:
            ConfigConversionError: on CS -> HF when ``biasless-layernorm`` is
                combined with any enabled CS bias flag.
        """
        norm_type = old_state_dict[old_key]
        if from_index == 0:
            if norm_type.startswith("low_precision_"):
                logging.warning(
                    "CS doesn't support low precision layer norm. Using non-low-precision "
                    "implementation."
                )
                norm_type = norm_type[len("low_precision_") :]
            if norm_type == "layernorm" and old_state_dict["no_bias"]:
                norm_type = "biasless-layernorm"
        else:
            if norm_type == "biasless-layernorm":
                if (
                    old_state_dict["use_projection_bias_in_attention"]
                    or old_state_dict["use_ffn_bias_in_attention"]
                    or old_state_dict["use_ffn_bias"]
                ):
                    raise ConfigConversionError(
                        "use_projection_bias_in_attention, use_ffn_bias_in_attention, and "
                        "use_ffn_bias must all be False when using biasless-layernorm in MPT "
                        "models."
                    )
                new_state_dict["no_bias"] = True
                norm_type = "layernorm"
        new_state_dict[new_key] = norm_type

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))
###########################################################
# In CS 2.1, we refactored the embedding layer.
# CS 2.0 <> CS 2.1, and HF <> CS 2.1 converters:
###########################################################
class Converter_MPTForCausalLM_CS20_CS21(Converter_GPT2LMHeadModel_CS20_CS21):
    """CS 2.0 <-> CS 2.1 converter for MPT-configured models.

    Reuses ``Converter_GPT2LMHeadModel_CS20_CS21`` unchanged; only the
    converter note differs.
    """

    def __init__(self):
        # No MPT-specific state: initialization is entirely the parent's.
        super().__init__()

    @classmethod
    def converter_note(cls) -> str:
        """Return the user-facing description of this converter."""
        note = "GPT2LMHeadModel class (configured as MPT)"
        return note
class ConfigConverter_MPT_HF_CS21(ConfigConverter_MPT_HF_CS20):
    """HF <-> CS 2.1/2.2/2.3 config converter for MPT.

    Reuses the CS 2.0 rules and defaults, except that
    ``use_position_embedding`` is removed from the CS post-convert defaults.
    """

    def __init__(self):
        super().__init__()
        cs_post_defaults = self.post_convert_defaults[1]
        del cs_post_defaults["use_position_embedding"]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-2.1", "cs-2.2", "cs-2.3")
        return (hf_format, cs_format)
class Converter_MPTModel_WithoutOptionalModel_HF_CS21(Converter_MPTModel_HF_CS):
    """HF MPTModel <-> CS 2.1/2.2/2.3 checkpoint converter.

    Extends ``Converter_MPTModel_HF_CS`` with the CS 2.1 embedding-layer key
    names. The two new rules are placed ahead of the inherited ones.
    """

    def __init__(self):
        super().__init__()
        self.cs_slopes_key = "embedding_layer.position_embed_helper.slopes"  # used in post_model_convert fn
        self.rules = [
            ConversionRule(
                [
                    EquivalentSubkey(
                        "wpe", "embedding_layer.position_embeddings.embed"
                    ),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [r"embedding_layer\.position_embed_helper\.slopes"],
                exists="right",
            ),
            *self.rules,
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        return (
            FormatVersions("hf"),
            FormatVersions("cs-2.1", "cs-2.2", "cs-2.3"),
        )

    @classmethod
    def converter_note(cls) -> str:
        """Return the user-facing description of this converter."""
        # The lm_head is copied from the word embeddings by the inherited
        # convert_word_embeddings, so the note must not claim it is randomly
        # initialized.
        return (
            "{} MPTModel <-> {} GPT2LMHeadModel (configured as MPT)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to the word embedding "
            "weights (tied embeddings). When converting to HF, the language "
            "model head will be dropped."
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this converter."""
        return ConfigConverter_MPT_HF_CS21
# Converter class exported as `Converter_MPTModel_HF_CS21`, built by handing
# Converter_MPTModel_WithoutOptionalModel_HF_CS21 to
# Build_HF_CS_Converter_WithOptionalModel.
# NOTE(review): the helper presumably adds handling for an optional `model.`
# key prefix on the CS side — confirm in checkpoint_converters.helper.
Converter_MPTModel_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel( "Converter_MPTModel_HF_CS21", Converter_MPTModel_WithoutOptionalModel_HF_CS21, derived_class=Converter_MPTModel_WithoutOptionalModel_HF_CS21, )
class Converter_MPTForCausalLM_WithoutOptionalModel_HF_CS21(
    Converter_MPTForCausalLM_HF_CS
):
    """HF MPTForCausalLM <-> CS 2.1/2.2/2.3 checkpoint converter.

    Strips the HF ``transformer.`` prefix and delegates the remaining keys to
    ``Converter_MPTModel_WithoutOptionalModel_HF_CS21``.
    """

    def __init__(self):
        super().__init__()
        # used in post_model_convert fn
        self.cs_slopes_key = "embedding_layer.position_embed_helper.slopes"
        strip_transformer_prefix = ConversionRule(
            [
                EquivalentSubkey("transformer.", ""),
                Converter_MPTModel_WithoutOptionalModel_HF_CS21(),
            ],
            action=None,
        )
        self.rules = [strip_transformer_prefix]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-2.1", "cs-2.2", "cs-2.3")
        return (hf_format, cs_format)

    @classmethod
    def converter_note(cls) -> str:
        """Return the user-facing description of this converter."""
        note_template = (
            "{} MPTForCausalLM <-> {} GPT2LMHeadModel (configured as MPT)"
        )
        return note_template.format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this converter."""
        return ConfigConverter_MPT_HF_CS21
# Converter class exported as `Converter_MPTForCausalLM_HF_CS21`, built by
# handing Converter_MPTForCausalLM_WithoutOptionalModel_HF_CS21 to
# Build_HF_CS_Converter_WithOptionalModel.
# NOTE(review): the helper presumably adds handling for an optional `model.`
# key prefix on the CS side — confirm in checkpoint_converters.helper.
Converter_MPTForCausalLM_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel( "Converter_MPTForCausalLM_HF_CS21", Converter_MPTForCausalLM_WithoutOptionalModel_HF_CS21, derived_class=Converter_MPTForCausalLM_WithoutOptionalModel_HF_CS21, )