Source code for cerebras.modelzoo.tools.checkpoint_converters.bloom_hf_cs

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import math
import re
from typing import Tuple

import torch

from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    BaseConfigConverter_HF_CS,
    ConfigConversionError,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.gpt2_hf_cs import (
    ConfigConverter_GPT2Model_CS18_CS20,
    Converter_GPT2LMHeadModel_CS18_CS20,
    Converter_GPT2LMHeadModel_CS20_CS21,
)
from cerebras.modelzoo.tools.checkpoint_converters.helper import (
    Build_HF_CS_Converter_WithOptionalModel,
)


class Converter_Bloom_Attention_HF_CS17(BaseCheckpointConverter_HF_CS):
    """Convert Bloom attention weights between the HF and CS 1.7 layouts.

    HF keeps Q, K and V in one packed ``query_key_value`` tensor, whereas the
    CS side uses three separate ``proj_{q,k,v}_dense_layer`` tensors. The
    ``proj_q_dense_layer`` rule performs the pack/unpack for all three; the
    K and V rules only verify that the work has already been done.
    """

    def __init__(
        self,
    ):
        super().__init__()
        self.rules = [
            ConversionRule(
                [
                    EquivalentSubkey("dense", "proj_output_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("query_key_value", "proj_q_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.qkv_converter,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("query_key_value", "proj_k_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.assert_already_converted,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("query_key_value", "proj_v_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.assert_already_converted,
            ),
        ]

    def qkv_converter(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Dispatch to the unpack (HF -> CS) or pack (CS -> HF) routine.

        ``from_index == 0`` means the source checkpoint is in the first
        format returned by :meth:`formats` (HF); any other value is treated
        as CS -> HF.
        """
        if from_index == 0:
            self.qkv_converter_hf_to_cs17(
                old_key, new_key, old_state_dict, new_state_dict, action_fn_args
            )
        else:
            self.qkv_converter_cs17_to_hf(
                old_key, new_key, old_state_dict, new_state_dict, action_fn_args
            )

    def qkv_converter_hf_to_cs17(
        self, old_key, new_key, old_state_dict, new_state_dict, action_fn_args
    ):
        """Split the packed HF QKV tensor into separate CS Q, K and V tensors.

        The packed tensor is viewed as ``(num_heads, 3, head_dim)`` for a
        bias or ``(num_heads, 3, head_dim, hidden_size)`` for a weight, and
        index 0/1/2 of the second axis is written out as Q/K/V.

        Raises:
            ValueError: if ``new_key`` ends with neither ``.bias`` nor
                ``.weight``.
        """
        # HF represents Q, K, and V in a packed format. We need to unpack the
        # weight and bias tensor for CS format.
        q_key = new_key
        k_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_k_dense_layer.", q_key)
        v_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_v_dense_layer.", q_key)

        # configs[1] is the config of the second format in formats().
        num_heads = action_fn_args["configs"][1]["model"]["num_heads"]
        hidden_size = action_fn_args["configs"][1]["model"]["hidden_size"]

        if new_key.endswith(".bias"):
            reshaped = old_state_dict[old_key].view(
                num_heads, 3, hidden_size // num_heads
            )
            new_state_dict[q_key] = reshaped[:, 0, :].reshape(hidden_size)
            new_state_dict[k_key] = reshaped[:, 1, :].reshape(hidden_size)
            new_state_dict[v_key] = reshaped[:, 2, :].reshape(hidden_size)
        elif new_key.endswith(".weight"):
            reshaped = old_state_dict[old_key].view(
                num_heads, 3, hidden_size // num_heads, hidden_size
            )
            new_state_dict[q_key] = reshaped[:, 0, :, :].reshape(
                hidden_size, hidden_size
            )
            new_state_dict[k_key] = reshaped[:, 1, :, :].reshape(
                hidden_size, hidden_size
            )
            new_state_dict[v_key] = reshaped[:, 2, :, :].reshape(
                hidden_size, hidden_size
            )
        else:
            raise ValueError("Invalid key after conversion: {}".format(new_key))

    def qkv_converter_cs17_to_hf(
        self, old_key, new_key, old_state_dict, new_state_dict, action_fn_args
    ):
        """Pack separate CS Q, K and V tensors into the single HF QKV tensor.

        Each tensor is viewed as ``(num_heads, 1, head_dim[, hidden_size])``,
        the three are concatenated along axis 1, and the result is flattened
        to ``(3 * hidden_size[, hidden_size])``.

        Raises:
            AssertionError: if the sibling K or V key is missing from
                ``old_state_dict``.
        """
        # HF represents Q, K, and V in a packed format.
        q_key = old_key
        k_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_k_dense_layer.", q_key)
        v_key = re.sub(r"\.proj_q_dense_layer\.", ".proj_v_dense_layer.", q_key)

        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if k_key not in old_state_dict:
            raise AssertionError(
                "Expected the following key to exist! {}".format(k_key)
            )
        if v_key not in old_state_dict:
            raise AssertionError(
                "Expected the following key to exist! {}".format(v_key)
            )

        num_heads = action_fn_args["configs"][1]["model"]["num_heads"]
        hidden_size = action_fn_args["configs"][1]["model"]["hidden_size"]

        expand_dim = [
            num_heads,
            1,
            hidden_size // num_heads,
        ]
        shrink_dim = [hidden_size * 3]
        # Weights carry one extra trailing axis compared to biases.
        if new_key.endswith(".weight"):
            expand_dim += [
                hidden_size,
            ]
            shrink_dim += [
                hidden_size,
            ]

        new_state_dict[new_key] = torch.cat(
            (
                old_state_dict[q_key].view(*expand_dim),
                old_state_dict[k_key].view(*expand_dim),
                old_state_dict[v_key].view(*expand_dim),
            ),
            dim=1,
        ).view(*shrink_dim)

    def assert_already_converted(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Verify that a K/V key was already handled by the Q rule.

        Raises:
            AssertionError: when reached in the HF -> CS direction, or when
                the packed HF key is not yet present in ``new_state_dict``
                in the CS -> HF direction.
        """
        if from_index == 0:
            # We should never hit this case as this key should have been matched
            # already. Raised explicitly so it is not stripped by `python -O`.
            raise AssertionError("Invalid key: {}".format(old_key))

        # When we convert from CS -> HF, the proj_q_dense_layer should also handle
        # conversion of proj_k_dense_layer and proj_v_dense_layer since HF
        # represents these three layers in a packed format. We simply need
        # to test that the key containing the packed format has already
        # been converted.
        if new_key not in new_state_dict:
            raise AssertionError(
                "Key should've been already converted: {} -> {}".format(
                    old_key, new_key
                )
            )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: this converter has no config converter of its own."""
        return None
class Converter_BloomModel_HF_CS17(BaseCheckpointConverter_HF_CS):
    """Checkpoint converter: HF BloomModel <-> CS 1.7 GPT2LMHeadModel.

    The HF model is headless, so converting to CS creates an ``lm_head`` and
    the ALiBi slopes tensor in :meth:`post_model_convert`.
    """

    def __init__(self):
        super().__init__()
        # Key under which the ALiBi slopes are written on the CS side.
        self.cs_slopes_key = "relative_pe_helper.slopes"
        self.rules = [
            ConversionRule(
                [
                    EquivalentSubkey(
                        "word_embeddings", "embedding_layer.word_embeddings"
                    ),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "word_embeddings_layernorm", "embedding_ln_f"
                    ),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("h", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("self_attention.", "self_attn."),
                    Converter_Bloom_Attention_HF_CS17(),
                ],
                action=None,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("h", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("input_layernorm", "norm1"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("h", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey("post_attention_layernorm", "norm3"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("h", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey(
                        "mlp.dense_h_to_4h", "ffn.ffn.0.linear_layer"
                    ),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("h", "transformer_decoder.layers"),
                    r"\.\d+\.",
                    EquivalentSubkey(
                        "mlp.dense_4h_to_h", "ffn.ffn.1.linear_layer"
                    ),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("ln_f", "transformer_decoder.norm"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replace_final_norm,
            ),
            ConversionRule([r"lm_head\.(?:weight|bias)"], exists="right"),
            ConversionRule([r"ln_f\.(?:weight|bias)"], exists="right"),
            ConversionRule(
                [
                    r"relative_pe_helper\.slopes",
                ],
                exists="right",
            ),
        ]

    def replace_final_norm(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the final layer norm, duplicating it under ``ln_f`` for CS."""
        new_state_dict[new_key] = old_state_dict[old_key]
        # CS 1.7 has both "ln_f" and "transformer_decoder.norm"
        # we need to copy the original ("ln_f") too:
        if from_index == 0:
            ln_f_key = re.sub(r"transformer_decoder\.norm\.", "ln_f.", new_key)
            new_state_dict[ln_f_key] = old_state_dict[old_key]

    @staticmethod
    def get_alibi_slopes(n):
        """Return the ALiBi slopes for ``n`` heads as a ``(n, 1)`` tensor.

        For a power-of-two ``n`` the slopes form a geometric sequence. For
        any other ``n`` the sequence for the largest power of two below ``n``
        is extended with every other slope of the sequence for twice that
        power of two.
        """

        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio**i for i in range(n)]

        # In the paper, we only train models that have 2^a heads for some a. This function has
        # some good properties that only occur when the input is a power of 2. To maintain that even
        # when the number of heads is not a power of 2, we use this workaround.
        if math.log2(n).is_integer():
            slopes_list = get_slopes_power_of_2(n)
        else:
            closest_power_of_2 = 2 ** math.floor(math.log2(n))
            # Both operands must be Python lists so that `+` concatenates.
            # Calling get_alibi_slopes recursively here would yield a tensor
            # and break the concatenation; 2 * closest_power_of_2 is itself a
            # power of two, so the list helper gives the required values.
            slopes_list = (
                get_slopes_power_of_2(closest_power_of_2)
                + get_slopes_power_of_2(2 * closest_power_of_2)[0::2][
                    : n - closest_power_of_2
                ]
            )
        return torch.tensor(slopes_list).unsqueeze(-1)

    def pre_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
    ):
        """Warn about the missing HF head, or re-tie CS embedding weights.

        In direction 0 only a warning is logged. In direction 1, when
        ``share_embedding_weights`` is set and the embedding weight is stored
        as ``None``, it is replaced by ``lm_head.weight``.
        """
        if converter_indices.direction == 0:
            logging.warning(
                "{} GPT2 backbone has a language model head (lm_head) "
                "while {} BloomModel does not. Initializing lm_head to default.".format(
                    self.formats()[1], self.formats()[0]
                )
            )

        # Manually tie weights
        if (
            converter_indices.direction == 1
            and configs[1]["model"]["share_embedding_weights"]
        ):
            # The 0 default separates "key missing" from "key stored as None";
            # only the latter triggers the tie.
            if (
                old_state_dict.get("embedding_layer.word_embeddings.weight", 0)
                is None
            ):
                old_state_dict["embedding_layer.word_embeddings.weight"] = (
                    old_state_dict["lm_head.weight"]
                )

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Add the CS-only tensors (``lm_head`` and ALiBi slopes), then defer
        to the base class implementation.

        In direction 0 the ``lm_head`` weight reuses the HF word embeddings
        when ``tie_word_embeddings`` is set, otherwise it is drawn from a
        normal distribution (mean 0.0, std 0.02). A zero bias is added when
        the CS config enables ``use_bias_in_output``.
        """
        if converter_indices.direction == 0:
            # We are converting from HF BloomModel (which is headless) -> CS GPT2LMHeadModel
            # We need to create 'lm_head' and init to default values
            hf_config = configs[0]
            cs_config = configs[1]
            use_bias_in_output = cs_config["model"].get(
                "use_bias_in_output", False
            )
            vocab_size = cs_config["model"]["vocab_size"]
            embed_dim = cs_config["model"]["hidden_size"]
            if hf_config["tie_word_embeddings"]:
                lm_head_weight = old_state_dict['word_embeddings.weight']
            else:
                lm_head_weight = torch.zeros((vocab_size, embed_dim))
                lm_head_weight.normal_(mean=0.0, std=0.02)
            new_state_dict[key_prefix + "lm_head.weight"] = lm_head_weight
            if use_bias_in_output:
                lm_head_bias = torch.zeros(vocab_size)
                new_state_dict[key_prefix + "lm_head.bias"] = lm_head_bias

            new_state_dict[key_prefix + self.cs_slopes_key] = (
                Converter_BloomModel_HF_CS17.get_alibi_slopes(
                    cs_config["model"]["num_heads"]
                )
            )
        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        return (
            "{} BloomModel <-> {} GPT2LMHeadModel (configured as bloom)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to default random values"
            ". When converting to HF, the language model head will be dropped."
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS17
class Converter_BloomModel_HF_CS19(Converter_BloomModel_HF_CS17):
    """HF BloomModel <-> CS 1.9 converter built on top of the CS 1.7 rules.

    Two rules wrap the CS 1.7 converter: one matches keys as-is, the other
    matches keys that carry a ``model.`` prefix on the CS side.
    """

    def __init__(self):
        super().__init__()
        # Catch checkpoints from Pytorch 2.0 API
        unprefixed_rule = ConversionRule(
            [Converter_BloomModel_HF_CS17()],
            action=None,
        )
        # Catch checkpoints from 1.7/1.8
        prefixed_rule = ConversionRule(
            [EquivalentSubkey("", "model."), Converter_BloomModel_HF_CS17()],
            action=None,
        )
        self.rules = [unprefixed_rule, prefixed_rule]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.9) format pair handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-1.9")
        return (hf_format, cs_format)

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        template = (
            "{} BloomModel <-> {} GPT2LMHeadModel (configured as bloom)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to default random values"
            ". When converting to HF, the language model head will be dropped."
        )
        return template.format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS19
class ConfigConverter_BloomModel_HF_CS17(BaseConfigConverter_HF_CS):
    """Config converter: HF Bloom config <-> CS 1.7 GPT2 config (bloom setup).

    Keys shared by both formats are renamed; keys that only one side has are
    asserted to hold the single value the other side can represent.
    """

    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                ["model_type"],
                action=BaseConfigConverter.assert_factory_fn(0, "bloom"),
            ),
            # Embedding
            ConversionRule(["vocab_size"], action=self.replaceKey),
            ConversionRule(
                ["position_embedding_type"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, "alibi"),
            ),
            ConversionRule(
                ["use_position_embedding"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["embedding_dropout_rate"],
                action=BaseConfigConverter.assert_factory_fn(1, 0.0),
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "tie_word_embeddings", "share_embedding_weights"
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["embedding_layer_norm"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            # Decoder Block
            ConversionRule(
                ["hidden_size"],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("n_head", "num_heads")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("n_layer", "num_hidden_layers")],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["max_position_embeddings"],
                exists="right",
                action=None,
            ),
            ConversionRule(
                ["attention_type"],
                action=BaseConfigConverter.assert_factory_fn(
                    1, "scaled_dot_product"
                ),
            ),
            ConversionRule(
                ["attention_module"],
                action=BaseConfigConverter.assert_factory_fn(
                    1, "aiayn_attention"
                ),
            ),
            ConversionRule(
                ["attention_inner_dim"],
                action=self.assert_attention_inner_dim,
            ),
            ConversionRule(
                ["use_rms_norm"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(
                ["use_projection_bias_in_attention"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["use_ffn_bias_in_attention"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["use_ffn_bias"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["filter_size"],
                exists="right",
                action=self.assert_filter_size,
            ),
            ConversionRule(
                ["nonlinearity"],
                action=BaseConfigConverter.assert_factory_fn(1, "gelu"),
            ),
            ConversionRule(
                ["apply_residual_connection_post_layernorm"],
                action=BaseConfigConverter.assert_factory_fn(0, False),
            ),
            ConversionRule(
                ["norm_first"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "attention_dropout", "attention_dropout_rate"
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("hidden_dropout", "dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["layer_norm_epsilon"],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["use_bias_in_output"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(["initializer_range"], action=self.replaceKey),
            ConversionRule(
                ["fixed_sparse_attention"],
                action=BaseConfigConverter.assert_factory_fn(1, None),
            ),
            ConversionRule(
                ["use_ff_layer1_dropout"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(
                ["scale_attn_by_inverse_layer_idx"],
                action=BaseConfigConverter.assert_factory_fn(0, False),
            ),
            ConversionRule(
                ["reorder_and_upcast_attn"],
                action=BaseConfigConverter.assert_factory_fn(0, False),
            ),
            ConversionRule(
                ["alibi_trainable_slopes"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
        ]

        # Defaults applied to the HF config (index 0) before conversion.
        self.pre_convert_defaults[0].update(
            {
                "tie_word_embeddings": True,
                "vocab_size": 250880,
                "hidden_size": 64,
                "n_layer": 2,
                "n_head": 8,
                "layer_norm_epsilon": 1e-5,
                "initializer_range": 0.02,
                "apply_residual_connection_post_layernorm": False,
                "hidden_dropout": 0.0,
                "attention_dropout": 0.0,
            }
        )
        # Defaults applied to the CS config (index 1) before conversion.
        self.pre_convert_defaults[1].update(
            {"share_embedding_weights": True, "embedding_layer_norm": False}
        )

        # Defaults applied to each side's config after conversion.
        self.post_convert_defaults[0].update({"model_type": "bloom"})
        self.post_convert_defaults[1].update(
            {
                "position_embedding_type": "alibi",
                "use_ffn_bias_in_attention": True,
                "use_projection_bias_in_attention": True,
                "use_ffn_bias": True,
                "use_bias_in_output": False,
                "attention_type": "scaled_dot_product",
                "embedding_layer_norm": True,
            }
        )

    def assert_filter_size(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Reject configs whose ``filter_size`` is not ``4 * hidden_size``.

        Raises:
            ConfigConversionError: if the two values disagree.
        """
        if old_state_dict[old_key] != 4 * old_state_dict["hidden_size"]:
            raise ConfigConversionError(
                "HF model only supports filter_size = 4 * hidden_size"
            )

    def assert_attention_inner_dim(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Reject configs whose ``attention_inner_dim`` differs from
        ``hidden_size``; a value of ``None`` is accepted.

        Raises:
            ConfigConversionError: if the value is set and differs.
        """
        if (
            old_state_dict[old_key] is not None
            and old_state_dict[old_key] != old_state_dict["hidden_size"]
        ):
            raise ConfigConversionError(
                "HF model only supports attention_inner_dim = hidden_size"
            )

    def pre_config_convert(
        self,
        config,
        converter_indices,
    ):
        """Normalize the incoming config before the rules are applied.

        Direction 0: a non-``None`` ``n_embed`` overrides ``hidden_size``.
        Direction 1: missing ``embedding_dropout_rate`` and
        ``attention_dropout_rate`` entries are filled from ``dropout_rate``.

        Returns:
            The (possibly modified) config.
        """
        config = super().pre_config_convert(config, converter_indices)

        if converter_indices.direction == 0:
            if "n_embed" in config and config["n_embed"] is not None:
                config["hidden_size"] = config["n_embed"]
        elif converter_indices.direction == 1:
            if "embedding_dropout_rate" not in config:
                config["embedding_dropout_rate"] = config["dropout_rate"]
            if "attention_dropout_rate" not in config:
                config["attention_dropout_rate"] = config["dropout_rate"]

        return config

    def post_config_convert(
        self,
        original_config,
        old_config,
        new_config,
        converter_indices,
        drop_unmatched_keys,
    ):
        """Set the CS ``filter_size`` in direction 0, then defer to the base
        class implementation and return its result."""
        if converter_indices.direction == 0:
            # HF has no filter_size key; see assert_filter_size for the
            # matching constraint in the opposite direction.
            new_config["filter_size"] = 4 * new_config["hidden_size"]

        return super().post_config_convert(
            original_config,
            old_config,
            new_config,
            converter_indices,
            drop_unmatched_keys,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))
class ConfigConverter_BloomModel_HF_CS19(ConfigConverter_BloomModel_HF_CS17):
    """Config converter for HF <-> CS 1.9; reuses the CS 1.7 rules unchanged."""

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.9) format pair handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-1.9")
        return (hf_format, cs_format)
class Converter_BloomLMHeadModel_HF_CS17(BaseCheckpointConverter_HF_CS):
    """Checkpoint converter: HF BloomForCausalLM <-> CS 1.7 GPT2LMHeadModel.

    ``lm_head`` keys are carried over directly; all other keys are handled by
    the BloomModel converter after the HF ``transformer.`` prefix is mapped
    to an empty prefix on the CS side.
    """

    def __init__(self):
        super().__init__()
        # Key under which the ALiBi slopes are written on the CS side.
        self.cs_slopes_key = "relative_pe_helper.slopes"

        head_rule = ConversionRule(
            [r"lm_head\.(?:weight|bias)"],
            action=self.replaceKey,
        )
        backbone_rule = ConversionRule(
            [
                EquivalentSubkey("transformer.", ""),
                Converter_BloomModel_HF_CS17(),
            ],
            action=None,
        )
        self.rules = [head_rule, backbone_rule]

    def pre_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
    ):
        """Re-tie the CS embedding weight to ``lm_head.weight`` when needed.

        Only acts in direction 1 with ``share_embedding_weights`` enabled and
        the embedding weight stored as ``None``.
        """
        # Manually tie weights
        if converter_indices.direction != 1:
            return
        if not configs[1]["model"]["share_embedding_weights"]:
            return

        embedding_key = "embedding_layer.word_embeddings.weight"
        # The 0 default separates "key missing" from "key stored as None".
        if old_state_dict.get(embedding_key, 0) is None:
            old_state_dict[embedding_key] = old_state_dict["lm_head.weight"]

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Write the ALiBi slopes in direction 0, then defer to the base class."""
        if converter_indices.direction == 0:
            head_count = configs[1]["model"]["num_heads"]
            slopes = Converter_BloomModel_HF_CS17.get_alibi_slopes(head_count)
            new_state_dict[key_prefix + self.cs_slopes_key] = slopes

        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-1.7")
        return (hf_format, cs_format)

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        template = (
            "{} BloomForCausalLM <-> {} GPT2LMHeadModel (configured as bloom)"
        )
        return template.format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS17
class Converter_BloomLMHeadModel_HF_CS19(BaseCheckpointConverter_HF_CS):
    """Checkpoint converter: HF BloomForCausalLM <-> CS 1.9 GPT2LMHeadModel.

    Two rules wrap the CS 1.7 LM-head converter: one matches keys as-is, the
    other matches keys that carry a ``model.`` prefix on the CS side.
    """

    def __init__(self):
        super().__init__()
        # Key under which the ALiBi slopes are written on the CS side.
        self.cs_slopes_key = "relative_pe_helper.slopes"

        # Catch checkpoints from Pytorch 2.0 API
        unprefixed_rule = ConversionRule(
            [Converter_BloomLMHeadModel_HF_CS17()],
            action=None,
        )
        # Catch checkpoints from 1.7/1.8
        prefixed_rule = ConversionRule(
            [
                EquivalentSubkey("", "model."),
                Converter_BloomLMHeadModel_HF_CS17(),
            ],
            action=None,
        )
        self.rules = [unprefixed_rule, prefixed_rule]

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Write the ALiBi slopes in direction 0, then defer to the base class."""
        if converter_indices.direction == 0:
            head_count = configs[1]["model"]["num_heads"]
            slopes = Converter_BloomModel_HF_CS17.get_alibi_slopes(head_count)
            new_state_dict[key_prefix + self.cs_slopes_key] = slopes

        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.9) format pair handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-1.9")
        return (hf_format, cs_format)

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        template = (
            "{} BloomForCausalLM <-> {} GPT2LMHeadModel (configured as bloom)"
        )
        return template.format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS19
class Converter_BloomLMHeadModel_CS19_CS20(Converter_GPT2LMHeadModel_CS18_CS20):
    """CS 1.9 <-> CS 2.0 checkpoint converter. Bloom uses the GPT2 backbone,
    so the GPT2 converter is reused with Bloom-specific metadata."""

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        note = "GPT2LMHeadModel class (configured as bloom)"
        return note

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (CS 1.9, CS 2.0) format pair handled by this converter."""
        source_format = FormatVersions("cs-1.9")
        target_format = FormatVersions("cs-2.0")
        return (source_format, target_format)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomLMHeadModel_CS19_CS20
class ConfigConverter_BloomLMHeadModel_CS19_CS20(
    ConfigConverter_GPT2Model_CS18_CS20
):
    """CS 1.9 <-> CS 2.0 config converter. Bloom uses the GPT2 backbone, so
    the GPT2 config converter is reused with a different format pair."""

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (CS 1.9, CS 2.0) format pair handled by this converter."""
        source_format = FormatVersions("cs-1.9")
        target_format = FormatVersions("cs-2.0")
        return (source_format, target_format)
class Converter_BloomModel_HF_CS20(Converter_BloomModel_HF_CS19):
    """HF BloomModel <-> CS 2.0 converter; inherits the CS 1.9 rules."""

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.0) format pair handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-2.0")
        return (hf_format, cs_format)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS20
class Converter_BloomLMHeadModel_HF_CS20(Converter_BloomLMHeadModel_HF_CS19):
    """HF BloomForCausalLM <-> CS 2.0 converter; inherits the CS 1.9 rules."""

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.0) format pair handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-2.0")
        return (hf_format, cs_format)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS20
class ConfigConverter_BloomModel_HF_CS20(ConfigConverter_BloomModel_HF_CS19):
    """HF <-> CS 2.0 config converter.

    Adds a ``norm_type`` rule ahead of the inherited rules, asserting that the
    CS side uses ``"layernorm"``.
    """

    def __init__(self):
        super().__init__()
        norm_type_rule = ConversionRule(
            ["norm_type"],
            action=BaseConfigConverter.assert_factory_fn(1, "layernorm"),
        )
        # The new rule goes first, followed by every inherited rule in order.
        inherited_rules = list(self.rules)
        self.rules = [norm_type_rule] + inherited_rules

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.0) format pair handled by this converter."""
        hf_format = FormatVersions("hf")
        cs_format = FormatVersions("cs-2.0")
        return (hf_format, cs_format)
###########################################################
# In CS 2.1, we refactored the embedding layer.
# CS 2.0 <> CS 2.1, and HF <> CS 2.1 converters:
###########################################################
class Converter_BloomLMHeadModel_CS20_CS21(Converter_GPT2LMHeadModel_CS20_CS21):
    """CS 2.0 <-> CS 2.1 checkpoint converter for Bloom; reuses the GPT2
    converter and only overrides the descriptive note."""

    def __init__(self):
        super().__init__()

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        note = "GPT2LMHeadModel class (configured as Bloom)"
        return note
class ConfigConverter_BloomModel_HF_CS21(ConfigConverter_BloomModel_HF_CS20):
    """CS 2.1 config is the same as CS 2.0"""

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the HF format paired with the CS 2.1 / 2.2 / 2.3 formats."""
        hf_format = FormatVersions("hf")
        cs_formats = FormatVersions("cs-2.1", "cs-2.2", "cs-2.3")
        return (hf_format, cs_formats)

    def supports_mup_conversion(self):
        """Report that this converter supports muP conversion."""
        return True
class Converter_BloomModel_WithoutOptionalModel_HF_CS21(
    Converter_BloomModel_HF_CS17
):
    """HF BloomModel <-> CS 2.1+ converter.

    Reuses the CS 1.7 rules, but the ALiBi slopes are stored under
    ``embedding_layer.position_embed_helper.slopes`` instead of
    ``relative_pe_helper.slopes``.
    """

    def __init__(self):
        super().__init__()
        self.cs_slopes_key = "embedding_layer.position_embed_helper.slopes"  # used in post_model_convert fn
        self.rules = [
            # Raw string: `\.` is a regex escape, not a Python string escape.
            ConversionRule(
                [r"embedding_layer\.position_embed_helper\.slopes"],
                exists="right",
            ),
            *self.rules,
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the HF format paired with the CS 2.1 / 2.2 / 2.3 formats."""
        return (
            FormatVersions("hf"),
            FormatVersions("cs-2.1", "cs-2.2", "cs-2.3"),
        )

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS21
# Public CS 2.1+ BloomModel converter, built by the helper from the
# "WithoutOptionalModel" class above.
# NOTE(review): presumably the helper makes the CS-side `model.` key prefix
# optional — confirm against Build_HF_CS_Converter_WithOptionalModel.
Converter_BloomModel_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel( "Converter_BloomModel_HF_CS21", Converter_BloomModel_WithoutOptionalModel_HF_CS21, derived_class=Converter_BloomModel_WithoutOptionalModel_HF_CS21, )
class Converter_BloomLMHeadModel_WithoutOptionalModel_HF_CS21(
    Converter_BloomLMHeadModel_HF_CS20
):
    """HF BloomForCausalLM <-> CS 2.1+ converter.

    ``lm_head`` keys are carried over directly; all other keys go through the
    CS 2.1 BloomModel converter after the HF ``transformer.`` prefix is mapped
    to an empty prefix on the CS side.
    """

    def __init__(self):
        super().__init__()
        self.cs_slopes_key = "embedding_layer.position_embed_helper.slopes"  # used in post_model_convert fn
        self.rules = [
            # Raw string: `\.` is a regex escape, not a Python string escape.
            ConversionRule(
                [r"lm_head\.(?:weight|bias)"],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("transformer.", ""),
                    Converter_BloomModel_WithoutOptionalModel_HF_CS21(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the HF format paired with the CS 2.1 / 2.2 / 2.3 formats."""
        return (
            FormatVersions("hf"),
            FormatVersions("cs-2.1", "cs-2.2", "cs-2.3"),
        )

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter paired with this checkpoint converter."""
        return ConfigConverter_BloomModel_HF_CS21

    def supports_mup_conversion(self):
        """Report that this converter supports muP conversion."""
        return True
# Public CS 2.1+ BloomForCausalLM converter, built by the helper from the
# "WithoutOptionalModel" class above.
# NOTE(review): presumably the helper makes the CS-side `model.` key prefix
# optional — confirm against Build_HF_CS_Converter_WithOptionalModel.
Converter_BloomLMHeadModel_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel( "Converter_BloomLMHeadModel_HF_CS21", Converter_BloomLMHeadModel_WithoutOptionalModel_HF_CS21, derived_class=Converter_BloomLMHeadModel_WithoutOptionalModel_HF_CS21, )