Source code for cerebras.modelzoo.tools.checkpoint_converters.starcoder

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from typing import Tuple

import torch

from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    BaseConfigConverter_HF_CS,
    ConfigConversionError,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)
from cerebras.modelzoo.tools.checkpoint_converters.gpt2_hf_cs import (
    Converter_GPT2LMHeadModel_CS20_CS21,
    Converter_GPT2Model_HF_CS17,
)
from cerebras.modelzoo.tools.checkpoint_converters.helper import (
    Build_HF_CS_Converter_WithOptionalModel,
)


class Converter_Starcoder_Attention_HF_CS(BaseCheckpointConverter_HF_CS):
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                [
                    EquivalentSubkey("c_proj", "proj_output_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("c_attn", "proj_q_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.c_attn_converter,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("c_attn", "proj_k_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.assert_already_converted,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("c_attn", "proj_v_dense_layer"),
                    r"\.(?:weight|bias)",
                ],
                action=self.assert_already_converted,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-X.X"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return None

    def c_attn_converter(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        if from_index == 0:
            self.c_attn_converter_hf_to_cs(
                old_key, new_key, old_state_dict, new_state_dict, action_fn_args
            )
        else:
            self.c_attn_converter_cs_to_hf(
                old_key, new_key, old_state_dict, new_state_dict, action_fn_args
            )

    def c_attn_converter_hf_to_cs(
        self, old_key, new_key, old_state_dict, new_state_dict, action_fn_args
    ):
        # For both MHA and MQA, the c_attn weights are packed,
        # but the weight matrix for each is a different shape.
        # MHA: weight --> 3 * embed_dim x embed_dim
        # MQA: weight --> (embed_dim + 2 * head_dim) x embed_dim
        # where embed_dim is for the Queries, and each of the 2 head_dim is
        # for one of Keys and Values.
        q_key = new_key
        k_key = re.sub(
            r"\.proj_q_dense_layer\.", ".proj_k_dense_layer.", q_key
        )
        v_key = re.sub(
            r"\.proj_q_dense_layer\.", ".proj_v_dense_layer.", q_key
        )

        hf_config = action_fn_args["configs"][0]
        is_multiquery = hf_config["multi_query"]
        embed_dim = hf_config["n_embd"]
        n_head = hf_config["n_head"]
        d_head = int(embed_dim / n_head)

        # Note that nn.Linear stores matrices with shape [out_dim x in_dim]
        packed_dim = old_state_dict[old_key].shape[0]
        if is_multiquery:
            assert packed_dim == embed_dim + 2 * d_head, (
                f"Invalid tensor shape {old_state_dict[old_key].shape} at {old_key}. The first "
                f"dimension should be embed_dim plus 2x the head_dim since "
                f"Q, K, and V are packed"
            )
            # The ellipsis handles both weight and bias: it indexes all of the
            # 2nd dim for the weight and is a no-op for the bias.
            q_weight, kv_weight = (
                old_state_dict[old_key][:embed_dim, ...],
                old_state_dict[old_key][embed_dim:, ...],
            )
            k_weight, v_weight = kv_weight.chunk(2, dim=0)
            (
                new_state_dict[q_key],
                new_state_dict[k_key],
                new_state_dict[v_key],
            ) = (q_weight, k_weight, v_weight)
        else:
            assert 3 * embed_dim == packed_dim, (
                f"Invalid tensor shape {old_state_dict[old_key].shape} at {old_key}. The first "
                f"dimension should be 3x embed_dim since Q, K, and V are "
                f"packed"
            )
            packed_weight = old_state_dict[old_key]
            query_indices = [
                i + j
                for i in range(0, packed_dim, 3 * d_head)
                for j in range(d_head)
                if i + j < packed_dim
            ]
            key_indices = [
                i + j
                for i in range(d_head, packed_dim, 3 * d_head)
                for j in range(d_head)
                if i + j < packed_dim
            ]
            value_indices = [
                i + j
                for i in range(2 * d_head, packed_dim, 3 * d_head)
                for j in range(d_head)
                if i + j < packed_dim
            ]
            query = packed_weight[query_indices, ...]
            key = packed_weight[key_indices, ...]
            value = packed_weight[value_indices, ...]

            new_state_dict[q_key] = query
            new_state_dict[k_key] = key
            new_state_dict[v_key] = value

    def c_attn_converter_cs_to_hf(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        action_fn_args,
    ):
        # HF represents Q, K, and V in a packed format
        q_key = old_key
        k_key = re.sub(
            r"\.proj_q_dense_layer\.", ".proj_k_dense_layer.", q_key
        )
        v_key = re.sub(
            r"\.proj_q_dense_layer\.", ".proj_v_dense_layer.", q_key
        )

        assert (
            k_key in old_state_dict
        ), "Expected the following key to exist! {}".format(k_key)
        assert (
            v_key in old_state_dict
        ), "Expected the following key to exist! {}".format(v_key)

        hf_config = action_fn_args["configs"][0]
        embed_dim = hf_config["n_embd"]
        n_head = hf_config["n_head"]
        d_head = int(embed_dim / n_head)
        is_multiquery = hf_config["multi_query"]

        # Note that nn.Linear stores matrices with shape [out_dim x in_dim]
        packed_dim = 3 * embed_dim
        if is_multiquery:
            new_state_dict[new_key] = torch.cat(
                (
                    old_state_dict[q_key],
                    old_state_dict[k_key],
                    old_state_dict[v_key],
                ),
                dim=0,
            )
        else:
            query_indices = [
                i + j
                for i in range(0, packed_dim, 3 * d_head)
                for j in range(d_head)
                if i + j < packed_dim
            ]
            key_indices = [
                i + j
                for i in range(d_head, packed_dim, 3 * d_head)
                for j in range(d_head)
                if i + j < packed_dim
            ]
            value_indices = [
                i + j
                for i in range(2 * d_head, packed_dim, 3 * d_head)
                for j in range(d_head)
                if i + j < packed_dim
            ]
            is_weight = len(old_state_dict[q_key].shape) > 1
            packed_weights = (
                torch.zeros(packed_dim, embed_dim)
                if is_weight
                else torch.zeros(packed_dim)
            )
            packed_weights[query_indices, ...] = old_state_dict[q_key]
            packed_weights[key_indices, ...] = old_state_dict[k_key]
            packed_weights[value_indices, ...] = old_state_dict[v_key]
            new_state_dict[new_key] = packed_weights

    def assert_already_converted(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        if from_index == 0:
            # We should never hit this case as this key should have been
            # matched already.
            assert False, "Invalid key: {}".format(old_key)
        else:
            # When we convert from CS -> HF, the proj_q_dense_layer should
            # also handle conversion of proj_k_dense_layer and
            # proj_v_dense_layer since HF represents these three layers in a
            # packed format. We simply need to test that the key containing
            # the packed format has already been converted.
            assert (
                new_key in new_state_dict
            ), "Key should've been already converted: {} -> {}".format(
                old_key, new_key
            )
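

# The packing handled by c_attn_converter above can be hard to visualize, so
# the sketch below walks through the MQA case on dummy tensors. This is a
# hypothetical, illustrative helper only (it is not referenced by any
# conversion rule); it assumes the layout described in the comments above,
# where the packed c_attn weight is [embed_dim + 2 * head_dim, embed_dim]
# with the first embed_dim rows belonging to the queries.
def _example_split_multiquery_c_attn(embed_dim=8, n_head=4):
    d_head = embed_dim // n_head
    # Dummy packed weight: query rows first, then one K head and one V head.
    packed = torch.arange(
        (embed_dim + 2 * d_head) * embed_dim, dtype=torch.float32
    ).reshape(embed_dim + 2 * d_head, embed_dim)

    # Same slicing the HF -> CS direction performs.
    q = packed[:embed_dim, ...]
    k, v = packed[embed_dim:, ...].chunk(2, dim=0)

    # Same concatenation the CS -> HF direction performs; it round-trips.
    repacked = torch.cat((q, k, v), dim=0)
    assert torch.equal(packed, repacked)
    return q.shape, k.shape, v.shape  # (8, 8), (2, 8), (2, 8)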


# This is a base converter for Starcoder that inherits from the GPT-2
# CS17 converter, which contains most of the rules necessary for
# converting GPT-2 checkpoints. This class is meant to be used as
# an action within the rules of the CS-2.0 converter below,
# which catches checkpoints from the PyTorch 2.0 API and PyTorchBaseModel.
# It is not meant for use on its own, because this model was not
# included in the codebase before release 2.0. Note that we include a
# formats() method in this class and the StarcoderForCausalLM
# converter below because it is a required method, due to the
# declaration as an @abstractmethod in BaseDictionaryConverter.
# The cs-X.X in the formats() method is meant to call attention to this.
class Converter_StarcoderModel_HF_CS(Converter_GPT2Model_HF_CS17):
    def attention_converter_class(self):
        return Converter_Starcoder_Attention_HF_CS()

    def ffn_converter(self):
        return self.replaceKey

    # see note above
    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-X.X"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_StarcoderModel_HF_CS20


class Converter_StarcoderForCausalLM_HF_CS(BaseCheckpointConverter_HF_CS):
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                [r"lm_head\.(?:weight|bias)"],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("transformer.", ""),
                    Converter_StarcoderModel_HF_CS(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-X.X"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_StarcoderModel_HF_CS20


class Converter_StarcoderModel_HF_CS20(Converter_StarcoderModel_HF_CS):
    def __init__(self):
        super().__init__()
        self.rules = [
            # Catch checkpoints from the PyTorch 2.0 API
            ConversionRule(
                [
                    Converter_StarcoderModel_HF_CS(),
                ],
                action=None,
            ),
            # Catch checkpoints from the deprecated PyTorchBaseModel
            ConversionRule(
                [
                    EquivalentSubkey("", "model."),
                    Converter_StarcoderModel_HF_CS(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))

    @classmethod
    def converter_note(cls) -> str:
        return (
            "{} GPTBigCodeModel <-> {} GPT2ForCausalLM (configured as Starcoder)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to default random "
            "values. When converting to HF, the language model head will be "
            "dropped."
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_StarcoderModel_HF_CS20


class Converter_StarcoderForCausalLM_HF_CS20(BaseCheckpointConverter_HF_CS):
    def __init__(self):
        super().__init__()
        self.rules = [
            # Catch checkpoints from the PyTorch 2.0 API
            ConversionRule(
                [
                    Converter_StarcoderForCausalLM_HF_CS(),
                ],
                action=None,
            ),
            # Catch checkpoints from the deprecated PyTorchBaseModel
            ConversionRule(
                [
                    EquivalentSubkey("", "model."),
                    Converter_StarcoderForCausalLM_HF_CS(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))

    @classmethod
    def converter_note(cls) -> str:
        return "{} GPTBigCodeForCausalLM <-> {} GPT2ForCausalLM (configured as Starcoder)".format(
            cls.formats()[0], cls.formats()[1]
        )

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_StarcoderModel_HF_CS20
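

# The two rules in each CS 2.0 converter above differ only in whether the CS
# keys carry the legacy "model." prefix emitted by the deprecated
# PyTorchBaseModel. The mapping below is a hypothetical illustration (example
# key names, not taken from a real checkpoint) of the two CS-side forms that
# resolve to the same parameter; it is not used by the converters.
_EXAMPLE_CS20_KEY_PREFIXES = {
    # Checkpoint saved via the PyTorch 2.0 API: no prefix.
    "pytorch_2.0_api": "lm_head.weight",
    # Checkpoint saved via the deprecated PyTorchBaseModel: "model." prefix.
    "pytorch_base_model": "model.lm_head.weight",
}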


class ConfigConverter_StarcoderModel_HF_CS20(BaseConfigConverter_HF_CS):
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                ["norm_type"],
                action=BaseConfigConverter.assert_factory_fn(1, "layernorm"),
            ),
            ConversionRule(
                ["model_type"],
                action=BaseConfigConverter.assert_factory_fn(0, "gpt_bigcode"),
            ),
            # Embedding
            ConversionRule(["vocab_size"], action=self.replaceKey),
            ConversionRule(
                ["position_embedding_type"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, "learned"),
            ),
            ConversionRule(
                ["use_position_embedding"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                [EquivalentSubkey("embd_pdrop", "embedding_dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "tie_word_embeddings", "share_embedding_weights"
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["embedding_layer_norm"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            # Decoder Block
            ConversionRule(
                [EquivalentSubkey("n_embd", "hidden_size")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("n_head", "num_heads")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("n_layer", "num_hidden_layers")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("n_positions", "max_position_embeddings")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("scale_attn_weights", "attention_type")],
                action=self.convert_attention_type,
            ),
            ConversionRule(
                ["use_projection_bias_in_attention"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["use_ffn_bias_in_attention"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["use_ffn_bias"],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                [EquivalentSubkey("n_inner", "filter_size")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("activation_function", "nonlinearity")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("attn_pdrop", "attention_dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("resid_pdrop", "dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(["rotary_dim"], action=self.replaceKey),
            ConversionRule(
                ["layer_norm_epsilon"],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["use_bias_in_output"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(["initializer_range"], action=self.replaceKey),
            ConversionRule(
                ["fixed_sparse_attention"],
                action=BaseConfigConverter.assert_factory_fn(1, None),
            ),
            ConversionRule(
                ["norm_first"],
                action=BaseConfigConverter.assert_factory_fn(1, True),
            ),
            ConversionRule(
                ["use_ff_layer1_dropout"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "attention_softmax_in_fp32",
                        "attention_softmax_fp32",
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["scale_qk_dot_by_layer_idx"],
                action=BaseConfigConverter.assert_factory_fn(1, False),
            ),
        ]

        # HF pre/post updates
        self.pre_convert_defaults[0].update(
            {
                "tie_word_embeddings": True,
                "multi_query": True,
                "attn_pdrop": 0.0,
                "scale_attn_weights": True,
                "resid_pdrop": 0.0,
                "embd_pdrop": 0.0,
                "n_inner": 24576,
                "n_embd": 6144,
                "n_head": 48,
                "n_layer": 40,
                "vocab_size": 49152,
                "n_positions": 8192,
            }
        )
        self.post_convert_defaults[0].update(
            {
                "model_type": "gpt_bigcode",
                "architectures": ["GPTBigCodeForCausalLM"],
                "validate_runner_input": True,
                "use_cache": True,
                "transformers_version": "4.28.1",
                "summary_use_proj": True,
                "summary_type": "cls_index",
                "inference_runner": 0,
                "eos_token_id": 0,
                "bos_token_id": 0,
                "max_sequence_length": None,
                "max_batch_size": None,
            }
        )

        # CS pre/post updates
        self.pre_convert_defaults[1].update(
            {
                "share_embedding_weights": True,
                "attention_dropout_rate": 0.0,
                "attention_module": "multiquery_attention",
                "attention_type": "scaled_dot_product",
                "scale_qk_dot_by_layer_idx": False,
                "dropout_rate": 0.0,
                "embedding_dropout_rate": 0.0,
                "filter_size": 24576,
                "hidden_size": 6144,
                "max_position_embeddings": 8192,
                "num_heads": 48,
                "num_hidden_layers": 40,
                "vocab_size": 49152,
            },
        )
        self.post_convert_defaults[1].update(
            {
                "position_embedding_type": "learned",
                "use_projection_bias_in_attention": True,
                "use_ffn_bias_in_attention": True,
                "use_ffn_bias": True,
                "nonlinearity": "gelu",
                "use_bias_in_output": False,
                "loss_scaling": "num_tokens",
            }
        )

    def convert_attention_type(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        if from_index == 0:
            new_state_dict[new_key] = (
                "scaled_dot_product"
                if old_state_dict[old_key]
                else "dot_product"
            )
            new_state_dict["attention_module"] = (
                "multiquery_attention"
                if old_state_dict["multi_query"]
                else "aiayn_attention"
            )
            if old_state_dict["multi_query"]:
                new_state_dict["extra_attention_params"] = {"num_kv_groups": 1}
        else:
            if (
                old_state_dict[old_key] != "scaled_dot_product"
                and old_state_dict[old_key] != "dot_product"
            ):
                raise ConfigConversionError(
                    "Can't convert config with {}={}. Only {} is supported.".format(
                        old_key,
                        old_state_dict[old_key],
                        "scaled_dot_product and dot_product",
                    )
                )
            new_state_dict[new_key] = old_state_dict[old_key].startswith(
                "scaled_"
            )
            is_multiquery = (
                old_state_dict["attention_module"] == "multiquery_attention"
            )
            new_state_dict["multi_query"] = is_multiquery

    def pre_config_convert(
        self,
        config,
        converter_indices,
    ):
        config = super().pre_config_convert(config, converter_indices)

        if converter_indices.direction == 0:
            if "n_inner" not in config or config["n_inner"] is None:
                config["n_inner"] = 4 * config["n_embd"]
        else:
            if "embedding_dropout_rate" not in config:
                config["embedding_dropout_rate"] = config["dropout_rate"]
            if "attention_dropout_rate" not in config:
                config["attention_dropout_rate"] = config["dropout_rate"]
        return config

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))


###########################################################
# In CS 2.1, we refactored the embedding layer.
###########################################################


class Converter_StarcoderLMHeadModel_CS20_CS21(
    Converter_GPT2LMHeadModel_CS20_CS21
):
    @classmethod
    def converter_note(cls) -> str:
        return "GPT2LMHeadModel class (configured as Starcoder)"


class ConfigConverter_StarcoderModel_HF_CS21(
    ConfigConverter_StarcoderModel_HF_CS20
):
    "CS 2.1 config is the same as CS 2.0"

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (
            FormatVersions("hf"),
            FormatVersions("cs-2.1", "cs-2.2", "cs-2.3"),
        )

    def supports_mup_conversion(self):
        return True


class Converter_StarcoderModel_WithoutOptionalModel_HF_CS21(
    Converter_StarcoderModel_HF_CS
):
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                [
                    EquivalentSubkey(
                        "wpe", "embedding_layer.position_embeddings.embed"
                    ),
                    r"\.(?:weight|bias)",
                ],
                action=self.replaceKey,
            ),
            *self.rules,
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (
            FormatVersions("hf"),
            FormatVersions("cs-2.1", "cs-2.2", "cs-2.3"),
        )

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_StarcoderModel_HF_CS21

    @classmethod
    def converter_note(cls) -> str:
        return (
            "{} GPTBigCodeModel <-> {} GPT2ForCausalLM (configured as Starcoder)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to default random "
            "values. When converting to HF, the language model head will be "
            "dropped."
        ).format(cls.formats()[0], cls.formats()[1])


Converter_StarcoderModel_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel(
    "Converter_StarcoderModel_HF_CS21",
    Converter_StarcoderModel_WithoutOptionalModel_HF_CS21,
    derived_class=Converter_StarcoderModel_WithoutOptionalModel_HF_CS21,
)


class Converter_StarcoderForCausalLM_WithoutOptionalModel_HF_CS21(
    BaseCheckpointConverter_HF_CS
):
    def __init__(self):
        super().__init__()
        self.rules = [
            ConversionRule(
                [r"lm_head\.(?:weight|bias)"],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey("transformer.", ""),
                    Converter_StarcoderModel_WithoutOptionalModel_HF_CS21(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        return (
            FormatVersions("hf"),
            FormatVersions("cs-2.1", "cs-2.2", "cs-2.3"),
        )

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        return ConfigConverter_StarcoderModel_HF_CS21

    @classmethod
    def converter_note(cls) -> str:
        return "{} GPTBigCodeForCausalLM <-> {} GPT2ForCausalLM (configured as Starcoder)".format(
            cls.formats()[0], cls.formats()[1]
        )

    def supports_mup_conversion(self):
        return True


Converter_StarcoderForCausalLM_HF_CS21 = Build_HF_CS_Converter_WithOptionalModel(
    "Converter_StarcoderForCausalLM_HF_CS21",
    Converter_StarcoderForCausalLM_WithoutOptionalModel_HF_CS21,
    derived_class=Converter_StarcoderForCausalLM_WithoutOptionalModel_HF_CS21,
)