Source code for cerebras.modelzoo.tools.checkpoint_converters.vit

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

import torch

from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    BaseConfigConverter_HF_CS,
    ConfigConversionError,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)


class Converter_ViT_Core_HF_CS21(BaseCheckpointConverter_HF_CS):
    """Key-mapping rules for the ViT backbone shared by the HF <-> CS converters.

    Handles the embedding, per-layer encoder, final layer-norm and pooler
    weights. In every ``EquivalentSubkey`` the first string is the HF key
    fragment and the second one is the CS key fragment.
    """

    def __init__(self):
        super().__init__()

        # Regex suffix shared by every rule that maps a weight/bias pair.
        weight_or_bias = r"\.(?:weight|bias)"

        def encoder_layer_rule(hf_subkey, cs_subkey):
            # One plain-rename rule for a parameter inside a numbered
            # encoder layer.
            return ConversionRule(
                [
                    EquivalentSubkey(
                        "encoder.layer",
                        "encoder.transformer_encoder.layers",
                    ),
                    r"\.\d+\.",
                    EquivalentSubkey(hf_subkey, cs_subkey),
                    weight_or_bias,
                ],
                action=self.replaceKey,
            )

        # Embedding:
        embedding_rules = [
            ConversionRule(
                [
                    EquivalentSubkey(
                        "embeddings.cls_token",
                        "embedding_layer.cls_embedding",
                    ),
                ],
                action=self.cls_embedding_convert,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "embeddings.position_embeddings",
                        "embedding_layer.position_embeddings.weight",
                    ),
                ],
                action=self.position_embeddings_convert,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "embeddings.patch_embeddings.projection",
                        "embedding_layer.linear_proj",
                    ),
                    weight_or_bias,
                ],
                action=self.replaceKey,
            ),
        ]

        # Encoder: (HF fragment, CS fragment) pairs, kept in rule order.
        encoder_subkeys = (
            ("attention.attention.query", "self_attn.proj_q_dense_layer"),
            ("attention.attention.key", "self_attn.proj_k_dense_layer"),
            ("attention.attention.value", "self_attn.proj_v_dense_layer"),
            ("attention.output.dense", "self_attn.proj_output_dense_layer"),
            ("intermediate.dense", "ffn.ffn.0.linear_layer"),
            ("output.dense", "ffn.ffn.1.linear_layer"),
            ("layernorm_before", "norm1"),
            ("layernorm_after", "norm2"),
        )
        encoder_rules = [
            encoder_layer_rule(hf_subkey, cs_subkey)
            for hf_subkey, cs_subkey in encoder_subkeys
        ]

        # Final encoder layer-norm and pooler.
        trailing_rules = [
            ConversionRule(
                [
                    EquivalentSubkey(
                        "layernorm",
                        "encoder.transformer_encoder.norm",
                    ),
                    weight_or_bias,
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "pooler.dense",
                        "encoder.pooler.pooler.ffn.0.linear_layer",
                    ),
                    weight_or_bias,
                ],
                action=self.replaceKey,
            ),
        ]

        self.rules = embedding_rules + encoder_rules + trailing_rules

    def cls_embedding_convert(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the CLS embedding, adapting its shape to the target format.

        HF -> CS (``from_index == 0``) squeezes out all singleton dimensions;
        the other direction reshapes the tensor to ``(1, 1, -1)``.
        """
        cls_token = old_state_dict[old_key]
        if from_index == 0:
            converted = cls_token.squeeze()
        else:
            converted = cls_token.reshape(1, 1, -1)
        new_state_dict[new_key] = converted

    def position_embeddings_convert(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the position embeddings while moving the CLS-token row.

        The HF table keeps the CLS position at index 0 (behind a leading
        singleton dimension); the CS table keeps it as the last row. Only
        ``position_embedding_type == "learned"`` is accepted.
        """
        assert (
            action_fn_args["configs"][1]["model"]["position_embedding_type"]
            == "learned"
        ), "Only learned embeddings are supported"

        source = old_state_dict[old_key]
        if from_index == 0:
            # HF -> CS: drop the leading dim and rotate row 0 to the end.
            patch_rows = source[0, 1:, :]
            cls_row = source[0, :1, :]
            new_state_dict[new_key] = torch.cat([patch_rows, cls_row], dim=0)
        else:
            # CS -> HF: rotate the last row to the front, restore leading dim.
            cls_row = source[-1:, :]
            patch_rows = source[:-1, :]
            rotated = torch.cat([cls_row, patch_rows], dim=0)
            new_state_dict[new_key] = rotated.unsqueeze(0)

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs-2.1")
        return (hf_versions, cs_versions)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_ViT_HF_CS21
class Converter_ViT_Headless_HF_CS21(BaseCheckpointConverter_HF_CS):
    """Convert between a headless HF ViTModel and the CS classification model.

    The HF checkpoint has no classifier head and the CS one does, so the head
    is created when converting HF -> CS and dropped when converting CS -> HF.
    """

    def __init__(self):
        super().__init__()
        self.rules = [
            # ViTModel has a pooling layer, ViTModelForImageClassification
            # doesn't. Dots are escaped so the pattern only matches the
            # literal key path.
            ConversionRule(
                [
                    r"pooler\.dense\.(?:weight|bias)",
                ],
                exists="left",
                action=None,
            ),
            # for HF without head
            ConversionRule(
                [
                    EquivalentSubkey("", "vit_model."),
                    Converter_ViT_Core_HF_CS21(),
                ],
            ),
            # drop classifier during CS -> HF
            ConversionRule(
                [
                    r"classifier\.classifier\.ffn\.0\.linear_layer\.(?:weight|bias)",
                ],
                exists="right",
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        return (
            FormatVersions("hf"),
            FormatVersions("cs-2.1", "cs-2.2", "cs-2.3"),
        )

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        return (
            "{} ViTModel <-> {} ViTClassificationModel\n"
            "The HF model doesn't contain a classifier head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a classifier head initialized to default random "
            "values. When converting to HF, the classifier head will be "
            "dropped."
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_ViT_HF_CS21

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        converter_indices,
        drop_unmatched_keys,
        key_prefix="",
    ):
        """Add a freshly initialized classifier head when converting HF -> CS.

        Args:
            old_state_dict: Source state dict (forwarded to the base class).
            new_state_dict: Target state dict; the classifier weight (and
                bias, when ``use_bias_in_output`` is set in the CS config) is
                written into it for direction 0.
            configs: Config pair; index 1 is read as the CS config.
            converter_indices: Object whose ``direction`` selects the
                conversion direction (0 is HF -> CS).
            drop_unmatched_keys: Forwarded unchanged to the base class.
            key_prefix: Prefix prepended to the generated classifier keys.
        """
        if converter_indices.direction == 0:
            # We are converting from HF ViTModel (headless) to our
            # ViTForClassificationModel, so 'classifier' has to be created
            # and initialized to default values.
            cs_model_config = configs[1]["model"]
            use_bias_in_output = cs_model_config.get(
                "use_bias_in_output", False
            )
            num_classes = cs_model_config["num_classes"]
            embed_dim = cs_model_config["hidden_size"]

            classifier_key = (
                key_prefix + "classifier.classifier.ffn.0.linear_layer"
            )

            classifier_weight = torch.zeros((num_classes, embed_dim))
            classifier_weight.normal_(mean=0.0, std=0.02)
            new_state_dict[classifier_key + ".weight"] = classifier_weight

            if use_bias_in_output:
                classifier_bias = torch.zeros(num_classes)
                new_state_dict[classifier_key + ".bias"] = classifier_bias

        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            converter_indices,
            drop_unmatched_keys,
            key_prefix=key_prefix,
        )
class Converter_ViT_HF_CS21(BaseCheckpointConverter_HF_CS):
    """Convert between HF ViTForImageClassification and the CS classifier model.

    The backbone is delegated to ``Converter_ViT_Core_HF_CS21`` under the
    ``vit.`` (HF) / ``vit_model.`` (CS) prefix; the classifier head is a plain
    key rename.
    """

    def __init__(self):
        super().__init__()

        # for HF with head
        backbone_rule = ConversionRule(
            [
                EquivalentSubkey("vit.", "vit_model."),
                Converter_ViT_Core_HF_CS21(),
            ],
        )
        # classifier
        classifier_rule = ConversionRule(
            [
                EquivalentSubkey(
                    "classifier", "classifier.classifier.ffn.0.linear_layer"
                ),
                r"\.(?:weight|bias)",
            ],
            action=self.replaceKey,
        )
        self.rules = [backbone_rule, classifier_rule]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs-2.1", "cs-2.2", "cs-2.3")
        return (hf_versions, cs_versions)

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable description of this conversion."""
        hf_versions = cls.formats()[0]
        cs_versions = cls.formats()[1]
        template = "{} ViTForImageClassification <-> {} ViTClassificationModel"
        return template.format(hf_versions, cs_versions)

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_ViT_HF_CS21
class ConfigConverter_ViT_Core_HF_CS21(BaseConfigConverter_HF_CS):
    """Config conversion rules for the ViT backbone (HF <-> CS).

    Side 0 of every rule/default is the HF config, side 1 the CS config.
    """

    def __init__(self):
        super().__init__()

        def same_key(name):
            # Key has the same name on both sides; value is copied.
            return ConversionRule([name], action=self.replaceKey)

        def renamed_key(hf_name, cs_name):
            # Key is renamed between the two sides; value is copied.
            return ConversionRule(
                [EquivalentSubkey(hf_name, cs_name)],
                action=self.replaceKey,
            )

        def cs_only_flag(name, required_value):
            # Key only exists in the CS config and must hold this value.
            return ConversionRule(
                [name],
                exists="right",
                action=BaseConfigConverter.assert_factory_fn(
                    1, required_value
                ),
            )

        self.rules = [
            ConversionRule(
                ["model_type"],
                action=BaseConfigConverter.assert_factory_fn(0, "vit"),
            ),
            cs_only_flag("use_post_embed_layer_norm", False),
            same_key("hidden_size"),
            same_key("num_hidden_layers"),
            renamed_key("layer_norm_eps", "layer_norm_epsilon"),
            renamed_key("num_attention_heads", "num_heads"),
            ConversionRule(
                ["attention_type"],
                action=BaseConfigConverter.assert_factory_fn(
                    1, "scaled_dot_product"
                ),
            ),
            renamed_key("hidden_dropout_prob", "dropout_rate"),
            ConversionRule(
                [EquivalentSubkey("hidden_act", "nonlinearity")],
                action=self.convert_nonlinearity,
            ),
            renamed_key(
                "attention_probs_dropout_prob", "attention_dropout_rate"
            ),
            cs_only_flag("use_projection_bias_in_attention", True),
            cs_only_flag("use_ffn_bias_in_attention", True),
            renamed_key("intermediate_size", "filter_size"),
            cs_only_flag("use_ffn_bias", True),
            same_key("initializer_range"),
            ConversionRule(
                ["image_size"],
                action=self.convert_image_patch_size,
            ),
            same_key("num_channels"),
            ConversionRule(
                ["patch_size"],
                action=self.convert_image_patch_size,
            ),
            cs_only_flag("use_conv_patchified_embedding", True),
        ]

        # Defaults registered for the HF side (index 0).
        hf_pre_defaults = {
            "attention_probs_dropout_prob": 0.0,
            "encoder_stride": 16,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.0,
            "hidden_size": 768,
            "image_size": 224,
            "initializer_range": 0.02,
            "intermediate_size": 3072,
            "layer_norm_eps": 1e-12,
            "model_type": "vit",
            "num_attention_heads": 12,
            "num_channels": 3,
            "num_hidden_layers": 12,
            "patch_size": 16,
            "qkv_bias": True,
        }
        # Defaults registered for the CS side (index 1); the post-convert
        # set additionally carries "use_bias_in_output".
        cs_pre_defaults = {
            "use_conv_patchified_embedding": True,
            "prepend_cls_token": True,
            "use_encoder_pooler_layer": True,
            "position_embedding_type": "learned",
            "num_classes": 2,
        }
        cs_post_defaults = dict(cs_pre_defaults)
        cs_post_defaults["use_bias_in_output"] = True

        self.pre_convert_defaults[0].update(hf_pre_defaults)
        self.pre_convert_defaults[1].update(cs_pre_defaults)
        self.post_convert_defaults[0].update({"model_type": "vit"})
        self.post_convert_defaults[1].update(cs_post_defaults)

    def convert_image_patch_size(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Convert ``image_size`` / ``patch_size`` between scalar and pair form.

        HF -> CS duplicates the scalar into a two-element list. CS -> HF
        unpacks the pair and writes a single value.

        Raises:
            ConfigConversionError: if the two CS values differ.
        """
        value = old_state_dict[old_key]
        if from_index == 0:
            new_state_dict[new_key] = [value, value]
            return

        width, height = value
        if width != height:
            raise ConfigConversionError(
                "Can't convert config with {}={}. Image width and height need to match.".format(
                    old_key, value
                )
            )
        new_state_dict[new_key] = width

    def convert_nonlinearity(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Translate the activation name; unknown names pass through unchanged."""
        hf_to_cs = {
            "silu": "swiglu",
            "gelu_pytorch_tanh": "gelu_new",
            "quick_gelu": "quick_gelu",
        }
        cs_to_hf = {
            "swiglu": "silu",
            "gelu_new": "gelu_pytorch_tanh",
            "quick_gelu": "quick_gelu",
        }

        activation = old_state_dict[old_key]
        if from_index == 0:
            name_map = hf_to_cs
        elif from_index == 1:
            name_map = cs_to_hf
        else:
            name_map = {}
        new_state_dict[new_key] = name_map.get(activation, activation)

    def pre_config_convert(
        self,
        config,
        converter_indices,
    ):
        """Run the base pre-conversion step, then validate ``encoder_stride``.

        Raises:
            ConfigConversionError: when converting from HF (direction 0) and
                ``encoder_stride`` is present but differs from ``patch_size``.
        """
        config = super().pre_config_convert(config, converter_indices)

        converting_from_hf = converter_indices.direction == 0
        if converting_from_hf and "encoder_stride" in config:
            if config["encoder_stride"] != config["patch_size"]:
                raise ConfigConversionError(
                    f"{self.formats()[1]} model only supports encoder_stride == patch_size"
                )
        return config

    def post_config_convert(
        self,
        original_config,
        old_config,
        new_config,
        converter_indices,
        drop_unmatched_keys,
    ):
        """Fill in ``encoder_stride`` for CS -> HF, then defer to the base class."""
        converting_to_hf = converter_indices.direction == 1
        if converting_to_hf and "encoder_stride" not in new_config:
            # Mirror patch_size so the produced HF config is self-consistent.
            new_config["encoder_stride"] = new_config["patch_size"]

        return super().post_config_convert(
            original_config,
            old_config,
            new_config,
            converter_indices,
            drop_unmatched_keys,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS) format versions handled by this converter."""
        hf_versions = FormatVersions("hf")
        cs_versions = FormatVersions("cs-2.1", "cs-2.2", "cs-2.3")
        return (hf_versions, cs_versions)
class ConfigConverter_ViT_HF_CS21(ConfigConverter_ViT_Core_HF_CS21):
    """Config converter for the ViT classification model.

    Extends the core rules with the ``num_labels`` (HF) <-> ``num_classes``
    (CS) mapping and overrides the CS-side defaults.
    """

    def __init__(self):
        super().__init__()

        label_count_rule = ConversionRule(
            [EquivalentSubkey("num_labels", "num_classes")],
            action=self.replaceKey,
        )
        # The new rule goes ahead of the inherited rules.
        self.rules = [label_count_rule] + list(self.rules)

        # CS-side (index 1) default overrides, registered for both the
        # pre- and post-convert default sets.
        cs_overrides = {
            "use_encoder_pooler_layer": False,
            "num_classes": 2,
        }
        self.pre_convert_defaults[1].update(cs_overrides)
        self.post_convert_defaults[1].update(cs_overrides)