Source code for cerebras.modelzoo.tools.checkpoint_converters.streaming_checkpoints

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import math
import os
import re
from typing import Union

import safetensors.torch as safetensors_torch
import torch


def convert_file_size_to_int(size: Union[int, str]):
    """
    Converts a size expressed as a string with digits and unit to an integer.

    Args:
        size (`int` or `str`):
            The size to convert (e.g., `"5MB"`). Any value that is not a
            `str` is returned unchanged.

    Returns:
        The size in bytes.

    Raises:
        ValueError: If `size` is a string that contains no digits, or if the
            digits/unit pair is rejected by `convert_byte_unit`.

    Example:
    ```py
    >>> convert_file_size_to_int("10GiB")
    10737418240
    ```
    """
    if not isinstance(size, str):
        return size

    error_msg = (
        f"size '{size}' is not in a valid format. Use an integer followed by the "
        f"unit, e.g., '10GB'."
    )

    match = re.search(r'(\d+)(.*)', size)
    if not match:
        raise ValueError(error_msg)

    # Deferred import: only string inputs that passed the format check need
    # the unit conversion helper.
    from cerebras.appliance.utils.units import convert_byte_unit

    try:
        return convert_byte_unit(
            int(match.group(1)), "B", src_unit=match.group(2)
        )
    except Exception as e:
        # `Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit are never converted into a ValueError; the original
        # failure is kept as the cause for debugging.
        raise ValueError(error_msg) from e
def dtype_byte_size(dtype: torch.dtype) -> float:
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.

    Booleans are counted as a single bit (1/8 of a byte). For every other
    dtype the bit width is read from `torch.finfo` (floating point dtypes)
    or `torch.iinfo` (all remaining dtypes).

    Example:
    ```py
    >>> dtype_byte_size(torch.float32)
    4.0
    ```
    """
    if dtype == torch.bool:
        return 1 / 8

    type_info = (
        torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype)
    )
    return type_info.bits / 8
class StreamingShardedHFReader:
    r"""Allows sharded HuggingFace checkpoints to be read in a streaming manner
    rather than loading all shards into memory all at once. The underlying
    checkpoint is read-only. Only one shard is stored into memory at a time.
    For this reason, accessing random keys may be slow due to the switching
    cost (loading) between shards. It is therefore recommended that keys are
    accessed in the order given by `self.keys()` or `self.__iter__()`, as keys
    that appear in the same shard are in consecutive order.

    Args:
        index_file: Path to .index.json file.

    Raises:
        FileNotFoundError: If a shard referenced by the index's `weight_map`
            does not exist next to the index file.
    """

    def __init__(self, index_file: str) -> None:
        # Shard paths in the index are resolved relative to the index file.
        self.index_dir = os.path.dirname(index_file)
        with open(index_file, "r") as f:
            index = json.load(f)
        self.weight_map = index["weight_map"]

        # shard file name -> keys stored in that shard (files in sorted order)
        self.file2keys = {
            file: [] for file in sorted(set(self.weight_map.values()))
        }
        # Fail early if any shard is missing instead of midway through a read.
        for file in self.file2keys:
            shard_path = os.path.join(self.index_dir, file)
            if not os.path.exists(shard_path):
                raise FileNotFoundError(
                    f"Detected missing checkpoint shard: {shard_path}"
                )
        for key, file in self.weight_map.items():
            self.file2keys[file].append(key)

        # The single shard currently held in memory (none until first access).
        self.active_file_name = None
        self.active_file_data = None

    def load_shard(self, file):
        """Load one shard from `file` onto the CPU and return its contents."""
        if file.endswith(".safetensors"):
            return safetensors_torch.load_file(file, device="cpu")
        # NOTE(review): torch.load unpickles the file; only point this reader
        # at checkpoints from a trusted source.
        return torch.load(file, map_location="cpu")

    def __len__(self):
        return len(self.weight_map)

    def __iter__(self):
        # Keys are grouped by shard so that in-order access loads each shard
        # exactly once.
        for keys in self.file2keys.values():
            yield from keys

    def __getitem__(self, key):
        if key not in self.weight_map:
            raise KeyError(key)
        file = self.weight_map[key]
        if file != self.active_file_name:
            # Drop old data *before* load.
            # Without this, peak mem usage = prev shard + new shard
            # The name is cleared as well so that a failed load leaves the
            # reader in a consistent "nothing loaded" state.
            self.active_file_name = None
            self.active_file_data = None
            self.active_file_data = self.load_shard(
                os.path.join(self.index_dir, file),
            )
            self.active_file_name = file
        return self.active_file_data[key]

    def items(self):
        for key in self.keys():
            yield key, self[key]

    def keys(self):
        return list(self.__iter__())

    def values(self):
        for key in self.keys():
            yield self[key]
class StreamingShardedHFWriter:
    r"""Writes a HuggingFace sharded checkpoint in a streaming manner rather
    than accumulating the full checkpoint into memory and then writing all
    shards at the end. A partial checkpoint is accumulated into memory until
    it reaches the shard size limit at which point this shard is written to
    disk.

    It is essential that `self.save()` is called in order to flush the last
    shard to disk and to save other required metadata.

    The StreamingShardedHFWriter class supports re-accessing and even updating
    keys that have already been written. Note that accessing existing keys
    randomly may be slow due to the switching cost (loading) between shards
    that have already been written to disk. For this reason, it is recommended
    that keys are re-accessed in the order given by `self.keys()` or
    `self.__iter__()` as keys that appear in the same shard are in consecutive
    order. Note that updating data stored in a shard may result in a shard that
    is smaller/larger than the original shard size, as StreamingShardedHFWriter
    will not intelligently split or coalesce shards during updates.

    Args:
        checkpoint_dir: Path to where a new directory will be created to store
            the checkpoint shards.
        shard_size: The maximum size each checkpoint shard should be. Can be an
            integer representing the number of bytes, or a formatted string
            (ex: "10GB"). See convert_file_size_to_int for valid string formats.
        export_safetensors: Whether the output shards should be saved as
            safetensors or pickle files. Default: False. When using pickle
            files, the checkpoint & index files are saved with the
            'pytorch_model' prefix while they use the 'model' prefix when
            using safetensors.
    """

    def __init__(
        self,
        checkpoint_dir: str,
        shard_size: Union[str, int] = "10GB",
        export_safetensors=False,
    ) -> None:
        # Validate the shard size first so that a bad value does not leave an
        # empty checkpoint directory behind (which would block a retry, since
        # os.mkdir refuses to reuse an existing directory).
        self.max_shard_size = convert_file_size_to_int(shard_size)

        self.checkpoint_dir = checkpoint_dir
        self.file_ext = 'safetensors' if export_safetensors else 'bin'
        self.file_prefix = "pytorch_" if not export_safetensors else ""
        os.mkdir(self.checkpoint_dir)
        self.index_file = os.path.join(
            self.checkpoint_dir,
            f"{self.file_prefix}model.{self.file_ext}.index.json",
        )

        # key -> name of the shard file that stores it
        self.weight_map = {}
        # Zero-based number of the newest shard. `current_file_number` is
        # always kept equal to `last_file_number`.
        self.current_file_number = 0
        self.last_file_number = 0
        # Shard count baked into the file names by the most recent `save()`
        # (0 until the first `save()`).
        self.total_shards_finalized = 0
        # The single shard currently held in memory.
        self.active_file_name = self.get_filename(
            self.current_file_number, self.total_shards_finalized
        )
        self.active_file_data = {}
        # shard file name -> accumulated size in bytes
        self.file_size = {self.active_file_name: 0}
        # True when the in-memory shard differs from what is on disk.
        self.dirty = True

    def __len__(self):
        return len(self.weight_map)

    def __iter__(self):
        yield from self.weight_map

    def __getitem__(self, key):
        if key not in self.weight_map:
            raise KeyError(key)
        file = self.weight_map[key]
        if file != self.active_file_name:
            self._switch_shards(file)
        return self.active_file_data[key]

    def __setitem__(self, key, value):
        weight_size = math.ceil(value.numel() * dtype_byte_size(value.dtype))

        if key in self.weight_map:
            # We are updating a key that has already been seen before
            file = self.weight_map[key]
            if self.active_file_name != file:
                self._switch_shards(file)
            old_value = self.active_file_data[key]
            old_weight_size = math.ceil(
                old_value.numel() * dtype_byte_size(old_value.dtype)
            )
            delta_size = weight_size - old_weight_size
            if (
                self.file_size[self.active_file_name] + delta_size
                > self.max_shard_size
            ):
                logging.warning(
                    f"Updating {key} is causing shard {self.active_file_name} to be larger than "
                    f"limit."
                )
        else:
            # We are adding a new key that hasn't been seen before.
            # New keys always go to the newest shard. Compare file names (not
            # the shard counters, which are always equal) so that a previous
            # read/update of an older shard, or a failed shard load, is
            # detected.
            last_file = self.get_filename(
                self.last_file_number, self.total_shards_finalized
            )
            if self.active_file_name != last_file:
                self._switch_shards(last_file)
            # Create a new shard if this new weight "tips" us over the limit.
            # An empty shard is never rolled over: a single weight larger
            # than the limit would otherwise leave an empty shard file behind.
            if (
                self.active_file_data
                and self.file_size[self.active_file_name] + weight_size
                > self.max_shard_size
            ):
                self._start_new_shard()
            delta_size = weight_size

        self.active_file_data[key] = value
        self.weight_map[key] = self.active_file_name
        self.file_size[self.active_file_name] += delta_size
        self.dirty = True

    def get_filename(self, file_number, total_shards=0):
        """Return the shard file name for zero-based `file_number`."""
        return f"{self.file_prefix}model-{file_number+1:05d}-of-{total_shards:05d}.{self.file_ext}"

    def load_shard(self, file):
        """Load one previously written shard from `file` onto the CPU."""
        if self.file_ext == "safetensors":
            return safetensors_torch.load_file(file, device="cpu")
        # NOTE(review): torch.load unpickles the file; the shards read here
        # are expected to be the ones this writer produced.
        return torch.load(file, map_location="cpu")

    def save_shard(self, data, file):
        """Write the dictionary `data` to `file` in the configured format."""
        if self.file_ext == "safetensors":

            def materialize(value):
                # Values exposing `_materialize` are converted through it
                # before saving; tensors are made contiguous.
                if hasattr(value, "_materialize"):
                    value = value._materialize()
                if isinstance(value, torch.Tensor):
                    value = value.contiguous()
                return value

            materialized_data = {k: materialize(v) for k, v in data.items()}
            safetensors_torch.save_file(
                materialized_data, file, {"format": "pt"}
            )
        else:
            torch.save(data, file)

    def _flush(self):
        """Write the active shard to disk if it has unsaved changes."""
        if self.dirty:
            self.save_shard(
                self.active_file_data,
                os.path.join(self.checkpoint_dir, self.active_file_name),
            )
            self.dirty = False

    def _start_new_shard(self):
        """Flush the active shard and make a fresh, empty shard active."""
        self._flush()
        self.last_file_number += 1
        self.current_file_number = self.last_file_number
        # Rebinding drops the previous shard's data from memory.
        self.active_file_data = {}
        self.active_file_name = self.get_filename(
            self.current_file_number, self.total_shards_finalized
        )
        self.file_size[self.active_file_name] = 0

    def _switch_shards(self, new_file):
        """Flush the active shard and load `new_file` from disk in its place."""
        self._flush()
        # Drop old data *before* load.
        # Without this, peak mem usage = prev shard + new shard
        # The name is cleared as well so that a failed load leaves the writer
        # in a consistent "nothing loaded" state instead of pairing the new
        # name with missing data.
        self.active_file_name = None
        self.active_file_data = None
        self.active_file_data = self.load_shard(
            os.path.join(self.checkpoint_dir, new_file),
        )
        self.active_file_name = new_file

    def save(self):
        """Flush the active shard, finalize shard file names and write the
        index file. May be called again after further updates."""
        self._flush()

        total_size = sum(self.file_size.values())

        # Finalize total number of shards:
        new_total_shards = self.last_file_number + 1
        if self.total_shards_finalized != new_total_shards:
            # Step 1: Figure out the prev file -> new file mapping so that
            # we can rename the files / data structures
            file_renames = {
                self.get_filename(
                    i, self.total_shards_finalized
                ): self.get_filename(i, new_total_shards)
                for i in range(new_total_shards)
            }

            # Step 2: Rename the checkpoint files
            for prev_file, new_file in file_renames.items():
                os.rename(
                    os.path.join(self.checkpoint_dir, prev_file),
                    os.path.join(self.checkpoint_dir, new_file),
                )

            # Step 3: Update the weight map & file size data structures:
            self.weight_map = {
                key: file_renames[prev_file]
                for key, prev_file in self.weight_map.items()
            }
            self.file_size = {
                file_renames[prev_file]: size
                for prev_file, size in self.file_size.items()
            }
            # The in-memory shard must follow its file's new name, otherwise
            # the next write would look up a name that no longer exists.
            self.active_file_name = file_renames.get(
                self.active_file_name, self.active_file_name
            )

            # Step 4: Update the # of finalized shards so that future updates
            # to the writer will be able to correctly pick up the shards
            self.total_shards_finalized = new_total_shards

        with open(self.index_file, "w") as f:
            json.dump(
                {
                    "metadata": {
                        "total_size": total_size,
                    },
                    "weight_map": self.weight_map,
                },
                f,
                indent=4,
            )

    def items(self):
        for key in self.keys():
            yield key, self[key]

    def keys(self):
        return list(self.__iter__())

    def values(self):
        for key in self.keys():
            yield self[key]
class StreamingCSLeaf:
    r"""Placeholder stored in a writer's state in place of a value that lives
    in the H5 checkpoint and can be loaded from / saved to it directly.
    Non-leaf entries (dicts/lists/tuples) are instead reached through
    StreamingCSWriterView because they are iterable containers.

    Both `str()` and `repr()` render a leaf as ``*``.
    """

    def __str__(self) -> str:
        return "*"

    # A leaf has the same textual form in every context.
    __repr__ = __str__
class StreamingCSWriterView:
    r"""StreamingCSWriterView allows for checkpoints with arbitrarily nested
    dictionaries/lists to be written in a streaming (incremental) manner by
    offering a "view" into a StreamingCSWriter.

    For example, in a checkpoint with the structure {"model": {<model state>}},
    we can obtain a view into the model state via checkpoint["model"]. This
    view has state <model state> and prefix ["model"]. The view acts like a
    dict (offers `__getitem__`, `__setitem__`, etc operations) which
    incrementally saves/loads from an H5 checkpoint under the hood.

    Args:
        checkpoint_file: Path to H5 checkpoint
        state: (Sub)state dictionary corresponding to the current view of the
            checkpoint.
        prefix: Chain of keys that were accessed in the checkpoint that yielded
            the current view
    """

    def __init__(self, checkpoint_file, state, prefix=None) -> None:
        self.checkpoint_file = checkpoint_file
        self.state = state
        self.prefix = prefix or []

    def __str__(self):
        return str(self.state)

    def __repr__(self):
        return f"StreamingCSWriterView: {str(self)}"

    def __iter__(self):
        # Dict-like views iterate over keys; sequence views iterate over the
        # (lazily loaded) elements.
        if isinstance(self.state, dict):
            yield from self.keys()
        elif isinstance(self.state, (list, tuple)):
            for i in range(len(self.state)):
                yield self[i]

    def __len__(self):
        return len(self.state)

    def items(self):
        assert isinstance(self.state, dict)
        for key in self.keys():
            yield key, self[key]

    def keys(self):
        assert isinstance(self.state, dict)
        yield from self.state

    def values(self):
        assert isinstance(self.state, dict)
        for key in self.keys():
            yield self[key]

    def __contains__(self, item):
        return item in self.state

    def __getitem__(self, key):
        value = self.state[key]
        if isinstance(value, StreamingCSLeaf):
            # Leaves live in the H5 file under the dotted path of keys that
            # leads to them. The saver is imported only when it is needed.
            from cerebras.pytorch.saver.pt_h5_saver import PyTorchH5Saver

            saver = PyTorchH5Saver()
            name = ".".join(self.prefix + [str(key)])
            return saver.load_tensor(self.checkpoint_file, name)
        if isinstance(value, StreamingCSWriterView):
            return value
        if isinstance(value, (dict, list, tuple)):
            # Nested containers are exposed through a sub-view that extends
            # the prefix by the accessed key.
            return StreamingCSWriterView(
                self.checkpoint_file, value, self.prefix + [str(key)]
            )
        return value

    def get(self, key, default=None):
        if key in self:
            return self[key]
        return default

    def pop(self, key, default=None):
        # Only the in-memory state entry is removed; the value is loaded and
        # returned first.
        if key in self:
            item = self[key]
            self.state.pop(key)
            return item
        return default

    def __setitem__(self, key, value):
        from cerebras.pytorch.saver.pt_h5_saver import PyTorchH5Saver
        from cerebras.pytorch.utils.nest import recurse_spec

        if key in self.state and not isinstance(
            self.state[key], StreamingCSLeaf
        ):
            raise ValueError(
                "StreamingCSWriter does not support updating an existing "
                "key which had a dict/list/tuple value"
            )

        # Use str(key) so that names are built the same way `__getitem__`
        # builds them (and so that non-string keys can be joined).
        base_name = self.prefix + [str(key)]

        if isinstance(value, (dict, list, tuple)):
            if key in self.state:
                raise ValueError(
                    "StreamingCSWriter does not support updating a key which "
                    "already exists with a dict/list/tuple"
                )
            # Save every flattened element under its own dotted name, then
            # record the container's structure with leaf placeholders.
            flattened, spec = torch.utils._pytree.tree_flatten(value)
            for scope, v in zip(recurse_spec(spec), flattened):
                name = ".".join(base_name + scope)
                saver = PyTorchH5Saver()
                saver.save_tensor(self.checkpoint_file, name, v)
            substate = torch.utils._pytree.tree_unflatten(
                [StreamingCSLeaf() for _ in flattened],
                spec,
            )
            self.state[key] = substate
        else:
            name = ".".join(base_name)
            saver = PyTorchH5Saver()
            saver.save_tensor(self.checkpoint_file, name, value)
            self.state[key] = StreamingCSLeaf()
class StreamingCSWriter(StreamingCSWriterView):
    r"""Writes a Cerebras H5 checkpoint in a streaming (incremental) manner
    rather than accumulating the full checkpoint into memory and then writing
    all weights at the end.

    `self.save()` must be called once writing is done: it flushes the required
    metadata (the state's spec). Without this call, the resulting checkpoint
    will not be able to be loaded with `cstorch.load(...)`.

    Keys that have already been written can be re-accessed and updated, with
    two restrictions:
        1. An existing key that stores a dict/list/tuple cannot be replaced.
        2. An existing key storing any type cannot be replaced by a
           dict/list/tuple

    Args:
        checkpoint_file: Path to new H5 checkpoint. A file cannot already exist
            at this location.

    Raises:
        FileExistsError: If something already exists at `checkpoint_file`.
    """

    def __init__(self, checkpoint_file) -> None:
        already_present = os.path.exists(checkpoint_file)
        if already_present:
            raise FileExistsError(
                f'Checkpoint file "{checkpoint_file}" cannot be created '
                f'because file already exists'
            )
        # The root view starts with an empty state and an empty prefix.
        super().__init__(checkpoint_file, {})

    def save(self):
        """Write the spec of the accumulated state into the checkpoint file."""
        from cerebras.pytorch.saver.pt_h5_saver import PyTorchH5Saver

        h5_saver = PyTorchH5Saver()
        # Only the structure (spec) is needed here; the flattened values are
        # discarded.
        _flat_values, state_spec = h5_saver.flatten_state_dict(self.state)
        h5_saver.save_spec(self.checkpoint_file, state_spec)

    def __str__(self):
        return f"{self.checkpoint_file}:\n{self.state}"

    def __repr__(self):
        return f"StreamingCSWriter: {str(self)}"
class OnDemandDictionaryConverter:
    r"""Wraps around an input dictionary in order to transform its values
    on-the-fly. The transformation has the following restrictions:
        1. It must maintain a 1-1 mapping (i.e. no new/dropped keys)
        2. The keys cannot change names (only values can change)
    There is error checking during object initialization and during runtime
    to ensure that this restriction holds.

    Args:
        underlying_dict: Underlying dictionary that needs to be transformed in
            an on-demand fashion
        converter_class: A subclass of BaseDictionaryConverter which describes
            the transformation of the underlying dictionary
        action_fn_args: Additional arguments that may be used in the
            converter's action functions.
    """

    def __init__(
        self, underlying_dict, converter_class, action_fn_args=None
    ) -> None:
        super().__init__()
        # The wrapped data is held as a ReadOnlyDict so it cannot be modified
        # through this object's reference.
        self.underlying_dict = ReadOnlyDict(underlying_dict)
        self.converter_instance = converter_class()
        self.action_fn_args = action_fn_args or {}
        self.verify_converter()

    def verify_converter(self):
        """Check that the nested converter can only describe a 1-1 mapping."""
        # Deferred to prevent circular import:
        from cerebras.modelzoo.tools.checkpoint_converters.base_converter import (
            BaseDictionaryConverter,
        )

        assert isinstance(self.converter_instance, BaseDictionaryConverter), (
            f"{self.__class__}'s nested converter must subclass "
            f"BaseDictionaryConverter"
        )

        disallowed_fns = [
            "pre_checkpoint_convert",
            "pre_model_convert",
            "post_model_convert",
            "post_checkpoint_convert",
        ]
        for fn_name in disallowed_fns:
            assert not hasattr(self.converter_instance, fn_name), (
                f"{self.__class__} only supports converters that are 1-1 "
                f"mappings. Therefore, the nested converter cannot contain the "
                f"{fn_name} function"
            )

        for rule in self.converter_instance.rules:
            if not all(isinstance(elm, str) for elm in rule.segments):
                raise ValueError(
                    f"{self.__class__} only supports converters that are 1-1 "
                    f"mappings. Therefore, their rules can only contain regex "
                    f"strings (no EquivalentSubkey or BaseDictionaryConverter "
                    f"objects). The following conversion rule offends this "
                    f"constraint:\n{rule}"
                )

    def __len__(self):
        return len(self.underlying_dict)

    def __iter__(self):
        return self.underlying_dict.__iter__()

    def __getitem__(self, key):
        if key not in self.underlying_dict:
            raise KeyError(key)

        # The converter writes its output into a scratch dict; exactly one
        # entry, under the same key, is expected back.
        new_temp_dict = {}
        from_index = 0
        match = self.converter_instance.convert_key(
            key,
            self.underlying_dict,
            new_temp_dict,
            from_index,
            action_fn_args=self.action_fn_args,
        )
        # The 1-1 check runs before the match check, so a conversion that
        # produced nothing is reported as a ValueError.
        if set(new_temp_dict) != {key}:
            raise ValueError(
                f"{self.__class__}'s nested converter did not create a 1-1 "
                f"mapping."
            )
        if not match:
            raise KeyError(key)
        return new_temp_dict[key]

    def items(self):
        for key in self.keys():
            yield key, self[key]

    def keys(self):
        return self.underlying_dict.keys()

    def values(self):
        for key in self.keys():
            yield self[key]
def _readonly(self, *args, **kwargs):
    """Stand-in for every mutating `dict` method of `ReadOnlyDict`; always
    raises `RuntimeError`."""
    raise RuntimeError("Cannot modify ReadOnlyDict")


class ReadOnlyDict(dict):
    """A Read-only dict. Note that this object doesn't guard against the
    values from being mutated in-place.

    Every mutating `dict` operation raises `RuntimeError`.
    """

    __setitem__ = _readonly
    __delitem__ = _readonly
    # In-place union (`d |= other`, Python 3.9+) would otherwise fall through
    # to `dict.__ior__` and modify the dictionary.
    __ior__ = _readonly
    pop = _readonly
    popitem = _readonly
    clear = _readonly
    update = _readonly
    setdefault = _readonly