Source code for cerebras.modelzoo.data.common.SyntheticDataProcessor

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for generating synthetic data based on some specification."""

import torch
from torch.utils._pytree import (
    _dict_flatten,
    _dict_unflatten,
    _register_pytree_node,
    tree_flatten,
    tree_unflatten,
)

from cerebras.modelzoo.data.common.tensor_spec import TensorSpec

try:
    import cerebras.pytorch as cstorch
except ImportError:
    cstorch = None


def custom_dict_flatten(d: dict):
    """Constructs TensorSpec instances to contain the leaf nodes of the tree
    structure before flattening.
    """
    if "shape" in d or "dtype" in d or "tensor_factory" in d:
        return [[TensorSpec(**d)], "TensorSpec"]
    return _dict_flatten(d)


def custom_dict_unflatten(values, context):
    """After mapping the TensorSpecs to tensors/callables, return them directly
    as leaf nodes instead of reconstructing a dictionary when unflattening.
    """
    if context == "TensorSpec":
        return values[0]
    return _dict_unflatten(values, context)
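

# A minimal sketch of how the two pytree helpers above behave, assuming
# TensorSpec accepts the same keyword fields that custom_dict_flatten passes
# through. The "input_ids" key below is an illustrative placeholder, not part
# of this module.
def _example_custom_dict_roundtrip():  # illustrative sketch only
    # A dict that directly specifies a tensor collapses into one TensorSpec leaf.
    leaf_spec = {"shape": [2, 3], "dtype": "float32"}
    leaves, context = custom_dict_flatten(leaf_spec)
    assert isinstance(leaves[0], TensorSpec) and context == "TensorSpec"

    # A plain container dict falls back to the default dict flattening.
    values, keys = custom_dict_flatten({"input_ids": leaf_spec})
    # values == [leaf_spec], keys == ["input_ids"]

    # Once a spec has been mapped to a tensor, unflattening returns the tensor
    # itself instead of rebuilding the spec dict.
    tensor = torch.zeros(2, 3)
    assert custom_dict_unflatten([tensor], "TensorSpec") is tensor

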
class SyntheticDataProcessor:
    """Creates a synthetic dataset.

    Constructs a SyntheticDataset from the user-provided nested structure of
    input tensors and returns a torch.utils.data.DataLoader from the
    SyntheticDataset and the regular torch.utils.data.DataLoader inputs
    specified in params.yaml. The torch.utils.data.DataLoader is returned by
    calling the create_dataloader() method.

    Args:
        params: Dictionary containing dataset inputs and specifications.
            Within this dictionary, the user provides the additional
            'synthetic_inputs' field that corresponds to a nested tree
            structure of input tensor specifications used to construct the
            SyntheticDataset.

            In params.yaml:
                data_processor: "SyntheticDataProcessor". Must be set to this
                    value to use this class.
                batch_size: int
                shuffle_seed: Optional[int] = None. If it is not None, then
                    torch.manual_seed(seed=shuffle_seed) will be called when
                    creating the dataloader.
                num_examples: Optional[int] = None. If it is not None, then it
                    specifies the number of examples/samples in the
                    SyntheticDataset. Otherwise, the SyntheticDataset will
                    generate samples indefinitely.
                .. regular torch.utils.data.DataLoader inputs ...
                synthetic_inputs:
                    ..
                        shape: Collection of positive ints
                        dtype: PyTorch dtype
                        OR
                        tensor_factory: name of PyTorch function
                        args:
                            size:
                            dtype:
                            ...
    """

    def __init__(self, params):
        if cstorch is None:
            raise RuntimeError(
                f"Unable to import cerebras.pytorch. In order to use "
                f"SyntheticDataProcessor, please ensure you have access to "
                f"the cerebras_pytorch package."
            )

        # Regular torch.utils.data.DataLoader inputs
        self._batch_size = params.get("batch_size", None)
        if not self._batch_size:
            raise ValueError(
                f"No 'batch_size' field specified. Please enter a positive "
                f"integer batch_size."
            )
        if not isinstance(self._batch_size, int) or self._batch_size <= 0:
            raise ValueError(
                f"Expected batch_size to be a positive integer but got "
                f"{self._batch_size}."
            )

        self._shuffle = params.get("shuffle", False)
        self._sampler = params.get("sampler", None)
        self._batch_sampler = params.get("batch_sampler", None)
        self._num_workers = params.get("num_workers", 0)
        self._pin_memory = params.get("pin_memory", False)
        self._drop_last = params.get("drop_last", False)
        self._timeout = params.get("timeout", 0)

        # SyntheticDataset specific inputs
        self._seed = params.get("shuffle_seed", None)
        self._num_examples = params.get("num_examples", None)
        if self._num_examples is not None:
            if (
                not isinstance(self._num_examples, int)
                or self._num_examples <= 0
            ):
                raise ValueError(
                    f"Expected num_examples to be a positive integer but got "
                    f"{self._num_examples}."
                )
            if self._drop_last and self._num_examples < self._batch_size:
                raise ValueError(
                    f"This dataset does not return any batches because the "
                    f"number of examples in the dataset ({self._num_examples}) "
                    f"is less than the batch size ({self._batch_size}) and "
                    f"`drop_last` is True."
                )

        self._tensors = []
        synthetic_inputs = params.get("synthetic_inputs", {})
        if synthetic_inputs:
            _register_pytree_node(
                dict, custom_dict_flatten, custom_dict_unflatten
            )
            leaf_nodes, self._spec_tree = tree_flatten(synthetic_inputs)
            for tensor_spec in leaf_nodes:
                if not isinstance(tensor_spec, TensorSpec):
                    raise TypeError(
                        f"Expected all leaf nodes in 'synthetic_inputs' to be "
                        f"of type TensorSpec but got {type(tensor_spec)}. "
                        f"Please ensure that all leaf nodes under "
                        f"'synthetic_inputs' are instances of TensorSpec. "
                        f"These instances are created by specifying either "
                        f"'shape' and 'dtype' keys or a 'tensor_factory' "
                        f"key in a dict (mutually exclusive)."
                    )
                self._tensors.append(self._process_tensor(tensor_spec.specs))
            self._tensor_specs = tree_unflatten(self._tensors, self._spec_tree)
            _register_pytree_node(dict, _dict_flatten, _dict_unflatten)
        else:
            raise ValueError(
                f"Expected 'synthetic_inputs' field but found none. Please "
                f"specify this field and provide tensor information according "
                f"to the documentation."
            )

    def _torch_dtype_from_str(self, dtype):
        """Takes in the user input string for dtype and returns the
        corresponding torch.dtype.
        """
        torch_dtype = getattr(torch, dtype, None)
        if not isinstance(torch_dtype, torch.dtype):
            raise ValueError(
                f"Invalid torch dtype '{dtype}'. Please ensure all tensors use "
                f"a valid torch dtype."
            )
        return torch_dtype

    def _process_tensor(self, tensor_spec):
        """Parses the tensor_spec and returns a corresponding synthetic tensor."""
        if not tensor_spec:
            raise ValueError(
                f"Empty TensorSpec found. Please provide at least a 'shape' "
                f"and 'dtype' field to complete the tensor specification."
            )
        shape = tensor_spec.get("shape", None)
        dtype = tensor_spec.get("dtype", None)
        tensor_factory = tensor_spec.get("tensor_factory", None)

        # Enforce mutually exclusive inputs
        mutex = shape and dtype and not tensor_factory
        mutex = mutex or (not shape and not dtype and tensor_factory)
        if not mutex:
            possible_inputs = ['shape', 'dtype', 'tensor_factory']
            found = [
                i
                for i, j in locals().items()
                if i in possible_inputs and j is not None
            ]
            raise ValueError(
                f"Expected either 'shape' and 'dtype' fields or a "
                f"'tensor_factory' field (mutually exclusive) but instead "
                f"found the following fields: {found}. Please ensure each "
                f"tensor has either 'shape' and 'dtype' fields OR a "
                f"'tensor_factory' field."
            )

        if shape and dtype:
            if not all(isinstance(e, int) and e > 0 for e in shape):
                raise ValueError(
                    f"Expected shape to be a collection of positive integers "
                    f"but got {shape}. Please ensure all tensor shapes are "
                    f"collections of positive integers."
                )
            torch_dtype = self._torch_dtype_from_str(dtype)
            return torch.zeros(shape, dtype=torch_dtype)
        elif tensor_factory:
            torch_args = tensor_spec.get("args", None)
            if not torch_args:
                raise ValueError(
                    f"Expected 'args' field but found none for the "
                    f"tensor_factory '{tensor_factory}'. Please specify this "
                    f"field and fill it with the arguments for the chosen "
                    f"tensor generation function."
                )
            if not torch_args.get("dtype", None):
                raise ValueError(
                    f"Expected 'dtype' argument for tensor_factory "
                    f"'{tensor_factory}' in the 'args' field, but found none. "
                    f"Please specify this argument with the desired tensor "
                    f"dtype."
                )
            torch_dtype = self._torch_dtype_from_str(torch_args["dtype"])
            torch_args["dtype"] = torch_dtype

            # getattr raises a torch AttributeError if the provided function
            # is invalid
            try:
                test_tensor = getattr(torch, tensor_factory)(**torch_args)
            except Exception as e:
                raise ValueError(
                    f"Provided tensor_factory '{tensor_factory}' is invalid. "
                    f"Please ensure you are using a supported PyTorch callable "
                    f"that returns a torch tensor."
                ) from e
            if not isinstance(test_tensor, torch.Tensor):
                raise ValueError(
                    f"Expected tensor_factory {tensor_factory} to return a "
                    f"torch.Tensor but instead got {type(test_tensor)}. Please "
                    f"ensure that tensor_factory contains a valid PyTorch "
                    f"callable that returns a torch tensor."
                )
            return lambda x: getattr(torch, tensor_factory)(**torch_args)

    def create_dataloader(self):
        """Returns a torch.utils.data.DataLoader that corresponds to the
        created SyntheticDataset.
        """
        if self._shuffle and self._seed is not None:
            torch.manual_seed(self._seed)
        return torch.utils.data.DataLoader(
            cstorch.utils.data.SyntheticDataset(
                self._tensor_specs, num_samples=self._num_examples
            ),
            batch_size=self._batch_size,
            shuffle=self._shuffle,
            sampler=self._sampler,
            batch_sampler=self._batch_sampler,
            num_workers=self._num_workers,
            pin_memory=self._pin_memory,
            drop_last=self._drop_last,
            timeout=self._timeout,
        )
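

# A minimal usage sketch, assuming cerebras.pytorch is importable and that
# `params` mirrors the params.yaml layout described in the class docstring.
# The feature names ("input_ids", "labels") and sizes below are illustrative
# placeholders, not part of this module.
def _example_create_dataloader():  # illustrative sketch only
    params = {
        "batch_size": 4,
        "num_examples": 16,
        "synthetic_inputs": {
            # 'shape'/'dtype' path: yields a constant zero tensor of this spec.
            "input_ids": {"shape": [128], "dtype": "int32"},
            # 'tensor_factory' path: yields a callable built from a torch function.
            "labels": {
                "tensor_factory": "ones",
                "args": {"size": [128], "dtype": "int32"},
            },
        },
    }
    dataloader = SyntheticDataProcessor(params).create_dataloader()
    # Each batch mirrors the nesting of 'synthetic_inputs', e.g. a dict with
    # "input_ids" and "labels" entries of shape (4, 128).
    return next(iter(dataloader))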