Source code for cerebras.pytorch.utils.data.dataset

# Copyright 2016-2023 Cerebras Systems
# SPDX-License-Identifier: BSD-3-Clause

"""Dataset classes for use with PyTorch DataLoaders."""

from typing import (
    Callable,
    Dict,
    Iterator,
    List,
    NamedTuple,
    Optional,
    OrderedDict,
    Tuple,
    Union,
)

import torch
from torch.utils._pytree import SUPPORTED_NODES, tree_flatten, tree_unflatten
from torch.utils.data import IterDataPipe

LeafT = Union[torch.Tensor, Callable[[int], torch.Tensor]]
SampleSpecT = Union[
    LeafT,
    List["SampleSpecT"],
    Tuple["SampleSpecT", ...],
    Dict[str, "SampleSpecT"],
    OrderedDict[str, "SampleSpecT"],
    NamedTuple,
]
SampleT = Union[
    torch.Tensor,
    List["SampleT"],
    Tuple["SampleT", ...],
    Dict[str, "SampleT"],
    OrderedDict[str, "SampleT"],
    NamedTuple,
]
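
# Illustrative note (assumed, not part of the original source): a valid
# `SampleSpecT` is any pytree whose leaves are tensors or callables, e.g.
#
#     example_spec: SampleSpecT = {
#         "input_ids": torch.zeros(128, dtype=torch.int32),  # tensor leaf, cloned per sample
#         "labels": lambda index: torch.tensor(index),  # callable leaf, invoked with the sample index
#     }
#
# The corresponding `SampleT` has the same structure, with each callable leaf
# replaced by the tensor it returns.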


# pylint: disable=abstract-method
class SyntheticDataset(IterDataPipe):
    """A synthetic dataset that generates samples from a `SampleSpec`."""

    def __init__(
        self, sample_spec: SampleSpecT, num_samples: Optional[int] = None
    ):
        """Constructs a `SyntheticDataset` instance.

        A synthetic dataset can be used to generate samples on the fly with
        an expected dtype/shape but without needing to create a full-blown
        dataset. This is especially useful for compile validation.

        Args:
            sample_spec: Specification of the samples to generate. This can
                be a nested structure of one of the following types:

                - `torch.Tensor`: A tensor to be cloned.
                - `Callable`: A callable that takes the sample index and
                  returns a tensor.

                Supported data structures for holding the above leaf nodes
                are `list`, `tuple`, `dict`, `OrderedDict`, and `NamedTuple`.
            num_samples: Total size of the dataset. If None, the dataset
                will generate samples indefinitely.
        """
        super().__init__()

        self._leaf_nodes, self._spec_tree = tree_flatten(sample_spec)
        if not self._leaf_nodes:
            raise ValueError(
                f"`sample_spec` must be a non-empty python tree of "
                f"`torch.Tensor` or `Callable`."
            )
        for item in self._leaf_nodes:
            if not isinstance(item, (torch.Tensor, Callable)):
                raise ValueError(
                    f"`sample_spec` is expected to contain a python tree of "
                    f"`torch.Tensor`, or `Callable`, but got an item of type "
                    f"`{type(item)}`. Note that supported data structures for "
                    f"holding leaf nodes in the tree are "
                    f"{', '.join(str(x) for x in SUPPORTED_NODES)}."
                )

        if isinstance(num_samples, int):
            if num_samples <= 0:
                raise ValueError(
                    f"`num_samples` must be a positive integer, but got "
                    f"`{num_samples}`."
                )
            self._num_samples = num_samples
        elif num_samples is None:
            self._num_samples = None
        else:
            raise TypeError(
                f"`num_samples` must be a positive integer or None, but got a "
                f"value of type `{type(num_samples)}`."
            )

    def __iter__(self) -> Iterator[SampleT]:
        """Returns an iterator for generating samples."""
        index = 0
        while self._num_samples is None or index < self._num_samples:
            sample_flat = []
            for item in self._leaf_nodes:
                if isinstance(item, torch.Tensor):
                    sample_flat.append(item.clone())
                elif callable(item):
                    sample_flat.append(item(index))
                else:
                    raise TypeError(
                        f"Invalid type for leaf node: {type(item)}."
                    )
            yield tree_unflatten(sample_flat, self._spec_tree)
            index += 1

    def __len__(self) -> int:
        """Returns the number of samples in the dataset."""
        if self._num_samples is None:
            raise TypeError(
                f"`{self.__class__.__name__}` does not have a length because "
                f"`num_samples` was not provided."
            )
        return self._num_samples
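
A minimal usage sketch (not part of the original source), assuming `SyntheticDataset` is importable from this module and consumed through a standard PyTorch `DataLoader`:

    import torch
    from torch.utils.data import DataLoader

    from cerebras.pytorch.utils.data.dataset import SyntheticDataset

    # Dict spec with one tensor leaf (cloned for every sample) and one
    # callable leaf (invoked with the running sample index).
    spec = {
        "input_ids": torch.zeros(128, dtype=torch.int32),
        "labels": lambda index: torch.tensor(index),
    }
    dataset = SyntheticDataset(spec, num_samples=8)

    loader = DataLoader(dataset, batch_size=4)
    for batch in loader:
        # batch["input_ids"] has shape (4, 128); batch["labels"] has shape (4,)
        print(batch["input_ids"].shape, batch["labels"])

Because `num_samples` is given, `len(dataset)` returns 8; with `num_samples=None` the dataset would yield samples indefinitely and `len()` would raise a `TypeError`.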