Source code for cerebras.modelzoo.data.vision.utils

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import filecmp
import math
import os
import random
import shutil

import torch
import torch.distributed as dist
from tqdm import tqdm

import cerebras.pytorch as cstorch
import cerebras.pytorch.distributed as dist


[docs]def is_gpu_distributed():
    """
    Returns True if DDP is enabled
    """
    return (
        torch.distributed.is_available() and torch.distributed.is_initialized()
    )


[docs]def task_id():
    if dist.is_streamer():
        return dist.get_streaming_rank()
    elif is_gpu_distributed():
        return dist.get_rank()
    else:
        return 0


[docs]def num_tasks():
    if dist.is_streamer():
        return dist.num_streamers()
    elif is_gpu_distributed():
        return dist.get_world_size()
    else:
        return 1


[docs]class ShardedSampler(torch.utils.data.Sampler):
    """
    Modified from:
    https://pytorch.org/docs/stable/_modules/torch/utils/data/distributed.html#DistributedSampler
    Sampler that restricts data loading to a subset of the dataset.

    Dataset is assumed to be of constant size.

    Args:
        dataset (torch.utils.data.Dataset): Dataset used for sampling.
        mode (modes): Instance of `modes` to indicate train or eval mode.
        shuffle (bool, optional): If `True` (default), sampler will shuffle
            the indices.
        seed (int, optional): Random seed used to shuffle the sampler if
            :attr:`shuffle=True`. This number should be identical across all
            processes in the distributed group. Default: `0`.
        drop_last (bool, optional): If `True`, then the sampler will drop the
            tail of the data to make it evenly divisible across the number of
            replicas. If `False`, the sampler will add extra indices to make
            the data evenly divisible across the replicas. Default: `False`.
    """

    def __init__(self, dataset, shuffle=True, seed=None, drop_last=False):
        self.num_tasks = num_tasks()
        self.task_id = task_id()
        self.dataset = dataset
        self.dataset_len = len(self.dataset)
        self.drop_last = drop_last

        if cstorch.use_cs() and not self.drop_last:
            raise ValueError(
                "On CS2 we do not support unequal batch sizes so `drop_last` "
                "must be set to `True`."
            )
        # If the dataset length is evenly divisible by # of replicas, then there
        # is no need to drop any data, since the dataset will be split equally.
        if self.drop_last and len(self.dataset) % self.num_tasks:
            # Split to nearest available length that is evenly divisible.
            # This is to ensure each task receives the same amount of data when
            # using this sampler.
            self.num_samples = len(self.dataset) // self.num_tasks
        else:
            self.num_samples = math.ceil(len(self.dataset) / self.num_tasks)
        self.total_size = self.num_samples * self.num_tasks
        self.shuffle = shuffle
        self.seed = seed
        self.indices = list(range(self.dataset_len))
        if not self.drop_last:
            # add extra samples to make it evenly divisible across tasks
            padding_indices_size = self.total_size - self.dataset_len
            # choose padding indices at random to reduce the chance of
            # reusing samples.
            random.seed(self.seed)
            padding_indices = random.sample(self.indices, padding_indices_size)
            self.indices += padding_indices
        else:
            # remove tail of data to make it evenly divisible.
            self.indices = self.indices[: self.total_size]
        assert len(self.indices) == self.total_size, (
            f"Total `indices` after dropping/padding indices must be equal "
            f"to `total_size` of the dataset. Received total indices: "
            f"`{len(self.indices)}` and total size is: `{self.total_size}`."
        )

    def __iter__(self):
        if self.shuffle:
            random.seed(self.seed)
            random.shuffle(self.indices)

        # subsample
        indices = self.indices[self.task_id : self.total_size : self.num_tasks]
        assert len(indices) == self.num_samples, (
            f"Total `indices` for tasks must be equal to `num_samples` in a "
            f"task. Received total indices: `{len(indices)}` and samples in "
            f"task are: `{self.num_samples}`."
        )

        yield from indices

    def __len__(self):
        return self.num_samples


##### Experimental to reduce first batch loading times for MAP style only #####
class _RepeatSampler(object):
    """Sampler that repeats forever.

    Args:
        sampler (Sampler)
    """

    def __init__(self, sampler):
        self.sampler = sampler

    def __iter__(self):
        while True:
            yield from iter(self.sampler)


[docs]class FastDataLoader(torch.utils.data.DataLoader):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        object.__setattr__(
            self, 'batch_sampler', _RepeatSampler(self.batch_sampler)
        )
        self.iterator = super().__iter__()

    def __len__(self):
        return len(self.batch_sampler.sampler)

    def __iter__(self):
        for i in range(len(self)):
            yield next(self.iterator)


def _get_worker_cache_dir(src_dir):
    """Gets the path to worker cache dir corresponding to the src_dir"""
    src_dir = os.path.abspath(src_dir)
    cache_dir = os.path.normpath("/".join([dist.WORKER_CACHE_ROOT, src_dir]))
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir


def _same_dirs_shallow(src_dir, dest_dir):
    """Takes a directory comparison obj and does a shallow comparison
    between the dirs src_dir and dest_dir
    The shallow comparison does a recursive check of the following:
        1. Check if the dirs exist, if they don't then return False
        2. Check if the files have a diff, or if there
        are additional files for either of the two dirs, if different,
        return False.
        3. Repeat 1 and 2 on subdirs
    """

    def _same_dirs_shallow_helper(dcmp: filecmp.dircmp):
        if not os.path.exists(dcmp.left) or not os.path.exists(dcmp.right):
            return False
        if dcmp.left_only:
            # If diff consists of only broken
            # symlinks, then its a match
            parent = dcmp.left
            for left_file in dcmp.left_only:
                if os.path.isdir(
                    os.path.join(parent, left_file)
                ) or os.path.isfile(os.path.join(parent, left_file)):
                    return False
        if dcmp.diff_files or dcmp.right_only:
            return False
        for sub_dcmp in dcmp.subdirs.values():
            if not _same_dirs_shallow_helper(sub_dcmp):
                return False
        return True

    return _same_dirs_shallow_helper(filecmp.dircmp(src_dir, dest_dir))


[docs]def create_worker_cache(src_dir: str, force_overwrite: bool = False):
    """Checks for the dir in the worker_cache (SSD) on the worker node corresponding to the src_dir.
    If the directory exists and is same as the src_dir, it returns the dir path on worker_cache.
    Otherwise writes the directory to the worker_cache and returns the dir path.
    Writing to the cache can take a while, depending on the size of the src_dir:
    Displays a progress bar (in the worker logs) which shows progress of the cache
    Forces cache overwrite irrespective of a cache hit, when force_overwrite is True.
    """
    from filelock import FileLock

    if (
        os.path.commonprefix([src_dir, dist.WORKER_CACHE_ROOT])
        == dist.WORKER_CACHE_ROOT
    ):
        raise RuntimeError(
            f"Ensure that the src_dir path does not have "
            f"a worker_cache path prefix: {dist.WORKER_CACHE_ROOT}"
        )
    if not dist.is_streamer():
        raise RuntimeError(
            "Ensure that create_worker_cache is called only for a worker node."
        )
    dest_dir = _get_worker_cache_dir(src_dir)
    # Provide read/write permissions for the lock for all users
    with FileLock(f"{dest_dir}.lock", mode=0o666):
        if _same_dirs_shallow(src_dir, dest_dir) and not force_overwrite:
            print(f"WORKER CACHE HIT: Skipping overwrite")
        else:
            (
                is_limit_hit,
                dir_size,
                available_space_for_copy,
            ) = dist.hit_worker_cache_limit(src_dir, dest_dir)
            if is_limit_hit:
                raise RuntimeError(
                    f"Failed when copying the directory to the worker cache: {src_dir},"
                    f" directory size: {dir_size} exceeds the available space on worker cache: {available_space_for_copy}."
                    f"Please contact your system administrator to clear the worker cache."
                )
            if os.path.exists(dest_dir):
                shutil.rmtree(dest_dir)
            # copy dirs to destination
            # get the total number of files to copy
            total_files = sum(
                [len(files) for root, dirs, files in os.walk(src_dir)]
            )

            # copy directory with progress bar
            def copy2_with_progress(src_path, dst_path, update):
                # skip if its a broken symlink
                if os.path.isfile(src_path):
                    shutil.copy2(src_path, dst_path)
                    update(1)

            with tqdm(
                total=total_files,
                desc="Overwriting cache",
                unit="files",
                dynamic_ncols=True,
            ) as pbar:
                shutil.copytree(
                    src_dir,
                    dest_dir,
                    symlinks=False,
                    ignore=None,
                    ignore_dangling_symlinks=True,
                    copy_function=lambda f, d: copy2_with_progress(
                        f, d, pbar.update
                    ),
                )
    return dest_dir