# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
class NoiseGenerator(torch.nn.Module):
    """Generates per-sample random timesteps, diffusion noise, and VAE
    reparametrization noise for a batch of images.

    Args:
        width: Width of the generated noise tensors.
        height: Height of the generated noise tensors.
        channels: Number of channels of the generated noise tensors.
        num_diffusion_steps: Exclusive upper bound for sampled timesteps.
        seed: Accepted for backward compatibility but currently unused —
            no RNG seeding is performed here.
    """

    def __init__(self, width, height, channels, num_diffusion_steps, seed=None):
        super(NoiseGenerator, self).__init__()
        self.height = height
        self.width = width
        self.channels = channels
        self.num_diffusion_steps = num_diffusion_steps
        # NOTE(review): `seed` is intentionally ignored (kept only so the
        # constructor signature stays compatible with existing callers).

    def forward(self, input, label):
        """
        Args:
            :param input : Float tensor of size (B, C, H, W).
            :param label : Int tensor of size (B, ).
        Returns:
            A dict corresponding to the noisy images, ground truth noises and
            the timesteps corresponding to the scheduled noise variance with
            the following keys and shapes.
            "input": Tensor of shape (batch_size, C, H, W). This tensor is simply passed through.
            "label": Tensor of shape (batch_size, ) representing labels. This tensor is simply passed through.
            "diffusion_noise": Tensor of shape (batch_size, channels, height, width)
                represents diffusion noise to be applied
            "timestep": Tensor of shape (batch_size, ) that indicates the timesteps for each diffusion sample
            "vae_noise": Tensor of shape (batch_size, channels, height, width)
                represents the noise sample to be used with reparametrization of VAE
        """
        if input.ndim != 4:
            raise ValueError(f"Samples ndim should be 4. Got {input.ndim}")
        batch_size = input.shape[0]
        # One random timestep per sample, uniform over [0, num_diffusion_steps).
        # Uses the label's dtype so timestep and label stay type-consistent.
        timestep = torch.randint(
            self.num_diffusion_steps, size=(batch_size,), dtype=label.dtype
        )
        noise_shape = (batch_size, self.channels, self.height, self.width)
        diffusion_noise = torch.randn(noise_shape, dtype=input.dtype).to(
            input.device
        )
        vae_noise_shape = (batch_size, self.channels, self.height, self.width)
        # Moved to input.device for consistency with diffusion_noise above
        # (the original left this on the default device, which would mismatch
        # on GPU inputs).
        vae_noise_sample = torch.randn(vae_noise_shape, dtype=input.dtype).to(
            input.device
        )
        return {
            "input": input,
            "label": label,
            "diffusion_noise": diffusion_noise,
            "timestep": timestep,
            "vae_noise": vae_noise_sample,
        }

    def __repr__(self):
        # Fixed: the original referenced `self.schedule_name`, which is never
        # set anywhere in this class, so repr() always raised AttributeError.
        return (
            f"{self.__class__.__name__}("
            f"width={self.width}"
            f", height={self.height}"
            f", channels={self.channels}"
            f", num_diffusion_steps={self.num_diffusion_steps}"
            f")"
        )
class LabelDropout(torch.nn.Module):
    """Randomly replaces labels with the "null" class id (``num_classes``)
    to enable classifier-free guidance style training.

    Args:
        dropout_prob: Probability of dropping each label; must be > 0.
        num_classes: Number of real classes; dropped labels are replaced
            with this value (i.e. class id ``num_classes`` acts as the
            unconditional/null label).

    Raises:
        ValueError: If ``dropout_prob`` is not strictly positive.
    """

    def __init__(self, dropout_prob, num_classes):
        super(LabelDropout, self).__init__()
        # Raise instead of assert: asserts are stripped under `python -O`,
        # so they must not be used for input validation.
        if dropout_prob <= 0:
            raise ValueError(
                f"dropout_prob must be > 0, got {dropout_prob}"
            )
        self.dropout_prob = dropout_prob
        self.num_classes = num_classes

    def token_drop(self, label):
        """Return a bool mask of shape (B,) — True where the label drops."""
        # torch.rand samples from [0, 1), so dropout_prob >= 1.0 drops all.
        drop_ids = torch.rand(label.shape[0]) < self.dropout_prob
        return drop_ids

    def forward(self, image, label):
        """Pass ``image`` through unchanged; replace dropped entries of
        ``label`` with ``num_classes``."""
        drop_ids = self.token_drop(label)
        label = torch.where(
            drop_ids, torch.tensor(self.num_classes, dtype=label.dtype), label
        )
        return image, label