cerebras.modelzoo.data.nlp.transformer.config.TransformerDynamicDataProcessorConfig#
- class cerebras.modelzoo.data.nlp.transformer.config.TransformerDynamicDataProcessorConfig(batch_size: int = <object object>, shuffle: bool = True, shuffle_seed: int = 0, num_workers: int = 0, prefetch_factor: int = 10, persistent_workers: bool = True, src_data_dir: str = <object object>, src_vocab_file: str = <object object>, src_max_sequence_length: int = <object object>, tgt_max_sequence_length: int = <object object>, shuffle_buffer: Optional[int] = None, do_lower: bool = False, buckets: Optional[List[int]] = None, dynamic_loss_weight: Optional[bool] = None, pack_sequences: Optional[bool] = False, num_documents_to_concatenate: int = 128, drop_last: bool = True, oov_token: str = '<unk>', sos_token: str = '<s>', eos_token: str = '</s>', pad_token: str = '<pad>', extra_ids: Union[int, List[int]] = 0, labels_pad_id: int = 0, input_pad_id: int = 0)[source]#
- batch_size: int = <object object>#
Batch size to be used
- buckets: Optional[List[int]] = None#
- do_lower: bool = False#
- drop_last: bool = True#
- dynamic_loss_weight: Optional[bool] = None#
- eos_token: str = '</s>'#
- extra_ids: Union[int, List[int]] = 0#
- input_pad_id: int = 0#
- labels_pad_id: int = 0#
- num_documents_to_concatenate: int = 128#
- num_workers: int = 0#
The number of PyTorch processes used in the dataloader
- oov_token: str = '<unk>'#
- pack_sequences: Optional[bool] = False#
- pad_token: str = '<pad>'#
- persistent_workers: bool = True#
Whether or not to keep workers persistent between epochs
- prefetch_factor: int = 10#
The number of batches to prefetch in the dataloader
- shuffle: bool = True#
Whether or not to shuffle the dataset
- shuffle_buffer: Optional[int] = None#
- shuffle_seed: int = 0#
Seed used for deterministic shuffling
- sos_token: str = '<s>'#
- src_data_dir: str = <object object>#
- src_max_sequence_length: int = <object object>#
- src_vocab_file: str = <object object>#
- tgt_max_sequence_length: int = <object object>#
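The sketch below shows one way to construct this config in Python, assuming the import path given in the page title. Only the fields shown without defaults above (batch_size, src_data_dir, src_vocab_file, src_max_sequence_length, tgt_max_sequence_length) must be supplied; the file paths and numeric values here are placeholders, and how the resulting config is passed to a data processor or trainer depends on your Model Zoo setup.

```python
# Minimal sketch: instantiating TransformerDynamicDataProcessorConfig.
# Paths and sizes below are placeholders, not values from the documentation.
from cerebras.modelzoo.data.nlp.transformer.config import (
    TransformerDynamicDataProcessorConfig,
)

config = TransformerDynamicDataProcessorConfig(
    batch_size=256,
    src_data_dir="/path/to/tokenized/train",  # placeholder path
    src_vocab_file="/path/to/vocab.txt",      # placeholder path
    src_max_sequence_length=512,
    tgt_max_sequence_length=512,
    # Optional fields fall back to the defaults documented above.
    shuffle=True,
    shuffle_seed=0,
    num_workers=8,
    pack_sequences=False,
)
```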