Source code for cerebras.modelzoo.data_preparation.data_preprocessing.tokenflow.tokenizer

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import transformers


class GenericTokenizer:
    """Thin wrapper that instantiates the appropriate tokenizer from the
    preprocessing params and exposes a common decode interface."""

    def __init__(self, processing_params, filepath):
        self.processing_params = processing_params
        self.filepath = filepath
        self.custom_tokenizer = processing_params.pop("custom_tokenizer", None)

        if self.processing_params.get('huggingface_tokenizer'):
            self.tokenizer = self.initialize_huggingfacetokenizer()
        elif self.custom_tokenizer == "gpt2tokenizer":
            self.tokenizer = self.initialize_gpt2tokenizer()
        elif self.custom_tokenizer == "neoxtokenizer":
            self.tokenizer = self.initialize_neoxtokenizer()
        elif self.custom_tokenizer:
            self.tokenizer = self.initialize_customtokenizer()
        else:
            # Old tokenizer_type key
            self.tokenizer_type = self.processing_params.pop('tokenizer_type')
            self.tokenizer = self._initialize_customtokenizer_old()

    def initialize_gpt2tokenizer(self):
        # Note: in this config format, `vocab_file` points to the BPE merges
        # file and `encoder_file` points to the JSON vocabulary.
        if self.processing_params.get('vocab_file', False):
            merges_file = os.path.normpath(
                os.path.join(
                    os.path.dirname(self.filepath),
                    self.processing_params.pop('vocab_file'),
                )
            )
            vocab_file = os.path.normpath(
                os.path.join(
                    os.path.dirname(self.filepath),
                    self.processing_params.pop('encoder_file'),
                )
            )
        else:
            merges_path = self.processing_params['tokenizer_params'].pop(
                'vocab_file'
            )
            vocab_path = self.processing_params['tokenizer_params'].pop(
                'encoder_file'
            )
            merges_file = os.path.normpath(
                os.path.join(os.path.dirname(self.filepath), merges_path)
            )
            vocab_file = os.path.normpath(
                os.path.join(os.path.dirname(self.filepath), vocab_path)
            )

        return transformers.GPT2TokenizerFast(
            vocab_file=vocab_file,
            merges_file=merges_file,
            name_or_path="gpt2-tokenizer",
            **self.processing_params.get('tokenizer_params', {}),
        )

    def initialize_neoxtokenizer(self):
        from tokenizers import Tokenizer

        tokenizer_params = self.processing_params.get('tokenizer_params', {})

        return transformers.PreTrainedTokenizerFast(
            tokenizer_object=Tokenizer.from_file(
                tokenizer_params.pop("encoder_file", None)
            ),
            name_or_path="neox-tokenizer",
            **tokenizer_params,
        )

    def initialize_huggingfacetokenizer(self):
        return transformers.AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=self.processing_params[
                'huggingface_tokenizer'
            ],
            **self.processing_params.get('tokenizer_params', {}),
        )

    def initialize_customtokenizer(self):
        import importlib

        # `custom_tokenizer` is expected in the form "<module path>:<class name>".
        module, class_name = self.custom_tokenizer.rsplit(':', 1)
        module = importlib.import_module(module)
        TokenizerClass = getattr(module, class_name)

        return TokenizerClass(
            **self.processing_params.get('tokenizer_params', {})
        )

    def _initialize_customtokenizer_old(self):
        merges_file = os.path.normpath(
            os.path.join(
                os.path.dirname(self.filepath),
                self.processing_params['vocab_file'],
            )
        )
        vocab_file = os.path.normpath(
            os.path.join(
                os.path.dirname(self.filepath),
                self.processing_params['encoder_file'],
            )
        )
        tokenizer_class = getattr(transformers, self.tokenizer_type)

        return tokenizer_class(vocab_file=vocab_file, merges_file=merges_file)

    def decode(self, ids):
        return self.tokenizer.decode(ids)

    def convert_ids_to_tokens(self, ids):
        return self.tokenizer.convert_ids_to_tokens(ids)
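

# --- Usage sketch (illustrative, not part of the module) --------------------
# The exact params schema comes from the TokenFlow preprocessing config; the
# keys and values below are hypothetical and assume a Hugging Face tokenizer
# resolved via the `huggingface_tokenizer` key. `filepath` is only used to
# resolve relative vocab/encoder paths, so any params-file path works here.
#
#     processing_params = {
#         "huggingface_tokenizer": "gpt2",
#         "tokenizer_params": {},
#     }
#     tokenizer = GenericTokenizer(processing_params, filepath="data_params.json")
#     ids = [15496, 995]
#     print(tokenizer.decode(ids))                 # decoded text for the ids
#     print(tokenizer.convert_ids_to_tokens(ids))  # token strings for the ids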