Source code for cerebras.modelzoo.common.registry

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
This is the registry for the Cerebras modelzoo.
'''

import importlib
import os
import pathlib
from pathlib import Path

import cerebras.modelzoo as modelzoo


class Registry:
    """Class-level registry of modelzoo components.

    All state is stored in class attributes and every method is a
    classmethod, so the registry is shared by every user of the class.

    ``mapping`` holds one dictionary per region ("model", "loss",
    "dataset", ...), ``submodel_mapping`` maps a submodel name to the name
    of its parent model, and ``_modules_imported`` records whether the
    registered search paths have already been imported.
    """

    mapping = {
        "model": {},
        "datasetprocessor": {},
        "lr_scheduler": {},
        "loss": {},
        "dataset": {},
        "paths": {},
        "config": {},
        "submodel_config": {},
        "model_to_config": {},
        "data_config": {},
    }
    submodel_mapping = {
        "gpt2lmhead": "gpt2",
        "t5forconditionalgeneration": "t5",
        "vit": "vision_transformer",
    }
    _modules_imported = False

    # ------------------------------------------------------------------
    # Lazy import of the modules that perform the registrations
    # ------------------------------------------------------------------

    @classmethod
    def _import_modules_for_registry(
        cls, directory_path: str, import_files_regex: str
    ):
        """Import every file under ``directory_path`` matching a pattern.

        Args:
            directory_path: Directory that is searched recursively.
            import_files_regex: Pattern handed to ``Path.rglob``. Despite
                the parameter name it is a glob pattern, not a regular
                expression.

        Raises:
            Exception: If importing any matched module fails. The original
                error is chained as ``__cause__``.
        """
        modelzoo_path = os.path.dirname(os.path.realpath(modelzoo.__file__))
        for file in Path(directory_path).rglob(import_files_regex):
            relative = os.path.relpath(file, modelzoo_path)
            # Turn "a/b/model.py" into "a.b.model": separators become dots
            # and the trailing ".py" is dropped.
            module_path = "cerebras.modelzoo.{}".format(
                relative.replace(os.path.sep, '.')[:-3]
            )
            # Import the module dynamically
            try:
                importlib.import_module(module_path, package=__name__)
            except Exception as ex:
                raise Exception(
                    "Registry Import Failure: {}".format(ex)
                ) from ex

    @classmethod
    def _import_modules(cls):
        """Import all modules found under the registered paths, once.

        Raises:
            KeyError: If one of the required path kinds ("model_path",
                "loss_path", "datasetprocessor_path") has not been
                registered. The imported flag is not set in that case, so
                a later call tries again.
        """
        if cls._modules_imported:
            return
        # (path kind, glob pattern) pairs, processed in this order.
        import_plan = (
            ("model_path", "**/model.py"),
            ("loss_path", "**/*.py"),
            ("datasetprocessor_path", "**/*Processor*.py"),
            ("datasetprocessor_path", "**/config.py"),
            ("model_path", "**/config.py"),
        )
        registered_paths = cls.mapping["paths"]
        for kind, pattern in import_plan:
            for path in registered_paths[kind]:
                cls._import_modules_for_registry(
                    path, import_files_regex=pattern
                )
        cls._modules_imported = True

    # ------------------------------------------------------------------
    # Registration decorators
    # ------------------------------------------------------------------

    @classmethod
    def _register_unique(cls, region, name):
        """Build a decorator that stores an object under ``name``.

        Args:
            region: Key of ``mapping`` to register into.
            name: Name to register the decorated object under.

        Returns:
            A decorator that registers its argument and returns it
            unchanged. The decorator raises ``KeyError`` when ``name`` is
            already present in the region.
        """

        def wrap(obj):
            table = cls.mapping[region]
            if name in table:
                raise KeyError(
                    "Name '{}' already registered for {}.".format(
                        name, table[name]
                    )
                )
            table[name] = obj
            return obj

        return wrap

    @classmethod
    def register_model(cls, model_name, datasetprocessor=None, dataset=None):
        """
        This method is added to register models

        Args:
            model_name: A single name or a list of names for the model.
            datasetprocessor: Names of the dataset processors associated
                with the model. Defaults to an empty list.
            dataset: Names of the datasets associated with the model.
                Defaults to an empty list.

        Returns:
            A decorator that registers the decorated class under every
            name and returns it unchanged. The decorator raises
            ``KeyError`` when a name is already registered.
        """
        # Create fresh lists per call: a shared mutable default would be
        # stored in the mapping and aliased between unrelated models.
        if datasetprocessor is None:
            datasetprocessor = []
        if dataset is None:
            dataset = []

        def wrap(model_cls):
            names = model_name if isinstance(model_name, list) else [model_name]
            for name in names:
                if name in cls.mapping["model"]:
                    raise KeyError(
                        "Name '{}' already registered for {}.".format(
                            name, cls.mapping["model"][name]
                        )
                    )
                cls.mapping["model"][name] = {
                    "class": model_cls,
                    "run": cls.register_run_path(name),
                    "datasetprocessor": datasetprocessor,
                    "dataset": dataset,
                }
            return model_cls

        return wrap

    @classmethod
    def register_datasetprocessor(cls, name):
        """
        This method is added to register datasetprocessor
        """
        return cls._register_unique("datasetprocessor", name)

    @classmethod
    def register_loss(cls, name):
        """
        This method is added to register loss
        """
        return cls._register_unique("loss", name)

    @classmethod
    def register_lr_scheduler(cls, name):
        """
        This method is added to register lr_scheduler
        """
        return cls._register_unique("lr_scheduler", name)

    @classmethod
    def register_dataset(cls, name):
        """
        This method is added to register dataset
        """
        return cls._register_unique("dataset", name)

    @classmethod
    def register_paths(cls, kind, path):
        """
        Register a search path of the given kind.

        Paths of the same kind accumulate in registration order.
        """
        cls.mapping["paths"].setdefault(kind, []).append(path)

    @classmethod
    def register_submodel_config(cls, name):
        """
        This method is added to register config classes for submodels

        Returns:
            A decorator that registers the decorated config class under
            the submodel name and under the parent model's directory path,
            then returns it unchanged. The decorator raises ``KeyError``
            when ``name`` has no entry in ``submodel_mapping``.
        """

        def wrap(model_cls):
            # Validate and resolve everything before touching the mapping
            # so a failed registration leaves no partial entry behind.
            if name not in cls.submodel_mapping:
                raise KeyError(
                    "Submodel '{}' not mapped to any parent model.".format(name)
                )
            # get_path returns None when no registered directory contains
            # the parent model; the class is then stored under None.
            path = cls.get_path("model_path", cls.submodel_mapping[name])

            cls.mapping["submodel_config"][name] = model_cls
            # Register the run path as well.
            cls.mapping["submodel_config"][path] = model_cls
            if name in cls.mapping["model"]:
                model = cls.mapping["model"][name]["class"]
                cls.mapping["model_to_config"][model] = model_cls
            return model_cls

        return wrap

    @classmethod
    def register_config(cls, name):
        """
        This method is added to register config classes

        Returns:
            A decorator that registers the decorated config class under
            ``name`` and, when the model directory exists, under that
            directory's path relative to the modelzoo package. It returns
            the class unchanged.
        """

        def wrap(model_cls):
            cls.mapping["config"][name] = model_cls
            # Register the run path as well.
            path = cls.get_path("model_path", name)
            if path and os.path.exists(path):
                modelzoo_root = os.path.dirname(
                    os.path.realpath(modelzoo.__file__)
                )
                relative = Path(os.path.realpath(path)).relative_to(
                    modelzoo_root
                )
                cls.mapping["config"][relative] = model_cls
            if name in cls.mapping["model"]:
                model = cls.mapping["model"][name]["class"]
                cls.mapping["model_to_config"][model] = model_cls
            return model_cls

        return wrap

    @classmethod
    def register_data_config(cls, name):
        """
        This method is added to register data config classes
        """

        def wrap(model_cls):
            cls.mapping["data_config"][name] = model_cls
            return model_cls

        return wrap

    # ------------------------------------------------------------------
    # Path lookup
    # ------------------------------------------------------------------

    @classmethod
    def get_path(cls, kind, name):
        """Return the first registered ``kind`` path containing ``name``.

        Returns:
            ``os.path.join(path, name)`` for the first registered path in
            which that is an existing directory, or ``None`` if there is
            no such path.

        Raises:
            ValueError: If no path of this kind has been registered.
        """
        if kind not in cls.mapping["paths"]:
            raise ValueError("{} not initialised in registry".format(kind))
        for path in cls.mapping["paths"][kind]:
            candidate = os.path.join(path, name)
            if os.path.isdir(candidate):
                return candidate
        return None

    @classmethod
    def register_run_path(cls, name):
        """
        Look for run path for the model
        """
        return cls.get_path("model_path", name)

    @classmethod
    def unregister(cls, region, name):
        """
        This method is added to unregister
        region can be ['model', 'loss', 'lr_scheduler', 'datasetprocessor', 'dataset']

        Returns:
            The removed entry, or ``None`` if ``name`` was not registered.

        Raises:
            KeyError: If ``region`` is not a key of ``mapping``.
        """
        # Look up the region passed in, not the literal string 'region'.
        if cls.mapping.get(region) is None:
            raise KeyError("Undefined {}".format(region))
        return cls.mapping[region].pop(name, None)

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    @classmethod
    def _list_for_model(cls, region, model_name):
        """List the names in ``region``, optionally for a single model.

        With ``model_name`` set to ``None`` the sorted names of the whole
        region are returned. Otherwise the list recorded for that model is
        returned after checking that every entry is registered.

        Raises:
            ValueError: If the model is not registered, or if it refers to
                a name that is not registered in ``region``.
        """
        cls._import_modules()
        if model_name is None:
            return sorted(cls.mapping[region].keys())
        if model_name not in cls.mapping["model"]:
            raise ValueError("{} model is not registered".format(model_name))
        wanted = cls.mapping["model"][model_name][region]
        for item in wanted:
            if item not in cls.mapping[region]:
                raise ValueError(
                    "{} {} is not registered".format(item, region)
                )
        return wanted

    @classmethod
    def list_models(cls):
        """Return the sorted names of all registered models."""
        cls._import_modules()
        return sorted(cls.mapping["model"].keys())

    @classmethod
    def list_loss(cls):
        """Return the sorted names of all registered losses."""
        cls._import_modules()
        return sorted(cls.mapping["loss"].keys())

    @classmethod
    def list_datasetprocessor(cls, model_name=None):
        """Return dataset processor names, for all models or for one."""
        return cls._list_for_model("datasetprocessor", model_name)

    @classmethod
    def list_lr_scheduler(cls):
        """Return the sorted names of all registered lr schedulers."""
        cls._import_modules()
        return sorted(cls.mapping["lr_scheduler"].keys())

    @classmethod
    def list_dataset(cls, model_name=None):
        """Return dataset names, for all models or for one."""
        return cls._list_for_model("dataset", model_name)

    @classmethod
    def get_model_class(cls, name):
        """Return the class registered for model ``name``.

        Raises:
            ValueError: If no model is registered under ``name``.
        """
        cls._import_modules()
        if name in cls.mapping["model"]:
            return cls.mapping["model"][name]["class"]
        raise ValueError("{} model is not registered".format(name))

    @classmethod
    def get_submodel_config_class(cls, key):
        """Return the submodel config class for ``key``, or ``None``."""
        cls._import_modules()
        return cls.mapping["submodel_config"].get(key)

    @classmethod
    def get_config_class(cls, key):
        """Return the config class for ``key``, or ``None``.

        A ``None`` key returns ``None`` without triggering any imports.
        """
        if key is None:
            return None
        cls._import_modules()
        return cls.mapping["config"].get(key)

    @classmethod
    def get_data_config(cls, name):
        """Return the data config class for ``name``, or ``None``."""
        cls._import_modules()
        return cls.mapping["data_config"].get(name)

    @classmethod
    def get_config_class_from_model(cls, key):
        """Return the config class recorded for model class ``key``."""
        cls._import_modules()
        return cls.mapping["model_to_config"].get(key)

    @classmethod
    def get_loss_class(cls, name):
        """Return the loss class for ``name``, or ``None``."""
        cls._import_modules()
        return cls.mapping["loss"].get(name, None)

    @classmethod
    def get_run_path(cls, name):
        """Return the run path for model ``name``.

        Uses the path stored at registration time when the model is
        registered; otherwise searches the registered model paths. Does
        not trigger the lazy module import.
        """
        if name in cls.mapping["model"]:
            return cls.mapping["model"][name]["run"]
        return cls.get_path("model_path", name)
# Module-level Registry instance. Registry keeps all of its state in class
# attributes and exposes only classmethods, so this instance reads and writes
# the same mappings as the class itself.
registry = Registry()