diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py
index dd659f7eedf7b..d946e5f48cce6 100644
--- a/nemo/collections/llm/gpt/model/nemotron.py
+++ b/nemo/collections/llm/gpt/model/nemotron.py
@@ -4,16 +4,17 @@
 import torch
 from torch import nn
 
-from transformers import NemotronConfig as HFNemotronConfig
-from transformers import NemotronForCausalLM
-from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
 from nemo.collections.llm.fn.activation import squared_relu
 from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel
 from nemo.collections.llm.utils import Config
 from nemo.lightning import OptimizerModule, io, teardown
 
 if TYPE_CHECKING:
+    from transformers import NemotronConfig as HFNemotronConfig
+    from transformers import NemotronForCausalLM
+
+    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
     from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
@@ -123,6 +124,8 @@ def init(self) -> NemotronModel:
         return NemotronModel(self.config, tokenizer=self.tokenizer)
 
     def apply(self, output_path: Path) -> Path:
+        from transformers import NemotronForCausalLM
+
         source = NemotronForCausalLM.from_pretrained(str(self))
         target = self.init()
         trainer = self.nemo_setup(target)
@@ -155,10 +158,14 @@ def convert_state(self, source, target):
 
     @property
     def tokenizer(self) -> "AutoTokenizer":
+        from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+
         return AutoTokenizer(str(self))
 
     @property
     def config(self) -> NemotronConfig:
+        from transformers import NemotronConfig as HFNemotronConfig
+
         source = HFNemotronConfig.from_pretrained(str(self))
 
         def make_vocab_size_divisible_by(vocab_size):
@@ -224,6 +231,8 @@ def tokenizer(self):
 
     @property
     def config(self) -> "HFNemotronConfig":
+        from transformers import NemotronConfig as HFNemotronConfig
+
         source: NemotronConfig = io.load_context(str(self)).model.config
         return HFNemotronConfig(
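The diff applies the usual deferred-import pattern: the `transformers` and HF tokenizer imports move under `TYPE_CHECKING` so they are only evaluated by static type checkers, and each method that actually needs them re-imports locally at call time, keeping module import cheap when the optional dependency is unused. A minimal sketch of that pattern is below, assuming a hypothetical class and checkpoint id; it illustrates the technique, not the actual NeMo importer.

```python
# Sketch of the deferred-import pattern used in this diff (illustrative only).
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers (mypy/pyright); never executed at runtime,
    # so 'transformers' is not required just to import this module.
    from transformers import NemotronConfig as HFNemotronConfig


class ExampleImporter:
    @property
    def config(self) -> "HFNemotronConfig":
        # Runtime import deferred to first use of this property.
        from transformers import NemotronConfig as HFNemotronConfig

        # "nvidia/nemotron-checkpoint" is a placeholder path, not a real model id.
        return HFNemotronConfig.from_pretrained("nvidia/nemotron-checkpoint")
```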