Hi everyone,
I was following these two blog posts, Handling big models and How Accelerate runs very large models thanks to PyTorch, and I wanted to use the same approach for nllb-200-3.3B on CPU.
Here is my script:
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

checkpoint = "nllb-200-3.3B"
config = AutoConfig.from_pretrained(checkpoint)

# Instantiate the model skeleton with empty (meta) weights, then load the
# real weights from the checkpoint and dispatch them.
with init_empty_weights():
    model = AutoModelForSeq2SeqLM.from_config(config)

print("loading model ...")
model = load_checkpoint_and_dispatch(model, checkpoint,
                                     offload_folder="offload",
                                     offload_state_dict=True)

print("loading tokenizer ...")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

print("inference ...")
source = 'eng_Latn'  # English
target = 'kor_Hang'  # Korean
translator = pipeline('translation', model=model,
                      tokenizer=tokenizer,
                      src_lang=source,
                      tgt_lang=target,
                      device="cpu")

text = 'Hi, nice to meet you'
output = translator(text)

# Alternative I considered: calling generate() directly instead of the pipeline.
# tokenizer.src_lang = source
# encoded_hi = tokenizer(text, return_tensors="pt").to("cpu")
# generated_tokens = model.generate(
#     **encoded_hi,
#     forced_bos_token_id=tokenizer.lang_code_to_id[target],
# )
# out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

translated_text = output[0]['translation_text']
print(translated_text)
and I got this error about the meta and cpu devices:
Traceback (most recent call last):
File "/home/ubuntu/trans_test.py", line 28, in <module>
output = translator(text)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/text2text_generation.py", line 351, in __call__
return super().__call__(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/text2text_generation.py", line 150, in __call__
result = super().__call__(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1074, in __call__
return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1081, in run_single
model_outputs = self.forward(model_inputs, **forward_params)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/base.py", line 990, in forward
model_outputs = self._forward(model_inputs, **forward_params)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/text2text_generation.py", line 172, in _forward
output_ids = self.model.generate(**model_inputs, **generate_kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/generation_utils.py", line 1339, in generate
model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/generation_utils.py", line 583, in _prepare_encoder_decoder_kwargs_for_generation
model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/m2m_100/modeling_m2m_100.py", line 829, in forward
layer_outputs = encoder_layer(
File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/m2m_100/modeling_m2m_100.py", line 382, in forward
hidden_states, attn_weights, _ = self.self_attn(
File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/m2m_100/modeling_m2m_100.py", line 298, in forward
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, meta and cpu!
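In case it helps with debugging, this is the quick check I would add right after load_checkpoint_and_dispatch to see which parameters are still on the meta device (my assumption being that anything listed here was never actually loaded from the checkpoint):

# Hypothetical sanity check: parameters left on "meta" were never materialized.
meta_params = [name for name, p in model.named_parameters() if p.device.type == "meta"]
print(len(meta_params), "parameters still on meta, e.g.:", meta_params[:5])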
I was wondering: do I need to move the tensors between the meta and cpu devices somehow (as far as I know, meta tensors carry no actual data), or is something else going on?
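I also wondered whether I should be passing a device_map, like in the blog posts. Something along these lines (untested; the no_split_module_classes names are just my guess based on the modeling_m2m_100.py frames in the traceback):

model = load_checkpoint_and_dispatch(
    model,
    checkpoint,
    device_map="auto",  # let Accelerate decide where each layer lives
    # keep each residual block on a single device; class names are my assumption
    no_split_module_classes=["M2M100EncoderLayer", "M2M100DecoderLayer"],
    offload_folder="offload",
    offload_state_dict=True,
)

Would that be the right fix here?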
Thanks