I tried to trace a Causal Language Model (see here) on an inf1.6xlarge
instance for AWS SageMaker via:
import os
#import tensorflow # to workaround a protobuf version conflict issue
import torch
import torch.neuron
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler
from transformers import BloomTokenizerFast, BloomForCausalLM
# Reproduction script: load a BLOOM causal LM and try to trace it with
# torch.neuron.trace using a padded token tensor plus dummy past_key_values.

# Checkpoint identifier handed to both from_pretrained calls below.
model_id = "bigscience/bloom-560m"
tokenizer = BloomTokenizerFast.from_pretrained(model_id, )
model = BloomForCausalLM.from_pretrained(model_id)

# Tokenize one dummy sentence, padded to max_length, returned with
# return_tensors="pt".
dummy_input = "Dummy input which will be padded later"
max_length = 128
# NOTE(review): despite the name, `embeddings` holds the tokenizer output
# (it is indexed by 'input_ids' below), not embedding vectors.
embeddings = tokenizer(dummy_input, max_length=max_length, padding="max_length",return_tensors="pt")
input_ids = embeddings['input_ids']

# Random placeholder tensors of shape (16, 64, 6) and (16, 6, 64).
# NOTE(review): presumably these stand in for cached key/value states —
# confirm the shapes the model actually expects for past_key_values.
x = torch.rand(16, 64, 6)
y = torch.rand(16, 6, 64)
# Repeat each tensor 24 times; every entry references the same tensor object.
x_24 = (x,)*24
y_24 = (y,)*24
# Tuple of 24 (x, y) pairs.
# NOTE(review): presumably one pair per transformer layer — verify the layer
# count against the model config.
past_key_values = tuple(zip(x_24, y_24))

# Example inputs are passed positionally: (input_ids, past_key_values).
neuron_inputs = (input_ids, past_key_values)
# Trace the model for Neuron with the example inputs; this is the call that
# raises the errors quoted below.
neuron_net = torch.neuron.trace(model, example_inputs = neuron_inputs, compiler_workdir="./workdir2", separate_weights=True)
However, I get the following error:
RuntimeError: Tracer cannot infer type of CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -6.8985, 3.4898, 6.8012, ..., -1.6817, -1.6819, -1.6826],
... Some Tensor parts ...
[-0.0698, -0.7791, -0.5454, ..., -0.2304, -1.0882, -0.4852]]],
grad_fn=<CatBackward0>))), hidden_states=None, attentions=None, cross_attentions=None)
:Dictionary inputs to traced functions must have consistent type. Found Tensor and Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]
I also tried two variants of the model loading:
# Variant: same checkpoint, loaded with return_dict=False.
model = BloomForCausalLM.from_pretrained(model_id, return_dict=False)
and
# Variant: same checkpoint, loaded with torchscript=True.
model = BloomForCausalLM.from_pretrained(model_id, torchscript=True)
but both variants produced essentially the same error output:
INFO:Neuron:There are 2 ops of 2 different types in the TorchScript that are not compiled by neuron-cc: aten::__or__, aten::embedding, (For more information see https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/compiler/neuron-cc/neuron-cc-ops/neuron-cc-ops-pytorch.html)
INFO:Neuron:Number of arithmetic operators (pre-compilation) before = 1871, fused = 1822, percent fused = 97.38%
INFO:Neuron:PyTorch to TF conversion failed to resolve function on aten::pow with inputs [array(0.70710677, dtype=float32), <tf.Tensor 'BloomModel_30/aten_arange/range:0' shape=(16,) dtype=int32>]
INFO:Neuron:Exception = Input 'y' of 'Pow' Op has type int32 that does not match type float32 of argument 'x'.
WARNING:Neuron:torch.neuron.trace failed on _NeuronGraph$3278; falling back to native python function call
ERROR:Neuron:Input 'y' of 'Pow' Op has type int32 that does not match type float32 of argument 'x'.
Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py", line 528, in _apply_op_helper
preferred_dtype=default_dtype)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 1273, in internal_convert_to_tensor
(dtype.name, value.dtype.name, value))
ValueError: Tensor conversion requested dtype float32 for Tensor with dtype int32: <tf.Tensor 'BloomModel_30/aten_arange/range:0' shape=(16,) dtype=int32>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/torch_neuron/convert.py", line 414, in op_converter
item, inputs, compiler_workdir=sg_workdir, **kwargs)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/torch_neuron/decorators.py", line 81, in trace
transform_torch_graph_to_tensorflow(jit_trace, example_inputs, separate_weights=separate_weights, neuron_graph=func, **kwargs)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/torch_neuron/decorators.py", line 634, in transform_torch_graph_to_tensorflow
raise e
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/torch_neuron/decorators.py", line 628, in transform_torch_graph_to_tensorflow
tensor_outputs = local_func(op, *tensor_inputs)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/torch_neuron/ops/aten.py", line 1308, in pow
return tf.pow(tensor, exponent)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/tensorflow_core/python/util/dispatch.py", line 180, in wrapper
return target(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/tensorflow_core/python/ops/math_ops.py", line 459, in pow
return gen_math_ops._pow(x, y, name=name)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_math_ops.py", line 7181, in _pow
"Pow", x=x, y=y, name=name)
File "/home/ec2-user/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py", line 564, in _apply_op_helper
inferred_from[input_arg.type_attr]))
TypeError: Input 'y' of 'Pow' Op has type int32 that does not match type float32 of argument 'x'.
INFO:Neuron:Number of arithmetic operators (post-compilation) before = 1871, compiled = 0, percent compiled = 0.0%
INFO:Neuron:The neuron partitioner created 1 sub-graphs
INFO:Neuron:Neuron successfully compiled 0 sub-graphs, Total fused subgraphs = 1, Percent of model sub-graphs successfully compiled = 0.0%
INFO:Neuron:Compiled these operators (and operator counts) to Neuron:
INFO:Neuron:Not compiled operators (and operator counts) to Neuron:
INFO:Neuron: => aten::Int: 446 [supported]
INFO:Neuron: => aten::ScalarImplicit: 1 [supported]
INFO:Neuron: => aten::__or__: 1 [not supported]
INFO:Neuron: => aten::add: 99 [supported]
INFO:Neuron: => aten::arange: 2 [supported]
INFO:Neuron: => aten::baddbmm: 24 [supported]
INFO:Neuron: => aten::bitwise_not: 1 [supported]
INFO:Neuron: => aten::bmm: 24 [supported]
INFO:Neuron: => aten::cat: 48 [supported]
INFO:Neuron: => aten::copy_: 1 [supported]
INFO:Neuron: => aten::cumsum: 1 [supported]
INFO:Neuron: => aten::detach: 1 [supported]
INFO:Neuron: => aten::dropout: 72 [supported]
INFO:Neuron: => aten::embedding: 1 [not supported]
INFO:Neuron: => aten::empty: 1 [supported]
INFO:Neuron: => aten::expand: 2 [supported]
INFO:Neuron: => aten::fill_: 1 [supported]
INFO:Neuron: => aten::floor_divide: 24 [supported]
INFO:Neuron: => aten::layer_norm: 50 [supported]
INFO:Neuron: => aten::linear: 97 [supported]
INFO:Neuron: => aten::lt: 1 [supported]
INFO:Neuron: => aten::masked_fill: 24 [supported]
INFO:Neuron: => aten::mul: 243 [supported]
INFO:Neuron: => aten::ones: 1 [supported]
INFO:Neuron: => aten::permute: 48 [supported]
INFO:Neuron: => aten::pow: 1 [supported]
INFO:Neuron: => aten::reshape: 97 [supported]
INFO:Neuron: => aten::select: 72 [supported]
INFO:Neuron: => aten::size: 175 [supported]
INFO:Neuron: => aten::slice: 84 [supported]
INFO:Neuron: => aten::softmax: 24 [supported]
INFO:Neuron: => aten::sub: 1 [supported]
INFO:Neuron: => aten::tanh: 24 [supported]
INFO:Neuron: => aten::to: 27 [supported]
INFO:Neuron: => aten::transpose: 48 [supported]
INFO:Neuron: => aten::unsqueeze: 8 [supported]
INFO:Neuron: => aten::view: 96 [supported]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_1588/3792650745.py in <module>
----> 1 neuron_net = torch.neuron.trace(model, example_inputs = neuron_inputs, compiler_workdir="./workdir2", separate_weights=True)
~/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/torch_neuron/convert.py in trace(func, example_inputs, fallback, op_whitelist, minimum_segment_size, subgraph_builder_function, subgraph_inputs_pruning, skip_compiler, debug_must_trace, allow_no_ops_on_neuron, compiler_workdir, dynamic_batch_size, compiler_timeout, single_fusion_ratio_threshold, _neuron_trace, compiler_args, optimizations, separate_weights, verbose, **kwargs)
215 logger.debug("skip_inference_context - trace with fallback at {}".format(get_file_and_line()))
216 neuron_graph = cu.compile_fused_operators(neuron_graph, **compile_kwargs)
--> 217 cu.stats_post_compiler(neuron_graph)
218
219 # Wrap the compiled version of the model in a script module. Note that this is
~/anaconda3/envs/aws_neuron_venv_pytorch/lib/python3.7/site-packages/torch_neuron/convert.py in stats_post_compiler(self, neuron_graph)
529 if succesful_compilations == 0 and not self.allow_no_ops_on_neuron:
530 raise RuntimeError(
--> 531 "No operations were successfully partitioned and compiled to neuron for this model - aborting trace!")
532
533 if percent_operations_compiled < 50.0:
RuntimeError: No operations were successfully partitioned and compiled to neuron for this model - aborting trace!
Do you know how to address that issue?