Hello everyone,
I want to translate text from Japanese to French; to do so, I chose Helsinki-NLP/opus-mt-ja-fr. I have a personal dataset containing Japanese texts and their French translations, and I want to fine-tune the model on it using AWS SageMaker.
My code is based on the following workshop.
The code given in the workshop is for sentiment analysis, so I made some adjustments.
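For reference, the base model itself can be sanity-checked outside SageMaker with a minimal snippet like this (standard transformers API, nothing specific to my setup):

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ja-fr")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ja-fr")

# translate a single Japanese sentence to French
batch = tokenizer(["こんにちは、お元気ですか?"], return_tensors="pt")
generated = model.generate(**batch)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))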
I have two files:
First, the script “train.py”:
import argparse
import logging
import os
import random
import sys
import subprocess

import numpy as np
import torch
from datasets import load_from_disk, load_metric
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint


def install(name):
    subprocess.call([sys.executable, "-m", "pip", "install", name])
if __name__ == "__main__":

    install("sacrebleu")

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    # argparse's type=bool would turn any non-empty string (even "False") into True,
    # so parse the flag explicitly
    parser.add_argument("--fp16", type=lambda v: str(v).lower() == "true", default=True)
    parser.add_argument("--seed", type=int, default=42)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--output_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    # parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--n_cpus", type=int, default=os.environ["SM_NUM_CPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
    args, _ = parser.parse_known_args()

    set_seed(args.seed)

    # Set up logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
print("1", flush=True)
# load datasets
train_dataset = load_from_disk(args.training_dir)
test_dataset = load_from_disk(args.test_dir)
logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
logger.info(f" loaded test_dataset length is: {len(test_dataset)}")
# download model from model hub
model = AutoModelForSeq2SeqLM.from_pretrained(args.model_id)
tokenizer = AutoTokenizer.from_pretrained(args.model_id)
metric = load_metric("sacrebleu")
    # Old method from the HF workshop, initially used for sentiment classification
    # def compute_metrics(eval_pred):
    #     predictions, labels = eval_pred
    #     predictions = np.argmax(predictions, axis=1)
    #     return metric.compute(predictions=predictions, references=labels)

    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        # sacrebleu expects each reference as a list of strings
        labels = [[label.strip()] for label in labels]
        return preds, labels

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {"bleu": result["score"]}
        return result
    # define training args
    # Seq2SeqTrainingArguments instead of TrainingArguments, so that evaluation can
    # decode with model.generate() (predict_with_generate), which compute_metrics needs
    training_args = Seq2SeqTrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True if get_last_checkpoint(args.output_dir) is not None else False,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        warmup_steps=args.warmup_steps,
        fp16=args.fp16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,
        logging_dir=f"{args.output_data_dir}/logs",
        learning_rate=args.learning_rate,
        load_best_model_at_end=True,
        metric_for_best_model="bleu",  # must match the key returned by compute_metrics
        predict_with_generate=True,
        # remove_unused_columns=False
    )
    # create Trainer instance (Seq2SeqTrainer, with a collator that pads inputs and labels per batch)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # train model
    if get_last_checkpoint(args.output_dir) is not None:
        logger.info("***** continue training *****")
        last_checkpoint = get_last_checkpoint(args.output_dir)
        trainer.train(resume_from_checkpoint=last_checkpoint)
    else:
        trainer.train()
    # evaluate model
    eval_result = trainer.evaluate(eval_dataset=test_dataset)

    # write eval results to a file which can be accessed later in the S3 output
    with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
        print("***** Eval results *****")
        for key, value in sorted(eval_result.items()):
            writer.write(f"{key} = {value}\n")
            print(f"{key} = {value}\n")

    # save the model to S3; uses os.environ["SM_MODEL_DIR"] to make sure checkpointing works
    trainer.save_model(os.environ["SM_MODEL_DIR"])
Then I have a Jupyter notebook for the whole process:
save_path = f's3://{bucket}/{folder}'

import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from transformers import AutoTokenizer

csv_dataset = datasets.load_dataset("csv", data_files=data_location, keep_default_na=False, delimiter=';')
# dataset = Dataset.from_pandas(df)
dataset = csv_dataset['train']

# 90% train, 10% (test + validation)
dataset_train_test = dataset.train_test_split(test_size=0.1, seed=42)
# split the remaining 10% into 5% test + 5% validation
dataset_test_valid = dataset_train_test['test'].train_test_split(test_size=0.5, seed=42)
dataset = datasets.DatasetDict({
    'train': dataset_train_test['train'],
    'test': dataset_test_valid['test'],
    'validation': dataset_test_valid['train']})

# the tokenizer has to be defined before preprocess_function uses it
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ja-fr')

prefix = ""
max_input_length = 256
max_target_length = 256
source_lang = "ja"
target_lang = "fr"

def preprocess_function(examples):
    inputs = [prefix + ex for ex in examples["text"]]
    targets = [ex for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)
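Not shown above: the tokenized splits still need to be written to save_path before the job is launched, otherwise the train/test channels point at nothing. I do this following the workshop's pattern — a sketch, assuming the datasets S3FileSystem helper (newer datasets versions use storage_options instead of fs):

from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# write each split to the same S3 prefixes the estimator will read from
tokenized_datasets['train'].save_to_disk(save_path + '/train', fs=s3)
tokenized_datasets['test'].save_to_disk(save_path + '/test', fs=s3)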
from sagemaker.huggingface import HuggingFace
import time

# hyperparameters, which are passed into the training job
hyperparameters = {
    'seed': 42,
    'epochs': 1,                               # number of training epochs
    'train_batch_size': 32,                    # batch size for training
    'eval_batch_size': 64,                     # batch size for evaluation
    'learning_rate': 3e-5,                     # learning rate used during training
    'model_id': 'Helsinki-NLP/opus-mt-ja-fr',  # pre-trained model
    'fp16': True,                              # whether to use 16-bit (mixed) precision training
}

job_name = f'translation-finetuning-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

huggingface_estimator = HuggingFace(
    entry_point          = 'train.py',        # fine-tuning script used in the training job
    source_dir           = './scripts',       # directory where the fine-tuning script is stored
    instance_type        = 'ml.g5.xlarge',    # instance type used for the training job
    instance_count       = 1,                 # number of instances used for training
    base_job_name        = job_name,          # name of the training job
    role                 = role,              # IAM role used by the training job to access AWS resources, e.g. S3
    transformers_version = '4.17',            # transformers version used in the training job
    pytorch_version      = '1.10',            # pytorch version used in the training job
    py_version           = 'py38',            # python version used in the training job
    hyperparameters      = hyperparameters,   # hyperparameters passed to the training job
)

# define a data input dictionary with our uploaded S3 URIs
data = {
    'train': save_path + '/train',
    'test': save_path + '/test'
}

# start the training job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)
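(The keys of the data dict become the channel names: SageMaker mounts them in the container under /opt/ml/input/data/train and /opt/ml/input/data/test and exposes them as SM_CHANNEL_TRAIN and SM_CHANNEL_TEST, which is what train.py reads for its training_dir and test_dir defaults — see the environment dump in the logs below.)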
I have been struggling for hours trying to make this work, fixing one error after another, and now I am stuck on this one:
AlgorithmError: ExecuteUserScriptError: ExitCode 1 ErrorMessage "KeyError: 'length'" Command "/opt/conda/bin/python3.8 train.py --epochs 1 --eval_batch_size 64 --fp16 True --learning_rate 3e-05 --model_id Helsinki-NLP/opus-mt-ja-fr --seed 42 --train_batch_size 32", exit code: 1
Here is the full log output from AWS CloudWatch:
2023-07-01T01:45:53.555+02:00 bash: cannot set terminal process group (-1): Inappropriate ioctl for device
2023-07-01T01:45:53.555+02:00 bash: no job control in this shell
2023-07-01T01:45:53.555+02:00 2023-06-30 23:45:51,936 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training
2023-07-01T01:45:53.555+02:00 2023-06-30 23:45:51,956 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.
2023-07-01T01:45:53.555+02:00 2023-06-30 23:45:51,959 sagemaker_pytorch_container.training INFO Invoking user training script.
2023-07-01T01:45:53.555+02:00 2023-06-30 23:45:52,182 sagemaker-training-toolkit INFO Invoking user script
2023-07-01T01:45:53.555+02:00 Training Env:
2023-07-01T01:45:53.557+02:00 { "additional_framework_parameters": {}, "channel_input_dirs": { "test": "/opt/ml/input/data/test", "train": "/opt/ml/input/data/train" }, "current_host": "algo-1", "current_instance_group": "homogeneousCluster", "current_instance_group_hosts": [ "algo-1" ], "current_instance_type": "ml.g5.xlarge", "distribution_hosts": [], "distribution_instance_groups": [], "framework_module": "sagemaker_pytorch_container.training:main", "hosts": [ "algo-1" ], "hyperparameters": { "epochs": 1, "eval_batch_size": 64, "fp16": true, "learning_rate": 3e-05, "model_id": "Helsinki-NLP/opus-mt-ja-fr", "seed": 42, "train_batch_size": 32 }, "input_config_dir": "/opt/ml/input/config", "input_data_config": { "test": { "TrainingInputMode": "File", "S3DistributionType": "FullyReplicated", "RecordWrapperType": "None" }, "train": { "TrainingInputMode": "File", "S3DistributionType": "FullyReplicated", "RecordWrapperType": "None" } }, "input_dir": "/opt/ml/input", "instance_groups": [ "homogeneousCluster" ], "instance_groups_dict": { "homogeneousCluster": { "instance_group_name": "homogeneousCluster", "instance_type": "ml.g5.xlarge", "hosts": [ "algo-1" ] } }, "is_hetero": false, "is_master": true, "is_modelparallel_enabled": null, "job_name": "translation-finetuning-2023-06-30-23-40-06-2023-06-30-23-40-06-533", "log_level": 20, "master_hostname": "algo-1", "model_dir": "/opt/ml/model", "module_dir": "s3://sagemaker-us-east-1-984909470121/translation-finetuning-2023-06-30-23-40-06-2023-06-30-23-40-06-533/source/sourcedir.tar.gz", "module_name": "train", "network_interface_name": "eth0", "num_cpus": 4, "num_gpus": 1, "output_data_dir": "/opt/ml/output/data", "output_dir": "/opt/ml/output", "output_intermediate_dir": "/opt/ml/output/intermediate", "resource_config": { "current_host": "algo-1", "current_instance_type": "ml.g5.xlarge", "current_group_name": "homogeneousCluster", "hosts": [ "algo-1" ], "instance_groups": [ { "instance_group_name": "homogeneousCluster", "instance_type": "ml.g5.xlarge", "hosts": [ "algo-1" ] } ], "network_interface_name": "eth0" }, "user_entry_point": "train.py"
2023-07-01T01:45:53.557+02:00 }
2023-07-01T01:45:53.557+02:00 Environment variables:
2023-07-01T01:45:53.557+02:00 SM_HOSTS=["algo-1"]
2023-07-01T01:45:53.557+02:00 SM_NETWORK_INTERFACE_NAME=eth0
2023-07-01T01:45:53.557+02:00 SM_HPS={"epochs":1,"eval_batch_size":64,"fp16":true,"learning_rate":3e-05,"model_id":"Helsinki-NLP/opus-mt-ja-fr","seed":42,"train_batch_size":32}
2023-07-01T01:45:53.557+02:00 SM_USER_ENTRY_POINT=train.py
2023-07-01T01:45:53.557+02:00 SM_FRAMEWORK_PARAMS={}
2023-07-01T01:45:53.557+02:00 SM_RESOURCE_CONFIG={"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g5.xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g5.xlarge"}],"network_interface_name":"eth0"}
2023-07-01T01:45:53.557+02:00 SM_INPUT_DATA_CONFIG={"test":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"},"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}
2023-07-01T01:45:53.557+02:00 SM_OUTPUT_DATA_DIR=/opt/ml/output/data
2023-07-01T01:45:53.557+02:00 SM_CHANNELS=["test","train"]
2023-07-01T01:45:53.557+02:00 SM_CURRENT_HOST=algo-1
2023-07-01T01:45:53.557+02:00 SM_CURRENT_INSTANCE_TYPE=ml.g5.xlarge
2023-07-01T01:45:53.557+02:00 SM_CURRENT_INSTANCE_GROUP=homogeneousCluster
2023-07-01T01:45:53.557+02:00 SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"]
2023-07-01T01:45:53.557+02:00 SM_INSTANCE_GROUPS=["homogeneousCluster"]
2023-07-01T01:45:53.557+02:00 SM_INSTANCE_GROUPS_DICT={"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g5.xlarge"}}
2023-07-01T01:45:53.557+02:00 SM_DISTRIBUTION_INSTANCE_GROUPS=[]
2023-07-01T01:45:53.557+02:00 SM_IS_HETERO=false
2023-07-01T01:45:53.557+02:00 SM_MODULE_NAME=train
2023-07-01T01:45:53.557+02:00 SM_LOG_LEVEL=20
2023-07-01T01:45:53.557+02:00 SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main
2023-07-01T01:45:53.558+02:00 SM_INPUT_DIR=/opt/ml/input
2023-07-01T01:45:53.558+02:00 SM_INPUT_CONFIG_DIR=/opt/ml/input/config
2023-07-01T01:45:53.558+02:00 SM_OUTPUT_DIR=/opt/ml/output
2023-07-01T01:45:53.558+02:00 SM_NUM_CPUS=4
2023-07-01T01:45:53.558+02:00 SM_NUM_GPUS=1
2023-07-01T01:45:53.558+02:00 SM_MODEL_DIR=/opt/ml/model
2023-07-01T01:45:53.558+02:00 SM_MODULE_DIR=s3://sagemaker-us-east-1-984909470121/translation-finetuning-2023-06-30-23-40-06-2023-06-30-23-40-06-533/source/sourcedir.tar.gz
2023-07-01T01:45:53.558+02:00 SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"test":"/opt/ml/input/data/test","train":"/opt/ml/input/data/train"},"current_host":"algo-1","current_instance_group":"homogeneousCluster","current_instance_group_hosts":["algo-1"],"current_instance_type":"ml.g5.xlarge","distribution_hosts":[],"distribution_instance_groups":[],"framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{"epochs":1,"eval_batch_size":64,"fp16":true,"learning_rate":3e-05,"model_id":"Helsinki-NLP/opus-mt-ja-fr","seed":42,"train_batch_size":32},"input_config_dir":"/opt/ml/input/config","input_data_config":{"test":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"},"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","instance_groups":["homogeneousCluster"],"instance_groups_dict":{"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g5.xlarge"}},"is_hetero":false,"is_master":true,"is_modelparallel_enabled":null,"job_name":"translation-finetuning-2023-06-30-23-40-06-2023-06-30-23-40-06-533","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-984909470121/translation-finetuning-2023-06-30-23-40-06-2023-06-30-23-40-06-533/source/sourcedir.tar.gz","module_name":"train","network_interface_name":"eth0","num_cpus":4,"num_gpus":1,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g5.xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g5.xlarge"}],"network_interface_name":"eth0"},"user_entry_point":"train.py"}
2023-07-01T01:45:53.558+02:00 SM_USER_ARGS=["--epochs","1","--eval_batch_size","64","--fp16","True","--learning_rate","3e-05","--model_id","Helsinki-NLP/opus-mt-ja-fr","--seed","42","--train_batch_size","32"]
2023-07-01T01:45:53.558+02:00 SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
2023-07-01T01:45:53.558+02:00 SM_CHANNEL_TEST=/opt/ml/input/data/test
2023-07-01T01:45:53.558+02:00 SM_CHANNEL_TRAIN=/opt/ml/input/data/train
2023-07-01T01:45:53.558+02:00 SM_HP_EPOCHS=1
2023-07-01T01:45:53.558+02:00 SM_HP_EVAL_BATCH_SIZE=64
2023-07-01T01:45:53.558+02:00 SM_HP_FP16=true
2023-07-01T01:45:53.558+02:00 SM_HP_LEARNING_RATE=3e-05
2023-07-01T01:45:53.558+02:00 SM_HP_MODEL_ID=Helsinki-NLP/opus-mt-ja-fr
2023-07-01T01:45:53.558+02:00 SM_HP_SEED=42
2023-07-01T01:45:53.558+02:00 SM_HP_TRAIN_BATCH_SIZE=32
2023-07-01T01:45:53.558+02:00 PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python38.zip:/opt/conda/lib/python3.8:/opt/conda/lib/python3.8/lib-dynload:/opt/conda/lib/python3.8/site-packages:/opt/conda/lib/python3.8/site-packages/smdebug-1.0.22b20220929-py3.8.egg:/opt/conda/lib/python3.8/site-packages/pyinstrument-3.4.2-py3.8.egg:/opt/conda/lib/python3.8/site-packages/pyinstrument_cext-0.2.4-py3.8-linux-x86_64.egg
2023-07-01T01:45:53.558+02:00 Invoking script with the following command:
2023-07-01T01:45:53.558+02:00 /opt/conda/bin/python3.8 train.py --epochs 1 --eval_batch_size 64 --fp16 True --learning_rate 3e-05 --model_id Helsinki-NLP/opus-mt-ja-fr --seed 42 --train_batch_size 32
2023-07-01T01:45:56.559+02:00 Collecting sacrebleu
2023-07-01T01:45:56.559+02:00 Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
2023-07-01T01:45:56.559+02:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.9/118.9 kB 16.8 MB/s eta 0:00:00
2023-07-01T01:45:56.559+02:00 Collecting lxml
2023-07-01T01:45:56.559+02:00 Downloading lxml-4.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.1 MB)
2023-07-01T01:45:56.559+02:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.1/7.1 MB 112.9 MB/s eta 0:00:00
2023-07-01T01:45:56.559+02:00 Requirement already satisfied: tabulate>=0.8.9 in /opt/conda/lib/python3.8/site-packages (from sacrebleu) (0.8.10)
2023-07-01T01:45:56.559+02:00 Collecting portalocker
2023-07-01T01:45:56.559+02:00 Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
2023-07-01T01:45:56.559+02:00 Requirement already satisfied: colorama in /opt/conda/lib/python3.8/site-packages (from sacrebleu) (0.4.4)
2023-07-01T01:45:56.560+02:00 Requirement already satisfied: regex in /opt/conda/lib/python3.8/site-packages (from sacrebleu) (2022.9.13)
2023-07-01T01:45:56.560+02:00 Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.8/site-packages (from sacrebleu) (1.22.2)
2023-07-01T01:45:58.560+02:00 Installing collected packages: portalocker, lxml, sacrebleu
2023-07-01T01:45:58.560+02:00 Successfully installed lxml-4.9.2 portalocker-2.7.0 sacrebleu-2.3.1
2023-07-01T01:45:58.560+02:00 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
2023-07-01T01:45:58.560+02:00 [notice] A new release of pip available: 22.2.2 -> 23.1.2
2023-07-01T01:45:58.560+02:00 [notice] To update, run: pip install --upgrade pip
2023-07-01T01:45:58.560+02:00 1
2023-07-01T01:45:58.560+02:00 Traceback (most recent call last):
2023-07-01T01:45:58.560+02:00   File "train.py", line 51, in <module>
2023-07-01T01:45:58.560+02:00     train_dataset = load_from_disk(args.training_dir)
2023-07-01T01:45:58.560+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/load.py", line 1755, in load_from_disk
2023-07-01T01:45:58.560+02:00     return Dataset.load_from_disk(dataset_path, fs, keep_in_memory=keep_in_memory)
2023-07-01T01:45:58.560+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 1107, in load_from_disk
2023-07-01T01:45:58.560+02:00     dataset_info = DatasetInfo.from_dict(json.load(dataset_info_file))
2023-07-01T01:45:58.560+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/info.py", line 255, in from_dict
2023-07-01T01:45:58.560+02:00     return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})
2023-07-01T01:45:58.560+02:00   File "<string>", line 20, in __init__
2023-07-01T01:45:58.560+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/info.py", line 141, in __post_init__
2023-07-01T01:45:58.560+02:00     self.features = Features.from_dict(self.features)
2023-07-01T01:45:58.560+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/features/features.py", line 1271, in from_dict
2023-07-01T01:45:58.560+02:00     obj = generate_from_dict(dic)
2023-07-01T01:45:58.560+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/features/features.py", line 1076, in generate_from_dict
2023-07-01T01:45:58.561+02:00     return {key: generate_from_dict(value) for key, value in obj.items()}
2023-07-01T01:45:58.561+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/features/features.py", line 1076, in <dictcomp>
2023-07-01T01:45:58.561+02:00     return {key: generate_from_dict(value) for key, value in obj.items()}
2023-07-01T01:45:58.561+02:00   File "/opt/conda/lib/python3.8/site-packages/datasets/features/features.py", line 1080, in generate_from_dict
2023-07-01T01:45:58.561+02:00     return Sequence(feature=generate_from_dict(obj["feature"]), length=obj["length"])
2023-07-01T01:45:58.561+02:00 KeyError: 'length'
2023-07-01T01:45:58.561+02:00 2023-06-30 23:45:58,439 sagemaker-training-toolkit INFO Waiting for the process to finish and give a return code.
2023-07-01T01:45:58.561+02:00 2023-06-30 23:45:58,439 sagemaker-training-toolkit INFO Done waiting for a return code. Received 1 from exiting process.
2023-07-01T01:45:58.561+02:00 2023-06-30 23:45:58,440 sagemaker-training-toolkit ERROR Reporting training FAILURE
2023-07-01T01:45:58.561+02:00 2023-06-30 23:45:58,440 sagemaker-training-toolkit ERROR ExecuteUserScriptError:
2023-07-01T01:45:58.561+02:00 ExitCode 1
2023-07-01T01:45:58.561+02:00 ErrorMessage "KeyError: 'length'"
2023-07-01T01:45:58.561+02:00 Command "/opt/conda/bin/python3.8 train.py --epochs 1 --eval_batch_size 64 --fp16 True --learning_rate 3e-05 --model_id Helsinki-NLP/opus-mt-ja-fr --seed 42 --train_batch_size 32"
2023-07-01T01:45:58.561+02:00 2023-06-30 23:45:58,440 sagemaker-training-toolkit ERROR Encountered exit_code 1
Does anyone have any idea about this?
Thank you,