@philschmid Correct, that is the error message I receive. Below are my training script and estimator call.
### Estimator
estimator = HuggingFace(
    entry_point = 'train.py',                    # fine-tuning script used in the training job
    source_dir = 'embed_source',                 # directory where the fine-tuning script is stored
    instance_type = instance_type,               # instance type used for the training job
    instance_count = 1,                          # number of instances used for training
    role = get_execution_role(),                 # IAM role used by the training job to access AWS resources
    transformers_version = '4.6',                # transformers version used in the training job
    # train_use_spot_instances=True,
    max_run = 36000,
    # max_wait=36000,
    pytorch_version = '1.7',                     # PyTorch version used in the training job
    py_version = 'py36',                         # Python version used in the training job
    hyperparameters = hyperparameters,           # hyperparameters passed to the training job
    metric_definitions = metric_definitions,     # regex definitions to extract metrics from the job logs
    output_path=os.path.join(dataconnector.version_s3_prefix, "models"),
    code_location=os.path.join(dataconnector.version_s3_prefix, "models"),
    volume_size = 200,
    checkpoint_s3_uri='s3://kj-temp/checkpoints',
)
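For context, the `hyperparameters` dict and the launch call look roughly like the following; the concrete values and S3 URIs here are hypothetical placeholders, not my real ones:

hyperparameters = {
    'epochs': 1,                            # hypothetical values mirroring the argparse defaults in train.py
    'model_id': 'distilbert-base-uncased',  # hypothetical model id
    'num_labels': 4,
    'labels': '["a", "b", "c", "d"]',       # passed as a string; train.py parses it with ast.literal_eval
}

# The channel names map to SM_CHANNEL_TRAIN / SM_CHANNEL_TEST inside train.py.
estimator.fit({
    'train': 's3://my-bucket/data/train',   # hypothetical S3 prefix containing train.DbEmbeddings
    'test': 's3://my-bucket/data/test',     # hypothetical S3 prefix containing test.DbEmbeddings
})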
### Training script (train.py)
import os

os.environ['TRANSFORMERS_CACHE'] = "cache"
os.environ['HF_DATASETS_CACHE'] = "cache"

# Upgrade datasets inside the training container before importing it.
print(os.system('python -m pip install datasets --upgrade'))

import argparse
import ast
import logging
import sys

import datasets
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

print('datasets.__version__', datasets.__version__)
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--num_labels", type=str)
    parser.add_argument("--labels", type=str)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--train_file", type=str, default="train.DbEmbeddings")
    parser.add_argument("--test_file", type=str, default="test.DbEmbeddings")
    parser.add_argument("--fp16", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])

    args, _ = parser.parse_known_args()
    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    # Each channel contains a single JSON-lines file; load_dataset returns it under the "train" split.
    raw_train_dataset = load_dataset("json", data_files=os.path.join(args.training_dir, args.train_file), cache_dir="/opt/ml/input")["train"]
    raw_test_dataset = load_dataset("json", data_files=os.path.join(args.test_dir, args.test_file), cache_dir="/opt/ml/input")["train"]

    print('\nargs.labels', args.labels)
    print('type args.labels', type(args.labels))

    # SageMaker serializes hyperparameters to strings, so the label list arrives
    # as a string literal and is parsed back into a Python list here.
    num_labels = int(args.num_labels)
    labels = ast.literal_eval(args.labels)
    print('type(labels)', type(labels))

    # Cast the label column to ClassLabel so the Trainer sees integer class ids.
    raw_train_dataset = raw_train_dataset.cast_column(
        "label",
        datasets.ClassLabel(num_classes=num_labels, names=labels, names_file=None, id=None),
    )
    print('\nraw_train_dataset.features', raw_train_dataset.features)
    # Load the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)

    def tokenize(examples):
        return tokenizer(examples["source"], padding=True, truncation=True)

    # batched=True tokenizes in batches; batch_size=None processes the whole dataset as one batch.
    train_dataset = raw_train_dataset.map(tokenize, batched=True, batch_size=None)
    test_dataset = raw_test_dataset.map(tokenize, batched=True, batch_size=None)

    train_dataset.reset_format()
    test_dataset.reset_format()
train_dataset.set_format("torch",
columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="pandas")
df = test_dataset[:]
df_test, df_valid = np.split(df, [int(.5*len(df))])
test_data = Dataset.from_pandas(df_test)
valid_data = Dataset.from_pandas(df_valid)
test_data = test_data.cast_column("label", datasets.ClassLabel(num_classes=num_labels,
names= labels ,
names_file=None, id=None))
valid_data = valid_data.cast_column("label", datasets.ClassLabel(num_classes=num_labels,
names= labels , names_file=None,
id=None))
test_data.reset_format()
test_data.set_format("torch",
columns=["input_ids", "attention_mask", "label"])
valid_data.reset_format()
valid_data.set_format("torch",
columns=["input_ids", "attention_mask", "label"])
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}
    # Load the model and move it to GPU if one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = (AutoModelForSequenceClassification
             .from_pretrained(args.model_id, num_labels=num_labels)
             .to(device))

    batch_size = 64
    logging_steps = len(train_dataset) // batch_size
    model_name = f"{args.model_id}-finetuned-d"
    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=args.epochs,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        push_to_hub=False,
        # log_level="error",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=valid_data,
        # eval_dataset=valid_data_down,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Evaluate on the held-out half of the test channel.
    preds_output = trainer.predict(test_data)
    print('.')
    print('preds_output.metrics:')
    print(preds_output.metrics)

    # Save the model to the SageMaker model dir so it gets uploaded to S3.
    trainer.save_model(args.model_dir)
    print(f'my_acc: {preds_output.metrics["test_accuracy"]}')
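For reference, the `metric_definitions` passed to the estimator is a list of name/regex pairs that SageMaker scrapes from the job logs; a minimal sketch that would match the `my_acc` print above (the name and regex here are illustrative, not my exact definitions):

# Hypothetical metric_definitions matching the my_acc print in train.py.
metric_definitions = [
    {'Name': 'test_accuracy', 'Regex': 'my_acc: ([0-9\\.]+)'},
]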