Sagemaker gpt-j train file error

Do you know if this has now been updated? I am getting the same error

@E1l1dh cou you share your code? The latest available version currently is 4.17

@philschmid Sure thank you!

Here is my code

import sagemaker.huggingface
import sagemaker
import boto3
import json
from transformers import AutoTokenizer
import torch
from torch.utils.data import TensorDataset
from sagemaker.huggingface import HuggingFace
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

sess = sagemaker.Session()
sagemaker_session_bucket = '111111111111-path-to-data'
if sagemaker_session_bucket is None and sess is not None:
	sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

# load dataset
with open('input_data/dataset_4.json', 'r') as f:
	data = json.load(f)

# Download the tokenizer
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6B', padding=True, truncation=True)
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Tokenize the examples
prompts = [tokenizer.encode(example["prompt"]) for example in data["examples"]]
completions = [tokenizer.encode(example["completion"]) for example in data["examples"]]

# Get the maximum length of all prompts and completions
max_len = max(len(prompt) for prompt in prompts)

# Pad the prompts and completions
prompts = [prompt + [tokenizer.pad_token_id] * (max_len - len(prompt)) for prompt in prompts]
completions = [completion + [tokenizer.pad_token_id] * (max_len - len(completion)) for completion in completions]

# Convert the examples to PyTorch tensors
prompts = torch.tensor(prompts)
completions = torch.tensor(completions)

prompts_train, prompts_test, completions_train, completions_test = train_test_split(prompts, completions, test_size=0.2,
																					random_state=42)

# Create train and val datasets
train_dataset = TensorDataset(prompts_train, completions_train)
test_dataset = TensorDataset(prompts_test, completions_test)

# Create train and val data loaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# save the train_dataset as a binary file
torch.save(train_dataset, "train_dataset.pt")
torch.save(test_dataset, "test_dataset.pt")

# Create an S3 client
s3 = boto3.client('s3')

# Define the S3 bucket and prefix where you want to save the dataset
s3_bucket_name = "111111111111-path-to-data"
s3_bucket_path = 'gpt-j-6B/datasets/'

s3.upload_file('train_dataset.pt', s3_bucket_name, s3_bucket_path + 'train_dataset.pt')
s3.upload_file('test_dataset.pt', s3_bucket_name, s3_bucket_path + 'test_dataset.pt')

# gets role for executing training job
iam_client = boto3.client('iam')
role = 'arn:aws:iam::000000000000:role/my-training-role'
output_bucket = f"s3://{'111111111111-path-to-data'}/{'gptj-model-outputdir'}/"

hyperparameters = {
	'model_name_or_path': 'EleutherAI/gpt-j-6B',
	'output_dir': '/opt/ml/model'}

# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git', 'branch': 'v4.17.0'}

# creates Hugging Face estimator
huggingface_estimator = HuggingFace(
	entry_point='run_clm.py',
	source_dir='./examples/pytorch/language-modeling',
	instance_type='ml.p3.2xlarge',
	instance_count=1,
	role=role,
	git_config=git_config,
	transformers_version='4.17.0',
	pytorch_version='1.10.2',
	py_version='py38',
	hyperparameters=hyperparameters,
	output_path=output_bucket
)

# starting the train job
huggingface_estimator.fit(
	inputs={'train_dataloader': 's3://111111111111-path-to-data/gpt-j-6B/datasets/train_dataset.pt',
			'test_dataloader': 's3://111111111111-path-to-data/gpt-j-6B/datasets/test_dataset.pt'})

And here is the error message I receive

UnexpectedStatusException: Error for Training job huggingface-pytorch-training-2023-01-30-07-34-51-099: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "raise ValueError("Need either a dataset name or a training/validation file.")
 ValueError: Need either a dataset name or a training/validation file."
Command "/opt/conda/bin/python3.8 run_clm.py --epochs 1 --model_name_or_path EleutherAI/gpt-j-6B --output_dir /opt/ml/model", exit code: 1

You are not providing a dataset to train on.

Right ok, do I need to specify the path to my dataset elsewhere? I thought when running

# starting the train job
huggingface_estimator.fit(
	inputs={'train_dataloader': 's3://111111111111-path-to-data/gpt-j-6B/datasets/train_dataset.pt',
			'test_dataloader': 's3://111111111111-path-to-data/gpt-j-6B/datasets/test_dataset.pt'})

Then this would pass the train dataset through?

Yes, you need to modify the script to load your dataset. Please check here how it is done: notebooks/train.py at 63bb015d64f2dcf6e10f84326b7102535885b03e · huggingface/notebooks · GitHub

I’m using run_clm.py (transformers/run_clm.py at main · huggingface/transformers · GitHub) the guide you have sent is for a different script

@E1l1dh Were you able to resolve the issue, if yes what need to be done?