Hello Huggingface Community ! I’ve fine-tuned deberta with my own data and would like to do some hyperparameter optimization. I’m trying to use ray tune but it keeps showing a trial error in all the trials that appear. Can anybody help me, please?
I am not able to fix or find a standard ray implementation solution that uses the trainer.hyperparameter_search() yet.
My code:
class Dataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels=None):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
if self.labels:
item["labels"] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.encodings["input_ids"])
def compute_metrics(p):
print(type(p))
pred, labels = p
pred = np.argmax(pred, axis=1)
accuracy = accuracy_score(y_true=labels, y_pred=pred, average='micro')
recall = recall_score(y_true=labels, y_pred=pred, average='micro')
precision = precision_score(y_true=labels, y_pred=pred, average='micro')
f1 = f1_score(y_true=labels, y_pred=pred, average='micro')
return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
def load_dataset(dataset_file, dataset_size):
data = pd.read_csv('/content/postagged.csv')
data = data[:dataset_size]
data['label'] = data['label'].astype(int)
data['premise'] = data['premise'].astype(str)
data['hypothesis'] = data['hypothesis'].astype(str)
return data
def load_model(model_hf):
model_hf = 'microsoft/deberta-v3-base'
bert_model = AutoModelForSequenceClassification.from_pretrained(model_hf, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_hf)
return bert_model, tokenizer
def prepare_dataset(dataset_file, dataset_size, tokenizer):
df = load_dataset(dataset_file, dataset_size)
training_size = int(dataset_size * 0.8)
#val_size = dataset_size - training_size
premise = list(df['premise'])
hypothesis = list(df['hypothesis'])
y = list(df['label'])
X_train_tokenized = tokenizer(premise[0:training_size], hypothesis[0:training_size], padding=True, truncation=True, max_length=256)
y_train = y[0:training_size]
X_val_tokenized = tokenizer(premise[training_size:dataset_size], hypothesis[training_size:dataset_size], padding=True, truncation=True, max_length=256)
y_val = y[training_size:dataset_size]
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
return train_dataset, val_dataset
def ray_hp_space(trial):
return {
"learning_rate": tune.loguniform(1e-6, 1e-4),
"per_device_train_batch_size": tune.choice([16, 32, 64, 128]),
}
def model_init(trial):
return AutoModelForSequenceClassification.from_pretrained(
model_hf,
num_labels=3
)
def train_model(per_device_train_batch_size, model, train_dataset, val_dataset, compute_metrics, model_name):
args = TrainingArguments(
output_dir= "/content/driveTraining/",
do_train=True,
do_eval=True,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
seed=42
)
trainer = Trainer(
model=model,
model_init=model_init,
train_dataset=train_dataset,
eval_dataset=val_dataset,
compute_metrics=compute_metrics
)
reporter = CLIReporter(
parameter_columns={
"learning_rate": "lr",
"warmup_steps": "warmup_steps",
"weight_decay": "w_decay"
},
metric_columns=["eval_acc", "eval_loss", "epoch", "eval_f1"],
)
best_trial = trainer.hyperparameter_search(
direction="maximize",
backend="ray",
progress_reporter=reporter,
hp_space=ray_hp_space
)
model, tokenizer = load_model(model_hf)
train_dataset, val_dataset = prepare_dataset(dataset_file, dataset_size, tokenizer)
train_model(per_device_train_batch_size, model, train_dataset, val_dataset, compute_metrics, model_name)
Error:
TuneError Traceback (most recent call last)
<ipython-input-14-d666714d0178> in <module>
168 model, tokenizer = load_model(model_hf)
169 train_dataset, val_dataset = prepare_dataset(dataset_file, dataset_size, tokenizer)
--> 170 train_model(per_device_train_batch_size, model, train_dataset, val_dataset, compute_metrics, model_name)
171 #test_model(model_name, tokenizer)
3 frames
<ipython-input-14-d666714d0178> in train_model(per_device_train_batch_size, model, train_dataset, val_dataset, compute_metrics, model_name)
118 metric_columns=["eval_acc", "eval_loss", "epoch", "eval_f1"],
119 )
--> 120 best_trial = trainer.hyperparameter_search(
121 direction="maximize",
122 backend="ray",
/usr/local/lib/python3.9/dist-packages/transformers/trainer.py in hyperparameter_search(self, hp_space, compute_objective, n_trials, direction, backend, hp_name, **kwargs)
2536 HPSearchBackend.WANDB: run_hp_search_wandb,
2537 }
-> 2538 best_run = backend_dict[backend](self, n_trials, direction, **kwargs)
2539
2540 self.hp_search_backend = None
/usr/local/lib/python3.9/dist-packages/transformers/integrations.py in run_hp_search_ray(trainer, n_trials, direction, **kwargs)
340 dynamic_modules_import_trainable.__mixins__ = trainable.__mixins__
341
--> 342 analysis = ray.tune.run(
343 dynamic_modules_import_trainable,
344 config=trainer.hp_space(None),
/usr/local/lib/python3.9/dist-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, chdir_to_trial_dir, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, reuse_actors, raise_on_failed_trial, callbacks, max_concurrent_trials, trial_executor, _experiment_checkpoint_dir, _remote, _remote_string_queue)
790 if incomplete_trials:
791 if raise_on_failed_trial and not experiment_interrupted_event.is_set():
--> 792 raise TuneError("Trials did not complete", incomplete_trials)
793 else:
794 logger.error("Trials did not complete: %s", incomplete_trials)
TuneError: ('Trials did not complete', [_objective_1091e_00000, _objective_1091e_00001, _objective_1091e_00002, _objective_1091e_00003, _objective_1091e_00004, _objective_1091e_00005, _objective_1091e_00006, _objective_1091e_00007, _objective_1091e_00008, _objective_1091e_00009, _objective_1091e_00010, _objective_1091e_00011, _objective_1091e_00012, _objective_1091e_00013, _objective_1091e_00014, _objective_1091e_00015, _objective_1091e_00016, _objective_1091e_00017, _objective_1091e_00018, _objective_1091e_00019])