Hi, after successfully fine-tuning a GPT-2 model I wanted to try fine-tuning a Longformer model, as the sequences I use are very long.
Essentially I duplicated the code I used for GPT-2 with some minor adjustments.
The data I use is a table with two columns: text and label.
I trained the model for 10 epochs.
Looking at the training log file, it seems that the loss wasn't improving during training.
Could it be that something is missing from the data that I need to add (maybe something that GPT-2 doesn't need but Longformer does)?
Here is a snippet of the training code:
import os

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    LongformerConfig,
    LongformerForSequenceClassification,
    LongformerTokenizer,
    Trainer,
    TrainingArguments,
)

model_name_or_path = 'allenai/longformer-base-4096'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def tokenize_function(examples):
    tokenizer = LongformerTokenizer.from_pretrained(model_name_or_path)
    return tokenizer(examples["text"], max_length=4096, truncation=True, padding='max_length')


if __name__ == '__main__':
    model_config = LongformerConfig()
    model = LongformerForSequenceClassification.from_pretrained(model_name_or_path)
    model.to(device)

    # keep only texts that are neither trivially short nor longer than the model's window
    query = "(text.str.len() <= 4096) & (text.str.len() > 20)"

    train_df = pd.read_csv(os.path.join(os.path.abspath(os.getcwd()), r'train.csv'),
                           header=None, names=['text', 'label']).fillna(' ')
    train_df = train_df.query(query, engine="python")
    train_dataset = Dataset.from_pandas(train_df)
    tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True)

    test_df = pd.read_csv(os.path.join(os.path.abspath(os.getcwd()), r'eval.csv'),
                          header=None, names=['text', 'label']).fillna(' ')
    test_df = test_df.query(query, engine="python")
    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_datasets = test_dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir=os.path.join(os.path.abspath(os.getcwd()), 'results'),   # output directory
        overwrite_output_dir=True,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=10,              # total number of training epochs
        per_device_train_batch_size=2,    # batch size per device during training
        per_device_eval_batch_size=2,     # batch size for evaluation
        warmup_steps=500,                 # number of warmup steps for the learning rate scheduler
        weight_decay=0.01,                # strength of weight decay
        logging_dir=os.path.join(os.path.abspath(os.getcwd()), 'results'),  # directory for storing logs
        logging_steps=10,
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_datasets,
        eval_dataset=tokenized_test_datasets,
    )
    trainer.train()
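One thing I wasn't sure about, and it may be related to my question above about the data: does Longformer need a global_attention_mask, which GPT-2 obviously doesn't? I don't know whether LongformerForSequenceClassification sets one for me when I don't pass it. If I had to add it myself, this is roughly what I had in mind as a change to tokenize_function (just a sketch, I haven't actually tried it):

def tokenize_function(examples):
    tokenizer = LongformerTokenizer.from_pretrained(model_name_or_path)
    encoded = tokenizer(examples["text"], max_length=4096, truncation=True, padding='max_length')
    # sketch: global attention on the first (<s>) token of every sequence, local attention elsewhere
    encoded["global_attention_mask"] = [[1] + [0] * (len(ids) - 1) for ids in encoded["input_ids"]]
    return encoded

Is something like that necessary, or does the model already handle it on its own?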