Hello,
I have a very simple training loop built with the `Trainer` class from
`transformers`, and I am having a hard time understanding the graphs that
wandb is tracking for it.
I have a couple of questions about these graphs:
- Why does the global step go back to zero after some time?
- Why is the graph for `train/epoch` not linear? What causes this?
Here is my training script:
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# import torch
# import torch.nn as nn
# import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, OPTConfig
from datasets import load_dataset
import transformers
def main():
    """Train an OPT causal language model from scratch and save it.

    The model is built from an ``OPTConfig`` (this script loads no pretrained
    model weights); only the tokenizer comes from ``facebook/opt-125m``. The
    ``deven367/babylm-10M-switchboard`` dataset is tokenized, the model is
    trained for 5 epochs with ``transformers.Trainer`` (evaluating on the
    ``valid`` split once per epoch), and the result is written to
    ``trained-opt125M-5-epochs-switchboard``.
    """
    # Shortened context window: the model can embed at most this many positions.
    cfg = OPTConfig(max_position_embeddings=512)
    model = AutoModelForCausalLM.from_config(cfg)
    print(f'Model being used: {model.config}')

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    ds = load_dataset("deven367/babylm-10M-switchboard")

    def tokenize(samples):
        """Tokenize a batch of rows, clipped to the model's context length."""
        # Without truncation, a sample longer than the configured number of
        # positions would fail inside the model's position-embedding lookup.
        return tokenizer(
            samples["text"],
            truncation=True,
            max_length=cfg.max_position_embeddings,
        )

    data = ds.map(tokenize, batched=True)

    training_args = transformers.TrainingArguments(
        # NOTE(review): with auto_find_batch_size enabled, an out-of-memory
        # error presumably makes the Trainer restart training with a smaller
        # batch size. That restart would explain a global step that drops
        # back to zero and a non-linear train/epoch curve in the logged
        # graphs -- confirm against the Trainer documentation.
        auto_find_batch_size=True,
        per_device_train_batch_size=2048,
        per_device_eval_batch_size=2048,
        # 2048 samples x 4 accumulation steps = 8192 samples per device per
        # optimizer step (if the starting batch size is not reduced).
        gradient_accumulation_steps=4,
        num_train_epochs=5.0,
        evaluation_strategy='epoch',
        learning_rate=2e-4,
        logging_steps=1,  # log at every step
        output_dir="outputs",
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=data["train"],
        eval_dataset=data["valid"],
        args=training_args,
        # mlm=False selects causal (next-token) language-modeling batches.
        data_collator=transformers.DataCollatorForLanguageModeling(
            tokenizer, mlm=False
        ),
    )

    # Silence the warnings. Please re-enable for inference!
    model.config.use_cache = False

    trainer.train()

    # Save the model.
    trainer.save_model("trained-opt125M-5-epochs-switchboard")
# Run the training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()