PyTorch Trainer giving worse results than TensorFlow

I’m trying to switch from TensorFlow to PyTorch, but I’m getting noticeably worse results when training the PyTorch model with Trainer.

I’m using bert-base-uncased, and as far as I can tell I’m using essentially the same settings for both (batch size, epochs, learning rate, etc.). However, I’m getting an F1 score of 0.9967 from TensorFlow and 0.9446 from PyTorch. The loss also seems to fluctuate a lot more in PyTorch. I’m still pretty new to machine learning and Python in general, so I feel like it has to be something obvious, but I’ve yet to find it. Here are my scripts. Thanks in advance.
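One thing worth flagging up front: neither script pins a random seed, so part of the gap (and the fluctuating loss) could just be run-to-run variance. To rule that out I’ve been meaning to rerun both with something like this (a minimal sketch; the 42 is an arbitrary value, it just needs to match across runs):

from transformers import set_seed
import tensorflow as tf

SEED = 42  # arbitrary value, just kept identical across both runs
set_seed(SEED)            # seeds Python's random, NumPy, and torch
tf.random.set_seed(SEED)  # seeds the Keras/TensorFlow side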

TensorFlow

# imports shared by both scripts below
import numpy as np
import sklearn.utils.class_weight
import tensorflow as tf
import torch
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TFAutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    create_optimizer,
)
from transformers.keras_callbacks import KerasMetricCallback

SEQ_LEN = 256
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def train():
    def preprocess_function(examples):
        # pads everything to SEQ_LEN up front, so the collator's dynamic padding has nothing left to do
        return tokenizer(
            examples["text"],
            max_length=SEQ_LEN,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='tf',
        )

    dataset = load_dataset('json', data_files={"train": "full-items.json", "test": "validation-2.json"})

    tokenized = dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    batch_size = 8
    num_epochs = 4
    batches_per_epoch = len(tokenized["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=4e-5, num_warmup_steps=0, num_train_steps=total_train_steps)


    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    tf_train_set = model.prepare_tf_dataset(
        tokenized["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    tf_validation_set = model.prepare_tf_dataset(
        tokenized["test"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    eval_metrics = evaluate.load("f1")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return eval_metrics.compute(predictions=predictions, references=labels)

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        
    METRICS = [
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
        tf.keras.metrics.SparseCategoricalCrossentropy(from_logits=True, name='sparse_crossentropy'),
    ]

    # note: this callback computes the F1 on the *training* set (eval_dataset=tf_train_set), not the validation set
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_train_set)
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    class_weights = dict(enumerate(sklearn.utils.class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(tokenized["train"]["label"]),
        y=tokenized["train"]["label"],
    )))

    model.compile(optimizer=optimizer, loss=loss, metrics=METRICS)
    model.fit(
        x=tf_train_set,
        validation_data=tf_validation_set,
        epochs=num_epochs,
        class_weight=class_weights,
        callbacks=[early_stop, metric_callback],
    )
    model.save_pretrained('lease_to_own_model', save_format="tf")

PyTorch

def pyTorch():
    # reuses the module-level tokenizer and SEQ_LEN defined above
    def preprocess_function(examples):
        return tokenizer(
            examples["text"],
            max_length=SEQ_LEN,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
        )

    dataset = load_dataset('json', data_files={"train": "full-items.json", "test": "validation-2.json"})
    tokenized = dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    eval_f1 = evaluate.load("f1")
    eval_accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        f1 = eval_f1.compute(predictions=predictions, references=labels)
        accuracy = eval_accuracy.compute(predictions=predictions, references=labels)
        return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )
    device = torch.device("cuda")
    model.to(device)

    batch_size = 8

    training_args = TrainingArguments(
        num_train_epochs=4,
        output_dir="pytorch",
        learning_rate=4e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        logging_strategy="epoch",
        warmup_steps=0,
    )

    class_weights = sklearn.utils.class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(tokenized["train"]["label"]),
        y=tokenized["train"]["label"],
    )
    weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    class CustomTrainer(Trainer):
        # weighted cross-entropy, meant to mirror the class_weight argument passed to fit() in Keras
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    trainer.save_model("pytorch")