Model miscategorizes sentences with clear class indicators

I have a training set consisting of 60K samples used for vendor classification. Each document is a brief description, and will sometimes have certain keywords or names that indicates the class. For instance, if the document mentions “ebay” it should always be classified as an etailer (without exception). Now most times the model picks up on these just fine, but in some cases it doesn’t and misses keywords that are clear indicators to a human what class it is. For example, I have 200 records that contain the name NEWVENDORNAME along with some other noisy words (‘monthly renewal fee’, ‘deposit’ etc). They are all classified the same, so you would think any model would be able to pick up that when NEWVENDORNAME is mentioned, it is this class.

But after finetuning the model still gets slight variations wrong, such as just ‘NEWVENDORNAME’. I initially thought it was overfitting, but I have tried training for just a single as well as several epochs and it still handles some queries poorly even though it has a glaring keyword that gives its class away.

How do I explain why the model misses these? Is this a common thing that a model can miss picking up on something like this that is fairly obvious to a human even if they are not familiar with the meaning of the keyword itself?

I think there’s something wrong with your training and preprocessing setup. But it’s a guess. I can’t say more unless you share at least which pre-trained model you’ve fine tuned and how you have preprocessed your corpus.

Thanks - here is my training script and param values.

Key Value
_tuning_objective_metric eval_loss
epochs 2
eval_batch_size 64
fp16 true
learning_rate 0.0001
model_id distilbert-base-uncased
num_labels 23
train_batch_size 32
unused 8
weight_decay 0.0005
### Trainging script (train.py)

import os

os.environ['TRANSFORMERS_CACHE'] = "cache"
os.environ['HF_DATASETS_CACHE'] = "cache"
print(os.system('python -m pip install datasets --upgrade'))

from transformers import (
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoTokenizer,
    AutoFeatureExtractor,
    AutoModelForMaskedLM,
    default_data_collator,
    AutoModelForSequenceClassification
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset, Dataset
import random
import logging
import sys
import argparse
import torch
import numpy as np
import pandas as pd
import datasets

import ast

print('datasets.__version__', datasets.__version__)

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_id", type=str)
    parser.add_argument("--num_labels", type=str)
    parser.add_argument("--labels", type=str)
    parser.add_argument("--learning_rate", type=str, default=5e-5)
    parser.add_argument("--train_file", type=str, default="train.DbEmbeddings")
    parser.add_argument("--test_file", type=str, default="test.DbEmbeddings")
    parser.add_argument("--fp16", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    
          
    raw_train_dataset = load_dataset("json", data_files=os.path.join(args.training_dir, args.train_file), cache_dir="opt/ml/input")["train"]
    raw_test_dataset = load_dataset("json", data_files=os.path.join(args.test_dir, args.test_file), cache_dir="opt/ml/input")["train"]
    
    
    print('\nargs.labels', args.labels)
    print('type args.labels', type(args.labels))

    num_labels = int(args.num_labels)
    labels = ast.literal_eval(args.labels)
    
    print('type(args.num_labels)', type(labels))
    raw_train_dataset = raw_train_dataset.cast_column("label", datasets.ClassLabel(num_classes=num_labels, 
                                                                                   names= labels, names_file=None, id=None))
    
    print('\nraw_train_dataset.features', raw_train_dataset.features)

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
 

    def tokenize(examples):
        result = tokenizer(examples["source"], padding=True, truncation=True)

        return result


    # Use batched=True to activate fast multithreading!
    train_dataset = raw_train_dataset.map(
        tokenize, batched=True, batch_size=None
    )
    test_dataset = raw_test_dataset.map(
        tokenize, batched=True, batch_size=None
    )


    train_dataset.reset_format()
    test_dataset.reset_format()
    
    train_dataset.set_format("torch",
                                columns=["input_ids", "attention_mask", "label"])

    test_dataset.set_format(type="pandas")
    df = test_dataset[:]
    df_test, df_valid = np.split(df, [int(.5*len(df))])
    test_data = Dataset.from_pandas(df_test)
    valid_data = Dataset.from_pandas(df_valid)

    test_data = test_data.cast_column("label", datasets.ClassLabel(num_classes=num_labels, 
                                                                 names= labels , 
                                                                   names_file=None, id=None))

    valid_data = valid_data.cast_column("label", datasets.ClassLabel(num_classes=num_labels, 
                                                                 names= labels , names_file=None, 
                                                                   id=None))

    test_data.reset_format()
    test_data.set_format("torch",
                                columns=["input_ids", "attention_mask", "label"])

    valid_data.reset_format()
    valid_data.set_format("torch",
                                columns=["input_ids", "attention_mask", "label"])

    from sklearn.metrics import accuracy_score, f1_score

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}    
        # Saves the model to s3



    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = (AutoModelForSequenceClassification
             .from_pretrained(args.model_id, num_labels=num_labels)
             .to(device))                      

    batch_size = 64
    logging_steps = len(train_dataset) // batch_size
    model_name = f"{args.model_id}-finetuned-d"
    training_args = TrainingArguments(output_dir=model_name,
                                      num_train_epochs=args.epochs,
                                      learning_rate=2e-5,
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      weight_decay=0.01,
                                      evaluation_strategy="epoch",
                                      disable_tqdm=False,
                                      logging_steps=logging_steps,
                                      push_to_hub=False,
                                     # log_level="error"
                                     )                       


    trainer = Trainer(model=model, args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=train_dataset,
                      eval_dataset=valid_data,
                      #eval_dataset=valid_data_down,
                      tokenizer=tokenizer)                       
    trainer.train()
    
    

    preds_output = trainer.predict(test_data)
    
    print('.')
    print('preds_output.metrics:')
    print(preds_output.metrics)
            
    
    trainer.save_model(args.model_dir)
    
                       
    print(f'my_acc: {preds_output.metrics["test_accuracy"]}')

I don’t see anything wrong with your training setup. However I’m curious what’s your test set F1-score is. Simple accuracy isn’t always reliable.

Also, important questions:

  1. Is the class name NEWVENDORNAME in all caps in your training set?
  2. Since you’ve used the default options for the tokenizer, it’ll use the max sequence length of 512. How long are the descriptions in your corpus? If they exceed that length, you’re losing information.
  3. Can you kindly check your validation loss during training and how far is it from your training loss?