Training Loss 0.0000 and Validation Loss nan

When I call trainer.train(), I get a training loss of 0.0000 and a validation loss of nan. My setup is:


BASE_MODEL = "google/gemma-2-2b-it"

# NOTE: `truncation` / `padding` are per-call encode options (or data-collator
# options), not `from_pretrained` kwargs — passing them here has no effect on
# later tokenization calls, so they are dropped.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Dynamic padding: each batch is padded to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=780,
    id2label=id2label_1,
    label2id=label2id,
)
model.config.problem_type = 'single_label_classification'
# Keep the model's pad id aligned with the tokenizer so padded positions are
# handled consistently by the sequence-classification head.
model.config.pad_token_id = tokenizer.pad_token_id

trainer = Trainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    args=TrainingArguments(
        output_dir="./content/drive/MyDrive/output_1002/",
        remove_unused_columns=True,
        # 1e-3 is far too high for full fine-tuning of a 2B-parameter model:
        # the logits blow up within a few steps, which is the classic cause of
        # "training loss 0.0000 / eval loss nan". Use a typical fine-tuning LR.
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=1,
        optim='paged_adamw_8bit',
        eval_strategy='steps',
        save_strategy='steps',
        # load_best_model_at_end requires eval/save to actually fire and to
        # line up; with the default 500-step intervals nothing runs before
        # max_steps=10, so pin both intervals explicitly.
        eval_steps=5,
        save_steps=5,
        logging_dir='./logs',
        logging_steps=1,
        max_steps=10,
        save_total_limit=2,
        push_to_hub=False,
        bf16=True,
        load_best_model_at_end=True,
        report_to='none',
        max_grad_norm=1.0,
    ),
    data_collator=data_collator,
    tokenizer=tokenizer,
)

I've tried:
learning rates from 2e-7 to 1e-3,
max_steps from 10 to 100000,
and fp16 set to both True and False — the problem persists in every case.

and result of below code is

print(tokenized_datasets['train'][0])

{'label': 61, 'input_ids': [2, 7639, 1162, 9160, 9999, 573, 13107, 576, 3757, 4063, 235303, 24261, 611, 65707, 5158, 578, 24023], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

and model.config is

Gemma2Config {
  "_name_or_path": "google/gemma-2-2b-it",
  "architectures": [
    "Gemma2ForSequenceClassification"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "id2label": {
    "0": "-0.003",
    "1": "0.006",
    "2": "-0.002",
    "3": "0.005",
    "4": "-0.021",
    "5": "-0.02",
    "6": "0.041",
    "7": "0.035",
    "8": "0.037",
    "9": "0.042",
    "10": "0.044",
    "11": "-0.022",
    "12": "0.01",
    "13": "-0.008",
    "14": "0.016",
    "15": "0.011",
    "16": "0.013",
    "17": "0.004",
    "18": "0.001",
    "19": "-0.0",
    "20": "0.0",
    "21": "0.046",
    "22": "0.047",
    "23": "0.045",
    "24": "-0.023",
    "25": "0.043",
    "26": "0.027",
    "27": "-0.013",
    "28": "0.025",
    "29": "-0.012",
    "30": "-0.011",
    "31": "-0.007",
    "32": "0.026",
    "33": "0.021",
    "34": "0.024",
    "35": "0.023",
    "36": "0.022",
    "37": "0.007",
    "38": "0.155",
    "39": "-0.076",
    "40": "-0.077",
    "41": "-0.078",
    "42": "-0.074",
    "43": "0.157",
    "44": "0.012",
    "45": "-0.006",
    "46": "0.009",
    "47": "-0.004",
    "48": "-0.005",
    "49": "-0.024",
    "50": "0.036",
    "51": "0.017",
    "52": "0.018",
    "53": "0.019",
    "54": "-0.009",
    "55": "0.02",
    "56": "-0.01",
    "57": "0.015",
    "58": "0.008",
    "59": "0.003",
    "60": "0.002",
    "61": "-0.001",
    "62": "-0.026",
    "63": "-0.025",
    "64": "0.051",
    "65": "0.052",
    "66": "0.033",
    "67": "0.053",
    "68": "0.049",
    "69": "-0.017",
    "70": "0.05",
    "71": "-0.051",
    "72": "-0.05",
    "73": "0.069",
    "74": "-0.049",
    "75": "0.097",
    "76": "-0.038",
    "77": "-0.047",
    "78": "-0.04",
    "79": "-0.041",
    "80": "0.104",
    "81": "-0.048",
    "82": "0.081",
    "83": "-0.034",
    "84": "0.107",
    "85": "0.106",
    "86": "-0.045",
    "87": "0.085",
    "88": "-0.052",
    "89": "-0.053",
    "90": "-0.039",
    "91": "0.105",
    "92": "-0.029",
    "93": "-0.015",
    "94": "-0.016",
    "95": "0.034",
    "96": "0.032",
    "97": "0.031",
    "98": "0.058",
    "99": "-0.028",
    "100": "0.057",
    "101": "0.055",
    "102": "-0.027",
    "103": "0.014",
    "104": "-0.014",
    "105": "0.028",
    "106": "-0.033",
    "107": "0.061",
    "108": "0.059",
    "109": "0.067",
    "110": "-0.031",
    "111": "-0.032",
    "112": "-0.018",
    "113": "0.063",
    "114": "0.054",
    "115": "0.04",
    "116": "-0.019",
    "117": "0.195",
    "118": "-0.101",
    "119": "0.207",
    "120": "-0.099",
    "121": "-0.097",
    "122": "0.175",
    "123": "-0.098",
    "124": "0.092",
    "125": "-0.043",
    "126": "-0.046",
    "127": "0.068",
    "128": "-0.037",
    "129": "-0.042",
    "130": "0.093",
    "131": "0.087",
    "132": "0.094",
    "133": "0.091",
    "134": "-0.044",
    "135": "0.089",
    "136": "0.126",
    "137": "-0.062",
    "138": "-0.063",
    "139": "-0.06",
    "140": "-0.058",
    "141": "0.128",
    "142": "-0.061",
    "143": "0.115",
    "144": "-0.056",
    "145": "-0.057",
    "146": "-0.054",
    "147": "0.079",
    "148": "0.095",
    "149": "0.084",
    "150": "0.038",
    "151": "0.103",
    "152": "0.102",
    "153": "0.101",
    "154": "0.113",
    "155": "0.122",
    "156": "0.111",
    "157": "0.125",
    "158": "0.096",
    "159": "-0.055",
    "160": "0.086",
    "161": "0.117",
    "162": "0.088",
    "163": "0.09",
    "164": "-0.035",
    "165": "0.074",
    "166": "0.098",
    "167": "0.099",
    "168": "0.029",
    "169": "0.03",
    "170": "0.066",
    "171": "-0.083",
    "172": "-0.084",
    "173": "-0.082",
    "174": "0.048",
    "175": "0.064",
    "176": "0.065",
    "177": "-0.03",
    "178": "0.039",
    "179": "0.073",
    "180": "0.075",
    "181": "0.07",
    "182": "0.078",
    "183": "-0.036",
    "184": "0.08",
    "185": "0.056",
    "186": "0.082",
    "187": "0.077",
    "188": "-0.073",
    "189": "-0.08",
    "190": "-0.081",
    "191": "-0.075",
    "192": "0.168",
    "193": "0.174",
    "194": "-0.086",
    "195": "0.177",
    "196": "0.171",
    "197": "0.17",
    "198": "0.076",
    "199": "0.16",
    "200": "-0.079",
    "201": "0.159",
    "202": "-0.064",
    "203": "-0.106",
    "204": "0.145",
    "205": "-0.103",
    "206": "-0.105",
    "207": "-0.104",
    "208": "0.208",
    "209": "-0.07",
    "210": "-0.071",
    "211": "-0.069",
    "212": "-0.09",
    "213": "-0.091",
    "214": "0.142",
    "215": "0.187",
    "216": "0.18",
    "217": "0.169",
    "218": "0.162",
    "219": "-0.059",
    "220": "0.138",
    "221": "0.153",
    "222": "-0.072",
    "223": "0.15",
    "224": "-0.065",
    "225": "0.123",
    "226": "0.12",
    "227": "0.124",
    "228": "0.121",
    "229": "-0.113",
    "230": "-0.109",
    "231": "0.119",
    "232": "-0.066",
    "233": "0.136",
    "234": "-0.067",
    "235": "0.137",
    "236": "0.109",
    "237": "0.108",
    "238": "0.163",
    "239": "0.156",
    "240": "-0.068",
    "241": "0.135",
    "242": "0.134",
    "243": "0.1",
    "244": "0.118",
    "245": "0.129",
    "246": "0.13",
    "247": "0.06",
    "248": "0.062",
    "249": "0.143",
    "250": "0.165",
    "251": "0.164",
    "252": "0.172",
    "253": "0.167",
    "254": "0.139",
    "255": "0.147",
    "256": "0.149",
    "257": "0.14",
    "258": "0.2",
    "259": "0.202",
    "260": "-0.088",
    "261": "0.194",
    "262": "0.21",
    "263": "-0.108",
    "264": "-0.107",
    "265": "0.197",
    "266": "0.213",
    "267": "0.11",
    "268": "-0.1",
    "269": "-0.092",
    "270": "0.192",
    "271": "0.141",
    "272": "0.112",
    "273": "0.293",
    "274": "-0.136",
    "275": "-0.144",
    "276": "-0.142",
    "277": "0.146",
    "278": "0.133",
    "279": "0.071",
    "280": "0.083",
    "281": "-0.185",
    "282": "-0.169",
    "283": "-0.18",
    "284": "0.379",
    "285": "-0.143",
    "286": "0.199",
    "287": "-0.184",
    "288": "0.148",
    "289": "0.161",
    "290": "0.154",
    "291": "0.144",
    "292": "0.116",
    "293": "0.158",
    "294": "0.22",
    "295": "-0.111",
    "296": "-0.112",
    "297": "0.251",
    "298": "0.259",
    "299": "0.266",
    "300": "0.265",
    "301": "0.072",
    "302": "0.114",
    "303": "0.131",
    "304": "0.132",
    "305": "-0.093",
    "306": "0.211",
    "307": "-0.102",
    "308": "-0.096",
    "309": "-0.095",
    "310": "0.219",
    "311": "-0.126",
    "312": "-0.133",
    "313": "-0.138",
    "314": "-0.137",
    "315": "-0.087",
    "316": "0.196",
    "317": "0.32",
    "318": "-0.157",
    "319": "-0.158",
    "320": "0.185",
    "321": "0.321",
    "322": "-0.094",
    "323": "-0.152",
    "324": "0.322",
    "325": "0.318",
    "326": "0.179",
    "327": "-0.089",
    "328": "0.183",
    "329": "0.182",
    "330": "0.227",
    "331": "0.224",
    "332": "0.191",
    "333": "0.205",
    "334": "0.173",
    "335": "0.178",
    "336": "-0.085",
    "337": "0.127",
    "338": "-0.145",
    "339": "0.273",
    "340": "-0.141",
    "341": "0.218",
    "342": "-0.11",
    "343": "0.225",
    "344": "0.189",
    "345": "0.186",
    "346": "-0.116",
    "347": "0.232",
    "348": "0.226",
    "349": "0.176",
    "350": "0.201",
    "351": "0.235",
    "352": "-0.115",
    "353": "0.166",
    "354": "0.248",
    "355": "-0.125",
    "356": "-0.121",
    "357": "-0.124",
    "358": "0.188",
    "359": "0.231",
    "360": "-0.119",
    "361": "0.243",
    "362": "-0.114",
    "363": "-0.123",
    "364": "-0.129",
    "365": "-0.118",
    "366": "0.253",
    "367": "-0.128",
    "368": "0.215",
    "369": "0.214",
    "370": "0.24",
    "371": "0.233",
    "372": "0.152",
    "373": "0.212",
    "374": "-0.127",
    "375": "-0.117",
    "376": "0.328",
    "377": "-0.164",
    "378": "-0.168",
    "379": "-0.166",
    "380": "0.257",
    "381": "0.25",
    "382": "0.193",
    "383": "0.238",
    "384": "0.223",
    "385": "0.247",
    "386": "0.244",
    "387": "-0.153",
    "388": "-0.151",
    "389": "0.237",
    "390": "0.221",
    "391": "0.206",
    "392": "-0.163",
    "393": "-0.148",
    "394": "-0.162",
    "395": "0.334",
    "396": "0.229",
    "397": "0.216",
    "398": "0.151",
    "399": "0.649",
    "400": "-0.337",
    "401": "-0.301",
    "402": "0.264",
    "403": "0.269",
    "404": "-0.131",
    "405": "0.217",
    "406": "0.222",
    "407": "0.181",
    "408": "0.278",
    "409": "0.288",
    "410": "0.203",
    "411": "0.23",
    "412": "0.236",
    "413": "0.337",
    "414": "0.277",
    "415": "-0.13",
    "416": "-0.156",
    "417": "0.256",
    "418": "0.252",
    "419": "0.184",
    "420": "0.204",
    "421": "0.258",
    "422": "0.291",
    "423": "0.275",
    "424": "-0.154",
    "425": "0.261",
    "426": "0.26",
    "427": "-0.132",
    "428": "0.28",
    "429": "-0.135",
    "430": "0.234",
    "431": "0.19",
    "432": "0.198",
    "433": "-0.14",
    "434": "0.299",
    "435": "-0.146",
    "436": "0.297",
    "437": "0.289",
    "438": "-0.139",
    "439": "-0.12",
    "440": "0.249",
    "441": "0.283",
    "442": "0.271",
    "443": "0.228",
    "444": "0.305",
    "445": "-0.15",
    "446": "0.302",
    "447": "0.306",
    "448": "0.3",
    "449": "-0.147",
    "450": "0.209",
    "451": "-0.231",
    "452": "0.414",
    "453": "-0.23",
    "454": "0.336",
    "455": "-0.165",
    "456": "-0.167",
    "457": "0.281",
    "458": "-0.149",
    "459": "0.461",
    "460": "-0.229",
    "461": "0.463",
    "462": "0.309",
    "463": "-0.193",
    "464": "-0.122",
    "465": "0.241",
    "466": "0.267",
    "467": "0.276",
    "468": "0.274",
    "469": "0.239",
    "470": "-0.155",
    "471": "0.303",
    "472": "0.242",
    "473": "0.255",
    "474": "0.263",
    "475": "-0.256",
    "476": "-0.251",
    "477": "-0.255",
    "478": "0.438",
    "479": "0.383",
    "480": "0.446",
    "481": "-0.219",
    "482": "0.443",
    "483": "-0.217",
    "484": "0.292",
    "485": "0.295",
    "486": "0.262",
    "487": "0.245",
    "488": "0.296",
    "489": "0.294",
    "490": "-0.134",
    "491": "-0.16",
    "492": "0.254",
    "493": "0.284",
    "494": "0.279",
    "495": "0.349",
    "496": "-0.171",
    "497": "-0.17",
    "498": "-0.247",
    "499": "-0.241",
    "500": "0.331",
    "501": "-0.161",
    "502": "0.327",
    "503": "0.33",
    "504": "0.329",
    "505": "0.378",
    "506": "-0.186",
    "507": "-0.181",
    "508": "0.343",
    "509": "-0.182",
    "510": "0.365",
    "511": "0.246",
    "512": "-0.199",
    "513": "-0.196",
    "514": "-0.198",
    "515": "-0.195",
    "516": "0.405",
    "517": "-0.177",
    "518": "-0.179",
    "519": "-0.188",
    "520": "0.272",
    "521": "0.282",
    "522": "-0.159",
    "523": "-0.225",
    "524": "-0.226",
    "525": "0.46",
    "526": "0.391",
    "527": "0.34",
    "528": "-0.189",
    "529": "-0.194",
    "530": "0.408",
    "531": "-0.222",
    "532": "-0.224",
    "533": "0.324",
    "534": "0.287",
    "535": "-0.191",
    "536": "-0.19",
    "537": "-0.206",
    "538": "-0.213",
    "539": "-0.212",
    "540": "0.285",
    "541": "0.326",
    "542": "0.335",
    "543": "0.29",
    "544": "0.301",
    "545": "0.286",
    "546": "-0.178",
    "547": "0.37",
    "548": "0.374",
    "549": "0.362",
    "550": "-0.175",
    "551": "0.401",
    "552": "0.403",
    "553": "0.31",
    "554": "0.307",
    "555": "0.304",
    "556": "0.308",
    "557": "0.27",
    "558": "0.407",
    "559": "0.406",
    "560": "-0.203",
    "561": "0.404",
    "562": "0.4",
    "563": "-0.205",
    "564": "0.333",
    "565": "-1.134",
    "566": "-0.333",
    "567": "-0.33",
    "568": "-0.22",
    "569": "-0.218",
    "570": "-0.187",
    "571": "-0.173",
    "572": "0.341",
    "573": "0.268",
    "574": "0.348",
    "575": "0.347",
    "576": "0.523",
    "577": "-0.262",
    "578": "-0.25",
    "579": "0.516",
    "580": "-0.174",
    "581": "-0.172",
    "582": "0.339",
    "583": "0.357",
    "584": "0.354",
    "585": "0.352",
    "586": "0.444",
    "587": "-0.192",
    "588": "0.493",
    "589": "-0.237",
    "590": "-0.483",
    "591": "0.905",
    "592": "-0.473",
    "593": "-0.488",
    "594": "0.325",
    "595": "0.355",
    "596": "-0.176",
    "597": "-1.061",
    "598": "0.393",
    "599": "0.311",
    "600": "0.381",
    "601": "0.396",
    "602": "0.521",
    "603": "-0.489",
    "604": "0.993",
    "605": "0.387",
    "606": "0.504",
    "607": "-0.248",
    "608": "-0.267",
    "609": "-0.269",
    "610": "0.539",
    "611": "-0.263",
    "612": "0.389",
    "613": "-0.209",
    "614": "-0.208",
    "615": "0.427",
    "616": "0.316",
    "617": "0.429",
    "618": "-0.211",
    "619": "-0.242",
    "620": "-0.239",
    "621": "-0.243",
    "622": "0.454",
    "623": "0.518",
    "624": "-0.233",
    "625": "-0.2",
    "626": "0.398",
    "627": "-0.236",
    "628": "-0.228",
    "629": "-0.235",
    "630": "0.332",
    "631": "0.345",
    "632": "0.344",
    "633": "0.317",
    "634": "1.059",
    "635": "1.147",
    "636": "0.323",
    "637": "-0.272",
    "638": "0.544",
    "639": "-0.271",
    "640": "0.298",
    "641": "0.38",
    "642": "0.367",
    "643": "-0.183",
    "644": "0.375",
    "645": "-0.258",
    "646": "-0.26",
    "647": "0.482",
    "648": "-0.326",
    "649": "-0.325",
    "650": "0.615",
    "651": "-0.234",
    "652": "0.488",
    "653": "-0.232",
    "654": "-0.238",
    "655": "0.425",
    "656": "0.413",
    "657": "0.368",
    "658": "0.361",
    "659": "-0.254",
    "660": "-0.253",
    "661": "0.314",
    "662": "0.315",
    "663": "0.342",
    "664": "0.358",
    "665": "0.366",
    "666": "-0.24",
    "667": "0.487",
    "668": "0.483",
    "669": "0.496",
    "670": "-0.244",
    "671": "-0.207",
    "672": "0.497",
    "673": "-0.438",
    "674": "-0.439",
    "675": "-0.441",
    "676": "-0.394",
    "677": "0.474",
    "678": "0.399",
    "679": "-0.321",
    "680": "-0.314",
    "681": "-0.316",
    "682": "0.376",
    "683": "-0.227",
    "684": "-0.204",
    "685": "0.426",
    "686": "-0.214",
    "687": "0.39",
    "688": "0.431",
    "689": "-0.535",
    "690": "0.371",
    "691": "0.356",
    "692": "0.346",
    "693": "-0.223",
    "694": "0.409",
    "695": "0.554",
    "696": "0.582",
    "697": "0.595",
    "698": "-0.216",
    "699": "0.535",
    "700": "-0.257",
    "701": "0.364",
    "702": "0.433",
    "703": "0.441",
    "704": "0.44",
    "705": "-0.215",
    "706": "0.35",
    "707": "-0.21",
    "708": "-0.201",
    "709": "0.382",
    "710": "-0.27",
    "711": "-0.266",
    "712": "0.538",
    "713": "0.546",
    "714": "0.532",
    "715": "0.363",
    "716": "0.473",
    "717": "-0.202",
    "718": "0.394",
    "719": "-0.197",
    "720": "0.372",
    "721": "0.384",
    "722": "0.312",
    "723": "0.313",
    "724": "0.319",
    "725": "0.574",
    "726": "-0.279",
    "727": "-0.304",
    "728": "-0.309",
    "729": "-0.312",
    "730": "0.353",
    "731": "0.36",
    "732": "0.485",
    "733": "0.373",
    "734": "0.533",
    "735": "0.543",
    "736": "-0.268",
    "737": "-0.292",
    "738": "-0.289",
    "739": "0.591",
    "740": "-0.221",
    "741": "0.511",
    "742": "0.51",
    "743": "0.359",
    "744": "-0.287",
    "745": "-0.3",
    "746": "0.377",
    "747": "0.417",
    "748": "-0.252",
    "749": "0.519",
    "750": "0.464",
    "751": "0.397",
    "752": "0.467",
    "753": "0.369",
    "754": "-0.323",
    "755": "-0.327",
    "756": "0.447",
    "757": "0.419",
    "758": "0.455",
    "759": "0.385",
    "760": "-0.261",
    "761": "0.457",
    "762": "0.428",
    "763": "0.432",
    "764": "0.452",
    "765": "0.338",
    "766": "0.351",
    "767": "0.54",
    "768": "0.439",
    "769": "0.537",
    "770": "0.416",
    "771": "0.43",
    "772": "0.421",
    "773": "-0.276",
    "774": "0.458",
    "775": "0.411",
    "776": "0.423",
    "777": "-0.805",
    "778": "-0.283",
    "779": "-0.259"
  },
  "initializer_range": 0.02,
  "intermediate_size": 9216,
   "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 256000
}

I removed label2id from the config output above because it is too long to include.

What should I do to solve this problem?

1 Like

I solved my problem by fine-tuning with LoRA (PEFT's LoraConfig) instead of full fine-tuning.

image

1 Like