I want to load a Hugging Face model from a local folder and fine-tune it on my own data.
However, I get this error:
OSError: Incorrect path_or_model_id: '/distilgpt2'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
# https://www.linkedin.com/advice/1/how-do-you-use-hugging-face-natural-language-processing-q4gve
# Models
# microsoft/phi-1_5
# distilbert/distilbert-base-uncased
# google-t5/t5-base
# microsoft/deberta-v3-base
import pandas as pd
import torch
import transformers as tm #import BertLMHeadModel, AutoModelForMaskedLM, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel, AutoModelForSequenceClassification, GPT2LMHeadModel,PhiForCausalLM, GPT2Tokenizer, AutoTokenizer, GPT2TokenizerFast
import os
import datasets as ds
from trl import SFTTrainer
import json
# Fine-tune a locally stored causal language model on the "input_field" text
# of the development set using TRL's SFTTrainer.

# The dataset lives in a sibling directory of the current working directory.
# os.path.join replaces the hand-written backslashes ('\d' is an invalid
# string escape) and keeps the path portable across platforms.
parent = os.path.dirname(os.getcwd())
fileName = os.path.join(
    parent, "amazon-kdd-cup-2024-starter-kit", "data", "development.json"
)

# lines=True: the file is read as JSON Lines (one JSON object per line).
# pandas opens and closes the file itself, so no separate open() is needed.
myData = pd.read_json(fileName, lines=True)

# Keep only the two columns used downstream. Selecting them directly replaces
# the per-row loop that rebuilt the same zipped list on every iteration (and
# shadowed the builtin `tuple`).
testData = myData[["input_field", "is_multiple_choice"]].reset_index(drop=True)
testDataSet = ds.Dataset.from_pandas(testData)
print(testDataSet["input_field"])
print(testDataSet["is_multiple_choice"])

# Path to the local model folder. A leading "/" (as in "/distilgpt2") means
# "a folder at the filesystem root", which is why from_pretrained raised
# "Incorrect path_or_model_id". Build an explicit absolute path instead.
# NOTE(review): assumes the "distilgpt2" folder (config.json, weights and
# tokenizer files) sits in the current working directory -- adjust if it is
# stored elsewhere.
model_name = os.path.join(os.getcwd(), "distilgpt2")
if not os.path.isdir(model_name):
    raise FileNotFoundError(
        f"Local model folder not found: {model_name!r}. Point model_name at "
        "the directory that contains config.json and the model weights."
    )

model = tm.AutoModelForCausalLM.from_pretrained(model_name)
# The tokenizer must come from AutoTokenizer; loading it with
# AutoModelForCausalLM returns a second model, not a tokenizer.
tokenizer = tm.AutoTokenizer.from_pretrained(model_name, use_fast=False)
# GPT-2 style tokenizers ship without a padding token; batching needs one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

training_args = tm.TrainingArguments(
    output_dir="./",
    per_device_train_batch_size=16,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=3,
    gradient_accumulation_steps=2,  # simulate larger batch sizes
)

# NOTE(review): max_seq_length=3 truncates every example to three tokens;
# kept as in the original, but this is almost certainly too small -- confirm.
trainer = SFTTrainer(
    model,
    args=training_args,  # previously built but never handed to the trainer
    train_dataset=testDataSet,
    dataset_text_field="input_field",
    max_seq_length=3,
    tokenizer=tokenizer,
)
trainer.train()
How exactly should I specify the path to the local model folder in `from_pretrained`?