My code is as follows:
def train(tokenizer_path, corpus_path, pretrained_model_path, saved_model_path, continue_training, embed_data_path):
    """Train (or continue training) a small TFGPT2LMHeadModel on a JSON corpus.

    Args:
        tokenizer_path: path to a `tokenizers` JSON file loadable by
            ``Tokenizer.from_file``.
        corpus_path: path to a JSON file containing a list of text strings.
        pretrained_model_path: HF model/config path; used only for its config
            when training from scratch, or for full weights when resuming.
        saved_model_path: directory the trained model is written to.
        continue_training: when True, resume from ``pretrained_model_path``
            instead of building a fresh 6-layer model.
        embed_data_path: ``.npy`` file with pretrained input embeddings, used
            only when training from scratch.
    """
    transformers.logging.set_verbosity_info()

    # Load the raw tokenizer and wrap it in a fast GPT-2 tokenizer.
    tokenizer = Tokenizer.from_file(tokenizer_path)
    new_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, unk_token='[UNK]')
    # NOTE(review): eos_token may be None on a custom tokenizer — confirm it is
    # defined, otherwise pad_token stays unset and max_length padding fails.
    new_tokenizer.pad_token = new_tokenizer.eos_token

    # Corpus is expected to be a JSON array of strings (one document each).
    with open(corpus_path, 'r') as f:
        corpus = json.load(f)
    df_corpus = pd.DataFrame(corpus)
    df_corpus.columns = ['text']

    # Hold out 2.5% of documents for validation.
    trn_df, val_df = train_test_split(df_corpus, test_size=0.025, random_state=42)
    trn_ds = Dataset.from_pandas(trn_df.reset_index(drop=True))
    val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))

    def tokenize_function(examples):
        # Causal LM: labels are the input ids; the model shifts them internally.
        output_dict = new_tokenizer(examples["text"], padding='max_length',
                                    truncation=True, max_length=MAX_SEQ_LEN)
        output_dict['labels'] = output_dict["input_ids"].copy()
        return output_dict

    # Tokenize in parallel and drop the raw text column.
    trn_ds_token = trn_ds.map(tokenize_function, batched=True, num_proc=20, remove_columns=["text"])
    val_ds_token = val_ds.map(tokenize_function, batched=True, num_proc=20, remove_columns=["text"])
    print(trn_ds_token)
    print(val_ds_token)

    if not continue_training:
        # Fresh, small GPT-2: 6 layers, 384-d embeddings, custom vocab.
        config = AutoConfig.from_pretrained(pretrained_model_path, vocab_size=VOCAB_SIZE,
                                            n_positions=MAX_SEQ_LEN, n_ctx=MAX_SEQ_LEN,
                                            n_layer=6, n_embd=384)
        model = TFGPT2LMHeadModel.from_config(config)
        embedding = np.load(embed_data_path)
        # NOTE(review): set_input_embeddings expects a tf.Variable / Keras layer
        # in recent transformers versions — a raw numpy array may be rejected.
        # Consider model.transformer.wte.set_weights([embedding]) after a build.
        model.set_input_embeddings(embedding)
    else:
        model = TFGPT2LMHeadModel.from_pretrained(pretrained_model_path)
    # Freeze the token embeddings. The original only froze them when resuming,
    # but the from-scratch branch also loads pretrained embeddings, so freezing
    # applies to both paths — TODO confirm this matches the intended setup.
    model.transformer.wte.trainable = False

    def map_example_to_dict(input_ids, attention_mask, label):
        # Keras expects a (features, labels) pair per element.
        return {"input_ids": input_ids, "attention_mask": attention_mask}, label

    def encode_example(ds, limit=-1):
        # Materialize the HF dataset into a tf.data.Dataset of fixed-length rows.
        print(len(ds))
        input_ids_list = []
        attention_mask_list = []
        label_list = []
        for row in ds:
            input_ids_list.append(row["input_ids"])
            attention_mask_list.append(row["attention_mask"])
            label_list.append(row["labels"])
        return tf.data.Dataset.from_tensor_slices(
            (input_ids_list, attention_mask_list, label_list)).map(map_example_to_dict)

    trn_batches = encode_example(trn_ds_token).batch(128)
    val_batches = encode_example(val_ds_token).batch(128)

    # BUG FIX: the original called model.fit() without compiling, which raises
    # "You must compile your model before training" in Keras. With no explicit
    # loss, transformers' TF models fall back to their internal causal-LM loss
    # (computed from the `labels` key).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))
    model.fit(trn_batches, epochs=4, verbose=1, validation_data=val_batches)
    model.summary()
    # Save once, after training (the original also saved the untrained model
    # to the same path immediately before fit, which was dead work).
    model.save_pretrained(saved_model_path)
Is my input to TFGPT2LMHeadModel correct?