My code is as follows:
def train(tokenizer_path, corpus_path, pretrained_model_path, saved_model_path, continue_training, embed_data_path):
    """Train (or continue training) a small TFGPT2LMHeadModel on a JSON corpus.

    Args:
        tokenizer_path: path to a `tokenizers` JSON file loadable by
            ``Tokenizer.from_file``.
        corpus_path: path to a JSON file containing a list of text strings.
        pretrained_model_path: HF model/config path; used only for its config
            when training from scratch, or for full weights when resuming.
        saved_model_path: directory the trained model is written to.
        continue_training: when True, resume from ``pretrained_model_path``
            instead of building a fresh 6-layer model.
        embed_data_path: ``.npy`` file with pretrained input embeddings, used
            only when training from scratch.
    """
    transformers.logging.set_verbosity_info()

    # Load the raw tokenizer and wrap it in a fast GPT-2 tokenizer.
    tokenizer = Tokenizer.from_file(tokenizer_path)
    new_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, unk_token='[UNK]')
    # NOTE(review): eos_token may be None on a custom tokenizer — confirm it is
    # defined, otherwise pad_token stays unset and max_length padding fails.
    new_tokenizer.pad_token = new_tokenizer.eos_token

    # Corpus is expected to be a JSON array of strings (one document each).
    with open(corpus_path, 'r') as f:
        corpus = json.load(f)
    df_corpus = pd.DataFrame(corpus)
    df_corpus.columns = ['text']

    # Hold out 2.5% of documents for validation.
    trn_df, val_df = train_test_split(df_corpus, test_size=0.025, random_state=42)
    trn_ds = Dataset.from_pandas(trn_df.reset_index(drop=True))
    val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))

    def tokenize_function(examples):
        # Causal LM: labels are the input ids; the model shifts them internally.
        output_dict = new_tokenizer(examples["text"], padding='max_length',
                                    truncation=True, max_length=MAX_SEQ_LEN)
        output_dict['labels'] = output_dict["input_ids"].copy()
        return output_dict

    # Tokenize in parallel and drop the raw text column.
    trn_ds_token = trn_ds.map(tokenize_function, batched=True, num_proc=20, remove_columns=["text"])
    val_ds_token = val_ds.map(tokenize_function, batched=True, num_proc=20, remove_columns=["text"])
    print(trn_ds_token)
    print(val_ds_token)

    if not continue_training:
        # Fresh, small GPT-2: 6 layers, 384-d embeddings, custom vocab.
        config = AutoConfig.from_pretrained(pretrained_model_path, vocab_size=VOCAB_SIZE,
                                            n_positions=MAX_SEQ_LEN, n_ctx=MAX_SEQ_LEN,
                                            n_layer=6, n_embd=384)
        model = TFGPT2LMHeadModel.from_config(config)
        embedding = np.load(embed_data_path)
        # NOTE(review): set_input_embeddings expects a tf.Variable / Keras layer
        # in recent transformers versions — a raw numpy array may be rejected.
        # Consider model.transformer.wte.set_weights([embedding]) after a build.
        model.set_input_embeddings(embedding)
    else:
        model = TFGPT2LMHeadModel.from_pretrained(pretrained_model_path)
    # Freeze the token embeddings. The original only froze them when resuming,
    # but the from-scratch branch also loads pretrained embeddings, so freezing
    # applies to both paths — TODO confirm this matches the intended setup.
    model.transformer.wte.trainable = False

    def map_example_to_dict(input_ids, attention_mask, label):
        # Keras expects a (features, labels) pair per element.
        return {"input_ids": input_ids, "attention_mask": attention_mask}, label

    def encode_example(ds, limit=-1):
        # Materialize the HF dataset into a tf.data.Dataset of fixed-length rows.
        print(len(ds))
        input_ids_list = []
        attention_mask_list = []
        label_list = []
        for row in ds:
            input_ids_list.append(row["input_ids"])
            attention_mask_list.append(row["attention_mask"])
            label_list.append(row["labels"])
        return tf.data.Dataset.from_tensor_slices(
            (input_ids_list, attention_mask_list, label_list)).map(map_example_to_dict)

    trn_batches = encode_example(trn_ds_token).batch(128)
    val_batches = encode_example(val_ds_token).batch(128)

    # BUG FIX: the original called model.fit() without compiling, which raises
    # "You must compile your model before training" in Keras. With no explicit
    # loss, transformers' TF models fall back to their internal causal-LM loss
    # (computed from the `labels` key).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))
    model.fit(trn_batches, epochs=4, verbose=1, validation_data=val_batches)
    model.summary()
    # Save once, after training (the original also saved the untrained model
    # to the same path immediately before fit, which was dead work).
    model.save_pretrained(saved_model_path)
Is my input to TFGPT2LMHeadModel correct?