I want to retain the line-by-line formatting of the dataset I am using, which looks something like this:
John: Hi, how are you today?
Jane: I'm doing well, how about you?
It is meant to read like a chat log, with each speaker's turn on its own line. Here is my preprocessing chain:
from datasets import load_dataset

dataset_dict = load_dataset("text", data_files="/content/table_names.txt")
dataset = dataset_dict["train"]
dataset = dataset.train_test_split(test_size=0.2, shuffle=False)
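If I understand the datasets docs correctly, the "text" builder yields one example per line and strips the trailing newline by default, so the "\n" characters may already be gone at this point. A quick check, plus the keep_linebreaks flag that is supposed to preserve them (worth verifying against your installed datasets version):

print(dataset["train"][0])
# I'd expect something like {'text': 'John: Hi, how are you today?'} -- no trailing "\n"

# If the stripped newlines are the culprit, the text builder can keep them:
# dataset_dict = load_dataset("text", data_files="/content/table_names.txt", keep_linebreaks=True)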
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)
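One way to sanity-check the tokenizer side: GPT-2's BPE has an ordinary token for "\n" (id 198, if I read the vocab right), so if the newline were present in the text it would survive tokenization:

print(tokenizer("John: Hi\n")["input_ids"])  # should end in 198, the "\n" token
print(tokenizer("John: Hi")["input_ids"])    # no 198 -- the newline simply isn't there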
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could pad instead if the model supported
    # it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
lm_dataset = tokenized_dataset.map(group_texts, batched=True, batch_size=1000, num_proc=4)
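If each row's newline is already gone, group_texts then concatenates the per-row token lists back to back with no separator, which would produce exactly the fused output I'm seeing. A toy run with made-up token ids (block_size shrunk to 4 just for the illustration, then restored):

block_size = 4  # just for this illustration
toy_batch = {"input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9]]}
print(group_texts(toy_batch))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]],
#  'labels':    [[1, 2, 3, 4], [5, 6, 7, 8]]}   <- remainder [9] dropped, no separators
block_size = 128  # restore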
Here is my generation script:

# model is my GPT-2 fine-tuned on lm_dataset above (training code omitted here).
input_ids = tokenizer.encode('Example', return_tensors='pt')
sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=200,
    top_k=40,
    top_p=0.95,
)
#print(sample_outputs)
for i, sample_output in enumerate(sample_outputs):
    print(tokenizer.decode(sample_output, skip_special_tokens=True))
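For the decode side: "\n" is a regular BPE token in GPT-2's vocab rather than a special token, so skip_special_tokens=True should not strip it. A quick round-trip to confirm:

roundtrip = tokenizer.decode(tokenizer.encode("John: Hi\nJane: Hello"), skip_special_tokens=True)
print(repr(roundtrip))  # I'd expect 'John: Hi\nJane: Hello', newline intact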
Output now looks like:
John: Hi, how are you today?Jane: I'm doing well, how about you?
I can't figure out where the line breaks are being lost: is the preprocessing removing them while concatenating, or is the tokenizer.decode loop omitting them?