How to get model output to retain \n from dataset?

I want to retain the formatting present in the dataset I am using, which looks something like this:

John: Hi, how are you today?

Jane: I'm doing well, how about you?

It is meant to look like a chat log.
Here is my preprocessing chain.

from datasets import load_dataset

# Read the raw text file through the "text" loader.
dataset_dict = load_dataset("text", data_files={"/content/table_names.txt"})

# Take the 'train' split and carve 20% of it off as a test split, without shuffling.
dataset = dataset_dict['train'].train_test_split(test_size=0.2, shuffle=False)

from transformers import AutoTokenizer

# Pretrained GPT-2 tokenizer, used by the preprocessing step below.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def preprocess_function(examples):
    """Tokenize the "text" column of a batch with truncation enabled.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that entry.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


block_size = 128


def group_texts(examples):
    """Concatenate every column of a batch and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example
            sequences (for instance the token ids of each tokenized line).

    Returns:
        A dict with the same keys as ``examples`` in which each value is a
        list of chunks of exactly ``block_size`` items, plus a ``"labels"``
        entry holding a shallow copy of the ``"input_ids"`` chunk list.

    Raises:
        IndexError: If ``examples`` has no columns at all.
        KeyError: If ``examples`` has no ``"input_ids"`` column.
    """
    from itertools import chain

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the accumulator on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead of this drop
    # if the model supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels start out as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Tokenize each line first. The raw "text" column is dropped because
# group_texts concatenates every remaining column as a list of token lists.
tokenized_dataset = dataset.map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["text"]
)

# Pack the tokenized lines into fixed-size blocks.
lm_dataset = tokenized_dataset.map(group_texts, batched=True, batch_size=1000, num_proc=4)

Output script

# Encode the prompt as a PyTorch tensor.
input_ids = tokenizer.encode('Example', return_tensors='pt')

# NOTE: only the prompt is passed to generate(); add sampling / length
# arguments here as needed.
sample_outputs = model.generate(input_ids)

# Decode each generated sequence back to text, dropping special tokens.
for sample_output in sample_outputs:
    print(tokenizer.decode(sample_output, skip_special_tokens=True))

Output now looks like:

John: Hi, how are you today?Jane: I'm doing well, how about you?

I can’t figure out whether it’s the preprocessing that removes the line breaks while concatenating, or the tokenizer.decode loop that omits them.