Hello friends,
I recorded all human-customer conversations on my e-commerce site in JSON form. The data looks like this:
```json
{
  "dialog_1": [
    {"role": "user", "content": "hello"},
    {"role": "user", "content": "are you online"},
    {"role": "user", "content": "can you help me ?"},
    {"role": "assistant", "content": "yes, how can I help you?"},
    {"role": "assistant", "content": "What's your problem?"},
    ...
  ],
  "dialog_2": [
    {"role": "user", "content": "hello"},
    {"role": "assistant", "content": "hello"},
    {"role": "user", "content": "there is a problem with my order"},
    {"role": "assistant", "content": "Can I have your order number??"},
    ...
  ],
  ...
}
```
There are about 7k dialogs in this structure, but when I fine-tune on this dataset the model does not learn successfully and does not respond properly. I feed the data in pretty much as it is. How can I use this dataset successfully?
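For reference, I load the dialogs into `data_dict` like this (the file name is just an example):

```python
import json

# "dialogs.json" is a placeholder name for the exported conversation log.
with open("dialogs.json", encoding="utf-8") as f:
    data_dict = json.load(f)
```

This is my preprocessing code: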
```python
train_data = []
for conversation in data_dict.values():
    # Every training conversation starts with the system prompt.
    messages_train = [{
        "from": "system",
        "value": DEFAULT_SYSTEM_PROMPT,
    }]
    for message in conversation:
        role = message["role"]
        content = message["content"]
        # Keep only plain chat messages; my logs also contain non-chat
        # events ("type" defaults to "chat" for the abbreviated sample above).
        if message.get("type", "chat") != "chat":
            continue
        # Map roles onto the names used in the template mapping below.
        role = "human" if role == "user" else "gpt"
        messages_train.append({
            "from": role,
            "value": content,
        })
    # Keep only conversations with more than 10 messages.
    if len(messages_train) > 10:
        train_data.append({"conversations": messages_train})
```
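One thing I suspect: as you can see in `dialog_1`, my logs often contain several consecutive messages from the same role, while ChatML-style templates assume alternating human/gpt turns. A minimal sketch of how I could merge those runs before training (`merge_consecutive_turns` is my own helper, not a library function):

```python
def merge_consecutive_turns(messages):
    """Join back-to-back messages from the same role into one turn.

    My own helper (not from any library): chat templates generally
    expect alternating turns, so consecutive messages like
    "hello" / "are you online" are joined with a newline.
    """
    merged = []
    for msg in messages:
        if merged and merged[-1]["from"] == msg["from"]:
            merged[-1]["value"] += "\n" + msg["value"]
        else:
            merged.append(dict(msg))
    return merged
```

I would call this on `messages_train` right before the length check. After that, the tokenizer gets the ChatML template: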
```python
tokenizer = get_chat_template(
    tokenizer,
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    chat_template="chatml",
    map_eos_token=True,
)
```
```python
def apply_template(examples):
    # Batched map: examples["conversations"] is a list of conversations.
    messages = examples["conversations"]
    text = [
        tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        for message in messages
    ]
    return {"text": text}
```
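For completeness, this is roughly how I turn `train_data` into a dataset and apply the template (batched map, so `apply_template` receives a list of conversations):

```python
from datasets import Dataset

# Build a Hugging Face dataset from the preprocessed conversations
# and render each one into a single "text" field for training.
dataset = Dataset.from_list(train_data)
dataset = dataset.map(apply_template, batched=True)
```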