Source code:
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq, Trainer
from peft import PromptTuningConfig, get_peft_model, TaskType, PromptTuningInit
# Load the .jsonl training file with the "json" loader.
dataset = load_dataset(
    path="json",
    data_files="/home/david/LLM4Decompile/train/AnghaBench_demo_compile.jsonl",
)
The error:
Failed to load JSON from file '/home/david/LLM4Decompile/train/AnghaBench_demo_compile.jsonl' with error <class 'pyarrow.lib.ArrowInvalid'>: JSON parse error: Column(/output/opt-state-O1) changed from string to object in row 42
My dataset format is like this:
{"name": "..." , "input": "...", "input_ori": "...", "output": {"opt-state-O0": "...", "opt-state-O1": "...", "opt-state-O2": "...", "opt-state-O3": "..."}}
Row 42 is shown here:
{"name": "/home/david/LLM4Decompile/train/AnghaBench_demo/8cc/extr_parse.c_read_declarator_params_oldstyle.c", "input": "static void read_declarator_params_oldstyle(Vector *vars) {\n for (;;) {\n Token *tok = get();\n if (tok->kind != TIDENT)\n errort(tok, \"identifier expected, but got %s\", tok2s(tok));\n vec_push(vars, ast_lvar(type_int, tok->sval));\n if (next_token(')'))\n return;\n if (!next_token(','))\n errort(tok, \"comma expected, but got %s\", tok2s(get()));\n }\n}", "input_ori": "#define NULL ((void*)0)\ntypedef unsigned long size_t; // Customize by platform.\ntypedef long intptr_t; typedef unsigned long uintptr_t;\ntypedef long scalar_t__; // Either arithmetic or pointer type.\n/* By default, we understand bool (as a convenience). */\ntypedef int bool;\n#define false 0\n#define true 1\n\n/* Forward declarations */\ntypedef struct TYPE_5__ TYPE_1__ ;\n\n/* Type definitions */\ntypedef int /*<<< orphan*/ Vector ;\nstruct TYPE_5__ {scalar_t__ kind; int /*<<< orphan*/ sval; } ;\ntypedef TYPE_1__ Token ;\n\n/* Variables and functions */\n scalar_t__ TIDENT ; \n int /*<<< orphan*/ ast_lvar (int /*<<< orphan*/ ,int /*<<< orphan*/ ) ; \n int /*<<< orphan*/ errort (TYPE_1__*,char*,int /*<<< orphan*/ ) ; \n TYPE_1__* get () ; \n scalar_t__ next_token (char) ; \n int /*<<< orphan*/ tok2s (TYPE_1__*) ; \n int /*<<< orphan*/ type_int ; \n int /*<<< orphan*/ vec_push (int /*<<< orphan*/ *,int /*<<< orphan*/ ) ; \n\n__attribute__((used)) static void read_declarator_params_oldstyle(Vector *vars) {\n for (;;) {\n Token *tok = get();\n if (tok->kind != TIDENT)\n errort(tok, \"identifier expected, but got %s\", tok2s(tok));\n vec_push(vars, ast_lvar(type_int, tok->sval));\n if (next_token(')'))\n return;\n if (!next_token(','))\n errort(tok, \"comma expected, but got %s\", tok2s(get()));\n }\n}", "output": {"opt-state-O0": "<read_declarator_params_oldstyle>:\nendbr64\npush %rbp\nmov %rsp,%rbp\nsub $0x20,%rsp\nmov %rdi,-0x18(%rbp)\nmov $0x0,%eax\ncall 1a 
<read_declarator_params_oldstyle+0x1a>\nmov %rax,-0x8(%rbp)\nmov -0x8(%rbp),%rax\nmov (%rax),%rdx\nmov 0x0(%rip),%rax\ncmp %rax,%rdx\nje 55 <read_declarator_params_oldstyle+0x55>\nmov -0x8(%rbp),%rax\nmov %rax,%rdi\ncall 3d <read_declarator_params_oldstyle+0x3d>\nmov %eax,%edx\nmov -0x8(%rbp),%rax\nlea 0x0(%rip),%rcx\nmov %rcx,%rsi\nmov %rax,%rdi\ncall 55 <read_declarator_params_oldstyle+0x55>\nmov -0x8(%rbp),%rax\nmov 0x8(%rax),%edx\nmov 0x0(%rip),%eax\nmov %edx,%esi\nmov %eax,%edi\ncall 6b <read_declarator_params_oldstyle+0x6b>\nmov %eax,%edx\nmov -0x18(%rbp),%rax\nmov %edx,%esi\nmov %rax,%rdi\ncall 7b <read_declarator_params_oldstyle+0x7b>\nmov $0x29,%edi\ncall 85 <read_declarator_params_oldstyle+0x85>\ntest %rax,%rax\njne cc <read_declarator_params_oldstyle+0xcc>\nmov $0x2c,%edi\ncall 94 <read_declarator_params_oldstyle+0x94>\ntest %rax,%rax\njne 10 <read_declarator_params_oldstyle+0x10>\nmov $0x0,%eax\ncall a7 <read_declarator_params_oldstyle+0xa7>\nmov %rax,%rdi\ncall af <read_declarator_params_oldstyle+0xaf>\nmov %eax,%edx\nmov -0x8(%rbp),%rax\nlea 0x0(%rip),%rcx\nmov %rcx,%rsi\nmov %rax,%rdi\ncall c7 <read_declarator_params_oldstyle+0xc7>\njmp 10 <read_declarator_params_oldstyle+0x10>\nnop\nleave\nret\n", "opt-state-O1": "<read_declarator_params_oldstyle>:\nendbr64\npush %r13\npush %r12\npush %rbp\npush %rbx\nsub $0x8,%rsp\nmov %rdi,%rbp\nlea 0x0(%rip),%r12\nlea 0x0(%rip),%r13\njmp 6c <read_declarator_params_oldstyle+0x6c>\nmov %rbx,%rdi\ncall 29 <read_declarator_params_oldstyle+0x29>\nmov %eax,%edx\nmov %r12,%rsi\nmov %rbx,%rdi\ncall 36 <read_declarator_params_oldstyle+0x36>\nmov 0x8(%rbx),%esi\nmov 0x0(%rip),%edi\ncall 44 <read_declarator_params_oldstyle+0x44>\nmov %eax,%esi\nmov %rbp,%rdi\ncall 4e <read_declarator_params_oldstyle+0x4e>\nmov $0x29,%edi\ncall 58 <read_declarator_params_oldstyle+0x58>\ntest %rax,%rax\njne a3 <read_declarator_params_oldstyle+0xa3>\nmov $0x2c,%edi\ncall 67 <read_declarator_params_oldstyle+0x67>\ntest %rax,%rax\nje 87 
<read_declarator_params_oldstyle+0x87>\nmov $0x0,%eax\ncall 76 <read_declarator_params_oldstyle+0x76>\nmov %rax,%rbx\nmov 0x0(%rip),%rax\ncmp %rax,(%rbx)\nje 36 <read_declarator_params_oldstyle+0x36>\njmp 21 <read_declarator_params_oldstyle+0x21>\ncall 8c <read_declarator_params_oldstyle+0x8c>\nmov %rax,%rdi\ncall 94 <read_declarator_params_oldstyle+0x94>\nmov %eax,%edx\nmov %r13,%rsi\nmov %rbx,%rdi\ncall a1 <read_declarator_params_oldstyle+0xa1>\njmp 6c <read_declarator_params_oldstyle+0x6c>\nadd $0x8,%rsp\npop %rbx\npop %rbp\npop %r12\npop %r13\nret\n", "opt-state-O2": "<read_declarator_params_oldstyle>:\nendbr64\npush %r13\nlea 0x0(%rip),%r13\npush %r12\nlea 0x0(%rip),%r12\npush %rbp\npush %rbx\nmov %rdi,%rbx\nsub $0x8,%rsp\njmp 37 <read_declarator_params_oldstyle+0x37>\nnopl 0x0(%rax)\nmov $0x2c,%edi\ncall 32 <read_declarator_params_oldstyle+0x32>\ntest %rax,%rax\nje 98 <read_declarator_params_oldstyle+0x98>\nxor %eax,%eax\ncall 3e <read_declarator_params_oldstyle+0x3e>\nmov %rax,%rbp\nmov 0x0(%rip),%rax\ncmp %rax,0x0(%rbp)\nje 63 <read_declarator_params_oldstyle+0x63>\nmov %rbp,%rdi\ncall 56 <read_declarator_params_oldstyle+0x56>\nmov %r12,%rsi\nmov %rbp,%rdi\nmov %eax,%edx\ncall 63 <read_declarator_params_oldstyle+0x63>\nmov 0x8(%rbp),%esi\nmov 0x0(%rip),%edi\ncall 71 <read_declarator_params_oldstyle+0x71>\nmov %rbx,%rdi\nmov %eax,%esi\ncall 7b <read_declarator_params_oldstyle+0x7b>\nmov $0x29,%edi\ncall 85 <read_declarator_params_oldstyle+0x85>\ntest %rax,%rax\nje 28 <read_declarator_params_oldstyle+0x28>\nadd $0x8,%rsp\npop %rbx\npop %rbp\npop %r12\npop %r13\nret\nnopl (%rax)\ncall 9d <read_declarator_params_oldstyle+0x9d>\nmov %rax,%rdi\ncall a5 <read_declarator_params_oldstyle+0xa5>\nmov %r13,%rsi\nmov %rbp,%rdi\nmov %eax,%edx\ncall b2 <read_declarator_params_oldstyle+0xb2>\njmp 37 <read_declarator_params_oldstyle+0x37>\n", "opt-state-O3": "<read_declarator_params_oldstyle>:\nendbr64\npush %r13\nlea 0x0(%rip),%r13\npush %r12\nlea 0x0(%rip),%r12\npush 
%rbp\npush %rbx\nmov %rdi,%rbx\nsub $0x8,%rsp\njmp 37 <read_declarator_params_oldstyle+0x37>\nnopl 0x0(%rax)\nmov $0x2c,%edi\ncall 32 <read_declarator_params_oldstyle+0x32>\ntest %rax,%rax\nje 98 <read_declarator_params_oldstyle+0x98>\nxor %eax,%eax\ncall 3e <read_declarator_params_oldstyle+0x3e>\nmov %rax,%rbp\nmov 0x0(%rip),%rax\ncmp %rax,0x0(%rbp)\nje 63 <read_declarator_params_oldstyle+0x63>\nmov %rbp,%rdi\ncall 56 <read_declarator_params_oldstyle+0x56>\nmov %r12,%rsi\nmov %rbp,%rdi\nmov %eax,%edx\ncall 63 <read_declarator_params_oldstyle+0x63>\nmov 0x8(%rbp),%esi\nmov 0x0(%rip),%edi\ncall 71 <read_declarator_params_oldstyle+0x71>\nmov %rbx,%rdi\nmov %eax,%esi\ncall 7b <read_declarator_params_oldstyle+0x7b>\nmov $0x29,%edi\ncall 85 <read_declarator_params_oldstyle+0x85>\ntest %rax,%rax\nje 28 <read_declarator_params_oldstyle+0x28>\nadd $0x8,%rsp\npop %rbx\npop %rbp\npop %r12\npop %r13\nret\nnopl (%rax)\ncall 9d <read_declarator_params_oldstyle+0x9d>\nmov %rax,%rdi\ncall a5 <read_declarator_params_oldstyle+0xa5>\nmov %r13,%rsi\nmov %rbp,%rdi\nmov %eax,%edx\ncall b2 <read_declarator_params_oldstyle+0xb2>\njmp 37 <read_declarator_params_oldstyle+0x37>\n"}}
This row passes format validation on Be JSON (在线JSON校验格式化工具, an online JSON validation and formatting tool).
How can I resolve this error? Thank you!