"KeyError: 'text' text_column = self.column_mapping["text"]"

I’m trying to train a token classification model with my local training files, and it always returns a 500 error:
my dataset looks like this:
train.csv

tokens,tags
"['ist', 'lebt', 'Herr', 'Berlin', '030', 'Siemens', '.', 'E-Mail-Adresse', 'Telefonnummer', '@', '12345678', '.', 'Seine', 'und', 'example', 'de', 'mueller', 'in', 'arbeitet', '.', 'Müller', 'bei']","['O', 'O', 'O', 'LOCATION', 'PHONE_NUMBER', 'ORGANIZATION', 'O', 'O', 'O', 'EMAIL', 'PHONE_NUMBER', 'EMAIL', 'O', 'O', 'EMAIL', 'EMAIL', 'EMAIL', 'O', 'O', 'O', 'PERSON', 'O']"
"['arbeitet', 'bei', 'de', '030', '.', '@', 'und', 'lebt', 'example', '.', 'E-Mail-Adresse', 'ist', '12345678', 'Müller', 'in', 'Telefonnummer', '.', 'Siemens', 'Seine', 'Berlin', 'mueller', 'Herr']","['O', 'O', 'EMAIL', 'PHONE_NUMBER', 'O', 'EMAIL', 'O', 'O', 'EMAIL', 'EMAIL', 'O', 'O', 'PHONE_NUMBER', 'PERSON', 'O', 'O', 'O', 'ORGANIZATION', 'O', 'LOCATION', 'EMAIL', 'O']"
"['.', 'bei', 'example', '12345678', '.', 'ist', 'Seine', '@', 'de', 'Herr', 'arbeitet', 'in', 'Telefonnummer', 'Berlin', '030', 'lebt', 'mueller', 'Müller', 'Siemens', '.', 'E-Mail-Adresse', 'und']","['O', 'O', 'EMAIL', 'PHONE_NUMBER', 'EMAIL', 'O', 'O', 'EMAIL', 'EMAIL', 'O', 'O', 'O', 'O', 'LOCATION', 'PHONE_NUMBER', 'O', 'EMAIL', 'PERSON', 'ORGANIZATION', 'O', 'O', 'O']"
.
.
.

validate.csv

tokens,tags
"['Siemens', 'arbeitet', '12345678', 'E-Mail-Adresse', '.', '@', 'Müller', '030', 'lebt', 'bei', 'Telefonnummer', 'mueller', 'example', 'ist', 'de', 'Berlin', 'in', 'und', 'Herr', 'Seine', '.', '.']","['ORGANIZATION', 'O', 'PHONE_NUMBER', 'O', 'O', 'EMAIL', 'PERSON', 'PHONE_NUMBER', 'O', 'O', 'O', 'EMAIL', 'EMAIL', 'O', 'EMAIL', 'LOCATION', 'O', 'O', 'O', 'O', 'EMAIL', 'O']"
"['lebt', 'in', 'ist', 'de', '12345678', 'arbeitet', '030', 'Berlin', '@', 'Herr', 'Müller', 'Seine', '.', 'E-Mail-Adresse', 'Siemens', '.', 'und', '.', 'example', 'bei', 'mueller', 'Telefonnummer']","['O', 'O', 'O', 'EMAIL', 'PHONE_NUMBER', 'O', 'PHONE_NUMBER', 'LOCATION', 'EMAIL', 'O', 'PERSON', 'O', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'EMAIL', 'EMAIL', 'O', 'EMAIL', 'O']"
.
.
.
KeyError: 'text'

text_column = self.column_mapping["text"]

File "/app/env/lib/python3.10/site-packages/autotrain/dataset.py", line 609, in prepare

data_path = dset.prepare()

File "/app/env/lib/python3.10/site-packages/autotrain/app/ui_routes.py", line 673, in handle_form

return await dependant.call(**values)

File "/app/env/lib/python3.10/site-packages/fastapi/routing.py", line 212, in run_endpoint_function

raw_response = await run_endpoint_function(

File "/app/env/lib/python3.10/site-packages/fastapi/routing.py", line 301, in app

response = await f(request)

File "/app/env/lib/python3.10/site-packages/starlette/routing.py", line 73, in app

await app(scope, receive, sender)

File "/app/env/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app

raise exc

File "/app/env/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app

await wrap_app_handling_exceptions(app, request)(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/starlette/routing.py", line 76, in app

await self.app(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/starlette/routing.py", line 288, in handle

await route.handle(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/starlette/routing.py", line 735, in app

await self.middleware_stack(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/starlette/routing.py", line 715, in __call__

await app(scope, receive, sender)

File "/app/env/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app

raise exc

File "/app/env/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app

await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 62, in __call__

await self.app(scope, receive, send_wrapper)

File "/app/env/lib/python3.10/site-packages/starlette/middleware/sessions.py", line 85, in __call__

await self.app(scope, receive, _send)

File "/app/env/lib/python3.10/site-packages/starlette/middleware/errors.py", line 165, in __call__

raise exc

File "/app/env/lib/python3.10/site-packages/starlette/middleware/errors.py", line 187, in __call__

await self.middleware_stack(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/starlette/applications.py", line 113, in __call__

await super().__call__(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__

return await self.app(scope, receive, send)

File "/app/env/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__

result = await app( # type: ignore[func-returns-value]

File "/app/env/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 401, in run_asgi

Traceback (most recent call last):

ERROR: Exception in ASGI application

INFO | 2024-12-13 18:48:50 | autotrain.app.ui_routes:handle_form:657 - Column mapping: {'tokens': 'tokens', 'tags': 'tags'}

INFO | 2024-12-13 18:48:50 | autotrain.app.ui_routes:handle_form:656 - Task: text_token_classification

INFO | 2024-12-13 18:48:50 | autotrain.app.ui_routes:handle_form:540 - hardware: local-ui

INFO | 2024-12-13 18:48:25 | autotrain.app.ui_routes:fetch_params:415 - Task: token-classification

INFO | 2024-12-13 18:47:59 | autotrain.app.ui_routes:fetch_params:415 - Task: llm:sft

INFO: 10.20.26.107:19543 - "GET /?```
![Screenshot 2024-12-13 at 19.53.36|690x396](upload://lSsCk9K1eS7NdO4X72LZXzGio54.png)
2 Likes

The 500 error likely stems from a problem in either the preprocessing, model setup, or interaction with your environment. Here’s a checklist and suggestions to debug and resolve the issue:


1. Dataset Format

  • Check tokenization:
    • Ensure that the tokens and tags in the CSV files are properly formatted as lists.
    • Double-check that the quotes (") around tokens and tags are not interfering with reading the file as lists in Python.
  • Recommended Fix: Instead of storing lists as strings in CSV, store them as lists in JSONL (JSON Lines) format. Example:

json

Copy code

{"tokens": ["ist", "lebt", "Herr", "Berlin", "030", "Siemens", ".", "E-Mail-Adresse", "Telefonnummer"], "tags": ["O", "O", "O", "LOCATION", "PHONE_NUMBER", "ORGANIZATION", "O", "O", "O", "PHONE_NUMBER"]}

Use JSONL for better compatibility with frameworks like Hugging Face.


2. Loading the Dataset

If you’re using Hugging Face’s datasets library, ensure the dataset is correctly loaded. For a CSV:

python

Copy code

# Load the local CSV splits with the Hugging Face `datasets` library.
from datasets import load_dataset

# Keys of this dict become the split names of the resulting DatasetDict.
data_files = {"train": "train.csv", "validation": "validate.csv"}
dataset = load_dataset("csv", data_files=data_files)

If your tokens and tags are stored as strings, you will need to parse them back into lists:

python

Copy code

def preprocess_data(example):
    """Parse stringified token/tag lists back into real Python lists.

    The CSV cells store each list as its repr (e.g. "['ist', 'lebt', ...]").
    `ast.literal_eval` evaluates only Python literals, so — unlike `eval`,
    which would execute arbitrary code embedded in the data — it is safe to
    run on untrusted file contents.
    """
    import ast  # local import keeps the snippet self-contained

    example["tokens"] = ast.literal_eval(example["tokens"])  # str -> list[str]
    example["tags"] = ast.literal_eval(example["tags"])
    return example

dataset = dataset.map(preprocess_data)
1 Like

The issue is that the Docker UI version does not work. Here is the link to my AutoTrain Space, Sheripov/autotrain-advanced, which is duplicated from autotrain-projects/autotrain-advanced.

It works if I set it up locally using the Python API:

# Programmatic AutoTrain setup (Python API) that works where the Docker UI fails.
# Column names are passed explicitly via tokens_column / tags_column, so no
# "text" column mapping is involved.
params = TokenClassificationParams(
    data_path="model_training/data/",  # folder containing the train/valid CSVs
    model="obi/deid_roberta_i2b2",
    lr=3e-5,
    epochs=6, 
    max_seq_length=128,
    batch_size=32,
    warmup_ratio=0.1,
    gradient_accumulation=1,
    optimizer="adamw_torch",
    scheduler="linear",
    weight_decay=0.01,
    max_grad_norm=0.5,  # Reduce gradient norm clipping
    seed=42,
    train_split="train",
    valid_split="valid",
    tokens_column="tokens",  # matches the CSV header
    tags_column="tags",      # matches the CSV header
    logging_steps=5,
    project_name="deid-roberta-i2b2-fine-tuned-german",
    auto_find_batch_size=False,
    mixed_precision="none",  
    save_total_limit=2, 
    token="",           # HF access token (redacted here)
    push_to_hub=True,
    eval_strategy="epoch",
    username="",        # HF username (redacted here)
    log="wandb",
    early_stopping_patience=3, 
    early_stopping_threshold=0.01,
)

# Run the training job on the local backend.
backend = "local"
project = AutoTrainProject(params=params, backend=backend, process=True)
project.create()
1 Like