This is my TrOCR fine-tuning code. Why is it not working? Please help me; I am working on a new language.

  1. Install necessary libraries

We install ‘evaluate’ and upgrade ‘jiwer’ for modern metric calculation.

!pip install -q transformers datasets "jiwer>=3.0.0" tensorboard tokenizers accelerate evaluate

  2. Import core libraries

import pandas as pd
import torch
from torch.utils.data import Dataset
from PIL import Image
from sklearn.model_selection import train_test_split
import os
import json
from google.colab import drive
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator
)

The old import from datasets (from datasets import load_metric) is removed, as it is no longer needed.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import jiwer # We use jiwer directly for metrics

  3. Mount Google Drive

print("Mounting Google Drive…")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

  4. Define the PyTorch Dataset Class

class IAMDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.df['file_name'][idx]
        text = str(self.df['text'][idx])  # Ensure text is a string
        image_path = os.path.join(self.root_dir, file_name)
        try:
            image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            print(f"Warning: Image file not found at {image_path}. Skipping.")
            return None  # Handle missing images gracefully
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=self.max_target_length,
                                          truncation=True).input_ids
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        return {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
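One caveat worth flagging: default_data_collator cannot handle the None returned for missing images and will raise an error mid-epoch. A minimal sketch of a guard collator (my addition, not part of the original notebook; it assumes each batch keeps at least one valid sample):

from transformers import default_data_collator

def safe_collator(features):
    # Drop samples where __getitem__ returned None (missing image files)
    features = [f for f in features if f is not None]
    return default_data_collator(features)

If you use it, pass data_collator=safe_collator to the trainer below instead of default_data_collator.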

  5. Instantiate datasets

print("Creating PyTorch datasets…")
train_dataset = IAMDataset(root_dir=image_folder_path, df=train_df, processor=processor)
eval_dataset = IAMDataset(root_dir=image_folder_path, df=test_df, processor=processor)
print("Training and evaluation datasets are ready.")

  6. Configure model generation parameters

model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
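A hedged sanity check before training: these ids are read from the default English tokenizer, and with a custom tokenizer cls_token_id/sep_token_id can be None, which silently breaks generation. A quick guard (my addition, not from the original notebook):

# If a custom tokenizer was swapped in, cls/sep tokens may not exist; fail early.
assert model.config.decoder_start_token_id is not None, "decoder_start_token_id is unset"
assert model.config.pad_token_id is not None, "pad_token_id is unset"
assert model.config.eos_token_id is not None, "eos_token_id is unset"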

  7. Define Training Arguments

*** FIX: Renamed ‘evaluation_strategy’ to ‘eval_strategy’ ***

*** FIX: Reduced steps and epochs for a quick test run ***

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    eval_strategy="steps",  # RENAMED from evaluation_strategy
    per_device_train_batch_size=8,  # Small batch size for testing
    per_device_eval_batch_size=8,
    fp16=True,
    output_dir=checkpoint_dir,
    logging_dir=logging_dir,
    num_train_epochs=3,  # MODIFIED FOR QUICK TEST: run only a few epochs
    logging_steps=50,  # MODIFIED FOR QUICK TEST: log metrics every 50 steps
    save_steps=100,  # MODIFIED FOR QUICK TEST: save a checkpoint every 100 steps
    eval_steps=100,  # MODIFIED FOR QUICK TEST: evaluate every 100 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)

  8. Define the compute_metrics function

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = jiwer.cer(label_str, pred_str)
    return {"cer": cer}

print("✅ Training arguments and metrics function are correctly defined.")
print("Parameters have been set for a QUICK TEST RUN.")

  9. Initialize the Trainer

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.image_processor,  # Use image_processor instead of feature_extractor
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)

  10. Start Training

To start fresh:

print("— Starting Model Training —")
trainer.train()
print("— Training Complete! —")

— HOW TO RESUME TRAINING —

If your session disconnects, all checkpoints are safe in your Google Drive.

To resume from the last saved checkpoint, simply re-run Cells 1, 2, and 3, and then run the following command in this cell instead of trainer.train():

print("— Resuming Training from Last Checkpoint —")

trainer.train(resume_from_checkpoint=True)

  11. Save the final model and processor

print(f"Saving the best model to: {final_model_dir}“)
os.makedirs(final_model_dir, exist_ok=True)
model.save_pretrained(final_model_dir)
processor.save_pretrained(final_model_dir)
print(f":white_check_mark: Final model and processor saved to Google Drive.”)

  12. How to view TensorBoard

The training logs are saved in the logging_dir on your Drive.

To view them, you can run this command in a NEW, separate cell AFTER the training has started or completed.

%load_ext tensorboard
%tensorboard --logdir "{logging_dir}"

import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from google.colab import files
from IPython.display import display, HTML
import io
import os

— Configuration —

  1. Set the path to your final model directory in Google Drive

final_model_dir = "/content/drive/MyDrive/Data/trocr-meiteimayek-final-model"

— Setup Device (GPU or CPU) —

  2. Check for GPU and set the device accordingly. This is crucial for speed.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device} ({torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'})")
if device.type == 'cpu':
    print("\n⚠️ WARNING: You are running on a CPU. Inference will be very slow.")
    print("Go to Runtime > Change runtime type and select 'T4 GPU' or another GPU accelerator.")

— Model Loading —

  3. Load the model and processor

if not os.path.exists(final_model_dir):
    print(f"❌ Error: Model directory not found at '{final_model_dir}'")
else:
    print("\nLoading model and processor…")
    try:
        processor = TrOCRProcessor.from_pretrained(final_model_dir)

        # Load the model and move it to the selected device (GPU)
        model = VisionEncoderDecoderModel.from_pretrained(final_model_dir).to(device)
        print("✅ Model and processor loaded successfully!")

        # --- Interactive File Upload and Inference ---
        print("\n— Please upload one or more Meitei Mayek images —")
        uploaded_files = files.upload()

        if not uploaded_files:
            print("\nNo files were uploaded. Run the cell again to try.")
        else:
            print("\n— Inference Results —")
            for file_name, file_content in uploaded_files.items():
                print(f"\nProcessing image: '{file_name}'…")

                # --- Image Pre-processing for Robustness ---
                image = Image.open(io.BytesIO(file_content)).convert("RGB")

                # Resize the image to prevent issues with very large inputs
                # A max size of 1500px on the longest side is a good balance
                max_size = 1500
                if max(image.size) > max_size:
                    image.thumbnail((max_size, max_size))
                    print(f"   - Image resized to {image.size} for faster processing.")

                # Prepare the image for the model
                # Move the pixel_values tensor to the same device as the model (GPU)
                pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

                print("   - Image processed. Generating text…")

                # --- Generate Text ---
                # Set max_length to prevent extremely long (or infinite) generation
                generated_ids = model.generate(pixel_values, max_length=128)

                # Decode the token IDs to get the recognized text
                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                print("   - Text generation complete.")

                # --- Display the results ---
                display(image)
                display(HTML(f"<h3>Recognized Text: <span style='color:blue;'>{generated_text}</span></h3>"))
                print("-" * 40)
    except Exception as e:
        print(f"❌ An error occurred: {e}")

This is my full project for fine-tuning TrOCR on my new language script.

But when I run inference, it prints out garbage question-mark boxes rather than any characters. What do you think the problem stems from?


garbage question mark box

Tofu issue?
https://stackoverflow.com/questions/18760943/character-code-of-unknown-character-character-e-g-square-or-question-mark-romb/45465594

Also, if the tokenizer is incomplete, everything may be treated as <unk>

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# 2a. Initialize BPE with an <unk> token
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()  # split on spaces

# 2b. Train on your entire Meitei Mayek corpus
trainer = BpeTrainer(
    vocab_size=8000,
    special_tokens=["<s>", "</s>", "<pad>", "<unk>"]
)
tokenizer.train(files=["all_meitei_texts.txt"], trainer=trainer)  # trains merges & vocab

# 2c. Save the tokenizer JSON
tokenizer.save("meitei_bpe.json")
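After saving, it may be worth sanity-checking that the vocabulary actually covers the script, since an incomplete tokenizer maps everything to <unk>. A minimal sketch (the sample string is only an illustration; substitute any line from your corpus):

from tokenizers import Tokenizer

tok = Tokenizer.from_file("meitei_bpe.json")
enc = tok.encode("ꯃꯤꯇꯩ ꯃꯌꯦꯛ")  # any real line from your corpus works here
print(enc.tokens)  # if this is mostly "<unk>", the vocab does not cover the script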

from transformers import (
    ViTImageProcessor,
    RobertaTokenizerFast,
    TrOCRProcessor,
    VisionEncoderDecoderModel
)

# 3a. Wrap your BPE in a Fast tokenizer
roberta = RobertaTokenizerFast(
    tokenizer_file="meitei_bpe.json",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
    unk_token="<unk>"
)

# 3b. Load the image feature extractor
feature_extractor = ViTImageProcessor.from_pretrained("microsoft/trocr-base-handwritten")

# 3c. Combine into a single processor
processor = TrOCRProcessor(
    feature_extractor=feature_extractor,
    tokenizer=roberta
)

# 3d. Load pretrained model & update special tokens
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.config.decoder_start_token_id = roberta.bos_token_id
model.config.eos_token_id          = roberta.eos_token_id
model.config.pad_token_id          = roberta.pad_token_id
model.config.vocab_size            = roberta.vocab_size
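One more caveat, offered as an assumption rather than a confirmed fix: setting model.config.vocab_size only updates the config; it does not resize the decoder's embedding and output layers. If the new tokenizer's vocabulary size differs from the pretrained English one (it almost certainly does), something like this is probably also needed:

# Resize the decoder's token embeddings (and LM head) to the new vocabulary.
# Without this, ids from the new tokenizer index into the old English
# embedding table, which tends to produce garbage output.
model.decoder.resize_token_embeddings(len(roberta))
model.config.decoder.vocab_size = len(roberta)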

Thank you! I also found that the problem is in the training code: despite the creation of the tokenizer in cell 2, it was not used in the training cell, which still used the default English tokenizer. I fixed that, but the generated text is still garbage, a single word repeated hundreds of times. Can you help me with my project to fine-tune a completely new script?


I think it would be safer to use print statements and other methods to trace each step one by one to find where it stops working properly. By the way, I think that with Seq2Seq, it won’t work properly unless the tokenizer is like this…

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #tokenizer=processor.image_processor,
    tokenizer=processor.tokenizer, # for Seq2SeqTrainer
    data_collator=default_data_collator,
    compute_metrics=compute_metrics
)
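Following that trace-each-step advice, one quick check is to decode a single training sample's labels back to text and confirm it matches the ground-truth transcription (a sketch, assuming the train_dataset and processor defined earlier):

# Sanity check: labels should decode back to the original transcription.
sample = train_dataset[0]
labels = sample["labels"].clone()
labels[labels == -100] = processor.tokenizer.pad_token_id  # undo the loss mask
print("decoded label:", processor.tokenizer.decode(labels, skip_special_tokens=True))
print("pixel_values shape:", sample["pixel_values"].shape)  # e.g. (3, 384, 384) for trocr-base

If the decoded label comes back empty or as <unk> soup, the model can only learn to emit one frequent token, which would match the single-word-repeated output described above.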

print("— Initializing Training —")

Define paths for model checkpoints and final output

checkpoint_dir = os.path.join(BASE_DRIVE_PATH, "trocr-meiteimayek-checkpoints")
final_model_dir = os.path.join(BASE_DRIVE_PATH, "trocr-meiteimayek-final-model")

1. Define Training Arguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    eval_strategy="steps",  # renamed from 'evaluation_strategy' in newer transformers versions
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # Use mixed-precision for faster training
    output_dir=checkpoint_dir,
    num_train_epochs=5,  # Start with 5, but you may need more
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,  # Lower CER is better
    report_to="tensorboard",
)

2. Define the Character Error Rate (CER) metric function

import evaluate

cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Decode predictions and labels
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}

3. Initialize the Trainer (CORRECTED LINE)

The tokenizer argument has been removed. The trainer will correctly use the tokenizer from the processor associated with the model.

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)

4. Start Training! 🚀

print("\n— Starting Model Training —")
trainer.train()
print("\n— Training Complete! —")

5. Save the final model and processor

print(f"\nSaving the best model to: {final_model_dir}“)
trainer.save_model(final_model_dir)
processor.save_pretrained(final_model_dir)
print(f":white_check_mark: Final model and custom processor saved to Google Drive.”)

Recognized Text: ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ ꯅ

The output is like this


It's a character from the Manipuri script, but it's a different image containing other characters.


How about this for debugging? Check whether the cause is a missing font.

display(HTML(f"<h3>Recognized Text Code: <span style='color:blue;'>{[hex(ord(c)) for c in generated_text]}</span></h3>"))
display(HTML(f"<h3>Recognized Text: <span style='color:blue;'>{generated_text}</span></h3>"))