- Install necessary libraries
We install 'evaluate' and upgrade 'jiwer' for modern metric calculation.
!pip install -q transformers datasets "jiwer>=3.0.0" tensorboard tokenizers accelerate evaluate
- Import core libraries
import pandas as pd
import torch
from torch.utils.data import Dataset
from PIL import Image
from sklearn.model_selection import train_test_split
import os
import json
from google.colab import drive
from transformers import (
TrOCRProcessor,
VisionEncoderDecoderModel,
Seq2SeqTrainingArguments,
Seq2SeqTrainer,
default_data_collator
)
# Removed: `from datasets import load_metric` (deprecated; we use jiwer directly instead)
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import jiwer # We use jiwer directly for metrics
- Mount Google Drive
print("Mounting Google Drive…")
drive.mount('/content/drive')
print("Google Drive mounted successfully.")
- Define the PyTorch Dataset Class
class IAMDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.df['file_name'][idx]
        text = str(self.df['text'][idx])  # Ensure text is a string
        image_path = os.path.join(self.root_dir, file_name)
        try:
            image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            print(f"Warning: Image file not found at {image_path}. Skipping.")
            return None  # Handle missing images gracefully
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=self.max_target_length,
                                          truncation=True).input_ids
        # Replace pad token ids with -100 so they are ignored by the loss
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        return {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
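(The cell that builds train_df/test_df, the output paths, and loads processor and model is not in the paste above. For context, a minimal sketch of what it presumably looks like; the label-file path and its tab-separated format, the image/checkpoint/log directories, and the base checkpoint are all assumptions, so adjust them to your setup:)
# NOTE: reconstructed for context; paths, file format, and base checkpoint are assumptions
image_folder_path = "/content/drive/MyDrive/Data/images"         # hypothetical
labels_path = "/content/drive/MyDrive/Data/labels.tsv"           # hypothetical
checkpoint_dir = "/content/drive/MyDrive/Data/trocr-checkpoints" # hypothetical
logging_dir = "/content/drive/MyDrive/Data/trocr-logs"           # hypothetical
final_model_dir = "/content/drive/MyDrive/Data/trocr-meiteimayek-final-model"

df = pd.read_csv(labels_path, sep="\t", names=["file_name", "text"])
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
# reset_index matters: __getitem__ indexes the dataframe by label via df['text'][idx]
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")  # assumed checkpoint
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")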
- Instantiate datasets
print("Creating PyTorch datasets…")
train_dataset = IAMDataset(root_dir=image_folder_path, df=train_df, processor=processor)
eval_dataset = IAMDataset(root_dir=image_folder_path, df=test_df, processor=processor)
print("Training and evaluation datasets are ready.")
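(Optional sanity check: pull one encoded example and confirm the shapes. For the base TrOCR checkpoints the processor resizes images to 384x384, and labels are padded to max_target_length:)
encoding = train_dataset[0]
print(encoding["pixel_values"].shape)  # expected: torch.Size([3, 384, 384])
print(encoding["labels"].shape)        # expected: torch.Size([128]), i.e. max_target_length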
- Configure model generation parameters
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
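(Side note: recent transformers releases warn when generation parameters are set directly on model.config; the same values can live on model.generation_config instead. A sketch of the equivalent, assuming a reasonably new transformers version:)
model.generation_config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
model.generation_config.eos_token_id = processor.tokenizer.sep_token_id
model.generation_config.max_length = 64
model.generation_config.early_stopping = True
model.generation_config.no_repeat_ngram_size = 3
model.generation_config.length_penalty = 2.0
model.generation_config.num_beams = 4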
- Define Training Arguments
*** FIX: Renamed 'evaluation_strategy' to 'eval_strategy' ***
*** FIX: Reduced steps and epochs for a quick test run ***
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    eval_strategy="steps",          # RENAMED from evaluation_strategy
    per_device_train_batch_size=8,  # Small batch size for testing
    per_device_eval_batch_size=8,
    fp16=True,
    output_dir=checkpoint_dir,
    logging_dir=logging_dir,
    num_train_epochs=3,    # MODIFIED FOR QUICK TEST: train for only 3 epochs
    logging_steps=50,      # MODIFIED FOR QUICK TEST: log metrics every 50 steps
    save_steps=100,        # MODIFIED FOR QUICK TEST: save a checkpoint every 100 steps
    eval_steps=100,        # MODIFIED FOR QUICK TEST: evaluate every 100 steps
    save_total_limit=2,    # Keep only the last 2 checkpoints
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)
- Define the compute_metrics function
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    # Restore pad tokens before decoding the reference strings
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    cer = jiwer.cer(label_str, pred_str)
    return {"cer": cer}
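(For intuition: jiwer.cer takes the reference(s) first and the hypothesis(es) second and returns a plain float, e.g. one substitution over four characters gives 0.25:)
print(jiwer.cer("abcd", "abxd"))  # 1 substitution / 4 reference characters -> 0.25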
print("Training arguments and metrics function are correctly defined.")
print("Parameters have been set for a QUICK TEST RUN.")
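(One caveat before wiring up the Trainer: default_data_collator will crash on any batch containing None, which IAMDataset.__getitem__ returns for missing image files. If missing files are a real possibility, a thin wrapper that drops them is one option; this helper, collate_skip_none, is a sketch and not part of the original code. Pass it as data_collator below instead of default_data_collator if you use it.)
def collate_skip_none(features):
    # Drop examples where __getitem__ returned None (missing image files)
    features = [f for f in features if f is not None]
    return default_data_collator(features)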
- Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.image_processor,  # Use image_processor instead of feature_extractor
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)
- Start Training
To start fresh:
print("--- Starting Model Training ---")
trainer.train()
print("--- Training Complete! ---")
--- HOW TO RESUME TRAINING ---
If your session disconnects, all checkpoints are safe in your Google Drive.
To resume from the last saved checkpoint, simply re-run Cells 1, 2, and 3,
and then run the following command in this cell instead of trainer.train():
print("--- Resuming Training from Last Checkpoint ---")
trainer.train(resume_from_checkpoint=True)
- Save the final model and processor
print(f"Saving the best model to: {final_model_dir}“)
os.makedirs(final_model_dir, exist_ok=True)
model.save_pretrained(final_model_dir)
processor.save_pretrained(final_model_dir)
print(f" Final model and processor saved to Google Drive.”)
- How to view TensorBoard
The training logs are saved in the logging_dir on your Drive.
To view them, you can run this command in a NEW, separate cell
AFTER the training has started or completed.
%load_ext tensorboard
%tensorboard --logdir "{logging_dir}"
import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from google.colab import files
from IPython.display import display, HTML
import io
import os
— Configuration —
- Set the path to your final model directory in Google Drive
final_model_dir = "/content/drive/MyDrive/Data/trocr-meiteimayek-final-model"
— Setup Device (GPU or CPU) —
- Check for GPU and set the device accordingly. This is crucial for speed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device} ({torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'})")
if device.type == 'cpu':
    print("\n⚠️ WARNING: You are running on a CPU. Inference will be very slow.")
    print("Go to Runtime > Change runtime type and select 'T4 GPU' or another GPU accelerator.")
— Model Loading —
- Load the model and processor
if not os.path.exists(final_model_dir):
    print(f"Error: Model directory not found at '{final_model_dir}'")
else:
    print("\nLoading model and processor…")
    try:
        processor = TrOCRProcessor.from_pretrained(final_model_dir)
        # Load the model and move it to the selected device (GPU)
        model = VisionEncoderDecoderModel.from_pretrained(final_model_dir).to(device)
        print("Model and processor loaded successfully!")

        # --- Interactive File Upload and Inference ---
        print("\n--- Please upload one or more Meitei Mayek images ---")
        uploaded_files = files.upload()
        if not uploaded_files:
            print("\nNo files were uploaded. Run the cell again to try.")
        else:
            print("\n--- Inference Results ---")
            for file_name, file_content in uploaded_files.items():
                print(f"\nProcessing image: '{file_name}'…")
                # --- Image Pre-processing for Robustness ---
                image = Image.open(io.BytesIO(file_content)).convert("RGB")
                # Resize the image to prevent issues with very large inputs
                # A max size of 1500px on the longest side is a good balance
                max_size = 1500
                if max(image.size) > max_size:
                    image.thumbnail((max_size, max_size))
                    print(f" - Image resized to {image.size} for faster processing.")
                # Prepare the image for the model
                # Move the pixel_values tensor to the same device as the model (GPU)
                pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
                print(" - Image processed. Generating text...")
                # --- Generate Text ---
                # Set max_length to prevent extremely long (or infinite) generation
                generated_ids = model.generate(pixel_values, max_length=128)
                # Decode the token IDs to get the recognized text
                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                print(" - Text generation complete.")
                # --- Display the results ---
                display(image)
                display(HTML(f"<h3>Recognized Text: <span style='color:blue;'>{generated_text}</span></h3>"))
                print("-" * 40)
    except Exception as e:
        print(f"An error occurred: {e}")
This is my full project for fine-tuning TrOCR on a new language script (Meitei Mayek).
But when I run inference, it prints out garbage: question-mark boxes instead of any actual characters. What do you think the problem stems from?
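(One quick check that narrows this down: if the tokenizer you fine-tuned with has no vocabulary coverage for Meitei Mayek, every label collapses to unknown tokens during training, and the model can only ever emit replacement characters at inference. A minimal round-trip test, assuming the processor loaded above; the sample string is just illustrative Meitei Mayek text:)
sample = "ꯃꯤꯇꯩ ꯃꯌꯦꯛ"  # illustrative Meitei Mayek sample text
ids = processor.tokenizer(sample).input_ids
print(processor.tokenizer.convert_ids_to_tokens(ids))
print(processor.tokenizer.decode(ids, skip_special_tokens=True))
# If the tokens are mostly <unk>, or the decode does not reproduce the input,
# the tokenizer's vocabulary does not cover the script.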