How do I create Datasets from PDF files?

Yes, this is definitely possible! I found a solution that can:

  1. Extract text from PDF files automatically
  2. Generate question-answer pairs from that text
  3. Save everything as a dataset file (like Excel or JSON)

The process is pretty straightforward: install a few Python libraries and run the script below.

```python
import PyPDF2
import pandas as pd
import re
from typing import List, Dict
import json
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import random

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

class PDFToQADataset:
    def __init__(self):
        # Initialize question generation pipeline (requires internet connection)
        try:
            self.question_generator = pipeline("text2text-generation",
                                               model="valhalla/t5-small-qg-hl")
        except Exception:
            print("Warning: Question generation model not available. Using rule-based approach.")
            self.question_generator = None

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file"""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
        return text

    def clean_text(self, text: str) -> str:
        """Clean and preprocess extracted text"""
        # Remove extra whitespace and line breaks
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:-]', '', text)
        return text.strip()

    def split_into_chunks(self, text: str, chunk_size: int = 200) -> List[str]:
        """Split text into manageable chunks for Q&A generation"""
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(word_tokenize(current_chunk + sentence)) <= chunk_size:
                current_chunk += sentence + " "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def generate_questions_rule_based(self, text: str) -> List[Dict[str, str]]:
        """Generate questions using rule-based approach"""
        qa_pairs = []
        sentences = sent_tokenize(text)

        for sentence in sentences:
            # Skip very short sentences
            if len(word_tokenize(sentence)) < 5:
                continue

            # Generate different types of questions
            questions = []

            # What questions
            if any(word in sentence.lower() for word in ['is', 'are', 'was', 'were']):
                questions.append(f"What {sentence.lower()}")

            # Who questions
            if any(word in sentence.lower() for word in ['person', 'people', 'author', 'researcher']):
                questions.append(f"Who {sentence.lower().replace('the person', '').replace('people', '')}")

            # When questions
            if any(word in sentence.lower() for word in ['year', 'date', 'time', 'century']):
                questions.append(f"When {sentence.lower()}")

            # Where questions
            if any(word in sentence.lower() for word in ['location', 'place', 'country', 'city']):
                questions.append(f"Where {sentence.lower()}")

            # How questions
            if any(word in sentence.lower() for word in ['method', 'process', 'way']):
                questions.append(f"How {sentence.lower()}")

            # Add some of the generated questions
            for question in questions[:2]:  # Limit to 2 questions per sentence
                qa_pairs.append({
                    'question': question.capitalize() + '?',
                    'answer': sentence,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })

        return qa_pairs

    def generate_questions_transformer(self, text: str) -> List[Dict[str, str]]:
        """Generate questions using transformer model"""
        qa_pairs = []
        chunks = self.split_into_chunks(text, chunk_size=150)

        for chunk in chunks[:10]:  # Limit to first 10 chunks to avoid overload
            try:
                # Generate question using transformer
                input_text = f"generate question: {chunk}"
                result = self.question_generator(input_text, max_length=64, num_return_sequences=1)
                question = result[0]['generated_text']

                qa_pairs.append({
                    'question': question,
                    'answer': chunk,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })
            except Exception as e:
                print(f"Error generating question for chunk: {e}")
                continue

        return qa_pairs

    def extract_key_facts(self, text: str) -> List[Dict[str, str]]:
        """Extract key facts and create factual Q&A pairs"""
        qa_pairs = []

        # Use TF-IDF to find important sentences
        sentences = sent_tokenize(text)
        if len(sentences) < 2:
            return qa_pairs

        vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
        try:
            tfidf_matrix = vectorizer.fit_transform(sentences)
            feature_names = vectorizer.get_feature_names_out()

            # Get top sentences based on TF-IDF scores
            sentence_scores = tfidf_matrix.sum(axis=1).A1
            top_sentence_indices = sentence_scores.argsort()[-5:][::-1]

            for idx in top_sentence_indices:
                sentence = sentences[idx]
                # Create a simple factual question
                question = f"What does the document say about {random.choice(feature_names)}?"
                qa_pairs.append({
                    'question': question,
                    'answer': sentence,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })

        except Exception as e:
            print(f"Error in key fact extraction: {e}")

        return qa_pairs

    def create_qa_dataset(self, pdf_path: str, output_format: str = 'json') -> str:
        """Main function to create Q&A dataset from PDF"""
        # Extract text
        print("Extracting text from PDF...")
        raw_text = self.extract_text_from_pdf(pdf_path)

        if not raw_text.strip():
            return "Error: No text could be extracted from the PDF."

        # Clean text
        print("Cleaning extracted text...")
        cleaned_text = self.clean_text(raw_text)

        # Generate Q&A pairs using different methods
        print("Generating Q&A pairs...")
        qa_pairs = []

        # Method 1: Rule-based approach
        rule_based_qa = self.generate_questions_rule_based(cleaned_text)
        qa_pairs.extend(rule_based_qa)

        # Method 2: Transformer-based (if available)
        if self.question_generator:
            transformer_qa = self.generate_questions_transformer(cleaned_text)
            qa_pairs.extend(transformer_qa)

        # Method 3: Key facts extraction
        fact_based_qa = self.extract_key_facts(cleaned_text)
        qa_pairs.extend(fact_based_qa)

        # Remove duplicates and clean up
        unique_qa = []
        seen_questions = set()

        for qa in qa_pairs:
            if qa['question'].lower() not in seen_questions:
                seen_questions.add(qa['question'].lower())
                unique_qa.append(qa)

        print(f"Generated {len(unique_qa)} unique Q&A pairs.")

        # Save dataset
        if output_format.lower() == 'json':
            output_file = 'qa_dataset.json'
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(unique_qa, f, indent=2, ensure_ascii=False)
        elif output_format.lower() == 'csv':
            output_file = 'qa_dataset.csv'
            df = pd.DataFrame(unique_qa)
            df.to_csv(output_file, index=False, encoding='utf-8')
        else:
            output_file = 'qa_dataset.jsonl'
            with open(output_file, 'w', encoding='utf-8') as f:
                for qa in unique_qa:
                    f.write(json.dumps(qa, ensure_ascii=False) + '\n')

        return f"Q&A dataset saved as {output_file}"

# Example usage
def main():
    # Initialize the PDF to Q&A converter
    converter = PDFToQADataset()

    # Example usage
    pdf_path = "your_document.pdf"  # Replace with your PDF path

    # Create dataset
    result = converter.create_qa_dataset(pdf_path, output_format='json')
    print(result)

    # You can also extract just the text if needed
    text = converter.extract_text_from_pdf(pdf_path)
    print(f"Extracted {len(text.split())} words from PDF")

if __name__ == "__main__":
    # Required libraries installation command:
    print("Required libraries:")
    print("pip install PyPDF2 pandas transformers nltk scikit-learn torch")
    print("\nAlternative PDF libraries:")
    print("pip install pdfplumber pymupdf textract")  # More robust options

    main()
```

This Python script handles both PDF text extraction and Q&A dataset creation. To use it:

  1. Replace "your_document.pdf" with the actual PDF path
  2. Run the script
  3. Get a dataset file with question-answer pairs, which you can load as a Hugging Face Dataset (see the sketch below)
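
Since the original question is about creating a Dataset, here is a minimal sketch of loading the generated file with the `datasets` library, assuming the script above was run with `output_format='json'` and produced `qa_dataset.json`:

```python
from datasets import load_dataset

# Load the generated Q&A file as a Hugging Face Dataset
# (assumes the script above already produced qa_dataset.json)
qa_dataset = load_dataset("json", data_files="qa_dataset.json", split="train")

print(qa_dataset)                 # columns: question, answer, context
print(qa_dataset[0]["question"])  # inspect the first generated question

# Optionally persist it in Arrow format for later reuse
qa_dataset.save_to_disk("qa_dataset_arrow")
```

From there you can split it with `train_test_split()` or publish it with `push_to_hub()` if you want to share it.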

The script generates different types of questions (What, Who, When, Where, How) and can handle various document types. For production use, I’d recommend using pdfplumber or PyMuPDF for better text extraction quality.
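
If you want to try pdfplumber, here is a rough sketch of an alternative extractor you could swap in for `extract_text_from_pdf` (the function name is just illustrative; pdfplumber tends to handle multi-column layouts and tables more cleanly than PyPDF2):

```python
import pdfplumber

def extract_text_with_pdfplumber(pdf_path: str) -> str:
    """Alternative extractor using pdfplumber (often cleaner output than PyPDF2)."""
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()  # may return None for empty or scanned pages
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text with pdfplumber: {e}")
    return text
```

PyMuPDF offers a similar `page.get_text()` call if you prefer that library instead.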
