Yes, this is definitely possible! Below is a solution that can:
- Extract text from PDF files automatically
- Generate question-answer pairs from that text
- Save everything as a dataset file (JSON, CSV, or JSONL)

The process is straightforward: install a few Python libraries and run the script below.
import PyPDF2
import pandas as pd
import re
from typing import List, Dict, Tuple
import json
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import random
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
class PDFToQADataset:
    def __init__(self):
        # Initialize question generation pipeline (requires internet connection)
        try:
            self.question_generator = pipeline("text2text-generation",
                                               model="valhalla/t5-small-qg-hl")
        except Exception:
            print("Warning: Question generation model not available. Using rule-based approach.")
            self.question_generator = None
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file"""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
        return text
    def clean_text(self, text: str) -> str:
        """Clean and preprocess extracted text"""
        # Remove extra whitespace and line breaks
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:-]', '', text)
        return text.strip()
    def split_into_chunks(self, text: str, chunk_size: int = 200) -> List[str]:
        """Split text into manageable chunks for Q&A generation"""
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if len(word_tokenize(current_chunk + sentence)) <= chunk_size:
                current_chunk += sentence + " "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks
    def generate_questions_rule_based(self, text: str) -> List[Dict[str, str]]:
        """Generate questions using rule-based approach"""
        qa_pairs = []
        sentences = sent_tokenize(text)
        for sentence in sentences:
            # Skip very short sentences
            if len(word_tokenize(sentence)) < 5:
                continue
            # Generate different types of questions
            questions = []
            # What questions
            if any(word in sentence.lower() for word in ['is', 'are', 'was', 'were']):
                questions.append(f"What {sentence.lower()}")
            # Who questions
            if any(word in sentence.lower() for word in ['person', 'people', 'author', 'researcher']):
                questions.append(f"Who {sentence.lower().replace('the person', '').replace('people', '')}")
            # When questions
            if any(word in sentence.lower() for word in ['year', 'date', 'time', 'century']):
                questions.append(f"When {sentence.lower()}")
            # Where questions
            if any(word in sentence.lower() for word in ['location', 'place', 'country', 'city']):
                questions.append(f"Where {sentence.lower()}")
            # How questions
            if any(word in sentence.lower() for word in ['method', 'process', 'way']):
                questions.append(f"How {sentence.lower()}")
            # Add some of the generated questions
            for question in questions[:2]:  # Limit to 2 questions per sentence
                qa_pairs.append({
                    'question': question.capitalize() + '?',
                    'answer': sentence,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })
        return qa_pairs
    def generate_questions_transformer(self, text: str) -> List[Dict[str, str]]:
        """Generate questions using transformer model"""
        qa_pairs = []
        chunks = self.split_into_chunks(text, chunk_size=150)
        for chunk in chunks[:10]:  # Limit to first 10 chunks to avoid overload
            try:
                # Generate question using transformer
                input_text = f"generate question: {chunk}"
                result = self.question_generator(input_text, max_length=64, num_return_sequences=1)
                question = result[0]['generated_text']
                qa_pairs.append({
                    'question': question,
                    'answer': chunk,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })
            except Exception as e:
                print(f"Error generating question for chunk: {e}")
                continue
        return qa_pairs
    def extract_key_facts(self, text: str) -> List[Dict[str, str]]:
        """Extract key facts and create factual Q&A pairs"""
        qa_pairs = []
        # Use TF-IDF to find important sentences
        sentences = sent_tokenize(text)
        if len(sentences) < 2:
            return qa_pairs
        vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
        try:
            tfidf_matrix = vectorizer.fit_transform(sentences)
            feature_names = vectorizer.get_feature_names_out()
            # Get top sentences based on TF-IDF scores
            sentence_scores = tfidf_matrix.sum(axis=1).A1
            top_sentence_indices = sentence_scores.argsort()[-5:][::-1]
            for idx in top_sentence_indices:
                sentence = sentences[idx]
                # Create a simple factual question
                question = f"What does the document say about {random.choice(feature_names)}?"
                qa_pairs.append({
                    'question': question,
                    'answer': sentence,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })
        except Exception as e:
            print(f"Error in key fact extraction: {e}")
        return qa_pairs
    def create_qa_dataset(self, pdf_path: str, output_format: str = 'json') -> str:
        """Main function to create Q&A dataset from PDF"""
        # Extract text
        print("Extracting text from PDF...")
        raw_text = self.extract_text_from_pdf(pdf_path)
        if not raw_text.strip():
            return "Error: No text could be extracted from the PDF."
        # Clean text
        print("Cleaning extracted text...")
        clean_text = self.clean_text(raw_text)
        # Generate Q&A pairs using different methods
        print("Generating Q&A pairs...")
        qa_pairs = []
        # Method 1: Rule-based approach
        rule_based_qa = self.generate_questions_rule_based(clean_text)
        qa_pairs.extend(rule_based_qa)
        # Method 2: Transformer-based (if available)
        if self.question_generator:
            transformer_qa = self.generate_questions_transformer(clean_text)
            qa_pairs.extend(transformer_qa)
        # Method 3: Key facts extraction
        fact_based_qa = self.extract_key_facts(clean_text)
        qa_pairs.extend(fact_based_qa)
        # Remove duplicates and clean up
        unique_qa = []
        seen_questions = set()
        for qa in qa_pairs:
            if qa['question'].lower() not in seen_questions:
                seen_questions.add(qa['question'].lower())
                unique_qa.append(qa)
        print(f"Generated {len(unique_qa)} unique Q&A pairs.")
        # Save dataset
        if output_format.lower() == 'json':
            output_file = 'qa_dataset.json'
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(unique_qa, f, indent=2, ensure_ascii=False)
        elif output_format.lower() == 'csv':
            output_file = 'qa_dataset.csv'
            df = pd.DataFrame(unique_qa)
            df.to_csv(output_file, index=False, encoding='utf-8')
        else:
            output_file = 'qa_dataset.jsonl'
            with open(output_file, 'w', encoding='utf-8') as f:
                for qa in unique_qa:
                    f.write(json.dumps(qa, ensure_ascii=False) + '\n')
        return f"Q&A dataset saved as {output_file}"
# Example usage
def main():
    # Initialize the PDF to Q&A converter
    converter = PDFToQADataset()
    pdf_path = "your_document.pdf"  # Replace with your PDF path
    # Create dataset
    result = converter.create_qa_dataset(pdf_path, output_format='json')
    print(result)
    # You can also extract just the text if needed
    text = converter.extract_text_from_pdf(pdf_path)
    print(f"Extracted {len(text.split())} words from PDF")
if __name__ == "__main__":
    # Required libraries installation command:
    print("Required libraries:")
    print("pip install PyPDF2 pandas transformers nltk scikit-learn torch")
    print("\nAlternative PDF libraries:")
    print("pip install pdfplumber pymupdf textract")  # More robust options
    main()
This Python script handles both PDF text extraction and Q&A dataset creation. You can use it by:
- Replacing "your_document.pdf" with the actual PDF path
- Running the script (or calling the class directly, as sketched below)
- Getting a dataset file with question-answer pairs
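If you would rather call the class from your own code than edit main(), a minimal sketch looks like this (the module name pdf_qa.py is just a placeholder for wherever you saved the script; output_format works as defined in create_qa_dataset above):

```python
from pdf_qa import PDFToQADataset  # placeholder module name; use whatever you saved the script as

converter = PDFToQADataset()
# output_format accepts 'json' or 'csv'; anything else falls back to JSONL
result = converter.create_qa_dataset("your_document.pdf", output_format="csv")
print(result)  # e.g. "Q&A dataset saved as qa_dataset.csv"
```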
The script generates different types of questions (What, Who, When, Where, How) and can handle various document types. For production use, I’d recommend using pdfplumber or PyMuPDF for better text extraction quality.
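If you swap in pdfplumber, only the extraction step needs to change. Here is a rough sketch of a drop-in replacement for extract_text_from_pdf (assuming pdfplumber is installed; the rest of the class stays the same):

```python
import pdfplumber

def extract_text_from_pdf(self, pdf_path: str) -> str:
    """Extract text using pdfplumber, which usually preserves layout better than PyPDF2."""
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() can return None for pages with no extractable text
                text += (page.extract_text() or "") + "\n"
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    return text
```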