Has anyone managed to extract text from PDF files and create a good question-and-answer dataset from the extracted text? Or generally create datasets from given PDF files?
There are several Python libraries that can do this.
Edit:
Code sample by Hugging Chat.
```bash
pip install PyMuPDF
pip install spacy
python -m spacy download en_core_web_sm
pip install datasets
```
```python
import fitz  # PyMuPDF
import spacy
from datasets import Dataset

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")


def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text("text")
    return text


def identify_questions(text):
    doc = nlp(text)
    questions = []
    for sent in doc.sents:
        if sent[-1].text == "?":
            questions.append(sent.text)
    return questions


def simple_qa_pairs(text):
    questions = identify_questions(text)
    qa_pairs = []
    doc = nlp(text)
    sentences = list(doc.sents)
    for question in questions:
        question_idx = [i for i, sent in enumerate(sentences) if sent.text == question][0]
        if question_idx > 0:
            answer = sentences[question_idx - 1].text
            qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs


def create_qa_dataset(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    qa_pairs = simple_qa_pairs(text)
    dataset = Dataset.from_list(qa_pairs)
    return dataset


# Example usage
pdf_path = "example.pdf"
dataset = create_qa_dataset(pdf_path)

# Save the dataset to disk
dataset.save_to_disk("qa_dataset")

# Load the dataset to verify
loaded_dataset = Dataset.load_from_disk("qa_dataset")
for example in loaded_dataset:
    print("Question:", example["question"])
    print("Answer:", example["answer"])
```
I'd suggest combining a RAG system with a good amount of prompt engineering.
The PDF is embedded and stored in a vector DB, which you then query via a vector similarity search with scores. The results are the parts of the document whose vectors are closest to the vector of the search query.
You save each query and its highest-scoring search result in a DataFrame.
PS: You can use a good prompt to generate a lot of questions related to your type of document and its content. A for loop can handle the rest: query the vector DB for each question and append the query and its top-scoring result to the DataFrame, as sketched below.
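Here is a minimal sketch of that loop, assuming sentence-transformers for the embeddings and FAISS as the vector store (any vector DB that returns scored similarity results works the same way; chunking the PDF and generating the questions are up to you):

```python
# Sketch of the RAG-style pipeline described above (assumed stack:
# sentence-transformers + FAISS; the chunks and questions are placeholders).
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Placeholder inputs: chunks extracted from the PDF and LLM-generated questions
chunks = ["Training ran for three days on 8 GPUs.", "The dataset contains 1M examples."]
questions = ["How long did training take?", "How large is the dataset?"]

# Embed and index the document chunks (normalized vectors -> inner product = cosine)
chunk_vecs = model.encode(chunks, normalize_embeddings=True)
index = faiss.IndexFlatIP(chunk_vecs.shape[1])
index.add(chunk_vecs)

# For each question, keep the top-scoring chunk as its answer
rows = []
for q in questions:
    scores, ids = index.search(model.encode([q], normalize_embeddings=True), k=1)
    rows.append({"question": q, "answer": chunks[ids[0][0]], "score": float(scores[0][0])})

df = pd.DataFrame(rows)
print(df)
```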
Yes, it's possible to extract text from PDFs and create Q&A datasets. Tools like PyPDF2 or PDFMiner can extract text, while NLP models can generate questions and answers. This process often involves cleaning the text, extracting key information, and using models for question generation.
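As a concrete starting point, pdfminer.six has a one-call extraction helper (assuming `pip install pdfminer.six`; the file name is a placeholder):

```python
# Plain-text extraction with pdfminer.six; "example.pdf" is a placeholder path
from pdfminer.high_level import extract_text

text = extract_text("example.pdf")
print(text[:500])  # preview the first 500 characters
```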
Not sure if it's totally ready for use, but NVIDIA seems to be doing something similar to what you're looking for. It extracts data from PDFs to JSON.
There's a tool I found that does this too; it's online and works purely through prompts.
Try it out here: https://pdf2dataset.streamlit.app
Yes, this is definitely possible! I found a solution that can:
- Extract text from PDF files automatically
- Generate question-answer pairs from that text
- Save everything as a dataset file (like CSV or JSON)
The process is pretty straightforward: install a few Python libraries and run the script below.
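The dependencies match the list the script prints at the bottom:

```bash
pip install PyPDF2 pandas transformers nltk scikit-learn torch
```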
```python
import PyPDF2
import pandas as pd
import re
from typing import List, Dict
import json
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import random

# Download required NLTK data
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")


class PDFToQADataset:
    def __init__(self):
        # Initialize question generation pipeline (requires internet connection)
        try:
            self.question_generator = pipeline("text2text-generation",
                                               model="valhalla/t5-small-qg-hl")
        except Exception:
            print("Warning: Question generation model not available. Using rule-based approach.")
            self.question_generator = None

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file"""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
        return text

    def clean_text(self, text: str) -> str:
        """Clean and preprocess extracted text"""
        # Remove extra whitespace and line breaks
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:-]', '', text)
        return text.strip()

    def split_into_chunks(self, text: str, chunk_size: int = 200) -> List[str]:
        """Split text into manageable chunks for Q&A generation"""
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if len(word_tokenize(current_chunk + sentence)) <= chunk_size:
                current_chunk += sentence + " "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def generate_questions_rule_based(self, text: str) -> List[Dict[str, str]]:
        """Generate questions using rule-based approach"""
        qa_pairs = []
        sentences = sent_tokenize(text)
        for sentence in sentences:
            # Skip very short sentences
            if len(word_tokenize(sentence)) < 5:
                continue
            # Generate different types of questions
            questions = []
            # What questions
            if any(word in sentence.lower() for word in ['is', 'are', 'was', 'were']):
                questions.append(f"What {sentence.lower()}")
            # Who questions
            if any(word in sentence.lower() for word in ['person', 'people', 'author', 'researcher']):
                questions.append(f"Who {sentence.lower().replace('the person', '').replace('people', '')}")
            # When questions
            if any(word in sentence.lower() for word in ['year', 'date', 'time', 'century']):
                questions.append(f"When {sentence.lower()}")
            # Where questions
            if any(word in sentence.lower() for word in ['location', 'place', 'country', 'city']):
                questions.append(f"Where {sentence.lower()}")
            # How questions
            if any(word in sentence.lower() for word in ['method', 'process', 'way']):
                questions.append(f"How {sentence.lower()}")
            # Add some of the generated questions
            for question in questions[:2]:  # Limit to 2 questions per sentence
                qa_pairs.append({
                    'question': question.capitalize() + '?',
                    'answer': sentence,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })
        return qa_pairs

    def generate_questions_transformer(self, text: str) -> List[Dict[str, str]]:
        """Generate questions using transformer model"""
        qa_pairs = []
        chunks = self.split_into_chunks(text, chunk_size=150)
        for chunk in chunks[:10]:  # Limit to first 10 chunks to avoid overload
            try:
                # Generate question using transformer
                input_text = f"generate question: {chunk}"
                result = self.question_generator(input_text, max_length=64, num_return_sequences=1)
                question = result[0]['generated_text']
                qa_pairs.append({
                    'question': question,
                    'answer': chunk,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })
            except Exception as e:
                print(f"Error generating question for chunk: {e}")
                continue
        return qa_pairs

    def extract_key_facts(self, text: str) -> List[Dict[str, str]]:
        """Extract key facts and create factual Q&A pairs"""
        qa_pairs = []
        # Use TF-IDF to find important sentences
        sentences = sent_tokenize(text)
        if len(sentences) < 2:
            return qa_pairs
        vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
        try:
            tfidf_matrix = vectorizer.fit_transform(sentences)
            feature_names = vectorizer.get_feature_names_out()
            # Get top sentences based on TF-IDF scores
            sentence_scores = tfidf_matrix.sum(axis=1).A1
            top_sentence_indices = sentence_scores.argsort()[-5:][::-1]
            for idx in top_sentence_indices:
                sentence = sentences[idx]
                # Create a simple factual question
                question = f"What does the document say about {random.choice(feature_names)}?"
                qa_pairs.append({
                    'question': question,
                    'answer': sentence,
                    'context': text[:500] + '...' if len(text) > 500 else text
                })
        except Exception as e:
            print(f"Error in key fact extraction: {e}")
        return qa_pairs

    def create_qa_dataset(self, pdf_path: str, output_format: str = 'json') -> str:
        """Main function to create Q&A dataset from PDF"""
        # Extract text
        print("Extracting text from PDF...")
        raw_text = self.extract_text_from_pdf(pdf_path)
        if not raw_text.strip():
            return "Error: No text could be extracted from the PDF."
        # Clean text
        print("Cleaning extracted text...")
        clean_text = self.clean_text(raw_text)
        # Generate Q&A pairs using different methods
        print("Generating Q&A pairs...")
        qa_pairs = []
        # Method 1: Rule-based approach
        rule_based_qa = self.generate_questions_rule_based(clean_text)
        qa_pairs.extend(rule_based_qa)
        # Method 2: Transformer-based (if available)
        if self.question_generator:
            transformer_qa = self.generate_questions_transformer(clean_text)
            qa_pairs.extend(transformer_qa)
        # Method 3: Key facts extraction
        fact_based_qa = self.extract_key_facts(clean_text)
        qa_pairs.extend(fact_based_qa)
        # Remove duplicates and clean up
        unique_qa = []
        seen_questions = set()
        for qa in qa_pairs:
            if qa['question'].lower() not in seen_questions:
                seen_questions.add(qa['question'].lower())
                unique_qa.append(qa)
        print(f"Generated {len(unique_qa)} unique Q&A pairs.")
        # Save dataset
        if output_format.lower() == 'json':
            output_file = 'qa_dataset.json'
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(unique_qa, f, indent=2, ensure_ascii=False)
        elif output_format.lower() == 'csv':
            output_file = 'qa_dataset.csv'
            df = pd.DataFrame(unique_qa)
            df.to_csv(output_file, index=False, encoding='utf-8')
        else:
            output_file = 'qa_dataset.jsonl'
            with open(output_file, 'w', encoding='utf-8') as f:
                for qa in unique_qa:
                    f.write(json.dumps(qa, ensure_ascii=False) + '\n')
        return f"Q&A dataset saved as {output_file}"


# Example usage
def main():
    # Initialize the PDF to Q&A converter
    converter = PDFToQADataset()
    pdf_path = "your_document.pdf"  # Replace with your PDF path
    # Create dataset
    result = converter.create_qa_dataset(pdf_path, output_format='json')
    print(result)
    # You can also extract just the text if needed
    text = converter.extract_text_from_pdf(pdf_path)
    print(f"Extracted {len(text.split())} words from PDF")


if __name__ == "__main__":
    # Required libraries installation command:
    print("Required libraries:")
    print("pip install PyPDF2 pandas transformers nltk scikit-learn torch")
    print("\nAlternative PDF libraries:")
    print("pip install pdfplumber pymupdf textract")  # More robust options
    main()
```
This Python script handles both PDF text extraction and Q&A dataset creation.
You can use it by:
- Replacing "your_document.pdf" with the actual PDF path
- Running the script
- Getting a dataset file with question-answer pairs
The script generates different types of questions (What, Who, When, Where, How) and can handle various document types. For production use, I'd recommend using pdfplumber or PyMuPDF for better text extraction quality.
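For instance, here is a minimal pdfplumber-based extractor (a sketch assuming `pip install pdfplumber`; it can be dropped in as a replacement for the PyPDF2 method above):

```python
# Sketch: text extraction with pdfplumber instead of PyPDF2.
# pdfplumber tends to preserve layout better than PyPDF2.
import pdfplumber

def extract_text_with_pdfplumber(pdf_path: str) -> str:
    text_parts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for image-only pages
            text_parts.append(page.extract_text() or "")
    return "\n".join(text_parts)
```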