Text Extraction with Docling Parse and Other Extractors
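This notebook demonstrates text extraction from PDF files using Docling Parse and the other PDF-capable text extractors available in OmniDocs. SuryaTextExtractor is imported as well, but it is not run on the PDF because it only works with images.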
In [2]:
Copied!
from omnidocs.tasks.text_extraction.extractors import (
PyMuPDFExtractor,
PyPDF2Extractor,
PDFPlumberTextExtractor,
PDFTextExtractor,
DoclingExtractor,
SuryaTextExtractor
)
from omnidocs.tasks.text_extraction.extractors import (
PyMuPDFExtractor,
PyPDF2Extractor,
PDFPlumberTextExtractor,
PDFTextExtractor,
DoclingExtractor,
SuryaTextExtractor
)
c:\Users\laxma\OneDrive\Desktop\CogLab\11-07-2025\Omnidocs\new\Lib\site-packages\transformers\utils\hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. warnings.warn(
In [5]:
Copied!
# Extractor classes that are run against the PDF in the next cell.
# Each entry is a class, not an instance; it is instantiated with no arguments
# at the point of use.
pdf_extractors = [
    PyMuPDFExtractor,
    PyPDF2Extractor,
    PDFPlumberTextExtractor,
    PDFTextExtractor,
    DoclingExtractor,
    # SuryaTextExtractor is left out on purpose: it only works with images,
    # so it has to be given an image rather than a PDF path.
]

# Sample document shared by all extractors; the path is relative to this
# notebook's location.
pdf_path = "../../../../tests/text_extraction/assets/sample_document.pdf"
In [7]:
Copied!
# Maximum number of characters of extracted text shown per extractor.
PREVIEW_CHARS = 200

# Run every extractor in `pdf_extractors` on the same document and print the
# beginning of the text it produced, so the results can be compared.
for extractor_cls in pdf_extractors:
    result = extractor_cls().extract(pdf_path)

    # Strip surrounding whitespace once and reuse the value for both the
    # length check and the slice.
    full_text_preview = result.full_text.strip()
    if len(full_text_preview) > PREVIEW_CHARS:
        # Truncate long text and append an ellipsis to show it was cut.
        full_text_preview = full_text_preview[:PREVIEW_CHARS] + "..."

    print(f"\nFull text preview: '{full_text_preview}'")
Full text preview: '1 Sample PDF Created for testing PDFObject This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that a...' Full text preview: '1 Sample PDF Created for testing PDFObject This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all mi...' Full text preview: 'Sample PDF Created for testing PDFObject This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long m...' Full text preview: '1 Sample PDF Created for testing PDFObject This PDF is three pages long. Three long pages. Or three short pages if you’re optimistic. Is it the same as saying “three long minutes”, knowing that all...' Full text preview: 'Sample PDF Created for testing PDFObject This PDF is three pages long. Three long pages. Or three short pages if you're optimistic. Is it the same as saying 'three long minutes', knowing that all mi...'
In [ ]:
Copied!