PDFPlumber
PDFPlumber
In [1]:
Copied!
from omnidocs.tasks.table_extraction.extractors import PDFPlumberExtractor
from omnidocs.tasks.table_extraction.extractors import PDFPlumberExtractor
In [2]:
Copied!
# Relative path to a sample PDF kept under the repository's tests/ assets.
pdf_path = "../../../../tests/table_extraction/assets/table_document.pdf"
# Build an extractor with no arguments and run it on the sample file;
# later cells read the detected tables from `result.tables`.
result = PDFPlumberExtractor().extract(pdf_path)
# Relative path to a sample PDF kept under the repository's tests/ assets.
pdf_path = "../../../../tests/table_extraction/assets/table_document.pdf"
# Build an extractor with no arguments and run it on the sample file;
# later cells read the detected tables from `result.tables`.
result = PDFPlumberExtractor().extract(pdf_path)
INFO [timestamp]2025-08-02 12:41:56[/] | [logger.name]omnidocs.tasks.table_extraction.extractors.pdfplumber[/] | [function]logging.py:150[/] | [info]extract completed in 0.90s[/] INFO [timestamp]2025-08-02 12:41:56[/] | [logger.name]omnidocs.tasks.table_extraction.extractors.pdfplumber[/] | [function]logging.py:150[/] | [info]extract completed in 0.90s[/]
In [5]:
Copied!
# Print a short report for every table in `result`: its dimensions, cell
# counts, and a preview of the text found in its leading cells.
for table_no, table in enumerate(result.tables, start=1):
    print(f"\nTable {table_no}: {table.num_rows} rows x {table.num_cols} columns")
    print(f"Total cells: {len(table.cells)}")
    if not table.cells:
        continue  # nothing more to report for a table without cells

    filled = [c for c in table.cells if c.text.strip()]
    print(f"Non-empty cells: {len(filled)}")

    # Preview: only the first 20 cells are scanned, blank ones are skipped,
    # and each printed text is capped at 50 characters.
    for cell in table.cells[:20]:
        snippet = cell.text.strip()[:50]
        if snippet:
            print(f" [{cell.row},{cell.col}]: '{snippet}'")
# Print a short report for every table in `result`: its dimensions, cell
# counts, and a preview of the text found in its leading cells.
for table_no, table in enumerate(result.tables, start=1):
    print(f"\nTable {table_no}: {table.num_rows} rows x {table.num_cols} columns")
    print(f"Total cells: {len(table.cells)}")
    if not table.cells:
        continue  # nothing more to report for a table without cells

    filled = [c for c in table.cells if c.text.strip()]
    print(f"Non-empty cells: {len(filled)}")

    # Preview: only the first 20 cells are scanned, blank ones are skipped,
    # and each printed text is capped at 50 characters.
    for cell in table.cells[:20]:
        snippet = cell.text.strip()[:50]
        if snippet:
            print(f" [{cell.row},{cell.col}]: '{snippet}'")
Table 1: 14 rows x 18 columns Total cells: 59 Non-empty cells: 36 [0,13]: 'Results' [1,10]: 'Ballots' [2,1]: 'Disability' [2,7]: 'Ballots' [3,4]: 'Participants' [3,10]: 'Incomplete/'
In [ ]:
Copied!