PDFPlumber

PDFPlumber

In [1]:

Copied!

from omnidocs.tasks.table_extraction.extractors import PDFPlumberExtractor
from omnidocs.tasks.table_extraction.extractors import PDFPlumberExtractor

In [2]:

Copied!

# Sample PDF from the repository's test assets. The path is relative, so it
# resolves against the kernel's working directory (presumably this notebook's
# folder -- confirm before running from elsewhere).
pdf_path = "../../../../tests/table_extraction/assets/table_document.pdf"

# Run the extraction exactly once; `result.tables` is what the summary cell
# further down iterates over.
result = PDFPlumberExtractor().extract(pdf_path)

INFO     [timestamp]2025-08-02 12:41:56[/] |                                   
         [logger.name]omnidocs.tasks.table_extraction.extractors.pdfplumber[/] 
         | [function]logging.py:150[/] | [info]extract completed in 0.90s[/]   
INFO     [timestamp]2025-08-02 12:41:56[/] |                                   
         [logger.name]omnidocs.tasks.table_extraction.extractors.pdfplumber[/] 
         | [function]logging.py:150[/] | [info]extract completed in 0.90s[/]

In [5]:

Copied!





# Print a short summary of every table found in `result`: its dimensions,
# total cell count, non-empty cell count, and a preview of cell contents.
for table_number, table in enumerate(result.tables, start=1):
    print(f"\nTable {table_number}: {table.num_rows} rows x {table.num_cols} columns")
    print(f"Total cells: {len(table.cells)}")

    if table.cells:
        # A cell counts as non-empty when its text has non-whitespace content.
        non_empty_cells = [cell for cell in table.cells if cell.text.strip()]
        print(f"Non-empty cells: {len(non_empty_cells)}")

        # Preview: only the first 20 cells are scanned (so fewer than 20 lines
        # may be printed), and each shown text is truncated to 50 characters.
        for cell in table.cells[:20]:
            text = cell.text.strip()
            if text:
                print(f"  [{cell.row},{cell.col}]: '{text[:50]}'")

Table 1: 14 rows x 18 columns
Total cells: 59
Non-empty cells: 36
  [0,13]: 'Results'
  [1,10]: 'Ballots'
  [2,1]: 'Disability'
  [2,7]: 'Ballots'
  [3,4]: 'Participants'
  [3,10]: 'Incomplete/'

In [ ]: