Tabula

Tabula Table Extractor

In [7]:

Copied!

from omnidocs.tasks.table_extraction.extractors import TabulaExtractor
from omnidocs.tasks.table_extraction.extractors import TabulaExtractor

In [8]:

Copied!

# Sample PDF, given as a relative path (resolved against the working directory).
pdf_path = "../../../../tests/table_extraction/assets/table_document.pdf"
# Run the Tabula extractor on the PDF; a later cell iterates `result.tables`.
result = TabulaExtractor().extract(pdf_path)
# Sample PDF, given as a relative path (resolved against the working directory).
pdf_path = "../../../../tests/table_extraction/assets/table_document.pdf"
# Run the Tabula extractor on the PDF; a later cell iterates `result.tables`.
result = TabulaExtractor().extract(pdf_path)

INFO     [timestamp]2025-07-31 13:32:36[/] | [logger.name]omnidocs.tasks.table_extraction.extractors.tabula[/] |   
         [function]logging.py:150[/] | [info]extract completed in 2.19s[/]

INFO     [timestamp]2025-07-31 13:32:36[/] | [logger.name]omnidocs.tasks.table_extraction.extractors.tabula[/] |   
         [function]logging.py:150[/] | [info]extract completed in 2.19s[/]

In [9]:

Copied!





# Summarise every extracted table: dimensions, cell counts, and a short
# preview of the non-blank cells.
for i, table in enumerate(result.tables):
    print(f"\nTable {i+1}: {table.num_rows} rows x {table.num_cols} columns")
    print(f"Total cells: {len(table.cells)}")

    # Nothing more to report for a table that has no cells.
    if not table.cells:
        continue

    non_empty_cells = [cell for cell in table.cells if cell.text.strip()]
    print(f"Non-empty cells: {len(non_empty_cells)}")

    # Preview: scan the first 60 cells and print each non-blank one with its
    # text clipped to 60 characters.
    for cell in table.cells[:60]:
        text = cell.text.strip()[:60]
        if not text:
            continue
        print(f"  [{cell.row},{cell.col}]: '{text}'")
# Summarise every extracted table: dimensions, cell counts, and a short
# preview of the non-blank cells.
for i, table in enumerate(result.tables):
    print(f"\nTable {i+1}: {table.num_rows} rows x {table.num_cols} columns")
    print(f"Total cells: {len(table.cells)}")

    # Nothing more to report for a table that has no cells.
    if not table.cells:
        continue

    non_empty_cells = [cell for cell in table.cells if cell.text.strip()]
    print(f"Non-empty cells: {len(non_empty_cells)}")

    # Preview: scan the first 60 cells and print each non-blank one with its
    # text clipped to 60 characters.
    for cell in table.cells[:60]:
        text = cell.text.strip()[:60]
        if not text:
            continue
        print(f"  [{cell.row},{cell.col}]: '{text}'")

Table 1: 13 rows x 11 columns
Total cells: 143
Non-empty cells: 36
  [0,5]: 'Results'
  [1,4]: 'Ballots'
  [2,1]: 'Disability'
  [2,4]: 'Ballots'
  [3,2]: 'articipants'
  [3,4]: 'Incomplete/'
  [3,6]: 'Accuracy'
  [3,9]: 'Time to'
  [4,0]: 'Category'
  [4,1]: 'Completed'
  [5,3]: 'Terminated'