Tabula
Tabula Table Extractor
In [7]:
Copied!
from omnidocs.tasks.table_extraction.extractors import TabulaExtractor
from omnidocs.tasks.table_extraction.extractors import TabulaExtractor
In [8]:
Copied!
# Relative path to the sample PDF used for this table-extraction demo.
pdf_path = "../../../../tests/table_extraction/assets/table_document.pdf"

# Construct the extractor with no arguments, then run it on the sample PDF.
# `result` is consumed by the table-summary cell via `result.tables`.
tabula_extractor = TabulaExtractor()
result = tabula_extractor.extract(pdf_path)
# Relative path to the sample PDF used for this table-extraction demo.
pdf_path = "../../../../tests/table_extraction/assets/table_document.pdf"

# Construct the extractor with no arguments, then run it on the sample PDF.
# `result` is consumed by the table-summary cell via `result.tables`.
tabula_extractor = TabulaExtractor()
result = tabula_extractor.extract(pdf_path)
INFO [timestamp]2025-07-31 13:32:36[/] | [logger.name]omnidocs.tasks.table_extraction.extractors.tabula[/] | [function]logging.py:150[/] | [info]extract completed in 2.19s[/]
INFO [timestamp]2025-07-31 13:32:36[/] | [logger.name]omnidocs.tasks.table_extraction.extractors.tabula[/] | [function]logging.py:150[/] | [info]extract completed in 2.19s[/]
In [9]:
Copied!
# Print a short report for every table found in `result`: its dimensions,
# total and non-empty cell counts, and a preview of populated cells.
for i, table in enumerate(result.tables):
    print(f"\nTable {i+1}: {table.num_rows} rows x {table.num_cols} columns")
    print(f"Total cells: {len(table.cells)}")

    # A table without cells has nothing more to report.
    if not table.cells:
        continue

    # Cells whose text is empty or whitespace-only are not counted.
    non_empty_cells = list(filter(lambda cell: cell.text.strip(), table.cells))
    print(f"Non-empty cells: {len(non_empty_cells)}")

    # Preview: scan only the first 60 cells, skip blank ones, and truncate
    # each displayed text to at most 60 characters.
    for cell in table.cells[:60]:
        if not cell.text.strip():
            continue
        text = cell.text.strip()[:60]
        print(f" [{cell.row},{cell.col}]: '{text}'")
# Print a short report for every table found in `result`: its dimensions,
# total and non-empty cell counts, and a preview of populated cells.
for i, table in enumerate(result.tables):
    print(f"\nTable {i+1}: {table.num_rows} rows x {table.num_cols} columns")
    print(f"Total cells: {len(table.cells)}")

    # A table without cells has nothing more to report.
    if not table.cells:
        continue

    # Cells whose text is empty or whitespace-only are not counted.
    non_empty_cells = list(filter(lambda cell: cell.text.strip(), table.cells))
    print(f"Non-empty cells: {len(non_empty_cells)}")

    # Preview: scan only the first 60 cells, skip blank ones, and truncate
    # each displayed text to at most 60 characters.
    for cell in table.cells[:60]:
        if not cell.text.strip():
            continue
        text = cell.text.strip()[:60]
        print(f" [{cell.row},{cell.col}]: '{text}'")
Table 1: 13 rows x 11 columns Total cells: 143 Non-empty cells: 36 [0,5]: 'Results' [1,4]: 'Ballots' [2,1]: 'Disability' [2,4]: 'Ballots' [3,2]: 'articipants' [3,4]: 'Incomplete/' [3,6]: 'Accuracy' [3,9]: 'Time to' [4,0]: 'Category' [4,1]: 'Completed' [5,3]: 'Terminated'