Skip to content

Models

Pydantic models for table extraction outputs.

Provides structured table data with cells, spans, and multiple export formats including HTML, Markdown, and Pandas DataFrame conversion.

Example
result = extractor.extract(table_image)

# Get HTML
html = result.to_html()

# Get Pandas DataFrame
df = result.to_dataframe()

# Access cells
for cell in result.cells:
    print(f"[{cell.row},{cell.col}] {cell.text}")

CellType

Bases: str, Enum

Type of table cell.

BoundingBox

Bases: BaseModel

Bounding box in pixel coordinates.

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/table_extraction/models.py
def to_list(self) -> List[float]:
    """Return the box corners as a flat ``[x1, y1, x2, y2]`` list."""
    corners = (self.x1, self.y1, self.x2, self.y2)
    return list(corners)

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/table_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Return the box corners as an ``(x1, y1, x2, y2)`` tuple."""
    left, top = self.x1, self.y1
    right, bottom = self.x2, self.y2
    return left, top, right, bottom

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """
    Build a bounding box from a four-element ``[x1, y1, x2, y2]`` sequence.

    Raises:
        ValueError: If ``coords`` does not contain exactly four values.
    """
    count = len(coords)
    if count == 4:
        return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])
    raise ValueError(f"Expected 4 coordinates, got {count}")

from_ltrb classmethod

from_ltrb(
    left: float, top: float, right: float, bottom: float
) -> BoundingBox

Create from left, top, right, bottom coordinates.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def from_ltrb(cls, left: float, top: float, right: float, bottom: float) -> "BoundingBox":
    """Build a bounding box from its left, top, right and bottom edges."""
    # left/top map onto the (x1, y1) corner, right/bottom onto (x2, y2).
    edges = {"x1": left, "y1": top, "x2": right, "y2": bottom}
    return cls(**edges)

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/table_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Rescale the box from pixel coordinates to the normalized 0-1024 range.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """

    def _rescale(value: float, extent: int) -> float:
        # Divide first, then multiply, so results match the direct formula exactly.
        return value / extent * NORMALIZED_SIZE

    left = _rescale(self.x1, image_width)
    top = _rescale(self.y1, image_height)
    right = _rescale(self.x2, image_width)
    bottom = _rescale(self.y2, image_height)
    return BoundingBox(x1=left, y1=top, x2=right, y2=bottom)

TableCell

Bases: BaseModel

Single table cell with position, span, and content.

The cell position uses 0-indexed row/column indices. Spans indicate how many rows/columns the cell occupies.

end_row property

end_row: int

Ending row index (exclusive).

end_col property

end_col: int

Ending column index (exclusive).

is_header property

is_header: bool

Check if cell is any type of header.

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/table_extraction/models.py
def to_dict(self) -> Dict:
    """Serialize the cell into a plain dictionary.

    The cell type is stored as its enum value, and the bounding box as an
    ``[x1, y1, x2, y2]`` list (or ``None`` when the cell has no box).
    """
    plain_fields = ("row", "col", "row_span", "col_span", "text")
    payload: Dict = {name: getattr(self, name) for name in plain_fields}
    payload["cell_type"] = self.cell_type.value
    payload["bbox"] = self.bbox.to_list() if self.bbox else None
    payload["confidence"] = self.confidence
    return payload

TableOutput

Bases: BaseModel

Complete table extraction result.

Provides multiple export formats and utility methods for working with extracted table data.

Example
result = extractor.extract(table_image)

# Basic info
print(f"Table: {result.num_rows}x{result.num_cols}")

# Export to HTML
html = result.to_html()

# Export to Pandas
df = result.to_dataframe()

# Export to Markdown
md = result.to_markdown()

# Access specific cell
cell = result.get_cell(row=0, col=0)

cell_count property

cell_count: int

Number of cells in the table.

has_headers property

has_headers: bool

Check if table has header cells.

get_cell

get_cell(row: int, col: int) -> Optional[TableCell]

Get cell at specific position.

Handles merged cells by returning the cell that covers the position.

Source code in omnidocs/tasks/table_extraction/models.py
def get_cell(self, row: int, col: int) -> Optional[TableCell]:
    """
    Look up the cell covering the given grid position.

    A merged cell is returned for every position inside its span, not only
    for its top-left corner. The first matching cell wins; ``None`` is
    returned when no cell covers the position.
    """
    covering = (
        candidate
        for candidate in self.cells
        if candidate.row <= row < candidate.end_row and candidate.col <= col < candidate.end_col
    )
    return next(covering, None)

get_row

get_row(row: int) -> List[TableCell]

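Get all cells that start in a specific row. A merged cell that begins in an earlier row and only spans into this one is not included.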

Source code in omnidocs/tasks/table_extraction/models.py
def get_row(self, row: int) -> List[TableCell]:
    """Collect the cells whose starting row index equals ``row``."""
    matches = []
    for candidate in self.cells:
        if candidate.row == row:
            matches.append(candidate)
    return matches

get_column

get_column(col: int) -> List[TableCell]

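Get all cells that start in a specific column. A merged cell that begins in an earlier column and only spans into this one is not included.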

Source code in omnidocs/tasks/table_extraction/models.py
def get_column(self, col: int) -> List[TableCell]:
    """Collect the cells whose starting column index equals ``col``."""
    matches = []
    for candidate in self.cells:
        if candidate.col == col:
            matches.append(candidate)
    return matches

to_html

to_html(include_styles: bool = True) -> str

Convert table to HTML string.

PARAMETER DESCRIPTION
include_styles

Whether to include basic CSS styling

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
str

HTML table string

Example
html = result.to_html()
with open("table.html", "w") as f:
    f.write(html)
Source code in omnidocs/tasks/table_extraction/models.py
def to_html(self, include_styles: bool = True) -> str:
    """
    Render the table as an HTML ``<table>`` string.

    Merged cells are written once, at their top-left position, with
    ``rowspan``/``colspan`` attributes. Grid positions that no cell covers
    become empty ``<td></td>`` elements. Header cells use ``<th>``.

    Args:
        include_styles: Whether to include basic CSS styling

    Returns:
        HTML table string

    Example:
        ```python
        html = result.to_html()
        with open("table.html", "w") as f:
            f.write(html)
        ```
    """
    # Occupancy grid: each position points at the cell that covers it.
    occupancy: List[List[Optional["TableCell"]]] = [[None] * self.num_cols for _ in range(self.num_rows)]
    for entry in self.cells:
        for r in range(entry.row, entry.end_row):
            if r >= self.num_rows:
                continue  # span runs past the declared grid height
            for c in range(entry.col, entry.end_col):
                if c >= self.num_cols:
                    continue  # span runs past the declared grid width
                occupancy[r][c] = entry

    opening = '<table style="border-collapse: collapse; width: 100%;">' if include_styles else "<table>"
    out = [opening]

    # Origins (row, col) of cells already written, so a merged cell is
    # emitted only the first time the scan reaches it.
    emitted: set[Tuple[int, int]] = set()
    escapes = (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"))

    for grid_row in occupancy:
        out.append("  <tr>")

        for occupant in grid_row:
            if occupant is None:
                out.append("    <td></td>")
                continue

            origin = (occupant.row, occupant.col)
            if origin in emitted:
                continue
            emitted.add(origin)

            tag = "th" if occupant.is_header else "td"

            attr_parts = []
            if occupant.row_span > 1:
                attr_parts.append(f'rowspan="{occupant.row_span}"')
            if occupant.col_span > 1:
                attr_parts.append(f'colspan="{occupant.col_span}"')
            if include_styles:
                attr_parts.append('style="border: 1px solid #ddd; padding: 8px;"')
            rendered_attrs = "".join(f" {part}" for part in attr_parts)

            # "&" must be replaced first so the entities added for "<" and
            # ">" are not themselves re-escaped.
            content = occupant.text or ""
            for raw, entity in escapes:
                content = content.replace(raw, entity)

            out.append(f"    <{tag}{rendered_attrs}>{content}</{tag}>")

        out.append("  </tr>")

    out.append("</table>")
    return "\n".join(out)

to_dataframe

to_dataframe()

Convert table to Pandas DataFrame.

RETURNS DESCRIPTION

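pandas.DataFrame with table data. When every cell starting in row 0 is a column header and the table has more than one row, that row is used as the column labels; otherwise default integer column labels are used.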

RAISES DESCRIPTION
ImportError

If pandas is not installed

Example
df = result.to_dataframe()
print(df.head())
df.to_csv("table.csv")
Source code in omnidocs/tasks/table_extraction/models.py
def to_dataframe(self):
    """
    Convert table to Pandas DataFrame.

    Each cell's text is placed at the cell's top-left grid position;
    positions without a cell (including the remainder of a merged span)
    are left as ``None``. If every cell starting in row 0 is a column
    header and the table has more than one row, row 0 becomes the column
    labels and is removed from the data.

    Returns:
        pandas.DataFrame with table data

    Raises:
        ImportError: If pandas is not installed

    Example:
        ```python
        df = result.to_dataframe()
        print(df.head())
        df.to_csv("table.csv")
        ```
    """
    try:
        import pandas as pd
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays attached
        # to the friendlier message as its direct cause.
        raise ImportError("pandas is required for to_dataframe(). Install with: pip install pandas") from err

    # Build 2D array
    data: List[List[Optional[str]]] = [[None for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        # For merged cells, put value in top-left position
        if cell.row < self.num_rows and cell.col < self.num_cols:
            data[cell.row][cell.col] = cell.text

    # Row 0 is a header row only if it has cells and all of them are column headers.
    first_row_cells = self.get_row(0)
    use_header = bool(first_row_cells) and all(c.cell_type == CellType.COLUMN_HEADER for c in first_row_cells)

    # A header-only table (single row) is kept as data rather than turned
    # into an empty frame with labels.
    if use_header and self.num_rows > 1:
        return pd.DataFrame(data[1:], columns=data[0])
    return pd.DataFrame(data)

to_markdown

to_markdown() -> str

Convert table to Markdown format.

Note: Markdown tables don't support merged cells, so spans are ignored and only the top-left cell value is used.

RETURNS DESCRIPTION
str

Markdown table string

Source code in omnidocs/tasks/table_extraction/models.py
def to_markdown(self) -> str:
    """
    Convert table to Markdown format.

    Note: Markdown tables don't support merged cells, so spans
    are ignored and only the top-left cell value is used.

    Cell text is sanitized so it cannot break the table structure:
    pipe characters are escaped as ``\\|`` and line breaks are replaced
    with ``<br>``.

    Returns:
        Markdown table string, or an empty string when the table has no
        rows or no columns
    """
    if self.num_rows == 0 or self.num_cols == 0:
        return ""

    def _sanitize(text: Optional[str]) -> str:
        # A raw "|" would start a new column and a raw newline would end
        # the row, so both must be neutralized before joining cells.
        if not text:
            return ""
        return "<br>".join(text.splitlines()).replace("|", "\\|")

    # Build 2D grid; positions without a cell stay as empty strings.
    grid: List[List[str]] = [["" for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        if cell.row < self.num_rows and cell.col < self.num_cols:
            grid[cell.row][cell.col] = _sanitize(cell.text)

    def _format_row(values: List[str]) -> str:
        return "| " + " | ".join(values) + " |"

    # The first grid row is always used as the Markdown header row,
    # followed by the mandatory separator line and the remaining rows.
    lines = [_format_row(grid[0]), _format_row(["---"] * self.num_cols)]
    lines.extend(_format_row(row) for row in grid[1:])

    return "\n".join(lines)

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/table_extraction/models.py
def to_dict(self) -> Dict:
    """Serialize the table into a plain dictionary.

    Includes each cell's dictionary form, the grid and image dimensions,
    the model name, and an unstyled HTML rendering of the table.
    """
    serialized_cells = []
    for cell in self.cells:
        serialized_cells.append(cell.to_dict())

    return dict(
        cells=serialized_cells,
        num_rows=self.num_rows,
        num_cols=self.num_cols,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
        html=self.to_html(include_styles=False),
    )

save_json

save_json(file_path: Union[str, Path]) -> None

Save to JSON file.

Source code in omnidocs/tasks/table_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """Write the model to ``file_path`` as UTF-8 JSON indented by two spaces.

    Missing parent directories are created first.
    """
    target = Path(file_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = self.model_dump_json(indent=2)
    target.write_text(payload, encoding="utf-8")

load_json classmethod

load_json(file_path: Union[str, Path]) -> TableOutput

Load from JSON file.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "TableOutput":
    """Read ``file_path`` as UTF-8 text and validate it into a model instance."""
    source = Path(file_path)
    raw_json = source.read_text(encoding="utf-8")
    return cls.model_validate_json(raw_json)