Skip to content

Models

Pydantic models for reading order prediction.

Takes layout detection and OCR results and produces an ordered sequence of elements with caption and footnote associations.

Example
# Run layout detection and OCR on the same image
layout = layout_extractor.extract(image)
ocr = ocr_extractor.extract(image)

# Predict reading order from the layout and OCR results
reading_order = predictor.predict(layout, ocr)

# Walk the elements in the predicted reading order
for element in reading_order.ordered_elements:
    print(f"{element.index}: [{element.element_type}] {element.text[:50]}...")

# caption_map: element ID -> list of IDs of the captions associated with it
for fig_id, caption_ids in reading_order.caption_map.items():
    print(f"Figure {fig_id} has captions: {caption_ids}")

ElementType

Bases: str, Enum

Type of document element for reading order.

BoundingBox

Bases: BaseModel

Bounding box in pixel coordinates.

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/reading_order/models.py
def to_list(self) -> List[float]:
    """Return the box corners as a flat ``[x1, y1, x2, y2]`` list."""
    left, top, right, bottom = self.x1, self.y1, self.x2, self.y2
    return [left, top, right, bottom]

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/reading_order/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """
    Build an instance from a flat coordinate list.

    Args:
        coords: Exactly four values ordered as [x1, y1, x2, y2]

    Returns:
        New instance populated from the given coordinates

    Raises:
        ValueError: If ``coords`` does not contain exactly four values
    """
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    # Pair each field name with the value at the matching position
    field_names = ("x1", "y1", "x2", "y2")
    return cls(**{name: coords[pos] for pos, name in enumerate(field_names)})

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/reading_order/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Rescale this box from pixel coordinates to the normalized range.

    Horizontal coordinates are scaled by the image width and vertical
    coordinates by the image height (documented as a 0-1024 range).

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in the normalized range
    """

    def _rescale(value, extent):
        # Fraction of the image extent, scaled up to the normalized size
        return value / extent * NORMALIZED_SIZE

    return BoundingBox(
        x1=_rescale(self.x1, image_width),
        y1=_rescale(self.y1, image_height),
        x2=_rescale(self.x2, image_width),
        y2=_rescale(self.y2, image_height),
    )

OrderedElement

Bases: BaseModel

A document element with its reading order position.

Combines layout detection results with OCR text and assigns a reading order index.

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/reading_order/models.py
def to_dict(self) -> Dict:
    """
    Serialize this element into a plain dictionary.

    The element type is emitted as its enum value and the bounding box
    as the list returned by ``bbox.to_list()``.
    """
    return dict(
        index=self.index,
        element_type=self.element_type.value,
        bbox=self.bbox.to_list(),
        text=self.text,
        confidence=self.confidence,
        page_no=self.page_no,
        original_id=self.original_id,
    )

ReadingOrderOutput

Bases: BaseModel

Complete reading order prediction result.

Provides:

- Ordered list of document elements
- Caption-to-element associations
- Footnote-to-element associations
- Merge suggestions for split elements

Example
result = predictor.predict(layout, ocr)

# Concatenate the main-flow text in reading order
full_text = result.get_full_text()

# Keep only the elements of one type
tables = result.get_elements_by_type(ElementType.TABLE)

# Look up the caption elements associated with a figure via its original_id
captions = result.get_captions_for(figure_element.original_id)

element_count property

element_count: int

Total number of ordered elements.

get_full_text

get_full_text(separator: str = '\n\n') -> str

Get concatenated text in reading order.

Excludes page headers, footers, captions, and footnotes from main text flow.

Source code in omnidocs/tasks/reading_order/models.py
def get_full_text(self, separator: str = "\n\n") -> str:
    """
    Join the text of the main-flow elements in reading order.

    Page headers, page footers, captions, and footnotes are left out,
    as are elements whose text is empty.

    Args:
        separator: String placed between consecutive text pieces

    Returns:
        The concatenated text
    """
    skipped_types = (
        ElementType.PAGE_HEADER,
        ElementType.PAGE_FOOTER,
        ElementType.CAPTION,
        ElementType.FOOTNOTE,
    )
    pieces = []
    for element in self.ordered_elements:
        if element.element_type in skipped_types:
            continue
        if element.text:
            pieces.append(element.text)
    return separator.join(pieces)

get_elements_by_type

get_elements_by_type(
    element_type: ElementType,
) -> List[OrderedElement]

Filter elements by type.

Source code in omnidocs/tasks/reading_order/models.py
def get_elements_by_type(self, element_type: ElementType) -> List[OrderedElement]:
    """Return the elements whose type equals ``element_type``, keeping their order."""
    matches = []
    for candidate in self.ordered_elements:
        if candidate.element_type == element_type:
            matches.append(candidate)
    return matches

get_captions_for

get_captions_for(element_id: int) -> List[OrderedElement]

Get caption elements for a given element ID.

Source code in omnidocs/tasks/reading_order/models.py
def get_captions_for(self, element_id: int) -> List[OrderedElement]:
    """
    Return the caption elements associated with ``element_id``.

    Caption IDs are looked up in ``caption_map`` and matched against each
    element's ``original_id``; an unknown ``element_id`` yields an empty list.
    """
    wanted_ids = self.caption_map.get(element_id, [])
    captions = []
    for candidate in self.ordered_elements:
        if candidate.original_id in wanted_ids:
            captions.append(candidate)
    return captions

get_footnotes_for

get_footnotes_for(element_id: int) -> List[OrderedElement]

Get footnote elements for a given element ID.

Source code in omnidocs/tasks/reading_order/models.py
def get_footnotes_for(self, element_id: int) -> List[OrderedElement]:
    """
    Return the footnote elements associated with ``element_id``.

    Footnote IDs are looked up in ``footnote_map`` and matched against each
    element's ``original_id``; an unknown ``element_id`` yields an empty list.
    """
    wanted_ids = self.footnote_map.get(element_id, [])
    footnotes = []
    for candidate in self.ordered_elements:
        if candidate.original_id in wanted_ids:
            footnotes.append(candidate)
    return footnotes

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/reading_order/models.py
def to_dict(self) -> Dict:
    """
    Serialize the whole result into a plain dictionary.

    Each ordered element is converted with its own ``to_dict()``; the
    association maps and image dimensions are passed through unchanged.
    """
    serialized_elements = [element.to_dict() for element in self.ordered_elements]
    return dict(
        ordered_elements=serialized_elements,
        caption_map=self.caption_map,
        footnote_map=self.footnote_map,
        merge_map=self.merge_map,
        image_width=self.image_width,
        image_height=self.image_height,
        element_count=self.element_count,
    )

save_json

save_json(file_path: Union[str, Path]) -> None

Save to JSON file.

Source code in omnidocs/tasks/reading_order/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Write this object to ``file_path`` as indented JSON (UTF-8).

    Missing parent directories are created first.

    Args:
        file_path: Destination file, as a string or ``Path``
    """
    target = Path(file_path)
    # Create the destination directory tree if it is not there yet
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = self.model_dump_json(indent=2)
    target.write_text(payload, encoding="utf-8")

load_json classmethod

load_json(
    file_path: Union[str, Path],
) -> ReadingOrderOutput

Load from JSON file.

Source code in omnidocs/tasks/reading_order/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "ReadingOrderOutput":
    """
    Read a UTF-8 JSON file and validate it into an instance.

    Args:
        file_path: Source file, as a string or ``Path``

    Returns:
        Instance produced by ``cls.model_validate_json`` from the file contents
    """
    raw_json = Path(file_path).read_text(encoding="utf-8")
    return cls.model_validate_json(raw_json)