
Overview

OmniDocs Task Modules.

Each task module provides extractors for specific document processing tasks.

Available task modules
  • layout_extraction: Detect document structure (titles, tables, figures, etc.)
  • ocr_extraction: Extract text with bounding boxes from images
  • text_extraction: Convert document images to HTML/Markdown
  • table_extraction: Extract table structure and content
  • reading_order: Determine logical reading sequence of document elements

layout_extraction

Layout Extraction Module.

Provides extractors for detecting document layout elements such as titles, text blocks, figures, tables, formulas, and captions.

Available Extractors
  • DocLayoutYOLO: YOLO-based layout detector (fast, accurate)
  • RTDETRLayoutExtractor: Transformer-based detector (more categories)
  • QwenLayoutDetector: VLM-based detector with custom label support (multi-backend)
  • MinerUVLLayoutDetector: MinerU VL 1.2B layout detector (multi-backend)
Example
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig

extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
    print(f"{box.label.value}: {box.confidence:.2f}")

# VLM-based detection with custom labels
from omnidocs.tasks.layout_extraction import QwenLayoutDetector, CustomLabel
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

detector = QwenLayoutDetector(
    backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
)
result = detector.extract(image, custom_labels=["code_block", "sidebar"])
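
The results compose with the LayoutOutput helpers documented further down this page. A minimal end-to-end sketch (assuming a local file "page.png"; any PIL-loadable image works):

# Detect layout, save the detections, and render an overlay.
from PIL import Image
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig

image = Image.open("page.png")
extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig(device="cuda"))
result = extractor.extract(image)

result.save_json("page_layout.json")                    # serialize detections to JSON
result.visualize(image, output_path="page_layout.png")  # draw boxes on a copy of the page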

BaseLayoutExtractor

Bases: ABC

Abstract base class for layout extractors.

All layout extraction models must inherit from this class and implement the required methods.

Example
class MyLayoutExtractor(BaseLayoutExtractor):
    def __init__(self, config: MyConfig):
        self.config = config
        self._load_model()

    def _load_model(self):
        # Load model weights
        pass

    def extract(self, image):
        # Run extraction
        return LayoutOutput(...)

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image as:
  • PIL.Image.Image: PIL image object
  • np.ndarray: Numpy array (HWC format, RGB)
  • str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput containing detected layout boxes with standardized labels

RAISES DESCRIPTION
ValueError

If image format is not supported

RuntimeError

If model is not loaded or inference fails

Source code in omnidocs/tasks/layout_extraction/base.py
@abstractmethod
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file

    Returns:
        LayoutOutput containing detected layout boxes with standardized labels

    Raises:
        ValueError: If image format is not supported
        RuntimeError: If model is not loaded or inference fails
    """
    pass

batch_extract

batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[LayoutOutput]

Run layout extraction on multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching.

PARAMETER DESCRIPTION
images

List of images in any supported format

TYPE: List[Union[Image, ndarray, str, Path]]

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[LayoutOutput]

List of LayoutOutput in same order as input

Examples:

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images)
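
A sketch with the progress hook (assuming `extractor` and a `doc` exposing get_page/page_count, as in the example above):

def on_progress(current: int, total: int) -> None:
    print(f"page {current}/{total}")

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images, progress_callback=on_progress)
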
Source code in omnidocs/tasks/layout_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[LayoutOutput]:
    """
    Run layout extraction on multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching.

    Args:
        images: List of images in any supported format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of LayoutOutput in same order as input

    Examples:
        ```python
        images = [doc.get_page(i) for i in range(doc.page_count)]
        results = extractor.batch_extract(images)
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(image)
        results.append(result)

    return results

extract_document

extract_document(
    document: Document,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[LayoutOutput]

Run layout extraction on all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[LayoutOutput]

List of LayoutOutput, one per page

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
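
For instance, to persist one JSON file per page (the layout/ directory name is arbitrary; save_json creates missing parent directories):

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
for i, page_result in enumerate(results, start=1):
    page_result.save_json(f"layout/page_{i}.json")  # one file per page
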
Source code in omnidocs/tasks/layout_extraction/base.py
def extract_document(
    self,
    document: "Document",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[LayoutOutput]:
    """
    Run layout extraction on all pages of a document.

    Args:
        document: Document instance
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of LayoutOutput, one per page

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc)
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(page)
        results.append(result)

    return results

DocLayoutYOLO

DocLayoutYOLO(config: DocLayoutYOLOConfig)

Bases: BaseLayoutExtractor

DocLayout-YOLO layout extractor.

A YOLO-based model optimized for document layout detection. Detects: title, text, figure, table, formula, captions, etc.

This is a single-backend model (PyTorch only).

Example
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig

extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
    print(f"{box.label.value}: {box.confidence:.2f}")

Initialize DocLayout-YOLO extractor.

PARAMETER DESCRIPTION
config

Configuration object with device, model_path, etc.

TYPE: DocLayoutYOLOConfig

Source code in omnidocs/tasks/layout_extraction/doc_layout_yolo.py
def __init__(self, config: DocLayoutYOLOConfig):
    """
    Initialize DocLayout-YOLO extractor.

    Args:
        config: Configuration object with device, model_path, etc.
    """
    self.config = config
    self._model = None
    self._device = self._resolve_device(config.device)
    self._model_path = self._resolve_model_path(config.model_path)

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

Source code in omnidocs/tasks/layout_extraction/doc_layout_yolo.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        LayoutOutput with detected layout boxes
    """
    if self._model is None:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    img_width, img_height = pil_image.size

    # Run inference
    results = self._model.predict(
        pil_image,
        imgsz=self.config.img_size,
        conf=self.config.confidence,
        device=self._device,
    )

    result = results[0]

    # Parse detections
    layout_boxes = []

    if hasattr(result, "boxes") and result.boxes is not None:
        boxes = result.boxes

        for i in range(len(boxes)):
            # Get coordinates
            bbox_coords = boxes.xyxy[i].cpu().numpy().tolist()

            # Get class and confidence
            class_id = int(boxes.cls[i].item())
            confidence = float(boxes.conf[i].item())

            # Get original label from class names
            original_label = DOCLAYOUT_YOLO_CLASS_NAMES.get(class_id, f"class_{class_id}")

            # Map to standardized label
            standard_label = DOCLAYOUT_YOLO_MAPPING.to_standard(original_label)

            layout_boxes.append(
                LayoutBox(
                    label=standard_label,
                    bbox=BoundingBox.from_list(bbox_coords),
                    confidence=confidence,
                    class_id=class_id,
                    original_label=original_label,
                )
            )

    # Sort by y-coordinate (top to bottom reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=img_width,
        image_height=img_height,
        model_name="DocLayout-YOLO",
    )

DocLayoutYOLOConfig

Bases: BaseModel

Configuration for DocLayout-YOLO layout extractor.

This is a single-backend model (PyTorch only).

Example
config = DocLayoutYOLOConfig(device="cuda", confidence=0.3)
extractor = DocLayoutYOLO(config=config)

MinerUVLLayoutDetector

MinerUVLLayoutDetector(
    backend: MinerUVLLayoutBackendConfig,
)

Bases: BaseLayoutExtractor

MinerU VL layout detector.

Uses MinerU2.5-2509-1.2B for document layout detection. Detects 22+ element types including text, titles, tables, equations, figures, code, and more.

For full document extraction (layout + content), use MinerUVLTextExtractor from the text_extraction module instead.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

for box in result.bboxes:
    print(f"{box.label}: {box.confidence:.2f}")

Initialize MinerU VL layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API)

TYPE: MinerUVLLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def __init__(self, backend: MinerUVLLayoutBackendConfig):
    """
    Initialize MinerU VL layout detector.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API)
    """
    self.backend_config = backend
    self._client = None
    self._loaded = False
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Detect layout elements in the image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with standardized labels and bounding boxes

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
) -> LayoutOutput:
    """
    Detect layout elements in the image.

    Args:
        image: Input image (PIL Image, numpy array, or file path)

    Returns:
        LayoutOutput with standardized labels and bounding boxes
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Run layout detection
    blocks = self._detect_layout(pil_image)

    # Convert to LayoutOutput
    bboxes = []
    for block in blocks:
        # Convert normalized [0,1] to pixel coords
        x1, y1, x2, y2 = block.bbox
        pixel_bbox = BoundingBox(
            x1=x1 * width,
            y1=y1 * height,
            x2=x2 * width,
            y2=y2 * height,
        )

        # Map label
        label = MINERUVL_LABEL_MAPPING.get(block.type, LayoutLabel.UNKNOWN)

        bboxes.append(
            LayoutBox(
                label=label,
                bbox=pixel_bbox,
                confidence=1.0,  # MinerU VL doesn't output confidence
                original_label=block.type.value,
            )
        )

    return LayoutOutput(
        bboxes=bboxes,
        image_width=width,
        image_height=height,
        model_name="MinerU2.5-2509-1.2B",
    )

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right.

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.
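
A quick sketch of the derived properties, assuming the usual definitions (width = x2 - x1, height = y2 - y1, center = midpoint) and importing directly from the models module:

from omnidocs.tasks.layout_extraction.models import BoundingBox

bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
print(bbox.width, bbox.height)  # 400.0 250.0
print(bbox.area)                # 100000.0
print(bbox.center)              # (300.0, 175.0)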

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)

to_xywh

to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Example
bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
normalized = bbox.to_normalized(1000, 800)
# x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
Source code in omnidocs/tasks/layout_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range

    Example:
        ```python
        bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
        normalized = bbox.to_normalized(1000, 800)
        # x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
        ```
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

to_absolute

to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates
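
A round-trip sketch showing that to_absolute inverts to_normalized for the same image size:

from omnidocs.tasks.layout_extraction.models import BoundingBox

bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
normalized = bbox.to_normalized(1000, 800)    # scaled onto the virtual 1024x1024 canvas
restored = normalized.to_absolute(1000, 800)  # back to pixel coordinates
print(restored.to_list())                     # [100.0, 50.0, 500.0, 300.0]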

Source code in omnidocs/tasks/layout_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )

CustomLabel

Bases: BaseModel

Type-safe custom layout label definition for VLM-based models.

VLM models like Qwen3-VL support flexible custom labels beyond the standard LayoutLabel enum. Use this class to define custom labels with validation.

Example
from omnidocs.tasks.layout_extraction import CustomLabel

# Simple custom label
code_block = CustomLabel(name="code_block")

# With metadata
sidebar = CustomLabel(
    name="sidebar",
    description="Secondary content panel",
    color="#9B59B6",
)

# Use with QwenLayoutDetector
result = detector.extract(image, custom_labels=[code_block, sidebar])

LabelMapping

LabelMapping(mapping: Dict[str, LayoutLabel])

Base class for model-specific label mappings.

Each model maps its native labels to standardized LayoutLabel values.

Initialize label mapping.

PARAMETER DESCRIPTION
mapping

Dict mapping model-specific labels to LayoutLabel enum values

TYPE: Dict[str, LayoutLabel]

Source code in omnidocs/tasks/layout_extraction/models.py
def __init__(self, mapping: Dict[str, LayoutLabel]):
    """
    Initialize label mapping.

    Args:
        mapping: Dict mapping model-specific labels to LayoutLabel enum values
    """
    self._mapping = {k.lower(): v for k, v in mapping.items()}
    self._reverse_mapping = {v: k for k, v in mapping.items()}

supported_labels property

supported_labels: List[str]

Get list of supported model-specific labels.

standard_labels property

standard_labels: List[LayoutLabel]

Get list of standard labels this mapping produces.

to_standard

to_standard(model_label: str) -> LayoutLabel

Convert model-specific label to standardized LayoutLabel.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_standard(self, model_label: str) -> LayoutLabel:
    """Convert model-specific label to standardized LayoutLabel."""
    return self._mapping.get(model_label.lower(), LayoutLabel.UNKNOWN)

from_standard

from_standard(standard_label: LayoutLabel) -> Optional[str]

Convert standardized LayoutLabel to model-specific label.

Source code in omnidocs/tasks/layout_extraction/models.py
def from_standard(self, standard_label: LayoutLabel) -> Optional[str]:
    """Convert standardized LayoutLabel to model-specific label."""
    return self._reverse_mapping.get(standard_label)
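
A small sketch of a custom mapping. It assumes LayoutLabel.TITLE and LayoutLabel.TABLE are members of the enum and that both classes are importable from the models module; LayoutLabel.UNKNOWN is the documented fallback for unmapped labels:

from omnidocs.tasks.layout_extraction.models import LabelMapping, LayoutLabel

mapping = LabelMapping({"doc_title": LayoutLabel.TITLE, "data_table": LayoutLabel.TABLE})
print(mapping.to_standard("Doc_Title"))          # lookup is case-insensitive -> LayoutLabel.TITLE
print(mapping.to_standard("footnote"))           # unmapped -> LayoutLabel.UNKNOWN
print(mapping.from_standard(LayoutLabel.TABLE))  # -> "data_table"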

LayoutBox

Bases: BaseModel

Single detected layout element with label, bounding box, and confidence.

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "label": self.label.value,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "class_id": self.class_id,
        "original_label": self.original_label,
    }

get_normalized_bbox

get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

LayoutLabel

Bases: str, Enum

Standardized layout labels used across all layout extractors.

These provide a consistent vocabulary regardless of which model is used.
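
Because the enum subclasses str, members behave like plain strings. A minimal sketch (import path assumed to be the models module):

from omnidocs.tasks.layout_extraction.models import LayoutLabel

for label in LayoutLabel:           # enumerate the standardized vocabulary
    print(label.name, label.value)

print(isinstance(LayoutLabel.UNKNOWN, str))  # True, thanks to the str base class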

LayoutOutput

Bases: BaseModel

Complete layout extraction results for a single image.

element_count property

element_count: int

Number of detected elements.

labels_found property

labels_found: List[str]

Unique labels found in detections.

filter_by_label

filter_by_label(label: LayoutLabel) -> List[LayoutBox]

Filter boxes by label.

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_label(self, label: LayoutLabel) -> List[LayoutBox]:
    """Filter boxes by label."""
    return [box for box in self.bboxes if box.label == label]

filter_by_confidence

filter_by_confidence(
    min_confidence: float,
) -> List[LayoutBox]

Filter boxes by minimum confidence.
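
A short sketch combining the two filters (assumes `result` from an earlier extract() call and that LayoutLabel.TABLE is a member of the enum):

tables = result.filter_by_label(LayoutLabel.TABLE)  # only table boxes
confident = result.filter_by_confidence(0.5)        # drop low-confidence boxes
print(f"{len(tables)} tables, {len(confident)}/{result.element_count} boxes above 0.5")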

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[LayoutBox]:
    """Filter boxes by minimum confidence."""
    return [box for box in self.bboxes if box.confidence >= min_confidence]

to_dict

to_dict() -> Dict

Convert to dictionary representation.
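
The returned dict is JSON-serializable, so it can be logged or dumped directly; a minimal sketch assuming `result` from an earlier extract() call:

import json

print(json.dumps(result.to_dict(), indent=2))  # bboxes, image size, model name, counts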

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "bboxes": [box.to_dict() for box in self.bboxes],
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "element_count": self.element_count,
        "labels_found": self.labels_found,
    }

sort_by_position

sort_by_position(
    top_to_bottom: bool = True,
) -> LayoutOutput

Return a new LayoutOutput with boxes sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

Source code in omnidocs/tasks/layout_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "LayoutOutput":
    """
    Return a new LayoutOutput with boxes sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)
    """
    sorted_boxes = sorted(self.bboxes, key=lambda b: (b.bbox.y1, b.bbox.x1), reverse=not top_to_bottom)
    return LayoutOutput(
        bboxes=sorted_boxes,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
    )

get_normalized_bboxes

get_normalized_bboxes() -> List[Dict]

Get all bounding boxes in normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.

Example
result = extractor.extract(image)
normalized = result.get_normalized_bboxes()
for box in normalized:
    print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bboxes(self) -> List[Dict]:
    """
    Get all bounding boxes in normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.

    Example:
        ```python
        result = extractor.extract(image)
        normalized = result.get_normalized_bboxes()
        for box in normalized:
                print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
        ```
    """
    normalized = []
    for box in self.bboxes:
        norm_bbox = box.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "label": box.label.value,
                "bbox": norm_bbox.to_list(),
                "confidence": box.confidence,
                "class_id": box.class_id,
                "original_label": box.original_label,
            }
        )
    return normalized

visualize

visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> Image.Image

Visualize layout detection results on the image.

Draws bounding boxes with labels and confidence scores on the image. Each layout category has a distinct color for easy identification.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_labels

Whether to show label text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: True

line_width

Width of bounding box lines

TYPE: int DEFAULT: 3

font_size

Size of label text (note: uses default font)

TYPE: int DEFAULT: 12

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = extractor.extract(image)
viz = result.visualize(image, output_path="layout_viz.png")
viz.show()  # Display in notebook/viewer
Source code in omnidocs/tasks/layout_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> "Image.Image":
    """
    Visualize layout detection results on the image.

    Draws bounding boxes with labels and confidence scores on the image.
    Each layout category has a distinct color for easy identification.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_labels: Whether to show label text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        font_size: Size of label text (note: uses default font)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = extractor.extract(image)
        viz = result.visualize(image, output_path="layout_viz.png")
        viz.show()  # Display in notebook/viewer
        ```
    """
    from PIL import ImageDraw

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    for box in self.bboxes:
        # Get color for this label
        color = LABEL_COLORS.get(box.label, "#95A5A6")

        # Draw bounding box
        coords = box.bbox.to_xyxy()
        draw.rectangle(coords, outline=color, width=line_width)

        # Build label text
        if show_labels or show_confidence:
            label_parts = []
            if show_labels:
                label_parts.append(box.label.value)
            if show_confidence:
                label_parts.append(f"{box.confidence:.2f}")
            label_text = " ".join(label_parts)

            # Draw label background
            text_bbox = draw.textbbox((coords[0], coords[1] - 20), label_text)
            draw.rectangle(text_bbox, fill=color)

            # Draw label text
            draw.text(
                (coords[0], coords[1] - 20),
                label_text,
                fill="white",
            )

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image

load_json classmethod

load_json(file_path: Union[str, Path]) -> LayoutOutput

Load a LayoutOutput instance from a JSON file.

Reads a JSON file and deserializes its contents into a LayoutOutput object. Uses Pydantic's model_validate_json for proper handling of nested objects.

PARAMETER DESCRIPTION
file_path

Path to JSON file containing serialized LayoutOutput data. Can be string or pathlib.Path object.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
LayoutOutput

Deserialized layout output instance from file.

TYPE: LayoutOutput

RAISES DESCRIPTION
FileNotFoundError

If the specified file does not exist.

UnicodeDecodeError

If file cannot be decoded as UTF-8.

ValueError

If file contents are not valid JSON.

ValidationError

If JSON data doesn't match LayoutOutput schema.

Example

output = LayoutOutput.load_json('layout_results.json')
print(f"Found {output.element_count} elements")
# Output: Found 5 elements

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "LayoutOutput":
    """
    Load a LayoutOutput instance from a JSON file.

    Reads a JSON file and deserializes its contents into a LayoutOutput object.
    Uses Pydantic's model_validate_json for proper handling of nested objects.

    Args:
        file_path: Path to JSON file containing serialized LayoutOutput data.
                  Can be string or pathlib.Path object.

    Returns:
        LayoutOutput: Deserialized layout output instance from file.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        UnicodeDecodeError: If file cannot be decoded as UTF-8.
        ValueError: If file contents are not valid JSON.
        ValidationError: If JSON data doesn't match LayoutOutput schema.

    Example:
        ```python
        output = LayoutOutput.load_json('layout_results.json')
        print(f"Found {output.element_count} elements")
        ```
        Found 5 elements
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

save_json

save_json(file_path: Union[str, Path]) -> None

Save LayoutOutput instance to a JSON file.

Serializes the LayoutOutput object to JSON and writes it to a file. Automatically creates parent directories if they don't exist. Uses UTF-8 encoding for compatibility and proper handling of special characters.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved. Can be string or pathlib.Path object. Parent directories will be created if they don't exist.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
None

None

RAISES DESCRIPTION
OSError

If file cannot be written due to permission or disk errors.

TypeError

If file_path is not a string or Path object.

Example
output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
output.save_json('results/layout_output.json')
# File is created at results/layout_output.json
# Parent 'results' directory is created if it didn't exist
Source code in omnidocs/tasks/layout_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save LayoutOutput instance to a JSON file.

    Serializes the LayoutOutput object to JSON and writes it to a file.
    Automatically creates parent directories if they don't exist. Uses UTF-8
    encoding for compatibility and proper handling of special characters.

    Args:
        file_path: Path where JSON file should be saved. Can be string or
                  pathlib.Path object. Parent directories will be created
                  if they don't exist.

    Returns:
        None

    Raises:
        OSError: If file cannot be written due to permission or disk errors.
        TypeError: If file_path is not a string or Path object.

    Example:
        ```python
        output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
        output.save_json('results/layout_output.json')
        # File is created at results/layout_output.json
        # Parent 'results' directory is created if it didn't exist
        ```
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(), encoding="utf-8")

QwenLayoutDetector

QwenLayoutDetector(backend: QwenLayoutBackendConfig)

Bases: BaseLayoutExtractor

Qwen3-VL Vision-Language Model layout detector.

A flexible VLM-based layout detector that supports custom labels. Unlike fixed-label models (DocLayoutYOLO, RT-DETR), Qwen can detect any document elements specified at runtime.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.layout_extraction import QwenLayoutDetector, CustomLabel
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

# Initialize with PyTorch backend
detector = QwenLayoutDetector(
    backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
)

# Basic extraction with default labels
result = detector.extract(image)

# With custom labels (strings)
result = detector.extract(image, custom_labels=["code_block", "sidebar"])

# With typed custom labels
labels = [
    CustomLabel(name="code_block", color="#E74C3C"),
    CustomLabel(name="sidebar", description="Side panel content"),
]
result = detector.extract(image, custom_labels=labels)
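
QwenLayoutDetector also inherits batch_extract from BaseLayoutExtractor, so one detector can sweep many pages. Note that the default batch_extract calls extract() without custom_labels, so per-call custom labels are not applied there. A sketch with hypothetical file paths:

pages = ["page_001.png", "page_002.png"]
results = detector.batch_extract(pages)
for i, page_result in enumerate(results, start=1):
    print(f"page {i}: {page_result.element_count} elements")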

Initialize Qwen layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration. One of:
  • QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
  • QwenLayoutVLLMConfig: VLLM high-throughput backend
  • QwenLayoutMLXConfig: MLX backend for Apple Silicon
  • QwenLayoutAPIConfig: API backend (OpenRouter, etc.)

TYPE: QwenLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def __init__(self, backend: QwenLayoutBackendConfig):
    """
    Initialize Qwen layout detector.

    Args:
        backend: Backend configuration. One of:
            - QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
            - QwenLayoutVLLMConfig: VLLM high-throughput backend
            - QwenLayoutMLXConfig: MLX backend for Apple Silicon
            - QwenLayoutAPIConfig: API backend (OpenRouter, etc.)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image as:
  • PIL.Image.Image: PIL image object
  • np.ndarray: Numpy array (HWC format, RGB)
  • str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be:
  • None: Use default labels (title, text, table, figure, etc.)
  • List[str]: Simple label names ["code_block", "sidebar"]
  • List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format is not supported

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata

    Returns:
        LayoutOutput with detected layout boxes

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build prompt
    prompt = self._build_detection_prompt(label_names)

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "QwenLayoutPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image, prompt)
    elif config_type == "QwenLayoutVLLMConfig":
        raw_output = self._infer_vllm(pil_image, prompt)
    elif config_type == "QwenLayoutMLXConfig":
        raw_output = self._infer_mlx(pil_image, prompt)
    elif config_type == "QwenLayoutAPIConfig":
        raw_output = self._infer_api(pil_image, prompt)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Parse detections
    detections = self._parse_json_output(raw_output)

    # Convert to LayoutOutput
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by position (reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"Qwen3-VL ({type(self.backend_config).__name__})",
    )

RTDETRConfig

Bases: BaseModel

Configuration for RT-DETR layout extractor.

This is a single-backend model (PyTorch/Transformers only).

Example
config = RTDETRConfig(device="cuda", confidence=0.4)
extractor = RTDETRLayoutExtractor(config=config)

RTDETRLayoutExtractor

RTDETRLayoutExtractor(config: RTDETRConfig)

Bases: BaseLayoutExtractor

RT-DETR layout extractor using HuggingFace Transformers.

A transformer-based real-time detection model for document layout. Detects: title, text, table, figure, list, formula, captions, headers, footers.

This is a single-backend model (PyTorch/Transformers only).

Example
from omnidocs.tasks.layout_extraction import RTDETRLayoutExtractor, RTDETRConfig

extractor = RTDETRLayoutExtractor(config=RTDETRConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
    print(f"{box.label.value}: {box.confidence:.2f}")

Initialize RT-DETR layout extractor.

PARAMETER DESCRIPTION
config

Configuration object with device, model settings, etc.

TYPE: RTDETRConfig

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def __init__(self, config: RTDETRConfig):
    """
    Initialize RT-DETR layout extractor.

    Args:
        config: Configuration object with device, model settings, etc.
    """
    self.config = config
    self._model = None
    self._processor = None
    self._device = self._resolve_device(config.device)
    self._model_path = self._resolve_model_path(config.model_path)

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        LayoutOutput with detected layout boxes
    """
    import torch

    if self._model is None or self._processor is None:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    img_width, img_height = pil_image.size

    # Preprocess
    inputs = self._processor(
        images=pil_image,
        return_tensors="pt",
        size={"height": self.config.image_size, "width": self.config.image_size},
    )

    # Move to device
    inputs = {k: v.to(self._device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    # Run inference
    with torch.no_grad():
        outputs = self._model(**inputs)

    # Post-process results
    target_sizes = torch.tensor([[img_height, img_width]])
    results = self._processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.config.confidence,
    )[0]

    # Parse detections
    layout_boxes = []

    for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
        confidence = float(score.item())
        class_id = int(label_id.item())

        # Get original label from model config
        # Note: The model outputs 0-indexed class IDs, but id2label has background at index 0,
        # so we add 1 to map correctly (e.g., model output 8 -> id2label[9] = "Table")
        original_label = self._model.config.id2label.get(class_id + 1, f"class_{class_id}")

        # Map to standardized label
        standard_label = RTDETR_MAPPING.to_standard(original_label)

        # Box coordinates
        box_coords = box.cpu().tolist()

        layout_boxes.append(
            LayoutBox(
                label=standard_label,
                bbox=BoundingBox.from_list(box_coords),
                confidence=confidence,
                class_id=class_id,
                original_label=original_label,
            )
        )

    # Sort by y-coordinate (top to bottom reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=img_width,
        image_height=img_height,
        model_name="RT-DETR (docling-layout)",
    )

VLMLayoutDetector

VLMLayoutDetector(config: VLMAPIConfig)

Bases: BaseLayoutExtractor

Provider-agnostic VLM layout detector using litellm.

Works with any cloud VLM API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc. Supports custom labels for flexible detection.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)

# Default labels
result = detector.extract("document.png")

# Custom labels
result = detector.extract("document.png", custom_labels=["code_block", "sidebar"])

Initialize VLM layout detector.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/layout_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM layout detector.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be:
  • None: Use default labels (title, text, table, figure, etc.)
  • List[str]: Simple label names ["code_block", "sidebar"]
  • List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

prompt

Custom prompt. If None, builds a default detection prompt.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes.

Source code in omnidocs/tasks/layout_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata
        prompt: Custom prompt. If None, builds a default detection prompt.

    Returns:
        LayoutOutput with detected layout boxes.
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build or use custom prompt
    final_prompt = prompt or _build_layout_prompt(label_names)

    raw_output = vlm_completion(self.config, final_prompt, pil_image)
    detections = _parse_layout_response(raw_output, (width, height))
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by reading order
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )

base

Base class for layout extractors.

Defines the abstract interface that all layout extractors must implement.


doc_layout_yolo

DocLayout-YOLO layout extractor.

A YOLO-based model for document layout detection, optimized for academic papers and technical documents.

Model: juliozhao/DocLayout-YOLO-DocStructBench


mineruvl

MinerU VL layout detection module.

MinerU VL can be used for standalone layout detection, returning detected regions with types and bounding boxes.

For full document extraction (layout + content), use MinerUVLTextExtractor from the text_extraction module instead.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

for box in result.bboxes:
    print(f"{box.label}: {box.confidence:.2f}")

MinerUVLLayoutAPIConfig

Bases: BaseModel

API backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutAPIConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutAPIConfig(
        server_url="https://your-server.modal.run"
    )
)
result = detector.extract(image)


MinerUVLLayoutMLXConfig

Bases: BaseModel

MLX backend config for MinerU VL layout detection on Apple Silicon.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutMLXConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutMLXConfig()
)
result = detector.extract(image)

MinerUVLLayoutPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

MinerUVLLayoutVLLMConfig

Bases: BaseModel

VLLM backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutVLLMConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutVLLMConfig(tensor_parallel_size=1)
)
result = detector.extract(image)

api

API backend configuration for MinerU VL layout detection.

detector

MinerU VL layout detector.

Uses MinerU2.5-2509-1.2B for document layout detection. Detects 22+ element types including text, titles, tables, equations, figures, code, and more.

mlx

MLX backend configuration for MinerU VL layout detection (Apple Silicon).

pytorch

PyTorch backend configuration for MinerU VL layout detection.

vllm

VLLM backend configuration for MinerU VL layout detection.

models

Pydantic models for layout extraction outputs.

Defines standardized output types and label enums for layout detection.

Coordinate Systems
  • Absolute (default): Coordinates in pixels relative to original image size
  • Normalized (0-1024): Coordinates scaled to 0-1024 range (virtual 1024x1024 canvas)

Use bbox.to_normalized(width, height) or output.get_normalized_bboxes() to convert to normalized coordinates.

Example
result = extractor.extract(image)  # Returns absolute pixel coordinates
normalized = result.get_normalized_bboxes()  # Returns 0-1024 normalized coords

LayoutLabel

Bases: str, Enum

Standardized layout labels used across all layout extractors.

These provide a consistent vocabulary regardless of which model is used.
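For orientation, a minimal sketch of working with the enum; the import path and the TABLE member are assumptions here, and result is a LayoutOutput from any extractor:

from omnidocs.tasks.layout_extraction.models import LayoutLabel

print([label.value for label in LayoutLabel])  # every standardized label string

# LayoutLabel is a str enum, so members also compare equal to their string values
tables = [box for box in result.bboxes if box.label == LayoutLabel.TABLE]  # TABLE assumed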

CustomLabel

Bases: BaseModel

Type-safe custom layout label definition for VLM-based models.

VLM models like Qwen3-VL support flexible custom labels beyond the standard LayoutLabel enum. Use this class to define custom labels with validation.

Example
from omnidocs.tasks.layout_extraction import CustomLabel

# Simple custom label
code_block = CustomLabel(name="code_block")

# With metadata
sidebar = CustomLabel(
        name="sidebar",
        description="Secondary content panel",
        color="#9B59B6",
    )

# Use with QwenLayoutDetector
result = detector.extract(image, custom_labels=[code_block, sidebar])

LabelMapping

LabelMapping(mapping: Dict[str, LayoutLabel])

Base class for model-specific label mappings.

Each model maps its native labels to standardized LayoutLabel values.

Initialize label mapping.

PARAMETER DESCRIPTION
mapping

Dict mapping model-specific labels to LayoutLabel enum values

TYPE: Dict[str, LayoutLabel]

Source code in omnidocs/tasks/layout_extraction/models.py
def __init__(self, mapping: Dict[str, LayoutLabel]):
    """
    Initialize label mapping.

    Args:
        mapping: Dict mapping model-specific labels to LayoutLabel enum values
    """
    self._mapping = {k.lower(): v for k, v in mapping.items()}
    self._reverse_mapping = {v: k for k, v in mapping.items()}
supported_labels property
supported_labels: List[str]

Get list of supported model-specific labels.

standard_labels property
standard_labels: List[LayoutLabel]

Get list of standard labels this mapping produces.

to_standard
to_standard(model_label: str) -> LayoutLabel

Convert model-specific label to standardized LayoutLabel.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_standard(self, model_label: str) -> LayoutLabel:
    """Convert model-specific label to standardized LayoutLabel."""
    return self._mapping.get(model_label.lower(), LayoutLabel.UNKNOWN)
from_standard
from_standard(standard_label: LayoutLabel) -> Optional[str]

Convert standardized LayoutLabel to model-specific label.

Source code in omnidocs/tasks/layout_extraction/models.py
def from_standard(self, standard_label: LayoutLabel) -> Optional[str]:
    """Convert standardized LayoutLabel to model-specific label."""
    return self._reverse_mapping.get(standard_label)
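As an illustration, a small hand-built mapping; the model-specific labels are made up and the TITLE/TEXT members are assumed:

from omnidocs.tasks.layout_extraction.models import LabelMapping, LayoutLabel

# Hypothetical model with two native labels
mapping = LabelMapping({"Heading": LayoutLabel.TITLE, "Paragraph": LayoutLabel.TEXT})

mapping.to_standard("heading")            # lookup is case-insensitive -> LayoutLabel.TITLE
mapping.to_standard("Footnote")           # unmapped labels fall back to LayoutLabel.UNKNOWN
mapping.from_standard(LayoutLabel.TEXT)   # -> "Paragraph"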

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right.

width property
width: float

Width of the bounding box.

height property
height: float

Height of the bounding box.

area property
area: float

Area of the bounding box.

center property
center: Tuple[float, float]

Center point of the bounding box.

to_list
to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]
to_xyxy
to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)
to_xywh
to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)
from_list classmethod
from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])
to_normalized
to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Example
bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
normalized = bbox.to_normalized(1000, 800)
# x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
Source code in omnidocs/tasks/layout_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range

    Example:
        ```python
        bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
        normalized = bbox.to_normalized(1000, 800)
        # x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
        ```
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )
to_absolute
to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates

Source code in omnidocs/tasks/layout_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )

LayoutBox

Bases: BaseModel

Single detected layout element with label, bounding box, and confidence.

to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "label": self.label.value,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "class_id": self.class_id,
        "original_label": self.original_label,
    }
get_normalized_bbox
get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

LayoutOutput

Bases: BaseModel

Complete layout extraction results for a single image.

element_count property
element_count: int

Number of detected elements.

labels_found property
labels_found: List[str]

Unique labels found in detections.

filter_by_label
filter_by_label(label: LayoutLabel) -> List[LayoutBox]

Filter boxes by label.

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_label(self, label: LayoutLabel) -> List[LayoutBox]:
    """Filter boxes by label."""
    return [box for box in self.bboxes if box.label == label]
filter_by_confidence
filter_by_confidence(
    min_confidence: float,
) -> List[LayoutBox]

Filter boxes by minimum confidence.

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[LayoutBox]:
    """Filter boxes by minimum confidence."""
    return [box for box in self.bboxes if box.confidence >= min_confidence]
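For example, a brief sketch combining both filters on an extraction result; result comes from any extractor above, and the TABLE member is assumed:

confident = result.filter_by_confidence(0.5)        # keep boxes with confidence >= 0.5
tables = result.filter_by_label(LayoutLabel.TABLE)  # TABLE assumed to be a member
print(f"{len(confident)} confident boxes, {len(tables)} tables out of {result.element_count}")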
to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "bboxes": [box.to_dict() for box in self.bboxes],
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "element_count": self.element_count,
        "labels_found": self.labels_found,
    }
sort_by_position
sort_by_position(
    top_to_bottom: bool = True,
) -> LayoutOutput

Return a new LayoutOutput with boxes sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

Source code in omnidocs/tasks/layout_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "LayoutOutput":
    """
    Return a new LayoutOutput with boxes sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)
    """
    sorted_boxes = sorted(self.bboxes, key=lambda b: (b.bbox.y1, b.bbox.x1), reverse=not top_to_bottom)
    return LayoutOutput(
        bboxes=sorted_boxes,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
    )
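A brief sketch; the method returns a new, sorted LayoutOutput and leaves the original untouched:

ordered = result.sort_by_position()                        # top-to-bottom, left-to-right
bottom_up = result.sort_by_position(top_to_bottom=False)   # reverse order
print([box.label.value for box in ordered.bboxes])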
get_normalized_bboxes
get_normalized_bboxes() -> List[Dict]

Get all bounding boxes in normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.

Example
result = extractor.extract(image)
normalized = result.get_normalized_bboxes()
for box in normalized:
        print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bboxes(self) -> List[Dict]:
    """
    Get all bounding boxes in normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.

    Example:
        ```python
        result = extractor.extract(image)
        normalized = result.get_normalized_bboxes()
        for box in normalized:
                print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
        ```
    """
    normalized = []
    for box in self.bboxes:
        norm_bbox = box.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "label": box.label.value,
                "bbox": norm_bbox.to_list(),
                "confidence": box.confidence,
                "class_id": box.class_id,
                "original_label": box.original_label,
            }
        )
    return normalized
visualize
visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> Image.Image

Visualize layout detection results on the image.

Draws bounding boxes with labels and confidence scores on the image. Each layout category has a distinct color for easy identification.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_labels

Whether to show label text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: True

line_width

Width of bounding box lines

TYPE: int DEFAULT: 3

font_size

Size of label text (note: uses default font)

TYPE: int DEFAULT: 12

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = extractor.extract(image)
viz = result.visualize(image, output_path="layout_viz.png")
viz.show()  # Display in notebook/viewer
Source code in omnidocs/tasks/layout_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> "Image.Image":
    """
    Visualize layout detection results on the image.

    Draws bounding boxes with labels and confidence scores on the image.
    Each layout category has a distinct color for easy identification.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_labels: Whether to show label text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        font_size: Size of label text (note: uses default font)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = extractor.extract(image)
        viz = result.visualize(image, output_path="layout_viz.png")
        viz.show()  # Display in notebook/viewer
        ```
    """
    from PIL import ImageDraw

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    for box in self.bboxes:
        # Get color for this label
        color = LABEL_COLORS.get(box.label, "#95A5A6")

        # Draw bounding box
        coords = box.bbox.to_xyxy()
        draw.rectangle(coords, outline=color, width=line_width)

        # Build label text
        if show_labels or show_confidence:
            label_parts = []
            if show_labels:
                label_parts.append(box.label.value)
            if show_confidence:
                label_parts.append(f"{box.confidence:.2f}")
            label_text = " ".join(label_parts)

            # Draw label background
            text_bbox = draw.textbbox((coords[0], coords[1] - 20), label_text)
            draw.rectangle(text_bbox, fill=color)

            # Draw label text
            draw.text(
                (coords[0], coords[1] - 20),
                label_text,
                fill="white",
            )

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image
load_json classmethod
load_json(file_path: Union[str, Path]) -> LayoutOutput

Load a LayoutOutput instance from a JSON file.

Reads a JSON file and deserializes its contents into a LayoutOutput object. Uses Pydantic's model_validate_json for proper handling of nested objects.

PARAMETER DESCRIPTION
file_path

Path to JSON file containing serialized LayoutOutput data. Can be string or pathlib.Path object.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
LayoutOutput

Deserialized layout output instance from file.

TYPE: LayoutOutput

RAISES DESCRIPTION
FileNotFoundError

If the specified file does not exist.

UnicodeDecodeError

If file cannot be decoded as UTF-8.

ValueError

If file contents are not valid JSON.

ValidationError

If JSON data doesn't match LayoutOutput schema.

Example

output = LayoutOutput.load_json('layout_results.json')
print(f"Found {output.element_count} elements")
# Prints: Found 5 elements

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "LayoutOutput":
    """
    Load a LayoutOutput instance from a JSON file.

    Reads a JSON file and deserializes its contents into a LayoutOutput object.
    Uses Pydantic's model_validate_json for proper handling of nested objects.

    Args:
        file_path: Path to JSON file containing serialized LayoutOutput data.
                  Can be string or pathlib.Path object.

    Returns:
        LayoutOutput: Deserialized layout output instance from file.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        UnicodeDecodeError: If file cannot be decoded as UTF-8.
        ValueError: If file contents are not valid JSON.
        ValidationError: If JSON data doesn't match LayoutOutput schema.

    Example:
        ```python
        output = LayoutOutput.load_json('layout_results.json')
        print(f"Found {output.element_count} elements")
        # Prints: Found 5 elements
        ```
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))
save_json
save_json(file_path: Union[str, Path]) -> None

Save LayoutOutput instance to a JSON file.

Serializes the LayoutOutput object to JSON and writes it to a file. Automatically creates parent directories if they don't exist. Uses UTF-8 encoding for compatibility and proper handling of special characters.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved. Can be string or pathlib.Path object. Parent directories will be created if they don't exist.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
None

None

RAISES DESCRIPTION
OSError

If file cannot be written due to permission or disk errors.

TypeError

If file_path is not a string or Path object.

Example
output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
output.save_json('results/layout_output.json')
# File is created at results/layout_output.json
# Parent 'results' directory is created if it didn't exist
Source code in omnidocs/tasks/layout_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save LayoutOutput instance to a JSON file.

    Serializes the LayoutOutput object to JSON and writes it to a file.
    Automatically creates parent directories if they don't exist. Uses UTF-8
    encoding for compatibility and proper handling of special characters.

    Args:
        file_path: Path where JSON file should be saved. Can be string or
                  pathlib.Path object. Parent directories will be created
                  if they don't exist.

    Returns:
        None

    Raises:
        OSError: If file cannot be written due to permission or disk errors.
        TypeError: If file_path is not a string or Path object.

    Example:
        ```python
        output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
        output.save_json('results/layout_output.json')
        # File is created at results/layout_output.json
        # Parent 'results' directory is created if it didn't exist
        ```
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(), encoding="utf-8")

qwen

Qwen3-VL backend configurations and detector for layout detection.

Available backends
  • QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
  • QwenLayoutVLLMConfig: VLLM high-throughput backend
  • QwenLayoutMLXConfig: MLX backend for Apple Silicon
  • QwenLayoutAPIConfig: API backend (OpenRouter, etc.)
Example
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig
config = QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")

QwenLayoutAPIConfig

Bases: BaseModel

API backend configuration for Qwen layout detection.

Uses litellm for provider-agnostic API access. Supports OpenRouter, Gemini, Azure, OpenAI, and any other litellm-compatible provider.

API keys can be passed directly or read from environment variables.

Example
# OpenRouter (reads OPENROUTER_API_KEY from env)
config = QwenLayoutAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
)

# With explicit key
config = QwenLayoutAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
    api_key=os.environ["OPENROUTER_API_KEY"],
    api_base="https://openrouter.ai/api/v1",
)

QwenLayoutDetector

QwenLayoutDetector(backend: QwenLayoutBackendConfig)

Bases: BaseLayoutExtractor

Qwen3-VL Vision-Language Model layout detector.

A flexible VLM-based layout detector that supports custom labels. Unlike fixed-label models (DocLayoutYOLO, RT-DETR), Qwen can detect any document elements specified at runtime.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.layout_extraction import QwenLayoutDetector, CustomLabel
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

# Initialize with PyTorch backend
detector = QwenLayoutDetector(
        backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )

# Basic extraction with default labels
result = detector.extract(image)

# With custom labels (strings)
result = detector.extract(image, custom_labels=["code_block", "sidebar"])

# With typed custom labels
labels = [
        CustomLabel(name="code_block", color="#E74C3C"),
        CustomLabel(name="sidebar", description="Side panel content"),
    ]
result = detector.extract(image, custom_labels=labels)

Initialize Qwen layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration. One of:
  • QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
  • QwenLayoutVLLMConfig: VLLM high-throughput backend
  • QwenLayoutMLXConfig: MLX backend for Apple Silicon
  • QwenLayoutAPIConfig: API backend (OpenRouter, etc.)

TYPE: QwenLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def __init__(self, backend: QwenLayoutBackendConfig):
    """
    Initialize Qwen layout detector.

    Args:
        backend: Backend configuration. One of:
            - QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
            - QwenLayoutVLLMConfig: VLLM high-throughput backend
            - QwenLayoutMLXConfig: MLX backend for Apple Silicon
            - QwenLayoutAPIConfig: API backend (OpenRouter, etc.)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image as:
  • PIL.Image.Image: PIL image object
  • np.ndarray: Numpy array (HWC format, RGB)
  • str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be:
  • None: Use default labels (title, text, table, figure, etc.)
  • List[str]: Simple label names ["code_block", "sidebar"]
  • List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format is not supported

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata

    Returns:
        LayoutOutput with detected layout boxes

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build prompt
    prompt = self._build_detection_prompt(label_names)

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "QwenLayoutPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image, prompt)
    elif config_type == "QwenLayoutVLLMConfig":
        raw_output = self._infer_vllm(pil_image, prompt)
    elif config_type == "QwenLayoutMLXConfig":
        raw_output = self._infer_mlx(pil_image, prompt)
    elif config_type == "QwenLayoutAPIConfig":
        raw_output = self._infer_api(pil_image, prompt)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Parse detections
    detections = self._parse_json_output(raw_output)

    # Convert to LayoutOutput
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by position (reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"Qwen3-VL ({type(self.backend_config).__name__})",
    )

QwenLayoutMLXConfig

Bases: BaseModel

MLX backend configuration for Qwen layout detection.

This backend uses MLX for native inference on Apple Silicon. Best for local development and testing on M1/M2/M3+ Macs. Requires: mlx, mlx-vlm

Note: This backend only works on Apple Silicon Macs. Do NOT use for Modal/cloud deployments.

Example
config = QwenLayoutMLXConfig(
        model="mlx-community/Qwen3-VL-8B-Instruct-4bit",
    )

QwenLayoutPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend configuration for Qwen layout detection.

This backend uses the transformers library with PyTorch for local GPU inference. Requires: torch, transformers, accelerate, qwen-vl-utils

Example
config = QwenLayoutPyTorchConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        device="cuda",
        torch_dtype="bfloat16",
    )

QwenLayoutVLLMConfig

Bases: BaseModel

VLLM backend configuration for Qwen layout detection.

This backend uses VLLM for high-throughput inference. Best for batch processing and production deployments. Requires: vllm, torch, transformers, qwen-vl-utils

Example
config = QwenLayoutVLLMConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
    )

api

API backend configuration for Qwen3-VL layout detection.

Uses litellm for provider-agnostic inference (OpenRouter, Gemini, Azure, etc.).

detector

Qwen3-VL layout detector.

A Vision-Language Model for flexible layout detection with custom label support. Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.layout_extraction import QwenLayoutDetector
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

detector = QwenLayoutDetector(
        backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )
result = detector.extract(image)

# With custom labels
result = detector.extract(image, custom_labels=["code_block", "sidebar"])

mlx

MLX backend configuration for Qwen3-VL layout detection.

pytorch

PyTorch/HuggingFace backend configuration for Qwen3-VL layout detection.

vllm

VLLM backend configuration for Qwen3-VL layout detection.

rtdetr

RT-DETR layout extractor.

A transformer-based real-time detection model for document layout detection. Uses HuggingFace Transformers implementation.

Model: HuggingPanda/docling-layout

RTDETRConfig

Bases: BaseModel

Configuration for RT-DETR layout extractor.

This is a single-backend model (PyTorch/Transformers only).

Example
config = RTDETRConfig(device="cuda", confidence=0.4)
extractor = RTDETRLayoutExtractor(config=config)

RTDETRLayoutExtractor

RTDETRLayoutExtractor(config: RTDETRConfig)

Bases: BaseLayoutExtractor

RT-DETR layout extractor using HuggingFace Transformers.

A transformer-based real-time detection model for document layout. Detects titles, text, tables, figures, lists, formulas, captions, headers, and footers.

This is a single-backend model (PyTorch/Transformers only).

Example
from omnidocs.tasks.layout_extraction import RTDETRLayoutExtractor, RTDETRConfig

extractor = RTDETRLayoutExtractor(config=RTDETRConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
        print(f"{box.label.value}: {box.confidence:.2f}")

Initialize RT-DETR layout extractor.

PARAMETER DESCRIPTION
config

Configuration object with device, model settings, etc.

TYPE: RTDETRConfig

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def __init__(self, config: RTDETRConfig):
    """
    Initialize RT-DETR layout extractor.

    Args:
        config: Configuration object with device, model settings, etc.
    """
    self.config = config
    self._model = None
    self._processor = None
    self._device = self._resolve_device(config.device)
    self._model_path = self._resolve_model_path(config.model_path)

    # Load model
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        LayoutOutput with detected layout boxes
    """
    import torch

    if self._model is None or self._processor is None:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    img_width, img_height = pil_image.size

    # Preprocess
    inputs = self._processor(
        images=pil_image,
        return_tensors="pt",
        size={"height": self.config.image_size, "width": self.config.image_size},
    )

    # Move to device
    inputs = {k: v.to(self._device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    # Run inference
    with torch.no_grad():
        outputs = self._model(**inputs)

    # Post-process results
    target_sizes = torch.tensor([[img_height, img_width]])
    results = self._processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.config.confidence,
    )[0]

    # Parse detections
    layout_boxes = []

    for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
        confidence = float(score.item())
        class_id = int(label_id.item())

        # Get original label from model config
        # Note: The model outputs 0-indexed class IDs, but id2label has background at index 0,
        # so we add 1 to map correctly (e.g., model output 8 -> id2label[9] = "Table")
        original_label = self._model.config.id2label.get(class_id + 1, f"class_{class_id}")

        # Map to standardized label
        standard_label = RTDETR_MAPPING.to_standard(original_label)

        # Box coordinates
        box_coords = box.cpu().tolist()

        layout_boxes.append(
            LayoutBox(
                label=standard_label,
                bbox=BoundingBox.from_list(box_coords),
                confidence=confidence,
                class_id=class_id,
                original_label=original_label,
            )
        )

    # Sort by y-coordinate (top to bottom reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=img_width,
        image_height=img_height,
        model_name="RT-DETR (docling-layout)",
    )

vlm

VLM layout detector.

A provider-agnostic Vision-Language Model layout detector using litellm. Works with any cloud API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)
result = detector.extract("document.png")

for box in result.bboxes:
    print(f"{box.label.value}: {box.bbox}")

VLMLayoutDetector

VLMLayoutDetector(config: VLMAPIConfig)

Bases: BaseLayoutExtractor

Provider-agnostic VLM layout detector using litellm.

Works with any cloud VLM API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc. Supports custom labels for flexible detection.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)

# Default labels
result = detector.extract("document.png")

# Custom labels
result = detector.extract("document.png", custom_labels=["code_block", "sidebar"])

Initialize VLM layout detector.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/layout_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM layout detector.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True
extract
extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be:
  • None: Use default labels (title, text, table, figure, etc.)
  • List[str]: Simple label names ["code_block", "sidebar"]
  • List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

prompt

Custom prompt. If None, builds a default detection prompt.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes.

Source code in omnidocs/tasks/layout_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata
        prompt: Custom prompt. If None, builds a default detection prompt.

    Returns:
        LayoutOutput with detected layout boxes.
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build or use custom prompt
    final_prompt = prompt or _build_layout_prompt(label_names)

    raw_output = vlm_completion(self.config, final_prompt, pil_image)
    detections = _parse_layout_response(raw_output, (width, height))
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by reading order
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )

ocr_extraction

OCR Extraction Module.

Provides extractors for detecting text with bounding boxes from document images. Returns text content along with spatial coordinates (unlike Text Extraction which returns formatted Markdown/HTML without coordinates).

Available Extractors
  • TesseractOCR: Open-source OCR (CPU, requires system Tesseract)
  • EasyOCR: PyTorch-based OCR (CPU/GPU, 80+ languages)
  • PaddleOCR: PaddlePaddle-based OCR (CPU/GPU, excellent CJK support)
Key Difference from Text Extraction
  • OCR Extraction: Text + Bounding Boxes (spatial location)
  • Text Extraction: Markdown/HTML (formatted document export)
Example
from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

ocr = TesseractOCR(config=TesseractOCRConfig(languages=["eng"]))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()} (conf: {block.confidence:.2f})")
# With EasyOCR
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

ocr = EasyOCR(config=EasyOCRConfig(languages=["en", "ch_sim"], gpu=True))
result = ocr.extract(image)
# With PaddleOCR
from omnidocs.tasks.ocr_extraction import PaddleOCR, PaddleOCRConfig

ocr = PaddleOCR(config=PaddleOCRConfig(lang="ch", device="cpu"))
result = ocr.extract(image)

BaseOCRExtractor

Bases: ABC

Abstract base class for OCR extractors.

All OCR extraction models must inherit from this class and implement the required methods.

Example
class MyOCRExtractor(BaseOCRExtractor):
        def __init__(self, config: MyConfig):
            self.config = config
            self._load_model()

        def _load_model(self):
            # Initialize OCR engine
            pass

        def extract(self, image):
            # Run OCR extraction
            return OCROutput(...)

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR extraction on an image.

PARAMETER DESCRIPTION
image

Input image as:
  • PIL.Image.Image: PIL image object
  • np.ndarray: Numpy array (HWC format, RGB)
  • str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput containing detected text blocks with bounding boxes

RAISES DESCRIPTION
ValueError

If image format is not supported

RuntimeError

If OCR engine is not initialized or extraction fails

Source code in omnidocs/tasks/ocr_extraction/base.py
@abstractmethod
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR extraction on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file

    Returns:
        OCROutput containing detected text blocks with bounding boxes

    Raises:
        ValueError: If image format is not supported
        RuntimeError: If OCR engine is not initialized or extraction fails
    """
    pass

batch_extract

batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[OCROutput]

Run OCR extraction on multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching.

PARAMETER DESCRIPTION
images

List of images in any supported format

TYPE: List[Union[Image, ndarray, str, Path]]

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[OCROutput]

List of OCROutput in same order as input

Examples:

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images)
Source code in omnidocs/tasks/ocr_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[OCROutput]:
    """
    Run OCR extraction on multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching.

    Args:
        images: List of images in any supported format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of OCROutput in same order as input

    Examples:
        ```python
        images = [doc.get_page(i) for i in range(doc.page_count)]
        results = extractor.batch_extract(images)
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(image)
        results.append(result)

    return results
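
A minimal progress callback only needs to accept the 1-based index and the total count; as a sketch, reusing doc and extractor from the example above:

def on_progress(current: int, total: int) -> None:
    # Called once per image, just before it is processed
    print(f"OCR {current}/{total}")

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images, progress_callback=on_progress)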

extract_document

extract_document(
    document: Document,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[OCROutput]

Run OCR extraction on all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[OCROutput]

List of OCROutput, one per page

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
Source code in omnidocs/tasks/ocr_extraction/base.py
def extract_document(
    self,
    document: "Document",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[OCROutput]:
    """
    Run OCR extraction on all pages of a document.

    Args:
        document: Document instance
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of OCROutput, one per page

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc)
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(page)
        results.append(result)

    return results
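
Since the returned list follows page order, results can be paired back with page numbers directly; a small sketch reusing the Document example above:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
for page_num, page_result in enumerate(results, start=1):
    print(f"page {page_num}: {page_result.block_count} blocks")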

EasyOCR

EasyOCR(config: EasyOCRConfig)

Bases: BaseOCRExtractor

EasyOCR text extractor.

Single-backend model (PyTorch - CPU/GPU).

Example
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

ocr = EasyOCR(config=EasyOCRConfig(languages=["en"], gpu=True))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize EasyOCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: EasyOCRConfig

RAISES DESCRIPTION
ImportError

If easyocr is not installed

Source code in omnidocs/tasks/ocr_extraction/easyocr.py
def __init__(self, config: EasyOCRConfig):
    """
    Initialize EasyOCR extractor.

    Args:
        config: Configuration object

    Raises:
        ImportError: If easyocr is not installed
    """
    self.config = config
    self._reader = None
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    detail: int = 1,
    paragraph: bool = False,
    min_size: int = 10,
    text_threshold: float = 0.7,
    low_text: float = 0.4,
    link_threshold: float = 0.4,
    canvas_size: int = 2560,
    mag_ratio: float = 1.0,
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

detail

0 = simple output, 1 = detailed with boxes

TYPE: int DEFAULT: 1

paragraph

Combine results into paragraphs

TYPE: bool DEFAULT: False

min_size

Minimum text box size

TYPE: int DEFAULT: 10

text_threshold

Text confidence threshold

TYPE: float DEFAULT: 0.7

low_text

Low text bound

TYPE: float DEFAULT: 0.4

link_threshold

Link threshold for text joining

TYPE: float DEFAULT: 0.4

canvas_size

Max image dimension for processing

TYPE: int DEFAULT: 2560

mag_ratio

Magnification ratio

TYPE: float DEFAULT: 1.0

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks

RAISES DESCRIPTION
ValueError

If detail is not 0 or 1

RuntimeError

If EasyOCR is not initialized

Source code in omnidocs/tasks/ocr_extraction/easyocr.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    detail: int = 1,
    paragraph: bool = False,
    min_size: int = 10,
    text_threshold: float = 0.7,
    low_text: float = 0.4,
    link_threshold: float = 0.4,
    canvas_size: int = 2560,
    mag_ratio: float = 1.0,
) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)
        detail: 0 = simple output, 1 = detailed with boxes
        paragraph: Combine results into paragraphs
        min_size: Minimum text box size
        text_threshold: Text confidence threshold
        low_text: Low text bound
        link_threshold: Link threshold for text joining
        canvas_size: Max image dimension for processing
        mag_ratio: Magnification ratio

    Returns:
        OCROutput with detected text blocks

    Raises:
        ValueError: If detail is not 0 or 1
        RuntimeError: If EasyOCR is not initialized
    """
    if self._reader is None:
        raise RuntimeError("EasyOCR not initialized. Call _load_model() first.")

    # Validate detail parameter
    if detail not in (0, 1):
        raise ValueError(f"detail must be 0 or 1, got {detail}")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Convert to numpy array for EasyOCR
    image_array = np.array(pil_image)

    # Run EasyOCR
    results = self._reader.readtext(
        image_array,
        detail=detail,
        paragraph=paragraph,
        min_size=min_size,
        text_threshold=text_threshold,
        low_text=low_text,
        link_threshold=link_threshold,
        canvas_size=canvas_size,
        mag_ratio=mag_ratio,
    )

    # Parse results
    text_blocks = []
    full_text_parts = []

    for result in results:
        if detail == 0:
            # Simple output: just text
            text = result
            confidence = 1.0
            bbox = BoundingBox(x1=0, y1=0, x2=0, y2=0)
            polygon = None
        else:
            # Detailed output: [polygon, text, confidence]
            polygon_points, text, confidence = result

            # EasyOCR returns 4 corner points: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
            # Convert to list of lists for storage
            polygon = [list(p) for p in polygon_points]

            # Convert to axis-aligned bounding box
            bbox = BoundingBox.from_polygon(polygon)

        if not text.strip():
            continue

        text_blocks.append(
            TextBlock(
                text=text,
                bbox=bbox,
                confidence=float(confidence),
                granularity=(OCRGranularity.LINE if paragraph else OCRGranularity.WORD),
                polygon=polygon,
                language="+".join(self.config.languages),
            )
        )

        full_text_parts.append(text)

    # Sort by position
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return OCROutput(
        text_blocks=text_blocks,
        full_text=" ".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )
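
The detection parameters above are forwarded to EasyOCR's readtext(), so they can be tuned per call. A sketch for dense, small print, reusing ocr and image from the example above (the specific values are illustrative, not recommendations):

result = ocr.extract(
    image,
    paragraph=True,       # merge nearby detections into line-level blocks
    text_threshold=0.6,   # accept slightly lower-scoring text regions
    canvas_size=3200,     # allow a larger processing canvas for big pages
)
print(result.full_text)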

extract_batch

extract_batch(
    images: List[Union[Image, ndarray, str, Path]], **kwargs
) -> List[OCROutput]

Run OCR on multiple images.

PARAMETER DESCRIPTION
images

List of input images

TYPE: List[Union[Image, ndarray, str, Path]]

**kwargs

Arguments passed to extract()

DEFAULT: {}

RETURNS DESCRIPTION
List[OCROutput]

List of OCROutput objects

Source code in omnidocs/tasks/ocr_extraction/easyocr.py
def extract_batch(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    **kwargs,
) -> List[OCROutput]:
    """
    Run OCR on multiple images.

    Args:
        images: List of input images
        **kwargs: Arguments passed to extract()

    Returns:
        List of OCROutput objects
    """
    results = []
    for img in images:
        results.append(self.extract(img, **kwargs))
    return results
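
Because **kwargs is forwarded to extract(), the same per-call options apply to every image in the batch; for example (with page_images being any list of supported inputs):

results = ocr.extract_batch(page_images, paragraph=True, text_threshold=0.6)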

EasyOCRConfig

Bases: BaseModel

Configuration for EasyOCR extractor.

This is a single-backend model (PyTorch - CPU/GPU).

Example
config = EasyOCRConfig(languages=["en", "ch_sim"], gpu=True)
ocr = EasyOCR(config=config)

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right. For rotated text, use the polygon field in TextBlock instead.

Example
bbox = BoundingBox(x1=100, y1=50, x2=300, y2=80)
print(bbox.width, bbox.height)  # 200, 30
print(bbox.center)  # (200.0, 65.0)

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)

to_xywh

to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])

from_polygon classmethod

from_polygon(polygon: List[List[float]]) -> BoundingBox

Create axis-aligned bounding box from polygon points.

PARAMETER DESCRIPTION
polygon

List of [x, y] points (usually 4 for quadrilateral)

TYPE: List[List[float]]

RETURNS DESCRIPTION
BoundingBox

BoundingBox that encloses all polygon points

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_polygon(cls, polygon: List[List[float]]) -> "BoundingBox":
    """
    Create axis-aligned bounding box from polygon points.

    Args:
        polygon: List of [x, y] points (usually 4 for quadrilateral)

    Returns:
        BoundingBox that encloses all polygon points
    """
    if not polygon:
        raise ValueError("Polygon cannot be empty")

    xs = [p[0] for p in polygon]
    ys = [p[1] for p in polygon]
    return cls(x1=min(xs), y1=min(ys), x2=max(xs), y2=max(ys))

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

to_absolute

to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )
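
To make the scaling concrete, here is a quick worked round trip (numbers chosen purely for illustration, on a 2048x1024 image):

from omnidocs.tasks.ocr_extraction.models import BoundingBox

bbox = BoundingBox(x1=100, y1=50, x2=300, y2=80)
norm = bbox.to_normalized(2048, 1024)   # x scaled by 1024/2048, y by 1024/1024
print(norm.to_list())                   # [50.0, 50.0, 150.0, 80.0]
back = norm.to_absolute(2048, 1024)
print(back.to_list())                   # [100.0, 50.0, 300.0, 80.0]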

OCRGranularity

Bases: str, Enum

OCR detection granularity levels.

Different OCR engines return results at different granularity levels. This enum standardizes the options across all extractors.

OCROutput

Bases: BaseModel

Complete OCR extraction results for a single image.

Contains all detected text blocks with their bounding boxes, plus metadata about the extraction.

Example
result = ocr.extract(image)
print(f"Found {result.block_count} blocks")
print(f"Full text: {result.full_text}")
for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

block_count property

block_count: int

Number of detected text blocks.

word_count property

word_count: int

Approximate word count from full text.

average_confidence property

average_confidence: float

Average confidence across all text blocks.

filter_by_confidence

filter_by_confidence(
    min_confidence: float,
) -> List[TextBlock]

Filter text blocks by minimum confidence.

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[TextBlock]:
    """Filter text blocks by minimum confidence."""
    return [b for b in self.text_blocks if b.confidence >= min_confidence]

filter_by_granularity

filter_by_granularity(
    granularity: OCRGranularity,
) -> List[TextBlock]

Filter text blocks by granularity level.

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_granularity(self, granularity: OCRGranularity) -> List[TextBlock]:
    """Filter text blocks by granularity level."""
    return [b for b in self.text_blocks if b.granularity == granularity]
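
The two filters compose naturally; for example, to keep only confident line-level blocks (LINE and WORD are the granularities produced by the bundled extractors, and the 0.8 threshold is illustrative):

from omnidocs.tasks.ocr_extraction.models import OCRGranularity

lines = result.filter_by_granularity(OCRGranularity.LINE)
confident_lines = [b for b in lines if b.confidence >= 0.8]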

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text_blocks": [b.to_dict() for b in self.text_blocks],
        "full_text": self.full_text,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "languages_detected": self.languages_detected,
        "block_count": self.block_count,
        "word_count": self.word_count,
        "average_confidence": self.average_confidence,
    }

sort_by_position

sort_by_position(top_to_bottom: bool = True) -> OCROutput

Return a new OCROutput with blocks sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
OCROutput

New OCROutput with sorted text blocks

Source code in omnidocs/tasks/ocr_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "OCROutput":
    """
    Return a new OCROutput with blocks sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)

    Returns:
        New OCROutput with sorted text blocks
    """
    sorted_blocks = sorted(
        self.text_blocks,
        key=lambda b: (b.bbox.y1, b.bbox.x1),
        reverse=not top_to_bottom,
    )
    # Regenerate full_text in sorted order
    full_text = " ".join(b.text for b in sorted_blocks)

    return OCROutput(
        text_blocks=sorted_blocks,
        full_text=full_text,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
        languages_detected=self.languages_detected,
    )

get_normalized_blocks

get_normalized_blocks() -> List[Dict]

Get all text blocks with normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_blocks(self) -> List[Dict]:
    """
    Get all text blocks with normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.
    """
    normalized = []
    for block in self.text_blocks:
        norm_bbox = block.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "text": block.text,
                "bbox": norm_bbox.to_list(),
                "confidence": block.confidence,
                "granularity": block.granularity.value,
                "language": block.language,
            }
        )
    return normalized

visualize

visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> Image.Image

Visualize OCR results on the image.

Draws bounding boxes around detected text with optional labels.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_text

Whether to show detected text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: False

line_width

Width of bounding box lines

TYPE: int DEFAULT: 2

box_color

Color for bounding boxes (hex)

TYPE: str DEFAULT: '#2ECC71'

text_color

Color for text labels (hex)

TYPE: str DEFAULT: '#000000'

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = ocr.extract(image)
viz = result.visualize(image, output_path="ocr_viz.png")
Source code in omnidocs/tasks/ocr_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> "Image.Image":
    """
    Visualize OCR results on the image.

    Draws bounding boxes around detected text with optional labels.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_text: Whether to show detected text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        box_color: Color for bounding boxes (hex)
        text_color: Color for text labels (hex)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = ocr.extract(image)
        viz = result.visualize(image, output_path="ocr_viz.png")
        ```
    """
    from PIL import ImageDraw, ImageFont

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    # Try to get a font
    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 12)
    except Exception:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        except Exception:
            font = ImageFont.load_default()

    for block in self.text_blocks:
        coords = block.bbox.to_xyxy()

        # Draw polygon if available, otherwise draw rectangle
        if block.polygon:
            flat_polygon = [coord for point in block.polygon for coord in point]
            draw.polygon(flat_polygon, outline=box_color, width=line_width)
        else:
            draw.rectangle(coords, outline=box_color, width=line_width)

        # Build label text
        if show_text or show_confidence:
            label_parts = []
            if show_text:
                # Truncate long text
                text = block.text[:25] + "..." if len(block.text) > 25 else block.text
                label_parts.append(text)
            if show_confidence:
                label_parts.append(f"{block.confidence:.2f}")
            label_text = " | ".join(label_parts)

            # Position label below the box
            label_x = coords[0]
            label_y = coords[3] + 2  # Below bottom edge

            # Draw label with background
            text_bbox = draw.textbbox((label_x, label_y), label_text, font=font)
            padding = 2
            draw.rectangle(
                [
                    text_bbox[0] - padding,
                    text_bbox[1] - padding,
                    text_bbox[2] + padding,
                    text_bbox[3] + padding,
                ],
                fill="#FFFFFF",
                outline=box_color,
            )
            draw.text((label_x, label_y), label_text, fill=text_color, font=font)

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image

load_json classmethod

load_json(file_path: Union[str, Path]) -> OCROutput

Load an OCROutput instance from a JSON file.

PARAMETER DESCRIPTION
file_path

Path to JSON file

TYPE: Union[str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput instance

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "OCROutput":
    """
    Load an OCROutput instance from a JSON file.

    Args:
        file_path: Path to JSON file

    Returns:
        OCROutput instance
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

save_json

save_json(file_path: Union[str, Path]) -> None

Save OCROutput instance to a JSON file.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved

TYPE: Union[str, Path]

Source code in omnidocs/tasks/ocr_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save OCROutput instance to a JSON file.

    Args:
        file_path: Path where JSON file should be saved
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
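
A simple save/load round trip, as a sketch (the output path is illustrative):

from omnidocs.tasks.ocr_extraction.models import OCROutput

result = ocr.extract(image)
result.save_json("output/page_001.json")
restored = OCROutput.load_json("output/page_001.json")
assert restored.block_count == result.block_count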

TextBlock

Bases: BaseModel

Single detected text element with text, bounding box, and confidence.

This is the fundamental unit of OCR output - can represent a character, word, line, or block depending on the OCR model and configuration.

Example
block = TextBlock(
        text="Hello",
        bbox=BoundingBox(x1=100, y1=50, x2=200, y2=80),
        confidence=0.95,
        granularity=OCRGranularity.WORD,
    )

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text": self.text,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "granularity": self.granularity.value,
        "polygon": self.polygon,
        "language": self.language,
    }

get_normalized_bbox

get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

PaddleOCR

PaddleOCR(config: PaddleOCRConfig)

Bases: BaseOCRExtractor

PaddleOCR text extractor.

Single-backend model (PaddlePaddle - CPU/GPU).

Example
from omnidocs.tasks.ocr_extraction import PaddleOCR, PaddleOCRConfig

ocr = PaddleOCR(config=PaddleOCRConfig(lang="en", device="cpu"))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize PaddleOCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: PaddleOCRConfig

RAISES DESCRIPTION
ImportError

If paddleocr or paddlepaddle is not installed

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def __init__(self, config: PaddleOCRConfig):
    """
    Initialize PaddleOCR extractor.

    Args:
        config: Configuration object

    Raises:
        ImportError: If paddleocr or paddlepaddle is not installed
    """
    self.config = config
    self._ocr = None

    # Normalize language code
    self._lang = LANG_CODES.get(config.lang.lower(), config.lang)

    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks
    """
    if self._ocr is None:
        raise RuntimeError("PaddleOCR not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Convert to numpy array
    image_array = np.array(pil_image)

    # Run PaddleOCR v3.x - use predict() method
    results = self._ocr.predict(image_array)

    # Parse results
    text_blocks = []

    # PaddleOCR may return None or empty results
    if results is None or len(results) == 0:
        return OCROutput(
            text_blocks=[],
            full_text="",
            image_width=image_width,
            image_height=image_height,
            model_name=self.MODEL_NAME,
            languages_detected=[self._lang],
        )

    # PaddleOCR v3.x returns list of dicts with 'rec_texts', 'rec_scores', 'dt_polys'
    for result in results:
        if result is None:
            continue

        rec_texts = result.get("rec_texts", [])
        rec_scores = result.get("rec_scores", [])
        dt_polys = result.get("dt_polys", [])

        for i, text in enumerate(rec_texts):
            if not text.strip():
                continue

            confidence = rec_scores[i] if i < len(rec_scores) else 1.0

            # Get polygon and convert to list
            polygon: Optional[List[List[float]]] = None
            if i < len(dt_polys) and dt_polys[i] is not None:
                poly_array = dt_polys[i]
                # Handle numpy array
                if hasattr(poly_array, "tolist"):
                    polygon = poly_array.tolist()
                else:
                    polygon = list(poly_array)

            # Convert polygon to bbox
            if polygon:
                bbox = BoundingBox.from_polygon(polygon)
            else:
                bbox = BoundingBox(x1=0, y1=0, x2=0, y2=0)

            text_blocks.append(
                TextBlock(
                    text=text,
                    bbox=bbox,
                    confidence=float(confidence),
                    granularity=OCRGranularity.LINE,
                    polygon=polygon,
                    language=self._lang,
                )
            )

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    # Build full_text from sorted blocks to ensure reading order
    full_text = " ".join(block.text for block in text_blocks)

    return OCROutput(
        text_blocks=text_blocks,
        full_text=full_text,
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=[self._lang],
    )
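
PaddleOCR blocks come back at line granularity and typically carry the detected quadrilateral, so both the axis-aligned box and the raw polygon are available; reusing ocr and image from the example above:

result = ocr.extract(image)
for block in result.text_blocks:
    if block.polygon:
        print(f"'{block.text}' polygon={block.polygon}")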

PaddleOCRConfig

Bases: BaseModel

Configuration for PaddleOCR extractor.

This is a single-backend model (PaddlePaddle - CPU/GPU).

Example
config = PaddleOCRConfig(lang="ch", device="gpu")
ocr = PaddleOCR(config=config)

TesseractOCR

TesseractOCR(config: TesseractOCRConfig)

Bases: BaseOCRExtractor

Tesseract OCR extractor.

Single-backend model (CPU only). Requires system Tesseract installation.

Example
from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

ocr = TesseractOCR(config=TesseractOCRConfig(languages=["eng"]))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize Tesseract OCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: TesseractOCRConfig

RAISES DESCRIPTION
RuntimeError

If Tesseract is not installed

ImportError

If pytesseract is not installed

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def __init__(self, config: TesseractOCRConfig):
    """
    Initialize Tesseract OCR extractor.

    Args:
        config: Configuration object

    Raises:
        RuntimeError: If Tesseract is not installed
        ImportError: If pytesseract is not installed
    """
    self.config = config
    self._pytesseract = None
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks at word level

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks at word level
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data (word-level boxes)
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Parse results into TextBlocks
    text_blocks = []
    full_text_parts = []

    n_boxes = len(data["text"])
    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        # Skip empty text or low confidence (-1 means no confidence)
        if not text or conf == -1:
            continue

        # Tesseract returns confidence as 0-100, normalize to 0-1
        confidence = conf / 100.0

        # Get bounding box
        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        bbox = BoundingBox(
            x1=float(x),
            y1=float(y),
            x2=float(x + w),
            y2=float(y + h),
        )

        text_blocks.append(
            TextBlock(
                text=text,
                bbox=bbox,
                confidence=confidence,
                granularity=OCRGranularity.WORD,
                language=lang_str,
            )
        )

        full_text_parts.append(text)

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return OCROutput(
        text_blocks=text_blocks,
        full_text=" ".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )

extract_lines

extract_lines(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR and return line-level blocks.

Groups words into lines based on Tesseract's line detection.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with line-level text blocks

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract_lines(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR and return line-level blocks.

    Groups words into lines based on Tesseract's line detection.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with line-level text blocks
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string (including config_params like extract method)
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Group words into lines
    lines: Dict[tuple, Dict] = {}
    n_boxes = len(data["text"])

    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        if not text or conf == -1:
            continue

        # Tesseract provides block_num, par_num, line_num
        line_key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])

        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        if line_key not in lines:
            lines[line_key] = {
                "words": [],
                "confidences": [],
                "x1": x,
                "y1": y,
                "x2": x + w,
                "y2": y + h,
            }

        lines[line_key]["words"].append(text)
        lines[line_key]["confidences"].append(conf / 100.0)
        lines[line_key]["x1"] = min(lines[line_key]["x1"], x)
        lines[line_key]["y1"] = min(lines[line_key]["y1"], y)
        lines[line_key]["x2"] = max(lines[line_key]["x2"], x + w)
        lines[line_key]["y2"] = max(lines[line_key]["y2"], y + h)

    # Convert to TextBlocks
    text_blocks = []
    full_text_parts = []

    for line_key in sorted(lines.keys()):
        line = lines[line_key]
        line_text = " ".join(line["words"])
        avg_conf = sum(line["confidences"]) / len(line["confidences"])

        bbox = BoundingBox(
            x1=float(line["x1"]),
            y1=float(line["y1"]),
            x2=float(line["x2"]),
            y2=float(line["y2"]),
        )

        text_blocks.append(
            TextBlock(
                text=line_text,
                bbox=bbox,
                confidence=avg_conf,
                granularity=OCRGranularity.LINE,
                language=lang_str,
            )
        )

        full_text_parts.append(line_text)

    return OCROutput(
        text_blocks=text_blocks,
        full_text="\n".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )
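
In practice the two entry points differ only in grouping: extract() yields word-level blocks with full_text joined by spaces, while extract_lines() yields line-level blocks with full_text joined by newlines. Reusing ocr and image from the example above:

word_result = ocr.extract(image)
line_result = ocr.extract_lines(image)
print(len(word_result.text_blocks), len(line_result.text_blocks))
print(line_result.full_text)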

TesseractOCRConfig

Bases: BaseModel

Configuration for Tesseract OCR extractor.

This is a single-backend model (CPU only, requires system Tesseract).

Example
config = TesseractOCRConfig(languages=["eng", "fra"], psm=3)
ocr = TesseractOCR(config=config)
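
Entries in config_params are passed straight through to Tesseract as -c key=value flags, so engine variables can be set alongside psm/oem. An illustrative sketch restricting recognition to digits and dashes:

from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

config = TesseractOCRConfig(
    languages=["eng"],
    psm=6,  # assume a single uniform block of text
    config_params={"tessedit_char_whitelist": "0123456789-"},
)
ocr = TesseractOCR(config=config)
result = ocr.extract(image)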

base

Base class for OCR extractors.

Defines the abstract interface that all OCR extractors must implement.

easyocr

EasyOCR extractor.

EasyOCR is a PyTorch-based OCR engine with excellent multi-language support.
  • GPU accelerated (optional)
  • Supports 80+ languages
  • Good for scene text and printed documents

Python Package

pip install easyocr

Model Download Location

By default, EasyOCR downloads models to ~/.EasyOCR/. This location can be overridden with the model_storage_directory parameter.

models

Pydantic models for OCR extraction outputs.

Defines standardized output types for OCR detection including text blocks with bounding boxes, confidence scores, and granularity levels.

Key difference from Text Extraction:
  • OCR returns text WITH bounding boxes (word/line/character level)
  • Text Extraction returns formatted text (MD/HTML) WITHOUT bboxes

Coordinate Systems
  • Absolute (default): Coordinates in pixels relative to original image size
  • Normalized (0-1024): Coordinates scaled to 0-1024 range (virtual 1024x1024 canvas)

Use bbox.to_normalized(width, height) or output.get_normalized_blocks() to convert to normalized coordinates.

Example
result = ocr.extract(image)  # Returns absolute pixel coordinates
normalized = result.get_normalized_blocks()  # Returns 0-1024 normalized coords

OCRGranularity

Bases: str, Enum

OCR detection granularity levels.

Different OCR engines return results at different granularity levels. This enum standardizes the options across all extractors.

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right. For rotated text, use the polygon field in TextBlock instead.

Example
bbox = BoundingBox(x1=100, y1=50, x2=300, y2=80)
print(bbox.width, bbox.height)  # 200, 30
print(bbox.center)  # (200.0, 65.0)
width property
width: float

Width of the bounding box.

height property
height: float

Height of the bounding box.

area property
area: float

Area of the bounding box.

center property
center: Tuple[float, float]

Center point of the bounding box.

to_list
to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]
to_xyxy
to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)
to_xywh
to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)
from_list classmethod
from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])
from_polygon classmethod
from_polygon(polygon: List[List[float]]) -> BoundingBox

Create axis-aligned bounding box from polygon points.

PARAMETER DESCRIPTION
polygon

List of [x, y] points (usually 4 for quadrilateral)

TYPE: List[List[float]]

RETURNS DESCRIPTION
BoundingBox

BoundingBox that encloses all polygon points

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_polygon(cls, polygon: List[List[float]]) -> "BoundingBox":
    """
    Create axis-aligned bounding box from polygon points.

    Args:
        polygon: List of [x, y] points (usually 4 for quadrilateral)

    Returns:
        BoundingBox that encloses all polygon points
    """
    if not polygon:
        raise ValueError("Polygon cannot be empty")

    xs = [p[0] for p in polygon]
    ys = [p[1] for p in polygon]
    return cls(x1=min(xs), y1=min(ys), x2=max(xs), y2=max(ys))
to_normalized
to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )
to_absolute
to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )

TextBlock

Bases: BaseModel

Single detected text element with text, bounding box, and confidence.

This is the fundamental unit of OCR output - can represent a character, word, line, or block depending on the OCR model and configuration.

Example
block = TextBlock(
        text="Hello",
        bbox=BoundingBox(x1=100, y1=50, x2=200, y2=80),
        confidence=0.95,
        granularity=OCRGranularity.WORD,
    )
to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text": self.text,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "granularity": self.granularity.value,
        "polygon": self.polygon,
        "language": self.language,
    }
get_normalized_bbox
get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

OCROutput

Bases: BaseModel

Complete OCR extraction results for a single image.

Contains all detected text blocks with their bounding boxes, plus metadata about the extraction.

Example
result = ocr.extract(image)
print(f"Found {result.block_count} blocks")
print(f"Full text: {result.full_text}")
for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")
block_count property
block_count: int

Number of detected text blocks.

word_count property
word_count: int

Approximate word count from full text.

average_confidence property
average_confidence: float

Average confidence across all text blocks.

filter_by_confidence
filter_by_confidence(
    min_confidence: float,
) -> List[TextBlock]

Filter text blocks by minimum confidence.

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[TextBlock]:
    """Filter text blocks by minimum confidence."""
    return [b for b in self.text_blocks if b.confidence >= min_confidence]
filter_by_granularity
filter_by_granularity(
    granularity: OCRGranularity,
) -> List[TextBlock]

Filter text blocks by granularity level.

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_granularity(self, granularity: OCRGranularity) -> List[TextBlock]:
    """Filter text blocks by granularity level."""
    return [b for b in self.text_blocks if b.granularity == granularity]
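A minimal filtering sketch, assuming result is an OCROutput from a prior extract() call and that OCRGranularity lives in the same models module:

from omnidocs.tasks.ocr_extraction.models import OCRGranularity

high_conf = result.filter_by_confidence(0.8)               # blocks with confidence >= 0.8
words = result.filter_by_granularity(OCRGranularity.WORD)  # word-level blocks only
print(f"{len(high_conf)} high-confidence blocks, {len(words)} word-level blocks")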
to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text_blocks": [b.to_dict() for b in self.text_blocks],
        "full_text": self.full_text,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "languages_detected": self.languages_detected,
        "block_count": self.block_count,
        "word_count": self.word_count,
        "average_confidence": self.average_confidence,
    }
sort_by_position
sort_by_position(top_to_bottom: bool = True) -> OCROutput

Return a new OCROutput with blocks sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
OCROutput

New OCROutput with sorted text blocks

Source code in omnidocs/tasks/ocr_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "OCROutput":
    """
    Return a new OCROutput with blocks sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)

    Returns:
        New OCROutput with sorted text blocks
    """
    sorted_blocks = sorted(
        self.text_blocks,
        key=lambda b: (b.bbox.y1, b.bbox.x1),
        reverse=not top_to_bottom,
    )
    # Regenerate full_text in sorted order
    full_text = " ".join(b.text for b in sorted_blocks)

    return OCROutput(
        text_blocks=sorted_blocks,
        full_text=full_text,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
        languages_detected=self.languages_detected,
    )
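Note that sort_by_position returns a new OCROutput rather than mutating in place; a short sketch, again assuming result comes from a prior extract() call:

ordered = result.sort_by_position()                      # top-to-bottom, left-to-right
print(ordered.full_text[:200])                           # full_text is rebuilt in sorted order
bottom_up = result.sort_by_position(top_to_bottom=False)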
get_normalized_blocks
get_normalized_blocks() -> List[Dict]

Get all text blocks with normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_blocks(self) -> List[Dict]:
    """
    Get all text blocks with normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.
    """
    normalized = []
    for block in self.text_blocks:
        norm_bbox = block.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "text": block.text,
                "bbox": norm_bbox.to_list(),
                "confidence": block.confidence,
                "granularity": block.granularity.value,
                "language": block.language,
            }
        )
    return normalized
visualize
visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> Image.Image

Visualize OCR results on the image.

Draws bounding boxes around detected text with optional labels.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_text

Whether to show detected text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: False

line_width

Width of bounding box lines

TYPE: int DEFAULT: 2

box_color

Color for bounding boxes (hex)

TYPE: str DEFAULT: '#2ECC71'

text_color

Color for text labels (hex)

TYPE: str DEFAULT: '#000000'

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = ocr.extract(image)
viz = result.visualize(image, output_path="ocr_viz.png")
Source code in omnidocs/tasks/ocr_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> "Image.Image":
    """
    Visualize OCR results on the image.

    Draws bounding boxes around detected text with optional labels.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_text: Whether to show detected text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        box_color: Color for bounding boxes (hex)
        text_color: Color for text labels (hex)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = ocr.extract(image)
        viz = result.visualize(image, output_path="ocr_viz.png")
        ```
    """
    from PIL import ImageDraw, ImageFont

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    # Try to get a font
    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 12)
    except Exception:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        except Exception:
            font = ImageFont.load_default()

    for block in self.text_blocks:
        coords = block.bbox.to_xyxy()

        # Draw polygon if available, otherwise draw rectangle
        if block.polygon:
            flat_polygon = [coord for point in block.polygon for coord in point]
            draw.polygon(flat_polygon, outline=box_color, width=line_width)
        else:
            draw.rectangle(coords, outline=box_color, width=line_width)

        # Build label text
        if show_text or show_confidence:
            label_parts = []
            if show_text:
                # Truncate long text
                text = block.text[:25] + "..." if len(block.text) > 25 else block.text
                label_parts.append(text)
            if show_confidence:
                label_parts.append(f"{block.confidence:.2f}")
            label_text = " | ".join(label_parts)

            # Position label below the box
            label_x = coords[0]
            label_y = coords[3] + 2  # Below bottom edge

            # Draw label with background
            text_bbox = draw.textbbox((label_x, label_y), label_text, font=font)
            padding = 2
            draw.rectangle(
                [
                    text_bbox[0] - padding,
                    text_bbox[1] - padding,
                    text_bbox[2] + padding,
                    text_bbox[3] + padding,
                ],
                fill="#FFFFFF",
                outline=box_color,
            )
            draw.text((label_x, label_y), label_text, fill=text_color, font=font)

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image
load_json classmethod
load_json(file_path: Union[str, Path]) -> OCROutput

Load an OCROutput instance from a JSON file.

PARAMETER DESCRIPTION
file_path

Path to JSON file

TYPE: Union[str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput instance

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "OCROutput":
    """
    Load an OCROutput instance from a JSON file.

    Args:
        file_path: Path to JSON file

    Returns:
        OCROutput instance
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))
save_json
save_json(file_path: Union[str, Path]) -> None

Save OCROutput instance to a JSON file.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved

TYPE: Union[str, Path]

Source code in omnidocs/tasks/ocr_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save OCROutput instance to a JSON file.

    Args:
        file_path: Path where JSON file should be saved
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
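Persisting OCR results is a simple round trip; a sketch with an illustrative output path:

from omnidocs.tasks.ocr_extraction.models import OCROutput

result.save_json("outputs/page_001.json")             # parent folders are created if missing
loaded = OCROutput.load_json("outputs/page_001.json")
assert loaded.block_count == result.block_count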

paddleocr

PaddleOCR extractor.

PaddleOCR is an OCR toolkit developed by Baidu/PaddlePaddle.
  • Excellent for CJK languages (Chinese, Japanese, Korean)
  • GPU accelerated
  • Supports layout analysis + OCR

Python Package

pip install paddleocr paddlepaddle      # CPU version
pip install paddleocr paddlepaddle-gpu  # GPU version

Model Download Location

By default, PaddleOCR downloads models to ~/.paddleocr/

PaddleOCRConfig

Bases: BaseModel

Configuration for PaddleOCR extractor.

This is a single-backend model (PaddlePaddle - CPU/GPU).

Example
config = PaddleOCRConfig(lang="ch", device="gpu")
ocr = PaddleOCR(config=config)

PaddleOCR

PaddleOCR(config: PaddleOCRConfig)

Bases: BaseOCRExtractor

PaddleOCR text extractor.

Single-backend model (PaddlePaddle - CPU/GPU).

Example
from omnidocs.tasks.ocr_extraction import PaddleOCR, PaddleOCRConfig

ocr = PaddleOCR(config=PaddleOCRConfig(lang="en", device="cpu"))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize PaddleOCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: PaddleOCRConfig

RAISES DESCRIPTION
ImportError

If paddleocr or paddlepaddle is not installed

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def __init__(self, config: PaddleOCRConfig):
    """
    Initialize PaddleOCR extractor.

    Args:
        config: Configuration object

    Raises:
        ImportError: If paddleocr or paddlepaddle is not installed
    """
    self.config = config
    self._ocr = None

    # Normalize language code
    self._lang = LANG_CODES.get(config.lang.lower(), config.lang)

    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks
    """
    if self._ocr is None:
        raise RuntimeError("PaddleOCR not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Convert to numpy array
    image_array = np.array(pil_image)

    # Run PaddleOCR v3.x - use predict() method
    results = self._ocr.predict(image_array)

    # Parse results
    text_blocks = []

    # PaddleOCR may return None or empty results
    if results is None or len(results) == 0:
        return OCROutput(
            text_blocks=[],
            full_text="",
            image_width=image_width,
            image_height=image_height,
            model_name=self.MODEL_NAME,
            languages_detected=[self._lang],
        )

    # PaddleOCR v3.x returns list of dicts with 'rec_texts', 'rec_scores', 'dt_polys'
    for result in results:
        if result is None:
            continue

        rec_texts = result.get("rec_texts", [])
        rec_scores = result.get("rec_scores", [])
        dt_polys = result.get("dt_polys", [])

        for i, text in enumerate(rec_texts):
            if not text.strip():
                continue

            confidence = rec_scores[i] if i < len(rec_scores) else 1.0

            # Get polygon and convert to list
            polygon: Optional[List[List[float]]] = None
            if i < len(dt_polys) and dt_polys[i] is not None:
                poly_array = dt_polys[i]
                # Handle numpy array
                if hasattr(poly_array, "tolist"):
                    polygon = poly_array.tolist()
                else:
                    polygon = list(poly_array)

            # Convert polygon to bbox
            if polygon:
                bbox = BoundingBox.from_polygon(polygon)
            else:
                bbox = BoundingBox(x1=0, y1=0, x2=0, y2=0)

            text_blocks.append(
                TextBlock(
                    text=text,
                    bbox=bbox,
                    confidence=float(confidence),
                    granularity=OCRGranularity.LINE,
                    polygon=polygon,
                    language=self._lang,
                )
            )

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    # Build full_text from sorted blocks to ensure reading order
    full_text = " ".join(block.text for block in text_blocks)

    return OCROutput(
        text_blocks=text_blocks,
        full_text=full_text,
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=[self._lang],
    )
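Because PaddleOCR detects quadrilaterals, each TextBlock carries both the raw polygon (from dt_polys) and its enclosing axis-aligned bbox. A short sketch, assuming ocr is a constructed PaddleOCR instance and image is a PIL image:

result = ocr.extract(image)
for block in result.text_blocks:
    if block.polygon:                    # four [x, y] points, when detection provided them
        print(block.text, block.polygon)
    print(block.bbox.to_list())          # axis-aligned box derived via BoundingBox.from_polygon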

tesseract

Tesseract OCR extractor.

Tesseract is an open-source OCR engine originally developed at HP and later maintained by Google.
  • CPU-based (no GPU required)
  • Requires a system installation of Tesseract
  • Good for printed text; supports 100+ languages

System Requirements

macOS:   brew install tesseract
Ubuntu:  sudo apt-get install tesseract-ocr
Windows: download from https://github.com/UB-Mannheim/tesseract/wiki

Python Package

pip install pytesseract

TesseractOCRConfig

Bases: BaseModel

Configuration for Tesseract OCR extractor.

This is a single-backend model (CPU only, requires system Tesseract).

Example
config = TesseractOCRConfig(languages=["eng", "fra"], psm=3)
ocr = TesseractOCR(config=config)
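Extra Tesseract variables can be supplied through config_params; extract() forwards them as -c key=value flags alongside --oem and --psm. A sketch with an illustrative character whitelist, assuming oem and config_params are accepted as config fields (they are read as self.config.oem and self.config.config_params in extract()):

from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

config = TesseractOCRConfig(
    languages=["eng"],
    psm=6,
    oem=3,
    config_params={"tessedit_char_whitelist": "0123456789"},  # digits only (illustrative)
)
ocr = TesseractOCR(config=config)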

TesseractOCR

TesseractOCR(config: TesseractOCRConfig)

Bases: BaseOCRExtractor

Tesseract OCR extractor.

Single-backend model (CPU only). Requires system Tesseract installation.

Example
from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

ocr = TesseractOCR(config=TesseractOCRConfig(languages=["eng"]))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize Tesseract OCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: TesseractOCRConfig

RAISES DESCRIPTION
RuntimeError

If Tesseract is not installed

ImportError

If pytesseract is not installed

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def __init__(self, config: TesseractOCRConfig):
    """
    Initialize Tesseract OCR extractor.

    Args:
        config: Configuration object

    Raises:
        RuntimeError: If Tesseract is not installed
        ImportError: If pytesseract is not installed
    """
    self.config = config
    self._pytesseract = None
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks at word level

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks at word level
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data (word-level boxes)
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Parse results into TextBlocks
    text_blocks = []
    full_text_parts = []

    n_boxes = len(data["text"])
    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        # Skip empty text or low confidence (-1 means no confidence)
        if not text or conf == -1:
            continue

        # Tesseract returns confidence as 0-100, normalize to 0-1
        confidence = conf / 100.0

        # Get bounding box
        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        bbox = BoundingBox(
            x1=float(x),
            y1=float(y),
            x2=float(x + w),
            y2=float(y + h),
        )

        text_blocks.append(
            TextBlock(
                text=text,
                bbox=bbox,
                confidence=confidence,
                granularity=OCRGranularity.WORD,
                language=lang_str,
            )
        )

        full_text_parts.append(text)

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return OCROutput(
        text_blocks=text_blocks,
        full_text=" ".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )
extract_lines
extract_lines(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR and return line-level blocks.

Groups words into lines based on Tesseract's line detection.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with line-level text blocks

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract_lines(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR and return line-level blocks.

    Groups words into lines based on Tesseract's line detection.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with line-level text blocks
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string (including config_params like extract method)
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Group words into lines
    lines: Dict[tuple, Dict] = {}
    n_boxes = len(data["text"])

    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        if not text or conf == -1:
            continue

        # Tesseract provides block_num, par_num, line_num
        line_key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])

        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        if line_key not in lines:
            lines[line_key] = {
                "words": [],
                "confidences": [],
                "x1": x,
                "y1": y,
                "x2": x + w,
                "y2": y + h,
            }

        lines[line_key]["words"].append(text)
        lines[line_key]["confidences"].append(conf / 100.0)
        lines[line_key]["x1"] = min(lines[line_key]["x1"], x)
        lines[line_key]["y1"] = min(lines[line_key]["y1"], y)
        lines[line_key]["x2"] = max(lines[line_key]["x2"], x + w)
        lines[line_key]["y2"] = max(lines[line_key]["y2"], y + h)

    # Convert to TextBlocks
    text_blocks = []
    full_text_parts = []

    for line_key in sorted(lines.keys()):
        line = lines[line_key]
        line_text = " ".join(line["words"])
        avg_conf = sum(line["confidences"]) / len(line["confidences"])

        bbox = BoundingBox(
            x1=float(line["x1"]),
            y1=float(line["y1"]),
            x2=float(line["x2"]),
            y2=float(line["y2"]),
        )

        text_blocks.append(
            TextBlock(
                text=line_text,
                bbox=bbox,
                confidence=avg_conf,
                granularity=OCRGranularity.LINE,
                language=lang_str,
            )
        )

        full_text_parts.append(line_text)

    return OCROutput(
        text_blocks=text_blocks,
        full_text="\n".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )
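A short comparison sketch: extract() returns word-level blocks with full_text joined by spaces, while extract_lines() groups words into lines and joins full_text with newlines (assumes ocr is a constructed TesseractOCR and image is a PIL image):

words = ocr.extract(image)        # granularity=WORD
lines = ocr.extract_lines(image)  # granularity=LINE
print(words.block_count, lines.block_count)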

reading_order

Reading Order Module.

Provides predictors for determining the logical reading sequence of document elements based on layout detection and spatial analysis.

Available Predictors
  • RuleBasedReadingOrderPredictor: Rule-based predictor using R-tree indexing
Example
from omnidocs.tasks.reading_order import RuleBasedReadingOrderPredictor
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

# Initialize components
layout_extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig())
ocr = EasyOCR(config=EasyOCRConfig())
predictor = RuleBasedReadingOrderPredictor()

# Process document
layout = layout_extractor.extract(image)
ocr_result = ocr.extract(image)
reading_order = predictor.predict(layout, ocr_result)

# Get text in reading order
text = reading_order.get_full_text()

# Get elements by type
tables = reading_order.get_elements_by_type(ElementType.TABLE)

# Get caption associations
for elem in reading_order.ordered_elements:
    if elem.element_type == ElementType.FIGURE:
        captions = reading_order.get_captions_for(elem.original_id)
        print(f"Figure {elem.original_id} captions: {[c.text for c in captions]}")

BaseReadingOrderPredictor

Bases: ABC

Abstract base class for reading order predictors.

Reading order predictors take layout detection and OCR results and produce a properly ordered sequence of document elements.

Example
predictor = RuleBasedReadingOrderPredictor()

# Get layout and OCR
layout = layout_extractor.extract(image)
ocr = ocr_extractor.extract(image)

# Predict reading order
result = predictor.predict(layout, ocr)

# Or with multiple pages
results = predictor.predict_multi_page(layouts, ocrs)

predict abstractmethod

predict(
    layout: LayoutOutput,
    ocr: Optional[OCROutput] = None,
    page_no: int = 0,
) -> ReadingOrderOutput

Predict reading order for a single page.

PARAMETER DESCRIPTION
layout

Layout detection results with bounding boxes

TYPE: LayoutOutput

ocr

Optional OCR results. If provided, text will be matched to layout elements by bbox overlap.

TYPE: Optional[OCROutput] DEFAULT: None

page_no

Page number (for multi-page documents)

TYPE: int DEFAULT: 0

RETURNS DESCRIPTION
ReadingOrderOutput

ReadingOrderOutput with ordered elements and associations

Example
layout = layout_extractor.extract(page_image)
ocr = ocr_extractor.extract(page_image)
order = predictor.predict(layout, ocr, page_no=0)
Source code in omnidocs/tasks/reading_order/base.py
@abstractmethod
def predict(
    self,
    layout: "LayoutOutput",
    ocr: Optional["OCROutput"] = None,
    page_no: int = 0,
) -> ReadingOrderOutput:
    """
    Predict reading order for a single page.

    Args:
        layout: Layout detection results with bounding boxes
        ocr: Optional OCR results. If provided, text will be
             matched to layout elements by bbox overlap.
        page_no: Page number (for multi-page documents)

    Returns:
        ReadingOrderOutput with ordered elements and associations

    Example:
        ```python
        layout = layout_extractor.extract(page_image)
        ocr = ocr_extractor.extract(page_image)
        order = predictor.predict(layout, ocr, page_no=0)
        ```
    """
    pass

predict_multi_page

predict_multi_page(
    layouts: List[LayoutOutput],
    ocrs: Optional[List[OCROutput]] = None,
) -> List[ReadingOrderOutput]

Predict reading order for multiple pages.

PARAMETER DESCRIPTION
layouts

List of layout results, one per page

TYPE: List[LayoutOutput]

ocrs

Optional list of OCR results, one per page

TYPE: Optional[List[OCROutput]] DEFAULT: None

RETURNS DESCRIPTION
List[ReadingOrderOutput]

List of ReadingOrderOutput, one per page

Source code in omnidocs/tasks/reading_order/base.py
def predict_multi_page(
    self,
    layouts: List["LayoutOutput"],
    ocrs: Optional[List["OCROutput"]] = None,
) -> List[ReadingOrderOutput]:
    """
    Predict reading order for multiple pages.

    Args:
        layouts: List of layout results, one per page
        ocrs: Optional list of OCR results, one per page

    Returns:
        List of ReadingOrderOutput, one per page
    """
    results = []

    for i, layout in enumerate(layouts):
        ocr = ocrs[i] if ocrs else None
        result = self.predict(layout, ocr, page_no=i)
        results.append(result)

    return results
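A minimal multi-page sketch, reusing the layout_extractor, ocr, and predictor objects from the module example above and assuming page_images is a list of PIL images:

layouts = [layout_extractor.extract(img) for img in page_images]
ocrs = [ocr.extract(img) for img in page_images]
results = predictor.predict_multi_page(layouts, ocrs)
for page_no, page in enumerate(results):
    print(f"page {page_no}: {page.element_count} elements")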

BoundingBox

Bases: BaseModel

Bounding box in pixel coordinates.

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/reading_order/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/reading_order/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/reading_order/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

ElementType

Bases: str, Enum

Type of document element for reading order.

OrderedElement

Bases: BaseModel

A document element with its reading order position.

Combines layout detection results with OCR text and assigns a reading order index.

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/reading_order/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "index": self.index,
        "element_type": self.element_type.value,
        "bbox": self.bbox.to_list(),
        "text": self.text,
        "confidence": self.confidence,
        "page_no": self.page_no,
        "original_id": self.original_id,
    }

ReadingOrderOutput

Bases: BaseModel

Complete reading order prediction result.

Provides:
  • Ordered list of document elements
  • Caption-to-element associations
  • Footnote-to-element associations
  • Merge suggestions for split elements

Example
result = predictor.predict(layout, ocr)

# Get full text in reading order
full_text = result.get_full_text()

# Get elements by type
tables = result.get_elements_by_type(ElementType.TABLE)

# Find caption for a figure
captions = result.get_captions_for(figure_element.original_id)

element_count property

element_count: int

Total number of ordered elements.

get_full_text

get_full_text(separator: str = '\n\n') -> str

Get concatenated text in reading order.

Excludes page headers, footers, captions, and footnotes from main text flow.

Source code in omnidocs/tasks/reading_order/models.py
def get_full_text(self, separator: str = "\n\n") -> str:
    """
    Get concatenated text in reading order.

    Excludes page headers, footers, captions, and footnotes
    from main text flow.
    """
    main_elements = [
        e
        for e in self.ordered_elements
        if e.element_type
        not in (
            ElementType.PAGE_HEADER,
            ElementType.PAGE_FOOTER,
            ElementType.CAPTION,
            ElementType.FOOTNOTE,
        )
    ]
    return separator.join(e.text for e in main_elements if e.text)

get_elements_by_type

get_elements_by_type(
    element_type: ElementType,
) -> List[OrderedElement]

Filter elements by type.

Source code in omnidocs/tasks/reading_order/models.py
def get_elements_by_type(self, element_type: ElementType) -> List[OrderedElement]:
    """Filter elements by type."""
    return [e for e in self.ordered_elements if e.element_type == element_type]

get_captions_for

get_captions_for(element_id: int) -> List[OrderedElement]

Get caption elements for a given element ID.

Source code in omnidocs/tasks/reading_order/models.py
def get_captions_for(self, element_id: int) -> List[OrderedElement]:
    """Get caption elements for a given element ID."""
    caption_ids = self.caption_map.get(element_id, [])
    return [e for e in self.ordered_elements if e.original_id in caption_ids]

get_footnotes_for

get_footnotes_for(element_id: int) -> List[OrderedElement]

Get footnote elements for a given element ID.

Source code in omnidocs/tasks/reading_order/models.py
def get_footnotes_for(self, element_id: int) -> List[OrderedElement]:
    """Get footnote elements for a given element ID."""
    footnote_ids = self.footnote_map.get(element_id, [])
    return [e for e in self.ordered_elements if e.original_id in footnote_ids]

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/reading_order/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "ordered_elements": [e.to_dict() for e in self.ordered_elements],
        "caption_map": self.caption_map,
        "footnote_map": self.footnote_map,
        "merge_map": self.merge_map,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "element_count": self.element_count,
    }

save_json

save_json(file_path: Union[str, Path]) -> None

Save to JSON file.

Source code in omnidocs/tasks/reading_order/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """Save to JSON file."""
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")

load_json classmethod

load_json(
    file_path: Union[str, Path],
) -> ReadingOrderOutput

Load from JSON file.

Source code in omnidocs/tasks/reading_order/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "ReadingOrderOutput":
    """Load from JSON file."""
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

RuleBasedReadingOrderPredictor

RuleBasedReadingOrderPredictor()

Bases: BaseReadingOrderPredictor

Rule-based reading order predictor using spatial analysis.

Uses R-tree spatial indexing and rule-based algorithms to determine the logical reading sequence of document elements. This is a CPU-only implementation that doesn't require GPU resources.

Features:
  • Multi-column layout detection
  • Header/footer separation
  • Caption-to-figure/table association
  • Footnote linking
  • Element merge suggestions

Example
from omnidocs.tasks.reading_order import RuleBasedReadingOrderPredictor
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

# Initialize components
layout_extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig())
ocr = EasyOCR(config=EasyOCRConfig())
predictor = RuleBasedReadingOrderPredictor()

# Process document
layout = layout_extractor.extract(image)
ocr_result = ocr.extract(image)
reading_order = predictor.predict(layout, ocr_result)

# Get text in reading order
text = reading_order.get_full_text()

Initialize the reading order predictor.

Source code in omnidocs/tasks/reading_order/rule_based/predictor.py
def __init__(self):
    """Initialize the reading order predictor."""
    self.dilated_page_element = True
    # Apply horizontal dilation only if less than this page-width normalized threshold
    self._horizontal_dilation_threshold_norm = 0.15

predict

predict(
    layout: LayoutOutput,
    ocr: Optional[OCROutput] = None,
    page_no: int = 0,
) -> ReadingOrderOutput

Predict reading order for a single page.

PARAMETER DESCRIPTION
layout

Layout detection results with bounding boxes

TYPE: LayoutOutput

ocr

Optional OCR results for text content

TYPE: Optional[OCROutput] DEFAULT: None

page_no

Page number (for multi-page documents)

TYPE: int DEFAULT: 0

RETURNS DESCRIPTION
ReadingOrderOutput

ReadingOrderOutput with ordered elements and associations

Source code in omnidocs/tasks/reading_order/rule_based/predictor.py
def predict(
    self,
    layout: "LayoutOutput",
    ocr: Optional["OCROutput"] = None,
    page_no: int = 0,
) -> ReadingOrderOutput:
    """
    Predict reading order for a single page.

    Args:
        layout: Layout detection results with bounding boxes
        ocr: Optional OCR results for text content
        page_no: Page number (for multi-page documents)

    Returns:
        ReadingOrderOutput with ordered elements and associations
    """
    page_width = layout.image_width
    page_height = layout.image_height

    # Build text map from OCR if available
    text_map: Dict[int, str] = {}
    if ocr:
        text_map = self._build_text_map(layout, ocr)

    # Convert layout boxes to internal PageElements
    page_elements: List[_PageElement] = []
    for i, box in enumerate(layout.bboxes):
        label_str = box.label.value.lower()
        element_type = LABEL_TO_ELEMENT_TYPE.get(label_str, ElementType.OTHER)

        # Convert from top-left origin to bottom-left origin
        elem = _PageElement(
            cid=i,
            text=text_map.get(i, ""),
            page_no=page_no,
            page_width=page_width,
            page_height=page_height,
            label=element_type,
            left=box.bbox.x1,
            bottom=page_height - box.bbox.y2,  # Convert y2 to bottom
            right=box.bbox.x2,
            top=page_height - box.bbox.y1,  # Convert y1 to top
        )
        page_elements.append(elem)

    # Run reading order prediction
    sorted_elements = self._predict_reading_order(page_elements)

    # Get caption associations
    caption_map = self._find_to_captions(sorted_elements)

    # Get footnote associations
    footnote_map = self._find_to_footnotes(sorted_elements)

    # Get merge suggestions
    merge_map = self._predict_merges(sorted_elements)

    # Convert to OrderedElements
    ordered_elements: List[OrderedElement] = []
    for idx, elem in enumerate(sorted_elements):
        # Convert back from bottom-left to top-left origin
        bbox = BoundingBox(
            x1=elem.left,
            y1=page_height - elem.top,
            x2=elem.right,
            y2=page_height - elem.bottom,
        )

        confidence = 1.0
        if elem.cid < len(layout.bboxes):
            confidence = layout.bboxes[elem.cid].confidence

        ordered_elem = OrderedElement(
            index=idx,
            element_type=elem.label,
            bbox=bbox,
            text=elem.text,
            confidence=confidence,
            page_no=page_no,
            original_id=elem.cid,
        )
        ordered_elements.append(ordered_elem)

    return ReadingOrderOutput(
        ordered_elements=ordered_elements,
        caption_map=caption_map,
        footnote_map=footnote_map,
        merge_map=merge_map,
        image_width=page_width,
        image_height=page_height,
        model_name="RuleBasedReadingOrderPredictor",
    )
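The predictor works internally in a bottom-left origin, so y-coordinates are flipped on the way in and restored on the way out, as in the listing above. A tiny sketch of that conversion (values illustrative):

page_height = 1000
y1, y2 = 50, 80                      # layout bbox in top-left origin
bottom = page_height - y2            # 920
top = page_height - y1               # 950
assert (page_height - top, page_height - bottom) == (y1, y2)  # flipping back is lossless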

base

Base class for reading order predictors.

Defines the abstract interface that all reading order predictors must implement.

BaseReadingOrderPredictor

Bases: ABC

Abstract base class for reading order predictors.

Reading order predictors take layout detection and OCR results and produce a properly ordered sequence of document elements.

Example
predictor = RuleBasedReadingOrderPredictor()

# Get layout and OCR
layout = layout_extractor.extract(image)
ocr = ocr_extractor.extract(image)

# Predict reading order
result = predictor.predict(layout, ocr)

# Or with multiple pages
results = predictor.predict_multi_page(layouts, ocrs)
predict abstractmethod
predict(
    layout: LayoutOutput,
    ocr: Optional[OCROutput] = None,
    page_no: int = 0,
) -> ReadingOrderOutput

Predict reading order for a single page.

PARAMETER DESCRIPTION
layout

Layout detection results with bounding boxes

TYPE: LayoutOutput

ocr

Optional OCR results. If provided, text will be matched to layout elements by bbox overlap.

TYPE: Optional[OCROutput] DEFAULT: None

page_no

Page number (for multi-page documents)

TYPE: int DEFAULT: 0

RETURNS DESCRIPTION
ReadingOrderOutput

ReadingOrderOutput with ordered elements and associations

Example
layout = layout_extractor.extract(page_image)
ocr = ocr_extractor.extract(page_image)
order = predictor.predict(layout, ocr, page_no=0)
Source code in omnidocs/tasks/reading_order/base.py
@abstractmethod
def predict(
    self,
    layout: "LayoutOutput",
    ocr: Optional["OCROutput"] = None,
    page_no: int = 0,
) -> ReadingOrderOutput:
    """
    Predict reading order for a single page.

    Args:
        layout: Layout detection results with bounding boxes
        ocr: Optional OCR results. If provided, text will be
             matched to layout elements by bbox overlap.
        page_no: Page number (for multi-page documents)

    Returns:
        ReadingOrderOutput with ordered elements and associations

    Example:
        ```python
        layout = layout_extractor.extract(page_image)
        ocr = ocr_extractor.extract(page_image)
        order = predictor.predict(layout, ocr, page_no=0)
        ```
    """
    pass
predict_multi_page
predict_multi_page(
    layouts: List[LayoutOutput],
    ocrs: Optional[List[OCROutput]] = None,
) -> List[ReadingOrderOutput]

Predict reading order for multiple pages.

PARAMETER DESCRIPTION
layouts

List of layout results, one per page

TYPE: List[LayoutOutput]

ocrs

Optional list of OCR results, one per page

TYPE: Optional[List[OCROutput]] DEFAULT: None

RETURNS DESCRIPTION
List[ReadingOrderOutput]

List of ReadingOrderOutput, one per page

Source code in omnidocs/tasks/reading_order/base.py
def predict_multi_page(
    self,
    layouts: List["LayoutOutput"],
    ocrs: Optional[List["OCROutput"]] = None,
) -> List[ReadingOrderOutput]:
    """
    Predict reading order for multiple pages.

    Args:
        layouts: List of layout results, one per page
        ocrs: Optional list of OCR results, one per page

    Returns:
        List of ReadingOrderOutput, one per page
    """
    results = []

    for i, layout in enumerate(layouts):
        ocr = ocrs[i] if ocrs else None
        result = self.predict(layout, ocr, page_no=i)
        results.append(result)

    return results

models

Pydantic models for reading order prediction.

Takes layout detection and OCR results, produces ordered element sequence with caption and footnote associations.

Example
# Get layout and OCR
layout = layout_extractor.extract(image)
ocr = ocr_extractor.extract(image)

# Predict reading order
reading_order = predictor.predict(layout, ocr)

# Iterate in reading order
for element in reading_order.ordered_elements:
    print(f"{element.index}: [{element.element_type}] {element.text[:50]}...")

# Get caption associations
for fig_id, caption_ids in reading_order.caption_map.items():
    print(f"Figure {fig_id} has captions: {caption_ids}")

ElementType

Bases: str, Enum

Type of document element for reading order.

BoundingBox

Bases: BaseModel

Bounding box in pixel coordinates.

width property
width: float

Width of the bounding box.

height property
height: float

Height of the bounding box.

center property
center: Tuple[float, float]

Center point of the bounding box.

to_list
to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/reading_order/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]
from_list classmethod
from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/reading_order/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])
to_normalized
to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/reading_order/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

OrderedElement

Bases: BaseModel

A document element with its reading order position.

Combines layout detection results with OCR text and assigns a reading order index.

to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/reading_order/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "index": self.index,
        "element_type": self.element_type.value,
        "bbox": self.bbox.to_list(),
        "text": self.text,
        "confidence": self.confidence,
        "page_no": self.page_no,
        "original_id": self.original_id,
    }

ReadingOrderOutput

Bases: BaseModel

Complete reading order prediction result.

Provides:
  • Ordered list of document elements
  • Caption-to-element associations
  • Footnote-to-element associations
  • Merge suggestions for split elements

Example
result = predictor.predict(layout, ocr)

# Get full text in reading order
full_text = result.get_full_text()

# Get elements by type
tables = result.get_elements_by_type(ElementType.TABLE)

# Find caption for a figure
captions = result.get_captions_for(figure_element.original_id)
element_count property
element_count: int

Total number of ordered elements.

get_full_text
get_full_text(separator: str = '\n\n') -> str

Get concatenated text in reading order.

Excludes page headers, footers, captions, and footnotes from main text flow.

Source code in omnidocs/tasks/reading_order/models.py
def get_full_text(self, separator: str = "\n\n") -> str:
    """
    Get concatenated text in reading order.

    Excludes page headers, footers, captions, and footnotes
    from main text flow.
    """
    main_elements = [
        e
        for e in self.ordered_elements
        if e.element_type
        not in (
            ElementType.PAGE_HEADER,
            ElementType.PAGE_FOOTER,
            ElementType.CAPTION,
            ElementType.FOOTNOTE,
        )
    ]
    return separator.join(e.text for e in main_elements if e.text)
get_elements_by_type
get_elements_by_type(
    element_type: ElementType,
) -> List[OrderedElement]

Filter elements by type.

Source code in omnidocs/tasks/reading_order/models.py
def get_elements_by_type(self, element_type: ElementType) -> List[OrderedElement]:
    """Filter elements by type."""
    return [e for e in self.ordered_elements if e.element_type == element_type]
get_captions_for
get_captions_for(element_id: int) -> List[OrderedElement]

Get caption elements for a given element ID.

Source code in omnidocs/tasks/reading_order/models.py
def get_captions_for(self, element_id: int) -> List[OrderedElement]:
    """Get caption elements for a given element ID."""
    caption_ids = self.caption_map.get(element_id, [])
    return [e for e in self.ordered_elements if e.original_id in caption_ids]
get_footnotes_for
get_footnotes_for(element_id: int) -> List[OrderedElement]

Get footnote elements for a given element ID.

Source code in omnidocs/tasks/reading_order/models.py
def get_footnotes_for(self, element_id: int) -> List[OrderedElement]:
    """Get footnote elements for a given element ID."""
    footnote_ids = self.footnote_map.get(element_id, [])
    return [e for e in self.ordered_elements if e.original_id in footnote_ids]
to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/reading_order/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "ordered_elements": [e.to_dict() for e in self.ordered_elements],
        "caption_map": self.caption_map,
        "footnote_map": self.footnote_map,
        "merge_map": self.merge_map,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "element_count": self.element_count,
    }
save_json
save_json(file_path: Union[str, Path]) -> None

Save to JSON file.

Source code in omnidocs/tasks/reading_order/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """Save to JSON file."""
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
load_json classmethod
load_json(
    file_path: Union[str, Path],
) -> ReadingOrderOutput

Load from JSON file.

Source code in omnidocs/tasks/reading_order/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "ReadingOrderOutput":
    """Load from JSON file."""
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

rule_based

Rule-based reading order predictor module.

Provides rule-based reading order prediction using spatial analysis.

RuleBasedReadingOrderPredictor

RuleBasedReadingOrderPredictor()

Bases: BaseReadingOrderPredictor

Rule-based reading order predictor using spatial analysis.

Uses R-tree spatial indexing and rule-based algorithms to determine the logical reading sequence of document elements. This is a CPU-only implementation that doesn't require GPU resources.

Features:
  • Multi-column layout detection
  • Header/footer separation
  • Caption-to-figure/table association
  • Footnote linking
  • Element merge suggestions

Example
from omnidocs.tasks.reading_order import RuleBasedReadingOrderPredictor
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

# Initialize components
layout_extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig())
ocr = EasyOCR(config=EasyOCRConfig())
predictor = RuleBasedReadingOrderPredictor()

# Process document
layout = layout_extractor.extract(image)
ocr_result = ocr.extract(image)
reading_order = predictor.predict(layout, ocr_result)

# Get text in reading order
text = reading_order.get_full_text()

Initialize the reading order predictor.

Source code in omnidocs/tasks/reading_order/rule_based/predictor.py
def __init__(self):
    """Initialize the reading order predictor."""
    self.dilated_page_element = True
    # Apply horizontal dilation only if less than this page-width normalized threshold
    self._horizontal_dilation_threshold_norm = 0.15
predict
predict(
    layout: LayoutOutput,
    ocr: Optional[OCROutput] = None,
    page_no: int = 0,
) -> ReadingOrderOutput

Predict reading order for a single page.

PARAMETER DESCRIPTION
layout

Layout detection results with bounding boxes

TYPE: LayoutOutput

ocr

Optional OCR results for text content

TYPE: Optional[OCROutput] DEFAULT: None

page_no

Page number (for multi-page documents)

TYPE: int DEFAULT: 0

RETURNS DESCRIPTION
ReadingOrderOutput

ReadingOrderOutput with ordered elements and associations

Source code in omnidocs/tasks/reading_order/rule_based/predictor.py
def predict(
    self,
    layout: "LayoutOutput",
    ocr: Optional["OCROutput"] = None,
    page_no: int = 0,
) -> ReadingOrderOutput:
    """
    Predict reading order for a single page.

    Args:
        layout: Layout detection results with bounding boxes
        ocr: Optional OCR results for text content
        page_no: Page number (for multi-page documents)

    Returns:
        ReadingOrderOutput with ordered elements and associations
    """
    page_width = layout.image_width
    page_height = layout.image_height

    # Build text map from OCR if available
    text_map: Dict[int, str] = {}
    if ocr:
        text_map = self._build_text_map(layout, ocr)

    # Convert layout boxes to internal PageElements
    page_elements: List[_PageElement] = []
    for i, box in enumerate(layout.bboxes):
        label_str = box.label.value.lower()
        element_type = LABEL_TO_ELEMENT_TYPE.get(label_str, ElementType.OTHER)

        # Convert from top-left origin to bottom-left origin
        elem = _PageElement(
            cid=i,
            text=text_map.get(i, ""),
            page_no=page_no,
            page_width=page_width,
            page_height=page_height,
            label=element_type,
            left=box.bbox.x1,
            bottom=page_height - box.bbox.y2,  # Convert y2 to bottom
            right=box.bbox.x2,
            top=page_height - box.bbox.y1,  # Convert y1 to top
        )
        page_elements.append(elem)

    # Run reading order prediction
    sorted_elements = self._predict_reading_order(page_elements)

    # Get caption associations
    caption_map = self._find_to_captions(sorted_elements)

    # Get footnote associations
    footnote_map = self._find_to_footnotes(sorted_elements)

    # Get merge suggestions
    merge_map = self._predict_merges(sorted_elements)

    # Convert to OrderedElements
    ordered_elements: List[OrderedElement] = []
    for idx, elem in enumerate(sorted_elements):
        # Convert back from bottom-left to top-left origin
        bbox = BoundingBox(
            x1=elem.left,
            y1=page_height - elem.top,
            x2=elem.right,
            y2=page_height - elem.bottom,
        )

        confidence = 1.0
        if elem.cid < len(layout.bboxes):
            confidence = layout.bboxes[elem.cid].confidence

        ordered_elem = OrderedElement(
            index=idx,
            element_type=elem.label,
            bbox=bbox,
            text=elem.text,
            confidence=confidence,
            page_no=page_no,
            original_id=elem.cid,
        )
        ordered_elements.append(ordered_elem)

    return ReadingOrderOutput(
        ordered_elements=ordered_elements,
        caption_map=caption_map,
        footnote_map=footnote_map,
        merge_map=merge_map,
        image_width=page_width,
        image_height=page_height,
        model_name="RuleBasedReadingOrderPredictor",
    )

predictor

Rule-based reading order predictor.

Uses spatial analysis and R-tree indexing to determine the logical reading sequence of document elements. A self-contained implementation with no external dependency on docling-ibm-models.

Based on the algorithm from docling-ibm-models, adapted for omnidocs.

RuleBasedReadingOrderPredictor
RuleBasedReadingOrderPredictor()

Bases: BaseReadingOrderPredictor

Rule-based reading order predictor using spatial analysis.

Uses R-tree spatial indexing and rule-based algorithms to determine the logical reading sequence of document elements. This is a CPU-only implementation that doesn't require GPU resources.

Features:
  • Multi-column layout detection
  • Header/footer separation
  • Caption-to-figure/table association
  • Footnote linking
  • Element merge suggestions

Example
from omnidocs.tasks.reading_order import RuleBasedReadingOrderPredictor
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

# Initialize components
layout_extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig())
ocr = EasyOCR(config=EasyOCRConfig())
predictor = RuleBasedReadingOrderPredictor()

# Process document
layout = layout_extractor.extract(image)
ocr_result = ocr.extract(image)
reading_order = predictor.predict(layout, ocr_result)

# Get text in reading order
text = reading_order.get_full_text()

Initialize the reading order predictor.

Source code in omnidocs/tasks/reading_order/rule_based/predictor.py
def __init__(self):
    """Initialize the reading order predictor."""
    self.dilated_page_element = True
    # Apply horizontal dilation only if less than this page-width normalized threshold
    self._horizontal_dilation_threshold_norm = 0.15
predict
predict(
    layout: LayoutOutput,
    ocr: Optional[OCROutput] = None,
    page_no: int = 0,
) -> ReadingOrderOutput

Predict reading order for a single page.

PARAMETER DESCRIPTION
layout

Layout detection results with bounding boxes

TYPE: LayoutOutput

ocr

Optional OCR results for text content

TYPE: Optional[OCROutput] DEFAULT: None

page_no

Page number (for multi-page documents)

TYPE: int DEFAULT: 0

RETURNS DESCRIPTION
ReadingOrderOutput

ReadingOrderOutput with ordered elements and associations

Source code in omnidocs/tasks/reading_order/rule_based/predictor.py
def predict(
    self,
    layout: "LayoutOutput",
    ocr: Optional["OCROutput"] = None,
    page_no: int = 0,
) -> ReadingOrderOutput:
    """
    Predict reading order for a single page.

    Args:
        layout: Layout detection results with bounding boxes
        ocr: Optional OCR results for text content
        page_no: Page number (for multi-page documents)

    Returns:
        ReadingOrderOutput with ordered elements and associations
    """
    page_width = layout.image_width
    page_height = layout.image_height

    # Build text map from OCR if available
    text_map: Dict[int, str] = {}
    if ocr:
        text_map = self._build_text_map(layout, ocr)

    # Convert layout boxes to internal PageElements
    page_elements: List[_PageElement] = []
    for i, box in enumerate(layout.bboxes):
        label_str = box.label.value.lower()
        element_type = LABEL_TO_ELEMENT_TYPE.get(label_str, ElementType.OTHER)

        # Convert from top-left origin to bottom-left origin
        elem = _PageElement(
            cid=i,
            text=text_map.get(i, ""),
            page_no=page_no,
            page_width=page_width,
            page_height=page_height,
            label=element_type,
            left=box.bbox.x1,
            bottom=page_height - box.bbox.y2,  # Convert y2 to bottom
            right=box.bbox.x2,
            top=page_height - box.bbox.y1,  # Convert y1 to top
        )
        page_elements.append(elem)

    # Run reading order prediction
    sorted_elements = self._predict_reading_order(page_elements)

    # Get caption associations
    caption_map = self._find_to_captions(sorted_elements)

    # Get footnote associations
    footnote_map = self._find_to_footnotes(sorted_elements)

    # Get merge suggestions
    merge_map = self._predict_merges(sorted_elements)

    # Convert to OrderedElements
    ordered_elements: List[OrderedElement] = []
    for idx, elem in enumerate(sorted_elements):
        # Convert back from bottom-left to top-left origin
        bbox = BoundingBox(
            x1=elem.left,
            y1=page_height - elem.top,
            x2=elem.right,
            y2=page_height - elem.bottom,
        )

        confidence = 1.0
        if elem.cid < len(layout.bboxes):
            confidence = layout.bboxes[elem.cid].confidence

        ordered_elem = OrderedElement(
            index=idx,
            element_type=elem.label,
            bbox=bbox,
            text=elem.text,
            confidence=confidence,
            page_no=page_no,
            original_id=elem.cid,
        )
        ordered_elements.append(ordered_elem)

    return ReadingOrderOutput(
        ordered_elements=ordered_elements,
        caption_map=caption_map,
        footnote_map=footnote_map,
        merge_map=merge_map,
        image_width=page_width,
        image_height=page_height,
        model_name="RuleBasedReadingOrderPredictor",
    )
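The y-axis flips above are exact inverses of each other: the same page_height subtraction converts a top-left-origin box to bottom-left origin and back. A minimal arithmetic sketch with illustrative numbers:

# Illustrative round trip for the y-axis flip used in predict().
page_height = 1000.0
y1, y2 = 120.0, 180.0              # top-left origin (y grows downward)
top = page_height - y1             # 880.0 in bottom-left origin (y grows upward)
bottom = page_height - y2          # 820.0
assert page_height - top == y1     # converting back restores the input box
assert page_height - bottom == y2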

structured_extraction

Structured Extraction Module.

Provides extractors that return structured data from document images, using Pydantic schemas for type-safe output.

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor

class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)
result = extractor.extract(
    "invoice.png",
    schema=Invoice,
    prompt="Extract invoice details from this document.",
)
print(result.data.vendor, result.data.total)

BaseStructuredExtractor

Bases: ABC

Abstract base class for structured extractors.

Structured extractors return data matching a user-provided Pydantic schema.

Example
class MyExtractor(BaseStructuredExtractor):
    def __init__(self, config):
        self.config = config

    def _load_model(self):
        pass

    def extract(self, image, schema, prompt):
        return StructuredOutput(data=schema(...), ...)

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Extract structured data from an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        schema: Pydantic model class defining the expected output structure.
        prompt: Extraction prompt describing what to extract.

    Returns:
        StructuredOutput containing the validated data.
    """
    pass

StructuredOutput

Bases: BaseModel

Output from structured extraction.

Contains the extracted data as a validated Pydantic model instance, along with metadata about the extraction.
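A minimal sketch of reading the fields, assuming result is a StructuredOutput returned by a structured extractor with an Invoice schema (as in the VLMStructuredExtractor example further down):

# data is the validated schema instance; the other fields are metadata
# populated by the extractor.
invoice = result.data
print(invoice.vendor, invoice.total)
print(result.image_width, result.image_height, result.model_name)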

VLMStructuredExtractor

VLMStructuredExtractor(config: VLMAPIConfig)

Bases: BaseStructuredExtractor

Provider-agnostic VLM structured extractor using litellm.

Extracts structured data from document images using any cloud VLM API. Uses litellm's native response_format support to send Pydantic schemas to providers that support structured output (OpenAI, Gemini, etc.).

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor

class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)
result = extractor.extract("invoice.png", schema=Invoice, prompt="Extract invoice fields")
print(result.data.vendor)

Initialize VLM structured extractor.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/structured_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM structured extractor.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Extract structured data from an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        schema: Pydantic model class defining the expected output structure.
        prompt: Extraction prompt describing what to extract.

    Returns:
        StructuredOutput containing the validated data.
    """
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    data = vlm_structured_completion(self.config, prompt, pil_image, schema)

    return StructuredOutput(
        data=data,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )

base

Base class for structured extractors.

Defines the abstract interface for extracting structured data from document images.

BaseStructuredExtractor

Bases: ABC

Abstract base class for structured extractors.

Structured extractors return data matching a user-provided Pydantic schema.

Example
class MyExtractor(BaseStructuredExtractor):
    def __init__(self, config):
        self.config = config

    def _load_model(self):
        pass

    def extract(self, image, schema, prompt):
        return StructuredOutput(data=schema(...), ...)
extract abstractmethod
extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Extract structured data from an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        schema: Pydantic model class defining the expected output structure.
        prompt: Extraction prompt describing what to extract.

    Returns:
        StructuredOutput containing the validated data.
    """
    pass

models

Pydantic models for structured extraction outputs.

StructuredOutput

Bases: BaseModel

Output from structured extraction.

Contains the extracted data as a validated Pydantic model instance, along with metadata about the extraction.

vlm

VLM structured extractor.

A provider-agnostic Vision-Language Model structured extractor using litellm. Extracts structured data matching a Pydantic schema from document images.

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor

class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]
    date: str

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)

result = extractor.extract(
    image="invoice.png",
    schema=Invoice,
    prompt="Extract invoice details from this document.",
)
print(result.data.vendor, result.data.total)

VLMStructuredExtractor

VLMStructuredExtractor(config: VLMAPIConfig)

Bases: BaseStructuredExtractor

Provider-agnostic VLM structured extractor using litellm.

Extracts structured data from document images using any cloud VLM API. Uses litellm's native response_format support to send Pydantic schemas to providers that support structured output (OpenAI, Gemini, etc.).

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor

class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)
result = extractor.extract("invoice.png", schema=Invoice, prompt="Extract invoice fields")
print(result.data.vendor)

Initialize VLM structured extractor.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/structured_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM structured extractor.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True
extract
extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Extract structured data from an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        schema: Pydantic model class defining the expected output structure.
        prompt: Extraction prompt describing what to extract.

    Returns:
        StructuredOutput containing the validated data.
    """
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    data = vlm_structured_completion(self.config, prompt, pil_image, schema)

    return StructuredOutput(
        data=data,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )

table_extraction

Table Extraction Module.

Provides extractors for detecting and extracting table structure from document images. Outputs structured table data with cells, spans, and multiple export formats (HTML, Markdown, Pandas DataFrame).

Available Extractors
  • TableFormerExtractor: Transformer-based table structure extractor
Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Initialize extractor
extractor = TableFormerExtractor(
    config=TableFormerConfig(mode="fast", device="cuda")
)

# Extract table structure
result = extractor.extract(table_image)

# Get HTML output
html = result.to_html()

# Get DataFrame
df = result.to_dataframe()

# Get Markdown
md = result.to_markdown()

# Access cells
for cell in result.cells:
    print(f"[{cell.row},{cell.col}] {cell.text}")

BaseTableExtractor

Bases: ABC

Abstract base class for table structure extractors.

Table extractors analyze table images to detect cell structure, identify headers, and extract text content.

Example
class MyTableExtractor(BaseTableExtractor):
    def __init__(self, config: MyConfig):
        self.config = config
        self._load_model()

    def _load_model(self):
        # Load model weights
        pass

    def extract(self, image):
        # Run extraction
        return TableOutput(...)

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
    ocr_output: Optional[OCROutput] = None,
) -> TableOutput

Extract table structure from an image.

PARAMETER DESCRIPTION
image

Table image (should be cropped to table region)

TYPE: Union[Image, ndarray, str, Path]

ocr_output

Optional OCR results for cell text matching. If not provided, the model will attempt to extract text.

TYPE: Optional[OCROutput] DEFAULT: None

RETURNS DESCRIPTION
TableOutput

TableOutput with cells, structure, and export methods

Example
# Without OCR (model extracts text)
result = extractor.extract(table_image)

# With OCR (better text quality)
ocr = some_ocr.extract(table_image)
result = extractor.extract(table_image, ocr_output=ocr)
Source code in omnidocs/tasks/table_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    ocr_output: Optional["OCROutput"] = None,
) -> TableOutput:
    """
    Extract table structure from an image.

    Args:
        image: Table image (should be cropped to table region)
        ocr_output: Optional OCR results for cell text matching.
                   If not provided, model will attempt to extract text.

    Returns:
        TableOutput with cells, structure, and export methods

    Example:
        ```python
        # Without OCR (model extracts text)
        result = extractor.extract(table_image)

        # With OCR (better text quality)
        ocr = some_ocr.extract(table_image)
        result = extractor.extract(table_image, ocr_output=ocr)
        ```
    """
    pass

batch_extract

batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    ocr_outputs: Optional[List[OCROutput]] = None,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TableOutput]

Extract tables from multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching.

PARAMETER DESCRIPTION
images

List of table images

TYPE: List[Union[Image, ndarray, str, Path]]

ocr_outputs

Optional list of OCR results (same length as images)

TYPE: Optional[List[OCROutput]] DEFAULT: None

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TableOutput]

List of TableOutput in same order as input

Examples:

results = extractor.batch_extract(table_images)
Source code in omnidocs/tasks/table_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    ocr_outputs: Optional[List["OCROutput"]] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TableOutput]:
    """
    Extract tables from multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching.

    Args:
        images: List of table images
        ocr_outputs: Optional list of OCR results (same length as images)
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TableOutput in same order as input

    Examples:
        ```python
        results = extractor.batch_extract(table_images)
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        ocr = ocr_outputs[i] if ocr_outputs else None
        result = self.extract(image, ocr_output=ocr)
        results.append(result)

    return results
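A minimal sketch of wiring in a progress callback; as the loop above shows, the callback receives the 1-based index of the image being processed and the total count. table_images is assumed to be a list of table image paths or PIL images.

def report(current: int, total: int) -> None:
    print(f"table {current}/{total}")

results = extractor.batch_extract(table_images, progress_callback=report)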

extract_document

extract_document(
    document: Document,
    table_bboxes: Optional[List[List[float]]] = None,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TableOutput]

Extract tables from all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

table_bboxes

Optional list of table bounding boxes per page. Each element should be a list of [x1, y1, x2, y2] coords.

TYPE: Optional[List[List[float]]] DEFAULT: None

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TableOutput]

List of TableOutput, one per detected table

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
Source code in omnidocs/tasks/table_extraction/base.py
def extract_document(
    self,
    document: "Document",
    table_bboxes: Optional[List[List[float]]] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TableOutput]:
    """
    Extract tables from all pages of a document.

    Args:
        document: Document instance
        table_bboxes: Optional list of table bounding boxes per page.
                     Each element should be a list of [x1, y1, x2, y2] coords.
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TableOutput, one per detected table

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc)
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        # If no bboxes provided, process entire page
        if table_bboxes is None:
            result = self.extract(page)
            results.append(result)
        else:
            # Crop and process each table region
            for bbox in table_bboxes:
                x1, y1, x2, y2 = bbox
                table_region = page.crop((x1, y1, x2, y2))
                result = self.extract(table_region)
                results.append(result)

    return results
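A minimal sketch of passing explicit table regions (coordinates are illustrative). Note that, as written above, every bbox in table_bboxes is cropped from every page, so this is most useful for single-page documents or layouts where tables sit in the same place on each page.

doc = Document.from_pdf("paper.pdf")

# Illustrative regions; each [x1, y1, x2, y2] box is cropped from every page.
regions = [
    [72.0, 90.0, 540.0, 320.0],
    [72.0, 360.0, 540.0, 700.0],
]
results = extractor.extract_document(doc, table_bboxes=regions)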

BoundingBox

Bases: BaseModel

Bounding box in pixel coordinates.

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/table_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/table_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])

from_ltrb classmethod

from_ltrb(
    left: float, top: float, right: float, bottom: float
) -> BoundingBox

Create from left, top, right, bottom coordinates.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def from_ltrb(cls, left: float, top: float, right: float, bottom: float) -> "BoundingBox":
    """Create from left, top, right, bottom coordinates."""
    return cls(x1=left, y1=top, x2=right, y2=bottom)

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/table_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )
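A minimal worked example of the scaling, assuming NORMALIZED_SIZE is 1024 as the 0-1024 range above implies (values are illustrative):

from omnidocs.tasks.table_extraction.models import BoundingBox

box = BoundingBox(x1=100, y1=50, x2=300, y2=150)
norm = box.to_normalized(image_width=2048, image_height=512)
# x is scaled by 1024 / 2048 = 0.5 and y by 1024 / 512 = 2.0,
# giving (50.0, 100.0, 150.0, 300.0) in the 0-1024 space.
print(norm.to_xyxy())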

CellType

Bases: str, Enum

Type of table cell.

TableCell

Bases: BaseModel

Single table cell with position, span, and content.

The cell position uses 0-indexed row/column indices. Spans indicate how many rows/columns the cell occupies.

end_row property

end_row: int

Ending row index (exclusive).

end_col property

end_col: int

Ending column index (exclusive).

is_header property

is_header: bool

Check if cell is any type of header.

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/table_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "row": self.row,
        "col": self.col,
        "row_span": self.row_span,
        "col_span": self.col_span,
        "text": self.text,
        "cell_type": self.cell_type.value,
        "bbox": self.bbox.to_list() if self.bbox else None,
        "confidence": self.confidence,
    }

TableOutput

Bases: BaseModel

Complete table extraction result.

Provides multiple export formats and utility methods for working with extracted table data.

Example
result = extractor.extract(table_image)

# Basic info
print(f"Table: {result.num_rows}x{result.num_cols}")

# Export to HTML
html = result.to_html()

# Export to Pandas
df = result.to_dataframe()

# Export to Markdown
md = result.to_markdown()

# Access specific cell
cell = result.get_cell(row=0, col=0)

cell_count property

cell_count: int

Number of cells in the table.

has_headers property

has_headers: bool

Check if table has header cells.

get_cell

get_cell(row: int, col: int) -> Optional[TableCell]

Get cell at specific position.

Handles merged cells by returning the cell that covers the position.

Source code in omnidocs/tasks/table_extraction/models.py
def get_cell(self, row: int, col: int) -> Optional[TableCell]:
    """
    Get cell at specific position.

    Handles merged cells by returning the cell that covers the position.
    """
    for cell in self.cells:
        if cell.row <= row < cell.end_row and cell.col <= col < cell.end_col:
            return cell
    return None
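A minimal sketch of the merged-cell behaviour, assuming the extracted table's top-left cell spans two columns (so its end_col is 2, exclusive):

cell_a = result.get_cell(row=0, col=0)
cell_b = result.get_cell(row=0, col=1)
assert cell_a is cell_b          # both positions fall inside the same span
print(cell_a.text, cell_a.col_span)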

get_row

get_row(row: int) -> List[TableCell]

Get all cells in a specific row.

Source code in omnidocs/tasks/table_extraction/models.py
def get_row(self, row: int) -> List[TableCell]:
    """Get all cells in a specific row."""
    return [c for c in self.cells if c.row == row]

get_column

get_column(col: int) -> List[TableCell]

Get all cells in a specific column.

Source code in omnidocs/tasks/table_extraction/models.py
def get_column(self, col: int) -> List[TableCell]:
    """Get all cells in a specific column."""
    return [c for c in self.cells if c.col == col]

to_html

to_html(include_styles: bool = True) -> str

Convert table to HTML string.

PARAMETER DESCRIPTION
include_styles

Whether to include basic CSS styling

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
str

HTML table string

Example
html = result.to_html()
with open("table.html", "w") as f:
    f.write(html)
Source code in omnidocs/tasks/table_extraction/models.py
def to_html(self, include_styles: bool = True) -> str:
    """
    Convert table to HTML string.

    Args:
        include_styles: Whether to include basic CSS styling

    Returns:
        HTML table string

    Example:
        ```python
        html = result.to_html()
        with open("table.html", "w") as f:
            f.write(html)
        ```
    """
    # Build 2D grid accounting for spans
    grid: List[List[Optional[TableCell]]] = [[None for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        for r in range(cell.row, cell.end_row):
            for c in range(cell.col, cell.end_col):
                if r < self.num_rows and c < self.num_cols:
                    grid[r][c] = cell

    # Generate HTML
    lines = []

    if include_styles:
        lines.append('<table style="border-collapse: collapse; width: 100%;">')
    else:
        lines.append("<table>")

    processed: set[Tuple[int, int]] = set()  # Track cells we've already output

    for row_idx in range(self.num_rows):
        lines.append("  <tr>")

        for col_idx in range(self.num_cols):
            cell = grid[row_idx][col_idx]

            if cell is None:
                lines.append("    <td></td>")
                continue

            # Skip if this cell was already output (merged cell)
            cell_id = (cell.row, cell.col)
            if cell_id in processed:
                continue
            processed.add(cell_id)

            # Determine tag based on cell type
            tag = "th" if cell.is_header else "td"

            # Build attributes
            attrs = []
            if cell.row_span > 1:
                attrs.append(f'rowspan="{cell.row_span}"')
            if cell.col_span > 1:
                attrs.append(f'colspan="{cell.col_span}"')
            if include_styles:
                attrs.append('style="border: 1px solid #ddd; padding: 8px;"')

            attr_str = " " + " ".join(attrs) if attrs else ""

            # Escape HTML in text
            text = (cell.text or "").replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

            lines.append(f"    <{tag}{attr_str}>{text}</{tag}>")

        lines.append("  </tr>")

    lines.append("</table>")

    return "\n".join(lines)

to_dataframe

to_dataframe()

Convert table to Pandas DataFrame.

RETURNS DESCRIPTION

pandas.DataFrame with table data

RAISES DESCRIPTION
ImportError

If pandas is not installed

Example
df = result.to_dataframe()
print(df.head())
df.to_csv("table.csv")
Source code in omnidocs/tasks/table_extraction/models.py
def to_dataframe(self):
    """
    Convert table to Pandas DataFrame.

    Returns:
        pandas.DataFrame with table data

    Raises:
        ImportError: If pandas is not installed

    Example:
        ```python
        df = result.to_dataframe()
        print(df.head())
        df.to_csv("table.csv")
        ```
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas is required for to_dataframe(). Install with: pip install pandas")

    # Build 2D array
    data: List[List[Optional[str]]] = [[None for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        # For merged cells, put value in top-left position
        if cell.row < self.num_rows and cell.col < self.num_cols:
            data[cell.row][cell.col] = cell.text

    # Determine if first row is header
    first_row_cells = self.get_row(0)
    use_header = all(c.cell_type == CellType.COLUMN_HEADER for c in first_row_cells) if first_row_cells else False

    if use_header and self.num_rows > 1:
        headers = data[0]
        data = data[1:]
        return pd.DataFrame(data, columns=headers)
    else:
        return pd.DataFrame(data)

to_markdown

to_markdown() -> str

Convert table to Markdown format.

Note: Markdown tables don't support merged cells, so spans are ignored and only the top-left cell value is used.

RETURNS DESCRIPTION
str

Markdown table string

Source code in omnidocs/tasks/table_extraction/models.py
def to_markdown(self) -> str:
    """
    Convert table to Markdown format.

    Note: Markdown tables don't support merged cells, so spans
    are ignored and only the top-left cell value is used.

    Returns:
        Markdown table string
    """
    if self.num_rows == 0 or self.num_cols == 0:
        return ""

    # Build 2D grid
    grid: List[List[str]] = [["" for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        if cell.row < self.num_rows and cell.col < self.num_cols:
            grid[cell.row][cell.col] = cell.text or ""

    lines = []

    # Header row
    lines.append("| " + " | ".join(grid[0]) + " |")

    # Separator
    lines.append("| " + " | ".join(["---"] * self.num_cols) + " |")

    # Data rows
    for row in grid[1:]:
        lines.append("| " + " | ".join(row) + " |")

    return "\n".join(lines)

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/table_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "cells": [c.to_dict() for c in self.cells],
        "num_rows": self.num_rows,
        "num_cols": self.num_cols,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "html": self.to_html(include_styles=False),
    }

save_json

save_json(file_path: Union[str, Path]) -> None

Save to JSON file.

Source code in omnidocs/tasks/table_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """Save to JSON file."""
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")

load_json classmethod

load_json(file_path: Union[str, Path]) -> TableOutput

Load from JSON file.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "TableOutput":
    """Load from JSON file."""
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

TableFormerConfig

Bases: BaseModel

Configuration for TableFormer table structure extractor.

TableFormer is a transformer-based model that predicts table structure using OTSL (Optimized Table Structure Language) tags and cell bounding boxes.

ATTRIBUTE DESCRIPTION
mode

Inference mode - "fast" or "accurate"

TYPE: TableFormerMode

device

Device for inference - "cpu", "cuda", "mps", or "auto"

TYPE: Literal['cpu', 'cuda', 'mps', 'auto']

num_threads

Number of CPU threads for inference

TYPE: int

do_cell_matching

Whether to match predicted cells with OCR text cells

TYPE: bool

artifacts_path

Path to pre-downloaded model artifacts

TYPE: Optional[str]

repo_id

HuggingFace model repository

TYPE: str

revision

Model revision/tag

TYPE: str

Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Fast mode
extractor = TableFormerExtractor(config=TableFormerConfig(mode="fast"))

# Accurate mode with GPU
extractor = TableFormerExtractor(
    config=TableFormerConfig(
        mode="accurate",
        device="cuda",
        do_cell_matching=True,
    )
)

TableFormerExtractor

TableFormerExtractor(config: TableFormerConfig)

Bases: BaseTableExtractor

Table structure extractor using TableFormer model.

TableFormer is a transformer-based model that predicts table structure using OTSL (Optimized Table Structure Language) tags. It can detect:
  • Cell boundaries (bounding boxes)
  • Row and column spans
  • Header cells (column and row headers)
  • Section rows

Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Initialize extractor
extractor = TableFormerExtractor(
    config=TableFormerConfig(mode="fast", device="cuda")
)

# Extract table structure
result = extractor.extract(table_image)

# Get HTML output
html = result.to_html()

# Get DataFrame
df = result.to_dataframe()

Initialize TableFormer extractor.

PARAMETER DESCRIPTION
config

TableFormerConfig with model settings

TYPE: TableFormerConfig

Source code in omnidocs/tasks/table_extraction/tableformer/pytorch.py
def __init__(self, config: TableFormerConfig):
    """
    Initialize TableFormer extractor.

    Args:
        config: TableFormerConfig with model settings
    """
    self.config = config
    self._device = _resolve_device(config.device)
    self._predictor = None
    self._model_config: Optional[Dict] = None
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    ocr_output: Optional[OCROutput] = None,
) -> TableOutput

Extract table structure from an image.

PARAMETER DESCRIPTION
image

Table image (should be cropped to table region)

TYPE: Union[Image, ndarray, str, Path]

ocr_output

Optional OCR results for cell text matching

TYPE: Optional[OCROutput] DEFAULT: None

RETURNS DESCRIPTION
TableOutput

TableOutput with cells, structure, and export methods

Example
result = extractor.extract(table_image)
print(f"Table: {result.num_rows}x{result.num_cols}")
html = result.to_html()
Source code in omnidocs/tasks/table_extraction/tableformer/pytorch.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    ocr_output: Optional["OCROutput"] = None,
) -> TableOutput:
    """
    Extract table structure from an image.

    Args:
        image: Table image (should be cropped to table region)
        ocr_output: Optional OCR results for cell text matching

    Returns:
        TableOutput with cells, structure, and export methods

    Example:
        ```python
        result = extractor.extract(table_image)
        print(f"Table: {result.num_rows}x{result.num_cols}")
        html = result.to_html()
        ```
    """
    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Convert to OpenCV format (required by TFPredictor)
    try:
        import cv2
    except ImportError:
        raise ImportError(
            "opencv-python is required for TableFormerExtractor. Install with: pip install opencv-python-headless"
        )

    cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

    # Build iOCR page data
    tokens = self._build_tokens_from_ocr(ocr_output) if ocr_output else []
    iocr_page = {
        "width": width,
        "height": height,
        "image": cv_image,
        "tokens": tokens,
    }

    # Table bbox is the entire image
    table_bbox = [0, 0, width, height]

    # Run prediction
    results = self._predictor.multi_table_predict(
        iocr_page=iocr_page,
        table_bboxes=[table_bbox],
        do_matching=self.config.do_cell_matching,
        correct_overlapping_cells=self.config.correct_overlapping_cells,
        sort_row_col_indexes=self.config.sort_row_col_indexes,
    )

    # Convert results to TableOutput
    return self._convert_results(results, width, height)
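A minimal sketch of pairing the extractor with an OCR engine so that predicted cells are matched against OCR tokens; it reuses the EasyOCR components shown earlier in this document and assumes table_image is already cropped to the table region:

from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

ocr = EasyOCR(config=EasyOCRConfig())
extractor = TableFormerExtractor(
    config=TableFormerConfig(mode="accurate", do_cell_matching=True)
)

ocr_result = ocr.extract(table_image)
result = extractor.extract(table_image, ocr_output=ocr_result)
print(result.to_markdown())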

TableFormerMode

Bases: str, Enum

TableFormer inference mode.

base

Base class for table extractors.

Defines the abstract interface that all table extractors must implement.

BaseTableExtractor

Bases: ABC

Abstract base class for table structure extractors.

Table extractors analyze table images to detect cell structure, identify headers, and extract text content.

Example
class MyTableExtractor(BaseTableExtractor):
    def __init__(self, config: MyConfig):
        self.config = config
        self._load_model()

    def _load_model(self):
        # Load model weights
        pass

    def extract(self, image):
        # Run extraction
        return TableOutput(...)
extract abstractmethod
extract(
    image: Union[Image, ndarray, str, Path],
    ocr_output: Optional[OCROutput] = None,
) -> TableOutput

Extract table structure from an image.

PARAMETER DESCRIPTION
image

Table image (should be cropped to table region)

TYPE: Union[Image, ndarray, str, Path]

ocr_output

Optional OCR results for cell text matching. If not provided, the model will attempt to extract text.

TYPE: Optional[OCROutput] DEFAULT: None

RETURNS DESCRIPTION
TableOutput

TableOutput with cells, structure, and export methods

Example
# Without OCR (model extracts text)
result = extractor.extract(table_image)

# With OCR (better text quality)
ocr = some_ocr.extract(table_image)
result = extractor.extract(table_image, ocr_output=ocr)
Source code in omnidocs/tasks/table_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    ocr_output: Optional["OCROutput"] = None,
) -> TableOutput:
    """
    Extract table structure from an image.

    Args:
        image: Table image (should be cropped to table region)
        ocr_output: Optional OCR results for cell text matching.
                   If not provided, model will attempt to extract text.

    Returns:
        TableOutput with cells, structure, and export methods

    Example:
        ```python
        # Without OCR (model extracts text)
        result = extractor.extract(table_image)

        # With OCR (better text quality)
        ocr = some_ocr.extract(table_image)
        result = extractor.extract(table_image, ocr_output=ocr)
        ```
    """
    pass
batch_extract
batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    ocr_outputs: Optional[List[OCROutput]] = None,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TableOutput]

Extract tables from multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching.

PARAMETER DESCRIPTION
images

List of table images

TYPE: List[Union[Image, ndarray, str, Path]]

ocr_outputs

Optional list of OCR results (same length as images)

TYPE: Optional[List[OCROutput]] DEFAULT: None

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TableOutput]

List of TableOutput in same order as input

Examples:

results = extractor.batch_extract(table_images)
Source code in omnidocs/tasks/table_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    ocr_outputs: Optional[List["OCROutput"]] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TableOutput]:
    """
    Extract tables from multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching.

    Args:
        images: List of table images
        ocr_outputs: Optional list of OCR results (same length as images)
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TableOutput in same order as input

    Examples:
        ```python
        results = extractor.batch_extract(table_images)
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        ocr = ocr_outputs[i] if ocr_outputs else None
        result = self.extract(image, ocr_output=ocr)
        results.append(result)

    return results
extract_document
extract_document(
    document: Document,
    table_bboxes: Optional[List[List[float]]] = None,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TableOutput]

Extract tables from all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

table_bboxes

Optional list of table bounding boxes per page. Each element should be a list of [x1, y1, x2, y2] coords.

TYPE: Optional[List[List[float]]] DEFAULT: None

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TableOutput]

List of TableOutput, one per detected table

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
Source code in omnidocs/tasks/table_extraction/base.py
def extract_document(
    self,
    document: "Document",
    table_bboxes: Optional[List[List[float]]] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TableOutput]:
    """
    Extract tables from all pages of a document.

    Args:
        document: Document instance
        table_bboxes: Optional list of table bounding boxes per page.
                     Each element should be a list of [x1, y1, x2, y2] coords.
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TableOutput, one per detected table

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc)
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        # If no bboxes provided, process entire page
        if table_bboxes is None:
            result = self.extract(page)
            results.append(result)
        else:
            # Crop and process each table region
            for bbox in table_bboxes:
                x1, y1, x2, y2 = bbox
                table_region = page.crop((x1, y1, x2, y2))
                result = self.extract(table_region)
                results.append(result)

    return results

models

Pydantic models for table extraction outputs.

Provides structured table data with cells, spans, and multiple export formats including HTML, Markdown, and Pandas DataFrame conversion.

Example
result = extractor.extract(table_image)

# Get HTML
html = result.to_html()

# Get Pandas DataFrame
df = result.to_dataframe()

# Access cells
for cell in result.cells:
    print(f"[{cell.row},{cell.col}] {cell.text}")

CellType

Bases: str, Enum

Type of table cell.

BoundingBox

Bases: BaseModel

Bounding box in pixel coordinates.

width property
width: float

Width of the bounding box.

height property
height: float

Height of the bounding box.

area property
area: float

Area of the bounding box.

center property
center: Tuple[float, float]

Center point of the bounding box.

to_list
to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/table_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]
to_xyxy
to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/table_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)
from_list classmethod
from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])
from_ltrb classmethod
from_ltrb(
    left: float, top: float, right: float, bottom: float
) -> BoundingBox

Create from left, top, right, bottom coordinates.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def from_ltrb(cls, left: float, top: float, right: float, bottom: float) -> "BoundingBox":
    """Create from left, top, right, bottom coordinates."""
    return cls(x1=left, y1=top, x2=right, y2=bottom)
to_normalized
to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/table_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

TableCell

Bases: BaseModel

Single table cell with position, span, and content.

The cell position uses 0-indexed row/column indices. Spans indicate how many rows/columns the cell occupies.

end_row property
end_row: int

Ending row index (exclusive).

end_col property
end_col: int

Ending column index (exclusive).

is_header property
is_header: bool

Check if cell is any type of header.

to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/table_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "row": self.row,
        "col": self.col,
        "row_span": self.row_span,
        "col_span": self.col_span,
        "text": self.text,
        "cell_type": self.cell_type.value,
        "bbox": self.bbox.to_list() if self.bbox else None,
        "confidence": self.confidence,
    }

TableOutput

Bases: BaseModel

Complete table extraction result.

Provides multiple export formats and utility methods for working with extracted table data.

Example
result = extractor.extract(table_image)

# Basic info
print(f"Table: {result.num_rows}x{result.num_cols}")

# Export to HTML
html = result.to_html()

# Export to Pandas
df = result.to_dataframe()

# Export to Markdown
md = result.to_markdown()

# Access specific cell
cell = result.get_cell(row=0, col=0)
cell_count property
cell_count: int

Number of cells in the table.

has_headers property
has_headers: bool

Check if table has header cells.

get_cell
get_cell(row: int, col: int) -> Optional[TableCell]

Get cell at specific position.

Handles merged cells by returning the cell that covers the position.

Source code in omnidocs/tasks/table_extraction/models.py
def get_cell(self, row: int, col: int) -> Optional[TableCell]:
    """
    Get cell at specific position.

    Handles merged cells by returning the cell that covers the position.
    """
    for cell in self.cells:
        if cell.row <= row < cell.end_row and cell.col <= col < cell.end_col:
            return cell
    return None
get_row
get_row(row: int) -> List[TableCell]

Get all cells in a specific row.

Source code in omnidocs/tasks/table_extraction/models.py
def get_row(self, row: int) -> List[TableCell]:
    """Get all cells in a specific row."""
    return [c for c in self.cells if c.row == row]
get_column
get_column(col: int) -> List[TableCell]

Get all cells in a specific column.

Source code in omnidocs/tasks/table_extraction/models.py
def get_column(self, col: int) -> List[TableCell]:
    """Get all cells in a specific column."""
    return [c for c in self.cells if c.col == col]
to_html
to_html(include_styles: bool = True) -> str

Convert table to HTML string.

PARAMETER DESCRIPTION
include_styles

Whether to include basic CSS styling

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
str

HTML table string

Example
html = result.to_html()
with open("table.html", "w") as f:
    f.write(html)
Source code in omnidocs/tasks/table_extraction/models.py
def to_html(self, include_styles: bool = True) -> str:
    """
    Convert table to HTML string.

    Args:
        include_styles: Whether to include basic CSS styling

    Returns:
        HTML table string

    Example:
        ```python
        html = result.to_html()
        with open("table.html", "w") as f:
            f.write(html)
        ```
    """
    # Build 2D grid accounting for spans
    grid: List[List[Optional[TableCell]]] = [[None for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        for r in range(cell.row, cell.end_row):
            for c in range(cell.col, cell.end_col):
                if r < self.num_rows and c < self.num_cols:
                    grid[r][c] = cell

    # Generate HTML
    lines = []

    if include_styles:
        lines.append('<table style="border-collapse: collapse; width: 100%;">')
    else:
        lines.append("<table>")

    processed: set[Tuple[int, int]] = set()  # Track cells we've already output

    for row_idx in range(self.num_rows):
        lines.append("  <tr>")

        for col_idx in range(self.num_cols):
            cell = grid[row_idx][col_idx]

            if cell is None:
                lines.append("    <td></td>")
                continue

            # Skip if this cell was already output (merged cell)
            cell_id = (cell.row, cell.col)
            if cell_id in processed:
                continue
            processed.add(cell_id)

            # Determine tag based on cell type
            tag = "th" if cell.is_header else "td"

            # Build attributes
            attrs = []
            if cell.row_span > 1:
                attrs.append(f'rowspan="{cell.row_span}"')
            if cell.col_span > 1:
                attrs.append(f'colspan="{cell.col_span}"')
            if include_styles:
                attrs.append('style="border: 1px solid #ddd; padding: 8px;"')

            attr_str = " " + " ".join(attrs) if attrs else ""

            # Escape HTML in text
            text = (cell.text or "").replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

            lines.append(f"    <{tag}{attr_str}>{text}</{tag}>")

        lines.append("  </tr>")

    lines.append("</table>")

    return "\n".join(lines)
to_dataframe
to_dataframe()

Convert table to Pandas DataFrame.

RETURNS DESCRIPTION

pandas.DataFrame with table data

RAISES DESCRIPTION
ImportError

If pandas is not installed

Example
df = result.to_dataframe()
print(df.head())
df.to_csv("table.csv")
Source code in omnidocs/tasks/table_extraction/models.py
def to_dataframe(self):
    """
    Convert table to Pandas DataFrame.

    Returns:
        pandas.DataFrame with table data

    Raises:
        ImportError: If pandas is not installed

    Example:
        ```python
        df = result.to_dataframe()
        print(df.head())
        df.to_csv("table.csv")
        ```
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas is required for to_dataframe(). Install with: pip install pandas")

    # Build 2D array
    data: List[List[Optional[str]]] = [[None for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        # For merged cells, put value in top-left position
        if cell.row < self.num_rows and cell.col < self.num_cols:
            data[cell.row][cell.col] = cell.text

    # Determine if first row is header
    first_row_cells = self.get_row(0)
    use_header = all(c.cell_type == CellType.COLUMN_HEADER for c in first_row_cells) if first_row_cells else False

    if use_header and self.num_rows > 1:
        headers = data[0]
        data = data[1:]
        return pd.DataFrame(data, columns=headers)
    else:
        return pd.DataFrame(data)
to_markdown
to_markdown() -> str

Convert table to Markdown format.

Note: Markdown tables don't support merged cells, so spans are ignored and only the top-left cell value is used.

RETURNS DESCRIPTION
str

Markdown table string

Source code in omnidocs/tasks/table_extraction/models.py
def to_markdown(self) -> str:
    """
    Convert table to Markdown format.

    Note: Markdown tables don't support merged cells, so spans
    are ignored and only the top-left cell value is used.

    Returns:
        Markdown table string
    """
    if self.num_rows == 0 or self.num_cols == 0:
        return ""

    # Build 2D grid
    grid: List[List[str]] = [["" for _ in range(self.num_cols)] for _ in range(self.num_rows)]

    for cell in self.cells:
        if cell.row < self.num_rows and cell.col < self.num_cols:
            grid[cell.row][cell.col] = cell.text or ""

    lines = []

    # Header row
    lines.append("| " + " | ".join(grid[0]) + " |")

    # Separator
    lines.append("| " + " | ".join(["---"] * self.num_cols) + " |")

    # Data rows
    for row in grid[1:]:
        lines.append("| " + " | ".join(row) + " |")

    return "\n".join(lines)
to_dict
to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/table_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "cells": [c.to_dict() for c in self.cells],
        "num_rows": self.num_rows,
        "num_cols": self.num_cols,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "html": self.to_html(include_styles=False),
    }
save_json
save_json(file_path: Union[str, Path]) -> None

Save to JSON file.

Source code in omnidocs/tasks/table_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """Save to JSON file."""
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
load_json classmethod
load_json(file_path: Union[str, Path]) -> TableOutput

Load from JSON file.

Source code in omnidocs/tasks/table_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "TableOutput":
    """Load from JSON file."""
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))
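
save_json() and load_json() round-trip the full TableOutput; a minimal sketch (the file path is illustrative; TableOutput is imported from the models module shown above):

from omnidocs.tasks.table_extraction.models import TableOutput

result.save_json("outputs/table_01.json")
restored = TableOutput.load_json("outputs/table_01.json")
assert restored.num_rows == result.num_rows and restored.num_cols == result.num_cols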

tableformer

TableFormer module for table structure extraction.

Provides the TableFormer-based table structure extractor.

TableFormerConfig

Bases: BaseModel

Configuration for TableFormer table structure extractor.

TableFormer is a transformer-based model that predicts table structure using OTSL (Optimized Table Structure Language) tags and cell bounding boxes.

ATTRIBUTE DESCRIPTION
mode

Inference mode - "fast" or "accurate"

TYPE: TableFormerMode

device

Device for inference - "cpu", "cuda", "mps", or "auto"

TYPE: Literal['cpu', 'cuda', 'mps', 'auto']

num_threads

Number of CPU threads for inference

TYPE: int

do_cell_matching

Whether to match predicted cells with OCR text cells

TYPE: bool

artifacts_path

Path to pre-downloaded model artifacts

TYPE: Optional[str]

repo_id

HuggingFace model repository

TYPE: str

revision

Model revision/tag

TYPE: str

Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Fast mode
extractor = TableFormerExtractor(config=TableFormerConfig(mode="fast"))

# Accurate mode with GPU
extractor = TableFormerExtractor(
    config=TableFormerConfig(
        mode="accurate",
        device="cuda",
        do_cell_matching=True,
    )
)

TableFormerMode

Bases: str, Enum

TableFormer inference mode.

TableFormerExtractor

TableFormerExtractor(config: TableFormerConfig)

Bases: BaseTableExtractor

Table structure extractor using TableFormer model.

TableFormer is a transformer-based model that predicts table structure using OTSL (Optimized Table Structure Language) tags. It can detect: - Cell boundaries (bounding boxes) - Row and column spans - Header cells (column and row headers) - Section rows

Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Initialize extractor
extractor = TableFormerExtractor(
    config=TableFormerConfig(mode="fast", device="cuda")
)

# Extract table structure
result = extractor.extract(table_image)

# Get HTML output
html = result.to_html()

# Get DataFrame
df = result.to_dataframe()

Initialize TableFormer extractor.

PARAMETER DESCRIPTION
config

TableFormerConfig with model settings

TYPE: TableFormerConfig

Source code in omnidocs/tasks/table_extraction/tableformer/pytorch.py
def __init__(self, config: TableFormerConfig):
    """
    Initialize TableFormer extractor.

    Args:
        config: TableFormerConfig with model settings
    """
    self.config = config
    self._device = _resolve_device(config.device)
    self._predictor = None
    self._model_config: Optional[Dict] = None
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    ocr_output: Optional[OCROutput] = None,
) -> TableOutput

Extract table structure from an image.

PARAMETER DESCRIPTION
image

Table image (should be cropped to table region)

TYPE: Union[Image, ndarray, str, Path]

ocr_output

Optional OCR results for cell text matching

TYPE: Optional[OCROutput] DEFAULT: None

RETURNS DESCRIPTION
TableOutput

TableOutput with cells, structure, and export methods

Example
result = extractor.extract(table_image)
print(f"Table: {result.num_rows}x{result.num_cols}")
html = result.to_html()
Source code in omnidocs/tasks/table_extraction/tableformer/pytorch.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    ocr_output: Optional["OCROutput"] = None,
) -> TableOutput:
    """
    Extract table structure from an image.

    Args:
        image: Table image (should be cropped to table region)
        ocr_output: Optional OCR results for cell text matching

    Returns:
        TableOutput with cells, structure, and export methods

    Example:
        ```python
        result = extractor.extract(table_image)
        print(f"Table: {result.num_rows}x{result.num_cols}")
        html = result.to_html()
        ```
    """
    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Convert to OpenCV format (required by TFPredictor)
    try:
        import cv2
    except ImportError:
        raise ImportError(
            "opencv-python is required for TableFormerExtractor. Install with: pip install opencv-python-headless"
        )

    cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

    # Build iOCR page data
    tokens = self._build_tokens_from_ocr(ocr_output) if ocr_output else []
    iocr_page = {
        "width": width,
        "height": height,
        "image": cv_image,
        "tokens": tokens,
    }

    # Table bbox is the entire image
    table_bbox = [0, 0, width, height]

    # Run prediction
    results = self._predictor.multi_table_predict(
        iocr_page=iocr_page,
        table_bboxes=[table_bbox],
        do_matching=self.config.do_cell_matching,
        correct_overlapping_cells=self.config.correct_overlapping_cells,
        sort_row_col_indexes=self.config.sort_row_col_indexes,
    )

    # Convert results to TableOutput
    return self._convert_results(results, width, height)
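
When OCR results are available, passing them via ocr_output lets TableFormer fill cell text through do_cell_matching; a sketch under the assumption that an OCR extractor from the ocr_extraction module has already produced an OCROutput (the OCR step itself is not shown here):

# ocr_result: OCROutput from any omnidocs OCR extractor (step omitted / assumed)
extractor = TableFormerExtractor(
    config=TableFormerConfig(mode="accurate", device="cuda", do_cell_matching=True)
)
result = extractor.extract(table_image, ocr_output=ocr_result)
print(result.to_html())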

config

Configuration for TableFormer table structure extractor.

TableFormer uses a dual-decoder transformer architecture with OTSL+ support for recognizing table structure from images.

Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Fast mode (default)
extractor = TableFormerExtractor(config=TableFormerConfig())

# Accurate mode with GPU
extractor = TableFormerExtractor(
    config=TableFormerConfig(
        mode="accurate",
        device="cuda",
        do_cell_matching=True,
    )
)
TableFormerMode

Bases: str, Enum

TableFormer inference mode.

TableFormerConfig

Bases: BaseModel

Configuration for TableFormer table structure extractor.

TableFormer is a transformer-based model that predicts table structure using OTSL (Optimized Table Structure Language) tags and cell bounding boxes.

ATTRIBUTE DESCRIPTION
mode

Inference mode - "fast" or "accurate"

TYPE: TableFormerMode

device

Device for inference - "cpu", "cuda", "mps", or "auto"

TYPE: Literal['cpu', 'cuda', 'mps', 'auto']

num_threads

Number of CPU threads for inference

TYPE: int

do_cell_matching

Whether to match predicted cells with OCR text cells

TYPE: bool

artifacts_path

Path to pre-downloaded model artifacts

TYPE: Optional[str]

repo_id

HuggingFace model repository

TYPE: str

revision

Model revision/tag

TYPE: str

Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Fast mode
extractor = TableFormerExtractor(config=TableFormerConfig(mode="fast"))

# Accurate mode with GPU
extractor = TableFormerExtractor(
    config=TableFormerConfig(
        mode="accurate",
        device="cuda",
        do_cell_matching=True,
    )
)

pytorch

TableFormer extractor implementation using PyTorch backend.

Uses the TFPredictor from docling-ibm-models for table structure recognition.

TableFormerExtractor
TableFormerExtractor(config: TableFormerConfig)

Bases: BaseTableExtractor

Table structure extractor using TableFormer model.

TableFormer is a transformer-based model that predicts table structure using OTSL (Optimized Table Structure Language) tags. It can detect: - Cell boundaries (bounding boxes) - Row and column spans - Header cells (column and row headers) - Section rows

Example
from omnidocs.tasks.table_extraction import TableFormerExtractor, TableFormerConfig

# Initialize extractor
extractor = TableFormerExtractor(
    config=TableFormerConfig(mode="fast", device="cuda")
)

# Extract table structure
result = extractor.extract(table_image)

# Get HTML output
html = result.to_html()

# Get DataFrame
df = result.to_dataframe()

Initialize TableFormer extractor.

PARAMETER DESCRIPTION
config

TableFormerConfig with model settings

TYPE: TableFormerConfig

Source code in omnidocs/tasks/table_extraction/tableformer/pytorch.py
def __init__(self, config: TableFormerConfig):
    """
    Initialize TableFormer extractor.

    Args:
        config: TableFormerConfig with model settings
    """
    self.config = config
    self._device = _resolve_device(config.device)
    self._predictor = None
    self._model_config: Optional[Dict] = None
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    ocr_output: Optional[OCROutput] = None,
) -> TableOutput

Extract table structure from an image.

PARAMETER DESCRIPTION
image

Table image (should be cropped to table region)

TYPE: Union[Image, ndarray, str, Path]

ocr_output

Optional OCR results for cell text matching

TYPE: Optional[OCROutput] DEFAULT: None

RETURNS DESCRIPTION
TableOutput

TableOutput with cells, structure, and export methods

Example
result = extractor.extract(table_image)
print(f"Table: {result.num_rows}x{result.num_cols}")
html = result.to_html()
Source code in omnidocs/tasks/table_extraction/tableformer/pytorch.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    ocr_output: Optional["OCROutput"] = None,
) -> TableOutput:
    """
    Extract table structure from an image.

    Args:
        image: Table image (should be cropped to table region)
        ocr_output: Optional OCR results for cell text matching

    Returns:
        TableOutput with cells, structure, and export methods

    Example:
        ```python
        result = extractor.extract(table_image)
        print(f"Table: {result.num_rows}x{result.num_cols}")
        html = result.to_html()
        ```
    """
    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Convert to OpenCV format (required by TFPredictor)
    try:
        import cv2
    except ImportError:
        raise ImportError(
            "opencv-python is required for TableFormerExtractor. Install with: pip install opencv-python-headless"
        )

    cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

    # Build iOCR page data
    tokens = self._build_tokens_from_ocr(ocr_output) if ocr_output else []
    iocr_page = {
        "width": width,
        "height": height,
        "image": cv_image,
        "tokens": tokens,
    }

    # Table bbox is the entire image
    table_bbox = [0, 0, width, height]

    # Run prediction
    results = self._predictor.multi_table_predict(
        iocr_page=iocr_page,
        table_bboxes=[table_bbox],
        do_matching=self.config.do_cell_matching,
        correct_overlapping_cells=self.config.correct_overlapping_cells,
        sort_row_col_indexes=self.config.sort_row_col_indexes,
    )

    # Convert results to TableOutput
    return self._convert_results(results, width, height)

text_extraction

Text Extraction Module.

Provides extractors for converting document images to structured text formats (HTML, Markdown, JSON). Uses Vision-Language Models for accurate text extraction with formatting preservation and optional layout detection.

Available Extractors
  • QwenTextExtractor: Qwen3-VL based extractor (multi-backend)
  • DotsOCRTextExtractor: Dots OCR with layout-aware extraction (PyTorch/VLLM/API)
  • NanonetsTextExtractor: Nanonets OCR2-3B for text extraction (PyTorch/VLLM)
  • GraniteDoclingTextExtractor: IBM Granite Docling for document conversion (multi-backend)
  • MinerUVLTextExtractor: MinerU VL 1.2B with layout-aware two-step extraction (multi-backend)
Example
from omnidocs.tasks.text_extraction import QwenTextExtractor
from omnidocs.tasks.text_extraction.qwen import QwenTextPyTorchConfig

extractor = QwenTextExtractor(
        backend=QwenTextPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )
result = extractor.extract(image, output_format="markdown")
print(result.content)

BaseTextExtractor

Bases: ABC

Abstract base class for text extractors.

All text extraction models must inherit from this class and implement the required methods.

Example
class MyTextExtractor(BaseTextExtractor):
        def __init__(self, config: MyConfig):
            self.config = config
            self._load_model()

        def _load_model(self):
            # Load model weights
            pass

        def extract(self, image, output_format="markdown"):
            # Run extraction
            return TextOutput(...)

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Desired output format: - "html": Structured HTML - "markdown": Markdown format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
ValueError

If image format or output_format is not supported

RuntimeError

If model is not loaded or inference fails

Source code in omnidocs/tasks/text_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Desired output format:
            - "html": Structured HTML
            - "markdown": Markdown format

    Returns:
        TextOutput containing extracted text content

    Raises:
        ValueError: If image format or output_format is not supported
        RuntimeError: If model is not loaded or inference fails
    """
    pass

batch_extract

batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TextOutput]

Extract text from multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching (e.g., VLLM).

PARAMETER DESCRIPTION
images

List of images in any supported format

TYPE: List[Union[Image, ndarray, str, Path]]

output_format

Desired output format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TextOutput]

List of TextOutput in same order as input

Examples:

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images, output_format="markdown")
Source code in omnidocs/tasks/text_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TextOutput]:
    """
    Extract text from multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching (e.g., VLLM).

    Args:
        images: List of images in any supported format
        output_format: Desired output format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TextOutput in same order as input

    Examples:
        ```python
        images = [doc.get_page(i) for i in range(doc.page_count)]
        results = extractor.batch_extract(images, output_format="markdown")
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(image, output_format=output_format)
        results.append(result)

    return results

extract_document

extract_document(
    document: Document,
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TextOutput]

Extract text from all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

output_format

Desired output format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TextOutput]

List of TextOutput, one per page

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc, output_format="markdown")
Source code in omnidocs/tasks/text_extraction/base.py
def extract_document(
    self,
    document: "Document",
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TextOutput]:
    """
    Extract text from all pages of a document.

    Args:
        document: Document instance
        output_format: Desired output format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TextOutput, one per page

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc, output_format="markdown")
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(page, output_format=output_format)
        results.append(result)

    return results
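
Both batch_extract and extract_document accept the same progress_callback(current, total) hook; a minimal sketch, assuming extractor is any initialized text extractor and Document is imported from the omnidocs document API as in the example above (import path assumed):

def on_progress(current: int, total: int) -> None:
    print(f"page {current}/{total}")

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(
    doc, output_format="markdown", progress_callback=on_progress
)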

DotsOCRTextExtractor

DotsOCRTextExtractor(backend: DotsOCRBackendConfig)

Bases: BaseTextExtractor

Dots OCR Vision-Language Model text extractor with layout detection.

Extracts text from document images with layout information including: - 11 layout categories (Caption, Footnote, Formula, List-item, etc.) - Bounding boxes (normalized to 0-1024) - Multi-format text (Markdown, LaTeX, HTML) - Reading order preservation

Supports PyTorch, VLLM, and API backends.

Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
from omnidocs.tasks.text_extraction.dotsocr import DotsOCRPyTorchConfig

# Initialize with PyTorch backend
extractor = DotsOCRTextExtractor(
        backend=DotsOCRPyTorchConfig(model="rednote-hilab/dots.ocr")
    )

# Extract with layout
result = extractor.extract(image, include_layout=True)
print(f"Found {result.num_layout_elements} elements")
print(result.content)

Initialize Dots OCR text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - DotsOCRPyTorchConfig: PyTorch/HuggingFace backend - DotsOCRVLLMConfig: VLLM high-throughput backend - DotsOCRAPIConfig: API backend (online VLLM server)

TYPE: DotsOCRBackendConfig

Source code in omnidocs/tasks/text_extraction/dotsocr/extractor.py
def __init__(self, backend: DotsOCRBackendConfig):
    """
    Initialize Dots OCR text extractor.

    Args:
        backend: Backend configuration. One of:
            - DotsOCRPyTorchConfig: PyTorch/HuggingFace backend
            - DotsOCRVLLMConfig: VLLM high-throughput backend
            - DotsOCRAPIConfig: API backend (online VLLM server)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._model: Any = None
    self._loaded = False

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal[
        "markdown", "html", "json"
    ] = "markdown",
    include_layout: bool = False,
    custom_prompt: Optional[str] = None,
    max_tokens: int = 8192,
) -> DotsOCRTextOutput

Extract text from image using Dots OCR.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format ("markdown", "html", or "json")

TYPE: Literal['markdown', 'html', 'json'] DEFAULT: 'markdown'

include_layout

Include layout bounding boxes in output

TYPE: bool DEFAULT: False

custom_prompt

Override default extraction prompt

TYPE: Optional[str] DEFAULT: None

max_tokens

Maximum tokens for generation

TYPE: int DEFAULT: 8192

RETURNS DESCRIPTION
DotsOCRTextOutput

DotsOCRTextOutput with extracted content and optional layout

RAISES DESCRIPTION
RuntimeError

If model is not loaded or inference fails

Source code in omnidocs/tasks/text_extraction/dotsocr/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["markdown", "html", "json"] = "markdown",
    include_layout: bool = False,
    custom_prompt: Optional[str] = None,
    max_tokens: int = 8192,
) -> DotsOCRTextOutput:
    """
    Extract text from image using Dots OCR.

    Args:
        image: Input image (PIL Image, numpy array, or file path)
        output_format: Output format ("markdown", "html", or "json")
        include_layout: Include layout bounding boxes in output
        custom_prompt: Override default extraction prompt
        max_tokens: Maximum tokens for generation

    Returns:
        DotsOCRTextOutput with extracted content and optional layout

    Raises:
        RuntimeError: If model is not loaded or inference fails
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    img = self._prepare_image(image)

    # Get prompt
    prompt = custom_prompt or DOTS_OCR_PROMPT

    # Run inference based on backend
    config_type = type(self.backend_config).__name__

    if config_type == "DotsOCRPyTorchConfig":
        raw_output = self._infer_pytorch(img, prompt, max_tokens)
    elif config_type == "DotsOCRVLLMConfig":
        raw_output = self._infer_vllm(img, prompt, max_tokens)
    elif config_type == "DotsOCRAPIConfig":
        raw_output = self._infer_api(img, prompt, max_tokens)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Parse output
    return self._parse_output(
        raw_output,
        img.size,
        output_format,
        include_layout,
    )
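
custom_prompt and max_tokens tune a single call without reconfiguring the backend; a brief sketch, assuming extractor is a DotsOCRTextExtractor initialized as above (the prompt text is illustrative):

result = extractor.extract(
    image,
    output_format="markdown",
    custom_prompt="Extract only the body text; skip headers and footers.",  # illustrative prompt
    max_tokens=4096,
)
print(result.content)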

GraniteDoclingTextExtractor

GraniteDoclingTextExtractor(
    backend: GraniteDoclingTextBackendConfig,
)

Bases: BaseTextExtractor

Granite Docling text extractor supporting PyTorch, VLLM, MLX, and API backends.

Granite Docling is IBM's compact vision-language model optimized for document conversion. It outputs DocTags format which is converted to Markdown using the docling_core library.

Example

from omnidocs.tasks.text_extraction.granitedocling import (
    GraniteDoclingTextExtractor,
    GraniteDoclingTextPyTorchConfig,
)

config = GraniteDoclingTextPyTorchConfig(device="cuda")
extractor = GraniteDoclingTextExtractor(backend=config)
result = extractor.extract(image, output_format="markdown")
print(result.content)

Initialize Granite Docling extractor with backend configuration.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API config)

TYPE: GraniteDoclingTextBackendConfig

Source code in omnidocs/tasks/text_extraction/granitedocling/extractor.py
def __init__(self, backend: GraniteDoclingTextBackendConfig):
    """
    Initialize Granite Docling extractor with backend configuration.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API config)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded: bool = False

    # Backend-specific helpers
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None
    self._sampling_params_class: Any = None
    self._device: str = "cpu"

    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image using Granite Docling.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format ("markdown" or "html")

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput with extracted content

Source code in omnidocs/tasks/text_extraction/granitedocling/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image using Granite Docling.

    Args:
        image: Input image (PIL Image, numpy array, or file path)
        output_format: Output format ("markdown" or "html")

    Returns:
        TextOutput with extracted content
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded")

    if output_format not in ("html", "markdown"):
        raise ValueError(f"Invalid output_format: {output_format}")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Dispatch to backend-specific inference
    config_type = type(self.backend_config).__name__

    if config_type == "GraniteDoclingTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image)
    elif config_type == "GraniteDoclingTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image)
    elif config_type == "GraniteDoclingTextMLXConfig":
        raw_output = self._infer_mlx(pil_image)
    elif config_type == "GraniteDoclingTextAPIConfig":
        raw_output = self._infer_api(pil_image)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Convert DocTags to Markdown
    markdown_output = self._convert_doctags_to_markdown(raw_output, pil_image)

    # For HTML output, wrap in basic HTML structure
    if output_format == "html":
        content = f"<html><body>\n{markdown_output}\n</body></html>"
    else:
        content = markdown_output

    return TextOutput(
        content=content,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=self._extract_plain_text(markdown_output),
        image_width=width,
        image_height=height,
        model_name=f"Granite-Docling-258M ({config_type.replace('Config', '')})",
    )
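
As the source above shows, HTML output is just the converted Markdown wrapped in a minimal <html><body> shell, so switching formats is a one-argument change; a brief sketch, assuming extractor is a GraniteDoclingTextExtractor:

md_result = extractor.extract(image, output_format="markdown")
html_result = extractor.extract(image, output_format="html")

print(md_result.content[:200])
print(html_result.content.startswith("<html><body>"))  # True, per the wrapping above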

MinerUVLTextExtractor

MinerUVLTextExtractor(backend: MinerUVLTextBackendConfig)

Bases: BaseTextExtractor

MinerU VL text extractor with layout-aware extraction.

Performs two-step extraction: 1. Layout detection (detect regions) 2. Content recognition (extract text/table/equation from each region)

Supports multiple backends: - PyTorch (HuggingFace Transformers) - VLLM (high-throughput GPU) - MLX (Apple Silicon) - API (VLLM OpenAI-compatible server)

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextPyTorchConfig

extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextPyTorchConfig(device="cuda")
)
result = extractor.extract(image)

print(result.content)  # Combined text + tables + equations
_, blocks = extractor.extract_with_blocks(image)   # List of ContentBlock objects

Initialize MinerU VL text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API)

TYPE: MinerUVLTextBackendConfig

Source code in omnidocs/tasks/text_extraction/mineruvl/extractor.py
def __init__(self, backend: MinerUVLTextBackendConfig):
    """
    Initialize MinerU VL text extractor.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API)
    """
    self.backend_config = backend
    self._client = None
    self._loaded = False
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text with layout-aware two-step extraction.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format ('html' or 'markdown')

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput with extracted content and metadata

Source code in omnidocs/tasks/text_extraction/mineruvl/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text with layout-aware two-step extraction.

    Args:
        image: Input image (PIL Image, numpy array, or file path)
        output_format: Output format ('html' or 'markdown')

    Returns:
        TextOutput with extracted content and metadata
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Step 1: Layout detection
    blocks = self._detect_layout(pil_image)

    # Step 2: Content extraction for each block
    blocks = self._extract_content(pil_image, blocks)

    # Post-process (OTSL to HTML for tables)
    blocks = simple_post_process(blocks)

    # Combine content
    content = self._combine_content(blocks, output_format)

    # Build raw output with blocks info
    raw_output = self._build_raw_output(blocks)

    return TextOutput(
        content=content,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        image_width=width,
        image_height=height,
        model_name="MinerU2.5-2509-1.2B",
    )

extract_with_blocks

extract_with_blocks(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> tuple[TextOutput, List[ContentBlock]]

Extract text and return both TextOutput and ContentBlocks.

This method provides access to the detailed block information including bounding boxes and block types.

PARAMETER DESCRIPTION
image

Input image

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
tuple[TextOutput, List[ContentBlock]]

Tuple of (TextOutput, List[ContentBlock])

Source code in omnidocs/tasks/text_extraction/mineruvl/extractor.py
def extract_with_blocks(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> tuple[TextOutput, List[ContentBlock]]:
    """
    Extract text and return both TextOutput and ContentBlocks.

    This method provides access to the detailed block information
    including bounding boxes and block types.

    Args:
        image: Input image
        output_format: Output format

    Returns:
        Tuple of (TextOutput, List[ContentBlock])
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Two-step extraction
    blocks = self._detect_layout(pil_image)
    blocks = self._extract_content(pil_image, blocks)
    blocks = simple_post_process(blocks)

    content = self._combine_content(blocks, output_format)
    raw_output = self._build_raw_output(blocks)

    text_output = TextOutput(
        content=content,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        image_width=width,
        image_height=height,
        model_name="MinerU2.5-2509-1.2B",
    )

    return text_output, blocks
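
extract_with_blocks is the entry point when per-region details are needed; a minimal sketch (ContentBlock field names such as block_type and bbox are assumptions, so getattr is used defensively here):

text_output, blocks = extractor.extract_with_blocks(image, output_format="markdown")
print(text_output.word_count)
for block in blocks:
    # field names assumed; ContentBlock carries bounding boxes and block types
    print(getattr(block, "block_type", None), getattr(block, "bbox", None))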

DotsOCRTextOutput

Bases: BaseModel

Text extraction output from Dots OCR with layout information.

Dots OCR provides structured output with: - Layout detection (11 categories) - Bounding boxes (normalized to 0-1024) - Multi-format text (Markdown/LaTeX/HTML) - Reading order preservation

Layout Categories

Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title

Text Formatting
  • Text/Title/Section-header: Markdown
  • Formula: LaTeX
  • Table: HTML
  • Picture: (text omitted)
Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
result = extractor.extract(image, include_layout=True)
print(result.content)  # Full text with formatting
for elem in result.layout:
        print(f"{elem.category}: {elem.bbox}")

num_layout_elements property

num_layout_elements: int

Number of detected layout elements.

content_length property

content_length: int

Length of extracted content in characters.

LayoutElement

Bases: BaseModel

Single layout element from document layout detection.

Represents a detected region in the document with its bounding box, category label, and extracted text content.

ATTRIBUTE DESCRIPTION
bbox

Bounding box coordinates [x1, y1, x2, y2] (normalized to 0-1024)

TYPE: List[int]

category

Layout category (e.g., "Text", "Title", "Table", "Formula")

TYPE: str

text

Extracted text content (None for pictures)

TYPE: Optional[str]

confidence

Detection confidence score (optional)

TYPE: Optional[float]
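
Because each LayoutElement carries its category, bbox, and text, pulling out a specific element type is a short filter; a minimal sketch, assuming result is a DotsOCRTextOutput produced with include_layout=True:

tables = [e for e in result.layout if e.category == "Table"]
for t in tables:
    print(t.bbox, t.confidence)
    print(t.text)  # table content is emitted as HTML (see Text Formatting above)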

OutputFormat

Bases: str, Enum

Supported text extraction output formats.

Each format has different characteristics
  • HTML: Structured with div elements, preserves layout semantics
  • MARKDOWN: Portable, human-readable, good for documentation
  • JSON: Structured data with layout information (Dots OCR)

TextOutput

Bases: BaseModel

Text extraction output from a document image.

Contains the extracted text content in the requested format, along with optional raw output and plain text versions.

Example
result = extractor.extract(image, output_format="markdown")
print(result.content)  # Clean markdown
print(result.plain_text)  # Plain text without formatting

content_length property

content_length: int

Length of the extracted content in characters.

word_count property

word_count: int

Approximate word count of the plain text.
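
The convenience properties make it easy to sanity-check an extraction before persisting it; a minimal sketch, assuming result is a TextOutput:

if result.word_count == 0:
    print("warning: no text extracted")
print(f"{result.content_length} chars, ~{result.word_count} words, format={result.format.value}")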

NanonetsTextExtractor

NanonetsTextExtractor(backend: NanonetsTextBackendConfig)

Bases: BaseTextExtractor

Nanonets OCR2-3B Vision-Language Model text extractor.

Extracts text from document images with support for: - Tables (output as HTML) - Equations (output as LaTeX) - Image captions (wrapped in <img> tags) - Watermarks (wrapped in <watermark> tags) - Page numbers (wrapped in <page_number> tags) - Checkboxes (using ☐ and ☑ symbols)

Supports PyTorch, VLLM, and MLX backends.

Example
from omnidocs.tasks.text_extraction import NanonetsTextExtractor
from omnidocs.tasks.text_extraction.nanonets import NanonetsTextPyTorchConfig

# Initialize with PyTorch backend
extractor = NanonetsTextExtractor(
        backend=NanonetsTextPyTorchConfig()
    )

# Extract text
result = extractor.extract(image)
print(result.content)

Initialize Nanonets text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - NanonetsTextPyTorchConfig: PyTorch/HuggingFace backend - NanonetsTextVLLMConfig: VLLM high-throughput backend - NanonetsTextMLXConfig: MLX backend for Apple Silicon

TYPE: NanonetsTextBackendConfig

Source code in omnidocs/tasks/text_extraction/nanonets/extractor.py
def __init__(self, backend: NanonetsTextBackendConfig):
    """
    Initialize Nanonets text extractor.

    Args:
        backend: Backend configuration. One of:
            - NanonetsTextPyTorchConfig: PyTorch/HuggingFace backend
            - NanonetsTextVLLMConfig: VLLM high-throughput backend
            - NanonetsTextMLXConfig: MLX backend for Apple Silicon
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._device: str = "cpu"

    # MLX-specific helpers
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

Note: Nanonets OCR2 produces a unified output format that includes tables as HTML and equations as LaTeX inline. The output_format parameter is accepted for API compatibility but does not change the output structure.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Accepted for API compatibility (default: "markdown")

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format is not supported

Source code in omnidocs/tasks/text_extraction/nanonets/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Note: Nanonets OCR2 produces a unified output format that includes
    tables as HTML and equations as LaTeX inline. The output_format
    parameter is accepted for API compatibility but does not change
    the output structure.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Accepted for API compatibility (default: "markdown")

    Returns:
        TextOutput containing extracted text content

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "NanonetsTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image)
    elif config_type == "NanonetsTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image)
    elif config_type == "NanonetsTextMLXConfig":
        raw_output = self._infer_mlx(pil_image)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Clean output
    cleaned_output = raw_output.replace("<|im_end|>", "").strip()

    return TextOutput(
        content=cleaned_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=cleaned_output,
        image_width=width,
        image_height=height,
        model_name=f"Nanonets-OCR2-3B ({type(self.backend_config).__name__})",
    )

QwenTextExtractor

QwenTextExtractor(backend: QwenTextBackendConfig)

Bases: BaseTextExtractor

Qwen3-VL Vision-Language Model text extractor.

Extracts text from document images and outputs as structured HTML or Markdown. Uses Qwen3-VL's built-in document parsing prompts.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.text_extraction import QwenTextExtractor
from omnidocs.tasks.text_extraction.qwen import QwenTextPyTorchConfig

# Initialize with PyTorch backend
extractor = QwenTextExtractor(
        backend=QwenTextPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )

# Extract as Markdown
result = extractor.extract(image, output_format="markdown")
print(result.content)

# Extract as HTML
result = extractor.extract(image, output_format="html")
print(result.content)

Initialize Qwen text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - QwenTextPyTorchConfig: PyTorch/HuggingFace backend - QwenTextVLLMConfig: VLLM high-throughput backend - QwenTextMLXConfig: MLX backend for Apple Silicon - QwenTextAPIConfig: API backend (OpenRouter, etc.)

TYPE: QwenTextBackendConfig

Source code in omnidocs/tasks/text_extraction/qwen/extractor.py
def __init__(self, backend: QwenTextBackendConfig):
    """
    Initialize Qwen text extractor.

    Args:
        backend: Backend configuration. One of:
            - QwenTextPyTorchConfig: PyTorch/HuggingFace backend
            - QwenTextVLLMConfig: VLLM high-throughput backend
            - QwenTextMLXConfig: MLX backend for Apple Silicon
            - QwenTextAPIConfig: API backend (OpenRouter, etc.)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Desired output format: - "html": Structured HTML with div elements - "markdown": Markdown format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format or output_format is not supported

Source code in omnidocs/tasks/text_extraction/qwen/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Desired output format:
            - "html": Structured HTML with div elements
            - "markdown": Markdown format

    Returns:
        TextOutput containing extracted text content

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format or output_format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    if output_format not in ("html", "markdown"):
        raise ValueError(f"Invalid output_format: {output_format}. Expected 'html' or 'markdown'.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Get prompt for output format
    prompt = QWEN_PROMPTS[output_format]

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "QwenTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image, prompt)
    elif config_type == "QwenTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image, prompt)
    elif config_type == "QwenTextMLXConfig":
        raw_output = self._infer_mlx(pil_image, prompt)
    elif config_type == "QwenTextAPIConfig":
        raw_output = self._infer_api(pil_image, prompt)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Clean output
    if output_format == "html":
        cleaned_output = _clean_html_output(raw_output)
    else:
        cleaned_output = _clean_markdown_output(raw_output)

    # Extract plain text
    plain_text = _extract_plain_text(raw_output, output_format)

    return TextOutput(
        content=cleaned_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=plain_text,
        image_width=width,
        image_height=height,
        model_name=f"Qwen3-VL ({type(self.backend_config).__name__})",
    )

VLMTextExtractor

VLMTextExtractor(config: VLMAPIConfig)

Bases: BaseTextExtractor

Provider-agnostic VLM text extractor using litellm.

Works with any cloud VLM API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc. Supports custom prompts for specialized extraction.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.text_extraction import VLMTextExtractor

# Gemini (reads GOOGLE_API_KEY from env)
config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMTextExtractor(config=config)

# Default extraction
result = extractor.extract("document.png", output_format="markdown")

# Custom prompt
result = extractor.extract(
    "document.png",
    prompt="Extract only the table data as markdown",
)

Initialize VLM text extractor.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/text_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM text extractor.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
    prompt: Optional[str] = None,
) -> TextOutput

Extract text from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

output_format

Desired output format ("html" or "markdown").

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

prompt

Custom prompt. If None, uses a task-specific default prompt.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content.

Source code in omnidocs/tasks/text_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
    prompt: Optional[str] = None,
) -> TextOutput:
    """
    Extract text from an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        output_format: Desired output format ("html" or "markdown").
        prompt: Custom prompt. If None, uses a task-specific default prompt.

    Returns:
        TextOutput containing extracted text content.
    """
    if output_format not in ("html", "markdown"):
        raise ValueError(f"Invalid output_format: {output_format}. Expected 'html' or 'markdown'.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    final_prompt = prompt or DEFAULT_PROMPTS[output_format]
    raw_output = vlm_completion(self.config, final_prompt, pil_image)
    plain_text = _extract_plain_text(raw_output, output_format)

    return TextOutput(
        content=raw_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=plain_text,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )
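
Because VLMTextExtractor routes through litellm, switching providers only means changing the model string in VLMAPIConfig; a brief sketch (the placeholder model identifier is illustrative, and the provider's API key is expected in the environment as usual for litellm):

from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.text_extraction import VLMTextExtractor

# model string is a placeholder -- use any litellm-supported vision model
config = VLMAPIConfig(model="openrouter/<provider>/<vision-model>")
extractor = VLMTextExtractor(config=config)
result = extractor.extract("document.png", output_format="html")
print(result.content)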

base

Base class for text extractors.

Defines the abstract interface that all text extractors must implement.

BaseTextExtractor

Bases: ABC

Abstract base class for text extractors.

All text extraction models must inherit from this class and implement the required methods.

Example
class MyTextExtractor(BaseTextExtractor):
        def __init__(self, config: MyConfig):
            self.config = config
            self._load_model()

        def _load_model(self):
            # Load model weights
            pass

        def extract(self, image, output_format="markdown"):
            # Run extraction
            return TextOutput(...)
extract abstractmethod
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Desired output format: - "html": Structured HTML - "markdown": Markdown format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
ValueError

If image format or output_format is not supported

RuntimeError

If model is not loaded or inference fails

Source code in omnidocs/tasks/text_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Desired output format:
            - "html": Structured HTML
            - "markdown": Markdown format

    Returns:
        TextOutput containing extracted text content

    Raises:
        ValueError: If image format or output_format is not supported
        RuntimeError: If model is not loaded or inference fails
    """
    pass
batch_extract
batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TextOutput]

Extract text from multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching (e.g., VLLM).

PARAMETER DESCRIPTION
images

List of images in any supported format

TYPE: List[Union[Image, ndarray, str, Path]]

output_format

Desired output format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TextOutput]

List of TextOutput in same order as input

Examples:

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images, output_format="markdown")
Source code in omnidocs/tasks/text_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TextOutput]:
    """
    Extract text from multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching (e.g., VLLM).

    Args:
        images: List of images in any supported format
        output_format: Desired output format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TextOutput in same order as input

    Examples:
        ```python
        images = [doc.get_page(i) for i in range(doc.page_count)]
        results = extractor.batch_extract(images, output_format="markdown")
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(image, output_format=output_format)
        results.append(result)

    return results
extract_document
extract_document(
    document: Document,
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TextOutput]

Extract text from all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

output_format

Desired output format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[TextOutput]

List of TextOutput, one per page

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc, output_format="markdown")
Source code in omnidocs/tasks/text_extraction/base.py
def extract_document(
    self,
    document: "Document",
    output_format: Literal["html", "markdown"] = "markdown",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TextOutput]:
    """
    Extract text from all pages of a document.

    Args:
        document: Document instance
        output_format: Desired output format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of TextOutput, one per page

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc, output_format="markdown")
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(page, output_format=output_format)
        results.append(result)

    return results

dotsocr

Dots OCR text extractor and backend configurations.

Available backends: - PyTorch: DotsOCRPyTorchConfig (local GPU inference) - VLLM: DotsOCRVLLMConfig (offline batch inference) - API: DotsOCRAPIConfig (online VLLM server via OpenAI-compatible API)

DotsOCRAPIConfig

Bases: BaseModel

API backend configuration for Dots OCR.

This config is for accessing a deployed VLLM server via OpenAI-compatible API. Typically used with modal_dotsocr_vllm_online.py deployment.

Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
from omnidocs.tasks.text_extraction.dotsocr import DotsOCRAPIConfig

config = DotsOCRAPIConfig(
        model="dotsocr",
        api_base="https://your-modal-app.modal.run/v1",
        api_key="optional-key",
    )
extractor = DotsOCRTextExtractor(backend=config)

DotsOCRTextExtractor

DotsOCRTextExtractor(backend: DotsOCRBackendConfig)

Bases: BaseTextExtractor

Dots OCR Vision-Language Model text extractor with layout detection.

Extracts text from document images with layout information including: - 11 layout categories (Caption, Footnote, Formula, List-item, etc.) - Bounding boxes (normalized to 0-1024) - Multi-format text (Markdown, LaTeX, HTML) - Reading order preservation

Supports PyTorch, VLLM, and API backends.

Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
from omnidocs.tasks.text_extraction.dotsocr import DotsOCRPyTorchConfig

# Initialize with PyTorch backend
extractor = DotsOCRTextExtractor(
        backend=DotsOCRPyTorchConfig(model="rednote-hilab/dots.ocr")
    )

# Extract with layout
result = extractor.extract(image, include_layout=True)
print(f"Found {result.num_layout_elements} elements")
print(result.content)

Initialize Dots OCR text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - DotsOCRPyTorchConfig: PyTorch/HuggingFace backend - DotsOCRVLLMConfig: VLLM high-throughput backend - DotsOCRAPIConfig: API backend (online VLLM server)

TYPE: DotsOCRBackendConfig

Source code in omnidocs/tasks/text_extraction/dotsocr/extractor.py
def __init__(self, backend: DotsOCRBackendConfig):
    """
    Initialize Dots OCR text extractor.

    Args:
        backend: Backend configuration. One of:
            - DotsOCRPyTorchConfig: PyTorch/HuggingFace backend
            - DotsOCRVLLMConfig: VLLM high-throughput backend
            - DotsOCRAPIConfig: API backend (online VLLM server)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._model: Any = None
    self._loaded = False

    # Load model
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal[
        "markdown", "html", "json"
    ] = "markdown",
    include_layout: bool = False,
    custom_prompt: Optional[str] = None,
    max_tokens: int = 8192,
) -> DotsOCRTextOutput

Extract text from image using Dots OCR.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format ("markdown", "html", or "json")

TYPE: Literal['markdown', 'html', 'json'] DEFAULT: 'markdown'

include_layout

Include layout bounding boxes in output

TYPE: bool DEFAULT: False

custom_prompt

Override default extraction prompt

TYPE: Optional[str] DEFAULT: None

max_tokens

Maximum tokens for generation

TYPE: int DEFAULT: 8192

RETURNS DESCRIPTION
DotsOCRTextOutput

DotsOCRTextOutput with extracted content and optional layout

RAISES DESCRIPTION
RuntimeError

If model is not loaded or inference fails

Source code in omnidocs/tasks/text_extraction/dotsocr/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["markdown", "html", "json"] = "markdown",
    include_layout: bool = False,
    custom_prompt: Optional[str] = None,
    max_tokens: int = 8192,
) -> DotsOCRTextOutput:
    """
    Extract text from image using Dots OCR.

    Args:
        image: Input image (PIL Image, numpy array, or file path)
        output_format: Output format ("markdown", "html", or "json")
        include_layout: Include layout bounding boxes in output
        custom_prompt: Override default extraction prompt
        max_tokens: Maximum tokens for generation

    Returns:
        DotsOCRTextOutput with extracted content and optional layout

    Raises:
        RuntimeError: If model is not loaded or inference fails
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    img = self._prepare_image(image)

    # Get prompt
    prompt = custom_prompt or DOTS_OCR_PROMPT

    # Run inference based on backend
    config_type = type(self.backend_config).__name__

    if config_type == "DotsOCRPyTorchConfig":
        raw_output = self._infer_pytorch(img, prompt, max_tokens)
    elif config_type == "DotsOCRVLLMConfig":
        raw_output = self._infer_vllm(img, prompt, max_tokens)
    elif config_type == "DotsOCRAPIConfig":
        raw_output = self._infer_api(img, prompt, max_tokens)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Parse output
    return self._parse_output(
        raw_output,
        img.size,
        output_format,
        include_layout,
    )

DotsOCRPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend configuration for Dots OCR.

Dots OCR provides layout-aware text extraction with 11 predefined layout categories (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).

Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
from omnidocs.tasks.text_extraction.dotsocr import DotsOCRPyTorchConfig

config = DotsOCRPyTorchConfig(
        model="rednote-hilab/dots.ocr",
        device="cuda",
        torch_dtype="bfloat16",
    )
extractor = DotsOCRTextExtractor(backend=config)

DotsOCRVLLMConfig

Bases: BaseModel

VLLM backend configuration for Dots OCR.

VLLM provides high-throughput inference with optimizations like:
  • PagedAttention for efficient KV cache management
  • Continuous batching for higher throughput
  • Optimized CUDA kernels

Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
from omnidocs.tasks.text_extraction.dotsocr import DotsOCRVLLMConfig

config = DotsOCRVLLMConfig(
        model="rednote-hilab/dots.ocr",
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
    )
extractor = DotsOCRTextExtractor(backend=config)

api

API backend configuration for Dots OCR (VLLM online server).

extractor

Dots OCR text extractor with layout-aware extraction.

A Vision-Language Model optimized for document OCR with structured output containing layout information, bounding boxes, and multi-format text.

Supports PyTorch, VLLM, and API backends.

Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
from omnidocs.tasks.text_extraction.dotsocr import DotsOCRPyTorchConfig

extractor = DotsOCRTextExtractor(
        backend=DotsOCRPyTorchConfig(model="rednote-hilab/dots.ocr")
    )
result = extractor.extract(image, include_layout=True)
print(result.content)
for elem in result.layout:
        print(f"{elem.category}: {elem.bbox}")

pytorch

PyTorch backend configuration for Dots OCR.

vllm

VLLM backend configuration for Dots OCR.

granitedocling

Granite Docling text extraction with multi-backend support.

GraniteDoclingTextAPIConfig

Bases: BaseModel

Configuration for Granite Docling text extraction via API.

Uses litellm for provider-agnostic API access. Supports OpenRouter, Gemini, Azure, OpenAI, and any other litellm-compatible provider.

API keys can be passed directly or read from environment variables.

Example
# OpenRouter
config = GraniteDoclingTextAPIConfig(
    model="openrouter/ibm-granite/granite-docling-258M",
)

GraniteDoclingTextExtractor

GraniteDoclingTextExtractor(
    backend: GraniteDoclingTextBackendConfig,
)

Bases: BaseTextExtractor

Granite Docling text extractor supporting PyTorch, VLLM, MLX, and API backends.

Granite Docling is IBM's compact vision-language model optimized for document conversion. It outputs the DocTags format, which is converted to Markdown using the docling_core library.

Example

from omnidocs.tasks.text_extraction.granitedocling import (
    GraniteDoclingTextExtractor,
    GraniteDoclingTextPyTorchConfig,
)

config = GraniteDoclingTextPyTorchConfig(device="cuda")
extractor = GraniteDoclingTextExtractor(backend=config)
result = extractor.extract(image, output_format="markdown")
print(result.content)

Initialize Granite Docling extractor with backend configuration.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API config)

TYPE: GraniteDoclingTextBackendConfig

Source code in omnidocs/tasks/text_extraction/granitedocling/extractor.py
def __init__(self, backend: GraniteDoclingTextBackendConfig):
    """
    Initialize Granite Docling extractor with backend configuration.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API config)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded: bool = False

    # Backend-specific helpers
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None
    self._sampling_params_class: Any = None
    self._device: str = "cpu"

    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image using Granite Docling.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format ("markdown" or "html")

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput with extracted content

Source code in omnidocs/tasks/text_extraction/granitedocling/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image using Granite Docling.

    Args:
        image: Input image (PIL Image, numpy array, or file path)
        output_format: Output format ("markdown" or "html")

    Returns:
        TextOutput with extracted content
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded")

    if output_format not in ("html", "markdown"):
        raise ValueError(f"Invalid output_format: {output_format}")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Dispatch to backend-specific inference
    config_type = type(self.backend_config).__name__

    if config_type == "GraniteDoclingTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image)
    elif config_type == "GraniteDoclingTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image)
    elif config_type == "GraniteDoclingTextMLXConfig":
        raw_output = self._infer_mlx(pil_image)
    elif config_type == "GraniteDoclingTextAPIConfig":
        raw_output = self._infer_api(pil_image)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Convert DocTags to Markdown
    markdown_output = self._convert_doctags_to_markdown(raw_output, pil_image)

    # For HTML output, wrap in basic HTML structure
    if output_format == "html":
        content = f"<html><body>\n{markdown_output}\n</body></html>"
    else:
        content = markdown_output

    return TextOutput(
        content=content,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=self._extract_plain_text(markdown_output),
        image_width=width,
        image_height=height,
        model_name=f"Granite-Docling-258M ({config_type.replace('Config', '')})",
    )

GraniteDoclingTextMLXConfig

Bases: BaseModel

Configuration for Granite Docling text extraction with MLX backend.

This backend is optimized for Apple Silicon Macs (M1/M2/M3/M4). Uses the MLX-optimized model variant.
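A minimal usage sketch, assuming the MLX config can be constructed with its defaults and is importable alongside the other Granite Docling backends (the image path is a placeholder):

from omnidocs.tasks.text_extraction.granitedocling import (
    GraniteDoclingTextExtractor,
    GraniteDoclingTextMLXConfig,
)

config = GraniteDoclingTextMLXConfig()  # assumes the defaults select the MLX-optimized model variant
extractor = GraniteDoclingTextExtractor(backend=config)
result = extractor.extract("page.png", output_format="markdown")  # placeholder image path
print(result.content)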

GraniteDoclingTextPyTorchConfig

Bases: BaseModel

Configuration for Granite Docling text extraction with PyTorch backend.

GraniteDoclingTextVLLMConfig

Bases: BaseModel

Configuration for Granite Docling text extraction with VLLM backend.

IMPORTANT: This config uses revision="untied" by default, which is required for VLLM compatibility with Granite Docling's tied weights.
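A minimal usage sketch, assuming the config is importable alongside the other Granite Docling backends and that its fields have workable defaults (revision already defaults to "untied", so it is left untouched):

from omnidocs.tasks.text_extraction.granitedocling import (
    GraniteDoclingTextExtractor,
    GraniteDoclingTextVLLMConfig,
)

config = GraniteDoclingTextVLLMConfig()  # revision="untied" by default; required for VLLM compatibility
extractor = GraniteDoclingTextExtractor(backend=config)
result = extractor.extract("page.png")  # placeholder image path
print(result.content)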

api

API backend configuration for Granite Docling text extraction.

Uses litellm for provider-agnostic inference (OpenRouter, Gemini, Azure, etc.).

extractor

Granite Docling text extractor with multi-backend support.

mlx

MLX backend configuration for Granite Docling text extraction (Apple Silicon).

pytorch

PyTorch backend configuration for Granite Docling text extraction.

vllm

VLLM backend configuration for Granite Docling text extraction.

mineruvl

MinerU VL text extraction module.

MinerU VL is a vision-language model for document layout detection and text/table/equation recognition. It performs two-step extraction:
  1. Layout Detection: Detect regions with types (text, table, equation, etc.)
  2. Content Recognition: Extract content from each detected region

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextPyTorchConfig

# Initialize with PyTorch backend
extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextPyTorchConfig(device="cuda")
)

# Extract text
result = extractor.extract(image)
print(result.content)

# Extract with detailed blocks
result, blocks = extractor.extract_with_blocks(image)
for block in blocks:
    print(f"{block.type}: {block.content[:50]}...")

MinerUVLTextAPIConfig

Bases: BaseModel

API backend config for MinerU VL text extraction.

Connects to a deployed VLLM server with an OpenAI-compatible API.

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextAPIConfig

extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextAPIConfig(
        server_url="https://your-server.modal.run"
    )
)
result = extractor.extract(image)

MinerUVLTextExtractor

MinerUVLTextExtractor(backend: MinerUVLTextBackendConfig)

Bases: BaseTextExtractor

MinerU VL text extractor with layout-aware extraction.

Performs two-step extraction:
  1. Layout detection (detect regions)
  2. Content recognition (extract text/table/equation from each region)

Supports multiple backends:
  • PyTorch (HuggingFace Transformers)
  • VLLM (high-throughput GPU)
  • MLX (Apple Silicon)
  • API (VLLM OpenAI-compatible server)

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextPyTorchConfig

extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextPyTorchConfig(device="cuda")
)
result = extractor.extract(image)

print(result.content)  # Combined text + tables + equations
print(result.blocks)   # List of ContentBlock objects

Initialize MinerU VL text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API)

TYPE: MinerUVLTextBackendConfig

Source code in omnidocs/tasks/text_extraction/mineruvl/extractor.py
def __init__(self, backend: MinerUVLTextBackendConfig):
    """
    Initialize MinerU VL text extractor.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API)
    """
    self.backend_config = backend
    self._client = None
    self._loaded = False
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text with layout-aware two-step extraction.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format ('html' or 'markdown')

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput with extracted content and metadata

Source code in omnidocs/tasks/text_extraction/mineruvl/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text with layout-aware two-step extraction.

    Args:
        image: Input image (PIL Image, numpy array, or file path)
        output_format: Output format ('html' or 'markdown')

    Returns:
        TextOutput with extracted content and metadata
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Step 1: Layout detection
    blocks = self._detect_layout(pil_image)

    # Step 2: Content extraction for each block
    blocks = self._extract_content(pil_image, blocks)

    # Post-process (OTSL to HTML for tables)
    blocks = simple_post_process(blocks)

    # Combine content
    content = self._combine_content(blocks, output_format)

    # Build raw output with blocks info
    raw_output = self._build_raw_output(blocks)

    return TextOutput(
        content=content,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        image_width=width,
        image_height=height,
        model_name="MinerU2.5-2509-1.2B",
    )
extract_with_blocks
extract_with_blocks(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> tuple[TextOutput, List[ContentBlock]]

Extract text and return both TextOutput and ContentBlocks.

This method provides access to the detailed block information including bounding boxes and block types.

PARAMETER DESCRIPTION
image

Input image

TYPE: Union[Image, ndarray, str, Path]

output_format

Output format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
tuple[TextOutput, List[ContentBlock]]

Tuple of (TextOutput, List[ContentBlock])

Source code in omnidocs/tasks/text_extraction/mineruvl/extractor.py
def extract_with_blocks(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> tuple[TextOutput, List[ContentBlock]]:
    """
    Extract text and return both TextOutput and ContentBlocks.

    This method provides access to the detailed block information
    including bounding boxes and block types.

    Args:
        image: Input image
        output_format: Output format

    Returns:
        Tuple of (TextOutput, List[ContentBlock])
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Two-step extraction
    blocks = self._detect_layout(pil_image)
    blocks = self._extract_content(pil_image, blocks)
    blocks = simple_post_process(blocks)

    content = self._combine_content(blocks, output_format)
    raw_output = self._build_raw_output(blocks)

    text_output = TextOutput(
        content=content,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        image_width=width,
        image_height=height,
        model_name="MinerU2.5-2509-1.2B",
    )

    return text_output, blocks

MinerUVLTextMLXConfig

Bases: BaseModel

MLX backend config for MinerU VL text extraction on Apple Silicon.

Uses MLX-VLM for efficient inference on M1/M2/M3/M4 chips.

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextMLXConfig

extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextMLXConfig()
)
result = extractor.extract(image)

MinerUVLTextPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend config for MinerU VL text extraction.

Uses HuggingFace Transformers with Qwen2VLForConditionalGeneration.

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextPyTorchConfig

extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextPyTorchConfig(device="cuda")
)
result = extractor.extract(image)

BlockType

Bases: str, Enum

MinerU VL block types (22 categories).

ContentBlock

Bases: BaseModel

A detected content block with type, bounding box, angle, and content.

Coordinates are normalized to the [0, 1] range relative to the image dimensions.

to_absolute
to_absolute(
    image_width: int, image_height: int
) -> List[int]

Convert normalized bbox to absolute pixel coordinates.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def to_absolute(self, image_width: int, image_height: int) -> List[int]:
    """Convert normalized bbox to absolute pixel coordinates."""
    x1, y1, x2, y2 = self.bbox
    return [
        int(x1 * image_width),
        int(y1 * image_height),
        int(x2 * image_width),
        int(y2 * image_height),
    ]
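A short worked example (importing from the utils module named in the source path; passing type="text" and angle=None mirrors how parse_layout_output constructs blocks, and the coordinate values are illustrative):

from omnidocs.tasks.text_extraction.mineruvl.utils import BlockType, ContentBlock

# A block spanning the left half of a 1000 x 800 page, in normalized coordinates.
block = ContentBlock(type=BlockType("text"), bbox=[0.0, 0.1, 0.5, 0.9], angle=None)
print(block.to_absolute(image_width=1000, image_height=800))  # [0, 80, 500, 720]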

MinerUSamplingParams

MinerUSamplingParams(
    temperature: Optional[float] = 0.0,
    top_p: Optional[float] = 0.01,
    top_k: Optional[int] = 1,
    presence_penalty: Optional[float] = 0.0,
    frequency_penalty: Optional[float] = 0.0,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 100,
    max_new_tokens: Optional[int] = None,
)

Bases: SamplingParams

Default sampling parameters optimized for MinerU VL.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def __init__(
    self,
    temperature: Optional[float] = 0.0,
    top_p: Optional[float] = 0.01,
    top_k: Optional[int] = 1,
    presence_penalty: Optional[float] = 0.0,
    frequency_penalty: Optional[float] = 0.0,
    repetition_penalty: Optional[float] = 1.0,
    no_repeat_ngram_size: Optional[int] = 100,
    max_new_tokens: Optional[int] = None,
):
    super().__init__(
        temperature,
        top_p,
        top_k,
        presence_penalty,
        frequency_penalty,
        repetition_penalty,
        no_repeat_ngram_size,
        max_new_tokens,
    )
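The defaults above amount to near-greedy decoding (temperature 0.0, top_k 1), so a typical tweak only caps the generation length. A small sketch, importing from the utils module shown in the source path:

from omnidocs.tasks.text_extraction.mineruvl.utils import MinerUSamplingParams

params = MinerUSamplingParams(max_new_tokens=2048)  # keep the greedy-style defaults, cap output length
print(params.temperature, params.top_k, params.max_new_tokens)  # 0.0 1 2048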

SamplingParams dataclass

SamplingParams(
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    top_k: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    repetition_penalty: Optional[float] = None,
    no_repeat_ngram_size: Optional[int] = None,
    max_new_tokens: Optional[int] = None,
)

Sampling parameters for text generation.

MinerUVLTextVLLMConfig

Bases: BaseModel

VLLM backend config for MinerU VL text extraction.

Uses VLLM for high-throughput GPU inference with:
  • PagedAttention for efficient KV cache
  • Continuous batching
  • Optimized CUDA kernels

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextVLLMConfig

extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextVLLMConfig(
        tensor_parallel_size=1,
        gpu_memory_utilization=0.85,
    )
)
result = extractor.extract(image)

convert_otsl_to_html

convert_otsl_to_html(otsl_content: str) -> str

Convert OTSL table format to HTML.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def convert_otsl_to_html(otsl_content: str) -> str:
    """Convert OTSL table format to HTML."""
    if otsl_content.startswith("<table") and otsl_content.endswith("</table>"):
        return otsl_content

    pattern = r"(" + r"|".join(ALL_OTSL_TOKENS) + r")"
    tokens = re.findall(pattern, otsl_content)
    text_parts = re.split(pattern, otsl_content)
    text_parts = [part for part in text_parts if part.strip()]

    split_row_tokens = [list(y) for x, y in itertools.groupby(tokens, lambda z: z == OTSL_NL) if not x]
    if not split_row_tokens:
        return ""

    max_cols = max(len(row) for row in split_row_tokens)
    for row in split_row_tokens:
        while len(row) < max_cols:
            row.append(OTSL_ECEL)

    def count_right(tokens_grid, c, r, which_tokens):
        span = 0
        c_iter = c
        while c_iter < len(tokens_grid[r]) and tokens_grid[r][c_iter] in which_tokens:
            c_iter += 1
            span += 1
        return span

    def count_down(tokens_grid, c, r, which_tokens):
        span = 0
        r_iter = r
        while r_iter < len(tokens_grid) and tokens_grid[r_iter][c] in which_tokens:
            r_iter += 1
            span += 1
        return span

    table_cells = []
    r_idx = 0
    c_idx = 0

    for i, text in enumerate(text_parts):
        if text in [OTSL_FCEL, OTSL_ECEL]:
            row_span = 1
            col_span = 1
            cell_text = ""
            right_offset = 1

            if text != OTSL_ECEL and i + 1 < len(text_parts):
                next_text = text_parts[i + 1]
                if next_text not in ALL_OTSL_TOKENS:
                    cell_text = next_text
                    right_offset = 2

            if i + right_offset < len(text_parts):
                next_right = text_parts[i + right_offset]
                if next_right in [OTSL_LCEL, OTSL_XCEL]:
                    col_span += count_right(split_row_tokens, c_idx + 1, r_idx, [OTSL_LCEL, OTSL_XCEL])

            if r_idx + 1 < len(split_row_tokens) and c_idx < len(split_row_tokens[r_idx + 1]):
                next_bottom = split_row_tokens[r_idx + 1][c_idx]
                if next_bottom in [OTSL_UCEL, OTSL_XCEL]:
                    row_span += count_down(split_row_tokens, c_idx, r_idx + 1, [OTSL_UCEL, OTSL_XCEL])

            table_cells.append(
                {
                    "text": cell_text.strip(),
                    "row_span": row_span,
                    "col_span": col_span,
                    "start_row": r_idx,
                    "start_col": c_idx,
                }
            )

        if text in [OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]:
            c_idx += 1
        if text == OTSL_NL:
            r_idx += 1
            c_idx = 0

    num_rows = len(split_row_tokens)
    num_cols = max_cols
    grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    for cell in table_cells:
        for i in range(cell["start_row"], min(cell["start_row"] + cell["row_span"], num_rows)):
            for j in range(cell["start_col"], min(cell["start_col"] + cell["col_span"], num_cols)):
                grid[i][j] = cell

    html_parts = []
    for i in range(num_rows):
        html_parts.append("<tr>")
        for j in range(num_cols):
            cell = grid[i][j]
            if cell is None:
                continue
            if cell["start_row"] != i or cell["start_col"] != j:
                continue

            content = html.escape(cell["text"])
            tag = "td"
            parts = [f"<{tag}"]
            if cell["row_span"] > 1:
                parts.append(f' rowspan="{cell["row_span"]}"')
            if cell["col_span"] > 1:
                parts.append(f' colspan="{cell["col_span"]}"')
            parts.append(f">{content}</{tag}>")
            html_parts.append("".join(parts))
        html_parts.append("</tr>")

    return f"<table>{''.join(html_parts)}</table>"

parse_layout_output

parse_layout_output(output: str) -> List[ContentBlock]

Parse layout detection model output into ContentBlocks.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def parse_layout_output(output: str) -> List[ContentBlock]:
    """Parse layout detection model output into ContentBlocks."""
    blocks = []
    for line in output.split("\n"):
        match = re.match(LAYOUT_REGEX, line)
        if not match:
            continue
        x1, y1, x2, y2, ref_type, tail = match.groups()
        bbox = convert_bbox((x1, y1, x2, y2))
        if bbox is None:
            continue
        ref_type = ref_type.lower()
        if ref_type not in BLOCK_TYPES:
            continue
        angle = parse_angle(tail)
        blocks.append(
            ContentBlock(
                type=BlockType(ref_type),
                bbox=bbox,
                angle=angle,
            )
        )
    return blocks

api

API backend configuration for MinerU VL text extraction.

extractor

MinerU VL text extractor with layout-aware two-step extraction.

MinerU VL performs document extraction in two steps:
  1. Layout Detection: Detect regions with types (text, table, equation, etc.)
  2. Content Recognition: Extract text/table/equation content from each region

mlx

MLX backend configuration for MinerU VL text extraction (Apple Silicon).

pytorch

PyTorch/HuggingFace backend configuration for MinerU VL text extraction.

utils

MinerU VL utilities for document extraction.

Contains data structures, parsing, prompts, and post-processing functions for MinerU VL document extraction pipeline.

This file contains code adapted from mineru-vl-utils

  • https://github.com/opendatalab/mineru-vl-utils
  • https://pypi.org/project/mineru-vl-utils/

The original mineru-vl-utils is licensed under AGPL-3.0 (Copyright (c) OpenDataLab): https://github.com/opendatalab/mineru-vl-utils/blob/main/LICENSE.md

Adapted components
  • BlockType enum (from structs.py)
  • ContentBlock data structure (from structs.py)
  • OTSL to HTML table conversion (from post_process/otsl2html.py)
convert_bbox
convert_bbox(bbox: Sequence) -> Optional[List[float]]

Convert bbox from model output (0-1000) to normalized format (0-1).

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def convert_bbox(bbox: Sequence) -> Optional[List[float]]:
    """Convert bbox from model output (0-1000) to normalized format (0-1)."""
    bbox = tuple(map(int, bbox))
    if any(coord < 0 or coord > 1000 for coord in bbox):
        return None
    x1, y1, x2, y2 = bbox
    x1, x2 = (x2, x1) if x2 < x1 else (x1, x2)
    y1, y2 = (y2, y1) if y2 < y1 else (y1, y2)
    if x1 == x2 or y1 == y2:
        return None
    return [coord / 1000.0 for coord in (x1, y1, x2, y2)]
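A few worked values, following the implementation above directly:

from omnidocs.tasks.text_extraction.mineruvl.utils import convert_bbox

print(convert_bbox((100, 200, 500, 800)))  # [0.1, 0.2, 0.5, 0.8]
print(convert_bbox((500, 200, 100, 800)))  # [0.1, 0.2, 0.5, 0.8]: swapped x-coordinates are reordered
print(convert_bbox((100, 200, 100, 800)))  # None: degenerate (zero-width) boxes are rejected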
parse_angle
parse_angle(tail: str) -> Literal[None, 0, 90, 180, 270]

Parse rotation angle from model output tail string.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def parse_angle(tail: str) -> Literal[None, 0, 90, 180, 270]:
    """Parse rotation angle from model output tail string."""
    for token, angle in ANGLE_MAPPING.items():
        if token in tail:
            return angle
    return None
get_rgb_image
get_rgb_image(image: Image) -> Image.Image

Convert image to RGB mode.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def get_rgb_image(image: Image.Image) -> Image.Image:
    """Convert image to RGB mode."""
    if image.mode == "P":
        image = image.convert("RGBA")
    if image.mode != "RGB":
        image = image.convert("RGB")
    return image
prepare_for_layout
prepare_for_layout(
    image: Image,
    layout_size: Tuple[int, int] = LAYOUT_IMAGE_SIZE,
) -> Image.Image

Prepare image for layout detection.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def prepare_for_layout(
    image: Image.Image,
    layout_size: Tuple[int, int] = LAYOUT_IMAGE_SIZE,
) -> Image.Image:
    """Prepare image for layout detection."""
    image = get_rgb_image(image)
    image = image.resize(layout_size, Image.Resampling.BICUBIC)
    return image
resize_by_need
resize_by_need(
    image: Image, min_edge: int = 28, max_ratio: float = 50
) -> Image.Image

Resize image if needed based on aspect ratio constraints.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def resize_by_need(
    image: Image.Image,
    min_edge: int = 28,
    max_ratio: float = 50,
) -> Image.Image:
    """Resize image if needed based on aspect ratio constraints."""
    edge_ratio = max(image.size) / min(image.size)
    if edge_ratio > max_ratio:
        width, height = image.size
        if width > height:
            new_w, new_h = width, math.ceil(width / max_ratio)
        else:
            new_w, new_h = math.ceil(height / max_ratio), height
        new_image = Image.new(image.mode, (new_w, new_h), (255, 255, 255))
        new_image.paste(image, (int((new_w - width) / 2), int((new_h - height) / 2)))
        image = new_image
    if min(image.size) < min_edge:
        scale = min_edge / min(image.size)
        new_w, new_h = round(image.width * scale), round(image.height * scale)
        image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
    return image
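Two quick checks of the branches above, with sizes chosen to trigger each one (resize_by_need is imported from the same utils module):

from PIL import Image

from omnidocs.tasks.text_extraction.mineruvl.utils import resize_by_need

print(resize_by_need(Image.new("RGB", (10, 10))).size)    # (28, 28): short edges are scaled up to min_edge
print(resize_by_need(Image.new("RGB", (2000, 20))).size)  # (2000, 40): extreme aspect ratios are padded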
prepare_for_extract
prepare_for_extract(
    image: Image,
    blocks: List[ContentBlock],
    prompts: Dict[str, str] = None,
    sampling_params: Dict[str, SamplingParams] = None,
    skip_types: set = None,
) -> Tuple[
    List[Image.Image],
    List[str],
    List[SamplingParams],
    List[int],
]

Prepare cropped images for content extraction.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def prepare_for_extract(
    image: Image.Image,
    blocks: List[ContentBlock],
    prompts: Dict[str, str] = None,
    sampling_params: Dict[str, SamplingParams] = None,
    skip_types: set = None,
) -> Tuple[List[Image.Image], List[str], List[SamplingParams], List[int]]:
    """Prepare cropped images for content extraction."""
    if prompts is None:
        prompts = DEFAULT_PROMPTS
    if sampling_params is None:
        sampling_params = DEFAULT_SAMPLING_PARAMS
    if skip_types is None:
        skip_types = {"image", "list", "equation_block"}

    image = get_rgb_image(image)
    width, height = image.size

    block_images = []
    prompt_list = []
    params_list = []
    indices = []

    for idx, block in enumerate(blocks):
        if block.type.value in skip_types:
            continue

        x1, y1, x2, y2 = block.bbox
        scaled_bbox = (x1 * width, y1 * height, x2 * width, y2 * height)
        block_image = image.crop(scaled_bbox)

        if block_image.width < 1 or block_image.height < 1:
            continue

        if block.angle in [90, 180, 270]:
            block_image = block_image.rotate(block.angle, expand=True)

        block_image = resize_by_need(block_image)
        block_images.append(block_image)

        block_type = block.type.value
        prompt = prompts.get(block_type) or prompts.get("[default]")
        prompt_list.append(prompt)

        params = sampling_params.get(block_type) or sampling_params.get("[default]")
        params_list.append(params)
        indices.append(idx)

    return block_images, prompt_list, params_list, indices
simple_post_process
simple_post_process(
    blocks: List[ContentBlock],
) -> List[ContentBlock]

Simple post-processing: convert OTSL tables to HTML.

Source code in omnidocs/tasks/text_extraction/mineruvl/utils.py
def simple_post_process(blocks: List[ContentBlock]) -> List[ContentBlock]:
    """Simple post-processing: convert OTSL tables to HTML."""
    for block in blocks:
        if block.type == BlockType.TABLE and block.content:
            try:
                block.content = convert_otsl_to_html(block.content)
            except Exception:
                pass
    return blocks

vllm

VLLM backend configuration for MinerU VL text extraction.

MinerUVLTextVLLMConfig

Bases: BaseModel

VLLM backend config for MinerU VL text extraction.

Uses VLLM for high-throughput GPU inference with:
  • PagedAttention for efficient KV cache
  • Continuous batching
  • Optimized CUDA kernels

Example
from omnidocs.tasks.text_extraction import MinerUVLTextExtractor
from omnidocs.tasks.text_extraction.mineruvl import MinerUVLTextVLLMConfig

extractor = MinerUVLTextExtractor(
    backend=MinerUVLTextVLLMConfig(
        tensor_parallel_size=1,
        gpu_memory_utilization=0.85,
    )
)
result = extractor.extract(image)

models

Pydantic models for text extraction outputs.

Defines output types and format enums for text extraction.

OutputFormat

Bases: str, Enum

Supported text extraction output formats.

Each format has different characteristics
  • HTML: Structured with div elements, preserves layout semantics
  • MARKDOWN: Portable, human-readable, good for documentation
  • JSON: Structured data with layout information (Dots OCR)
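
Because OutputFormat inherits from str, members compare equal to their plain string values; a minimal sketch (the import path omnidocs.tasks.text_extraction.models is assumed from the module name above):

from omnidocs.tasks.text_extraction.models import OutputFormat

fmt = OutputFormat("markdown")       # construct from the plain string value
assert fmt is OutputFormat.MARKDOWN
assert fmt == "markdown"             # str-backed enum compares equal to its value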

TextOutput

Bases: BaseModel

Text extraction output from a document image.

Contains the extracted text content in the requested format, along with optional raw output and plain text versions.

Example
result = extractor.extract(image, output_format="markdown")
print(result.content)  # Clean markdown
print(result.plain_text)  # Plain text without formatting
content_length property
content_length: int

Length of the extracted content in characters.

word_count property
word_count: int

Approximate word count of the plain text.
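
Both properties are derived from the extraction result itself; for illustration, using the extractor and image from the example above:

result = extractor.extract(image, output_format="markdown")
print(result.content_length)  # number of characters in result.content
print(result.word_count)      # approximate number of words in result.plain_text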

LayoutElement

Bases: BaseModel

Single layout element from document layout detection.

Represents a detected region in the document with its bounding box, category label, and extracted text content.

ATTRIBUTE DESCRIPTION
bbox

Bounding box coordinates [x1, y1, x2, y2] (normalized to 0-1024)

TYPE: List[int]

category

Layout category (e.g., "Text", "Title", "Table", "Formula")

TYPE: str

text

Extracted text content (None for pictures)

TYPE: Optional[str]

confidence

Detection confidence score (optional)

TYPE: Optional[float]

DotsOCRTextOutput

Bases: BaseModel

Text extraction output from Dots OCR with layout information.

Dots OCR provides structured output with:
  • Layout detection (11 categories)
  • Bounding boxes (normalized to 0-1024)
  • Multi-format text (Markdown/LaTeX/HTML)
  • Reading order preservation

Layout Categories

Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title

Text Formatting
  • Text/Title/Section-header: Markdown
  • Formula: LaTeX
  • Table: HTML
  • Picture: (text omitted)
Example
from omnidocs.tasks.text_extraction import DotsOCRTextExtractor
result = extractor.extract(image, include_layout=True)
print(result.content)  # Full text with formatting
for elem in result.layout:
        print(f"{elem.category}: {elem.bbox}")
num_layout_elements property
num_layout_elements: int

Number of detected layout elements.

content_length property
content_length: int

Length of extracted content in characters.
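
Because bboxes are normalized to 0-1024 rather than to the image's pixel grid, mapping them back requires the original image size. A minimal sketch (the helper below is illustrative, not part of the library; image is the PIL image passed to extract):

def bbox_to_pixels(bbox, image_width, image_height):
    # Map an [x1, y1, x2, y2] box from the 0-1024 space to pixel coordinates.
    x1, y1, x2, y2 = bbox
    return [
        round(x1 * image_width / 1024),
        round(y1 * image_height / 1024),
        round(x2 * image_width / 1024),
        round(y2 * image_height / 1024),
    ]

width, height = image.size
for elem in result.layout:
    print(elem.category, bbox_to_pixels(elem.bbox, width, height))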

nanonets

Nanonets OCR2-3B backend configurations and extractor for text extraction.

Available backends
  • NanonetsTextPyTorchConfig: PyTorch/HuggingFace backend
  • NanonetsTextVLLMConfig: VLLM high-throughput backend
  • NanonetsTextMLXConfig: MLX backend for Apple Silicon
Example
from omnidocs.tasks.text_extraction.nanonets import NanonetsTextPyTorchConfig
config = NanonetsTextPyTorchConfig()

NanonetsTextExtractor

NanonetsTextExtractor(backend: NanonetsTextBackendConfig)

Bases: BaseTextExtractor

Nanonets OCR2-3B Vision-Language Model text extractor.

Extracts text from document images with support for:
  • Tables (output as HTML)
  • Equations (output as LaTeX)
  • Image captions (wrapped in tags)
  • Watermarks (wrapped in tags)
  • Page numbers (wrapped in tags)
  • Checkboxes (using ☐ and ☑ symbols)

Supports PyTorch, VLLM, and MLX backends.

Example
from omnidocs.tasks.text_extraction import NanonetsTextExtractor
from omnidocs.tasks.text_extraction.nanonets import NanonetsTextPyTorchConfig

# Initialize with PyTorch backend
extractor = NanonetsTextExtractor(
        backend=NanonetsTextPyTorchConfig()
    )

# Extract text
result = extractor.extract(image)
print(result.content)

Initialize Nanonets text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - NanonetsTextPyTorchConfig: PyTorch/HuggingFace backend - NanonetsTextVLLMConfig: VLLM high-throughput backend - NanonetsTextMLXConfig: MLX backend for Apple Silicon

TYPE: NanonetsTextBackendConfig

Source code in omnidocs/tasks/text_extraction/nanonets/extractor.py
def __init__(self, backend: NanonetsTextBackendConfig):
    """
    Initialize Nanonets text extractor.

    Args:
        backend: Backend configuration. One of:
            - NanonetsTextPyTorchConfig: PyTorch/HuggingFace backend
            - NanonetsTextVLLMConfig: VLLM high-throughput backend
            - NanonetsTextMLXConfig: MLX backend for Apple Silicon
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._device: str = "cpu"

    # MLX-specific helpers
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()
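
For local development on Apple Silicon, the same constructor accepts the MLX backend config documented below; a brief sketch:

from omnidocs.tasks.text_extraction import NanonetsTextExtractor
from omnidocs.tasks.text_extraction.nanonets import NanonetsTextMLXConfig

extractor = NanonetsTextExtractor(
    backend=NanonetsTextMLXConfig(model="mlx-community/Nanonets-OCR2-3B-bf16")
)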
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

Note: Nanonets OCR2 produces a unified output format that includes tables as HTML and equations as LaTeX inline. The output_format parameter is accepted for API compatibility but does not change the output structure.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Accepted for API compatibility (default: "markdown")

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format is not supported

Source code in omnidocs/tasks/text_extraction/nanonets/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Note: Nanonets OCR2 produces a unified output format that includes
    tables as HTML and equations as LaTeX inline. The output_format
    parameter is accepted for API compatibility but does not change
    the output structure.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Accepted for API compatibility (default: "markdown")

    Returns:
        TextOutput containing extracted text content

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "NanonetsTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image)
    elif config_type == "NanonetsTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image)
    elif config_type == "NanonetsTextMLXConfig":
        raw_output = self._infer_mlx(pil_image)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Clean output
    cleaned_output = raw_output.replace("<|im_end|>", "").strip()

    return TextOutput(
        content=cleaned_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=cleaned_output,
        image_width=width,
        image_height=height,
        model_name=f"Nanonets-OCR2-3B ({type(self.backend_config).__name__})",
    )
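
As documented above, extract also accepts numpy arrays (HWC, RGB) and file paths; a brief sketch with an array input (the file name is only a placeholder):

import numpy as np
from PIL import Image

arr = np.array(Image.open("page.png").convert("RGB"))  # HWC, RGB
result = extractor.extract(arr)
print(result.model_name)  # "Nanonets-OCR2-3B (NanonetsTextPyTorchConfig)" for the PyTorch backend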

NanonetsTextMLXConfig

Bases: BaseModel

MLX backend configuration for Nanonets OCR2-3B text extraction.

This backend uses MLX for Apple Silicon native inference. Best for local development and testing on macOS M1/M2/M3/M4+. Requires: mlx, mlx-vlm

Note: This backend only works on Apple Silicon Macs. Do NOT use for Modal/cloud deployments.

Example
config = NanonetsTextMLXConfig(
        model="mlx-community/Nanonets-OCR2-3B-bf16",
    )

NanonetsTextPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend configuration for Nanonets OCR2-3B text extraction.

This backend uses the transformers library with PyTorch for local GPU inference. Requires: torch, transformers, accelerate

Example
config = NanonetsTextPyTorchConfig(
        device="cuda",
        torch_dtype="float16",
    )

NanonetsTextVLLMConfig

Bases: BaseModel

VLLM backend configuration for Nanonets OCR2-3B text extraction.

This backend uses VLLM for high-throughput inference. Best for batch processing and production deployments. Requires: vllm, torch, transformers, qwen-vl-utils

Example
config = NanonetsTextVLLMConfig(
        tensor_parallel_size=1,
        gpu_memory_utilization=0.85,
    )

extractor

Nanonets OCR2-3B text extractor.

A Vision-Language Model for extracting text from document images with support for tables (HTML), equations (LaTeX), and image captions.

Supports PyTorch, VLLM, and MLX backends.

Example
from omnidocs.tasks.text_extraction import NanonetsTextExtractor
from omnidocs.tasks.text_extraction.nanonets import NanonetsTextPyTorchConfig

extractor = NanonetsTextExtractor(
        backend=NanonetsTextPyTorchConfig()
    )
result = extractor.extract(image)
print(result.content)
NanonetsTextExtractor
NanonetsTextExtractor(backend: NanonetsTextBackendConfig)

Bases: BaseTextExtractor

Nanonets OCR2-3B Vision-Language Model text extractor.

Extracts text from document images with support for:
  • Tables (output as HTML)
  • Equations (output as LaTeX)
  • Image captions (wrapped in tags)
  • Watermarks (wrapped in tags)
  • Page numbers (wrapped in tags)
  • Checkboxes (using ☐ and ☑ symbols)

Supports PyTorch, VLLM, and MLX backends.

Example
from omnidocs.tasks.text_extraction import NanonetsTextExtractor
from omnidocs.tasks.text_extraction.nanonets import NanonetsTextPyTorchConfig

# Initialize with PyTorch backend
extractor = NanonetsTextExtractor(
        backend=NanonetsTextPyTorchConfig()
    )

# Extract text
result = extractor.extract(image)
print(result.content)

Initialize Nanonets text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - NanonetsTextPyTorchConfig: PyTorch/HuggingFace backend - NanonetsTextVLLMConfig: VLLM high-throughput backend - NanonetsTextMLXConfig: MLX backend for Apple Silicon

TYPE: NanonetsTextBackendConfig

Source code in omnidocs/tasks/text_extraction/nanonets/extractor.py
def __init__(self, backend: NanonetsTextBackendConfig):
    """
    Initialize Nanonets text extractor.

    Args:
        backend: Backend configuration. One of:
            - NanonetsTextPyTorchConfig: PyTorch/HuggingFace backend
            - NanonetsTextVLLMConfig: VLLM high-throughput backend
            - NanonetsTextMLXConfig: MLX backend for Apple Silicon
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._device: str = "cpu"

    # MLX-specific helpers
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

Note: Nanonets OCR2 produces a unified output format that includes tables as HTML and equations as LaTeX inline. The output_format parameter is accepted for API compatibility but does not change the output structure.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Accepted for API compatibility (default: "markdown")

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format is not supported

Source code in omnidocs/tasks/text_extraction/nanonets/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Note: Nanonets OCR2 produces a unified output format that includes
    tables as HTML and equations as LaTeX inline. The output_format
    parameter is accepted for API compatibility but does not change
    the output structure.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Accepted for API compatibility (default: "markdown")

    Returns:
        TextOutput containing extracted text content

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "NanonetsTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image)
    elif config_type == "NanonetsTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image)
    elif config_type == "NanonetsTextMLXConfig":
        raw_output = self._infer_mlx(pil_image)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Clean output
    cleaned_output = raw_output.replace("<|im_end|>", "").strip()

    return TextOutput(
        content=cleaned_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=cleaned_output,
        image_width=width,
        image_height=height,
        model_name=f"Nanonets-OCR2-3B ({type(self.backend_config).__name__})",
    )

mlx

MLX backend configuration for Nanonets OCR2-3B text extraction.

NanonetsTextMLXConfig

Bases: BaseModel

MLX backend configuration for Nanonets OCR2-3B text extraction.

This backend uses MLX for Apple Silicon native inference. Best for local development and testing on macOS M1/M2/M3/M4+. Requires: mlx, mlx-vlm

Note: This backend only works on Apple Silicon Macs. Do NOT use for Modal/cloud deployments.

Example
config = NanonetsTextMLXConfig(
        model="mlx-community/Nanonets-OCR2-3B-bf16",
    )

pytorch

PyTorch/HuggingFace backend configuration for Nanonets OCR2-3B text extraction.

NanonetsTextPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend configuration for Nanonets OCR2-3B text extraction.

This backend uses the transformers library with PyTorch for local GPU inference. Requires: torch, transformers, accelerate

Example
config = NanonetsTextPyTorchConfig(
        device="cuda",
        torch_dtype="float16",
    )

vllm

VLLM backend configuration for Nanonets OCR2-3B text extraction.

NanonetsTextVLLMConfig

Bases: BaseModel

VLLM backend configuration for Nanonets OCR2-3B text extraction.

This backend uses VLLM for high-throughput inference. Best for batch processing and production deployments. Requires: vllm, torch, transformers, qwen-vl-utils

Example
config = NanonetsTextVLLMConfig(
        tensor_parallel_size=1,
        gpu_memory_utilization=0.85,
    )

qwen

Qwen3-VL backend configurations and extractor for text extraction.

Available backends
  • QwenTextPyTorchConfig: PyTorch/HuggingFace backend
  • QwenTextVLLMConfig: VLLM high-throughput backend
  • QwenTextMLXConfig: MLX backend for Apple Silicon
  • QwenTextAPIConfig: API backend (OpenRouter, etc.)
Example
from omnidocs.tasks.text_extraction.qwen import QwenTextPyTorchConfig
config = QwenTextPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")

QwenTextAPIConfig

Bases: BaseModel

API backend configuration for Qwen text extraction.

Uses litellm for provider-agnostic API access. Supports OpenRouter, Gemini, Azure, OpenAI, and any other litellm-compatible provider.

API keys can be passed directly or read from environment variables.

Example
# OpenRouter (reads OPENROUTER_API_KEY from env)
config = QwenTextAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
)

# With explicit key
config = QwenTextAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
    api_key=os.environ["OPENROUTER_API_KEY"],
    api_base="https://openrouter.ai/api/v1",
)

QwenTextExtractor

QwenTextExtractor(backend: QwenTextBackendConfig)

Bases: BaseTextExtractor

Qwen3-VL Vision-Language Model text extractor.

Extracts text from document images and outputs as structured HTML or Markdown. Uses Qwen3-VL's built-in document parsing prompts.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.text_extraction import QwenTextExtractor
from omnidocs.tasks.text_extraction.qwen import QwenTextPyTorchConfig

# Initialize with PyTorch backend
extractor = QwenTextExtractor(
        backend=QwenTextPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )

# Extract as Markdown
result = extractor.extract(image, output_format="markdown")
print(result.content)

# Extract as HTML
result = extractor.extract(image, output_format="html")
print(result.content)

Initialize Qwen text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - QwenTextPyTorchConfig: PyTorch/HuggingFace backend - QwenTextVLLMConfig: VLLM high-throughput backend - QwenTextMLXConfig: MLX backend for Apple Silicon - QwenTextAPIConfig: API backend (OpenRouter, etc.)

TYPE: QwenTextBackendConfig

Source code in omnidocs/tasks/text_extraction/qwen/extractor.py
def __init__(self, backend: QwenTextBackendConfig):
    """
    Initialize Qwen text extractor.

    Args:
        backend: Backend configuration. One of:
            - QwenTextPyTorchConfig: PyTorch/HuggingFace backend
            - QwenTextVLLMConfig: VLLM high-throughput backend
            - QwenTextMLXConfig: MLX backend for Apple Silicon
            - QwenTextAPIConfig: API backend (OpenRouter, etc.)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()
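
The API backend plugs into the same constructor; for illustration, with an OpenRouter-hosted model (the key is read from OPENROUTER_API_KEY, as noted in the QwenTextAPIConfig docs):

from omnidocs.tasks.text_extraction import QwenTextExtractor
from omnidocs.tasks.text_extraction.qwen import QwenTextAPIConfig

extractor = QwenTextExtractor(
    backend=QwenTextAPIConfig(model="openrouter/qwen/qwen3-vl-8b-instruct")
)
result = extractor.extract(image, output_format="markdown")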
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Desired output format: - "html": Structured HTML with div elements - "markdown": Markdown format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format or output_format is not supported

Source code in omnidocs/tasks/text_extraction/qwen/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Desired output format:
            - "html": Structured HTML with div elements
            - "markdown": Markdown format

    Returns:
        TextOutput containing extracted text content

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format or output_format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    if output_format not in ("html", "markdown"):
        raise ValueError(f"Invalid output_format: {output_format}. Expected 'html' or 'markdown'.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Get prompt for output format
    prompt = QWEN_PROMPTS[output_format]

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "QwenTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image, prompt)
    elif config_type == "QwenTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image, prompt)
    elif config_type == "QwenTextMLXConfig":
        raw_output = self._infer_mlx(pil_image, prompt)
    elif config_type == "QwenTextAPIConfig":
        raw_output = self._infer_api(pil_image, prompt)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Clean output
    if output_format == "html":
        cleaned_output = _clean_html_output(raw_output)
    else:
        cleaned_output = _clean_markdown_output(raw_output)

    # Extract plain text
    plain_text = _extract_plain_text(raw_output, output_format)

    return TextOutput(
        content=cleaned_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=plain_text,
        image_width=width,
        image_height=height,
        model_name=f"Qwen3-VL ({type(self.backend_config).__name__})",
    )
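
The returned TextOutput keeps the cleaned content alongside the untouched model response, which helps when debugging prompts or output cleaning; for illustration:

result = extractor.extract(image, output_format="html")
print(result.content)      # cleaned HTML
print(result.plain_text)   # markup stripped
print(result.raw_output)   # raw model response, before cleaning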

QwenTextMLXConfig

Bases: BaseModel

MLX backend configuration for Qwen text extraction.

This backend uses MLX for Apple Silicon native inference. Best for local development and testing on macOS M1/M2/M3+. Requires: mlx, mlx-vlm

Note: This backend only works on Apple Silicon Macs. Do NOT use for Modal/cloud deployments.

Example
config = QwenTextMLXConfig(
        model="mlx-community/Qwen3-VL-8B-Instruct-4bit",
    )

QwenTextPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend configuration for Qwen text extraction.

This backend uses the transformers library with PyTorch for local GPU inference. Requires: torch, transformers, accelerate, qwen-vl-utils

Example
config = QwenTextPyTorchConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        device="cuda",
        torch_dtype="bfloat16",
    )

QwenTextVLLMConfig

Bases: BaseModel

VLLM backend configuration for Qwen text extraction.

This backend uses VLLM for high-throughput inference. Best for batch processing and production deployments. Requires: vllm, torch, transformers, qwen-vl-utils

Example
config = QwenTextVLLMConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
    )

api

API backend configuration for Qwen3-VL text extraction.

Uses litellm for provider-agnostic inference (OpenRouter, Gemini, Azure, etc.).

QwenTextAPIConfig

Bases: BaseModel

API backend configuration for Qwen text extraction.

Uses litellm for provider-agnostic API access. Supports OpenRouter, Gemini, Azure, OpenAI, and any other litellm-compatible provider.

API keys can be passed directly or read from environment variables.

Example
# OpenRouter (reads OPENROUTER_API_KEY from env)
config = QwenTextAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
)

# With explicit key
config = QwenTextAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
    api_key=os.environ["OPENROUTER_API_KEY"],
    api_base="https://openrouter.ai/api/v1",
)

extractor

Qwen3-VL text extractor.

A Vision-Language Model for extracting text from document images as structured HTML or Markdown.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.text_extraction import QwenTextExtractor
from omnidocs.tasks.text_extraction.qwen import QwenTextPyTorchConfig

extractor = QwenTextExtractor(
        backend=QwenTextPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )
result = extractor.extract(image, output_format="markdown")
print(result.content)
QwenTextExtractor
QwenTextExtractor(backend: QwenTextBackendConfig)

Bases: BaseTextExtractor

Qwen3-VL Vision-Language Model text extractor.

Extracts text from document images and outputs as structured HTML or Markdown. Uses Qwen3-VL's built-in document parsing prompts.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.text_extraction import QwenTextExtractor
from omnidocs.tasks.text_extraction.qwen import QwenTextPyTorchConfig

# Initialize with PyTorch backend
extractor = QwenTextExtractor(
        backend=QwenTextPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )

# Extract as Markdown
result = extractor.extract(image, output_format="markdown")
print(result.content)

# Extract as HTML
result = extractor.extract(image, output_format="html")
print(result.content)

Initialize Qwen text extractor.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - QwenTextPyTorchConfig: PyTorch/HuggingFace backend - QwenTextVLLMConfig: VLLM high-throughput backend - QwenTextMLXConfig: MLX backend for Apple Silicon - QwenTextAPIConfig: API backend (OpenRouter, etc.)

TYPE: QwenTextBackendConfig

Source code in omnidocs/tasks/text_extraction/qwen/extractor.py
def __init__(self, backend: QwenTextBackendConfig):
    """
    Initialize Qwen text extractor.

    Args:
        backend: Backend configuration. One of:
            - QwenTextPyTorchConfig: PyTorch/HuggingFace backend
            - QwenTextVLLMConfig: VLLM high-throughput backend
            - QwenTextMLXConfig: MLX backend for Apple Silicon
            - QwenTextAPIConfig: API backend (OpenRouter, etc.)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput

Extract text from an image.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

output_format

Desired output format: - "html": Structured HTML with div elements - "markdown": Markdown format

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format or output_format is not supported

Source code in omnidocs/tasks/text_extraction/qwen/extractor.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
) -> TextOutput:
    """
    Extract text from an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        output_format: Desired output format:
            - "html": Structured HTML with div elements
            - "markdown": Markdown format

    Returns:
        TextOutput containing extracted text content

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format or output_format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    if output_format not in ("html", "markdown"):
        raise ValueError(f"Invalid output_format: {output_format}. Expected 'html' or 'markdown'.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Get prompt for output format
    prompt = QWEN_PROMPTS[output_format]

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "QwenTextPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image, prompt)
    elif config_type == "QwenTextVLLMConfig":
        raw_output = self._infer_vllm(pil_image, prompt)
    elif config_type == "QwenTextMLXConfig":
        raw_output = self._infer_mlx(pil_image, prompt)
    elif config_type == "QwenTextAPIConfig":
        raw_output = self._infer_api(pil_image, prompt)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Clean output
    if output_format == "html":
        cleaned_output = _clean_html_output(raw_output)
    else:
        cleaned_output = _clean_markdown_output(raw_output)

    # Extract plain text
    plain_text = _extract_plain_text(raw_output, output_format)

    return TextOutput(
        content=cleaned_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=plain_text,
        image_width=width,
        image_height=height,
        model_name=f"Qwen3-VL ({type(self.backend_config).__name__})",
    )

mlx

MLX backend configuration for Qwen3-VL text extraction.

QwenTextMLXConfig

Bases: BaseModel

MLX backend configuration for Qwen text extraction.

This backend uses MLX for Apple Silicon native inference. Best for local development and testing on macOS M1/M2/M3+. Requires: mlx, mlx-vlm

Note: This backend only works on Apple Silicon Macs. Do NOT use for Modal/cloud deployments.

Example
config = QwenTextMLXConfig(
        model="mlx-community/Qwen3-VL-8B-Instruct-4bit",
    )

pytorch

PyTorch/HuggingFace backend configuration for Qwen3-VL text extraction.

QwenTextPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend configuration for Qwen text extraction.

This backend uses the transformers library with PyTorch for local GPU inference. Requires: torch, transformers, accelerate, qwen-vl-utils

Example
config = QwenTextPyTorchConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        device="cuda",
        torch_dtype="bfloat16",
    )

vllm

VLLM backend configuration for Qwen3-VL text extraction.

QwenTextVLLMConfig

Bases: BaseModel

VLLM backend configuration for Qwen text extraction.

This backend uses VLLM for high-throughput inference. Best for batch processing and production deployments. Requires: vllm, torch, transformers, qwen-vl-utils

Example
config = QwenTextVLLMConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
    )

vlm

VLM text extractor.

A provider-agnostic Vision-Language Model text extractor using litellm. Works with any cloud API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.text_extraction import VLMTextExtractor

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMTextExtractor(config=config)
result = extractor.extract("document.png", output_format="markdown")
print(result.content)

# With custom prompt
result = extractor.extract("document.png", prompt="Extract only table data as markdown")

VLMTextExtractor

VLMTextExtractor(config: VLMAPIConfig)

Bases: BaseTextExtractor

Provider-agnostic VLM text extractor using litellm.

Works with any cloud VLM API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc. Supports custom prompts for specialized extraction.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.text_extraction import VLMTextExtractor

# Gemini (reads GOOGLE_API_KEY from env)
config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMTextExtractor(config=config)

# Default extraction
result = extractor.extract("document.png", output_format="markdown")

# Custom prompt
result = extractor.extract(
    "document.png",
    prompt="Extract only the table data as markdown",
)

Initialize VLM text extractor.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/text_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM text extractor.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True
extract
extract(
    image: Union[Image, ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
    prompt: Optional[str] = None,
) -> TextOutput

Extract text from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

output_format

Desired output format ("html" or "markdown").

TYPE: Literal['html', 'markdown'] DEFAULT: 'markdown'

prompt

Custom prompt. If None, uses a task-specific default prompt.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
TextOutput

TextOutput containing extracted text content.

Source code in omnidocs/tasks/text_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    output_format: Literal["html", "markdown"] = "markdown",
    prompt: Optional[str] = None,
) -> TextOutput:
    """
    Extract text from an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        output_format: Desired output format ("html" or "markdown").
        prompt: Custom prompt. If None, uses a task-specific default prompt.

    Returns:
        TextOutput containing extracted text content.
    """
    if output_format not in ("html", "markdown"):
        raise ValueError(f"Invalid output_format: {output_format}. Expected 'html' or 'markdown'.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    final_prompt = prompt or DEFAULT_PROMPTS[output_format]
    raw_output = vlm_completion(self.config, final_prompt, pil_image)
    plain_text = _extract_plain_text(raw_output, output_format)

    return TextOutput(
        content=raw_output,
        format=OutputFormat(output_format),
        raw_output=raw_output,
        plain_text=plain_text,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )
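
Because VLMTextExtractor routes every request through litellm, switching providers only changes the model string; a hedged sketch with an OpenRouter-hosted model (the API key is read from the provider's usual environment variable):

from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.text_extraction import VLMTextExtractor

config = VLMAPIConfig(model="openrouter/qwen/qwen3-vl-8b-instruct")  # any litellm model id
extractor = VLMTextExtractor(config=config)
result = extractor.extract("document.png", output_format="html")
print(result.plain_text)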