
Overview

Layout Extraction Module.

Provides extractors for detecting document layout elements such as titles, text blocks, figures, tables, formulas, and captions.

Available Extractors
  • DocLayoutYOLO: YOLO-based layout detector (fast, accurate)
  • RTDETRLayoutExtractor: Transformer-based detector (more categories)
  • QwenLayoutDetector: VLM-based detector with custom label support (multi-backend)
  • MinerUVLLayoutDetector: MinerU VL 1.2B layout detector (multi-backend)
  • VLMLayoutDetector: Provider-agnostic cloud VLM detector via litellm (custom labels)
Example
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig

extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
    print(f"{box.label.value}: {box.confidence:.2f}")
# VLM-based detection with custom labels
from omnidocs.tasks.layout_extraction import QwenLayoutDetector, CustomLabel
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

detector = QwenLayoutDetector(
    backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
)
result = detector.extract(image, custom_labels=["code_block", "sidebar"])

BaseLayoutExtractor

Bases: ABC

Abstract base class for layout extractors.

All layout extraction models must inherit from this class and implement the required methods.

Example
class MyLayoutExtractor(BaseLayoutExtractor):
    def __init__(self, config: MyConfig):
        self.config = config
        self._load_model()

    def _load_model(self):
        # Load model weights
        pass

    def extract(self, image):
        # Run extraction
        return LayoutOutput(...)

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image as:
- PIL.Image.Image: PIL image object
- np.ndarray: Numpy array (HWC format, RGB)
- str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput containing detected layout boxes with standardized labels

RAISES DESCRIPTION
ValueError

If image format is not supported

RuntimeError

If model is not loaded or inference fails

Source code in omnidocs/tasks/layout_extraction/base.py
@abstractmethod
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file

    Returns:
        LayoutOutput containing detected layout boxes with standardized labels

    Raises:
        ValueError: If image format is not supported
        RuntimeError: If model is not loaded or inference fails
    """
    pass

batch_extract

batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[LayoutOutput]

Run layout extraction on multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching.

PARAMETER DESCRIPTION
images

List of images in any supported format

TYPE: List[Union[Image, ndarray, str, Path]]

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[LayoutOutput]

List of LayoutOutput in same order as input

Examples:

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images)
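
The progress callback is any callable accepting (current, total); current is 1-based and is invoked before each image is processed. A minimal sketch, reusing extractor and images from above:

def on_progress(current: int, total: int) -> None:
    # current runs from 1 to total
    print(f"Extracting {current}/{total}")

results = extractor.batch_extract(images, progress_callback=on_progress)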
Source code in omnidocs/tasks/layout_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[LayoutOutput]:
    """
    Run layout extraction on multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching.

    Args:
        images: List of images in any supported format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of LayoutOutput in same order as input

    Examples:
        ```python
        images = [doc.get_page(i) for i in range(doc.page_count)]
        results = extractor.batch_extract(images)
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(image)
        results.append(result)

    return results

extract_document

extract_document(
    document: Document,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[LayoutOutput]

Run layout extraction on all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[LayoutOutput]

List of LayoutOutput, one per page

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
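
The returned list is ordered by page, so enumerate pairs each LayoutOutput with its page number; the element_count and labels_found properties (documented below) summarize each page:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)

for page_num, layout in enumerate(results, start=1):
    print(f"page {page_num}: {layout.element_count} elements, labels={layout.labels_found}")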
Source code in omnidocs/tasks/layout_extraction/base.py
def extract_document(
    self,
    document: "Document",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[LayoutOutput]:
    """
    Run layout extraction on all pages of a document.

    Args:
        document: Document instance
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of LayoutOutput, one per page

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc)
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(page)
        results.append(result)

    return results

DocLayoutYOLO

DocLayoutYOLO(config: DocLayoutYOLOConfig)

Bases: BaseLayoutExtractor

DocLayout-YOLO layout extractor.

A YOLO-based model optimized for document layout detection. Detects: title, text, figure, table, formula, captions, etc.

This is a single-backend model (PyTorch only).

Example
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig

extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
    print(f"{box.label.value}: {box.confidence:.2f}")

Initialize DocLayout-YOLO extractor.

PARAMETER DESCRIPTION
config

Configuration object with device, model_path, etc.

TYPE: DocLayoutYOLOConfig

Source code in omnidocs/tasks/layout_extraction/doc_layout_yolo.py
def __init__(self, config: DocLayoutYOLOConfig):
    """
    Initialize DocLayout-YOLO extractor.

    Args:
        config: Configuration object with device, model_path, etc.
    """
    self.config = config
    self._model = None
    self._device = self._resolve_device(config.device)
    self._model_path = self._resolve_model_path(config.model_path)

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

Source code in omnidocs/tasks/layout_extraction/doc_layout_yolo.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        LayoutOutput with detected layout boxes
    """
    if self._model is None:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    img_width, img_height = pil_image.size

    # Run inference
    results = self._model.predict(
        pil_image,
        imgsz=self.config.img_size,
        conf=self.config.confidence,
        device=self._device,
    )

    result = results[0]

    # Parse detections
    layout_boxes = []

    if hasattr(result, "boxes") and result.boxes is not None:
        boxes = result.boxes

        for i in range(len(boxes)):
            # Get coordinates
            bbox_coords = boxes.xyxy[i].cpu().numpy().tolist()

            # Get class and confidence
            class_id = int(boxes.cls[i].item())
            confidence = float(boxes.conf[i].item())

            # Get original label from class names
            original_label = DOCLAYOUT_YOLO_CLASS_NAMES.get(class_id, f"class_{class_id}")

            # Map to standardized label
            standard_label = DOCLAYOUT_YOLO_MAPPING.to_standard(original_label)

            layout_boxes.append(
                LayoutBox(
                    label=standard_label,
                    bbox=BoundingBox.from_list(bbox_coords),
                    confidence=confidence,
                    class_id=class_id,
                    original_label=original_label,
                )
            )

    # Sort by y-coordinate (top to bottom reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=img_width,
        image_height=img_height,
        model_name="DocLayout-YOLO",
    )

DocLayoutYOLOConfig

Bases: BaseModel

Configuration for DocLayout-YOLO layout extractor.

This is a single-backend model (PyTorch only).

Example
config = DocLayoutYOLOConfig(device="cuda", confidence=0.3)
extractor = DocLayoutYOLO(config=config)

MinerUVLLayoutDetector

MinerUVLLayoutDetector(
    backend: MinerUVLLayoutBackendConfig,
)

Bases: BaseLayoutExtractor

MinerU VL layout detector.

Uses MinerU2.5-2509-1.2B for document layout detection. Detects 22+ element types including text, titles, tables, equations, figures, code, and more.

For full document extraction (layout + content), use MinerUVLTextExtractor from the text_extraction module instead.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

for box in result.bboxes:
    print(f"{box.label}: {box.confidence:.2f}")

Initialize MinerU VL layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API)

TYPE: MinerUVLLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def __init__(self, backend: MinerUVLLayoutBackendConfig):
    """
    Initialize MinerU VL layout detector.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API)
    """
    self.backend_config = backend
    self._client = None
    self._loaded = False
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Detect layout elements in the image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with standardized labels and bounding boxes

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
) -> LayoutOutput:
    """
    Detect layout elements in the image.

    Args:
        image: Input image (PIL Image, numpy array, or file path)

    Returns:
        LayoutOutput with standardized labels and bounding boxes
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Run layout detection
    blocks = self._detect_layout(pil_image)

    # Convert to LayoutOutput
    bboxes = []
    for block in blocks:
        # Convert normalized [0,1] to pixel coords
        x1, y1, x2, y2 = block.bbox
        pixel_bbox = BoundingBox(
            x1=x1 * width,
            y1=y1 * height,
            x2=x2 * width,
            y2=y2 * height,
        )

        # Map label
        label = MINERUVL_LABEL_MAPPING.get(block.type, LayoutLabel.UNKNOWN)

        bboxes.append(
            LayoutBox(
                label=label,
                bbox=pixel_bbox,
                confidence=1.0,  # MinerU VL doesn't output confidence
                original_label=block.type.value,
            )
        )

    return LayoutOutput(
        bboxes=bboxes,
        image_width=width,
        image_height=height,
        model_name="MinerU2.5-2509-1.2B",
    )

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right.

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.
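
All four properties derive from the corner convention above, for example:

box = BoundingBox(x1=10, y1=20, x2=110, y2=220)
box.width   # 100.0 (x2 - x1)
box.height  # 200.0 (y2 - y1)
box.area    # 20000.0 (width * height)
box.center  # (60.0, 120.0)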

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)

to_xywh

to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)
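
The three accessors return the same coordinates in different shapes:

box = BoundingBox(x1=10, y1=20, x2=110, y2=220)
box.to_list()  # [10.0, 20.0, 110.0, 220.0]
box.to_xyxy()  # (10.0, 20.0, 110.0, 220.0)
box.to_xywh()  # (10.0, 20.0, 100.0, 200.0) -> x, y, width, height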

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Example
bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
normalized = bbox.to_normalized(1000, 800)
# x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
Source code in omnidocs/tasks/layout_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range

    Example:
        ```python
        bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
        normalized = bbox.to_normalized(1000, 800)
        # x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
        ```
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

to_absolute

to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates
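
to_absolute inverts to_normalized. Continuing the to_normalized example (the box below is the result of normalizing (100, 50, 500, 300) on a 1000x800 image):

norm = BoundingBox(x1=102.4, y1=64.0, x2=512.0, y2=384.0)
restored = norm.to_absolute(1000, 800)
# x1: 102.4 / 1024 * 1000 = 100.0
# y1: 64.0 / 1024 * 800 = 50.0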

Source code in omnidocs/tasks/layout_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )

CustomLabel

Bases: BaseModel

Type-safe custom layout label definition for VLM-based models.

VLM models like Qwen3-VL support flexible custom labels beyond the standard LayoutLabel enum. Use this class to define custom labels with validation.

Example
from omnidocs.tasks.layout_extraction import CustomLabel

# Simple custom label
code_block = CustomLabel(name="code_block")

# With metadata
sidebar = CustomLabel(
    name="sidebar",
    description="Secondary content panel",
    color="#9B59B6",
)

# Use with QwenLayoutDetector
result = detector.extract(image, custom_labels=[code_block, sidebar])

LabelMapping

LabelMapping(mapping: Dict[str, LayoutLabel])

Base class for model-specific label mappings.

Each model maps its native labels to standardized LayoutLabel values.

Initialize label mapping.

PARAMETER DESCRIPTION
mapping

Dict mapping model-specific labels to LayoutLabel enum values

TYPE: Dict[str, LayoutLabel]

Source code in omnidocs/tasks/layout_extraction/models.py
def __init__(self, mapping: Dict[str, LayoutLabel]):
    """
    Initialize label mapping.

    Args:
        mapping: Dict mapping model-specific labels to LayoutLabel enum values
    """
    self._mapping = {k.lower(): v for k, v in mapping.items()}
    self._reverse_mapping = {v: k for k, v in mapping.items()}

supported_labels property

supported_labels: List[str]

Get list of supported model-specific labels.

standard_labels property

standard_labels: List[LayoutLabel]

Get list of standard labels this mapping produces.

to_standard

to_standard(model_label: str) -> LayoutLabel

Convert model-specific label to standardized LayoutLabel.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_standard(self, model_label: str) -> LayoutLabel:
    """Convert model-specific label to standardized LayoutLabel."""
    return self._mapping.get(model_label.lower(), LayoutLabel.UNKNOWN)

from_standard

from_standard(standard_label: LayoutLabel) -> Optional[str]

Convert standardized LayoutLabel to model-specific label.

Source code in omnidocs/tasks/layout_extraction/models.py
def from_standard(self, standard_label: LayoutLabel) -> Optional[str]:
    """Convert standardized LayoutLabel to model-specific label."""
    return self._reverse_mapping.get(standard_label)
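
A small sketch of round-tripping labels, assuming LayoutLabel defines TITLE and TEXT members (UNKNOWN is the documented fallback):

mapping = LabelMapping({
    "Title": LayoutLabel.TITLE,      # assumed member
    "Plain Text": LayoutLabel.TEXT,  # assumed member
})
mapping.to_standard("title")             # LayoutLabel.TITLE (lookup is case-insensitive)
mapping.to_standard("Sidebar")           # LayoutLabel.UNKNOWN (unmapped fallback)
mapping.from_standard(LayoutLabel.TEXT)  # "Plain Text"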

LayoutBox

Bases: BaseModel

Single detected layout element with label, bounding box, and confidence.

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "label": self.label.value,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "class_id": self.class_id,
        "original_label": self.original_label,
    }

get_normalized_bbox

get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

LayoutLabel

Bases: str, Enum

Standardized layout labels used across all layout extractors.

These provide a consistent vocabulary regardless of which model is used.
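
Because LayoutLabel subclasses str and Enum, members compare and serialize as plain strings. A minimal sketch (the exact member set is not listed on this page; UNKNOWN is the fallback used by LabelMapping):

for label in LayoutLabel:
    print(label.value)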

LayoutOutput

Bases: BaseModel

Complete layout extraction results for a single image.

element_count property

element_count: int

Number of detected elements.

labels_found property

labels_found: List[str]

Unique labels found in detections.

filter_by_label

filter_by_label(label: LayoutLabel) -> List[LayoutBox]

Filter boxes by label.

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_label(self, label: LayoutLabel) -> List[LayoutBox]:
    """Filter boxes by label."""
    return [box for box in self.bboxes if box.label == label]

filter_by_confidence

filter_by_confidence(
    min_confidence: float,
) -> List[LayoutBox]

Filter boxes by minimum confidence.

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[LayoutBox]:
    """Filter boxes by minimum confidence."""
    return [box for box in self.bboxes if box.confidence >= min_confidence]
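
The two filters compose with ordinary list operations. A sketch assuming LayoutLabel defines a TABLE member:

result = extractor.extract(image)
confident = result.filter_by_confidence(0.5)
tables = [b for b in result.filter_by_label(LayoutLabel.TABLE) if b.confidence >= 0.5]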

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "bboxes": [box.to_dict() for box in self.bboxes],
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "element_count": self.element_count,
        "labels_found": self.labels_found,
    }

sort_by_position

sort_by_position(
    top_to_bottom: bool = True,
) -> LayoutOutput

Return a new LayoutOutput with boxes sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

Source code in omnidocs/tasks/layout_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "LayoutOutput":
    """
    Return a new LayoutOutput with boxes sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)
    """
    sorted_boxes = sorted(self.bboxes, key=lambda b: (b.bbox.y1, b.bbox.x1), reverse=not top_to_bottom)
    return LayoutOutput(
        bboxes=sorted_boxes,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
    )

get_normalized_bboxes

get_normalized_bboxes() -> List[Dict]

Get all bounding boxes in normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.

Example
result = extractor.extract(image)
normalized = result.get_normalized_bboxes()
for box in normalized:
        print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bboxes(self) -> List[Dict]:
    """
    Get all bounding boxes in normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.

    Example:
        ```python
        result = extractor.extract(image)
        normalized = result.get_normalized_bboxes()
        for box in normalized:
                print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
        ```
    """
    normalized = []
    for box in self.bboxes:
        norm_bbox = box.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "label": box.label.value,
                "bbox": norm_bbox.to_list(),
                "confidence": box.confidence,
                "class_id": box.class_id,
                "original_label": box.original_label,
            }
        )
    return normalized

visualize

visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> Image.Image

Visualize layout detection results on the image.

Draws bounding boxes with labels and confidence scores on the image. Each layout category has a distinct color for easy identification.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_labels

Whether to show label text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: True

line_width

Width of bounding box lines

TYPE: int DEFAULT: 3

font_size

Size of label text (note: uses default font)

TYPE: int DEFAULT: 12

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = extractor.extract(image)
viz = result.visualize(image, output_path="layout_viz.png")
viz.show()  # Display in notebook/viewer
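
The drawing options map directly to the parameters above, e.g.:

viz = result.visualize(
    image,
    show_confidence=False,  # draw label text only
    line_width=5,
)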
Source code in omnidocs/tasks/layout_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> "Image.Image":
    """
    Visualize layout detection results on the image.

    Draws bounding boxes with labels and confidence scores on the image.
    Each layout category has a distinct color for easy identification.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_labels: Whether to show label text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        font_size: Size of label text (note: uses default font)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = extractor.extract(image)
        viz = result.visualize(image, output_path="layout_viz.png")
        viz.show()  # Display in notebook/viewer
        ```
    """
    from PIL import ImageDraw

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    for box in self.bboxes:
        # Get color for this label
        color = LABEL_COLORS.get(box.label, "#95A5A6")

        # Draw bounding box
        coords = box.bbox.to_xyxy()
        draw.rectangle(coords, outline=color, width=line_width)

        # Build label text
        if show_labels or show_confidence:
            label_parts = []
            if show_labels:
                label_parts.append(box.label.value)
            if show_confidence:
                label_parts.append(f"{box.confidence:.2f}")
            label_text = " ".join(label_parts)

            # Draw label background
            text_bbox = draw.textbbox((coords[0], coords[1] - 20), label_text)
            draw.rectangle(text_bbox, fill=color)

            # Draw label text
            draw.text(
                (coords[0], coords[1] - 20),
                label_text,
                fill="white",
            )

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image

load_json classmethod

load_json(file_path: Union[str, Path]) -> LayoutOutput

Load a LayoutOutput instance from a JSON file.

Reads a JSON file and deserializes its contents into a LayoutOutput object. Uses Pydantic's model_validate_json for proper handling of nested objects.

PARAMETER DESCRIPTION
file_path

Path to JSON file containing serialized LayoutOutput data. Can be string or pathlib.Path object.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
LayoutOutput

Deserialized layout output instance from file.

TYPE: LayoutOutput

RAISES DESCRIPTION
FileNotFoundError

If the specified file does not exist.

UnicodeDecodeError

If file cannot be decoded as UTF-8.

ValueError

If file contents are not valid JSON.

ValidationError

If JSON data doesn't match LayoutOutput schema.

Example

output = LayoutOutput.load_json('layout_results.json')
print(f"Found {output.element_count} elements")
# Found 5 elements

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "LayoutOutput":
    """
    Load a LayoutOutput instance from a JSON file.

    Reads a JSON file and deserializes its contents into a LayoutOutput object.
    Uses Pydantic's model_validate_json for proper handling of nested objects.

    Args:
        file_path: Path to JSON file containing serialized LayoutOutput data.
                  Can be string or pathlib.Path object.

    Returns:
        LayoutOutput: Deserialized layout output instance from file.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        UnicodeDecodeError: If file cannot be decoded as UTF-8.
        ValueError: If file contents are not valid JSON.
        ValidationError: If JSON data doesn't match LayoutOutput schema.

    Example:
        ```python
        output = LayoutOutput.load_json('layout_results.json')
        print(f"Found {output.element_count} elements")
        # Found 5 elements
        ```
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

save_json

save_json(file_path: Union[str, Path]) -> None

Save LayoutOutput instance to a JSON file.

Serializes the LayoutOutput object to JSON and writes it to a file. Automatically creates parent directories if they don't exist. Uses UTF-8 encoding for compatibility and proper handling of special characters.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved. Can be string or pathlib.Path object. Parent directories will be created if they don't exist.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
None

None

RAISES DESCRIPTION
OSError

If file cannot be written due to permission or disk errors.

TypeError

If file_path is not a string or Path object.

Example
output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
output.save_json('results/layout_output.json')
# File is created at results/layout_output.json
# Parent 'results' directory is created if it didn't exist
Source code in omnidocs/tasks/layout_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save LayoutOutput instance to a JSON file.

    Serializes the LayoutOutput object to JSON and writes it to a file.
    Automatically creates parent directories if they don't exist. Uses UTF-8
    encoding for compatibility and proper handling of special characters.

    Args:
        file_path: Path where JSON file should be saved. Can be string or
                  pathlib.Path object. Parent directories will be created
                  if they don't exist.

    Returns:
        None

    Raises:
        OSError: If file cannot be written due to permission or disk errors.
        TypeError: If file_path is not a string or Path object.

    Example:
        ```python
        output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
        output.save_json('results/layout_output.json')
        # File is created at results/layout_output.json
        # Parent 'results' directory is created if it didn't exist
        ```
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(), encoding="utf-8")

QwenLayoutDetector

QwenLayoutDetector(backend: QwenLayoutBackendConfig)

Bases: BaseLayoutExtractor

Qwen3-VL Vision-Language Model layout detector.

A flexible VLM-based layout detector that supports custom labels. Unlike fixed-label models (DocLayoutYOLO, RT-DETR), Qwen can detect any document elements specified at runtime.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.layout_extraction import QwenLayoutDetector, CustomLabel
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

# Initialize with PyTorch backend
detector = QwenLayoutDetector(
    backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
)

# Basic extraction with default labels
result = detector.extract(image)

# With custom labels (strings)
result = detector.extract(image, custom_labels=["code_block", "sidebar"])

# With typed custom labels
labels = [
    CustomLabel(name="code_block", color="#E74C3C"),
    CustomLabel(name="sidebar", description="Side panel content"),
]
result = detector.extract(image, custom_labels=labels)

Initialize Qwen layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration. One of:
- QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
- QwenLayoutVLLMConfig: VLLM high-throughput backend
- QwenLayoutMLXConfig: MLX backend for Apple Silicon
- QwenLayoutAPIConfig: API backend (OpenRouter, etc.)

TYPE: QwenLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def __init__(self, backend: QwenLayoutBackendConfig):
    """
    Initialize Qwen layout detector.

    Args:
        backend: Backend configuration. One of:
            - QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
            - QwenLayoutVLLMConfig: VLLM high-throughput backend
            - QwenLayoutMLXConfig: MLX backend for Apple Silicon
            - QwenLayoutAPIConfig: API backend (OpenRouter, etc.)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image as:
- PIL.Image.Image: PIL image object
- np.ndarray: Numpy array (HWC format, RGB)
- str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be:
- None: Use default labels (title, text, table, figure, etc.)
- List[str]: Simple label names ["code_block", "sidebar"]
- List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format is not supported

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata

    Returns:
        LayoutOutput with detected layout boxes

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build prompt
    prompt = self._build_detection_prompt(label_names)

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "QwenLayoutPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image, prompt)
    elif config_type == "QwenLayoutVLLMConfig":
        raw_output = self._infer_vllm(pil_image, prompt)
    elif config_type == "QwenLayoutMLXConfig":
        raw_output = self._infer_mlx(pil_image, prompt)
    elif config_type == "QwenLayoutAPIConfig":
        raw_output = self._infer_api(pil_image, prompt)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Parse detections
    detections = self._parse_json_output(raw_output)

    # Convert to LayoutOutput
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by position (reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"Qwen3-VL ({type(self.backend_config).__name__})",
    )

RTDETRConfig

Bases: BaseModel

Configuration for RT-DETR layout extractor.

This is a single-backend model (PyTorch/Transformers only).

Example
config = RTDETRConfig(device="cuda", confidence=0.4)
extractor = RTDETRLayoutExtractor(config=config)

RTDETRLayoutExtractor

RTDETRLayoutExtractor(config: RTDETRConfig)

Bases: BaseLayoutExtractor

RT-DETR layout extractor using HuggingFace Transformers.

A transformer-based real-time detection model for document layout. Detects: title, text, table, figure, list, formula, captions, headers, footers.

This is a single-backend model (PyTorch/Transformers only).

Example
from omnidocs.tasks.layout_extraction import RTDETRLayoutExtractor, RTDETRConfig

extractor = RTDETRLayoutExtractor(config=RTDETRConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
    print(f"{box.label.value}: {box.confidence:.2f}")

Initialize RT-DETR layout extractor.

PARAMETER DESCRIPTION
config

Configuration object with device, model settings, etc.

TYPE: RTDETRConfig

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def __init__(self, config: RTDETRConfig):
    """
    Initialize RT-DETR layout extractor.

    Args:
        config: Configuration object with device, model settings, etc.
    """
    self.config = config
    self._model = None
    self._processor = None
    self._device = self._resolve_device(config.device)
    self._model_path = self._resolve_model_path(config.model_path)

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        LayoutOutput with detected layout boxes
    """
    import torch

    if self._model is None or self._processor is None:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    img_width, img_height = pil_image.size

    # Preprocess
    inputs = self._processor(
        images=pil_image,
        return_tensors="pt",
        size={"height": self.config.image_size, "width": self.config.image_size},
    )

    # Move to device
    inputs = {k: v.to(self._device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    # Run inference
    with torch.no_grad():
        outputs = self._model(**inputs)

    # Post-process results
    target_sizes = torch.tensor([[img_height, img_width]])
    results = self._processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.config.confidence,
    )[0]

    # Parse detections
    layout_boxes = []

    for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
        confidence = float(score.item())
        class_id = int(label_id.item())

        # Get original label from model config
        # Note: The model outputs 0-indexed class IDs, but id2label has background at index 0,
        # so we add 1 to map correctly (e.g., model output 8 -> id2label[9] = "Table")
        original_label = self._model.config.id2label.get(class_id + 1, f"class_{class_id}")

        # Map to standardized label
        standard_label = RTDETR_MAPPING.to_standard(original_label)

        # Box coordinates
        box_coords = box.cpu().tolist()

        layout_boxes.append(
            LayoutBox(
                label=standard_label,
                bbox=BoundingBox.from_list(box_coords),
                confidence=confidence,
                class_id=class_id,
                original_label=original_label,
            )
        )

    # Sort by y-coordinate (top to bottom reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=img_width,
        image_height=img_height,
        model_name="RT-DETR (docling-layout)",
    )

VLMLayoutDetector

VLMLayoutDetector(config: VLMAPIConfig)

Bases: BaseLayoutExtractor

Provider-agnostic VLM layout detector using litellm.

Works with any cloud VLM API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc. Supports custom labels for flexible detection.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)

# Default labels
result = detector.extract("document.png")

# Custom labels
result = detector.extract("document.png", custom_labels=["code_block", "sidebar"])

Initialize VLM layout detector.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/layout_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM layout detector.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be:
- None: Use default labels (title, text, table, figure, etc.)
- List[str]: Simple label names ["code_block", "sidebar"]
- List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

prompt

Custom prompt. If None, builds a default detection prompt.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes.
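
Passing prompt overrides the default detection prompt entirely, so a custom prompt should still ask for the response format the detector's parser expects. An illustrative sketch (the prompt wording is hypothetical):

result = detector.extract(
    "document.png",
    prompt="Locate every table and figure and return their bounding boxes as JSON.",
)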

Source code in omnidocs/tasks/layout_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata
        prompt: Custom prompt. If None, builds a default detection prompt.

    Returns:
        LayoutOutput with detected layout boxes.
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build or use custom prompt
    final_prompt = prompt or _build_layout_prompt(label_names)

    raw_output = vlm_completion(self.config, final_prompt, pil_image)
    detections = _parse_layout_response(raw_output, (width, height))
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by reading order
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )

base

Base class for layout extractors.

Defines the abstract interface that all layout extractors must implement.


doc_layout_yolo

DocLayout-YOLO layout extractor.

A YOLO-based model for document layout detection, optimized for academic papers and technical documents.

Model: juliozhao/DocLayout-YOLO-DocStructBench


mineruvl

MinerU VL layout detection module.

MinerU VL can be used for standalone layout detection, returning detected regions with types and bounding boxes.

For full document extraction (layout + content), use MinerUVLTextExtractor from the text_extraction module instead.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

for box in result.bboxes:
    print(f"{box.label}: {box.confidence:.2f}")

MinerUVLLayoutAPIConfig

Bases: BaseModel

API backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutAPIConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutAPIConfig(
        server_url="https://your-server.modal.run"
    )
)
result = detector.extract(image)


MinerUVLLayoutMLXConfig

Bases: BaseModel

MLX backend config for MinerU VL layout detection on Apple Silicon.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutMLXConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutMLXConfig()
)
result = detector.extract(image)

MinerUVLLayoutPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

MinerUVLLayoutVLLMConfig

Bases: BaseModel

VLLM backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutVLLMConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutVLLMConfig(tensor_parallel_size=1)
)
result = detector.extract(image)

api

API backend configuration for MinerU VL layout detection.


detector

MinerU VL layout detector.

Uses MinerU2.5-2509-1.2B for document layout detection. Detects 22+ element types including text, titles, tables, equations, figures, code, and more.


mlx

MLX backend configuration for MinerU VL layout detection (Apple Silicon).


pytorch

PyTorch backend configuration for MinerU VL layout detection.


vllm

VLLM backend configuration for MinerU VL layout detection.


models

Pydantic models for layout extraction outputs.

Defines standardized output types and label enums for layout detection.

Coordinate Systems
  • Absolute (default): Coordinates in pixels relative to original image size
  • Normalized (0-1024): Coordinates scaled to 0-1024 range (virtual 1024x1024 canvas)

Use bbox.to_normalized(width, height) or output.get_normalized_bboxes() to convert to normalized coordinates.

Example
result = extractor.extract(image)  # Returns absolute pixel coordinates
normalized = result.get_normalized_bboxes()  # Returns 0-1024 normalized coords
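The two representations round-trip exactly. A short sketch using the BoundingBox conversions documented below (values chosen for easy arithmetic):

bbox = BoundingBox(x1=200, y1=100, x2=600, y2=400)
norm = bbox.to_normalized(image_width=800, image_height=800)  # x1 -> 200 / 800 * 1024 = 256.0
back = norm.to_absolute(image_width=800, image_height=800)    # x1 -> 256 / 1024 * 800 = 200.0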

LayoutLabel

Bases: str, Enum

Standardized layout labels used across all layout extractors.

These provide a consistent vocabulary regardless of which model is used.
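The individual members are not enumerated on this page; UNKNOWN is the fallback that label mappings return for unrecognized model labels. A short sketch, assuming the enum also defines a TABLE member (import path inferred from the module location):

from omnidocs.tasks.layout_extraction.models import LayoutLabel

result = extractor.extract(image)
for box in result.filter_by_label(LayoutLabel.TABLE):  # TABLE member assumed
    print(box.label.value, box.confidence)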

CustomLabel

Bases: BaseModel

Type-safe custom layout label definition for VLM-based models.

VLM models like Qwen3-VL support flexible custom labels beyond the standard LayoutLabel enum. Use this class to define custom labels with validation.

Example
from omnidocs.tasks.layout_extraction import CustomLabel

# Simple custom label
code_block = CustomLabel(name="code_block")

# With metadata
sidebar = CustomLabel(
        name="sidebar",
        description="Secondary content panel",
        color="#9B59B6",
    )

# Use with QwenLayoutDetector
result = detector.extract(image, custom_labels=[code_block, sidebar])

LabelMapping

LabelMapping(mapping: Dict[str, LayoutLabel])

Base class for model-specific label mappings.

Each model maps its native labels to standardized LayoutLabel values.

Initialize label mapping.

PARAMETER DESCRIPTION
mapping

Dict mapping model-specific labels to LayoutLabel enum values

TYPE: Dict[str, LayoutLabel]

Source code in omnidocs/tasks/layout_extraction/models.py
def __init__(self, mapping: Dict[str, LayoutLabel]):
    """
    Initialize label mapping.

    Args:
        mapping: Dict mapping model-specific labels to LayoutLabel enum values
    """
    self._mapping = {k.lower(): v for k, v in mapping.items()}
    self._reverse_mapping = {v: k for k, v in mapping.items()}

supported_labels property

supported_labels: List[str]

Get list of supported model-specific labels.

standard_labels property

standard_labels: List[LayoutLabel]

Get list of standard labels this mapping produces.

to_standard

to_standard(model_label: str) -> LayoutLabel

Convert model-specific label to standardized LayoutLabel.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_standard(self, model_label: str) -> LayoutLabel:
    """Convert model-specific label to standardized LayoutLabel."""
    return self._mapping.get(model_label.lower(), LayoutLabel.UNKNOWN)

from_standard

from_standard(standard_label: LayoutLabel) -> Optional[str]

Convert standardized LayoutLabel to model-specific label.

Source code in omnidocs/tasks/layout_extraction/models.py
def from_standard(self, standard_label: LayoutLabel) -> Optional[str]:
    """Convert standardized LayoutLabel to model-specific label."""
    return self._reverse_mapping.get(standard_label)
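As a concrete sketch, a mapping for a hypothetical model with native labels "Plain Text" and "Table" (the TEXT and TABLE enum members are assumed):

mapping = LabelMapping({
    "Plain Text": LayoutLabel.TEXT,   # assumed enum member
    "Table": LayoutLabel.TABLE,       # assumed enum member
})

mapping.to_standard("plain text")         # LayoutLabel.TEXT (lookup is case-insensitive)
mapping.to_standard("footnote")           # LayoutLabel.UNKNOWN (unmapped)
mapping.from_standard(LayoutLabel.TABLE)  # "Table" (original casing is preserved)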

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right.

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)

to_xywh

to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])
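Putting the constructor and accessors together (assuming the usual x2 - x1 and y2 - y1 definitions for width and height):

bbox = BoundingBox.from_list([100, 50, 500, 300])
bbox.width      # 400.0
bbox.height     # 250.0
bbox.area       # 100000.0
bbox.center     # (300.0, 175.0)
bbox.to_xywh()  # (100.0, 50.0, 400.0, 250.0)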

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Example
bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
normalized = bbox.to_normalized(1000, 800)
# x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
Source code in omnidocs/tasks/layout_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range

    Example:
        ```python
        bbox = BoundingBox(x1=100, y1=50, x2=500, y2=300)
        normalized = bbox.to_normalized(1000, 800)
        # x: 100/1000*1024 = 102.4, y: 50/800*1024 = 64
        ```
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

to_absolute

to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates

Source code in omnidocs/tasks/layout_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )

LayoutBox

Bases: BaseModel

Single detected layout element with label, bounding box, and confidence.

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "label": self.label.value,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "class_id": self.class_id,
        "original_label": self.original_label,
    }

get_normalized_bbox

get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

LayoutOutput

Bases: BaseModel

Complete layout extraction results for a single image.

element_count property

element_count: int

Number of detected elements.

labels_found property

labels_found: List[str]

Unique labels found in detections.

filter_by_label

filter_by_label(label: LayoutLabel) -> List[LayoutBox]

Filter boxes by label.

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_label(self, label: LayoutLabel) -> List[LayoutBox]:
    """Filter boxes by label."""
    return [box for box in self.bboxes if box.label == label]

filter_by_confidence

filter_by_confidence(
    min_confidence: float,
) -> List[LayoutBox]

Filter boxes by minimum confidence.

Source code in omnidocs/tasks/layout_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[LayoutBox]:
    """Filter boxes by minimum confidence."""
    return [box for box in self.bboxes if box.confidence >= min_confidence]
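Both filters return plain lists of LayoutBox rather than a new LayoutOutput, so combining label and confidence criteria takes one comprehension (FIGURE member assumed):

result = extractor.extract(image)
figures = [
    box for box in result.filter_by_label(LayoutLabel.FIGURE)  # FIGURE assumed
    if box.confidence >= 0.5
]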

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/layout_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "bboxes": [box.to_dict() for box in self.bboxes],
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "element_count": self.element_count,
        "labels_found": self.labels_found,
    }

sort_by_position

sort_by_position(
    top_to_bottom: bool = True,
) -> LayoutOutput

Return a new LayoutOutput with boxes sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

Source code in omnidocs/tasks/layout_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "LayoutOutput":
    """
    Return a new LayoutOutput with boxes sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)
    """
    sorted_boxes = sorted(self.bboxes, key=lambda b: (b.bbox.y1, b.bbox.x1), reverse=not top_to_bottom)
    return LayoutOutput(
        bboxes=sorted_boxes,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
    )
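Note that the (y1, x1) key approximates reading order for single-column pages; two-column layouts will interleave the columns. Usage:

ordered = result.sort_by_position()  # top-to-bottom, then left-to-right
for box in ordered.bboxes:
    print(box.label.value, box.bbox.y1)

bottom_up = result.sort_by_position(top_to_bottom=False)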

get_normalized_bboxes

get_normalized_bboxes() -> List[Dict]

Get all bounding boxes in normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.

Example
result = extractor.extract(image)
normalized = result.get_normalized_bboxes()
for box in normalized:
        print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
Source code in omnidocs/tasks/layout_extraction/models.py
def get_normalized_bboxes(self) -> List[Dict]:
    """
    Get all bounding boxes in normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.

    Example:
        ```python
        result = extractor.extract(image)
        normalized = result.get_normalized_bboxes()
        for box in normalized:
                print(f"{box['label']}: {box['bbox']}")  # coords in 0-1024 range
        ```
    """
    normalized = []
    for box in self.bboxes:
        norm_bbox = box.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "label": box.label.value,
                "bbox": norm_bbox.to_list(),
                "confidence": box.confidence,
                "class_id": box.class_id,
                "original_label": box.original_label,
            }
        )
    return normalized

visualize

visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> Image.Image

Visualize layout detection results on the image.

Draws bounding boxes with labels and confidence scores on the image. Each layout category has a distinct color for easy identification.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_labels

Whether to show label text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: True

line_width

Width of bounding box lines

TYPE: int DEFAULT: 3

font_size

Size of label text (note: uses default font)

TYPE: int DEFAULT: 12

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = extractor.extract(image)
viz = result.visualize(image, output_path="layout_viz.png")
viz.show()  # Display in notebook/viewer
Source code in omnidocs/tasks/layout_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_labels: bool = True,
    show_confidence: bool = True,
    line_width: int = 3,
    font_size: int = 12,
) -> "Image.Image":
    """
    Visualize layout detection results on the image.

    Draws bounding boxes with labels and confidence scores on the image.
    Each layout category has a distinct color for easy identification.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_labels: Whether to show label text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        font_size: Size of label text (note: uses default font)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = extractor.extract(image)
        viz = result.visualize(image, output_path="layout_viz.png")
        viz.show()  # Display in notebook/viewer
        ```
    """
    from PIL import ImageDraw

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    for box in self.bboxes:
        # Get color for this label
        color = LABEL_COLORS.get(box.label, "#95A5A6")

        # Draw bounding box
        coords = box.bbox.to_xyxy()
        draw.rectangle(coords, outline=color, width=line_width)

        # Build label text
        if show_labels or show_confidence:
            label_parts = []
            if show_labels:
                label_parts.append(box.label.value)
            if show_confidence:
                label_parts.append(f"{box.confidence:.2f}")
            label_text = " ".join(label_parts)

            # Draw label background
            text_bbox = draw.textbbox((coords[0], coords[1] - 20), label_text)
            draw.rectangle(text_bbox, fill=color)

            # Draw label text
            draw.text(
                (coords[0], coords[1] - 20),
                label_text,
                fill="white",
            )

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image

load_json classmethod

load_json(file_path: Union[str, Path]) -> LayoutOutput

Load a LayoutOutput instance from a JSON file.

Reads a JSON file and deserializes its contents into a LayoutOutput object. Uses Pydantic's model_validate_json for proper handling of nested objects.

PARAMETER DESCRIPTION
file_path

Path to JSON file containing serialized LayoutOutput data. Can be string or pathlib.Path object.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
LayoutOutput

Deserialized layout output instance from file.

TYPE: LayoutOutput

RAISES DESCRIPTION
FileNotFoundError

If the specified file does not exist.

UnicodeDecodeError

If file cannot be decoded as UTF-8.

ValueError

If file contents are not valid JSON.

ValidationError

If JSON data doesn't match LayoutOutput schema.

Example

output = LayoutOutput.load_json('layout_results.json')
print(f"Found {output.element_count} elements")
# Found 5 elements

Source code in omnidocs/tasks/layout_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "LayoutOutput":
    """
    Load a LayoutOutput instance from a JSON file.

    Reads a JSON file and deserializes its contents into a LayoutOutput object.
    Uses Pydantic's model_validate_json for proper handling of nested objects.

    Args:
        file_path: Path to JSON file containing serialized LayoutOutput data.
                  Can be string or pathlib.Path object.

    Returns:
        LayoutOutput: Deserialized layout output instance from file.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        UnicodeDecodeError: If file cannot be decoded as UTF-8.
        ValueError: If file contents are not valid JSON.
        ValidationError: If JSON data doesn't match LayoutOutput schema.

    Example:
        ```python
        output = LayoutOutput.load_json('layout_results.json')
        print(f"Found {output.element_count} elements")
        ```
        Found 5 elements
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

save_json

save_json(file_path: Union[str, Path]) -> None

Save LayoutOutput instance to a JSON file.

Serializes the LayoutOutput object to JSON and writes it to a file. Automatically creates parent directories if they don't exist. Uses UTF-8 encoding for compatibility and proper handling of special characters.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved. Can be string or pathlib.Path object. Parent directories will be created if they don't exist.

TYPE: Union[str, Path]

RETURNS DESCRIPTION
None

None

RAISES DESCRIPTION
OSError

If file cannot be written due to permission or disk errors.

TypeError

If file_path is not a string or Path object.

Example
output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
output.save_json('results/layout_output.json')
# File is created at results/layout_output.json
# Parent 'results' directory is created if it didn't exist
Source code in omnidocs/tasks/layout_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save LayoutOutput instance to a JSON file.

    Serializes the LayoutOutput object to JSON and writes it to a file.
    Automatically creates parent directories if they don't exist. Uses UTF-8
    encoding for compatibility and proper handling of special characters.

    Args:
        file_path: Path where JSON file should be saved. Can be string or
                  pathlib.Path object. Parent directories will be created
                  if they don't exist.

    Returns:
        None

    Raises:
        OSError: If file cannot be written due to permission or disk errors.
        TypeError: If file_path is not a string or Path object.

    Example:
        ```python
        output = LayoutOutput(bboxes=[], image_width=800, image_height=600)
        output.save_json('results/layout_output.json')
        # File is created at results/layout_output.json
        # Parent 'results' directory is created if it didn't exist
        ```
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(), encoding="utf-8")

qwen

Qwen3-VL backend configurations and detector for layout detection.

Available backends
  • QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
  • QwenLayoutVLLMConfig: VLLM high-throughput backend
  • QwenLayoutMLXConfig: MLX backend for Apple Silicon
  • QwenLayoutAPIConfig: API backend (OpenRouter, etc.)
Example
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig
config = QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")

QwenLayoutAPIConfig

Bases: BaseModel

API backend configuration for Qwen layout detection.

Uses litellm for provider-agnostic API access. Supports OpenRouter, Gemini, Azure, OpenAI, and any other litellm-compatible provider.

API keys can be passed directly or read from environment variables.

Example
# OpenRouter (reads OPENROUTER_API_KEY from env)
config = QwenLayoutAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
)

# With explicit key
config = QwenLayoutAPIConfig(
    model="openrouter/qwen/qwen3-vl-8b-instruct",
    api_key=os.environ["OPENROUTER_API_KEY"],
    api_base="https://openrouter.ai/api/v1",
)

QwenLayoutDetector

QwenLayoutDetector(backend: QwenLayoutBackendConfig)

Bases: BaseLayoutExtractor

Qwen3-VL Vision-Language Model layout detector.

A flexible VLM-based layout detector that supports custom labels. Unlike fixed-label models (DocLayoutYOLO, RT-DETR), Qwen can detect any document elements specified at runtime.

Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.layout_extraction import QwenLayoutDetector, CustomLabel
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

# Initialize with PyTorch backend
detector = QwenLayoutDetector(
        backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )

# Basic extraction with default labels
result = detector.extract(image)

# With custom labels (strings)
result = detector.extract(image, custom_labels=["code_block", "sidebar"])

# With typed custom labels
labels = [
        CustomLabel(name="code_block", color="#E74C3C"),
        CustomLabel(name="sidebar", description="Side panel content"),
    ]
result = detector.extract(image, custom_labels=labels)

Initialize Qwen layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration. One of: - QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend - QwenLayoutVLLMConfig: VLLM high-throughput backend - QwenLayoutMLXConfig: MLX backend for Apple Silicon - QwenLayoutAPIConfig: API backend (OpenRouter, etc.)

TYPE: QwenLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def __init__(self, backend: QwenLayoutBackendConfig):
    """
    Initialize Qwen layout detector.

    Args:
        backend: Backend configuration. One of:
            - QwenLayoutPyTorchConfig: PyTorch/HuggingFace backend
            - QwenLayoutVLLMConfig: VLLM high-throughput backend
            - QwenLayoutMLXConfig: MLX backend for Apple Silicon
            - QwenLayoutAPIConfig: API backend (OpenRouter, etc.)
    """
    self.backend_config = backend
    self._backend: Any = None
    self._processor: Any = None
    self._loaded = False

    # Backend-specific helpers
    self._process_vision_info: Any = None
    self._sampling_params_class: Any = None
    self._mlx_config: Any = None
    self._apply_chat_template: Any = None
    self._generate: Any = None

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image as: - PIL.Image.Image: PIL image object - np.ndarray: Numpy array (HWC format, RGB) - str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be: - None: Use default labels (title, text, table, figure, etc.) - List[str]: Simple label names ["code_block", "sidebar"] - List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

RAISES DESCRIPTION
RuntimeError

If model is not loaded

ValueError

If image format is not supported

Source code in omnidocs/tasks/layout_extraction/qwen/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata

    Returns:
        LayoutOutput with detected layout boxes

    Raises:
        RuntimeError: If model is not loaded
        ValueError: If image format is not supported
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build prompt
    prompt = self._build_detection_prompt(label_names)

    # Run inference based on backend
    config_type = type(self.backend_config).__name__
    if config_type == "QwenLayoutPyTorchConfig":
        raw_output = self._infer_pytorch(pil_image, prompt)
    elif config_type == "QwenLayoutVLLMConfig":
        raw_output = self._infer_vllm(pil_image, prompt)
    elif config_type == "QwenLayoutMLXConfig":
        raw_output = self._infer_mlx(pil_image, prompt)
    elif config_type == "QwenLayoutAPIConfig":
        raw_output = self._infer_api(pil_image, prompt)
    else:
        raise RuntimeError(f"Unknown backend: {config_type}")

    # Parse detections
    detections = self._parse_json_output(raw_output)

    # Convert to LayoutOutput
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by position (reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"Qwen3-VL ({type(self.backend_config).__name__})",
    )

QwenLayoutMLXConfig

Bases: BaseModel

MLX backend configuration for Qwen layout detection.

This backend uses MLX for Apple Silicon native inference. Best for local development and testing on macOS M1/M2/M3+. Requires: mlx, mlx-vlm

Note: This backend only works on Apple Silicon Macs. Do NOT use for Modal/cloud deployments.

Example
config = QwenLayoutMLXConfig(
        model="mlx-community/Qwen3-VL-8B-Instruct-4bit",
    )

QwenLayoutPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend configuration for Qwen layout detection.

This backend uses the transformers library with PyTorch for local GPU inference. Requires: torch, transformers, accelerate, qwen-vl-utils

Example
config = QwenLayoutPyTorchConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        device="cuda",
        torch_dtype="bfloat16",
    )

QwenLayoutVLLMConfig

Bases: BaseModel

VLLM backend configuration for Qwen layout detection.

This backend uses VLLM for high-throughput inference. Best for batch processing and production deployments. Requires: vllm, torch, transformers, qwen-vl-utils

Example
config = QwenLayoutVLLMConfig(
        model="Qwen/Qwen3-VL-8B-Instruct",
        tensor_parallel_size=2,
        gpu_memory_utilization=0.9,
    )

api

API backend configuration for Qwen3-VL layout detection.

Uses litellm for provider-agnostic inference (OpenRouter, Gemini, Azure, etc.).


detector

Qwen3-VL layout detector.

A Vision-Language Model for flexible layout detection with custom label support. Supports PyTorch, VLLM, MLX, and API backends.

Example
from omnidocs.tasks.layout_extraction import QwenLayoutDetector
from omnidocs.tasks.layout_extraction.qwen import QwenLayoutPyTorchConfig

detector = QwenLayoutDetector(
        backend=QwenLayoutPyTorchConfig(model="Qwen/Qwen3-VL-8B-Instruct")
    )
result = detector.extract(image)

# With custom labels
result = detector.extract(image, custom_labels=["code_block", "sidebar"])


mlx

MLX backend configuration for Qwen3-VL layout detection.


pytorch

PyTorch/HuggingFace backend configuration for Qwen3-VL layout detection.


vllm

VLLM backend configuration for Qwen3-VL layout detection.


rtdetr

RT-DETR layout extractor.

A transformer-based real-time detection model for document layout detection. Uses HuggingFace Transformers implementation.

Model: HuggingPanda/docling-layout

RTDETRConfig

Bases: BaseModel

Configuration for RT-DETR layout extractor.

This is a single-backend model (PyTorch/Transformers only).

Example
config = RTDETRConfig(device="cuda", confidence=0.4)
extractor = RTDETRLayoutExtractor(config=config)

RTDETRLayoutExtractor

RTDETRLayoutExtractor(config: RTDETRConfig)

Bases: BaseLayoutExtractor

RT-DETR layout extractor using HuggingFace Transformers.

A transformer-based real-time detection model for document layout. Detects titles, text, tables, figures, lists, formulas, captions, headers, and footers.

This is a single-backend model (PyTorch/Transformers only).

Example
from omnidocs.tasks.layout_extraction import RTDETRLayoutExtractor, RTDETRConfig

extractor = RTDETRLayoutExtractor(config=RTDETRConfig(device="cuda"))
result = extractor.extract(image)

for box in result.bboxes:
        print(f"{box.label.value}: {box.confidence:.2f}")

Initialize RT-DETR layout extractor.

PARAMETER DESCRIPTION
config

Configuration object with device, model settings, etc.

TYPE: RTDETRConfig

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def __init__(self, config: RTDETRConfig):
    """
    Initialize RT-DETR layout extractor.

    Args:
        config: Configuration object with device, model settings, etc.
    """
    self.config = config
    self._model = None
    self._processor = None
    self._device = self._resolve_device(config.device)
    self._model_path = self._resolve_model_path(config.model_path)

    # Load model
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Run layout extraction on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes

Source code in omnidocs/tasks/layout_extraction/rtdetr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        LayoutOutput with detected layout boxes
    """
    import torch

    if self._model is None or self._processor is None:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    img_width, img_height = pil_image.size

    # Preprocess
    inputs = self._processor(
        images=pil_image,
        return_tensors="pt",
        size={"height": self.config.image_size, "width": self.config.image_size},
    )

    # Move to device
    inputs = {k: v.to(self._device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    # Run inference
    with torch.no_grad():
        outputs = self._model(**inputs)

    # Post-process results
    target_sizes = torch.tensor([[img_height, img_width]])
    results = self._processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.config.confidence,
    )[0]

    # Parse detections
    layout_boxes = []

    for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
        confidence = float(score.item())
        class_id = int(label_id.item())

        # Get original label from model config
        # Note: The model outputs 0-indexed class IDs, but id2label has background at index 0,
        # so we add 1 to map correctly (e.g., model output 8 -> id2label[9] = "Table")
        original_label = self._model.config.id2label.get(class_id + 1, f"class_{class_id}")

        # Map to standardized label
        standard_label = RTDETR_MAPPING.to_standard(original_label)

        # Box coordinates
        box_coords = box.cpu().tolist()

        layout_boxes.append(
            LayoutBox(
                label=standard_label,
                bbox=BoundingBox.from_list(box_coords),
                confidence=confidence,
                class_id=class_id,
                original_label=original_label,
            )
        )

    # Sort by y-coordinate (top to bottom reading order)
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=img_width,
        image_height=img_height,
        model_name="RT-DETR (docling-layout)",
    )

vlm

VLM layout detector.

A provider-agnostic Vision-Language Model layout detector using litellm. Works with any cloud API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)
result = detector.extract("document.png")

for box in result.bboxes:
    print(f"{box.label.value}: {box.bbox}")

VLMLayoutDetector

VLMLayoutDetector(config: VLMAPIConfig)

Bases: BaseLayoutExtractor

Provider-agnostic VLM layout detector using litellm.

Works with any cloud VLM API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc. Supports custom labels for flexible detection.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)

# Default labels
result = detector.extract("document.png")

# Custom labels
result = detector.extract("document.png", custom_labels=["code_block", "sidebar"])

Initialize VLM layout detector.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/layout_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Initialize VLM layout detector.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    self._loaded = True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be: - None: Use default labels (title, text, table, figure, etc.) - List[str]: Simple label names ["code_block", "sidebar"] - List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

prompt

Custom prompt. If None, builds a default detection prompt.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes.

Source code in omnidocs/tasks/layout_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput:
    """
    Run layout detection on an image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata
        prompt: Custom prompt. If None, builds a default detection prompt.

    Returns:
        LayoutOutput with detected layout boxes.
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_image = self._prepare_image(image)
    width, height = pil_image.size

    # Normalize labels
    label_names = self._normalize_labels(custom_labels)

    # Build or use custom prompt
    final_prompt = prompt or _build_layout_prompt(label_names)

    raw_output = vlm_completion(self.config, final_prompt, pil_image)
    detections = _parse_layout_response(raw_output, (width, height))
    layout_boxes = self._build_layout_boxes(detections, width, height)

    # Sort by reading order
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=width,
        image_height=height,
        model_name=f"VLM ({self.config.model})",
    )
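The prompt parameter replaces the default detection prompt entirely, but the model's response must still be parseable by _parse_layout_response, whose expected format is not documented on this page. A hedged sketch:

from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

detector = VLMLayoutDetector(config=VLMAPIConfig(model="gemini/gemini-2.5-flash"))

# Hypothetical prompt: ask only for the labels of interest, but keep the
# structured output format the default prompt requests, or parsing may fail.
result = detector.extract(
    "document.png",
    custom_labels=["code_block", "sidebar"],
    prompt="Locate every code_block and sidebar region and return their bounding boxes.",
)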