Skip to content

Overview

MinerU VL layout detection module.

MinerU VL can be used for standalone layout detection, returning detected regions with types and bounding boxes.

For full document extraction (layout + content), use MinerUVLTextExtractor from the text_extraction module instead.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

for box in result.bboxes:
    print(f"{box.label}: {box.confidence:.2f}")

MinerUVLLayoutAPIConfig

Bases: BaseModel

API backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutAPIConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutAPIConfig(
        server_url="https://your-server.modal.run"
    )
)
result = detector.extract(image)

MinerUVLLayoutDetector

MinerUVLLayoutDetector(
    backend: MinerUVLLayoutBackendConfig,
)

Bases: BaseLayoutExtractor

MinerU VL layout detector.

Uses MinerU2.5-2509-1.2B for document layout detection. Detects 22+ element types including text, titles, tables, equations, figures, code, and more.

For full document extraction (layout + content), use MinerUVLTextExtractor from the text_extraction module instead.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

for box in result.bboxes:
    print(f"{box.label}: {box.confidence:.2f}")

Initialize MinerU VL layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API)

TYPE: MinerUVLLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def __init__(self, backend: MinerUVLLayoutBackendConfig):
    """
    Set up the MinerU VL layout detector.

    Stores the backend configuration, resets the client/loaded state,
    and then calls ``_load_model()``.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API)
    """
    # State fields start out "not loaded"; _load_model() is invoked last,
    # after the configuration and state fields are in place.
    self._loaded = False
    self._client = None
    self.backend_config = backend
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Detect layout elements in the image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with standardized labels and bounding boxes

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
) -> LayoutOutput:
    """
    Run layout detection on a single image.

    Args:
        image: Input image (PIL Image, numpy array, or file path). It is
            passed through ``self._prepare_image`` before detection.

    Returns:
        LayoutOutput containing one LayoutBox per detected block, plus the
        width and height of the prepared image.

    Raises:
        RuntimeError: If the detector's ``_loaded`` flag is not set.
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    page = self._prepare_image(image)
    page_w, page_h = page.size

    def _to_layout_box(region):
        # Block coordinates are normalized to [0, 1]; scale them to pixels.
        left, top, right, bottom = region.bbox
        scaled = BoundingBox(
            x1=left * page_w,
            y1=top * page_h,
            x2=right * page_w,
            y2=bottom * page_h,
        )
        return LayoutBox(
            # Types missing from the mapping fall back to UNKNOWN.
            label=MINERUVL_LABEL_MAPPING.get(region.type, LayoutLabel.UNKNOWN),
            bbox=scaled,
            # MinerU VL doesn't output confidence, so a fixed 1.0 is reported.
            confidence=1.0,
            original_label=region.type.value,
        )

    return LayoutOutput(
        bboxes=[_to_layout_box(region) for region in self._detect_layout(page)],
        image_width=page_w,
        image_height=page_h,
        model_name="MinerU2.5-2509-1.2B",
    )

MinerUVLLayoutMLXConfig

Bases: BaseModel

MLX backend config for MinerU VL layout detection on Apple Silicon.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutMLXConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutMLXConfig()
)
result = detector.extract(image)

MinerUVLLayoutPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

MinerUVLLayoutVLLMConfig

Bases: BaseModel

VLLM backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutVLLMConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutVLLMConfig(tensor_parallel_size=1)
)
result = detector.extract(image)

api

API backend configuration for MinerU VL layout detection.

MinerUVLLayoutAPIConfig

Bases: BaseModel

API backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutAPIConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutAPIConfig(
        server_url="https://your-server.modal.run"
    )
)
result = detector.extract(image)

detector

MinerU VL layout detector.

Uses MinerU2.5-2509-1.2B for document layout detection. Detects 22+ element types including text, titles, tables, equations, figures, code, and more.

MinerUVLLayoutDetector

MinerUVLLayoutDetector(
    backend: MinerUVLLayoutBackendConfig,
)

Bases: BaseLayoutExtractor

MinerU VL layout detector.

Uses MinerU2.5-2509-1.2B for document layout detection. Detects 22+ element types including text, titles, tables, equations, figures, code, and more.

For full document extraction (layout + content), use MinerUVLTextExtractor from the text_extraction module instead.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

for box in result.bboxes:
    print(f"{box.label}: {box.confidence:.2f}")

Initialize MinerU VL layout detector.

PARAMETER DESCRIPTION
backend

Backend configuration (PyTorch, VLLM, MLX, or API)

TYPE: MinerUVLLayoutBackendConfig

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def __init__(self, backend: MinerUVLLayoutBackendConfig):
    """
    Set up the MinerU VL layout detector.

    Stores the backend configuration, resets the client/loaded state,
    and then calls ``_load_model()``.

    Args:
        backend: Backend configuration (PyTorch, VLLM, MLX, or API)
    """
    # State fields start out "not loaded"; _load_model() is invoked last,
    # after the configuration and state fields are in place.
    self._loaded = False
    self._client = None
    self.backend_config = backend
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> LayoutOutput

Detect layout elements in the image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with standardized labels and bounding boxes

Source code in omnidocs/tasks/layout_extraction/mineruvl/detector.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
) -> LayoutOutput:
    """
    Run layout detection on a single image.

    Args:
        image: Input image (PIL Image, numpy array, or file path). It is
            passed through ``self._prepare_image`` before detection.

    Returns:
        LayoutOutput containing one LayoutBox per detected block, plus the
        width and height of the prepared image.

    Raises:
        RuntimeError: If the detector's ``_loaded`` flag is not set.
    """
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    page = self._prepare_image(image)
    page_w, page_h = page.size

    def _to_layout_box(region):
        # Block coordinates are normalized to [0, 1]; scale them to pixels.
        left, top, right, bottom = region.bbox
        scaled = BoundingBox(
            x1=left * page_w,
            y1=top * page_h,
            x2=right * page_w,
            y2=bottom * page_h,
        )
        return LayoutBox(
            # Types missing from the mapping fall back to UNKNOWN.
            label=MINERUVL_LABEL_MAPPING.get(region.type, LayoutLabel.UNKNOWN),
            bbox=scaled,
            # MinerU VL doesn't output confidence, so a fixed 1.0 is reported.
            confidence=1.0,
            original_label=region.type.value,
        )

    return LayoutOutput(
        bboxes=[_to_layout_box(region) for region in self._detect_layout(page)],
        image_width=page_w,
        image_height=page_h,
        model_name="MinerU2.5-2509-1.2B",
    )

mlx

MLX backend configuration for MinerU VL layout detection (Apple Silicon).

MinerUVLLayoutMLXConfig

Bases: BaseModel

MLX backend config for MinerU VL layout detection on Apple Silicon.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutMLXConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutMLXConfig()
)
result = detector.extract(image)

pytorch

PyTorch backend configuration for MinerU VL layout detection.

MinerUVLLayoutPyTorchConfig

Bases: BaseModel

PyTorch/HuggingFace backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutPyTorchConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutPyTorchConfig(device="cuda")
)
result = detector.extract(image)

vllm

VLLM backend configuration for MinerU VL layout detection.

MinerUVLLayoutVLLMConfig

Bases: BaseModel

VLLM backend config for MinerU VL layout detection.

Example
from omnidocs.tasks.layout_extraction import MinerUVLLayoutDetector
from omnidocs.tasks.layout_extraction.mineruvl import MinerUVLLayoutVLLMConfig

detector = MinerUVLLayoutDetector(
    backend=MinerUVLLayoutVLLMConfig(tensor_parallel_size=1)
)
result = detector.extract(image)