VLM

VLM layout detector.

A provider-agnostic Vision-Language Model layout detector using litellm. Works with any cloud API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)
result = detector.extract("document.png")

for box in result.bboxes:
    print(f"{box.label.value}: {box.bbox}")

VLMLayoutDetector

VLMLayoutDetector(config: VLMAPIConfig)

Bases: BaseLayoutExtractor

Provider-agnostic VLM layout detector using litellm.

Works with any cloud VLM API: Gemini, OpenRouter, Azure, OpenAI, Anthropic, etc. Supports custom labels for flexible detection.

Example
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.layout_extraction import VLMLayoutDetector

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
detector = VLMLayoutDetector(config=config)

# Default labels
result = detector.extract("document.png")

# Custom labels
result = detector.extract("document.png", custom_labels=["code_block", "sidebar"])

Initialize VLM layout detector.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/layout_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Create a detector bound to the given VLM API configuration.

    Args:
        config: VLM API configuration with model and provider details.
    """
    self.config = config
    # Cloud-API based: there are no local weights to download, so the
    # detector is considered ready as soon as it is constructed.
    self._loaded = True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    custom_labels: Optional[
        List[Union[str, CustomLabel]]
    ] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput

Run layout detection on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

custom_labels

Optional custom labels to detect. Can be:

- None: Use default labels (title, text, table, figure, etc.)
- List[str]: Simple label names ["code_block", "sidebar"]
- List[CustomLabel]: Typed labels with metadata

TYPE: Optional[List[Union[str, CustomLabel]]] DEFAULT: None

prompt

Custom prompt. If None, builds a default detection prompt.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
LayoutOutput

LayoutOutput with detected layout boxes.

Source code in omnidocs/tasks/layout_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    custom_labels: Optional[List[Union[str, CustomLabel]]] = None,
    prompt: Optional[str] = None,
) -> LayoutOutput:
    """
    Detect layout regions in a single document image.

    Args:
        image: Input image (PIL Image, numpy array, or file path).
        custom_labels: Optional custom labels to detect. Can be:
            - None: Use default labels (title, text, table, figure, etc.)
            - List[str]: Simple label names ["code_block", "sidebar"]
            - List[CustomLabel]: Typed labels with metadata
        prompt: Custom prompt. If None (or empty), builds a default
            detection prompt from the label set.

    Returns:
        LayoutOutput with detected layout boxes.

    Raises:
        RuntimeError: If the detector is not loaded.
    """
    # Guard clause: fail fast when the detector was never initialized.
    if not self._loaded:
        raise RuntimeError("Model not loaded.")

    pil_img = self._prepare_image(image)
    img_w, img_h = pil_img.size

    # Resolve the label set, then the prompt text. A falsy prompt
    # (None or "") falls back to the generated default, matching the
    # documented behavior.
    labels = self._normalize_labels(custom_labels)
    query = prompt if prompt else _build_layout_prompt(labels)

    # Single round-trip to the VLM, then parse its raw text response
    # into detections scaled to the actual image dimensions.
    response_text = vlm_completion(self.config, query, pil_img)
    parsed = _parse_layout_response(response_text, (img_w, img_h))
    boxes = self._build_layout_boxes(parsed, img_w, img_h)

    # Reading order: top-to-bottom, then left-to-right.
    boxes.sort(key=lambda box: (box.bbox.y1, box.bbox.x1))

    return LayoutOutput(
        bboxes=boxes,
        image_width=img_w,
        image_height=img_h,
        model_name=f"VLM ({self.config.model})",
    )