Skip to content

Overview

Structured Extraction Module.

Provides extractors that pull structured data out of document images, using Pydantic schemas for type-safe output.

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor

# Schema describing the fields to pull out of the invoice image.
class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]

config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)
result = extractor.extract(
    "invoice.png",
    schema=Invoice,
    prompt="Extract invoice details from this document.",
)
# extract() returns a StructuredOutput wrapper; the extracted Invoice
# instance is exposed on its `data` field, not on the wrapper itself.
print(result.data.vendor, result.data.total)

BaseStructuredExtractor

Bases: ABC

Abstract base class for structured extractors.

Structured extractors return data matching a user-provided Pydantic schema.

Example
class MyExtractor(BaseStructuredExtractor):
    """Minimal sketch of a custom structured extractor."""

    def __init__(self, config):
        self.config = config

    def _load_model(self):
        pass

    def extract(self, image, schema, prompt):
        # `...` marks values a real implementation must supply. Every
        # argument is passed by keyword so the sketch is valid Python
        # (a bare `...` after `data=` would be a SyntaxError).
        return StructuredOutput(
            data=schema(...),
            image_width=...,
            image_height=...,
            model_name=...,
        )

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Run structured extraction on a single image.

    Declared abstract: this base implementation has no behavior of its
    own and concrete extractors are expected to override it.

    Args:
        image: The document image, given as a PIL Image, a numpy array,
            or a file path (str or Path).
        schema: The Pydantic model class that describes the shape of the
            data to extract.
        prompt: Instruction text describing what should be extracted.

    Returns:
        A StructuredOutput wrapping the validated data.
    """
    ...

StructuredOutput

Bases: BaseModel

Output from structured extraction.

Contains the extracted data as a validated Pydantic model instance, along with metadata about the extraction.

VLMStructuredExtractor

VLMStructuredExtractor(config: VLMAPIConfig)

Bases: BaseStructuredExtractor

Provider-agnostic VLM structured extractor using litellm.

Extracts structured data from document images using any cloud VLM API. Uses litellm's native response_format support to send Pydantic schemas to providers that support structured output (OpenAI, Gemini, etc.).

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor


# Schema describing the fields to extract.
class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]


config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)

result = extractor.extract(
    "invoice.png",
    schema=Invoice,
    prompt="Extract invoice fields",
)
invoice = result.data  # the extracted data lives on the `data` field
print(invoice.vendor)

Initialize VLM structured extractor.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/structured_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Set up the extractor with its API configuration.

    Args:
        config: VLMAPIConfig carrying the model and provider details.
    """
    # Store the config and flag the extractor as loaded in one step;
    # nothing else is initialized here.
    self.config, self._loaded = config, True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Run structured extraction on an image through the configured VLM.

    Args:
        image: PIL Image, numpy array, or file path of the document image.
        schema: Pydantic model class describing the expected output.
        prompt: Instruction describing what to extract.

    Returns:
        StructuredOutput holding the extracted data together with the
        prepared image's width and height and a model name label.
    """
    prepared = self._prepare_image(image)
    # Dimensions are taken from the prepared image before the completion call.
    img_w, img_h = prepared.size

    payload = vlm_structured_completion(self.config, prompt, prepared, schema)
    label = f"VLM ({self.config.model})"

    return StructuredOutput(
        data=payload,
        image_width=img_w,
        image_height=img_h,
        model_name=label,
    )

base

Base class for structured extractors.

Defines the abstract interface for extracting structured data from document images.

BaseStructuredExtractor

Bases: ABC

Abstract base class for structured extractors.

Structured extractors return data matching a user-provided Pydantic schema.

Example
class MyExtractor(BaseStructuredExtractor):
    """Minimal sketch of a custom structured extractor."""

    def __init__(self, config):
        self.config = config

    def _load_model(self):
        pass

    def extract(self, image, schema, prompt):
        # `...` marks values a real implementation must supply. Every
        # argument is passed by keyword so the sketch is valid Python
        # (a bare `...` after `data=` would be a SyntaxError).
        return StructuredOutput(
            data=schema(...),
            image_width=...,
            image_height=...,
            model_name=...,
        )

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/base.py
@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Run structured extraction on a single image.

    Declared abstract: this base implementation has no behavior of its
    own and concrete extractors are expected to override it.

    Args:
        image: The document image, given as a PIL Image, a numpy array,
            or a file path (str or Path).
        schema: The Pydantic model class that describes the shape of the
            data to extract.
        prompt: Instruction text describing what should be extracted.

    Returns:
        A StructuredOutput wrapping the validated data.
    """
    ...

models

Pydantic models for structured extraction outputs.

StructuredOutput

Bases: BaseModel

Output from structured extraction.

Contains the extracted data as a validated Pydantic model instance, along with metadata about the extraction.

vlm

VLM structured extractor.

A provider-agnostic Vision-Language Model structured extractor using litellm. Extracts structured data matching a Pydantic schema from document images.

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor


# Schema describing the fields to extract.
class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]
    date: str


config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)

result = extractor.extract(
    image="invoice.png",
    schema=Invoice,
    prompt="Extract invoice details from this document.",
)
invoice = result.data  # the extracted data lives on the `data` field
print(invoice.vendor, invoice.total)

VLMStructuredExtractor

VLMStructuredExtractor(config: VLMAPIConfig)

Bases: BaseStructuredExtractor

Provider-agnostic VLM structured extractor using litellm.

Extracts structured data from document images using any cloud VLM API. Uses litellm's native response_format support to send Pydantic schemas to providers that support structured output (OpenAI, Gemini, etc.).

Example
from pydantic import BaseModel
from omnidocs.vlm import VLMAPIConfig
from omnidocs.tasks.structured_extraction import VLMStructuredExtractor


# Schema describing the fields to extract.
class Invoice(BaseModel):
    vendor: str
    total: float
    items: list[str]


config = VLMAPIConfig(model="gemini/gemini-2.5-flash")
extractor = VLMStructuredExtractor(config=config)

result = extractor.extract(
    "invoice.png",
    schema=Invoice,
    prompt="Extract invoice fields",
)
invoice = result.data  # the extracted data lives on the `data` field
print(invoice.vendor)

Initialize VLM structured extractor.

PARAMETER DESCRIPTION
config

VLM API configuration with model and provider details.

TYPE: VLMAPIConfig

Source code in omnidocs/tasks/structured_extraction/vlm.py
def __init__(self, config: VLMAPIConfig):
    """
    Set up the extractor with its API configuration.

    Args:
        config: VLMAPIConfig carrying the model and provider details.
    """
    # Store the config and flag the extractor as loaded in one step;
    # nothing else is initialized here.
    self.config, self._loaded = config, True

extract

extract(
    image: Union[Image, ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput

Extract structured data from an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or file path).

TYPE: Union[Image, ndarray, str, Path]

schema

Pydantic model class defining the expected output structure.

TYPE: type[BaseModel]

prompt

Extraction prompt describing what to extract.

TYPE: str

RETURNS DESCRIPTION
StructuredOutput

StructuredOutput containing the validated data.

Source code in omnidocs/tasks/structured_extraction/vlm.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    schema: type[BaseModel],
    prompt: str,
) -> StructuredOutput:
    """
    Run structured extraction on an image through the configured VLM.

    Args:
        image: PIL Image, numpy array, or file path of the document image.
        schema: Pydantic model class describing the expected output.
        prompt: Instruction describing what to extract.

    Returns:
        StructuredOutput holding the extracted data together with the
        prepared image's width and height and a model name label.
    """
    prepared = self._prepare_image(image)
    # Dimensions are taken from the prepared image before the completion call.
    img_w, img_h = prepared.size

    payload = vlm_structured_completion(self.config, prompt, prepared, schema)
    label = f"VLM ({self.config.model})"

    return StructuredOutput(
        data=payload,
        image_width=img_w,
        image_height=img_h,
        model_name=label,
    )