Skip to content

Overview

OmniDocs Utilities.

Provides utility functions for result aggregation, visualization, export, and cache management.

BatchResult

BatchResult()

Container for results from processing multiple documents.

Examples:

batch_result = BatchResult()
batch_result.add_document_result("doc1", doc_result1)
batch_result.add_document_result("doc2", doc_result2)

# Access results
doc1_result = batch_result.get_document_result("doc1")
all_ids = batch_result.document_ids

# Save all results
batch_result.save_json("all_results.json")

Initialize empty BatchResult.

Source code in omnidocs/utils/aggregation.py
def __init__(self):
    """Create a batch container that holds no document results yet."""
    # Maps document ID -> DocumentResult.
    self._document_results: Dict[str, DocumentResult] = dict()

document_ids property

document_ids: List[str]

List of document IDs.

document_count property

document_count: int

Number of documents processed.

total_pages property

total_pages: int

Total pages across all documents.

add_document_result

add_document_result(
    doc_id: str, result: DocumentResult
) -> None

Add result for a document.

PARAMETER DESCRIPTION
doc_id

Document identifier (usually filename without extension)

TYPE: str

result

DocumentResult instance

TYPE: DocumentResult

Source code in omnidocs/utils/aggregation.py
def add_document_result(self, doc_id: str, result: DocumentResult) -> None:
    """
    Register the result of one processed document under its identifier.

    An entry already stored under the same ``doc_id`` is replaced.

    Args:
        doc_id: Document identifier (usually filename without extension)
        result: DocumentResult instance
    """
    registry = self._document_results
    registry[doc_id] = result

get_document_result

get_document_result(
    doc_id: str,
) -> Optional[DocumentResult]

Get result for a specific document.

PARAMETER DESCRIPTION
doc_id

Document identifier

TYPE: str

RETURNS DESCRIPTION
Optional[DocumentResult]

DocumentResult or None if not found

Source code in omnidocs/utils/aggregation.py
def get_document_result(self, doc_id: str) -> Optional[DocumentResult]:
    """
    Look up the stored result for one document.

    Args:
        doc_id: Document identifier

    Returns:
        The DocumentResult stored under ``doc_id``, or None if not found
    """
    registry = self._document_results
    return registry.get(doc_id, None)

to_dict

to_dict() -> dict

Convert to dictionary.

RETURNS DESCRIPTION
dict

Dictionary representation

Source code in omnidocs/utils/aggregation.py
def to_dict(self) -> dict:
    """
    Build a plain-dictionary summary of the whole batch.

    Returns:
        Dictionary with "document_count", "total_pages" and "documents",
        where "documents" maps each document ID to that result's
        ``to_dict()`` output.
    """
    summary = {
        "document_count": self.document_count,
        "total_pages": self.total_pages,
    }

    documents = {}
    for doc_id, result in self._document_results.items():
        documents[doc_id] = result.to_dict()

    summary["documents"] = documents
    return summary

save_json

save_json(path: str) -> None

Save all results to JSON file.

PARAMETER DESCRIPTION
path

Output file path

TYPE: str

Source code in omnidocs/utils/aggregation.py
def save_json(self, path: str) -> None:
    """
    Save all results to JSON file.

    Missing parent directories of ``path`` are created. Values the json
    module cannot encode natively are written as ``str(value)``.

    Args:
        path: Output file path
    """
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the written file does not depend on the
    # platform's locale default.
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(self.to_dict(), f, indent=2, default=str)

DocumentResult

DocumentResult(
    source_path: Optional[str] = None, page_count: int = 0
)

Container for results from processing a single document.

Stores results by page for easy access and serialization.

Examples:

doc_result = DocumentResult(source_path="paper.pdf", page_count=10)
doc_result.add_page_result(0, text_output)
doc_result.add_page_result(1, text_output)

# Access results
all_results = doc_result.all_results
page_0_result = doc_result.get_page_result(0)

# Save to file
doc_result.save_json("paper_result.json")

Initialize DocumentResult.

PARAMETER DESCRIPTION
source_path

Path to source document

TYPE: Optional[str] DEFAULT: None

page_count

Total number of pages

TYPE: int DEFAULT: 0

Source code in omnidocs/utils/aggregation.py
def __init__(
    self,
    source_path: Optional[str] = None,
    page_count: int = 0,
):
    """
    Set up an empty per-document result container.

    Args:
        source_path: Path to source document
        page_count: Total number of pages
    """
    self.source_path = source_path
    self.page_count = page_count
    # Page number -> extraction result for that page.
    self._page_results: Dict[int, Any] = dict()

all_results property

all_results: List[Any]

Get all results in page order.

RETURNS DESCRIPTION
List[Any]

List of results sorted by page number

processed_pages property

processed_pages: int

Number of pages with results.

add_page_result

add_page_result(page_num: int, result: Any) -> None

Add result for a specific page.

PARAMETER DESCRIPTION
page_num

Page number (0-indexed)

TYPE: int

result

Extraction result (TextOutput, LayoutOutput, etc.)

TYPE: Any

Source code in omnidocs/utils/aggregation.py
def add_page_result(self, page_num: int, result: Any) -> None:
    """
    Store the extraction result belonging to one page.

    A result already stored for the same ``page_num`` is replaced.

    Args:
        page_num: Page number (0-indexed)
        result: Extraction result (TextOutput, LayoutOutput, etc.)
    """
    pages = self._page_results
    pages[page_num] = result

get_page_result

get_page_result(page_num: int) -> Optional[Any]

Get result for a specific page.

PARAMETER DESCRIPTION
page_num

Page number (0-indexed)

TYPE: int

RETURNS DESCRIPTION
Optional[Any]

Result for the page, or None if not found

Source code in omnidocs/utils/aggregation.py
def get_page_result(self, page_num: int) -> Optional[Any]:
    """
    Fetch whatever was stored for one page.

    Args:
        page_num: Page number (0-indexed)

    Returns:
        Result for the page, or None if not found
    """
    pages = self._page_results
    return pages.get(page_num, None)

to_dict

to_dict() -> dict

Convert to dictionary for serialization.

RETURNS DESCRIPTION
dict

Dictionary representation

Source code in omnidocs/utils/aggregation.py
def to_dict(self) -> dict:
    """
    Convert to dictionary for serialization.

    Each page result is converted with the first approach it supports:
    ``model_dump()``, then ``to_dict()``, then a shallow copy of its
    ``__dict__``, and finally ``str(result)``. Page numbers become
    string keys.

    Returns:
        Dictionary with "source_path", "page_count", "processed_pages"
        and "results"
    """
    results_dict = {}
    for page_num, page_result in self._page_results.items():
        if hasattr(page_result, "model_dump"):
            serialized = page_result.model_dump()
        elif hasattr(page_result, "to_dict"):
            serialized = page_result.to_dict()
        elif hasattr(page_result, "__dict__"):
            # Copy instead of handing out the live attribute dict, so
            # callers editing the returned data cannot mutate the
            # stored result object.
            serialized = dict(page_result.__dict__)
        else:
            serialized = str(page_result)
        results_dict[str(page_num)] = serialized

    return {
        "source_path": self.source_path,
        "page_count": self.page_count,
        "processed_pages": self.processed_pages,
        "results": results_dict,
    }

save_json

save_json(path: str) -> None

Save results to JSON file.

PARAMETER DESCRIPTION
path

Output file path

TYPE: str

Source code in omnidocs/utils/aggregation.py
def save_json(self, path: str) -> None:
    """
    Save results to JSON file.

    Missing parent directories of ``path`` are created. Values the json
    module cannot encode natively are written as ``str(value)``.

    Args:
        path: Output file path
    """
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the written file does not depend on the
    # platform's locale default.
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(self.to_dict(), f, indent=2, default=str)

merge_text_results

merge_text_results(
    results: List[Any], separator: str = "\n\n"
) -> str

Merge multiple TextOutput results into single string.

PARAMETER DESCRIPTION
results

List of TextOutput (or objects with .content attribute)

TYPE: List[Any]

separator

String to join pages (default: double newline)

TYPE: str DEFAULT: '\n\n'

RETURNS DESCRIPTION
str

Combined content string

Examples:

all_results = doc_result.all_results
full_text = merge_text_results(all_results)
full_text_with_dividers = merge_text_results(all_results, separator="\n\n---\n\n")
Source code in omnidocs/utils/aggregation.py
def merge_text_results(results: List[Any], separator: str = "\n\n") -> str:
    """
    Join the text of several page results into one string.

    Items with a truthy ``.content`` contribute that content; non-empty
    plain strings contribute themselves; everything else is skipped.

    Args:
        results: List of TextOutput (or objects with .content attribute)
        separator: String to join pages (default: double newline)

    Returns:
        Combined content string

    Examples:
        ```python
        all_results = doc_result.all_results
        full_text = merge_text_results(all_results)
        full_text_with_dividers = merge_text_results(all_results, separator="\\n\\n---\\n\\n")
        ```
    """

    def _text_of(item: Any) -> Any:
        # A non-empty ``.content`` wins; otherwise a non-empty plain
        # string is used as-is. None marks "nothing to contribute".
        if hasattr(item, "content") and item.content:
            return item.content
        if isinstance(item, str) and item:
            return item
        return None

    pieces = (_text_of(item) for item in results)
    return separator.join(piece for piece in pieces if piece is not None)

configure_backend_cache

configure_backend_cache(
    cache_dir: Optional[str] = None,
) -> None

Configure cache directories for all backends.

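This always OVERWRITES HF_HOME and TRANSFORMERS_CACHE with the directory resolved by get_model_cache_dir() (cache_dir if passed, otherwise OMNIDOCS_MODELS_DIR, then the existing HF_HOME, then ~/.cache/huggingface), so every backend downloads to the same place.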

This is called automatically on import omnidocs.

PARAMETER DESCRIPTION
cache_dir

Optional cache directory path. If None, uses get_model_cache_dir().

TYPE: Optional[str] DEFAULT: None

Source code in omnidocs/utils/cache.py
def configure_backend_cache(cache_dir: Optional[str] = None) -> None:
    """
    Point every backend's model cache at one shared directory.

    The directory is resolved through get_model_cache_dir(cache_dir), and
    HF_HOME and TRANSFORMERS_CACHE are then OVERWRITTEN with it so every
    backend downloads to the same place.

    This is called automatically on ``import omnidocs``.

    Args:
        cache_dir: Optional cache directory path. If None, uses get_model_cache_dir().
    """
    import sys

    resolved = str(get_model_cache_dir(cache_dir))

    # Overwrite HF_HOME so PyTorch, MLX, VLLM, and snapshot_download all use it
    for env_name in ("HF_HOME", "TRANSFORMERS_CACHE"):
        os.environ[env_name] = resolved

    # huggingface_hub caches HF_HUB_CACHE at import time; when it has not been
    # imported yet the environment variables above are all that is needed.
    if "huggingface_hub.constants" not in sys.modules:
        return

    # Already imported: patch the constants directly for hf_hub_download etc.
    import huggingface_hub.constants as hf_constants

    hub_dir = os.path.join(resolved, "hub")
    hf_constants.HF_HOME = resolved
    hf_constants.HF_HUB_CACHE = hub_dir
    hf_constants.HUGGINGFACE_HUB_CACHE = hub_dir

get_model_cache_dir

get_model_cache_dir(
    custom_dir: Optional[str] = None,
) -> Path

Get unified model cache directory.

Priority order:

1. custom_dir parameter (if provided)
2. OMNIDOCS_MODELS_DIR environment variable
3. HF_HOME environment variable
4. Default: ~/.cache/huggingface

PARAMETER DESCRIPTION
custom_dir

Optional custom cache directory path. Overrides environment variables if provided.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
Path

Path object pointing to the cache directory. Directory is created if it doesn't exist.

Source code in omnidocs/utils/cache.py
def get_model_cache_dir(custom_dir: Optional[str] = None) -> Path:
    """
    Get unified model cache directory.

    Priority order (the first non-empty value wins):
    1. custom_dir parameter (if provided)
    2. OMNIDOCS_MODELS_DIR environment variable
    3. HF_HOME environment variable
    4. Default: ~/.cache/huggingface

    Args:
        custom_dir: Optional custom cache directory path.
                   Overrides environment variables if provided.

    Returns:
        Path object pointing to the cache directory.
        Directory is created if it doesn't exist.
    """
    # Chain with `or` rather than nested ``environ.get`` defaults: a variable
    # that is set but empty must fall through to the next source instead of
    # producing Path(""), which resolves to the current working directory.
    cache_dir = (
        custom_dir
        or os.environ.get("OMNIDOCS_MODELS_DIR")
        or os.environ.get("HF_HOME")
        or "~/.cache/huggingface"
    )

    path = Path(cache_dir).expanduser().resolve()
    path.mkdir(parents=True, exist_ok=True)
    return path

get_storage_info

get_storage_info() -> dict

Get current cache directory configuration information.

RETURNS DESCRIPTION
dict

Dictionary with cache paths and environment variable values.

Source code in omnidocs/utils/cache.py
def get_storage_info() -> dict:
    """
    Report how the model cache is currently configured.

    The cache path is obtained by calling get_model_cache_dir(); the other
    entries are the raw environment variable values (None when unset).

    Returns:
        Dictionary with cache paths and environment variable values.
    """
    env = os.environ
    info = {"omnidocs_cache": str(get_model_cache_dir())}
    info["omnidocs_models_dir_env"] = env.get("OMNIDOCS_MODELS_DIR")
    info["hf_home"] = env.get("HF_HOME")
    info["transformers_cache"] = env.get("TRANSFORMERS_CACHE")
    return info

aggregation

Result aggregation utilities for batch processing.

Provides containers and utilities for storing, aggregating, and exporting results from batch document processing.

DocumentResult

DocumentResult(
    source_path: Optional[str] = None, page_count: int = 0
)

Container for results from processing a single document.

Stores results by page for easy access and serialization.

Examples:

doc_result = DocumentResult(source_path="paper.pdf", page_count=10)
doc_result.add_page_result(0, text_output)
doc_result.add_page_result(1, text_output)

# Access results
all_results = doc_result.all_results
page_0_result = doc_result.get_page_result(0)

# Save to file
doc_result.save_json("paper_result.json")

Initialize DocumentResult.

PARAMETER DESCRIPTION
source_path

Path to source document

TYPE: Optional[str] DEFAULT: None

page_count

Total number of pages

TYPE: int DEFAULT: 0

Source code in omnidocs/utils/aggregation.py
def __init__(
    self,
    source_path: Optional[str] = None,
    page_count: int = 0,
):
    """
    Set up an empty per-document result container.

    Args:
        source_path: Path to source document
        page_count: Total number of pages
    """
    self.source_path = source_path
    self.page_count = page_count
    # Page number -> extraction result for that page.
    self._page_results: Dict[int, Any] = dict()

all_results property

all_results: List[Any]

Get all results in page order.

RETURNS DESCRIPTION
List[Any]

List of results sorted by page number

processed_pages property

processed_pages: int

Number of pages with results.

add_page_result

add_page_result(page_num: int, result: Any) -> None

Add result for a specific page.

PARAMETER DESCRIPTION
page_num

Page number (0-indexed)

TYPE: int

result

Extraction result (TextOutput, LayoutOutput, etc.)

TYPE: Any

Source code in omnidocs/utils/aggregation.py
def add_page_result(self, page_num: int, result: Any) -> None:
    """
    Store the extraction result belonging to one page.

    A result already stored for the same ``page_num`` is replaced.

    Args:
        page_num: Page number (0-indexed)
        result: Extraction result (TextOutput, LayoutOutput, etc.)
    """
    pages = self._page_results
    pages[page_num] = result

get_page_result

get_page_result(page_num: int) -> Optional[Any]

Get result for a specific page.

PARAMETER DESCRIPTION
page_num

Page number (0-indexed)

TYPE: int

RETURNS DESCRIPTION
Optional[Any]

Result for the page, or None if not found

Source code in omnidocs/utils/aggregation.py
def get_page_result(self, page_num: int) -> Optional[Any]:
    """
    Fetch whatever was stored for one page.

    Args:
        page_num: Page number (0-indexed)

    Returns:
        Result for the page, or None if not found
    """
    pages = self._page_results
    return pages.get(page_num, None)

to_dict

to_dict() -> dict

Convert to dictionary for serialization.

RETURNS DESCRIPTION
dict

Dictionary representation

Source code in omnidocs/utils/aggregation.py
def to_dict(self) -> dict:
    """
    Convert to dictionary for serialization.

    Each page result is converted with the first approach it supports:
    ``model_dump()``, then ``to_dict()``, then a shallow copy of its
    ``__dict__``, and finally ``str(result)``. Page numbers become
    string keys.

    Returns:
        Dictionary with "source_path", "page_count", "processed_pages"
        and "results"
    """
    results_dict = {}
    for page_num, page_result in self._page_results.items():
        if hasattr(page_result, "model_dump"):
            serialized = page_result.model_dump()
        elif hasattr(page_result, "to_dict"):
            serialized = page_result.to_dict()
        elif hasattr(page_result, "__dict__"):
            # Copy instead of handing out the live attribute dict, so
            # callers editing the returned data cannot mutate the
            # stored result object.
            serialized = dict(page_result.__dict__)
        else:
            serialized = str(page_result)
        results_dict[str(page_num)] = serialized

    return {
        "source_path": self.source_path,
        "page_count": self.page_count,
        "processed_pages": self.processed_pages,
        "results": results_dict,
    }

save_json

save_json(path: str) -> None

Save results to JSON file.

PARAMETER DESCRIPTION
path

Output file path

TYPE: str

Source code in omnidocs/utils/aggregation.py
def save_json(self, path: str) -> None:
    """
    Save results to JSON file.

    Missing parent directories of ``path`` are created. Values the json
    module cannot encode natively are written as ``str(value)``.

    Args:
        path: Output file path
    """
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the written file does not depend on the
    # platform's locale default.
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(self.to_dict(), f, indent=2, default=str)

BatchResult

BatchResult()

Container for results from processing multiple documents.

Examples:

batch_result = BatchResult()
batch_result.add_document_result("doc1", doc_result1)
batch_result.add_document_result("doc2", doc_result2)

# Access results
doc1_result = batch_result.get_document_result("doc1")
all_ids = batch_result.document_ids

# Save all results
batch_result.save_json("all_results.json")

Initialize empty BatchResult.

Source code in omnidocs/utils/aggregation.py
def __init__(self):
    """Create a batch container that holds no document results yet."""
    # Maps document ID -> DocumentResult.
    self._document_results: Dict[str, DocumentResult] = dict()

document_ids property

document_ids: List[str]

List of document IDs.

document_count property

document_count: int

Number of documents processed.

total_pages property

total_pages: int

Total pages across all documents.

add_document_result

add_document_result(
    doc_id: str, result: DocumentResult
) -> None

Add result for a document.

PARAMETER DESCRIPTION
doc_id

Document identifier (usually filename without extension)

TYPE: str

result

DocumentResult instance

TYPE: DocumentResult

Source code in omnidocs/utils/aggregation.py
def add_document_result(self, doc_id: str, result: DocumentResult) -> None:
    """
    Register the result of one processed document under its identifier.

    An entry already stored under the same ``doc_id`` is replaced.

    Args:
        doc_id: Document identifier (usually filename without extension)
        result: DocumentResult instance
    """
    registry = self._document_results
    registry[doc_id] = result

get_document_result

get_document_result(
    doc_id: str,
) -> Optional[DocumentResult]

Get result for a specific document.

PARAMETER DESCRIPTION
doc_id

Document identifier

TYPE: str

RETURNS DESCRIPTION
Optional[DocumentResult]

DocumentResult or None if not found

Source code in omnidocs/utils/aggregation.py
def get_document_result(self, doc_id: str) -> Optional[DocumentResult]:
    """
    Look up the stored result for one document.

    Args:
        doc_id: Document identifier

    Returns:
        The DocumentResult stored under ``doc_id``, or None if not found
    """
    registry = self._document_results
    return registry.get(doc_id, None)

to_dict

to_dict() -> dict

Convert to dictionary.

RETURNS DESCRIPTION
dict

Dictionary representation

Source code in omnidocs/utils/aggregation.py
def to_dict(self) -> dict:
    """
    Build a plain-dictionary summary of the whole batch.

    Returns:
        Dictionary with "document_count", "total_pages" and "documents",
        where "documents" maps each document ID to that result's
        ``to_dict()`` output.
    """
    summary = {
        "document_count": self.document_count,
        "total_pages": self.total_pages,
    }

    documents = {}
    for doc_id, result in self._document_results.items():
        documents[doc_id] = result.to_dict()

    summary["documents"] = documents
    return summary

save_json

save_json(path: str) -> None

Save all results to JSON file.

PARAMETER DESCRIPTION
path

Output file path

TYPE: str

Source code in omnidocs/utils/aggregation.py
def save_json(self, path: str) -> None:
    """
    Save all results to JSON file.

    Missing parent directories of ``path`` are created. Values the json
    module cannot encode natively are written as ``str(value)``.

    Args:
        path: Output file path
    """
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the written file does not depend on the
    # platform's locale default.
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(self.to_dict(), f, indent=2, default=str)

merge_text_results

merge_text_results(
    results: List[Any], separator: str = "\n\n"
) -> str

Merge multiple TextOutput results into single string.

PARAMETER DESCRIPTION
results

List of TextOutput (or objects with .content attribute)

TYPE: List[Any]

separator

String to join pages (default: double newline)

TYPE: str DEFAULT: '\n\n'

RETURNS DESCRIPTION
str

Combined content string

Examples:

all_results = doc_result.all_results
full_text = merge_text_results(all_results)
full_text_with_dividers = merge_text_results(all_results, separator="\n\n---\n\n")
Source code in omnidocs/utils/aggregation.py
def merge_text_results(results: List[Any], separator: str = "\n\n") -> str:
    """
    Join the text of several page results into one string.

    Items with a truthy ``.content`` contribute that content; non-empty
    plain strings contribute themselves; everything else is skipped.

    Args:
        results: List of TextOutput (or objects with .content attribute)
        separator: String to join pages (default: double newline)

    Returns:
        Combined content string

    Examples:
        ```python
        all_results = doc_result.all_results
        full_text = merge_text_results(all_results)
        full_text_with_dividers = merge_text_results(all_results, separator="\\n\\n---\\n\\n")
        ```
    """

    def _text_of(item: Any) -> Any:
        # A non-empty ``.content`` wins; otherwise a non-empty plain
        # string is used as-is. None marks "nothing to contribute".
        if hasattr(item, "content") and item.content:
            return item.content
        if isinstance(item, str) and item:
            return item
        return None

    pieces = (_text_of(item) for item in results)
    return separator.join(piece for piece in pieces if piece is not None)

cache

Unified model cache directory management for OmniDocs.

When OMNIDOCS_MODELS_DIR is set, ALL model downloads (PyTorch, VLLM, MLX, snapshot_download) go into that directory. It overwrites HF_HOME so every backend respects the same path.

Environment Variables

- OMNIDOCS_MODELS_DIR: Primary cache directory for all OmniDocs models. Overwrites HF_HOME when set.
- HF_HOME: HuggingFace cache directory (used as fallback).

Example

Shell:
export OMNIDOCS_MODELS_DIR=/data/models

Python:
from omnidocs.utils.cache import get_model_cache_dir

cache_dir = get_model_cache_dir()  # -> /data/models

get_model_cache_dir

get_model_cache_dir(
    custom_dir: Optional[str] = None,
) -> Path

Get unified model cache directory.

Priority order:

1. custom_dir parameter (if provided)
2. OMNIDOCS_MODELS_DIR environment variable
3. HF_HOME environment variable
4. Default: ~/.cache/huggingface

PARAMETER DESCRIPTION
custom_dir

Optional custom cache directory path. Overrides environment variables if provided.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
Path

Path object pointing to the cache directory. Directory is created if it doesn't exist.

Source code in omnidocs/utils/cache.py
def get_model_cache_dir(custom_dir: Optional[str] = None) -> Path:
    """
    Get unified model cache directory.

    Priority order (the first non-empty value wins):
    1. custom_dir parameter (if provided)
    2. OMNIDOCS_MODELS_DIR environment variable
    3. HF_HOME environment variable
    4. Default: ~/.cache/huggingface

    Args:
        custom_dir: Optional custom cache directory path.
                   Overrides environment variables if provided.

    Returns:
        Path object pointing to the cache directory.
        Directory is created if it doesn't exist.
    """
    # Chain with `or` rather than nested ``environ.get`` defaults: a variable
    # that is set but empty must fall through to the next source instead of
    # producing Path(""), which resolves to the current working directory.
    cache_dir = (
        custom_dir
        or os.environ.get("OMNIDOCS_MODELS_DIR")
        or os.environ.get("HF_HOME")
        or "~/.cache/huggingface"
    )

    path = Path(cache_dir).expanduser().resolve()
    path.mkdir(parents=True, exist_ok=True)
    return path

configure_backend_cache

configure_backend_cache(
    cache_dir: Optional[str] = None,
) -> None

Configure cache directories for all backends.

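This always OVERWRITES HF_HOME and TRANSFORMERS_CACHE with the directory resolved by get_model_cache_dir() (cache_dir if passed, otherwise OMNIDOCS_MODELS_DIR, then the existing HF_HOME, then ~/.cache/huggingface), so every backend downloads to the same place.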

This is called automatically on import omnidocs.

PARAMETER DESCRIPTION
cache_dir

Optional cache directory path. If None, uses get_model_cache_dir().

TYPE: Optional[str] DEFAULT: None

Source code in omnidocs/utils/cache.py
def configure_backend_cache(cache_dir: Optional[str] = None) -> None:
    """
    Point every backend's model cache at one shared directory.

    The directory is resolved through get_model_cache_dir(cache_dir), and
    HF_HOME and TRANSFORMERS_CACHE are then OVERWRITTEN with it so every
    backend downloads to the same place.

    This is called automatically on ``import omnidocs``.

    Args:
        cache_dir: Optional cache directory path. If None, uses get_model_cache_dir().
    """
    import sys

    resolved = str(get_model_cache_dir(cache_dir))

    # Overwrite HF_HOME so PyTorch, MLX, VLLM, and snapshot_download all use it
    for env_name in ("HF_HOME", "TRANSFORMERS_CACHE"):
        os.environ[env_name] = resolved

    # huggingface_hub caches HF_HUB_CACHE at import time; when it has not been
    # imported yet the environment variables above are all that is needed.
    if "huggingface_hub.constants" not in sys.modules:
        return

    # Already imported: patch the constants directly for hf_hub_download etc.
    import huggingface_hub.constants as hf_constants

    hub_dir = os.path.join(resolved, "hub")
    hf_constants.HF_HOME = resolved
    hf_constants.HF_HUB_CACHE = hub_dir
    hf_constants.HUGGINGFACE_HUB_CACHE = hub_dir

get_storage_info

get_storage_info() -> dict

Get current cache directory configuration information.

RETURNS DESCRIPTION
dict

Dictionary with cache paths and environment variable values.

Source code in omnidocs/utils/cache.py
def get_storage_info() -> dict:
    """
    Report how the model cache is currently configured.

    The cache path is obtained by calling get_model_cache_dir(); the other
    entries are the raw environment variable values (None when unset).

    Returns:
        Dictionary with cache paths and environment variable values.
    """
    env = os.environ
    info = {"omnidocs_cache": str(get_model_cache_dir())}
    info["omnidocs_models_dir_env"] = env.get("OMNIDOCS_MODELS_DIR")
    info["hf_home"] = env.get("HF_HOME")
    info["transformers_cache"] = env.get("TRANSFORMERS_CACHE")
    return info