Skip to content

Aggregation

Result aggregation utilities for batch processing.

Provides containers and utilities for storing, aggregating, and exporting results from batch document processing.

DocumentResult

DocumentResult(
    source_path: Optional[str] = None, page_count: int = 0
)

Container for results from processing a single document.

Stores results by page for easy access and serialization.

Examples:

doc_result = DocumentResult(source_path="paper.pdf", page_count=10)
doc_result.add_page_result(0, text_output)
doc_result.add_page_result(1, text_output)

# Access results
all_results = doc_result.all_results
page_0_result = doc_result.get_page_result(0)

# Save to file
doc_result.save_json("paper_result.json")

Initialize DocumentResult.

PARAMETER DESCRIPTION
source_path

Path to source document

TYPE: Optional[str] DEFAULT: None

page_count

Total number of pages

TYPE: int DEFAULT: 0

Source code in omnidocs/utils/aggregation.py
def __init__(
    self,
    source_path: Optional[str] = None,
    page_count: int = 0,
):
    """
    Create an empty result container for one document.

    Args:
        source_path: Path to the source document, or None if not given
        page_count: Total number of pages; stored exactly as passed
    """
    # Per-page results keyed by page number; starts out empty.
    self._page_results: Dict[int, Any] = {}
    self.page_count = page_count
    self.source_path = source_path

all_results property

all_results: List[Any]

Get all results in page order.

RETURNS DESCRIPTION
List[Any]

List of results sorted by page number

processed_pages property

processed_pages: int

Number of pages with results.

add_page_result

add_page_result(page_num: int, result: Any) -> None

Add result for a specific page.

PARAMETER DESCRIPTION
page_num

Page number (0-indexed)

TYPE: int

result

Extraction result (TextOutput, LayoutOutput, etc.)

TYPE: Any

Source code in omnidocs/utils/aggregation.py
def add_page_result(self, page_num: int, result: Any) -> None:
    """
    Record the result produced for one page.

    A result already stored under the same page number is replaced.

    Args:
        page_num: 0-indexed page number the result belongs to
        result: Extraction result (TextOutput, LayoutOutput, etc.)
    """
    page_store = self._page_results
    page_store[page_num] = result

get_page_result

get_page_result(page_num: int) -> Optional[Any]

Get result for a specific page.

PARAMETER DESCRIPTION
page_num

Page number (0-indexed)

TYPE: int

RETURNS DESCRIPTION
Optional[Any]

Result for the page, or None if not found

Source code in omnidocs/utils/aggregation.py
def get_page_result(self, page_num: int) -> Optional[Any]:
    """
    Look up the result stored for one page.

    Args:
        page_num: 0-indexed page number to look up

    Returns:
        The stored result, or None when nothing is stored for that page
    """
    try:
        return self._page_results[page_num]
    except KeyError:
        # No entry for this page: report absence instead of raising.
        return None

to_dict

to_dict() -> dict

Convert to dictionary for serialization.

RETURNS DESCRIPTION
dict

Dictionary representation

Source code in omnidocs/utils/aggregation.py
def to_dict(self) -> dict:
    """
    Convert to dictionary for serialization.

    Each page result is converted with the first strategy that applies:
    its ``model_dump()`` method, its ``to_dict()`` method, a shallow copy
    of its instance ``__dict__``, or ``str()`` as a last resort. Page
    numbers are converted to string keys.

    Returns:
        Dictionary with the keys ``source_path``, ``page_count``,
        ``processed_pages`` and ``results``
    """
    results_dict = {}
    for page_num, page_result in self._page_results.items():
        if hasattr(page_result, "model_dump"):
            converted = page_result.model_dump()
        elif hasattr(page_result, "to_dict"):
            converted = page_result.to_dict()
        elif hasattr(page_result, "__dict__"):
            # Copy instead of handing out the live attribute dict, so that
            # editing the returned dictionary cannot change the stored
            # result object.
            converted = dict(page_result.__dict__)
        else:
            converted = str(page_result)
        results_dict[str(page_num)] = converted

    return {
        "source_path": self.source_path,
        "page_count": self.page_count,
        "processed_pages": self.processed_pages,
        "results": results_dict,
    }

save_json

save_json(path: str) -> None

Save results to JSON file.

PARAMETER DESCRIPTION
path

Output file path

TYPE: str

Source code in omnidocs/utils/aggregation.py
def save_json(self, path: str) -> None:
    """
    Write the dictionary form of this result to a JSON file.

    Missing parent directories are created first. The output is indented
    by two spaces, and values the JSON encoder cannot handle natively are
    written as their ``str()`` form.

    Args:
        path: Output file path
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w") as handle:
        json.dump(self.to_dict(), handle, indent=2, default=str)

BatchResult

BatchResult()

Container for results from processing multiple documents.

Examples:

batch_result = BatchResult()
batch_result.add_document_result("doc1", doc_result1)
batch_result.add_document_result("doc2", doc_result2)

# Access results
doc1_result = batch_result.get_document_result("doc1")
all_ids = batch_result.document_ids

# Save all results
batch_result.save_json("all_results.json")

Initialize empty BatchResult.

Source code in omnidocs/utils/aggregation.py
def __init__(self):
    """Create a batch container that holds no document results yet."""
    # Document results keyed by document ID.
    self._document_results: Dict[str, DocumentResult] = dict()

document_ids property

document_ids: List[str]

List of document IDs.

document_count property

document_count: int

Number of documents processed.

total_pages property

total_pages: int

Total pages across all documents.

add_document_result

add_document_result(
    doc_id: str, result: DocumentResult
) -> None

Add result for a document.

PARAMETER DESCRIPTION
doc_id

Document identifier (usually filename without extension)

TYPE: str

result

DocumentResult instance

TYPE: DocumentResult

Source code in omnidocs/utils/aggregation.py
def add_document_result(self, doc_id: str, result: DocumentResult) -> None:
    """
    Store the result of one processed document under its identifier.

    A result already stored under the same identifier is replaced.

    Args:
        doc_id: Document identifier (usually filename without extension)
        result: DocumentResult instance
    """
    documents = self._document_results
    documents[doc_id] = result

get_document_result

get_document_result(
    doc_id: str,
) -> Optional[DocumentResult]

Get result for a specific document.

PARAMETER DESCRIPTION
doc_id

Document identifier

TYPE: str

RETURNS DESCRIPTION
Optional[DocumentResult]

DocumentResult or None if not found

Source code in omnidocs/utils/aggregation.py
def get_document_result(self, doc_id: str) -> Optional[DocumentResult]:
    """
    Look up the result stored for one document.

    Args:
        doc_id: Document identifier

    Returns:
        The stored DocumentResult, or None when the identifier is unknown
    """
    try:
        return self._document_results[doc_id]
    except KeyError:
        # Unknown identifier: report absence instead of raising.
        return None

to_dict

to_dict() -> dict

Convert to dictionary.

RETURNS DESCRIPTION
dict

Dictionary representation

Source code in omnidocs/utils/aggregation.py
def to_dict(self) -> dict:
    """
    Build a dictionary summary of the whole batch.

    Returns:
        Dictionary with ``document_count``, ``total_pages`` and a
        ``documents`` mapping from each document ID to the ``to_dict()``
        output of its stored result
    """
    summary = {
        "document_count": self.document_count,
        "total_pages": self.total_pages,
    }
    documents = {}
    for doc_id, doc_result in self._document_results.items():
        documents[doc_id] = doc_result.to_dict()
    summary["documents"] = documents
    return summary

save_json

save_json(path: str) -> None

Save all results to JSON file.

PARAMETER DESCRIPTION
path

Output file path

TYPE: str

Source code in omnidocs/utils/aggregation.py
def save_json(self, path: str) -> None:
    """
    Write the dictionary form of the whole batch to a JSON file.

    Parent directories that do not exist yet are created. The file is
    written with two-space indentation; values the JSON encoder cannot
    handle natively are stored as their ``str()`` form.

    Args:
        path: Output file path
    """
    target = Path(path)
    folder = target.parent
    folder.mkdir(parents=True, exist_ok=True)
    with target.open("w") as stream:
        json.dump(self.to_dict(), stream, indent=2, default=str)

merge_text_results

merge_text_results(
    results: List[Any], separator: str = "\n\n"
) -> str

Merge multiple TextOutput results into a single string.

PARAMETER DESCRIPTION
results

List of TextOutput (or objects with .content attribute)

TYPE: List[Any]

separator

String to join pages (default: double newline)

TYPE: str DEFAULT: '\n\n'

RETURNS DESCRIPTION
str

Combined content string

Examples:

all_results = doc_result.all_results
full_text = merge_text_results(all_results)
full_text_with_dividers = merge_text_results(all_results, separator="\n\n---\n\n")
Source code in omnidocs/utils/aggregation.py
def merge_text_results(results: List[Any], separator: str = "\n\n") -> str:
    """
    Join the text of several page results into one string.

    An item contributes its ``content`` attribute when it has one and the
    value is truthy. Otherwise a non-empty plain string contributes
    itself. Every other item is skipped.

    Args:
        results: List of TextOutput (or objects with .content attribute)
        separator: String placed between pages (default: double newline)

    Returns:
        Combined content string

    Examples:
        ```python
        all_results = doc_result.all_results
        full_text = merge_text_results(all_results)
        full_text_with_dividers = merge_text_results(all_results, separator="\\n\\n---\\n\\n")
        ```
    """
    pieces = []
    for item in results:
        if hasattr(item, "content"):
            text = item.content
            if text:
                pieces.append(text)
                continue
        # Reached for items without usable content: accept bare strings.
        if isinstance(item, str) and item:
            pieces.append(item)
    return separator.join(pieces)