Aggregation¶

Result aggregation utilities for batch processing.

Provides containers and utilities for storing, aggregating, and exporting results from batch document processing.

DocumentResult ¶

DocumentResult(
    source_path: Optional[str] = None, page_count: int = 0
)

Container for results from processing a single document.

Stores results by page for easy access and serialization.

Examples:

doc_result = DocumentResult(source_path="paper.pdf", page_count=10)
doc_result.add_page_result(0, text_output)
doc_result.add_page_result(1, text_output)

# Access results
all_results = doc_result.all_results
page_0_result = doc_result.get_page_result(0)

# Save to file
doc_result.save_json("paper_result.json")

Initialize DocumentResult.

PARAMETER	DESCRIPTION
`source_path`	Path to source document TYPE: `Optional[str]` DEFAULT: `None`
`page_count`	Total number of pages TYPE: `int` DEFAULT: `0`

Source code in omnidocs/utils/aggregation.py

def __init__(
    self,
    source_path: Optional[str] = None,
    page_count: int = 0,
):
    """
    Set up an empty result container for one document.

    Args:
        source_path: Path to source document
        page_count: Total number of pages
    """
    # Document-level metadata, stored exactly as given.
    self.source_path, self.page_count = source_path, page_count
    # Per-page results keyed by page number; filled in later by add_page_result.
    self._page_results: Dict[int, Any] = dict()

all_results `property` ¶

all_results: List[Any]

Get all results in page order.

RETURNS	DESCRIPTION
`List[Any]`	List of results sorted by page number

processed_pages `property` ¶

processed_pages: int

Number of pages with results.

add_page_result ¶

add_page_result(page_num: int, result: Any) -> None

Add result for a specific page.

PARAMETER	DESCRIPTION
`page_num`	Page number (0-indexed) TYPE: `int`
`result`	Extraction result (TextOutput, LayoutOutput, etc.) TYPE: `Any`

Source code in omnidocs/utils/aggregation.py

def add_page_result(self, page_num: int, result: Any) -> None:
    """
    Store the result for one page, replacing any result already held for it.

    Args:
        page_num: Page number (0-indexed)
        result: Extraction result (TextOutput, LayoutOutput, etc.)
    """
    page_results = self._page_results
    page_results.update({page_num: result})

get_page_result ¶

get_page_result(page_num: int) -> Optional[Any]

Get result for a specific page.

PARAMETER	DESCRIPTION
`page_num`	Page number (0-indexed) TYPE: `int`

RETURNS	DESCRIPTION
`Optional[Any]`	Result for the page, or None if not found

Source code in omnidocs/utils/aggregation.py

def get_page_result(self, page_num: int) -> Optional[Any]:
    """
    Look up the stored result for one page.

    Args:
        page_num: Page number (0-indexed)

    Returns:
        The result stored for that page, or None when the page has no result
    """
    page_results = self._page_results
    return page_results.get(page_num, None)

to_dict ¶

to_dict() -> dict

Convert to dictionary for serialization.

RETURNS	DESCRIPTION
`dict`	Dictionary representation

Source code in omnidocs/utils/aggregation.py

def to_dict(self) -> dict:
    """
    Convert to dictionary for serialization.

    Each page result is converted with the first strategy it supports:
    ``model_dump()``, then ``to_dict()``, then a shallow copy of its
    ``__dict__``, and finally ``str()``. Page numbers become string keys,
    in the order the pages were stored.

    Returns:
        Dictionary with the keys "source_path", "page_count",
        "processed_pages" and "results"
    """
    results_dict = {}
    for page_num, page_result in self._page_results.items():
        if hasattr(page_result, "model_dump"):
            payload = page_result.model_dump()
        elif hasattr(page_result, "to_dict"):
            payload = page_result.to_dict()
        elif hasattr(page_result, "__dict__"):
            # Copy rather than expose the live attribute dict: otherwise a
            # caller editing the serialized output would silently modify
            # the stored result object.
            payload = dict(page_result.__dict__)
        else:
            payload = str(page_result)
        results_dict[str(page_num)] = payload

    return {
        "source_path": self.source_path,
        "page_count": self.page_count,
        "processed_pages": self.processed_pages,
        "results": results_dict,
    }

save_json ¶

save_json(path: str) -> None

Save results to JSON file.

PARAMETER	DESCRIPTION
`path`	Output file path TYPE: `str`

Source code in omnidocs/utils/aggregation.py

def save_json(self, path: str) -> None:
    """
    Write this document's results to a JSON file.

    Missing parent directories of the target are created first. Values the
    JSON encoder cannot handle natively are written as their ``str()`` form.

    Args:
        path: Output file path
    """
    target = Path(path)
    # Ensure the destination directory exists before opening the file.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(self.to_dict(), handle, indent=2, default=str)

BatchResult ¶

BatchResult()

Container for results from processing multiple documents.

Examples:

batch_result = BatchResult()
batch_result.add_document_result("doc1", doc_result1)
batch_result.add_document_result("doc2", doc_result2)

# Access results
doc1_result = batch_result.get_document_result("doc1")
all_ids = batch_result.document_ids

# Save all results
batch_result.save_json("all_results.json")

Initialize empty BatchResult.

Source code in omnidocs/utils/aggregation.py

def __init__(self):
    """Create a batch container that holds no document results yet."""
    # Maps document id -> DocumentResult.
    self._document_results: Dict[str, DocumentResult] = dict()

document_ids `property` ¶

document_ids: List[str]

List of document IDs.

document_count `property` ¶

document_count: int

Number of documents processed.

total_pages `property` ¶

total_pages: int

Total pages across all documents.

add_document_result ¶

add_document_result(
    doc_id: str, result: DocumentResult
) -> None

Add result for a document.

PARAMETER	DESCRIPTION
`doc_id`	Document identifier (usually filename without extension) TYPE: `str`
`result`	DocumentResult instance TYPE: `DocumentResult`

Source code in omnidocs/utils/aggregation.py

def add_document_result(self, doc_id: str, result: DocumentResult) -> None:
    """
    Store the result for one document, replacing any result already held under the same id.

    Args:
        doc_id: Document identifier (usually filename without extension)
        result: DocumentResult instance
    """
    documents = self._document_results
    documents.update({doc_id: result})

get_document_result ¶

get_document_result(
    doc_id: str,
) -> Optional[DocumentResult]

Get result for a specific document.

PARAMETER	DESCRIPTION
`doc_id`	Document identifier TYPE: `str`

RETURNS	DESCRIPTION
`Optional[DocumentResult]`	DocumentResult or None if not found

Source code in omnidocs/utils/aggregation.py

def get_document_result(self, doc_id: str) -> Optional[DocumentResult]:
    """
    Look up the stored result for one document.

    Args:
        doc_id: Document identifier

    Returns:
        The DocumentResult stored under that id, or None when there is none
    """
    documents = self._document_results
    return documents.get(doc_id, None)

to_dict ¶

to_dict() -> dict

Convert to dictionary.

RETURNS	DESCRIPTION
`dict`	Dictionary representation

Source code in omnidocs/utils/aggregation.py

def to_dict(self) -> dict:
    """
    Build a plain-dictionary view of the whole batch.

    Returns:
        Dictionary with the keys "document_count", "total_pages" and
        "documents", where "documents" maps each document id to the
        output of that document result's ``to_dict()``
    """
    summary = {
        "document_count": self.document_count,
        "total_pages": self.total_pages,
    }

    # Convert each stored document result, keeping the stored order.
    documents = {}
    for doc_id, doc_result in self._document_results.items():
        documents[doc_id] = doc_result.to_dict()

    summary["documents"] = documents
    return summary

save_json ¶

save_json(path: str) -> None

Save all results to JSON file.

PARAMETER	DESCRIPTION
`path`	Output file path TYPE: `str`

Source code in omnidocs/utils/aggregation.py

def save_json(self, path: str) -> None:
    """
    Write the results of every document in the batch to one JSON file.

    Missing parent directories of the target are created first. Values the
    JSON encoder cannot handle natively are written as their ``str()`` form.

    Args:
        path: Output file path
    """
    destination = Path(path)
    # Ensure the destination directory exists before opening the file.
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w") as stream:
        json.dump(self.to_dict(), stream, indent=2, default=str)

merge_text_results ¶

merge_text_results(
    results: List[Any], separator: str = "\n\n"
) -> str

Merge multiple TextOutput results into a single string.

PARAMETER	DESCRIPTION
`results`	List of TextOutput objects (or any objects with a `.content` attribute, or plain strings); entries with empty content are skipped TYPE: `List[Any]`
`separator`	String to join pages (default: double newline) TYPE: `str` DEFAULT: `'\n\n'`

RETURNS	DESCRIPTION
`str`	Combined content string

Examples:

all_results = doc_result.all_results
full_text = merge_text_results(all_results)
full_text_with_dividers = merge_text_results(all_results, separator="\n\n---\n\n")

Source code in omnidocs/utils/aggregation.py

def merge_text_results(results: List[Any], separator: str = "\n\n") -> str:
    """
    Join the text of several page results into one string.

    An item contributes its ``content`` attribute when it has one and that
    value is truthy; otherwise a non-empty plain string contributes itself.
    Every other item is skipped.

    Args:
        results: List of TextOutput (or objects with .content attribute)
        separator: String to join pages (default: double newline)

    Returns:
        Combined content string

    Examples:
        ```python
        all_results = doc_result.all_results
        full_text = merge_text_results(all_results)
        full_text_with_dividers = merge_text_results(all_results, separator="\\n\\n---\\n\\n")
        ```
    """

    def _text_of(item: Any) -> Any:
        # Prefer the item's own content; fall back to the item itself if it is a string.
        if hasattr(item, "content") and item.content:
            return item.content
        if isinstance(item, str) and item:
            return item
        return None

    extracted = (_text_of(item) for item in results)
    return separator.join(text for text in extracted if text is not None)

Aggregation¶

DocumentResult ¶

all_results property ¶

processed_pages property ¶

add_page_result ¶

get_page_result ¶

to_dict ¶

save_json ¶

BatchResult ¶

document_ids property ¶

document_count property ¶

total_pages property ¶

add_document_result ¶

get_document_result ¶

to_dict ¶

save_json ¶

merge_text_results ¶

all_results `property` ¶

processed_pages `property` ¶

document_ids `property` ¶

document_count `property` ¶

total_pages `property` ¶