Base¶

Base class for table extractors.

Defines the abstract interface that all table extractors must implement.

BaseTableExtractor ¶

Bases: ABC

Abstract base class for table structure extractors.

Table extractors analyze table images to detect cell structure, identify headers, and extract text content.

Example

class MyTableExtractor(BaseTableExtractor):
    """Minimal example of a concrete table extractor."""

    def __init__(self, config: MyConfig):
        # Keep the config around and load the model eagerly at construction.
        self.config = config
        self._load_model()

    def _load_model(self):
        """Load whatever the extractor needs before extract() is called."""
        # Load model weights
        pass

    def extract(self, image, ocr_output=None):
        """Extract the table structure from a single table image.

        The signature mirrors the abstract ``extract`` method: ``ocr_output``
        must be accepted because ``batch_extract`` forwards it as a keyword
        argument on every call.
        """
        # Run extraction
        return TableOutput(...)

extract `abstractmethod` ¶

extract(
    image: Union[Image, ndarray, str, Path],
    ocr_output: Optional[OCROutput] = None,
) -> TableOutput

Extract table structure from an image.

PARAMETER	DESCRIPTION
`image`	Table image (should be cropped to table region) TYPE: `Union[Image, ndarray, str, Path]`
`ocr_output`	Optional OCR results for cell text matching. If not provided, model will attempt to extract text. TYPE: `Optional[OCROutput]` DEFAULT: `None`

RETURNS	DESCRIPTION
`TableOutput`	TableOutput with cells, structure, and export methods

Example

# Without OCR (model extracts text)
result = extractor.extract(table_image)

# With OCR (better text quality)
ocr = some_ocr.extract(table_image)
result = extractor.extract(table_image, ocr_output=ocr)

Source code in omnidocs/tasks/table_extraction/base.py

@abstractmethod
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    ocr_output: Optional["OCROutput"] = None,
) -> TableOutput:
    """
    Extract the structure of a single table from an image.

    Abstract hook: the base class provides no implementation, so every
    concrete extractor has to override this method.

    Args:
        image: Table image (should be cropped to table region)
        ocr_output: Optional OCR results for cell text matching.
                   If not provided, model will attempt to extract text.

    Returns:
        TableOutput with cells, structure, and export methods

    Example:
        ```python
        # Without OCR (model extracts text)
        result = extractor.extract(table_image)

        # With OCR (better text quality)
        ocr = some_ocr.extract(table_image)
        result = extractor.extract(table_image, ocr_output=ocr)
        ```
    """

batch_extract ¶

batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    ocr_outputs: Optional[List[OCROutput]] = None,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TableOutput]

Extract tables from multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching.

PARAMETER	DESCRIPTION
`images`	List of table images TYPE: `List[Union[Image, ndarray, str, Path]]`
`ocr_outputs`	Optional list of OCR results (same length as images) TYPE: `Optional[List[OCROutput]]` DEFAULT: `None`
`progress_callback`	Optional function(current, total) for progress TYPE: `Optional[Callable[[int, int], None]]` DEFAULT: `None`

RETURNS	DESCRIPTION
`List[TableOutput]`	List of TableOutput in same order as input

Examples:

results = extractor.batch_extract(table_images)

Source code in omnidocs/tasks/table_extraction/base.py

def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    ocr_outputs: Optional[List["OCROutput"]] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TableOutput]:
    """
    Extract tables from multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching.

    Args:
        images: List of table images
        ocr_outputs: Optional list of OCR results (same length as images).
                    None or an empty list means no OCR result is passed
                    for any image.
        progress_callback: Optional function(current, total) for progress.
                          Called with the 1-based index of each image just
                          before that image is extracted.

    Returns:
        List of TableOutput in same order as input

    Raises:
        ValueError: If ocr_outputs is non-empty and its length differs
            from the number of images.

    Examples:
        ```python
        results = extractor.batch_extract(table_images)
        ```
    """
    total = len(images)

    # Validate up front so a mismatch fails before any extraction work is
    # done, rather than as an IndexError part-way through the batch (or a
    # silent truncation when too many OCR results are supplied).
    if ocr_outputs and len(ocr_outputs) != total:
        raise ValueError(
            f"ocr_outputs has {len(ocr_outputs)} items but images has {total}; "
            "they must be the same length"
        )

    # A falsy ocr_outputs (None or empty) means every image gets ocr_output=None.
    per_image_ocr = ocr_outputs if ocr_outputs else [None] * total

    results = []
    for position, (image, ocr) in enumerate(zip(images, per_image_ocr), start=1):
        if progress_callback:
            progress_callback(position, total)

        results.append(self.extract(image, ocr_output=ocr))

    return results

extract_document ¶

extract_document(
    document: Document,
    table_bboxes: Optional[List[List[float]]] = None,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[TableOutput]

Extract tables from all pages of a document.

PARAMETER	DESCRIPTION
`document`	Document instance TYPE: `Document`
`table_bboxes`	Optional list of table bounding boxes; the same boxes are applied to every page. Each element should be a list of [x1, y1, x2, y2] coords. If omitted, each page is processed as a whole. TYPE: `Optional[List[List[float]]]` DEFAULT: `None`
`progress_callback`	Optional function(current, total) for progress TYPE: `Optional[Callable[[int, int], None]]` DEFAULT: `None`

RETURNS	DESCRIPTION
`List[TableOutput]`	List of TableOutput: one per page when no bounding boxes are given, otherwise one per bounding box on each page

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)

Source code in omnidocs/tasks/table_extraction/base.py

def extract_document(
    self,
    document: "Document",
    table_bboxes: Optional[List[List[float]]] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[TableOutput]:
    """
    Run table extraction over every page of a document.

    Pages are visited in the order produced by ``document.iter_pages()``.
    Without ``table_bboxes`` each page is passed to extract() as a whole;
    with ``table_bboxes`` every box is cropped out of every page and each
    crop is passed to extract().

    Args:
        document: Document instance
        table_bboxes: Optional list of [x1, y1, x2, y2] boxes. The same
                     list is applied to each page of the document.
        progress_callback: Optional function(current, total) for progress.
                          Called once per page, before the page is processed,
                          with the 1-based page index and
                          ``document.page_count``.

    Returns:
        List of TableOutput: one per page when table_bboxes is None,
        otherwise one per box for each page, in page order.

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc)
        ```
    """
    outputs = []
    page_total = document.page_count

    for page_number, page in enumerate(document.iter_pages(), start=1):
        if progress_callback:
            progress_callback(page_number, page_total)

        # No boxes supplied: treat the whole page as the table image.
        if table_bboxes is None:
            outputs.append(self.extract(page))
            continue

        # Boxes supplied: crop each region out of this page and extract it.
        for x1, y1, x2, y2 in table_bboxes:
            cropped = page.crop((x1, y1, x2, y2))
            outputs.append(self.extract(cropped))

    return outputs

Base¶

BaseTableExtractor ¶

extract abstractmethod ¶

batch_extract ¶

extract_document ¶

extract `abstractmethod` ¶