Overview

OCR Extraction Module.

Provides extractors that detect text in document images, returning text content along with spatial coordinates (unlike Text Extraction, which returns formatted Markdown/HTML without coordinates).

Available Extractors
  • TesseractOCR: Open-source OCR (CPU, requires system Tesseract)
  • EasyOCR: PyTorch-based OCR (CPU/GPU, 80+ languages)
  • PaddleOCR: PaddlePaddle-based OCR (CPU/GPU, excellent CJK support)
Key Difference from Text Extraction
  • OCR Extraction: Text + Bounding Boxes (spatial location)
  • Text Extraction: Markdown/HTML (formatted document export)
Example
from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

ocr = TesseractOCR(config=TesseractOCRConfig(languages=["eng"]))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()} (conf: {block.confidence:.2f})")
# With EasyOCR
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

ocr = EasyOCR(config=EasyOCRConfig(languages=["en", "ch_sim"], gpu=True))
result = ocr.extract(image)
# With PaddleOCR
from omnidocs.tasks.ocr_extraction import PaddleOCR, PaddleOCRConfig

ocr = PaddleOCR(config=PaddleOCRConfig(lang="ch", device="cpu"))
result = ocr.extract(image)

BaseOCRExtractor

Bases: ABC

Abstract base class for OCR extractors.

All OCR extraction models must inherit from this class and implement the required methods.

Example
class MyOCRExtractor(BaseOCRExtractor):
    def __init__(self, config: MyConfig):
        self.config = config
        self._load_model()

    def _load_model(self):
        # Initialize OCR engine
        pass

    def extract(self, image):
        # Run OCR extraction
        return OCROutput(...)

extract abstractmethod

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR extraction on an image.

PARAMETER DESCRIPTION
image

Input image as:
  • PIL.Image.Image: PIL image object
  • np.ndarray: NumPy array (HWC format, RGB)
  • str or Path: Path to image file

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput containing detected text blocks with bounding boxes

RAISES DESCRIPTION
ValueError

If image format is not supported

RuntimeError

If OCR engine is not initialized or extraction fails

Source code in omnidocs/tasks/ocr_extraction/base.py
@abstractmethod
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR extraction on an image.

    Args:
        image: Input image as:
            - PIL.Image.Image: PIL image object
            - np.ndarray: Numpy array (HWC format, RGB)
            - str or Path: Path to image file

    Returns:
        OCROutput containing detected text blocks with bounding boxes

    Raises:
        ValueError: If image format is not supported
        RuntimeError: If OCR engine is not initialized or extraction fails
    """
    pass

batch_extract

batch_extract(
    images: List[Union[Image, ndarray, str, Path]],
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[OCROutput]

Run OCR extraction on multiple images.

Default implementation loops over extract(). Subclasses can override for optimized batching.

PARAMETER DESCRIPTION
images

List of images in any supported format

TYPE: List[Union[Image, ndarray, str, Path]]

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[OCROutput]

List of OCROutput in same order as input

Examples:

images = [doc.get_page(i) for i in range(doc.page_count)]
results = extractor.batch_extract(images)
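The callback receives (current, total) before each image is processed; a minimal sketch wiring one in (report is an illustrative name):

def report(current: int, total: int) -> None:
    print(f"OCR page {current}/{total}")

results = extractor.batch_extract(images, progress_callback=report)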
Source code in omnidocs/tasks/ocr_extraction/base.py
def batch_extract(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[OCROutput]:
    """
    Run OCR extraction on multiple images.

    Default implementation loops over extract(). Subclasses can override
    for optimized batching.

    Args:
        images: List of images in any supported format
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of OCROutput in same order as input

    Examples:
        ```python
        images = [doc.get_page(i) for i in range(doc.page_count)]
        results = extractor.batch_extract(images)
        ```
    """
    results = []
    total = len(images)

    for i, image in enumerate(images):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(image)
        results.append(result)

    return results

extract_document

extract_document(
    document: Document,
    progress_callback: Optional[
        Callable[[int, int], None]
    ] = None,
) -> List[OCROutput]

Run OCR extraction on all pages of a document.

PARAMETER DESCRIPTION
document

Document instance

TYPE: Document

progress_callback

Optional function(current, total) for progress

TYPE: Optional[Callable[[int, int], None]] DEFAULT: None

RETURNS DESCRIPTION
List[OCROutput]

List of OCROutput, one per page

Examples:

doc = Document.from_pdf("paper.pdf")
results = extractor.extract_document(doc)
Source code in omnidocs/tasks/ocr_extraction/base.py
def extract_document(
    self,
    document: "Document",
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[OCROutput]:
    """
    Run OCR extraction on all pages of a document.

    Args:
        document: Document instance
        progress_callback: Optional function(current, total) for progress

    Returns:
        List of OCROutput, one per page

    Examples:
        ```python
        doc = Document.from_pdf("paper.pdf")
        results = extractor.extract_document(doc)
        ```
    """
    results = []
    total = document.page_count

    for i, page in enumerate(document.iter_pages()):
        if progress_callback:
            progress_callback(i + 1, total)

        result = self.extract(page)
        results.append(result)

    return results

EasyOCR

EasyOCR(config: EasyOCRConfig)

Bases: BaseOCRExtractor

EasyOCR text extractor.

Single-backend model (PyTorch - CPU/GPU).

Example
from omnidocs.tasks.ocr_extraction import EasyOCR, EasyOCRConfig

ocr = EasyOCR(config=EasyOCRConfig(languages=["en"], gpu=True))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize EasyOCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: EasyOCRConfig

RAISES DESCRIPTION
ImportError

If easyocr is not installed

Source code in omnidocs/tasks/ocr_extraction/easyocr.py
def __init__(self, config: EasyOCRConfig):
    """
    Initialize EasyOCR extractor.

    Args:
        config: Configuration object

    Raises:
        ImportError: If easyocr is not installed
    """
    self.config = config
    self._reader = None
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
    detail: int = 1,
    paragraph: bool = False,
    min_size: int = 10,
    text_threshold: float = 0.7,
    low_text: float = 0.4,
    link_threshold: float = 0.4,
    canvas_size: int = 2560,
    mag_ratio: float = 1.0,
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

detail

0 = simple output, 1 = detailed with boxes

TYPE: int DEFAULT: 1

paragraph

Combine results into paragraphs

TYPE: bool DEFAULT: False

min_size

Minimum text box size

TYPE: int DEFAULT: 10

text_threshold

Text confidence threshold

TYPE: float DEFAULT: 0.7

low_text

Text low-bound score for detection

TYPE: float DEFAULT: 0.4

link_threshold

Link threshold for text joining

TYPE: float DEFAULT: 0.4

canvas_size

Max image dimension for processing

TYPE: int DEFAULT: 2560

mag_ratio

Magnification ratio

TYPE: float DEFAULT: 1.0

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks

RAISES DESCRIPTION
ValueError

If detail is not 0 or 1

RuntimeError

If EasyOCR is not initialized
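
Examples:

All parameters beyond image keep the defaults shown above; a sketch that trades precision for recall on small text, continuing the EasyOCR example above (threshold values are illustrative, not tuned recommendations):

result = ocr.extract(
    image,
    text_threshold=0.6,  # accept weaker text evidence
    low_text=0.3,
    mag_ratio=1.5,       # upscale before detection
    paragraph=True,      # merge results into line-level blocks
)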

Source code in omnidocs/tasks/ocr_extraction/easyocr.py
def extract(
    self,
    image: Union[Image.Image, np.ndarray, str, Path],
    detail: int = 1,
    paragraph: bool = False,
    min_size: int = 10,
    text_threshold: float = 0.7,
    low_text: float = 0.4,
    link_threshold: float = 0.4,
    canvas_size: int = 2560,
    mag_ratio: float = 1.0,
) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)
        detail: 0 = simple output, 1 = detailed with boxes
        paragraph: Combine results into paragraphs
        min_size: Minimum text box size
        text_threshold: Text confidence threshold
        low_text: Low text bound
        link_threshold: Link threshold for text joining
        canvas_size: Max image dimension for processing
        mag_ratio: Magnification ratio

    Returns:
        OCROutput with detected text blocks

    Raises:
        ValueError: If detail is not 0 or 1
        RuntimeError: If EasyOCR is not initialized
    """
    if self._reader is None:
        raise RuntimeError("EasyOCR not initialized. Call _load_model() first.")

    # Validate detail parameter
    if detail not in (0, 1):
        raise ValueError(f"detail must be 0 or 1, got {detail}")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Convert to numpy array for EasyOCR
    image_array = np.array(pil_image)

    # Run EasyOCR
    results = self._reader.readtext(
        image_array,
        detail=detail,
        paragraph=paragraph,
        min_size=min_size,
        text_threshold=text_threshold,
        low_text=low_text,
        link_threshold=link_threshold,
        canvas_size=canvas_size,
        mag_ratio=mag_ratio,
    )

    # Parse results
    text_blocks = []
    full_text_parts = []

    for result in results:
        if detail == 0:
            # Simple output: just text
            text = result
            confidence = 1.0
            bbox = BoundingBox(x1=0, y1=0, x2=0, y2=0)
            polygon = None
        else:
            # Detailed output: [polygon, text, confidence]
            polygon_points, text, confidence = result

            # EasyOCR returns 4 corner points: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
            # Convert to list of lists for storage
            polygon = [list(p) for p in polygon_points]

            # Convert to axis-aligned bounding box
            bbox = BoundingBox.from_polygon(polygon)

        if not text.strip():
            continue

        text_blocks.append(
            TextBlock(
                text=text,
                bbox=bbox,
                confidence=float(confidence),
                granularity=(OCRGranularity.LINE if paragraph else OCRGranularity.WORD),
                polygon=polygon,
                language="+".join(self.config.languages),
            )
        )

        full_text_parts.append(text)

    # Sort by position
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return OCROutput(
        text_blocks=text_blocks,
        full_text=" ".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )

extract_batch

extract_batch(
    images: List[Union[Image, ndarray, str, Path]], **kwargs
) -> List[OCROutput]

Run OCR on multiple images.

PARAMETER DESCRIPTION
images

List of input images

TYPE: List[Union[Image, ndarray, str, Path]]

**kwargs

Arguments passed to extract()

DEFAULT: {}

RETURNS DESCRIPTION
List[OCROutput]

List of OCROutput objects

Source code in omnidocs/tasks/ocr_extraction/easyocr.py
def extract_batch(
    self,
    images: List[Union[Image.Image, np.ndarray, str, Path]],
    **kwargs,
) -> List[OCROutput]:
    """
    Run OCR on multiple images.

    Args:
        images: List of input images
        **kwargs: Arguments passed to extract()

    Returns:
        List of OCROutput objects
    """
    results = []
    for img in images:
        results.append(self.extract(img, **kwargs))
    return results

EasyOCRConfig

Bases: BaseModel

Configuration for EasyOCR extractor.

This is a single-backend model (PyTorch - CPU/GPU).

Example
config = EasyOCRConfig(languages=["en", "ch_sim"], gpu=True)
ocr = EasyOCR(config=config)

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right. For rotated text, use the polygon field in TextBlock instead.

Example
bbox = BoundingBox(x1=100, y1=50, x2=300, y2=80)
print(bbox.width, bbox.height)  # 200, 30
print(bbox.center)  # (200.0, 65.0)

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)

to_xywh

to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])

from_polygon classmethod

from_polygon(polygon: List[List[float]]) -> BoundingBox

Create axis-aligned bounding box from polygon points.

PARAMETER DESCRIPTION
polygon

List of [x, y] points (usually 4 for quadrilateral)

TYPE: List[List[float]]

RETURNS DESCRIPTION
BoundingBox

BoundingBox that encloses all polygon points
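
Examples:

An EasyOCR-style quadrilateral collapses to its enclosing axis-aligned box (the points here are illustrative):

quad = [[100, 50], [300, 55], [298, 85], [98, 80]]
bbox = BoundingBox.from_polygon(quad)
print(bbox.to_list())  # [98.0, 50.0, 300.0, 85.0]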

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_polygon(cls, polygon: List[List[float]]) -> "BoundingBox":
    """
    Create axis-aligned bounding box from polygon points.

    Args:
        polygon: List of [x, y] points (usually 4 for quadrilateral)

    Returns:
        BoundingBox that encloses all polygon points
    """
    if not polygon:
        raise ValueError("Polygon cannot be empty")

    xs = [p[0] for p in polygon]
    ys = [p[1] for p in polygon]
    return cls(x1=min(xs), y1=min(ys), x2=max(xs), y2=max(ys))

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

to_absolute

to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates
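
Examples:

A round-trip sanity check on a 2048x1024 image (coordinates chosen for easy arithmetic):

bbox = BoundingBox(x1=512, y1=256, x2=1024, y2=512)
norm = bbox.to_normalized(2048, 1024)
print(norm.to_list())  # [256.0, 256.0, 512.0, 512.0]
print(norm.to_absolute(2048, 1024).to_list())  # [512.0, 256.0, 1024.0, 512.0]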

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )

OCRGranularity

Bases: str, Enum

OCR detection granularity levels.

Different OCR engines return results at different granularity levels. This enum standardizes the options across all extractors.

OCROutput

Bases: BaseModel

Complete OCR extraction results for a single image.

Contains all detected text blocks with their bounding boxes, plus metadata about the extraction.

Example
result = ocr.extract(image)
print(f"Found {result.block_count} blocks")
print(f"Full text: {result.full_text}")
for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

block_count property

block_count: int

Number of detected text blocks.

word_count property

word_count: int

Approximate word count from full text.

average_confidence property

average_confidence: float

Average confidence across all text blocks.

filter_by_confidence

filter_by_confidence(
    min_confidence: float,
) -> List[TextBlock]

Filter text blocks by minimum confidence.

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[TextBlock]:
    """Filter text blocks by minimum confidence."""
    return [b for b in self.text_blocks if b.confidence >= min_confidence]

filter_by_granularity

filter_by_granularity(
    granularity: OCRGranularity,
) -> List[TextBlock]

Filter text blocks by granularity level.
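
Examples:

Both filters return plain lists of TextBlock rather than new OCROutput objects, so further narrowing is ordinary list work:

# Keep blocks the engine is at least 80% confident about
good = result.filter_by_confidence(0.8)

# Of those, keep only word-level detections
words = [b for b in good if b.granularity == OCRGranularity.WORD]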

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_granularity(self, granularity: OCRGranularity) -> List[TextBlock]:
    """Filter text blocks by granularity level."""
    return [b for b in self.text_blocks if b.granularity == granularity]

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text_blocks": [b.to_dict() for b in self.text_blocks],
        "full_text": self.full_text,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "languages_detected": self.languages_detected,
        "block_count": self.block_count,
        "word_count": self.word_count,
        "average_confidence": self.average_confidence,
    }

sort_by_position

sort_by_position(top_to_bottom: bool = True) -> OCROutput

Return a new OCROutput with blocks sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
OCROutput

New OCROutput with sorted text blocks

Source code in omnidocs/tasks/ocr_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "OCROutput":
    """
    Return a new OCROutput with blocks sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)

    Returns:
        New OCROutput with sorted text blocks
    """
    sorted_blocks = sorted(
        self.text_blocks,
        key=lambda b: (b.bbox.y1, b.bbox.x1),
        reverse=not top_to_bottom,
    )
    # Regenerate full_text in sorted order
    full_text = " ".join(b.text for b in sorted_blocks)

    return OCROutput(
        text_blocks=sorted_blocks,
        full_text=full_text,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
        languages_detected=self.languages_detected,
    )

get_normalized_blocks

get_normalized_blocks() -> List[Dict]

Get all text blocks with normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.
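
Examples:

Normalized blocks suit downstream consumers that expect a fixed coordinate space; each dict carries the keys shown in the source below:

for item in result.get_normalized_blocks():
    x1, y1, x2, y2 = item["bbox"]  # all values in the 0-1024 range
    print(item["text"], item["confidence"])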

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_blocks(self) -> List[Dict]:
    """
    Get all text blocks with normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.
    """
    normalized = []
    for block in self.text_blocks:
        norm_bbox = block.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "text": block.text,
                "bbox": norm_bbox.to_list(),
                "confidence": block.confidence,
                "granularity": block.granularity.value,
                "language": block.language,
            }
        )
    return normalized

visualize

visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> Image.Image

Visualize OCR results on the image.

Draws bounding boxes around detected text with optional labels.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_text

Whether to show detected text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: False

line_width

Width of bounding box lines

TYPE: int DEFAULT: 2

box_color

Color for bounding boxes (hex)

TYPE: str DEFAULT: '#2ECC71'

text_color

Color for text labels (hex)

TYPE: str DEFAULT: '#000000'

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = ocr.extract(image)
viz = result.visualize(image, output_path="ocr_viz.png")
Source code in omnidocs/tasks/ocr_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> "Image.Image":
    """
    Visualize OCR results on the image.

    Draws bounding boxes around detected text with optional labels.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_text: Whether to show detected text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        box_color: Color for bounding boxes (hex)
        text_color: Color for text labels (hex)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = ocr.extract(image)
        viz = result.visualize(image, output_path="ocr_viz.png")
        ```
    """
    from PIL import ImageDraw, ImageFont

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    # Try to get a font
    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 12)
    except Exception:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        except Exception:
            font = ImageFont.load_default()

    for block in self.text_blocks:
        coords = block.bbox.to_xyxy()

        # Draw polygon if available, otherwise draw rectangle
        if block.polygon:
            flat_polygon = [coord for point in block.polygon for coord in point]
            draw.polygon(flat_polygon, outline=box_color, width=line_width)
        else:
            draw.rectangle(coords, outline=box_color, width=line_width)

        # Build label text
        if show_text or show_confidence:
            label_parts = []
            if show_text:
                # Truncate long text
                text = block.text[:25] + "..." if len(block.text) > 25 else block.text
                label_parts.append(text)
            if show_confidence:
                label_parts.append(f"{block.confidence:.2f}")
            label_text = " | ".join(label_parts)

            # Position label below the box
            label_x = coords[0]
            label_y = coords[3] + 2  # Below bottom edge

            # Draw label with background
            text_bbox = draw.textbbox((label_x, label_y), label_text, font=font)
            padding = 2
            draw.rectangle(
                [
                    text_bbox[0] - padding,
                    text_bbox[1] - padding,
                    text_bbox[2] + padding,
                    text_bbox[3] + padding,
                ],
                fill="#FFFFFF",
                outline=box_color,
            )
            draw.text((label_x, label_y), label_text, fill=text_color, font=font)

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image

load_json classmethod

load_json(file_path: Union[str, Path]) -> OCROutput

Load an OCROutput instance from a JSON file.

PARAMETER DESCRIPTION
file_path

Path to JSON file

TYPE: Union[str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput instance

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "OCROutput":
    """
    Load an OCROutput instance from a JSON file.

    Args:
        file_path: Path to JSON file

    Returns:
        OCROutput instance
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

save_json

save_json(file_path: Union[str, Path]) -> None

Save OCROutput instance to a JSON file.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved

TYPE: Union[str, Path]
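
Examples:

save_json and load_json round-trip through Pydantic's JSON serialization (the path is illustrative; parent directories are created automatically):

result.save_json("out/page_001.json")
restored = OCROutput.load_json("out/page_001.json")
assert restored.full_text == result.full_text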

Source code in omnidocs/tasks/ocr_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save OCROutput instance to a JSON file.

    Args:
        file_path: Path where JSON file should be saved
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")

TextBlock

Bases: BaseModel

Single detected text element with text, bounding box, and confidence.

This is the fundamental unit of OCR output. It can represent a character, word, line, or block, depending on the OCR model and configuration.

Example
block = TextBlock(
    text="Hello",
    bbox=BoundingBox(x1=100, y1=50, x2=200, y2=80),
    confidence=0.95,
    granularity=OCRGranularity.WORD,
)

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text": self.text,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "granularity": self.granularity.value,
        "polygon": self.polygon,
        "language": self.language,
    }

get_normalized_bbox

get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

PaddleOCR

PaddleOCR(config: PaddleOCRConfig)

Bases: BaseOCRExtractor

PaddleOCR text extractor.

Single-backend model (PaddlePaddle - CPU/GPU).

Example
from omnidocs.tasks.ocr_extraction import PaddleOCR, PaddleOCRConfig

ocr = PaddleOCR(config=PaddleOCRConfig(lang="en", device="cpu"))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize PaddleOCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: PaddleOCRConfig

RAISES DESCRIPTION
ImportError

If paddleocr or paddlepaddle is not installed

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def __init__(self, config: PaddleOCRConfig):
    """
    Initialize PaddleOCR extractor.

    Args:
        config: Configuration object

    Raises:
        ImportError: If paddleocr or paddlepaddle is not installed
    """
    self.config = config
    self._ocr = None

    # Normalize language code
    self._lang = LANG_CODES.get(config.lang.lower(), config.lang)

    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks
    """
    if self._ocr is None:
        raise RuntimeError("PaddleOCR not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Convert to numpy array
    image_array = np.array(pil_image)

    # Run PaddleOCR v3.x - use predict() method
    results = self._ocr.predict(image_array)

    # Parse results
    text_blocks = []

    # PaddleOCR may return None or empty results
    if results is None or len(results) == 0:
        return OCROutput(
            text_blocks=[],
            full_text="",
            image_width=image_width,
            image_height=image_height,
            model_name=self.MODEL_NAME,
            languages_detected=[self._lang],
        )

    # PaddleOCR v3.x returns list of dicts with 'rec_texts', 'rec_scores', 'dt_polys'
    for result in results:
        if result is None:
            continue

        rec_texts = result.get("rec_texts", [])
        rec_scores = result.get("rec_scores", [])
        dt_polys = result.get("dt_polys", [])

        for i, text in enumerate(rec_texts):
            if not text.strip():
                continue

            confidence = rec_scores[i] if i < len(rec_scores) else 1.0

            # Get polygon and convert to list
            polygon: Optional[List[List[float]]] = None
            if i < len(dt_polys) and dt_polys[i] is not None:
                poly_array = dt_polys[i]
                # Handle numpy array
                if hasattr(poly_array, "tolist"):
                    polygon = poly_array.tolist()
                else:
                    polygon = list(poly_array)

            # Convert polygon to bbox
            if polygon:
                bbox = BoundingBox.from_polygon(polygon)
            else:
                bbox = BoundingBox(x1=0, y1=0, x2=0, y2=0)

            text_blocks.append(
                TextBlock(
                    text=text,
                    bbox=bbox,
                    confidence=float(confidence),
                    granularity=OCRGranularity.LINE,
                    polygon=polygon,
                    language=self._lang,
                )
            )

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    # Build full_text from sorted blocks to ensure reading order
    full_text = " ".join(block.text for block in text_blocks)

    return OCROutput(
        text_blocks=text_blocks,
        full_text=full_text,
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=[self._lang],
    )

PaddleOCRConfig

Bases: BaseModel

Configuration for PaddleOCR extractor.

This is a single-backend model (PaddlePaddle - CPU/GPU).

Example
config = PaddleOCRConfig(lang="ch", device="gpu")
ocr = PaddleOCR(config=config)

TesseractOCR

TesseractOCR(config: TesseractOCRConfig)

Bases: BaseOCRExtractor

Tesseract OCR extractor.

Single-backend model (CPU only). Requires system Tesseract installation.

Example
from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

ocr = TesseractOCR(config=TesseractOCRConfig(languages=["eng"]))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize Tesseract OCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: TesseractOCRConfig

RAISES DESCRIPTION
RuntimeError

If Tesseract is not installed

ImportError

If pytesseract is not installed

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def __init__(self, config: TesseractOCRConfig):
    """
    Initialize Tesseract OCR extractor.

    Args:
        config: Configuration object

    Raises:
        RuntimeError: If Tesseract is not installed
        ImportError: If pytesseract is not installed
    """
    self.config = config
    self._pytesseract = None
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks at word level

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks at word level
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data (word-level boxes)
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Parse results into TextBlocks
    text_blocks = []
    full_text_parts = []

    n_boxes = len(data["text"])
    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        # Skip empty text or low confidence (-1 means no confidence)
        if not text or conf == -1:
            continue

        # Tesseract returns confidence as 0-100, normalize to 0-1
        confidence = conf / 100.0

        # Get bounding box
        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        bbox = BoundingBox(
            x1=float(x),
            y1=float(y),
            x2=float(x + w),
            y2=float(y + h),
        )

        text_blocks.append(
            TextBlock(
                text=text,
                bbox=bbox,
                confidence=confidence,
                granularity=OCRGranularity.WORD,
                language=lang_str,
            )
        )

        full_text_parts.append(text)

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return OCROutput(
        text_blocks=text_blocks,
        full_text=" ".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )

extract_lines

extract_lines(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR and return line-level blocks.

Groups words into lines based on Tesseract's line detection.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with line-level text blocks
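
Examples:

Line-level blocks are often easier to consume than word-level output; in this mode full_text joins lines with newlines:

result = ocr.extract_lines(image)
for line in result.text_blocks:
    print(f"{line.confidence:.2f}  {line.text}")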

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract_lines(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR and return line-level blocks.

    Groups words into lines based on Tesseract's line detection.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with line-level text blocks
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string (including config_params like extract method)
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Group words into lines
    lines: Dict[tuple, Dict] = {}
    n_boxes = len(data["text"])

    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        if not text or conf == -1:
            continue

        # Tesseract provides block_num, par_num, line_num
        line_key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])

        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        if line_key not in lines:
            lines[line_key] = {
                "words": [],
                "confidences": [],
                "x1": x,
                "y1": y,
                "x2": x + w,
                "y2": y + h,
            }

        lines[line_key]["words"].append(text)
        lines[line_key]["confidences"].append(conf / 100.0)
        lines[line_key]["x1"] = min(lines[line_key]["x1"], x)
        lines[line_key]["y1"] = min(lines[line_key]["y1"], y)
        lines[line_key]["x2"] = max(lines[line_key]["x2"], x + w)
        lines[line_key]["y2"] = max(lines[line_key]["y2"], y + h)

    # Convert to TextBlocks
    text_blocks = []
    full_text_parts = []

    for line_key in sorted(lines.keys()):
        line = lines[line_key]
        line_text = " ".join(line["words"])
        avg_conf = sum(line["confidences"]) / len(line["confidences"])

        bbox = BoundingBox(
            x1=float(line["x1"]),
            y1=float(line["y1"]),
            x2=float(line["x2"]),
            y2=float(line["y2"]),
        )

        text_blocks.append(
            TextBlock(
                text=line_text,
                bbox=bbox,
                confidence=avg_conf,
                granularity=OCRGranularity.LINE,
                language=lang_str,
            )
        )

        full_text_parts.append(line_text)

    return OCROutput(
        text_blocks=text_blocks,
        full_text="\n".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )

TesseractOCRConfig

Bases: BaseModel

Configuration for Tesseract OCR extractor.

This is a single-backend model (CPU only, requires system Tesseract).

Example
config = TesseractOCRConfig(languages=["eng", "fra"], psm=3)
ocr = TesseractOCR(config=config)
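
Arbitrary Tesseract variables can be forwarded via config_params; each entry is rendered as a -c key=value flag (see the source of extract above). A sketch:

config = TesseractOCRConfig(
    languages=["eng"],
    psm=6,  # assume a single uniform block of text
    config_params={"preserve_interword_spaces": "1"},
)
ocr = TesseractOCR(config=config)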

base

Base class for OCR extractors.

Defines the abstract interface that all OCR extractors must implement.

easyocr

EasyOCR extractor.

EasyOCR is a PyTorch-based OCR engine with excellent multi-language support:
  • GPU accelerated (optional)
  • Supports 80+ languages
  • Good for scene text and printed documents

Python Package

pip install easyocr

Model Download Location

By default, EasyOCR downloads models to ~/.EasyOCR/. This can be overridden with the model_storage_directory parameter.
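
Whether the config object exposes this override is not shown here; with the underlying easyocr package it looks like the following (the path is illustrative):

import easyocr

reader = easyocr.Reader(["en"], model_storage_directory="/data/easyocr_models")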

models

Pydantic models for OCR extraction outputs.

Defines standardized output types for OCR detection including text blocks with bounding boxes, confidence scores, and granularity levels.

Key difference from Text Extraction:
  • OCR returns text WITH bounding boxes (word/line/character level)
  • Text Extraction returns formatted text (MD/HTML) WITHOUT bboxes

Coordinate Systems
  • Absolute (default): Coordinates in pixels relative to original image size
  • Normalized (0-1024): Coordinates scaled to 0-1024 range (virtual 1024x1024 canvas)

Use bbox.to_normalized(width, height) or output.get_normalized_blocks() to convert to normalized coordinates.

Example
result = ocr.extract(image)  # Returns absolute pixel coordinates
normalized = result.get_normalized_blocks()  # Returns 0-1024 normalized coords

OCRGranularity

Bases: str, Enum

OCR detection granularity levels.

Different OCR engines return results at different granularity levels. This enum standardizes the options across all extractors.

BoundingBox

Bases: BaseModel

Bounding box coordinates in pixel space.

Coordinates follow the convention: (x1, y1) is top-left, (x2, y2) is bottom-right. For rotated text, use the polygon field in TextBlock instead.

Example
bbox = BoundingBox(x1=100, y1=50, x2=300, y2=80)
print(bbox.width, bbox.height)  # 200, 30
print(bbox.center)  # (200.0, 65.0)

width property

width: float

Width of the bounding box.

height property

height: float

Height of the bounding box.

area property

area: float

Area of the bounding box.

center property

center: Tuple[float, float]

Center point of the bounding box.

to_list

to_list() -> List[float]

Convert to [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_list(self) -> List[float]:
    """Convert to [x1, y1, x2, y2] list."""
    return [self.x1, self.y1, self.x2, self.y2]

to_xyxy

to_xyxy() -> Tuple[float, float, float, float]

Convert to (x1, y1, x2, y2) tuple.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xyxy(self) -> Tuple[float, float, float, float]:
    """Convert to (x1, y1, x2, y2) tuple."""
    return (self.x1, self.y1, self.x2, self.y2)

to_xywh

to_xywh() -> Tuple[float, float, float, float]

Convert to (x, y, width, height) format.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_xywh(self) -> Tuple[float, float, float, float]:
    """Convert to (x, y, width, height) format."""
    return (self.x1, self.y1, self.width, self.height)

from_list classmethod

from_list(coords: List[float]) -> BoundingBox

Create from [x1, y1, x2, y2] list.

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_list(cls, coords: List[float]) -> "BoundingBox":
    """Create from [x1, y1, x2, y2] list."""
    if len(coords) != 4:
        raise ValueError(f"Expected 4 coordinates, got {len(coords)}")
    return cls(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3])

from_polygon classmethod

from_polygon(polygon: List[List[float]]) -> BoundingBox

Create axis-aligned bounding box from polygon points.

PARAMETER DESCRIPTION
polygon

List of [x, y] points (usually 4 for quadrilateral)

TYPE: List[List[float]]

RETURNS DESCRIPTION
BoundingBox

BoundingBox that encloses all polygon points

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def from_polygon(cls, polygon: List[List[float]]) -> "BoundingBox":
    """
    Create axis-aligned bounding box from polygon points.

    Args:
        polygon: List of [x, y] points (usually 4 for quadrilateral)

    Returns:
        BoundingBox that encloses all polygon points
    """
    if not polygon:
        raise ValueError("Polygon cannot be empty")

    xs = [p[0] for p in polygon]
    ys = [p[1] for p in polygon]
    return cls(x1=min(xs), y1=min(ys), x2=max(xs), y2=max(ys))
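
For instance, a slightly rotated quadrilateral from a text detector collapses to its axis-aligned envelope:

quad = [[100, 52], [298, 48], [302, 78], [104, 82]]  # rotated text region
bbox = BoundingBox.from_polygon(quad)
print(bbox.to_list())  # [100, 48, 302, 82]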

to_normalized

to_normalized(
    image_width: int, image_height: int
) -> BoundingBox

Convert to normalized coordinates (0-1024 range).

Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas. This provides consistent coordinates regardless of original image size.

PARAMETER DESCRIPTION
image_width

Original image width in pixels

TYPE: int

image_height

Original image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with coordinates in 0-1024 range

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_normalized(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert to normalized coordinates (0-1024 range).

    Scales coordinates from absolute pixel values to a virtual 1024x1024 canvas.
    This provides consistent coordinates regardless of original image size.

    Args:
        image_width: Original image width in pixels
        image_height: Original image height in pixels

    Returns:
        New BoundingBox with coordinates in 0-1024 range
    """
    return BoundingBox(
        x1=self.x1 / image_width * NORMALIZED_SIZE,
        y1=self.y1 / image_height * NORMALIZED_SIZE,
        x2=self.x2 / image_width * NORMALIZED_SIZE,
        y2=self.y2 / image_height * NORMALIZED_SIZE,
    )

to_absolute

to_absolute(
    image_width: int, image_height: int
) -> BoundingBox

Convert from normalized (0-1024) to absolute pixel coordinates.

PARAMETER DESCRIPTION
image_width

Target image width in pixels

TYPE: int

image_height

Target image height in pixels

TYPE: int

RETURNS DESCRIPTION
BoundingBox

New BoundingBox with absolute pixel coordinates

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_absolute(self, image_width: int, image_height: int) -> "BoundingBox":
    """
    Convert from normalized (0-1024) to absolute pixel coordinates.

    Args:
        image_width: Target image width in pixels
        image_height: Target image height in pixels

    Returns:
        New BoundingBox with absolute pixel coordinates
    """
    return BoundingBox(
        x1=self.x1 / NORMALIZED_SIZE * image_width,
        y1=self.y1 / NORMALIZED_SIZE * image_height,
        x2=self.x2 / NORMALIZED_SIZE * image_width,
        y2=self.y2 / NORMALIZED_SIZE * image_height,
    )
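
The two conversions are exact inverses (up to floating-point rounding); a quick round-trip sketch, assuming an 800x600 source image:

bbox = BoundingBox(x1=100, y1=50, x2=300, y2=80)  # pixels on an 800x600 image
norm = bbox.to_normalized(800, 600)               # ~[128.0, 85.33, 384.0, 136.53]
back = norm.to_absolute(800, 600)                 # recovers [100.0, 50.0, 300.0, 80.0]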

TextBlock

Bases: BaseModel

Single detected text element with text, bounding box, and confidence.

This is the fundamental unit of OCR output: it can represent a character, word, line, or block, depending on the OCR model and configuration.

Example
block = TextBlock(
        text="Hello",
        bbox=BoundingBox(x1=100, y1=50, x2=200, y2=80),
        confidence=0.95,
        granularity=OCRGranularity.WORD,
    )

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text": self.text,
        "bbox": self.bbox.to_list(),
        "confidence": self.confidence,
        "granularity": self.granularity.value,
        "polygon": self.polygon,
        "language": self.language,
    }

get_normalized_bbox

get_normalized_bbox(
    image_width: int, image_height: int
) -> BoundingBox

Get bounding box in normalized (0-1024) coordinates.

PARAMETER DESCRIPTION
image_width

Original image width

TYPE: int

image_height

Original image height

TYPE: int

RETURNS DESCRIPTION
BoundingBox

BoundingBox with normalized coordinates

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_bbox(self, image_width: int, image_height: int) -> BoundingBox:
    """
    Get bounding box in normalized (0-1024) coordinates.

    Args:
        image_width: Original image width
        image_height: Original image height

    Returns:
        BoundingBox with normalized coordinates
    """
    return self.bbox.to_normalized(image_width, image_height)

OCROutput

Bases: BaseModel

Complete OCR extraction results for a single image.

Contains all detected text blocks with their bounding boxes, plus metadata about the extraction.

Example
result = ocr.extract(image)
print(f"Found {result.block_count} blocks")
print(f"Full text: {result.full_text}")
for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

block_count property

block_count: int

Number of detected text blocks.

word_count property

word_count: int

Approximate word count from full text.

average_confidence property

average_confidence: float

Average confidence across all text blocks.

filter_by_confidence

filter_by_confidence(
    min_confidence: float,
) -> List[TextBlock]

Filter text blocks by minimum confidence.

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_confidence(self, min_confidence: float) -> List[TextBlock]:
    """Filter text blocks by minimum confidence."""
    return [b for b in self.text_blocks if b.confidence >= min_confidence]
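
A typical use is dropping low-confidence detections before downstream processing; the 0.8 threshold below is illustrative:

result = ocr.extract(image)
reliable = result.filter_by_confidence(0.8)  # keeps blocks with confidence >= 0.8
print(f"Kept {len(reliable)} of {result.block_count} blocks")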

filter_by_granularity

filter_by_granularity(
    granularity: OCRGranularity,
) -> List[TextBlock]

Filter text blocks by granularity level.

Source code in omnidocs/tasks/ocr_extraction/models.py
def filter_by_granularity(self, granularity: OCRGranularity) -> List[TextBlock]:
    """Filter text blocks by granularity level."""
    return [b for b in self.text_blocks if b.granularity == granularity]

to_dict

to_dict() -> Dict

Convert to dictionary representation.

Source code in omnidocs/tasks/ocr_extraction/models.py
def to_dict(self) -> Dict:
    """Convert to dictionary representation."""
    return {
        "text_blocks": [b.to_dict() for b in self.text_blocks],
        "full_text": self.full_text,
        "image_width": self.image_width,
        "image_height": self.image_height,
        "model_name": self.model_name,
        "languages_detected": self.languages_detected,
        "block_count": self.block_count,
        "word_count": self.word_count,
        "average_confidence": self.average_confidence,
    }

sort_by_position

sort_by_position(top_to_bottom: bool = True) -> OCROutput

Return a new OCROutput with blocks sorted by position.

PARAMETER DESCRIPTION
top_to_bottom

If True, sort by y-coordinate (reading order)

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
OCROutput

New OCROutput with sorted text blocks

Source code in omnidocs/tasks/ocr_extraction/models.py
def sort_by_position(self, top_to_bottom: bool = True) -> "OCROutput":
    """
    Return a new OCROutput with blocks sorted by position.

    Args:
        top_to_bottom: If True, sort by y-coordinate (reading order)

    Returns:
        New OCROutput with sorted text blocks
    """
    sorted_blocks = sorted(
        self.text_blocks,
        key=lambda b: (b.bbox.y1, b.bbox.x1),
        reverse=not top_to_bottom,
    )
    # Regenerate full_text in sorted order
    full_text = " ".join(b.text for b in sorted_blocks)

    return OCROutput(
        text_blocks=sorted_blocks,
        full_text=full_text,
        image_width=self.image_width,
        image_height=self.image_height,
        model_name=self.model_name,
        languages_detected=self.languages_detected,
    )
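
The key sorts by y1 with ties falling back to x1, approximating reading order; a brief sketch:

ordered = result.sort_by_position()  # top-to-bottom, left-to-right
print(ordered.full_text)             # full_text regenerated from the sorted blocks
reversed_order = result.sort_by_position(top_to_bottom=False)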

get_normalized_blocks

get_normalized_blocks() -> List[Dict]

Get all text blocks with normalized (0-1024) coordinates.

RETURNS DESCRIPTION
List[Dict]

List of dicts with normalized bbox coordinates and metadata.

Source code in omnidocs/tasks/ocr_extraction/models.py
def get_normalized_blocks(self) -> List[Dict]:
    """
    Get all text blocks with normalized (0-1024) coordinates.

    Returns:
        List of dicts with normalized bbox coordinates and metadata.
    """
    normalized = []
    for block in self.text_blocks:
        norm_bbox = block.bbox.to_normalized(self.image_width, self.image_height)
        normalized.append(
            {
                "text": block.text,
                "bbox": norm_bbox.to_list(),
                "confidence": block.confidence,
                "granularity": block.granularity.value,
                "language": block.language,
            }
        )
    return normalized

visualize

visualize(
    image: Image,
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> Image.Image

Visualize OCR results on the image.

Draws bounding boxes around detected text with optional labels.

PARAMETER DESCRIPTION
image

PIL Image to draw on (will be copied, not modified)

TYPE: Image

output_path

Optional path to save the visualization

TYPE: Optional[Union[str, Path]] DEFAULT: None

show_text

Whether to show detected text

TYPE: bool DEFAULT: True

show_confidence

Whether to show confidence scores

TYPE: bool DEFAULT: False

line_width

Width of bounding box lines

TYPE: int DEFAULT: 2

box_color

Color for bounding boxes (hex)

TYPE: str DEFAULT: '#2ECC71'

text_color

Color for text labels (hex)

TYPE: str DEFAULT: '#000000'

RETURNS DESCRIPTION
Image

PIL Image with visualizations drawn

Example
result = ocr.extract(image)
viz = result.visualize(image, output_path="ocr_viz.png")
Source code in omnidocs/tasks/ocr_extraction/models.py
def visualize(
    self,
    image: "Image.Image",
    output_path: Optional[Union[str, Path]] = None,
    show_text: bool = True,
    show_confidence: bool = False,
    line_width: int = 2,
    box_color: str = "#2ECC71",
    text_color: str = "#000000",
) -> "Image.Image":
    """
    Visualize OCR results on the image.

    Draws bounding boxes around detected text with optional labels.

    Args:
        image: PIL Image to draw on (will be copied, not modified)
        output_path: Optional path to save the visualization
        show_text: Whether to show detected text
        show_confidence: Whether to show confidence scores
        line_width: Width of bounding box lines
        box_color: Color for bounding boxes (hex)
        text_color: Color for text labels (hex)

    Returns:
        PIL Image with visualizations drawn

    Example:
        ```python
        result = ocr.extract(image)
        viz = result.visualize(image, output_path="ocr_viz.png")
        ```
    """
    from PIL import ImageDraw, ImageFont

    # Copy image to avoid modifying original
    viz_image = image.copy().convert("RGB")
    draw = ImageDraw.Draw(viz_image)

    # Try to get a font
    try:
        font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 12)
    except Exception:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12)
        except Exception:
            font = ImageFont.load_default()

    for block in self.text_blocks:
        coords = block.bbox.to_xyxy()

        # Draw polygon if available, otherwise draw rectangle
        if block.polygon:
            flat_polygon = [coord for point in block.polygon for coord in point]
            draw.polygon(flat_polygon, outline=box_color, width=line_width)
        else:
            draw.rectangle(coords, outline=box_color, width=line_width)

        # Build label text
        if show_text or show_confidence:
            label_parts = []
            if show_text:
                # Truncate long text
                text = block.text[:25] + "..." if len(block.text) > 25 else block.text
                label_parts.append(text)
            if show_confidence:
                label_parts.append(f"{block.confidence:.2f}")
            label_text = " | ".join(label_parts)

            # Position label below the box
            label_x = coords[0]
            label_y = coords[3] + 2  # Below bottom edge

            # Draw label with background
            text_bbox = draw.textbbox((label_x, label_y), label_text, font=font)
            padding = 2
            draw.rectangle(
                [
                    text_bbox[0] - padding,
                    text_bbox[1] - padding,
                    text_bbox[2] + padding,
                    text_bbox[3] + padding,
                ],
                fill="#FFFFFF",
                outline=box_color,
            )
            draw.text((label_x, label_y), label_text, fill=text_color, font=font)

    # Save if path provided
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        viz_image.save(output_path)

    return viz_image
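
A variant with confidence labels turned on (colors are hex strings, as in the defaults above):

viz = result.visualize(
    image,
    show_confidence=True,   # appends scores like "0.95" to each label
    box_color="#E74C3C",
    output_path="ocr_viz_conf.png",
)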

load_json classmethod

load_json(file_path: Union[str, Path]) -> OCROutput

Load an OCROutput instance from a JSON file.

PARAMETER DESCRIPTION
file_path

Path to JSON file

TYPE: Union[str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput instance

Source code in omnidocs/tasks/ocr_extraction/models.py
@classmethod
def load_json(cls, file_path: Union[str, Path]) -> "OCROutput":
    """
    Load an OCROutput instance from a JSON file.

    Args:
        file_path: Path to JSON file

    Returns:
        OCROutput instance
    """
    path = Path(file_path)
    return cls.model_validate_json(path.read_text(encoding="utf-8"))

save_json

save_json(file_path: Union[str, Path]) -> None

Save OCROutput instance to a JSON file.

PARAMETER DESCRIPTION
file_path

Path where JSON file should be saved

TYPE: Union[str, Path]

Source code in omnidocs/tasks/ocr_extraction/models.py
def save_json(self, file_path: Union[str, Path]) -> None:
    """
    Save OCROutput instance to a JSON file.

    Args:
        file_path: Path where JSON file should be saved
    """
    path = Path(file_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(self.model_dump_json(indent=2), encoding="utf-8")
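
Together with load_json this gives a lossless round trip through Pydantic's JSON serialization; a minimal sketch (the file name is arbitrary, and OCROutput is assumed importable from the same package):

from omnidocs.tasks.ocr_extraction import OCROutput

result = ocr.extract(image)
result.save_json("ocr_result.json")  # parent directories are created if needed

restored = OCROutput.load_json("ocr_result.json")
assert restored.block_count == result.block_count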

paddleocr

PaddleOCR extractor.

PaddleOCR is an OCR toolkit developed by Baidu/PaddlePaddle.
  • Excellent for CJK languages (Chinese, Japanese, Korean)
  • GPU accelerated
  • Supports layout analysis + OCR

Python Package

pip install paddleocr paddlepaddle      # CPU version
pip install paddleocr paddlepaddle-gpu  # GPU version

Model Download Location

By default, PaddleOCR downloads models to ~/.paddleocr/

PaddleOCRConfig

Bases: BaseModel

Configuration for PaddleOCR extractor.

This is a single-backend model (PaddlePaddle - CPU/GPU).

Example
config = PaddleOCRConfig(lang="ch", device="gpu")
ocr = PaddleOCR(config=config)

PaddleOCR

PaddleOCR(config: PaddleOCRConfig)

Bases: BaseOCRExtractor

PaddleOCR text extractor.

Single-backend model (PaddlePaddle - CPU/GPU).

Example
from omnidocs.tasks.ocr_extraction import PaddleOCR, PaddleOCRConfig

ocr = PaddleOCR(config=PaddleOCRConfig(lang="en", device="cpu"))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize PaddleOCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: PaddleOCRConfig

RAISES DESCRIPTION
ImportError

If paddleocr or paddlepaddle is not installed

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def __init__(self, config: PaddleOCRConfig):
    """
    Initialize PaddleOCR extractor.

    Args:
        config: Configuration object

    Raises:
        ImportError: If paddleocr or paddlepaddle is not installed
    """
    self.config = config
    self._ocr = None

    # Normalize language code
    self._lang = LANG_CODES.get(config.lang.lower(), config.lang)

    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks

Source code in omnidocs/tasks/ocr_extraction/paddleocr.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks
    """
    if self._ocr is None:
        raise RuntimeError("PaddleOCR not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Convert to numpy array
    image_array = np.array(pil_image)

    # Run PaddleOCR v3.x - use predict() method
    results = self._ocr.predict(image_array)

    # Parse results
    text_blocks = []

    # PaddleOCR may return None or empty results
    if results is None or len(results) == 0:
        return OCROutput(
            text_blocks=[],
            full_text="",
            image_width=image_width,
            image_height=image_height,
            model_name=self.MODEL_NAME,
            languages_detected=[self._lang],
        )

    # PaddleOCR v3.x returns list of dicts with 'rec_texts', 'rec_scores', 'dt_polys'
    for result in results:
        if result is None:
            continue

        rec_texts = result.get("rec_texts", [])
        rec_scores = result.get("rec_scores", [])
        dt_polys = result.get("dt_polys", [])

        for i, text in enumerate(rec_texts):
            if not text.strip():
                continue

            confidence = rec_scores[i] if i < len(rec_scores) else 1.0

            # Get polygon and convert to list
            polygon: Optional[List[List[float]]] = None
            if i < len(dt_polys) and dt_polys[i] is not None:
                poly_array = dt_polys[i]
                # Handle numpy array
                if hasattr(poly_array, "tolist"):
                    polygon = poly_array.tolist()
                else:
                    polygon = list(poly_array)

            # Convert polygon to bbox
            if polygon:
                bbox = BoundingBox.from_polygon(polygon)
            else:
                bbox = BoundingBox(x1=0, y1=0, x2=0, y2=0)

            text_blocks.append(
                TextBlock(
                    text=text,
                    bbox=bbox,
                    confidence=float(confidence),
                    granularity=OCRGranularity.LINE,
                    polygon=polygon,
                    language=self._lang,
                )
            )

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    # Build full_text from sorted blocks to ensure reading order
    full_text = " ".join(block.text for block in text_blocks)

    return OCROutput(
        text_blocks=text_blocks,
        full_text=full_text,
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=[self._lang],
    )
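
Because PaddleOCR reports quadrilateral detections, each block carries both the raw polygon (from dt_polys) and its axis-aligned envelope; a short sketch:

result = ocr.extract(image)
for block in result.text_blocks:
    if block.polygon:                 # raw quadrilateral points
        print(block.text, block.polygon)
    print(block.bbox.to_list())       # axis-aligned bbox derived from the polygon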

tesseract

Tesseract OCR extractor.

Tesseract is an open-source OCR engine, originally developed at HP and later sponsored by Google.
  • CPU-based (no GPU required)
  • Requires a system installation of Tesseract
  • Good for printed text; supports 100+ languages

System Requirements

macOS:   brew install tesseract
Ubuntu:  sudo apt-get install tesseract-ocr
Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki

Python Package

pip install pytesseract

TesseractOCRConfig

Bases: BaseModel

Configuration for Tesseract OCR extractor.

This is a single-backend model (CPU only, requires system Tesseract).

Example
config = TesseractOCRConfig(languages=["eng", "fra"], psm=3)
ocr = TesseractOCR(config=config)
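
Since config_params is forwarded to Tesseract as -c variables (see the extract() source below), standard engine variables such as tessedit_char_whitelist can be set the same way; a sketch for digits-only recognition:

config = TesseractOCRConfig(
    languages=["eng"],
    # Forwarded as "-c tessedit_char_whitelist=0123456789"
    config_params={"tessedit_char_whitelist": "0123456789"},
)
ocr = TesseractOCR(config=config)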

TesseractOCR

TesseractOCR(config: TesseractOCRConfig)

Bases: BaseOCRExtractor

Tesseract OCR extractor.

Single-backend model (CPU only). Requires system Tesseract installation.

Example
from omnidocs.tasks.ocr_extraction import TesseractOCR, TesseractOCRConfig

ocr = TesseractOCR(config=TesseractOCRConfig(languages=["eng"]))
result = ocr.extract(image)

for block in result.text_blocks:
        print(f"'{block.text}' @ {block.bbox.to_list()}")

Initialize Tesseract OCR extractor.

PARAMETER DESCRIPTION
config

Configuration object

TYPE: TesseractOCRConfig

RAISES DESCRIPTION
RuntimeError

If Tesseract is not installed

ImportError

If pytesseract is not installed

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def __init__(self, config: TesseractOCRConfig):
    """
    Initialize Tesseract OCR extractor.

    Args:
        config: Configuration object

    Raises:
        RuntimeError: If Tesseract is not installed
        ImportError: If pytesseract is not installed
    """
    self.config = config
    self._pytesseract = None
    self._load_model()

extract

extract(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR on an image.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with detected text blocks at word level

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with detected text blocks at word level
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data (word-level boxes)
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Parse results into TextBlocks
    text_blocks = []
    full_text_parts = []

    n_boxes = len(data["text"])
    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        # Skip empty text or low confidence (-1 means no confidence)
        if not text or conf == -1:
            continue

        # Tesseract returns confidence as 0-100, normalize to 0-1
        confidence = conf / 100.0

        # Get bounding box
        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        bbox = BoundingBox(
            x1=float(x),
            y1=float(y),
            x2=float(x + w),
            y2=float(y + h),
        )

        text_blocks.append(
            TextBlock(
                text=text,
                bbox=bbox,
                confidence=confidence,
                granularity=OCRGranularity.WORD,
                language=lang_str,
            )
        )

        full_text_parts.append(text)

    # Sort by position (top to bottom, left to right)
    text_blocks.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return OCROutput(
        text_blocks=text_blocks,
        full_text=" ".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )

extract_lines

extract_lines(
    image: Union[Image, ndarray, str, Path],
) -> OCROutput

Run OCR and return line-level blocks.

Groups words into lines based on Tesseract's line detection.

PARAMETER DESCRIPTION
image

Input image (PIL Image, numpy array, or path)

TYPE: Union[Image, ndarray, str, Path]

RETURNS DESCRIPTION
OCROutput

OCROutput with line-level text blocks

Source code in omnidocs/tasks/ocr_extraction/tesseract.py
def extract_lines(self, image: Union[Image.Image, np.ndarray, str, Path]) -> OCROutput:
    """
    Run OCR and return line-level blocks.

    Groups words into lines based on Tesseract's line detection.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        OCROutput with line-level text blocks
    """
    if self._pytesseract is None:
        raise RuntimeError("Tesseract not initialized. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    image_width, image_height = pil_image.size

    # Build config string (including config_params like extract method)
    config = f"--oem {self.config.oem} --psm {self.config.psm}"
    if self.config.config_params:
        for key, value in self.config.config_params.items():
            config += f" -c {key}={value}"

    # Language string
    lang_str = "+".join(self.config.languages)

    # Get detailed data
    data = self._pytesseract.image_to_data(
        pil_image,
        lang=lang_str,
        config=config,
        output_type=self._pytesseract.Output.DICT,
    )

    # Group words into lines
    lines: Dict[tuple, Dict] = {}
    n_boxes = len(data["text"])

    for i in range(n_boxes):
        text = data["text"][i].strip()
        # Safely convert conf to float (handles string values from some Tesseract versions)
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            conf = -1

        if not text or conf == -1:
            continue

        # Tesseract provides block_num, par_num, line_num
        line_key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])

        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]

        if line_key not in lines:
            lines[line_key] = {
                "words": [],
                "confidences": [],
                "x1": x,
                "y1": y,
                "x2": x + w,
                "y2": y + h,
            }

        lines[line_key]["words"].append(text)
        lines[line_key]["confidences"].append(conf / 100.0)
        lines[line_key]["x1"] = min(lines[line_key]["x1"], x)
        lines[line_key]["y1"] = min(lines[line_key]["y1"], y)
        lines[line_key]["x2"] = max(lines[line_key]["x2"], x + w)
        lines[line_key]["y2"] = max(lines[line_key]["y2"], y + h)

    # Convert to TextBlocks
    text_blocks = []
    full_text_parts = []

    for line_key in sorted(lines.keys()):
        line = lines[line_key]
        line_text = " ".join(line["words"])
        avg_conf = sum(line["confidences"]) / len(line["confidences"])

        bbox = BoundingBox(
            x1=float(line["x1"]),
            y1=float(line["y1"]),
            x2=float(line["x2"]),
            y2=float(line["y2"]),
        )

        text_blocks.append(
            TextBlock(
                text=line_text,
                bbox=bbox,
                confidence=avg_conf,
                granularity=OCRGranularity.LINE,
                language=lang_str,
            )
        )

        full_text_parts.append(line_text)

    return OCROutput(
        text_blocks=text_blocks,
        full_text="\n".join(full_text_parts),
        image_width=image_width,
        image_height=image_height,
        model_name=self.MODEL_NAME,
        languages_detected=self.config.languages,
    )
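
Side by side, the two entry points differ in granularity and in how full_text is joined (spaces between words vs. newlines between lines); a brief sketch:

ocr = TesseractOCR(config=TesseractOCRConfig(languages=["eng"]))

words = ocr.extract(image)        # WORD-level blocks, full_text joined with spaces
lines = ocr.extract_lines(image)  # LINE-level blocks, full_text joined with newlines

print(words.block_count, "words grouped into", lines.block_count, "lines")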