Bases: BaseLayoutExtractor
DocLayout-YOLO layout extractor.
A YOLO-based model optimized for document layout detection.
Detects: title, text, figure, table, formula, captions, etc.
This is a single-backend model (PyTorch only).
Example
from omnidocs.tasks.layout_extraction import DocLayoutYOLO, DocLayoutYOLOConfig
extractor = DocLayoutYOLO(config=DocLayoutYOLOConfig(device="cuda"))
result = extractor.extract(image)
for box in result.bboxes:
    print(f"{box.label.value}: {box.confidence:.2f}")
Initialize DocLayout-YOLO extractor.
| PARAMETER |
DESCRIPTION |
config
|
Configuration object with device, model_path, etc.
TYPE:
DocLayoutYOLOConfig
|
Source code in omnidocs/tasks/layout_extraction/doc_layout_yolo.py
def __init__(self, config: DocLayoutYOLOConfig):
    """
    Set up the DocLayout-YOLO extractor and load its model right away.

    Args:
        config: Configuration object. Its ``device`` and ``model_path``
            fields are handed to the resolver helpers; the object itself
            is kept on ``self.config`` for later use.
    """
    self.config = config

    # No model is attached until _load_model() has run.
    self._model = None

    # Resolve the runtime settings from the config before loading.
    self._device = self._resolve_device(config.device)
    self._model_path = self._resolve_model_path(config.model_path)

    # Load eagerly so the instance is usable as soon as construction returns.
    self._load_model()
|
extract(
image: Union[Image, ndarray, str, Path],
) -> LayoutOutput
Run layout extraction on an image.
| PARAMETER |
DESCRIPTION |
image
|
Input image (PIL Image, numpy array, or path)
TYPE:
Union[Image, ndarray, str, Path]
|
| RETURNS |
DESCRIPTION |
LayoutOutput
|
LayoutOutput with detected layout boxes
|
Source code in omnidocs/tasks/layout_extraction/doc_layout_yolo.py
def extract(self, image: Union[Image.Image, np.ndarray, str, Path]) -> LayoutOutput:
    """
    Run layout extraction on an image.

    Args:
        image: Input image (PIL Image, numpy array, or path)

    Returns:
        LayoutOutput with the detected layout boxes sorted by ``(y1, x1)``,
        plus the width and height of the prepared image.

    Raises:
        RuntimeError: If no model is loaded on this instance.
    """
    if self._model is None:
        raise RuntimeError("Model not loaded. Call _load_model() first.")

    # Prepare image
    pil_image = self._prepare_image(image)
    img_width, img_height = pil_image.size

    # Run inference; a single image is passed, so only the first result is used.
    results = self._model.predict(
        pil_image,
        imgsz=self.config.img_size,
        conf=self.config.confidence,
        device=self._device,
    )
    result = results[0]

    # Parse detections
    layout_boxes = []
    boxes = getattr(result, "boxes", None)
    if boxes is not None:
        # Convert each tensor to plain Python lists once, up front, rather
        # than indexing and converting element by element inside the loop.
        all_coords = boxes.xyxy.cpu().tolist()
        all_class_ids = boxes.cls.cpu().tolist()
        all_confidences = boxes.conf.cpu().tolist()

        for bbox_coords, raw_class_id, raw_confidence in zip(
            all_coords, all_class_ids, all_confidences
        ):
            class_id = int(raw_class_id)
            confidence = float(raw_confidence)

            # Unknown class ids fall back to a synthetic "class_<id>" label.
            original_label = DOCLAYOUT_YOLO_CLASS_NAMES.get(class_id, f"class_{class_id}")
            # Map to standardized label
            standard_label = DOCLAYOUT_YOLO_MAPPING.to_standard(original_label)

            layout_boxes.append(
                LayoutBox(
                    label=standard_label,
                    bbox=BoundingBox.from_list(bbox_coords),
                    confidence=confidence,
                    class_id=class_id,
                    original_label=original_label,
                )
            )

    # Order top to bottom by y1, breaking ties left to right by x1.
    layout_boxes.sort(key=lambda b: (b.bbox.y1, b.bbox.x1))

    return LayoutOutput(
        bboxes=layout_boxes,
        image_width=img_width,
        image_height=img_height,
        model_name="DocLayout-YOLO",
    )
|