OCR test

In [ ]:

Copied!





def test_ocr_extraction():
    """Run each OCR extractor on the sample invoice image and report the outcome.

    For every extractor the function prints a header, imports and
    instantiates the extractor class, calls ``extract`` on
    ``Omnidocs/tests/ocr_extraction/assets/invoice.jpg`` (a path relative to
    the current working directory), prints the length of ``result.full_text``
    and then calls ``visualize`` twice: once with default styling
    (``visualized_<ClassName>.png``) and once with custom box/text styling
    (``styled_<ClassName>.png``).

    Any exception raised while importing or exercising one extractor is
    caught and printed as ``ERROR: ...``; the loop then continues with the
    next extractor. Because failures are only printed, this function never
    fails when collected by a test runner.

    Returns:
        None. Results are reported on stdout.
    """
    from importlib import import_module
    from pathlib import Path

    extractors_package = "omnidocs.tasks.ocr_extraction.extractors"
    # (module, class) pairs. They are resolved lazily inside the per-extractor
    # ``try`` so that a missing or broken optional OCR backend only fails its
    # own entry instead of aborting the whole run at import time.
    extractor_specs = (
        ("paddle", "PaddleOCRExtractor"),
        ("tesseract_ocr", "TesseractOCRExtractor"),
        ("easy_ocr", "EasyOCRExtractor"),
        ("surya_ocr", "SuryaOCRExtractor"),
    )
    image_path = Path("Omnidocs") / "tests" / "ocr_extraction" / "assets" / "invoice.jpg"

    for module_name, class_name in extractor_specs:
        print(f"\nTesting {class_name}")
        print("-" * 40)

        try:
            module = import_module(f"{extractors_package}.{module_name}")
            extractor_cls = getattr(module, class_name)

            extractor = extractor_cls()
            result = extractor.extract(image_path)
            print(f"Text length: {len(result.full_text)} chars")

            # Default-styled visualization.
            vis_path = f"visualized_{class_name}.png"
            extractor.visualize(result, image_path, vis_path)

            # Load and visualize, if the result was already saved as JSON:
            # extractor.visualize_from_json("image.jpg", "results.json", "viz.png")

            # Same result again, with custom styling.
            extractor.visualize(
                result,
                image_path,
                f"styled_{class_name}.png",
                box_color='green',
                box_width=3,
                show_text=True,
                text_color='red',
            )

            print("SUCCESS")
        except Exception as e:
            # Best-effort comparison run: report and move on to the next backend.
            print(f"ERROR: {e}")

In [ ]:

Copied!

# Run the extractor comparison once; calling it twice back to back only
# repeats the same OCR work on the same image.
test_ocr_extraction()