Changes: - Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector - Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla - Add LayoutModelSelector component and zh-TW translations - Fix "default" model behavior with sentinel value for PubLayNet - Add gap filling service for OCR track coverage improvement - Add PP-Structure debug utilities - Archive completed/incomplete proposals: - add-ocr-track-gap-filling (complete) - fix-ocr-track-table-rendering (incomplete) - simplify-ppstructure-model-selection (22/25 tasks) - Add new layout model tests, archive old PP-Structure param tests - Update OpenSpec ocr-processing spec with layout model requirements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
504 lines
19 KiB
Python
504 lines
19 KiB
Python
"""
Tests for Gap Filling Service

Tests the detection and filling of gaps in PP-StructureV3 output
using raw OCR text regions.
"""
|
|
|
|
import pytest
|
|
from typing import List, Dict, Any
|
|
|
|
from app.services.gap_filling_service import GapFillingService, TextRegion, SKIP_ELEMENT_TYPES
|
|
from app.models.unified_document import DocumentElement, BoundingBox, ElementType, Dimensions
|
|
|
|
|
|
class TestGapFillingService:
    """Tests for GapFillingService class.

    The tests exercise activation based on coverage, detection of raw OCR
    regions that PP-StructureV3 elements do not cover, deduplication,
    conversion of regions to elements, reading-order recalculation, the
    ``fill_gaps`` entry point, and the private geometry helpers.
    """

    @pytest.fixture
    def service(self) -> GapFillingService:
        """Create a GapFillingService instance with explicit thresholds.

        The thresholds are spelled out (rather than left to defaults) because
        several tests below reason about them numerically.
        """
        return GapFillingService(
            coverage_threshold=0.7,
            iou_threshold=0.15,
            confidence_threshold=0.3,
            dedup_iou_threshold=0.5,
            enabled=True,
        )

    @pytest.fixture
    def disabled_service(self) -> GapFillingService:
        """Create a disabled GapFillingService instance."""
        return GapFillingService(enabled=False)

    @pytest.fixture
    def sample_raw_regions(self) -> List[TextRegion]:
        """Create sample raw OCR text regions.

        Five regions have high confidence; the last one ("Noise") has a
        confidence of 0.1, below the 0.3 threshold used by ``service``.
        """
        return [
            TextRegion(text="Header text", bbox=[100, 50, 300, 80], confidence=0.95, page=1),
            TextRegion(text="Title of document", bbox=[100, 100, 500, 150], confidence=0.92, page=1),
            TextRegion(text="First paragraph", bbox=[100, 200, 500, 250], confidence=0.90, page=1),
            TextRegion(text="Second paragraph", bbox=[100, 300, 500, 350], confidence=0.88, page=1),
            TextRegion(text="Footer note", bbox=[100, 900, 300, 930], confidence=0.85, page=1),
            # Low confidence region (should be filtered)
            TextRegion(text="Noise", bbox=[50, 50, 80, 80], confidence=0.1, page=1),
        ]

    @pytest.fixture
    def sample_pp_elements(self) -> List[DocumentElement]:
        """Create sample PP-StructureV3 elements that cover only some regions.

        The two elements share their boxes with the "Title of document" and
        "First paragraph" regions of ``sample_raw_regions``.
        """
        return [
            DocumentElement(
                element_id="pp_1",
                type=ElementType.TITLE,
                content="Title of document",
                bbox=BoundingBox(x0=100, y0=100, x1=500, y1=150),
                confidence=0.95,
            ),
            DocumentElement(
                element_id="pp_2",
                type=ElementType.TEXT,
                content="First paragraph",
                bbox=BoundingBox(x0=100, y0=200, x1=500, y1=250),
                confidence=0.90,
            ),
            # Note: Header, Second paragraph, and Footer are NOT covered
        ]

    def test_service_initialization(self, service: GapFillingService):
        """Test service initializes with correct parameters."""
        assert service.enabled is True
        assert service.coverage_threshold == 0.7
        assert service.iou_threshold == 0.15
        assert service.confidence_threshold == 0.3
        assert service.dedup_iou_threshold == 0.5

    def test_disabled_service(self, disabled_service: GapFillingService):
        """Test disabled service does not activate."""
        regions = [TextRegion(text="Test", bbox=[0, 0, 100, 100], confidence=0.9, page=1)]
        elements = []

        should_activate, coverage = disabled_service.should_activate(regions, elements)

        assert should_activate is False
        assert coverage == 1.0

    def test_should_activate_low_coverage(
        self,
        service: GapFillingService,
        sample_raw_regions: List[TextRegion],
        sample_pp_elements: List[DocumentElement],
    ):
        """Test activation when coverage is below threshold."""
        # Filter out low confidence regions, using the service's own threshold
        # so the test cannot drift away from the fixture configuration.
        valid_regions = [
            region
            for region in sample_raw_regions
            if region.confidence >= service.confidence_threshold
        ]

        should_activate, coverage = service.should_activate(valid_regions, sample_pp_elements)

        # Only 2 out of 5 valid regions are covered (Title, First paragraph)
        assert should_activate is True
        assert coverage < 0.7  # Below threshold

    def test_should_not_activate_high_coverage(self, service: GapFillingService):
        """Test no activation when coverage is above threshold."""
        # All regions covered
        regions = [
            TextRegion(text="Text 1", bbox=[100, 100, 200, 150], confidence=0.9, page=1),
            TextRegion(text="Text 2", bbox=[100, 200, 200, 250], confidence=0.9, page=1),
        ]
        elements = [
            DocumentElement(
                element_id="pp_1",
                type=ElementType.TEXT,
                content="Text 1",
                bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200),  # Covers first region
                confidence=0.95,
            ),
            DocumentElement(
                element_id="pp_2",
                type=ElementType.TEXT,
                content="Text 2",
                bbox=BoundingBox(x0=50, y0=180, x1=250, y1=300),  # Covers second region
                confidence=0.95,
            ),
        ]

        should_activate, coverage = service.should_activate(regions, elements)

        assert should_activate is False
        assert coverage >= 0.7

    def test_find_uncovered_regions(
        self,
        service: GapFillingService,
        sample_raw_regions: List[TextRegion],
        sample_pp_elements: List[DocumentElement],
    ):
        """Test finding uncovered regions."""
        uncovered = service.find_uncovered_regions(sample_raw_regions, sample_pp_elements)

        # Should find Header, Second paragraph, Footer
        # (not Title, First paragraph, or low-confidence Noise)
        assert len(uncovered) == 3

        uncovered_texts = [region.text for region in uncovered]
        assert "Header text" in uncovered_texts
        assert "Second paragraph" in uncovered_texts
        assert "Footer note" in uncovered_texts
        assert "Title of document" not in uncovered_texts  # Covered
        assert "First paragraph" not in uncovered_texts  # Covered
        assert "Noise" not in uncovered_texts  # Low confidence

    def test_coverage_by_center_point(self, service: GapFillingService):
        """Test coverage detection via center point."""
        region = TextRegion(text="Test", bbox=[150, 150, 250, 200], confidence=0.9, page=1)
        element = DocumentElement(
            element_id="pp_1",
            type=ElementType.TEXT,
            content="Container",
            bbox=BoundingBox(x0=100, y0=100, x1=300, y1=250),  # Contains region's center
            confidence=0.95,
        )

        is_covered = service._is_region_covered(region, [element])

        assert is_covered is True

    def test_coverage_by_iou(self, service: GapFillingService):
        """Test coverage detection via IoU threshold.

        The element is placed so that the region's center (150, 125) lies
        strictly outside it. With an element starting at x0=150 the center
        would sit on the element's edge, and edges count as inside (see
        ``test_point_in_bbox``), so the center-point rule could mask the IoU
        rule this test is meant to exercise.
        """
        region = TextRegion(text="Test", bbox=[100, 100, 200, 150], confidence=0.9, page=1)
        element_box = (160, 100, 260, 150)
        element = DocumentElement(
            element_id="pp_1",
            type=ElementType.TEXT,
            content="Overlap",
            bbox=BoundingBox(x0=160, y0=100, x1=260, y1=150),  # Partial overlap
            confidence=0.95,
        )

        # Guard: the center-point shortcut must not apply to this layout.
        center_x, center_y = region.center
        assert service._point_in_bbox(center_x, center_y, element_box) is False

        # Calculate expected IoU
        # Intersection: (200-160) x (150-100) = 40 x 50 = 2000
        # Union: 100x50 + 100x50 - 2000 = 8000
        # IoU = 2000/8000 = 0.25 > 0.15 threshold
        is_covered = service._is_region_covered(region, [element])

        assert is_covered is True

    def test_deduplication(
        self,
        service: GapFillingService,
        sample_pp_elements: List[DocumentElement],
    ):
        """Test deduplication removes high-overlap regions."""
        uncovered = [
            # High overlap with pp_2 (First paragraph)
            TextRegion(text="First paragraph variant", bbox=[100, 200, 500, 250], confidence=0.9, page=1),
            # No overlap
            TextRegion(text="Unique region", bbox=[100, 500, 300, 550], confidence=0.9, page=1),
        ]

        deduplicated = service.deduplicate_regions(uncovered, sample_pp_elements)

        assert len(deduplicated) == 1
        assert deduplicated[0].text == "Unique region"

    def test_convert_regions_to_elements(self, service: GapFillingService):
        """Test conversion of TextRegions to DocumentElements."""
        regions = [
            TextRegion(text="Test text 1", bbox=[100, 100, 200, 150], confidence=0.85, page=1),
            TextRegion(text="Test text 2", bbox=[100, 200, 200, 250], confidence=0.90, page=1),
        ]

        elements = service.convert_regions_to_elements(regions, page_number=1, start_element_id=0)

        assert len(elements) == 2

        first, second = elements
        assert first.element_id == "gap_fill_1_0"
        assert first.type == ElementType.TEXT
        assert first.content == "Test text 1"
        assert first.confidence == 0.85
        assert first.metadata.get('source') == 'gap_filling'

        assert second.element_id == "gap_fill_1_1"
        assert second.content == "Test text 2"

    def test_recalculate_reading_order(self, service: GapFillingService):
        """Test reading order recalculation."""
        # Elements are deliberately listed out of vertical order.
        elements = [
            DocumentElement(
                element_id="e3",
                type=ElementType.TEXT,
                content="Bottom",
                bbox=BoundingBox(x0=100, y0=300, x1=200, y1=350),
                confidence=0.9,
            ),
            DocumentElement(
                element_id="e1",
                type=ElementType.TEXT,
                content="Top",
                bbox=BoundingBox(x0=100, y0=100, x1=200, y1=150),
                confidence=0.9,
            ),
            DocumentElement(
                element_id="e2",
                type=ElementType.TEXT,
                content="Middle",
                bbox=BoundingBox(x0=100, y0=200, x1=200, y1=250),
                confidence=0.9,
            ),
        ]

        reading_order = service.recalculate_reading_order(elements)

        # Should be sorted by y0: Top (100), Middle (200), Bottom (300)
        assert reading_order == [1, 2, 0]  # Indices of elements in reading order

    def test_fill_gaps_integration(
        self,
        service: GapFillingService,
    ):
        """Integration test for fill_gaps method."""
        # Raw OCR regions (dict format as received from OCR service)
        raw_regions = [
            {'text': 'Header', 'bbox': [100, 50, 300, 80], 'confidence': 0.95, 'page': 1},
            {'text': 'Title', 'bbox': [100, 100, 500, 150], 'confidence': 0.92, 'page': 1},
            {'text': 'Paragraph 1', 'bbox': [100, 200, 500, 250], 'confidence': 0.90, 'page': 1},
            {'text': 'Paragraph 2', 'bbox': [100, 300, 500, 350], 'confidence': 0.88, 'page': 1},
            {'text': 'Paragraph 3', 'bbox': [100, 400, 500, 450], 'confidence': 0.86, 'page': 1},
            {'text': 'Footer', 'bbox': [100, 900, 300, 930], 'confidence': 0.85, 'page': 1},
        ]

        # PP-StructureV3 only detected Title (missing 5 out of 6 regions = 16.7% coverage)
        pp_elements = [
            DocumentElement(
                element_id="pp_1",
                type=ElementType.TITLE,
                content="Title",
                bbox=BoundingBox(x0=100, y0=100, x1=500, y1=150),
                confidence=0.95,
            ),
        ]

        supplemented, stats = service.fill_gaps(
            raw_ocr_regions=raw_regions,
            pp_structure_elements=pp_elements,
            page_number=1,
        )

        # Should have activated and supplemented missing regions
        assert stats['activated'] is True
        assert stats['coverage_ratio'] < 0.7
        assert len(supplemented) == 5  # Header, Paragraph 1, 2, 3, Footer

    def test_fill_gaps_no_activation_when_coverage_high(self, service: GapFillingService):
        """Test fill_gaps does not activate when coverage is high."""
        raw_regions = [
            {'text': 'Text 1', 'bbox': [100, 100, 200, 150], 'confidence': 0.9, 'page': 1},
        ]
        pp_elements = [
            DocumentElement(
                element_id="pp_1",
                type=ElementType.TEXT,
                content="Text 1",
                bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200),  # Fully covers
                confidence=0.95,
            ),
        ]

        supplemented, stats = service.fill_gaps(
            raw_ocr_regions=raw_regions,
            pp_structure_elements=pp_elements,
            page_number=1,
        )

        assert stats['activated'] is False
        assert len(supplemented) == 0

    def test_skip_element_types_not_supplemented(self, service: GapFillingService):
        """Test that TABLE/IMAGE/etc. elements are not supplemented over."""
        raw_regions = [
            {'text': 'Table cell text', 'bbox': [100, 100, 200, 150], 'confidence': 0.9, 'page': 1},
        ]

        # PP-StructureV3 has a table covering this region
        pp_elements = [
            DocumentElement(
                element_id="pp_1",
                type=ElementType.TABLE,
                content="<table>...</table>",
                bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200),
                confidence=0.95,
            ),
        ]

        # The region should be considered covered by the table
        supplemented, stats = service.fill_gaps(
            raw_ocr_regions=raw_regions,
            pp_structure_elements=pp_elements,
            page_number=1,
        )

        # Should not supplement because the table covers it
        assert len(supplemented) == 0

    def test_coordinate_scaling(self, service: GapFillingService):
        """Test coordinate alignment with different dimensions."""
        # OCR was done at 2000x3000, PP-Structure at 1000x1500
        ocr_dimensions = {'width': 2000, 'height': 3000}
        pp_dimensions = Dimensions(width=1000, height=1500)

        raw_regions = [
            # At OCR scale: (200, 300) to (400, 450) -> at PP scale: (100, 150) to (200, 225)
            {'text': 'Scaled text', 'bbox': [200, 300, 400, 450], 'confidence': 0.9, 'page': 1},
        ]
        pp_elements = [
            DocumentElement(
                element_id="pp_1",
                type=ElementType.TEXT,
                content="Scaled text",
                bbox=BoundingBox(x0=100, y0=150, x1=200, y1=225),  # Should cover after scaling
                confidence=0.95,
            ),
        ]

        supplemented, stats = service.fill_gaps(
            raw_ocr_regions=raw_regions,
            pp_structure_elements=pp_elements,
            page_number=1,
            ocr_dimensions=ocr_dimensions,
            pp_dimensions=pp_dimensions,
        )

        # After scaling, the region should be covered
        assert stats['coverage_ratio'] >= 0.7 or len(supplemented) == 0

    def test_iou_calculation(self, service: GapFillingService):
        """Test IoU calculation accuracy."""
        # Two identical boxes
        bbox1 = (0, 0, 100, 100)
        bbox2 = (0, 0, 100, 100)
        assert service._calculate_iou(bbox1, bbox2) == 1.0

        # No overlap
        bbox1 = (0, 0, 100, 100)
        bbox2 = (200, 200, 300, 300)
        assert service._calculate_iou(bbox1, bbox2) == 0.0

        # 50% overlap
        bbox1 = (0, 0, 100, 100)
        bbox2 = (50, 0, 150, 100)  # Shifted right by 50
        # Intersection: 50x100 = 5000
        # Union: 10000 + 10000 - 5000 = 15000
        # IoU = 5000/15000 = 0.333...
        iou = service._calculate_iou(bbox1, bbox2)
        assert iou == pytest.approx(1 / 3, abs=0.01)

    def test_point_in_bbox(self, service: GapFillingService):
        """Test point-in-bbox check."""
        bbox = (100, 100, 200, 200)

        # Inside
        assert service._point_in_bbox(150, 150, bbox) is True

        # On edge
        assert service._point_in_bbox(100, 100, bbox) is True
        assert service._point_in_bbox(200, 200, bbox) is True

        # Outside
        assert service._point_in_bbox(50, 150, bbox) is False
        assert service._point_in_bbox(250, 150, bbox) is False

    def test_merge_adjacent_regions(self, service: GapFillingService):
        """Test merging of adjacent text regions."""
        regions = [
            TextRegion(text="Hello", bbox=[100, 100, 150, 130], confidence=0.9, page=1),
            TextRegion(text="World", bbox=[160, 100, 210, 130], confidence=0.85, page=1),  # Adjacent
            TextRegion(text="Far away", bbox=[100, 300, 200, 330], confidence=0.9, page=1),  # Not adjacent
        ]

        merged = service.merge_adjacent_regions(regions, max_horizontal_gap=20, max_vertical_gap=10)

        assert len(merged) == 2
        # First two should be merged; separate asserts pinpoint which word is missing
        assert "Hello" in merged[0].text
        assert "World" in merged[0].text
        assert merged[1].text == "Far away"
|
|
|
|
|
|
class TestTextRegion:
    """Checks of the ``normalized_bbox`` and ``center`` accessors of TextRegion."""

    def test_normalized_bbox_4_values(self):
        """A four-value bbox normalizes to the same four values."""
        rect = [100, 200, 300, 400]
        region = TextRegion(text="Test", bbox=rect, confidence=0.9, page=1)

        expected = (100, 200, 300, 400)
        assert region.normalized_bbox == expected

    def test_normalized_bbox_polygon_flat(self):
        """A flat eight-value polygon normalizes to its bounding rectangle."""
        # Four corner points flattened as [x1, y1, x2, y2, x3, y3, x4, y4]
        flat_polygon = [100, 200, 300, 200, 300, 400, 100, 400]
        region = TextRegion(text="Test", bbox=flat_polygon, confidence=0.9, page=1)

        expected = (100, 200, 300, 400)
        assert region.normalized_bbox == expected

    def test_normalized_bbox_polygon_nested(self):
        """A nested polygon (PaddleOCR format) normalizes to its bounding rectangle."""
        # PaddleOCR style: one [x, y] pair per corner
        corners = [[100, 200], [300, 200], [300, 400], [100, 400]]
        region = TextRegion(text="Test", bbox=corners, confidence=0.9, page=1)

        expected = (100, 200, 300, 400)
        assert region.normalized_bbox == expected

    def test_normalized_bbox_numpy_polygon(self):
        """A nested polygon with float coordinates keeps its fractional values."""
        # Shape of data seen when PaddleOCR numpy arrays are converted to lists
        float_corners = [[100.5, 200.5], [300.5, 200.5], [300.5, 400.5], [100.5, 400.5]]
        region = TextRegion(text="Test", bbox=float_corners, confidence=0.9, page=1)

        result = region.normalized_bbox

        # Checked one component at a time, in x0, y0, x1, y1 order
        for position, wanted in enumerate((100.5, 200.5, 300.5, 400.5)):
            assert result[position] == wanted

    def test_center_calculation(self):
        """The center of a four-value bbox is the midpoint of its extents."""
        rect = [100, 200, 300, 400]
        region = TextRegion(text="Test", bbox=rect, confidence=0.9, page=1)

        midpoint = (200, 300)
        assert region.center == midpoint

    def test_center_calculation_nested_bbox(self):
        """The center is also available when the bbox is a nested polygon."""
        corners = [[100, 200], [300, 200], [300, 400], [100, 400]]
        region = TextRegion(text="Test", bbox=corners, confidence=0.9, page=1)

        midpoint = (200, 300)
        assert region.center == midpoint
|
|
|
|
|
|
class TestOCRToUnifiedConverterIntegration:
    """Checks how OCRToUnifiedConverter wires up its gap filling service."""

    def test_converter_with_gap_filling_enabled(self):
        """With ``enable_gap_filling=True`` the converter exposes a gap filling service."""
        # Imported inside the test, as in the original, so module collection
        # does not import the converter.
        from app.services.ocr_to_unified_converter import OCRToUnifiedConverter

        gap_service = OCRToUnifiedConverter(enable_gap_filling=True).gap_filling_service

        assert gap_service is not None

    def test_converter_with_gap_filling_disabled(self):
        """With ``enable_gap_filling=False`` the converter has no gap filling service."""
        from app.services.ocr_to_unified_converter import OCRToUnifiedConverter

        gap_service = OCRToUnifiedConverter(enable_gap_filling=False).gap_filling_service

        assert gap_service is None
|