feat: simplify layout model selection and archive proposals

Changes:
- Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 13:27:00 +08:00
parent c65df754cf
commit 59206a6ab8
35 changed files with 3621 additions and 658 deletions

View File

@@ -1,381 +0,0 @@
"""
Performance benchmarks for PP-StructureV3 parameter customization
Measures memory usage, processing time, and engine initialization overhead
"""
import pytest
import psutil
import gc
import time
from pathlib import Path
from unittest.mock import Mock, patch
from app.services.ocr_service import OCRService
@pytest.fixture
def ocr_service():
    """Provide a newly constructed ``OCRService`` to the requesting test."""
    service = OCRService()
    return service
@pytest.fixture
def sample_image():
    """Return a sample document from the demo directory, or ``None``.

    Extensions are tried in priority order: ``.pdf`` first, then ``.png``,
    ``.jpg`` and ``.jpeg``. Within one extension the lexicographically
    smallest path is returned, so the same file is picked on every run
    regardless of the order in which the filesystem lists entries.

    Returns:
        A ``Path`` to the chosen file, or ``None`` when the demo directory is
        missing or holds no file with a supported extension.
    """
    # NOTE: developer-specific absolute path; on any other machine this
    # fixture yields None.
    demo_dir = Path('/home/egg/project/Tool_OCR/demo_docs')
    if not demo_dir.is_dir():
        return None

    for ext in ('.pdf', '.png', '.jpg', '.jpeg'):
        # min() consumes the glob lazily and gives a stable pick; bare glob
        # order is filesystem-dependent.
        candidate = min(demo_dir.glob(f'*{ext}'), default=None)
        if candidate is not None:
            return candidate
    return None
class MemoryTracker:
    """Track resident-set-size (RSS) changes of a process, in megabytes.

    Call ``start()`` to record a baseline, then ``sample()`` / ``get_delta()``
    at points of interest. The peak is the largest reading observed by any of
    the sampling methods since ``start()``.
    """

    def __init__(self):
        # psutil.Process() with no pid refers to the current process.
        self.process = psutil.Process()
        self.start_memory = 0
        self.peak_memory = 0

    def _rss_mb(self):
        """Return the process RSS converted from bytes to MB."""
        return self.process.memory_info().rss / 1024 / 1024

    def start(self):
        """Record the baseline reading and reset the peak to it."""
        gc.collect()  # Collect garbage first so the baseline excludes it
        self.start_memory = self._rss_mb()
        self.peak_memory = self.start_memory

    def sample(self):
        """Take a reading, update the peak if exceeded, and return it (MB)."""
        current = self._rss_mb()
        self.peak_memory = max(self.peak_memory, current)
        return current

    def get_delta(self):
        """Return current memory minus the baseline (MB); also updates the peak."""
        return self.sample() - self.start_memory

    def get_peak_delta(self):
        """Return peak memory minus the baseline (MB).

        A fresh reading is taken first so the peak is never stale when the
        caller has not sampled since memory last grew.
        """
        self.sample()
        return self.peak_memory - self.start_memory
@pytest.mark.performance
class TestEngineInitializationPerformance:
    """Benchmarks for structure-engine initialization with custom parameters.

    ``PPStructureV3`` is patched with a mock in the engine tests, so the
    timings cover the service's own bookkeeping rather than model loading.
    Intervals use ``time.perf_counter()``, a monotonic high-resolution clock.
    """

    def test_default_engine_initialization_time(self, ocr_service):
        """Default engine is built once and the second request hits the cache."""
        print("\n=== Default Engine Initialization ===")

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_engine = Mock()
            mock_ppstructure.return_value = mock_engine

            # First initialization (creates engine)
            start = time.perf_counter()
            engine1 = ocr_service._ensure_structure_engine(custom_params=None)
            first_init_time = time.perf_counter() - start
            print(f"First initialization: {first_init_time * 1000:.2f}ms")

            # Second initialization (uses cache)
            start = time.perf_counter()
            engine2 = ocr_service._ensure_structure_engine(custom_params=None)
            cached_time = time.perf_counter() - start
            print(f"Cached access: {cached_time * 1000:.2f}ms")

            # A mocked cached lookup can finish below timer resolution;
            # guard the ratio so the report cannot raise ZeroDivisionError.
            if cached_time > 0:
                print(f"Speedup: {first_init_time / cached_time:.1f}x")
            else:
                print("Speedup: n/a (cached access below timer resolution)")

            # Verify caching works
            assert engine1 is engine2
            assert mock_ppstructure.call_count == 1

            # Cached access should be much faster
            assert cached_time < first_init_time / 10

    def test_custom_engine_initialization_time(self, ocr_service):
        """Every request with custom parameters constructs a new engine."""
        print("\n=== Custom Engine Initialization ===")

        custom_params = {
            'layout_detection_threshold': 0.15,
            'text_det_thresh': 0.2
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            # Multiple initializations (no caching)
            times = []
            for i in range(3):
                start = time.perf_counter()
                ocr_service._ensure_structure_engine(custom_params=custom_params)
                init_time = time.perf_counter() - start
                times.append(init_time)
                print(f"Run {i+1}: {init_time * 1000:.2f}ms")

            avg_time = sum(times) / len(times)
            print(f"Average: {avg_time * 1000:.2f}ms")

            # Each call should create new engine (no caching)
            assert mock_ppstructure.call_count == 3

    def test_parameter_extraction_overhead(self):
        """Building and dumping ``PPStructureV3Params`` averages under 1 ms."""
        print("\n=== Parameter Extraction Overhead ===")

        from app.schemas.task import PPStructureV3Params

        # Test parameter validation performance
        iterations = 1000

        # Valid parameters
        start = time.perf_counter()
        for _ in range(iterations):
            params = PPStructureV3Params(
                layout_detection_threshold=0.15,
                text_det_thresh=0.2
            )
            _ = params.model_dump(exclude_none=True)
        valid_time = time.perf_counter() - start

        print(f"Valid params ({iterations} iterations): {valid_time * 1000:.2f}ms")
        print(f"Per-operation: {valid_time / iterations * 1000:.4f}ms")

        # Validation should be fast
        assert valid_time / iterations < 0.001  # < 1ms per operation
@pytest.mark.performance
class TestMemoryUsage:
    """Memory-usage checks for default and custom-parameter engines.

    ``PPStructureV3`` is patched with a mock throughout, so the readings
    reflect the service's own allocations, not real model weights.
    """

    def test_default_engine_memory_usage(self, ocr_service):
        """Reusing the cached default engine must not grow memory by 10 MB or more."""
        print("\n=== Default Engine Memory Usage ===")

        tracker = MemoryTracker()
        tracker.start()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Create mock engine with some memory footprint
            mock_engine = Mock()
            mock_engine.memory_size = 100  # Simulated memory
            mock_ppstructure.return_value = mock_engine

            print(f"Baseline memory: {tracker.start_memory:.2f} MB")

            # Initialize engine
            ocr_service._ensure_structure_engine(custom_params=None)
            memory_delta = tracker.get_delta()
            print(f"After initialization: {memory_delta:.2f} MB")

            # Access cached engine multiple times
            for _ in range(10):
                ocr_service._ensure_structure_engine(custom_params=None)

            memory_after_reuse = tracker.get_delta()
            print(f"After 10 reuses: {memory_after_reuse:.2f} MB")

            # Memory should not increase significantly with reuse
            assert abs(memory_after_reuse - memory_delta) < 10  # < 10MB increase

    def test_custom_engine_memory_cleanup(self, ocr_service):
        """Create five custom engines, drop them, and report memory figures.

        The memory numbers are only printed: how much is actually reclaimed
        depends on the garbage collector, so no threshold is asserted on them.
        The test does assert that each request built its own engine.
        """
        print("\n=== Custom Engine Memory Cleanup ===")

        tracker = MemoryTracker()
        tracker.start()

        custom_params = {'layout_detection_threshold': 0.15}

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            print(f"Baseline memory: {tracker.start_memory:.2f} MB")

            # Create multiple engines with custom params
            engines = []
            for i in range(5):
                engine = ocr_service._ensure_structure_engine(custom_params=custom_params)
                engines.append(engine)

                if i == 0:
                    first_engine_memory = tracker.get_delta()
                    print(f"After 1st engine: {first_engine_memory:.2f} MB")

            memory_after_all = tracker.get_delta()
            print(f"After 5 engines: {memory_after_all:.2f} MB")

            # Without this check the test could never fail: custom-parameter
            # requests are expected to construct one engine per call.
            assert mock_ppstructure.call_count == 5

            # Clear references
            engines.clear()
            gc.collect()

            memory_after_cleanup = tracker.get_delta()
            print(f"After cleanup: {memory_after_cleanup:.2f} MB")

            # This is a rough report only, as actual cleanup depends on Python GC
            peak_delta = tracker.get_peak_delta()
            print(f"Peak delta: {peak_delta:.2f} MB")

    def test_no_memory_leak_in_parameter_passing(self, ocr_service):
        """100 custom-parameter requests must keep peak growth under 50 MB."""
        print("\n=== Memory Leak Test ===")

        tracker = MemoryTracker()
        tracker.start()

        custom_params = {
            'layout_detection_threshold': 0.15,
            'text_det_thresh': 0.2,
            'layout_merge_bboxes_mode': 'small'
        }

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            print(f"Baseline: {tracker.start_memory:.2f} MB")

            # Simulate many requests with custom params
            iterations = 100
            for i in range(iterations):
                # Create engine; a copy is passed so each request owns its dict
                engine = ocr_service._ensure_structure_engine(custom_params=custom_params.copy())

                # Sample memory every 10 iterations
                if i % 10 == 0:
                    memory_delta = tracker.get_delta()
                    print(f"Iteration {i}: {memory_delta:.2f} MB")

                # Clear reference
                del engine

                # Force GC periodically
                if i % 50 == 0:
                    gc.collect()

            final_memory = tracker.get_delta()
            print(f"Final: {final_memory:.2f} MB")
            print(f"Peak: {tracker.get_peak_delta():.2f} MB")

            # Memory growth should be bounded
            # Allow up to 50MB growth for 100 iterations
            assert tracker.get_peak_delta() < 50
@pytest.mark.performance
class TestProcessingPerformance:
    """End-to-end ``process_image`` timing: default vs. custom parameters."""

    def test_processing_time_comparison(self, ocr_service, sample_image):
        """Time one run with default parameters and one with custom ones.

        The OCR engine and ``PPStructureV3`` are both mocked. The test is
        skipped when no sample document is available, and it only requires
        that both runs report ``status == 'success'``.
        """
        if sample_image is None:
            pytest.skip("No sample image available")

        print("\n=== Processing Time Comparison ===")
        print(f"Image: {sample_image.name}")

        # One flat ``with`` instead of three nested ones; entry order is kept.
        with patch.object(ocr_service, 'get_ocr_engine') as mock_get_ocr, \
                patch.object(ocr_service, 'structure_engine', None), \
                patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Stub OCR engine returning a single recognised text box
            fake_ocr = Mock()
            fake_ocr.ocr.return_value = [[[[0, 0], [100, 0], [100, 50], [0, 50]], ('test', 0.9)]]
            mock_get_ocr.return_value = fake_ocr

            # Stub structure engine whose call yields an empty list
            fake_structure = Mock()
            fake_structure.return_value = []
            mock_ppstructure.return_value = fake_structure

            # Baseline run: no parameter overrides
            began = time.time()
            baseline_result = ocr_service.process_image(
                image_path=sample_image,
                detect_layout=True,
                pp_structure_params=None
            )
            baseline_elapsed = time.time() - began
            print(f"Default params: {baseline_elapsed * 1000:.2f}ms")

            # Second run: explicit threshold overrides
            overrides = {
                'layout_detection_threshold': 0.15,
                'text_det_thresh': 0.2
            }
            began = time.time()
            override_result = ocr_service.process_image(
                image_path=sample_image,
                detect_layout=True,
                pp_structure_params=overrides
            )
            override_elapsed = time.time() - began
            print(f"Custom params: {override_elapsed * 1000:.2f}ms")

            gap = abs(override_elapsed - baseline_elapsed)
            print(f"Difference: {gap * 1000:.2f}ms")

            # Both runs have to report success
            assert baseline_result['status'] == 'success'
            assert override_result['status'] == 'success'
@pytest.mark.performance
@pytest.mark.benchmark
class TestConcurrentPerformance:
    """Cache-isolation check for interleaved default and custom requests."""

    def test_concurrent_custom_params_no_cache_pollution(self, ocr_service):
        """A custom-parameter request must not replace the cached default engine.

        The calls are issued one after another in a single thread:
        default, custom, then default again.
        """
        print("\n=== Concurrent Cache Test ===")

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            # Two distinguishable stand-in engines
            cached_stub = Mock(type='default')
            oneoff_stub = Mock(type='custom')

            # Step 1: no custom params -> the default engine is built
            mock_ppstructure.return_value = cached_stub
            first = ocr_service._ensure_structure_engine(custom_params=None)
            assert first.type == 'default'
            print("✓ Created default (cached) engine")

            # Step 2: custom params -> a separate engine is built
            mock_ppstructure.return_value = oneoff_stub
            overrides = {'layout_detection_threshold': 0.15}
            second = ocr_service._ensure_structure_engine(custom_params=overrides)
            assert second.type == 'custom'
            print("✓ Created custom (uncached) engine")

            # Step 3: no custom params again -> the step-1 engine comes back
            third = ocr_service._ensure_structure_engine(custom_params=None)
            assert third.type == 'default'
            assert third is first
            print("✓ Retrieved default engine from cache (not polluted)")

            # Exactly two constructions overall: one default, one custom
            assert mock_ppstructure.call_count == 2  # default + custom
def run_benchmarks():
    """Run every test in this file marked ``performance`` and print a report.

    Pytest is invoked verbosely with output capturing disabled (``-s``) so
    the timing and memory figures printed by the tests are visible.

    Returns:
        The exit status returned by ``pytest.main``, so callers can tell
        whether the benchmarks passed.
    """
    print("=" * 60)
    print("PP-StructureV3 Parameters - Performance Benchmark Report")
    print("=" * 60)

    return pytest.main([
        __file__,
        '-v',
        '-s',
        '-m', 'performance',
        '--tb=short'
    ])


if __name__ == '__main__':
    # Propagate pytest's status so a failing benchmark yields a non-zero
    # process exit code instead of always exiting 0.
    raise SystemExit(run_benchmarks())