feat: simplify layout model selection and archive proposals
Changes: - Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector - Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla - Add LayoutModelSelector component and zh-TW translations - Fix "default" model behavior with sentinel value for PubLayNet - Add gap filling service for OCR track coverage improvement - Add PP-Structure debug utilities - Archive completed/incomplete proposals: - add-ocr-track-gap-filling (complete) - fix-ocr-track-table-rendering (incomplete) - simplify-ppstructure-model-selection (22/25 tasks) - Add new layout model tests, archive old PP-Structure param tests - Update OpenSpec ocr-processing spec with layout model requirements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,7 @@ Tool_OCR - Configuration Management
|
|||||||
Loads environment variables and provides centralized configuration
|
Loads environment variables and provides centralized configuration
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -99,6 +99,33 @@ class Settings(BaseSettings):
|
|||||||
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
|
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
|
||||||
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
|
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
|
||||||
|
|
||||||
|
# Layout Detection Model Configuration
|
||||||
|
# Available models:
|
||||||
|
# - None: Use PP-StructureV3's built-in model (PubLayNet-based); note the configured default is "PP-DocLayout-S"
|
||||||
|
# - "PP-DocLayout-S": Better for Chinese docs, papers, contracts, exams (23 categories)
|
||||||
|
# - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout
|
||||||
|
layout_detection_model_name: Optional[str] = Field(
|
||||||
|
default="PP-DocLayout-S",
|
||||||
|
description="Layout detection model name. Set to 'PP-DocLayout-S' for better Chinese document support."
|
||||||
|
)
|
||||||
|
layout_detection_model_dir: Optional[str] = Field(
|
||||||
|
default=None,
|
||||||
|
description="Custom layout detection model directory. If None, downloads official model."
|
||||||
|
)
|
||||||
|
|
||||||
|
# ===== Gap Filling Configuration =====
|
||||||
|
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
|
||||||
|
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track
|
||||||
|
gap_filling_coverage_threshold: float = Field(default=0.7) # Activate when coverage < 70%
|
||||||
|
gap_filling_iou_threshold: float = Field(default=0.15) # IoU threshold for coverage detection
|
||||||
|
gap_filling_confidence_threshold: float = Field(default=0.3) # Min confidence for raw OCR regions
|
||||||
|
gap_filling_dedup_iou_threshold: float = Field(default=0.5) # IoU threshold for deduplication
|
||||||
|
|
||||||
|
# ===== Debug Configuration =====
|
||||||
|
# Enable debug outputs for PP-StructureV3 analysis
|
||||||
|
pp_structure_debug_enabled: bool = Field(default=True) # Save debug files for PP-StructureV3
|
||||||
|
pp_structure_debug_visualization: bool = Field(default=True) # Generate visualization images
|
||||||
|
|
||||||
# Performance tuning
|
# Performance tuning
|
||||||
use_fp16_inference: bool = Field(default=False) # Half-precision (if supported)
|
use_fp16_inference: bool = Field(default=False) # Half-precision (if supported)
|
||||||
enable_cudnn_benchmark: bool = Field(default=True) # Optimize convolution algorithms
|
enable_cudnn_benchmark: bool = Field(default=True) # Optimize convolution algorithms
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ def process_task_ocr(
|
|||||||
use_dual_track: bool = True,
|
use_dual_track: bool = True,
|
||||||
force_track: Optional[str] = None,
|
force_track: Optional[str] = None,
|
||||||
language: str = 'ch',
|
language: str = 'ch',
|
||||||
pp_structure_params: Optional[dict] = None
|
layout_model: Optional[str] = "chinese"
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Background task to process OCR for a task with dual-track support.
|
Background task to process OCR for a task with dual-track support.
|
||||||
@@ -84,7 +84,7 @@ def process_task_ocr(
|
|||||||
use_dual_track: Enable dual-track processing
|
use_dual_track: Enable dual-track processing
|
||||||
force_track: Force specific track ('ocr' or 'direct')
|
force_track: Force specific track ('ocr' or 'direct')
|
||||||
language: OCR language code
|
language: OCR language code
|
||||||
pp_structure_params: Optional custom PP-StructureV3 parameters (dict)
|
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||||
"""
|
"""
|
||||||
from app.core.database import SessionLocal
|
from app.core.database import SessionLocal
|
||||||
from app.models.task import Task
|
from app.models.task import Task
|
||||||
@@ -143,7 +143,7 @@ def process_task_ocr(
|
|||||||
output_dir=result_dir,
|
output_dir=result_dir,
|
||||||
use_dual_track=use_dual_track,
|
use_dual_track=use_dual_track,
|
||||||
force_track=force_track,
|
force_track=force_track,
|
||||||
pp_structure_params=pp_structure_params
|
layout_model=layout_model
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Fall back to traditional processing (no force_track support)
|
# Fall back to traditional processing (no force_track support)
|
||||||
@@ -152,7 +152,7 @@ def process_task_ocr(
|
|||||||
lang=language,
|
lang=language,
|
||||||
detect_layout=True,
|
detect_layout=True,
|
||||||
output_dir=result_dir,
|
output_dir=result_dir,
|
||||||
pp_structure_params=pp_structure_params
|
layout_model=layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
# Calculate processing time
|
# Calculate processing time
|
||||||
@@ -717,14 +717,14 @@ async def start_task(
|
|||||||
current_user: User = Depends(get_current_user)
|
current_user: User = Depends(get_current_user)
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Start processing a pending task with dual-track support and optional PP-StructureV3 parameter tuning
|
Start processing a pending task with dual-track support and layout model selection
|
||||||
|
|
||||||
- **task_id**: Task UUID
|
- **task_id**: Task UUID
|
||||||
- **options**: Processing options (in request body):
|
- **options**: Processing options (in request body):
|
||||||
- **use_dual_track**: Enable intelligent track selection (default: true)
|
- **use_dual_track**: Enable intelligent track selection (default: true)
|
||||||
- **force_track**: Force specific processing track ('ocr' or 'direct')
|
- **force_track**: Force specific processing track ('ocr' or 'direct')
|
||||||
- **language**: OCR language code (default: 'ch')
|
- **language**: OCR language code (default: 'ch')
|
||||||
- **pp_structure_params**: Fine-tuning parameters for PP-StructureV3 (OCR track only)
|
- **layout_model**: Layout detection model ('chinese', 'default', 'cdla')
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Parse processing options with defaults
|
# Parse processing options with defaults
|
||||||
@@ -735,11 +735,9 @@ async def start_task(
|
|||||||
force_track = options.force_track.value if options.force_track else None
|
force_track = options.force_track.value if options.force_track else None
|
||||||
language = options.language
|
language = options.language
|
||||||
|
|
||||||
# Extract and convert PP-StructureV3 parameters to dict
|
# Extract layout model (default to 'chinese' for best Chinese document support)
|
||||||
pp_structure_params = None
|
layout_model = options.layout_model.value if options.layout_model else "chinese"
|
||||||
if options.pp_structure_params:
|
logger.info(f"Using layout model: {layout_model}")
|
||||||
pp_structure_params = options.pp_structure_params.model_dump(exclude_none=True)
|
|
||||||
logger.info(f"Using custom PP-StructureV3 parameters: {pp_structure_params}")
|
|
||||||
|
|
||||||
# Get task details
|
# Get task details
|
||||||
task = task_service.get_task_by_id(
|
task = task_service.get_task_by_id(
|
||||||
@@ -777,7 +775,7 @@ async def start_task(
|
|||||||
status=TaskStatus.PROCESSING
|
status=TaskStatus.PROCESSING
|
||||||
)
|
)
|
||||||
|
|
||||||
# Start OCR processing in background with dual-track parameters and custom PP-StructureV3 params
|
# Start OCR processing in background with dual-track parameters and layout model
|
||||||
background_tasks.add_task(
|
background_tasks.add_task(
|
||||||
process_task_ocr,
|
process_task_ocr,
|
||||||
task_id=task_id,
|
task_id=task_id,
|
||||||
@@ -787,13 +785,11 @@ async def start_task(
|
|||||||
use_dual_track=use_dual_track,
|
use_dual_track=use_dual_track,
|
||||||
force_track=force_track,
|
force_track=force_track,
|
||||||
language=language,
|
language=language,
|
||||||
pp_structure_params=pp_structure_params
|
layout_model=layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
|
logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
|
||||||
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")
|
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}")
|
||||||
if pp_structure_params:
|
|
||||||
logger.info(f"Custom PP-StructureV3 params: {pp_structure_params}")
|
|
||||||
return task
|
return task
|
||||||
|
|
||||||
except HTTPException:
|
except HTTPException:
|
||||||
|
|||||||
@@ -24,6 +24,19 @@ class ProcessingTrackEnum(str, Enum):
|
|||||||
AUTO = "auto" # Auto-detect best track
|
AUTO = "auto" # Auto-detect best track
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutModelEnum(str, Enum):
    """Selectable layout detection models for the OCR track.

    Each member's value is the string accepted in processing options:

    - CHINESE ("chinese"): PP-DocLayout-S, optimized for Chinese documents
      such as forms, contracts and invoices (recommended).
    - DEFAULT ("default"): PubLayNet-based model, optimized for English
      academic papers.
    - CDLA ("cdla"): CDLA model, a specialized alternative for Chinese
      document layout analysis.
    """

    # PP-DocLayout-S - best for Chinese documents (recommended)
    CHINESE = "chinese"
    # PubLayNet-based - best for English documents
    DEFAULT = "default"
    # CDLA model - alternative for Chinese layout
    CDLA = "cdla"
|
||||||
|
|
||||||
|
|
||||||
class TaskCreate(BaseModel):
|
class TaskCreate(BaseModel):
|
||||||
"""Task creation request"""
|
"""Task creation request"""
|
||||||
filename: Optional[str] = Field(None, description="Original filename")
|
filename: Optional[str] = Field(None, description="Original filename")
|
||||||
@@ -132,7 +145,11 @@ class UploadResponse(BaseModel):
|
|||||||
# ===== Dual-Track Processing Schemas =====
|
# ===== Dual-Track Processing Schemas =====
|
||||||
|
|
||||||
class PPStructureV3Params(BaseModel):
|
class PPStructureV3Params(BaseModel):
|
||||||
"""PP-StructureV3 fine-tuning parameters for OCR track"""
|
"""PP-StructureV3 fine-tuning parameters for OCR track.
|
||||||
|
|
||||||
|
DEPRECATED: This class is deprecated and will be removed in a future version.
|
||||||
|
Use `layout_model` parameter in ProcessingOptions instead.
|
||||||
|
"""
|
||||||
layout_detection_threshold: Optional[float] = Field(
|
layout_detection_threshold: Optional[float] = Field(
|
||||||
None, ge=0, le=1,
|
None, ge=0, le=1,
|
||||||
description="Layout block detection score threshold (lower=more blocks, higher=high confidence only)"
|
description="Layout block detection score threshold (lower=more blocks, higher=high confidence only)"
|
||||||
@@ -172,10 +189,10 @@ class ProcessingOptions(BaseModel):
|
|||||||
include_images: bool = Field(default=True, description="Extract and save images")
|
include_images: bool = Field(default=True, description="Extract and save images")
|
||||||
confidence_threshold: Optional[float] = Field(None, ge=0, le=1, description="OCR confidence threshold")
|
confidence_threshold: Optional[float] = Field(None, ge=0, le=1, description="OCR confidence threshold")
|
||||||
|
|
||||||
# PP-StructureV3 fine-tuning parameters (OCR track only)
|
# Layout model selection (OCR track only)
|
||||||
pp_structure_params: Optional[PPStructureV3Params] = Field(
|
layout_model: Optional[LayoutModelEnum] = Field(
|
||||||
None,
|
default=LayoutModelEnum.CHINESE,
|
||||||
description="Fine-tuning parameters for PP-StructureV3 (OCR track only)"
|
description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
649
backend/app/services/gap_filling_service.py
Normal file
649
backend/app/services/gap_filling_service.py
Normal file
@@ -0,0 +1,649 @@
|
|||||||
|
"""
|
||||||
|
Gap Filling Service for OCR Track
|
||||||
|
|
||||||
|
This service detects and fills gaps in PP-StructureV3 output by supplementing
|
||||||
|
with Raw OCR text regions when significant content loss is detected.
|
||||||
|
|
||||||
|
The hybrid approach uses Raw OCR's comprehensive text detection to compensate
|
||||||
|
for PP-StructureV3's layout model limitations on certain document types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Dict, List, Optional, Tuple, Set, Any
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from app.models.unified_document import (
|
||||||
|
DocumentElement, BoundingBox, ElementType, Dimensions
|
||||||
|
)
|
||||||
|
from app.core.config import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Element types that must NOT be supplemented, so that their structural
# integrity is preserved. Elements of these types are also left out of the
# overlap comparison performed during deduplication.
SKIP_ELEMENT_TYPES: Set[ElementType] = {
    # Tabular and graphical content
    ElementType.TABLE,
    ElementType.IMAGE,
    ElementType.FIGURE,
    ElementType.CHART,
    ElementType.DIAGRAM,
    # Page furniture
    ElementType.HEADER,
    ElementType.FOOTER,
    # Specially formatted content
    ElementType.FORMULA,
    ElementType.CODE,
    # Machine-readable codes and marks
    ElementType.BARCODE,
    ElementType.QR_CODE,
    ElementType.LOGO,
    ElementType.STAMP,
    ElementType.SIGNATURE,
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TextRegion:
    """A single text region produced by raw OCR.

    Attributes:
        text: Recognized text of the region.
        bbox: Region geometry. Accepted layouts are a flat
            ``[x0, y0, x1, y1]`` box, a flat polygon
            ``[x1, y1, x2, y2, ...]`` with at least 8 values, or a nested
            polygon ``[[x1, y1], [x2, y2], ...]``.
        confidence: Confidence score reported for the region.
        page: Page number the region belongs to (defaults to 0).
    """
    text: str
    bbox: List[float]  # [x0, y0, x1, y1] or polygon format
    confidence: float
    page: int = 0

    @property
    def normalized_bbox(self) -> Tuple[float, float, float, float]:
        """Return the axis-aligned bounds of ``bbox`` as ``(x0, y0, x1, y1)``.

        Every coordinate is coerced to ``float`` regardless of the input
        layout, so callers always receive the annotated type (previously
        only the flat 4-value layout was coerced).

        Returns:
            The bounding tuple, or ``(0.0, 0.0, 0.0, 0.0)`` when ``bbox`` is
            empty or has an unrecognized layout.
        """
        empty = (0.0, 0.0, 0.0, 0.0)
        if not self.bbox:
            return empty

        if isinstance(self.bbox[0], (list, tuple)):
            # Nested polygon, e.g. [[x1, y1], [x2, y2], [x3, y3], [x4, y4]].
            # Points with fewer than two coordinates are ignored.
            points = [pt for pt in self.bbox if len(pt) >= 2]
            xs = [float(pt[0]) for pt in points]
            ys = [float(pt[1]) for pt in points]
        elif len(self.bbox) == 4:
            # Simple [x0, y0, x1, y1] box: returned as given, not reordered.
            return (float(self.bbox[0]), float(self.bbox[1]),
                    float(self.bbox[2]), float(self.bbox[3]))
        elif len(self.bbox) >= 8:
            # Flat polygon: x values at even indices, y values at odd ones.
            xs = [float(self.bbox[i]) for i in range(0, len(self.bbox), 2)]
            ys = [float(self.bbox[i]) for i in range(1, len(self.bbox), 2)]
        else:
            # Flat sequence of an unsupported length.
            return empty

        if xs and ys:
            return (min(xs), min(ys), max(xs), max(ys))
        return empty

    @property
    def center(self) -> Tuple[float, float]:
        """Return the center point ``(x, y)`` of ``normalized_bbox``."""
        x0, y0, x1, y1 = self.normalized_bbox
        return ((x0 + x1) / 2, (y0 + y1) / 2)
|
||||||
|
|
||||||
|
|
||||||
|
class GapFillingService:
|
||||||
|
"""
|
||||||
|
Service for detecting and filling gaps in PP-StructureV3 output.
|
||||||
|
|
||||||
|
This service:
|
||||||
|
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
|
||||||
|
2. Identifies uncovered raw OCR regions
|
||||||
|
3. Supplements uncovered regions as TEXT elements
|
||||||
|
4. Deduplicates against existing PP-StructureV3 TEXT elements
|
||||||
|
5. Recalculates reading order for the combined result
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    coverage_threshold: Optional[float] = None,
    iou_threshold: Optional[float] = None,
    confidence_threshold: Optional[float] = None,
    dedup_iou_threshold: Optional[float] = None,
    enabled: Optional[bool] = None
):
    """
    Set up the gap filling service.

    Each argument left as ``None`` is read from the application settings
    attribute of the same name prefixed with ``gap_filling_``; if that
    attribute is missing, the built-in default listed below is used.

    Args:
        coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
        iou_threshold: IoU threshold for coverage detection (default: 0.15)
        confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
        dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
        enabled: Whether gap filling is enabled (default: True)
    """
    def _resolve(explicit, setting_name, fallback):
        # An explicit (non-None) argument always wins; settings are only
        # consulted when the caller did not supply a value.
        if explicit is not None:
            return explicit
        return getattr(settings, setting_name, fallback)

    self.coverage_threshold = _resolve(
        coverage_threshold, 'gap_filling_coverage_threshold', 0.7
    )
    self.iou_threshold = _resolve(
        iou_threshold, 'gap_filling_iou_threshold', 0.15
    )
    self.confidence_threshold = _resolve(
        confidence_threshold, 'gap_filling_confidence_threshold', 0.3
    )
    self.dedup_iou_threshold = _resolve(
        dedup_iou_threshold, 'gap_filling_dedup_iou_threshold', 0.5
    )
    self.enabled = _resolve(enabled, 'gap_filling_enabled', True)
|
||||||
|
|
||||||
|
def should_activate(
    self,
    raw_ocr_regions: List[TextRegion],
    pp_structure_elements: List[DocumentElement]
) -> Tuple[bool, float]:
    """
    Decide whether gap filling should run.

    The coverage ratio is the fraction of raw OCR regions that
    ``_is_region_covered`` reports as covered by the PP-StructureV3
    elements. Gap filling activates when that ratio is strictly below
    ``self.coverage_threshold``.

    Args:
        raw_ocr_regions: List of raw OCR text regions
        pp_structure_elements: List of PP-StructureV3 elements

    Returns:
        Tuple of (should_activate, coverage_ratio). ``(False, 1.0)`` is
        returned when the service is disabled or there are no raw regions.
    """
    # Nothing to evaluate when disabled or when raw OCR found no regions.
    if not (self.enabled and raw_ocr_regions):
        return False, 1.0

    covered_total = sum(
        1 for region in raw_ocr_regions
        if self._is_region_covered(region, pp_structure_elements)
    )
    coverage_ratio = covered_total / len(raw_ocr_regions)
    activate = coverage_ratio < self.coverage_threshold

    if not activate:
        logger.debug(
            f"Gap filling not needed: coverage={coverage_ratio:.2%} >= threshold={self.coverage_threshold:.0%}"
        )
    else:
        logger.info(
            f"Gap filling activated: coverage={coverage_ratio:.2%} < threshold={self.coverage_threshold:.0%}, "
            f"raw_regions={len(raw_ocr_regions)}, pp_elements={len(pp_structure_elements)}"
        )

    return activate, coverage_ratio
|
||||||
|
|
||||||
|
def find_uncovered_regions(
    self,
    raw_ocr_regions: List[TextRegion],
    pp_structure_elements: List[DocumentElement]
) -> List[TextRegion]:
    """
    Collect the raw OCR regions that PP-StructureV3 did not cover.

    Regions whose confidence is below ``self.confidence_threshold`` are
    dropped first. A remaining region is kept only when
    ``_is_region_covered`` reports it as not covered, i.e. neither its
    center lies inside an element bbox nor its IoU with an element
    exceeds ``iou_threshold``.

    Args:
        raw_ocr_regions: List of raw OCR text regions
        pp_structure_elements: List of PP-StructureV3 elements

    Returns:
        List of uncovered raw OCR regions, in input order
    """
    uncovered = [
        region
        for region in raw_ocr_regions
        # Confidence filter comes first so low-confidence regions never
        # reach the coverage check.
        if not region.confidence < self.confidence_threshold
        and not self._is_region_covered(region, pp_structure_elements)
    ]

    logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
    return uncovered
|
||||||
|
|
||||||
|
def _is_region_covered(
|
||||||
|
self,
|
||||||
|
region: TextRegion,
|
||||||
|
pp_structure_elements: List[DocumentElement]
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Check if a raw OCR region is covered by any PP-StructureV3 element.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
region: Raw OCR text region
|
||||||
|
pp_structure_elements: List of PP-StructureV3 elements
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if the region is covered
|
||||||
|
"""
|
||||||
|
center_x, center_y = region.center
|
||||||
|
region_bbox = region.normalized_bbox
|
||||||
|
|
||||||
|
for element in pp_structure_elements:
|
||||||
|
elem_bbox = (
|
||||||
|
element.bbox.x0, element.bbox.y0,
|
||||||
|
element.bbox.x1, element.bbox.y1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check 1: Center point falls inside element bbox
|
||||||
|
if self._point_in_bbox(center_x, center_y, elem_bbox):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check 2: IoU exceeds threshold
|
||||||
|
iou = self._calculate_iou(region_bbox, elem_bbox)
|
||||||
|
if iou > self.iou_threshold:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def deduplicate_regions(
    self,
    uncovered_regions: List[TextRegion],
    pp_structure_elements: List[DocumentElement]
) -> List[TextRegion]:
    """
    Drop regions that strongly overlap an existing PP-StructureV3 element.

    Only elements whose type is not in ``SKIP_ELEMENT_TYPES`` take part in
    the comparison. A region is treated as a duplicate as soon as its IoU
    with one such element is greater than ``self.dedup_iou_threshold``.

    Args:
        uncovered_regions: List of uncovered raw OCR regions
        pp_structure_elements: List of PP-StructureV3 elements

    Returns:
        Deduplicated list of regions, in input order
    """
    comparable = [
        candidate for candidate in pp_structure_elements
        if candidate.type not in SKIP_ELEMENT_TYPES
    ]

    deduplicated = []
    for region in uncovered_regions:
        region_bbox = region.normalized_bbox

        for element in comparable:
            box = element.bbox
            iou = self._calculate_iou(
                region_bbox, (box.x0, box.y0, box.x1, box.y1)
            )
            if iou > self.dedup_iou_threshold:
                logger.debug(
                    f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
                )
                break
        else:
            # Loop finished without a break: no element overlapped enough.
            deduplicated.append(region)

    removed_count = len(uncovered_regions) - len(deduplicated)
    if removed_count > 0:
        logger.debug(f"Removed {removed_count} duplicate regions")

    return deduplicated
|
||||||
|
|
||||||
|
def convert_regions_to_elements(
    self,
    regions: List[TextRegion],
    page_number: int,
    start_element_id: int = 0
) -> List[DocumentElement]:
    """
    Wrap raw OCR regions as TEXT ``DocumentElement`` objects.

    Each element gets the id ``gap_fill_<page_number>_<n>`` where ``n``
    counts up from ``start_element_id``, carries the region's text and
    confidence, and is tagged with ``source = 'gap_filling'`` in its
    metadata.

    Args:
        regions: List of raw OCR regions to convert
        page_number: Page number for the elements
        start_element_id: Starting ID counter for elements

    Returns:
        List of DocumentElement objects, one per region in input order
    """
    def _build(idx: int, region: TextRegion) -> DocumentElement:
        x0, y0, x1, y1 = region.normalized_bbox
        return DocumentElement(
            element_id=f"gap_fill_{page_number}_{start_element_id + idx}",
            type=ElementType.TEXT,
            content=region.text,
            bbox=BoundingBox(x0=x0, y0=y0, x1=x1, y1=y1),
            confidence=region.confidence,
            metadata={
                'source': 'gap_filling',
                'original_confidence': region.confidence
            }
        )

    return [_build(idx, region) for idx, region in enumerate(regions)]
|
||||||
|
|
||||||
|
def recalculate_reading_order(
    self,
    elements: List[DocumentElement]
) -> List[int]:
    """
    Compute a position-based reading order.

    Elements are ordered top to bottom by ``bbox.y0`` and, within the same
    ``y0``, left to right by ``bbox.x0``. Elements with identical keys keep
    their original relative order.

    Args:
        elements: List of DocumentElement objects

    Returns:
        List of element indices in reading order
    """
    # One (y0, x0) key per element, addressed by the element's index.
    positions = [(item.bbox.y0, item.bbox.x0) for item in elements]

    # sorted() is stable, so ties fall back to the original index order.
    return sorted(range(len(positions)), key=positions.__getitem__)
|
||||||
|
|
||||||
|
def merge_adjacent_regions(
    self,
    regions: List[TextRegion],
    max_horizontal_gap: float = 20.0,
    max_vertical_gap: float = 5.0
) -> List[TextRegion]:
    """
    Join fragmented neighbouring regions that sit on the same line.

    Regions are sorted by (y0, x0) and walked in that order. The next
    region is folded into the current one when the distance between their
    vertical centers is below ``max_vertical_gap`` and the horizontal gap
    from the current right edge to the next left edge lies in
    ``[0, max_horizontal_gap]``. A merged region joins the texts with a
    single space, spans both boxes, takes the lower of the two
    confidences and keeps the current region's page.

    Args:
        regions: List of raw OCR regions
        max_horizontal_gap: Maximum horizontal gap to merge (pixels)
        max_vertical_gap: Maximum vertical gap to merge (pixels)

    Returns:
        List of merged regions (the input itself when it is empty)
    """
    if not regions:
        return regions

    def _sort_key(item):
        box = item.normalized_bbox
        return (box[1], box[0])

    ordered = sorted(regions, key=_sort_key)

    merged = []
    current = ordered[0]

    for candidate in ordered[1:]:
        cur_x0, cur_y0, cur_x1, cur_y1 = current.normalized_bbox
        nxt_x0, nxt_y0, nxt_x1, nxt_y1 = candidate.normalized_bbox

        # "Same line" is judged by the distance between vertical centers.
        center_offset = abs((cur_y0 + cur_y1) / 2 - (nxt_y0 + nxt_y1) / 2)
        gap = nxt_x0 - cur_x1

        mergeable = (center_offset < max_vertical_gap and
                     0 <= gap <= max_horizontal_gap)
        if not mergeable:
            merged.append(current)
            current = candidate
            continue

        # Grow the current region; it stays open for further merging.
        current = TextRegion(
            text=current.text + " " + candidate.text,
            bbox=[
                min(cur_x0, nxt_x0),
                min(cur_y0, nxt_y0),
                max(cur_x1, nxt_x1),
                max(cur_y1, nxt_y1)
            ],
            confidence=min(current.confidence, candidate.confidence),
            page=current.page
        )

    merged.append(current)

    if len(merged) < len(regions):
        logger.debug(f"Merged {len(regions)} regions into {len(merged)}")

    return merged
|
||||||
|
|
||||||
|
def fill_gaps(
    self,
    raw_ocr_regions: List[Dict[str, Any]],
    pp_structure_elements: List[DocumentElement],
    page_number: int,
    ocr_dimensions: Optional[Dict[str, Any]] = None,
    pp_dimensions: Optional[Dimensions] = None
) -> Tuple[List[DocumentElement], Dict[str, Any]]:
    """
    Main entry point: detect gaps and fill them with raw OCR regions.

    Pipeline: convert the raw dicts to text regions, check whether gap
    filling should activate, find the uncovered regions, deduplicate
    them, and convert what remains into elements. Each stage that leaves
    nothing to do returns early with an empty element list. Merging of
    adjacent regions is not applied in this pipeline.

    Args:
        raw_ocr_regions: Raw OCR results (list of dicts with text, bbox, confidence)
        pp_structure_elements: PP-StructureV3 elements
        page_number: Current page number
        ocr_dimensions: OCR image dimensions for coordinate alignment
        pp_dimensions: PP-Structure dimensions for coordinate alignment

    Returns:
        Tuple of (supplemented_elements, statistics). ``statistics`` has
        the keys ``enabled``, ``activated``, ``coverage_ratio``,
        ``raw_ocr_count``, ``pp_structure_count``, ``uncovered_count``,
        ``deduplicated_count`` (regions remaining after deduplication)
        and ``supplemented_count``.
    """
    statistics = dict(
        enabled=self.enabled,
        activated=False,
        coverage_ratio=1.0,
        raw_ocr_count=len(raw_ocr_regions),
        pp_structure_count=len(pp_structure_elements),
        uncovered_count=0,
        deduplicated_count=0,
        supplemented_count=0,
    )

    if not self.enabled:
        logger.debug("Gap filling is disabled")
        return [], statistics

    # Stage 1: raw dicts -> text regions (with coordinate alignment)
    text_regions = self._convert_raw_ocr_regions(
        raw_ocr_regions, page_number, ocr_dimensions, pp_dimensions
    )
    if not text_regions:
        logger.debug("No valid text regions to process")
        return [], statistics

    # Stage 2: activation check based on coverage
    is_active, coverage_ratio = self.should_activate(
        text_regions, pp_structure_elements
    )
    statistics.update(coverage_ratio=coverage_ratio, activated=is_active)
    if not is_active:
        return [], statistics

    # Stage 3: regions PP-StructureV3 missed
    missed = self.find_uncovered_regions(text_regions, pp_structure_elements)
    statistics['uncovered_count'] = len(missed)
    if not missed:
        logger.debug("No uncovered regions found")
        return [], statistics

    # Stage 4: drop regions duplicating existing elements
    unique_regions = self.deduplicate_regions(missed, pp_structure_elements)
    statistics['deduplicated_count'] = len(unique_regions)
    if not unique_regions:
        logger.debug("All uncovered regions were duplicates")
        return [], statistics

    # Stage 5: build elements; ids continue after the existing element count
    supplemented = self.convert_regions_to_elements(
        unique_regions, page_number, len(pp_structure_elements)
    )
    statistics['supplemented_count'] = len(supplemented)

    logger.info(
        f"Gap filling complete: supplemented {len(supplemented)} elements "
        f"(coverage: {coverage_ratio:.2%} -> estimated {(coverage_ratio + len(supplemented)/len(text_regions) if text_regions else 0):.2%})"
    )

    return supplemented, statistics
|
||||||
|
|
||||||
|
def _convert_raw_ocr_regions(
    self,
    raw_regions: List[Dict[str, Any]],
    page_number: int,
    ocr_dimensions: Optional[Dict[str, Any]] = None,
    pp_dimensions: Optional[Dimensions] = None
) -> List[TextRegion]:
    """
    Build TextRegion objects from raw OCR region dictionaries.

    Regions with empty or whitespace-only text are dropped, as are regions
    whose 'bbox' value is neither a dict nor a list/tuple. When both
    dimension arguments are given and differ in size, bbox coordinates are
    rescaled from the OCR image space into the PP-Structure space.

    Args:
        raw_regions: Raw OCR region dicts (keys read: 'text', 'confidence', 'bbox')
        page_number: Page number stored on every produced TextRegion
        ocr_dimensions: Dict carrying the OCR image 'width' and 'height'
        pp_dimensions: PP-Structure dimensions (its .width / .height are read)

    Returns:
        List of TextRegion objects, in input order
    """
    # Work out the per-axis scale; 1.0 means "leave coordinates untouched".
    factor_x = 1.0
    factor_y = 1.0
    if ocr_dimensions and pp_dimensions:
        source_width = ocr_dimensions.get('width', 0)
        source_height = ocr_dimensions.get('height', 0)

        if source_width > 0 and pp_dimensions.width > 0:
            factor_x = pp_dimensions.width / source_width
        if source_height > 0 and pp_dimensions.height > 0:
            factor_y = pp_dimensions.height / source_height

    # The factors are fixed for the whole page, so decide once.
    needs_scaling = factor_x != 1.0 or factor_y != 1.0
    if needs_scaling:
        logger.debug(f"Coordinate scaling: x={factor_x:.3f}, y={factor_y:.3f}")

    converted = []
    for raw in raw_regions:
        content = raw.get('text', '')
        if not (content and content.strip()):
            continue

        score = raw.get('confidence', 0.0)
        box_value = raw.get('bbox', [])

        # Bring the bbox into list form.
        if isinstance(box_value, dict):
            # Dict format: {x_min, y_min, x_max, y_max}
            coords = [
                box_value.get(key, 0)
                for key in ('x_min', 'y_min', 'x_max', 'y_max')
            ]
        elif isinstance(box_value, (list, tuple)):
            coords = list(box_value)
        else:
            continue

        if needs_scaling:
            if coords and isinstance(coords[0], (list, tuple)):
                # Nested points [[x1, y1], [x2, y2], ...]; points with fewer
                # than two values are dropped here.
                coords = [
                    [point[0] * factor_x, point[1] * factor_y]
                    for point in coords
                    if len(point) >= 2
                ]
            elif len(coords) == 4 or len(coords) >= 8:
                # Flat [x0, y0, x1, y1] box or flat polygon [x1, y1, x2, y2, ...]:
                # even positions are x values, odd positions are y values.
                coords = [
                    value * (factor_x if index % 2 == 0 else factor_y)
                    for index, value in enumerate(coords)
                ]

        converted.append(
            TextRegion(
                text=content,
                bbox=coords,
                confidence=score,
                page=page_number
            )
        )

    return converted
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _point_in_bbox(
|
||||||
|
x: float, y: float,
|
||||||
|
bbox: Tuple[float, float, float, float]
|
||||||
|
) -> bool:
|
||||||
|
"""Check if point (x, y) is inside bbox (x0, y0, x1, y1)."""
|
||||||
|
x0, y0, x1, y1 = bbox
|
||||||
|
return x0 <= x <= x1 and y0 <= y <= y1
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _calculate_iou(
|
||||||
|
bbox1: Tuple[float, float, float, float],
|
||||||
|
bbox2: Tuple[float, float, float, float]
|
||||||
|
) -> float:
|
||||||
|
"""
|
||||||
|
Calculate Intersection over Union (IoU) of two bboxes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
bbox1: First bbox (x0, y0, x1, y1)
|
||||||
|
bbox2: Second bbox (x0, y0, x1, y1)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
IoU value between 0 and 1
|
||||||
|
"""
|
||||||
|
# Calculate intersection
|
||||||
|
x0 = max(bbox1[0], bbox2[0])
|
||||||
|
y0 = max(bbox1[1], bbox2[1])
|
||||||
|
x1 = min(bbox1[2], bbox2[2])
|
||||||
|
y1 = min(bbox1[3], bbox2[3])
|
||||||
|
|
||||||
|
if x1 <= x0 or y1 <= y0:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
intersection = (x1 - x0) * (y1 - y0)
|
||||||
|
|
||||||
|
# Calculate union
|
||||||
|
area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
|
||||||
|
area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
|
||||||
|
union = area1 + area2 - intersection
|
||||||
|
|
||||||
|
if union <= 0:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
return intersection / union
|
||||||
@@ -46,6 +46,19 @@ except ImportError as e:
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Sentinel value for "use PubLayNet default" - explicitly NO model specification
|
||||||
|
_USE_PUBLAYNET_DEFAULT = "__USE_PUBLAYNET_DEFAULT__"
|
||||||
|
|
||||||
|
# Layout model mapping: user-friendly names to actual model names
|
||||||
|
# - "chinese": PP-DocLayout-S - Best for Chinese documents (forms, contracts, invoices)
|
||||||
|
# - "default": PubLayNet-based default model - Best for English documents
|
||||||
|
# - "cdla": picodet_lcnet_x1_0_fgd_layout_cdla - Alternative for Chinese layout
|
||||||
|
LAYOUT_MODEL_MAPPING = {
|
||||||
|
"chinese": "PP-DocLayout-S",
|
||||||
|
"default": _USE_PUBLAYNET_DEFAULT, # Uses default PubLayNet-based model (no custom model)
|
||||||
|
"cdla": "picodet_lcnet_x1_0_fgd_layout_cdla",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class OCRService:
|
class OCRService:
|
||||||
"""
|
"""
|
||||||
@@ -436,77 +449,45 @@ class OCRService:
|
|||||||
|
|
||||||
return self.ocr_engines[lang]
|
return self.ocr_engines[lang]
|
||||||
|
|
||||||
def _ensure_structure_engine(self, custom_params: Optional[Dict[str, any]] = None) -> PPStructureV3:
|
def _ensure_structure_engine(self, layout_model: Optional[str] = None) -> PPStructureV3:
|
||||||
"""
|
"""
|
||||||
Get or create PP-Structure engine for layout analysis with GPU support.
|
Get or create PP-Structure engine for layout analysis with GPU support.
|
||||||
Supports custom parameters that override default settings.
|
Supports layout model selection for different document types.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
custom_params: Optional dictionary of custom PP-StructureV3 parameters.
|
layout_model: Layout detection model selection:
|
||||||
If provided, creates a new engine instance (not cached).
|
- "chinese": PP-DocLayout-S (best for Chinese documents)
|
||||||
Supported keys: layout_detection_threshold, layout_nms_threshold,
|
- "default": PubLayNet-based (best for English documents)
|
||||||
layout_merge_bboxes_mode, layout_unclip_ratio, text_det_thresh,
|
- "cdla": CDLA model (alternative for Chinese layout)
|
||||||
text_det_box_thresh, text_det_unclip_ratio
|
- None: Use config default
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
PPStructure engine instance
|
PPStructure engine instance
|
||||||
"""
|
"""
|
||||||
# If custom params provided, create a new engine instance (don't use cache)
|
# Resolve layout model name from user-friendly name
|
||||||
if custom_params:
|
resolved_model_name = None
|
||||||
logger.info(f"Creating PP-StructureV3 engine with custom parameters (GPU: {self.use_gpu})")
|
use_publaynet_default = False # Flag to explicitly use PubLayNet default (no model param)
|
||||||
logger.info(f"Custom params: {custom_params}")
|
|
||||||
|
|
||||||
try:
|
if layout_model:
|
||||||
# Base configuration from settings
|
resolved_model_name = LAYOUT_MODEL_MAPPING.get(layout_model)
|
||||||
use_chart = settings.enable_chart_recognition
|
if layout_model not in LAYOUT_MODEL_MAPPING:
|
||||||
use_formula = settings.enable_formula_recognition
|
logger.warning(f"Unknown layout model '{layout_model}', using config default")
|
||||||
use_table = settings.enable_table_recognition
|
resolved_model_name = settings.layout_detection_model_name
|
||||||
|
elif resolved_model_name == _USE_PUBLAYNET_DEFAULT:
|
||||||
|
# User explicitly selected "default" - use PubLayNet without custom model
|
||||||
|
use_publaynet_default = True
|
||||||
|
resolved_model_name = None
|
||||||
|
logger.info(f"Using layout model: {layout_model} -> PubLayNet default (no custom model)")
|
||||||
|
else:
|
||||||
|
logger.info(f"Using layout model: {layout_model} -> {resolved_model_name}")
|
||||||
|
|
||||||
# Parameter priority: custom > settings default
|
# Check if we need to recreate the engine due to different model
|
||||||
layout_threshold = custom_params.get('layout_detection_threshold', settings.layout_detection_threshold)
|
current_model = getattr(self, '_current_layout_model', None)
|
||||||
layout_nms = custom_params.get('layout_nms_threshold', settings.layout_nms_threshold)
|
if self.structure_engine is not None and layout_model and layout_model != current_model:
|
||||||
layout_merge = custom_params.get('layout_merge_bboxes_mode', settings.layout_merge_mode)
|
logger.info(f"Layout model changed from {current_model} to {layout_model}, recreating engine")
|
||||||
layout_unclip = custom_params.get('layout_unclip_ratio', settings.layout_unclip_ratio)
|
self.structure_engine = None # Force recreation
|
||||||
text_thresh = custom_params.get('text_det_thresh', settings.text_det_thresh)
|
|
||||||
text_box_thresh = custom_params.get('text_det_box_thresh', settings.text_det_box_thresh)
|
|
||||||
text_unclip = custom_params.get('text_det_unclip_ratio', settings.text_det_unclip_ratio)
|
|
||||||
|
|
||||||
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
# Use cached engine or create new one
|
||||||
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
|
|
||||||
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
|
|
||||||
|
|
||||||
# Create temporary engine with custom params (not cached)
|
|
||||||
custom_engine = PPStructureV3(
|
|
||||||
use_doc_orientation_classify=False,
|
|
||||||
use_doc_unwarping=False,
|
|
||||||
use_textline_orientation=False,
|
|
||||||
use_table_recognition=use_table,
|
|
||||||
use_formula_recognition=use_formula,
|
|
||||||
use_chart_recognition=use_chart,
|
|
||||||
layout_threshold=layout_threshold,
|
|
||||||
layout_nms=layout_nms,
|
|
||||||
layout_unclip_ratio=layout_unclip,
|
|
||||||
layout_merge_bboxes_mode=layout_merge,
|
|
||||||
text_det_thresh=text_thresh,
|
|
||||||
text_det_box_thresh=text_box_thresh,
|
|
||||||
text_det_unclip_ratio=text_unclip,
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(f"PP-StructureV3 engine with custom params ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
|
|
||||||
|
|
||||||
# Check GPU memory after loading
|
|
||||||
if self.use_gpu and settings.enable_memory_optimization:
|
|
||||||
self._check_gpu_memory_usage()
|
|
||||||
|
|
||||||
return custom_engine
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to create PP-StructureV3 engine with custom params: {e}")
|
|
||||||
# Fall back to default cached engine
|
|
||||||
logger.warning("Falling back to default cached engine")
|
|
||||||
custom_params = None # Clear custom params to use cached engine
|
|
||||||
|
|
||||||
# Use cached default engine
|
|
||||||
if self.structure_engine is None:
|
if self.structure_engine is None:
|
||||||
logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
|
logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
|
||||||
|
|
||||||
@@ -524,28 +505,51 @@ class OCRService:
|
|||||||
text_box_thresh = settings.text_det_box_thresh
|
text_box_thresh = settings.text_det_box_thresh
|
||||||
text_unclip = settings.text_det_unclip_ratio
|
text_unclip = settings.text_det_unclip_ratio
|
||||||
|
|
||||||
|
# Layout model configuration:
|
||||||
|
# - If use_publaynet_default: don't specify any model (use PubLayNet default)
|
||||||
|
# - If resolved_model_name: use the specified model
|
||||||
|
# - Otherwise: use config default
|
||||||
|
if use_publaynet_default:
|
||||||
|
layout_model_name = None # Explicitly no model = PubLayNet default
|
||||||
|
elif resolved_model_name:
|
||||||
|
layout_model_name = resolved_model_name
|
||||||
|
else:
|
||||||
|
layout_model_name = settings.layout_detection_model_name
|
||||||
|
layout_model_dir = settings.layout_detection_model_dir
|
||||||
|
|
||||||
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
|
||||||
|
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
|
||||||
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
|
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
|
||||||
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
|
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
|
||||||
|
|
||||||
self.structure_engine = PPStructureV3(
|
# Build PPStructureV3 kwargs
|
||||||
use_doc_orientation_classify=False,
|
pp_kwargs = {
|
||||||
use_doc_unwarping=False,
|
'use_doc_orientation_classify': False,
|
||||||
use_textline_orientation=False,
|
'use_doc_unwarping': False,
|
||||||
use_table_recognition=use_table,
|
'use_textline_orientation': False,
|
||||||
use_formula_recognition=use_formula,
|
'use_table_recognition': use_table,
|
||||||
use_chart_recognition=use_chart,
|
'use_formula_recognition': use_formula,
|
||||||
layout_threshold=layout_threshold,
|
'use_chart_recognition': use_chart,
|
||||||
layout_nms=layout_nms,
|
'layout_threshold': layout_threshold,
|
||||||
layout_unclip_ratio=layout_unclip,
|
'layout_nms': layout_nms,
|
||||||
layout_merge_bboxes_mode=layout_merge, # Use 'small' to minimize merging
|
'layout_unclip_ratio': layout_unclip,
|
||||||
text_det_thresh=text_thresh,
|
'layout_merge_bboxes_mode': layout_merge,
|
||||||
text_det_box_thresh=text_box_thresh,
|
'text_det_thresh': text_thresh,
|
||||||
text_det_unclip_ratio=text_unclip,
|
'text_det_box_thresh': text_box_thresh,
|
||||||
)
|
'text_det_unclip_ratio': text_unclip,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add layout model configuration if specified
|
||||||
|
if layout_model_name:
|
||||||
|
pp_kwargs['layout_detection_model_name'] = layout_model_name
|
||||||
|
if layout_model_dir:
|
||||||
|
pp_kwargs['layout_detection_model_dir'] = layout_model_dir
|
||||||
|
|
||||||
|
self.structure_engine = PPStructureV3(**pp_kwargs)
|
||||||
|
|
||||||
# Track model loading for cache management
|
# Track model loading for cache management
|
||||||
self._model_last_used['structure'] = datetime.now()
|
self._model_last_used['structure'] = datetime.now()
|
||||||
|
self._current_layout_model = layout_model # Track current model for recreation check
|
||||||
|
|
||||||
logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
|
logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
|
||||||
|
|
||||||
@@ -565,17 +569,27 @@ class OCRService:
|
|||||||
use_formula = settings.enable_formula_recognition
|
use_formula = settings.enable_formula_recognition
|
||||||
use_table = settings.enable_table_recognition
|
use_table = settings.enable_table_recognition
|
||||||
layout_threshold = settings.layout_detection_threshold
|
layout_threshold = settings.layout_detection_threshold
|
||||||
|
layout_model_name = settings.layout_detection_model_name
|
||||||
|
layout_model_dir = settings.layout_detection_model_dir
|
||||||
|
|
||||||
self.structure_engine = PPStructureV3(
|
# Build CPU fallback kwargs
|
||||||
use_doc_orientation_classify=False,
|
cpu_kwargs = {
|
||||||
use_doc_unwarping=False,
|
'use_doc_orientation_classify': False,
|
||||||
use_textline_orientation=False,
|
'use_doc_unwarping': False,
|
||||||
use_table_recognition=use_table,
|
'use_textline_orientation': False,
|
||||||
use_formula_recognition=use_formula,
|
'use_table_recognition': use_table,
|
||||||
use_chart_recognition=use_chart,
|
'use_formula_recognition': use_formula,
|
||||||
layout_threshold=layout_threshold,
|
'use_chart_recognition': use_chart,
|
||||||
)
|
'layout_threshold': layout_threshold,
|
||||||
logger.info("PP-StructureV3 engine ready (CPU mode - fallback)")
|
}
|
||||||
|
if layout_model_name:
|
||||||
|
cpu_kwargs['layout_detection_model_name'] = layout_model_name
|
||||||
|
if layout_model_dir:
|
||||||
|
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
|
||||||
|
|
||||||
|
self.structure_engine = PPStructureV3(**cpu_kwargs)
|
||||||
|
self._current_layout_model = layout_model # Track current model for recreation check
|
||||||
|
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
@@ -813,7 +827,7 @@ class OCRService:
|
|||||||
confidence_threshold: Optional[float] = None,
|
confidence_threshold: Optional[float] = None,
|
||||||
output_dir: Optional[Path] = None,
|
output_dir: Optional[Path] = None,
|
||||||
current_page: int = 0,
|
current_page: int = 0,
|
||||||
pp_structure_params: Optional[Dict[str, any]] = None
|
layout_model: Optional[str] = None
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Process single image with OCR and layout analysis
|
Process single image with OCR and layout analysis
|
||||||
@@ -825,7 +839,7 @@ class OCRService:
|
|||||||
confidence_threshold: Minimum confidence threshold (uses default if None)
|
confidence_threshold: Minimum confidence threshold (uses default if None)
|
||||||
output_dir: Optional output directory for saving extracted images
|
output_dir: Optional output directory for saving extracted images
|
||||||
current_page: Current page number (0-based) for multi-page documents
|
current_page: Current page number (0-based) for multi-page documents
|
||||||
pp_structure_params: Optional custom PP-StructureV3 parameters
|
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with OCR results and metadata
|
Dictionary with OCR results and metadata
|
||||||
@@ -894,7 +908,7 @@ class OCRService:
|
|||||||
confidence_threshold=confidence_threshold,
|
confidence_threshold=confidence_threshold,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
current_page=page_num - 1, # Convert to 0-based page number for layout data
|
current_page=page_num - 1, # Convert to 0-based page number for layout data
|
||||||
pp_structure_params=pp_structure_params
|
layout_model=layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
# Accumulate results
|
# Accumulate results
|
||||||
@@ -1040,7 +1054,7 @@ class OCRService:
|
|||||||
image_path,
|
image_path,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
current_page=current_page,
|
current_page=current_page,
|
||||||
pp_structure_params=pp_structure_params
|
layout_model=layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate Markdown
|
# Generate Markdown
|
||||||
@@ -1078,6 +1092,38 @@ class OCRService:
|
|||||||
'height': ocr_height
|
'height': ocr_height
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
# Generate PP-StructureV3 debug outputs if enabled
|
||||||
|
if settings.pp_structure_debug_enabled and output_dir:
|
||||||
|
try:
|
||||||
|
from app.services.pp_structure_debug import PPStructureDebug
|
||||||
|
debug_service = PPStructureDebug(output_dir)
|
||||||
|
|
||||||
|
# Save raw results as JSON
|
||||||
|
debug_service.save_raw_results(
|
||||||
|
pp_structure_results={
|
||||||
|
'elements': layout_data.get('elements', []),
|
||||||
|
'total_elements': layout_data.get('total_elements', 0),
|
||||||
|
'element_types': layout_data.get('element_types', {}),
|
||||||
|
'reading_order': layout_data.get('reading_order', []),
|
||||||
|
'enhanced': True,
|
||||||
|
'has_parsing_res_list': True
|
||||||
|
},
|
||||||
|
raw_ocr_regions=text_regions,
|
||||||
|
filename_prefix=image_path.stem
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generate visualization if enabled
|
||||||
|
if settings.pp_structure_debug_visualization:
|
||||||
|
debug_service.generate_visualization(
|
||||||
|
image_path=image_path,
|
||||||
|
pp_structure_elements=layout_data.get('elements', []),
|
||||||
|
raw_ocr_regions=text_regions,
|
||||||
|
filename_prefix=image_path.stem
|
||||||
|
)
|
||||||
|
logger.info(f"Generated PP-StructureV3 debug outputs for {image_path.name}")
|
||||||
|
except Exception as debug_error:
|
||||||
|
logger.warning(f"Failed to generate debug outputs: {debug_error}")
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"OCR completed: {image_path.name} - "
|
f"OCR completed: {image_path.name} - "
|
||||||
f"{len(text_regions)} regions, "
|
f"{len(text_regions)} regions, "
|
||||||
@@ -1164,7 +1210,7 @@ class OCRService:
|
|||||||
image_path: Path,
|
image_path: Path,
|
||||||
output_dir: Optional[Path] = None,
|
output_dir: Optional[Path] = None,
|
||||||
current_page: int = 0,
|
current_page: int = 0,
|
||||||
pp_structure_params: Optional[Dict[str, any]] = None
|
layout_model: Optional[str] = None
|
||||||
) -> Tuple[Optional[Dict], List[Dict]]:
|
) -> Tuple[Optional[Dict], List[Dict]]:
|
||||||
"""
|
"""
|
||||||
Analyze document layout using PP-StructureV3 with enhanced element extraction
|
Analyze document layout using PP-StructureV3 with enhanced element extraction
|
||||||
@@ -1173,7 +1219,7 @@ class OCRService:
|
|||||||
image_path: Path to image file
|
image_path: Path to image file
|
||||||
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
|
||||||
current_page: Current page number (0-based) for multi-page documents
|
current_page: Current page number (0-based) for multi-page documents
|
||||||
pp_structure_params: Optional custom PP-StructureV3 parameters
|
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (layout_data, images_metadata)
|
Tuple of (layout_data, images_metadata)
|
||||||
@@ -1191,7 +1237,7 @@ class OCRService:
|
|||||||
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
|
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
|
||||||
)
|
)
|
||||||
|
|
||||||
structure_engine = self._ensure_structure_engine(pp_structure_params)
|
structure_engine = self._ensure_structure_engine(layout_model)
|
||||||
|
|
||||||
# Try enhanced processing first
|
# Try enhanced processing first
|
||||||
try:
|
try:
|
||||||
@@ -1425,7 +1471,7 @@ class OCRService:
|
|||||||
confidence_threshold: Optional[float] = None,
|
confidence_threshold: Optional[float] = None,
|
||||||
output_dir: Optional[Path] = None,
|
output_dir: Optional[Path] = None,
|
||||||
force_track: Optional[str] = None,
|
force_track: Optional[str] = None,
|
||||||
pp_structure_params: Optional[Dict[str, any]] = None
|
layout_model: Optional[str] = None
|
||||||
) -> Union[UnifiedDocument, Dict]:
|
) -> Union[UnifiedDocument, Dict]:
|
||||||
"""
|
"""
|
||||||
Process document using dual-track approach.
|
Process document using dual-track approach.
|
||||||
@@ -1437,7 +1483,7 @@ class OCRService:
|
|||||||
confidence_threshold: Minimum confidence threshold
|
confidence_threshold: Minimum confidence threshold
|
||||||
output_dir: Optional output directory for extracted images
|
output_dir: Optional output directory for extracted images
|
||||||
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
|
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
|
||||||
pp_structure_params: Optional custom PP-StructureV3 parameters (used for OCR track only)
|
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
UnifiedDocument if dual-track is enabled, Dict otherwise
|
UnifiedDocument if dual-track is enabled, Dict otherwise
|
||||||
@@ -1445,7 +1491,7 @@ class OCRService:
|
|||||||
if not self.dual_track_enabled:
|
if not self.dual_track_enabled:
|
||||||
# Fallback to traditional OCR processing
|
# Fallback to traditional OCR processing
|
||||||
return self.process_file_traditional(
|
return self.process_file_traditional(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
|
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
start_time = datetime.now()
|
start_time = datetime.now()
|
||||||
@@ -1517,7 +1563,7 @@ class OCRService:
|
|||||||
ocr_result = self.process_file_traditional(
|
ocr_result = self.process_file_traditional(
|
||||||
actual_file_path, lang, detect_layout=True,
|
actual_file_path, lang, detect_layout=True,
|
||||||
confidence_threshold=confidence_threshold,
|
confidence_threshold=confidence_threshold,
|
||||||
output_dir=output_dir, pp_structure_params=pp_structure_params
|
output_dir=output_dir, layout_model=layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert OCR result to extract images
|
# Convert OCR result to extract images
|
||||||
@@ -1550,7 +1596,7 @@ class OCRService:
|
|||||||
# Use OCR for scanned documents, images, etc.
|
# Use OCR for scanned documents, images, etc.
|
||||||
logger.info("Using OCR track (PaddleOCR)")
|
logger.info("Using OCR track (PaddleOCR)")
|
||||||
ocr_result = self.process_file_traditional(
|
ocr_result = self.process_file_traditional(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
|
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
# Convert OCR result to UnifiedDocument using the converter
|
# Convert OCR result to UnifiedDocument using the converter
|
||||||
@@ -1580,7 +1626,7 @@ class OCRService:
|
|||||||
logger.error(f"Error in dual-track processing: {e}")
|
logger.error(f"Error in dual-track processing: {e}")
|
||||||
# Fallback to traditional OCR
|
# Fallback to traditional OCR
|
||||||
return self.process_file_traditional(
|
return self.process_file_traditional(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
|
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
def _merge_ocr_images_into_direct(
|
def _merge_ocr_images_into_direct(
|
||||||
@@ -1659,7 +1705,7 @@ class OCRService:
|
|||||||
detect_layout: bool = True,
|
detect_layout: bool = True,
|
||||||
confidence_threshold: Optional[float] = None,
|
confidence_threshold: Optional[float] = None,
|
||||||
output_dir: Optional[Path] = None,
|
output_dir: Optional[Path] = None,
|
||||||
pp_structure_params: Optional[Dict[str, any]] = None
|
layout_model: Optional[str] = None
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Traditional OCR processing (legacy method).
|
Traditional OCR processing (legacy method).
|
||||||
@@ -1670,7 +1716,7 @@ class OCRService:
|
|||||||
detect_layout: Whether to perform layout analysis
|
detect_layout: Whether to perform layout analysis
|
||||||
confidence_threshold: Minimum confidence threshold
|
confidence_threshold: Minimum confidence threshold
|
||||||
output_dir: Optional output directory
|
output_dir: Optional output directory
|
||||||
pp_structure_params: Optional custom PP-StructureV3 parameters
|
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with OCR results in legacy format
|
Dictionary with OCR results in legacy format
|
||||||
@@ -1683,7 +1729,7 @@ class OCRService:
|
|||||||
all_results = []
|
all_results = []
|
||||||
for i, image_path in enumerate(image_paths):
|
for i, image_path in enumerate(image_paths):
|
||||||
result = self.process_image(
|
result = self.process_image(
|
||||||
image_path, lang, detect_layout, confidence_threshold, output_dir, i, pp_structure_params
|
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
|
||||||
)
|
)
|
||||||
all_results.append(result)
|
all_results.append(result)
|
||||||
|
|
||||||
@@ -1699,7 +1745,7 @@ class OCRService:
|
|||||||
else:
|
else:
|
||||||
# Single image or other file
|
# Single image or other file
|
||||||
return self.process_image(
|
return self.process_image(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, pp_structure_params
|
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
def _combine_results(self, results: List[Dict]) -> Dict:
|
def _combine_results(self, results: List[Dict]) -> Dict:
|
||||||
@@ -1784,7 +1830,7 @@ class OCRService:
|
|||||||
output_dir: Optional[Path] = None,
|
output_dir: Optional[Path] = None,
|
||||||
use_dual_track: bool = True,
|
use_dual_track: bool = True,
|
||||||
force_track: Optional[str] = None,
|
force_track: Optional[str] = None,
|
||||||
pp_structure_params: Optional[Dict[str, any]] = None
|
layout_model: Optional[str] = None
|
||||||
) -> Union[UnifiedDocument, Dict]:
|
) -> Union[UnifiedDocument, Dict]:
|
||||||
"""
|
"""
|
||||||
Main processing method with dual-track support.
|
Main processing method with dual-track support.
|
||||||
@@ -1797,7 +1843,7 @@ class OCRService:
|
|||||||
output_dir: Optional output directory
|
output_dir: Optional output directory
|
||||||
use_dual_track: Whether to use dual-track processing (default True)
|
use_dual_track: Whether to use dual-track processing (default True)
|
||||||
force_track: Force specific track ("ocr" or "direct")
|
force_track: Force specific track ("ocr" or "direct")
|
||||||
pp_structure_params: Optional custom PP-StructureV3 parameters (used for OCR track only)
|
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
UnifiedDocument if dual-track is enabled and use_dual_track=True,
|
UnifiedDocument if dual-track is enabled and use_dual_track=True,
|
||||||
@@ -1809,12 +1855,12 @@ class OCRService:
|
|||||||
if (use_dual_track or force_track) and self.dual_track_enabled:
|
if (use_dual_track or force_track) and self.dual_track_enabled:
|
||||||
# Use dual-track processing (or forced track)
|
# Use dual-track processing (or forced track)
|
||||||
return self.process_with_dual_track(
|
return self.process_with_dual_track(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, pp_structure_params
|
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Use traditional OCR processing (no force_track support)
|
# Use traditional OCR processing (no force_track support)
|
||||||
return self.process_file_traditional(
|
return self.process_file_traditional(
|
||||||
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
|
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
|
||||||
)
|
)
|
||||||
|
|
||||||
def process_legacy(
|
def process_legacy(
|
||||||
|
|||||||
@@ -3,6 +3,9 @@ OCR to UnifiedDocument Converter
|
|||||||
|
|
||||||
Converts PP-StructureV3 OCR results to UnifiedDocument format, preserving
|
Converts PP-StructureV3 OCR results to UnifiedDocument format, preserving
|
||||||
all structure information and metadata.
|
all structure information and metadata.
|
||||||
|
|
||||||
|
Includes gap filling support to supplement PP-StructureV3 output with raw OCR
|
||||||
|
regions when significant content loss is detected.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@@ -16,10 +19,165 @@ from app.models.unified_document import (
|
|||||||
BoundingBox, StyleInfo, TableData, ElementType,
|
BoundingBox, StyleInfo, TableData, ElementType,
|
||||||
ProcessingTrack, TableCell, Dimensions
|
ProcessingTrack, TableCell, Dimensions
|
||||||
)
|
)
|
||||||
|
from app.services.gap_filling_service import GapFillingService
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _cell_attr(cell: Any, name: str, default: Any) -> Any:
    """Read field *name* from a dict-style or attribute-style table cell."""
    if isinstance(cell, dict):
        return cell.get(name, default)
    return getattr(cell, name, default)


def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Remove empty columns from a table dictionary.

    A column is empty when every cell covering it (including cells that reach
    it through ``col_span``) has empty or whitespace-only content, or when no
    cell covers it at all. Numeric content such as ``0`` counts as content.

    This function:
    1. Identifies columns where every covering cell is empty/whitespace
    2. Removes the identified columns
    3. Updates the cols/columns value
    4. Recalculates each cell's col index
    5. Shrinks col_span when a span crosses removed columns
    6. Drops cells whose complete span falls within removed columns
    7. Leaves any bbox on the table and on the cells untouched

    Args:
        table_dict: Table dictionary with ``cells`` and optionally
            ``cols``/``columns``. Cells may be dicts or objects exposing
            ``row``, ``col``, ``row_span``, ``col_span`` and ``content``.

    Returns:
        ``table_dict`` itself when it has no cells or no columns. Otherwise a
        shallow copy; when columns were removed, its ``cells`` are new dicts
        (object cells are converted) and ``cols`` holds the new column count.
        The input dictionary is never mutated.
    """
    cells = table_dict.get('cells', [])
    if not cells:
        return table_dict

    # ``or 0`` also covers an explicitly stored None column count.
    original_cols = table_dict.get('cols', table_dict.get('columns', 0)) or 0
    if original_cols == 0:
        # No declared width: derive it from the right-most cell extent.
        original_cols = max(
            0,
            max(
                _cell_attr(cell, 'col', 0) + _cell_attr(cell, 'col_span', 1)
                for cell in cells
            ),
        )

    if original_cols <= 0:
        return table_dict

    # A column counts as filled as soon as one cell covering it has content.
    # Every column inside a cell's span is marked, not only the first one.
    column_filled = [False] * original_cols
    for cell in cells:
        content = _cell_attr(cell, 'content', '')
        # Numeric zero is falsy but is real cell content.
        is_number = isinstance(content, (int, float)) and not isinstance(content, bool)
        text = str(content).strip() if (content or is_number) else ''
        if not text:
            continue
        start = _cell_attr(cell, 'col', 0)
        stop = min(start + _cell_attr(cell, 'col_span', 1), original_cols)
        for covered in range(max(start, 0), stop):
            column_filled[covered] = True

    empty_columns = {idx for idx, filled in enumerate(column_filled) if not filled}

    if not empty_columns:
        # Nothing to remove, just make sure the column count is populated.
        result = dict(table_dict)
        if not result.get('cols', result.get('columns', 0)):
            result['cols'] = original_cols
            if 'columns' in result:
                result['columns'] = original_cols
        return result

    logger.debug(f"Removing empty columns: {sorted(empty_columns)} from table with {original_cols} cols")

    # old column index -> new column index, for surviving columns only
    kept_columns = [idx for idx in range(original_cols) if idx not in empty_columns]
    col_mapping: Dict[int, int] = {old: new for new, old in enumerate(kept_columns)}
    new_cols = len(kept_columns)

    new_cells = []
    for cell in cells:
        old_col = _cell_attr(cell, 'col', 0)
        old_col_span = _cell_attr(cell, 'col_span', 1)

        # New indices of the surviving columns inside this cell's span.
        # NOTE: a cell that starts at or beyond the declared column count has
        # an empty range here and is therefore dropped as well.
        surviving = [
            col_mapping[c]
            for c in range(old_col, min(old_col + old_col_span, original_cols))
            if c in col_mapping
        ]

        if not surviving:
            logger.debug(f"Removing cell at row={_cell_attr(cell, 'row', 0)}, "
                         f"col={old_col} (entire span in removed columns)")
            continue

        new_start_col = surviving[0]
        new_col_span = surviving[-1] - new_start_col + 1

        if isinstance(cell, dict):
            new_cell = dict(cell)
            new_cell['col'] = new_start_col
            new_cell['col_span'] = new_col_span
        else:
            # Attribute-style cells (e.g. TableCell) are converted to dicts.
            new_cell = {
                'row': cell.row,
                'col': new_start_col,
                'row_span': cell.row_span,
                'col_span': new_col_span,
                'content': cell.content
            }
            if hasattr(cell, 'bbox') and cell.bbox:
                new_cell['bbox'] = cell.bbox
            if hasattr(cell, 'style') and cell.style:
                new_cell['style'] = cell.style

        new_cells.append(new_cell)

    result = dict(table_dict)
    result['cells'] = new_cells
    result['cols'] = new_cols
    if 'columns' in result:
        result['columns'] = new_cols

    logger.info(f"Trimmed table: {original_cols} -> {new_cols} columns, "
                f"{len(cells)} -> {len(new_cells)} cells")

    return result
|
||||||
|
|
||||||
|
|
||||||
class OCRToUnifiedConverter:
|
class OCRToUnifiedConverter:
|
||||||
"""
|
"""
|
||||||
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
|
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
|
||||||
@@ -30,11 +188,19 @@ class OCRToUnifiedConverter:
|
|||||||
- Multi-page document assembly
|
- Multi-page document assembly
|
||||||
- Metadata preservation
|
- Metadata preservation
|
||||||
- Structure relationship mapping
|
- Structure relationship mapping
|
||||||
|
- Gap filling with raw OCR regions (when PP-StructureV3 misses content)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, enable_gap_filling: bool = True):
|
||||||
"""Initialize the converter."""
|
"""
|
||||||
|
Initialize the converter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
enable_gap_filling: Whether to enable gap filling with raw OCR regions
|
||||||
|
"""
|
||||||
self.element_counter = 0
|
self.element_counter = 0
|
||||||
|
self.gap_filling_service = GapFillingService() if enable_gap_filling else None
|
||||||
|
self.gap_filling_stats: Dict[str, Any] = {}
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
@@ -120,13 +286,21 @@ class OCRToUnifiedConverter:
|
|||||||
Extract pages from OCR results.
|
Extract pages from OCR results.
|
||||||
|
|
||||||
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
|
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
|
||||||
and traditional markdown results.
|
and traditional markdown results. Applies gap filling when enabled.
|
||||||
"""
|
"""
|
||||||
pages = []
|
pages = []
|
||||||
|
|
||||||
|
# Extract raw OCR text regions for gap filling
|
||||||
|
raw_text_regions = ocr_results.get('text_regions', [])
|
||||||
|
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
|
||||||
|
|
||||||
# Check if we have enhanced results from PPStructureEnhanced
|
# Check if we have enhanced results from PPStructureEnhanced
|
||||||
if 'enhanced_results' in ocr_results:
|
if 'enhanced_results' in ocr_results:
|
||||||
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
|
pages = self._extract_from_enhanced_results(
|
||||||
|
ocr_results['enhanced_results'],
|
||||||
|
raw_text_regions=raw_text_regions,
|
||||||
|
ocr_dimensions=ocr_dimensions
|
||||||
|
)
|
||||||
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
|
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
|
||||||
elif 'text_regions' in ocr_results:
|
elif 'text_regions' in ocr_results:
|
||||||
pages = self._extract_from_traditional_ocr(ocr_results)
|
pages = self._extract_from_traditional_ocr(ocr_results)
|
||||||
@@ -143,9 +317,21 @@ class OCRToUnifiedConverter:
|
|||||||
|
|
||||||
def _extract_from_enhanced_results(
|
def _extract_from_enhanced_results(
|
||||||
self,
|
self,
|
||||||
enhanced_results: List[Dict[str, Any]]
|
enhanced_results: List[Dict[str, Any]],
|
||||||
|
raw_text_regions: Optional[List[Dict[str, Any]]] = None,
|
||||||
|
ocr_dimensions: Optional[Dict[str, Any]] = None
|
||||||
) -> List[Page]:
|
) -> List[Page]:
|
||||||
"""Extract pages from enhanced PP-StructureV3 results."""
|
"""
|
||||||
|
Extract pages from enhanced PP-StructureV3 results.
|
||||||
|
|
||||||
|
Applies gap filling when enabled to supplement PP-StructureV3 output
|
||||||
|
with raw OCR regions that were not detected by the layout model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
enhanced_results: PP-StructureV3 enhanced results
|
||||||
|
raw_text_regions: Raw OCR text regions for gap filling
|
||||||
|
ocr_dimensions: OCR image dimensions for coordinate alignment
|
||||||
|
"""
|
||||||
pages = []
|
pages = []
|
||||||
|
|
||||||
for page_idx, page_result in enumerate(enhanced_results):
|
for page_idx, page_result in enumerate(enhanced_results):
|
||||||
@@ -158,15 +344,52 @@ class OCRToUnifiedConverter:
|
|||||||
if element:
|
if element:
|
||||||
elements.append(element)
|
elements.append(element)
|
||||||
|
|
||||||
|
# Get page dimensions
|
||||||
|
pp_dimensions = Dimensions(
|
||||||
|
width=page_result.get('width', 0),
|
||||||
|
height=page_result.get('height', 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apply gap filling if enabled and raw regions available
|
||||||
|
if self.gap_filling_service and raw_text_regions:
|
||||||
|
# Filter raw regions for current page
|
||||||
|
page_raw_regions = [
|
||||||
|
r for r in raw_text_regions
|
||||||
|
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
|
||||||
|
]
|
||||||
|
|
||||||
|
if page_raw_regions:
|
||||||
|
supplemented, stats = self.gap_filling_service.fill_gaps(
|
||||||
|
raw_ocr_regions=page_raw_regions,
|
||||||
|
pp_structure_elements=elements,
|
||||||
|
page_number=page_idx + 1,
|
||||||
|
ocr_dimensions=ocr_dimensions,
|
||||||
|
pp_dimensions=pp_dimensions
|
||||||
|
)
|
||||||
|
|
||||||
|
# Store statistics
|
||||||
|
self.gap_filling_stats[f'page_{page_idx + 1}'] = stats
|
||||||
|
|
||||||
|
if supplemented:
|
||||||
|
logger.info(
|
||||||
|
f"Page {page_idx + 1}: Gap filling added {len(supplemented)} elements "
|
||||||
|
f"(coverage: {stats.get('coverage_ratio', 0):.2%})"
|
||||||
|
)
|
||||||
|
elements.extend(supplemented)
|
||||||
|
|
||||||
|
# Recalculate reading order for combined elements
|
||||||
|
reading_order = self.gap_filling_service.recalculate_reading_order(elements)
|
||||||
|
page_result['reading_order'] = reading_order
|
||||||
|
|
||||||
# Create page
|
# Create page
|
||||||
page = Page(
|
page = Page(
|
||||||
page_number=page_idx + 1,
|
page_number=page_idx + 1,
|
||||||
dimensions=Dimensions(
|
dimensions=pp_dimensions,
|
||||||
width=page_result.get('width', 0),
|
|
||||||
height=page_result.get('height', 0)
|
|
||||||
),
|
|
||||||
elements=elements,
|
elements=elements,
|
||||||
metadata={'reading_order': page_result.get('reading_order', [])}
|
metadata={
|
||||||
|
'reading_order': page_result.get('reading_order', []),
|
||||||
|
'gap_filling': self.gap_filling_stats.get(f'page_{page_idx + 1}', {})
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
pages.append(page)
|
pages.append(page)
|
||||||
@@ -500,6 +723,9 @@ class OCRToUnifiedConverter:
|
|||||||
) -> Optional[DocumentElement]:
|
) -> Optional[DocumentElement]:
|
||||||
"""Convert table data to DocumentElement."""
|
"""Convert table data to DocumentElement."""
|
||||||
try:
|
try:
|
||||||
|
# Clean up empty columns before building TableData
|
||||||
|
table_dict = trim_empty_columns(table_dict)
|
||||||
|
|
||||||
# Extract bbox
|
# Extract bbox
|
||||||
bbox_data = table_dict.get('bbox', [0, 0, 0, 0])
|
bbox_data = table_dict.get('bbox', [0, 0, 0, 0])
|
||||||
bbox = BoundingBox(
|
bbox = BoundingBox(
|
||||||
@@ -587,14 +813,22 @@ class OCRToUnifiedConverter:
|
|||||||
cells = []
|
cells = []
|
||||||
headers = []
|
headers = []
|
||||||
rows = table.find_all('tr')
|
rows = table.find_all('tr')
|
||||||
|
num_rows = len(rows)
|
||||||
|
|
||||||
# Track actual column positions accounting for rowspan/colspan
|
# First pass: calculate total columns by finding max column extent
|
||||||
# This is a simplified approach - complex spanning may need enhancement
|
# Track cells that span multiple rows: occupied[row][col] = True
|
||||||
|
occupied: Dict[int, Dict[int, bool]] = {r: {} for r in range(num_rows)}
|
||||||
|
|
||||||
|
# Parse all cells with proper rowspan/colspan handling
|
||||||
for row_idx, row in enumerate(rows):
|
for row_idx, row in enumerate(rows):
|
||||||
row_cells = row.find_all(['td', 'th'])
|
row_cells = row.find_all(['td', 'th'])
|
||||||
col_idx = 0
|
col_idx = 0
|
||||||
|
|
||||||
for cell in row_cells:
|
for cell in row_cells:
|
||||||
|
# Skip columns that are occupied by rowspan from previous rows
|
||||||
|
while occupied[row_idx].get(col_idx, False):
|
||||||
|
col_idx += 1
|
||||||
|
|
||||||
cell_content = cell.get_text(strip=True)
|
cell_content = cell.get_text(strip=True)
|
||||||
rowspan = int(cell.get('rowspan', 1))
|
rowspan = int(cell.get('rowspan', 1))
|
||||||
colspan = int(cell.get('colspan', 1))
|
colspan = int(cell.get('colspan', 1))
|
||||||
@@ -611,26 +845,66 @@ class OCRToUnifiedConverter:
|
|||||||
if cell.name == 'th' or row_idx == 0:
|
if cell.name == 'th' or row_idx == 0:
|
||||||
headers.append(cell_content)
|
headers.append(cell_content)
|
||||||
|
|
||||||
|
# Mark cells as occupied for rowspan/colspan
|
||||||
|
for r in range(row_idx, min(row_idx + rowspan, num_rows)):
|
||||||
|
for c in range(col_idx, col_idx + colspan):
|
||||||
|
if r not in occupied:
|
||||||
|
occupied[r] = {}
|
||||||
|
occupied[r][c] = True
|
||||||
|
|
||||||
# Advance column index by colspan
|
# Advance column index by colspan
|
||||||
col_idx += colspan
|
col_idx += colspan
|
||||||
|
|
||||||
# Calculate actual dimensions
|
# Calculate actual column count from occupied cells
|
||||||
num_rows = len(rows)
|
num_cols = 0
|
||||||
num_cols = max(
|
for r in range(num_rows):
|
||||||
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
|
if occupied[r]:
|
||||||
for row in rows
|
max_col_in_row = max(occupied[r].keys()) + 1
|
||||||
) if rows else 0
|
num_cols = max(num_cols, max_col_in_row)
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
|
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Build table dict for cleanup
|
||||||
|
table_dict = {
|
||||||
|
'rows': num_rows,
|
||||||
|
'cols': num_cols,
|
||||||
|
'cells': [
|
||||||
|
{
|
||||||
|
'row': c.row,
|
||||||
|
'col': c.col,
|
||||||
|
'row_span': c.row_span,
|
||||||
|
'col_span': c.col_span,
|
||||||
|
'content': c.content
|
||||||
|
}
|
||||||
|
for c in cells
|
||||||
|
],
|
||||||
|
'headers': headers if headers else None,
|
||||||
|
'caption': extracted_text if extracted_text else None
|
||||||
|
}
|
||||||
|
|
||||||
|
# Clean up empty columns
|
||||||
|
table_dict = trim_empty_columns(table_dict)
|
||||||
|
|
||||||
|
# Convert cleaned cells back to TableCell objects
|
||||||
|
cleaned_cells = [
|
||||||
|
TableCell(
|
||||||
|
row=c['row'],
|
||||||
|
col=c['col'],
|
||||||
|
row_span=c.get('row_span', 1),
|
||||||
|
col_span=c.get('col_span', 1),
|
||||||
|
content=c.get('content', '')
|
||||||
|
)
|
||||||
|
for c in table_dict.get('cells', [])
|
||||||
|
]
|
||||||
|
|
||||||
return TableData(
|
return TableData(
|
||||||
rows=num_rows,
|
rows=table_dict.get('rows', num_rows),
|
||||||
cols=num_cols,
|
cols=table_dict.get('cols', num_cols),
|
||||||
cells=cells,
|
cells=cleaned_cells,
|
||||||
headers=headers if headers else None,
|
headers=table_dict.get('headers'),
|
||||||
caption=extracted_text if extracted_text else None
|
caption=table_dict.get('caption')
|
||||||
)
|
)
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|||||||
344
backend/app/services/pp_structure_debug.py
Normal file
344
backend/app/services/pp_structure_debug.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
"""
|
||||||
|
PP-StructureV3 Debug Service
|
||||||
|
|
||||||
|
Provides debugging tools for visualizing and saving PP-StructureV3 results:
|
||||||
|
- Save raw results as JSON for inspection
|
||||||
|
- Generate visualization images showing detected bboxes
|
||||||
|
- Compare raw OCR regions with PP-StructureV3 elements
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Any, Optional, Tuple
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# RGB outline colors keyed by lower-cased element type. The 'default' entry
# is the fallback for element types that have no color of their own.
ELEMENT_COLORS: Dict[str, Tuple[int, int, int]] = dict(
    text=(0, 128, 0),            # green
    title=(0, 0, 255),           # blue
    table=(255, 0, 0),           # red
    figure=(255, 165, 0),        # orange
    image=(255, 165, 0),         # orange
    header=(128, 0, 128),        # purple
    footer=(128, 0, 128),        # purple
    equation=(0, 255, 255),      # cyan
    chart=(255, 192, 203),       # pink
    list=(139, 69, 19),          # brown
    reference=(128, 128, 128),   # gray
    default=(255, 0, 255),       # magenta
)

# RGB color (gold) used for raw OCR regions
RAW_OCR_COLOR = (255, 215, 0)
|
||||||
|
|
||||||
|
|
||||||
|
class PPStructureDebug:
    """
    Debug service for PP-StructureV3 analysis results.

    Writes JSON dumps of the layout results and the raw OCR regions, plus an
    annotated PNG showing both sets of bounding boxes, into ``output_dir``.
    """

    # Font files tried in order for overlay labels; Pillow's built-in default
    # font is used when none can be loaded. The second entry is a
    # developer-machine path kept for backward compatibility; override this
    # attribute (e.g. in a subclass) for other deployments.
    FONT_PATHS: Tuple[str, ...] = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/home/egg/project/Tool_OCR/backend/fonts/NotoSansSC-Regular.ttf",
    )

    def __init__(self, output_dir: Path):
        """
        Initialize debug service.

        Args:
            output_dir: Directory to save debug outputs; created (with
                parents) when it does not exist yet.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def save_raw_results(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]],
        filename_prefix: str = "debug"
    ) -> Dict[str, Path]:
        """
        Save raw PP-StructureV3 results and OCR regions as JSON files.

        The three files are written independently: a failure on one is logged
        and does not stop the others.

        Args:
            pp_structure_results: Raw PP-StructureV3 analysis results
            raw_ocr_regions: Raw OCR text regions
            filename_prefix: Prefix for output files

        Returns:
            Dictionary mapping ``'pp_structure'``, ``'raw_ocr'`` and
            ``'summary'`` to the written paths; a key is missing when its
            file could not be written.
        """
        saved_files = {}

        # Save PP-StructureV3 results
        pp_json_path = self.output_dir / f"{filename_prefix}_pp_structure_raw.json"
        try:
            # Convert any non-serializable types
            serializable_results = self._make_serializable(pp_structure_results)
            with open(pp_json_path, 'w', encoding='utf-8') as f:
                json.dump(serializable_results, f, ensure_ascii=False, indent=2)
            saved_files['pp_structure'] = pp_json_path
            logger.info(f"Saved PP-StructureV3 raw results to {pp_json_path}")
        except Exception as e:
            logger.error(f"Failed to save PP-StructureV3 results: {e}")

        # Save raw OCR regions
        ocr_json_path = self.output_dir / f"{filename_prefix}_raw_ocr_regions.json"
        try:
            serializable_ocr = self._make_serializable(raw_ocr_regions)
            with open(ocr_json_path, 'w', encoding='utf-8') as f:
                json.dump(serializable_ocr, f, ensure_ascii=False, indent=2)
            saved_files['raw_ocr'] = ocr_json_path
            logger.info(f"Saved raw OCR regions to {ocr_json_path}")
        except Exception as e:
            logger.error(f"Failed to save raw OCR regions: {e}")

        # Save summary comparison
        summary_path = self.output_dir / f"{filename_prefix}_debug_summary.json"
        try:
            summary = self._generate_summary(pp_structure_results, raw_ocr_regions)
            with open(summary_path, 'w', encoding='utf-8') as f:
                json.dump(summary, f, ensure_ascii=False, indent=2)
            saved_files['summary'] = summary_path
            logger.info(f"Saved debug summary to {summary_path}")
        except Exception as e:
            logger.error(f"Failed to save debug summary: {e}")

        return saved_files

    def _load_fonts(self) -> Tuple[Any, Any]:
        """Return ``(label_font, small_font)`` from the first loadable FONT_PATHS entry."""
        for font_path in self.FONT_PATHS:
            try:
                return ImageFont.truetype(font_path, 14), ImageFont.truetype(font_path, 10)
            except OSError:
                continue
        fallback = ImageFont.load_default()
        return fallback, fallback

    def generate_visualization(
        self,
        image_path: Path,
        pp_structure_elements: List[Dict[str, Any]],
        raw_ocr_regions: Optional[List[Dict[str, Any]]] = None,
        filename_prefix: str = "debug",
        show_labels: bool = True,
        show_raw_ocr: bool = True
    ) -> Optional[Path]:
        """
        Generate visualization image showing detected elements.

        Args:
            image_path: Path to original image
            pp_structure_elements: PP-StructureV3 detected elements
            raw_ocr_regions: Optional raw OCR regions to overlay
            filename_prefix: Prefix for output file
            show_labels: Whether to show element type labels
            show_raw_ocr: Whether to show raw OCR regions

        Returns:
            Path to the generated PNG, or None when anything failed (the
            error is logged with its traceback).
        """
        try:
            # convert() returns a new image, so the source file can be closed
            # right away instead of staying open until garbage collection.
            with Image.open(image_path) as source:
                viz_img = source.convert('RGB')
            img_width, img_height = viz_img.size
            draw = ImageDraw.Draw(viz_img)
            font, small_font = self._load_fonts()

            # Draw raw OCR regions first (so PP-Structure boxes are on top)
            if show_raw_ocr and raw_ocr_regions:
                for region in raw_ocr_regions:
                    bbox = self._normalize_bbox(region.get('bbox', []))
                    if not bbox:
                        continue
                    x0, y0, x1, y1 = bbox
                    draw.rectangle([x0, y0, x1, y1], outline=RAW_OCR_COLOR, width=1)

                    if show_labels:
                        confidence = region.get('confidence', 0)
                        label = f"OCR:{confidence:.2f}"
                        # Clamp so labels of boxes at the top edge stay visible.
                        draw.text((x0, max(0, y0 - 12)), label, fill=RAW_OCR_COLOR, font=small_font)

            # Draw PP-StructureV3 elements
            for idx, elem in enumerate(pp_structure_elements):
                elem_type = elem.get('type', 'default')
                if hasattr(elem_type, 'value'):
                    # Enum-style type: use its underlying value
                    elem_type = elem_type.value
                elem_type = str(elem_type).lower()

                color = ELEMENT_COLORS.get(elem_type, ELEMENT_COLORS['default'])
                bbox = self._normalize_bbox(elem.get('bbox', []))
                if not bbox:
                    continue

                x0, y0, x1, y1 = bbox
                # Thicker outline than the raw OCR boxes
                draw.rectangle([x0, y0, x1, y1], outline=color, width=3)

                if show_labels:
                    label = f"{idx}:{elem_type}"
                    label_pos = (x0, max(0, y0 - 18))
                    # White background keeps the label readable on the page
                    text_bbox = draw.textbbox(label_pos, label, font=font)
                    draw.rectangle(text_bbox, fill=(255, 255, 255, 200))
                    draw.text(label_pos, label, fill=color, font=font)

            # Add legend
            self._draw_legend(draw, img_width, font)

            # Add image info
            info_text = f"PP-Structure: {len(pp_structure_elements)} elements"
            if raw_ocr_regions:
                info_text += f" | Raw OCR: {len(raw_ocr_regions)} regions"
            info_text += f" | Size: {img_width}x{img_height}"
            draw.text((10, img_height - 25), info_text, fill=(0, 0, 0), font=font)

            # Save visualization
            viz_path = self.output_dir / f"{filename_prefix}_pp_structure_viz.png"
            viz_img.save(viz_path, 'PNG')
            logger.info(f"Saved visualization to {viz_path}")

            return viz_path

        except Exception as e:
            # Best-effort debug aid: report through the logger (with the
            # traceback) rather than printing to stderr, and never raise.
            logger.exception(f"Failed to generate visualization: {e}")
            return None

    def _draw_legend(self, draw: "ImageDraw.ImageDraw", img_width: int, font: "ImageFont.ImageFont"):
        """Draw a legend showing element type colors in the top-right corner."""
        legend_x = img_width - 150
        legend_y = 10

        # Background sized for one row per ELEMENT_COLORS entry: 'default' is
        # skipped below, and the raw OCR row takes its place.
        draw.rectangle(
            [legend_x - 5, legend_y - 5, img_width - 5, legend_y + len(ELEMENT_COLORS) * 18 + 25],
            fill=(255, 255, 255, 230),
            outline=(0, 0, 0)
        )

        draw.text((legend_x, legend_y), "Legend:", fill=(0, 0, 0), font=font)
        legend_y += 20

        for elem_type, color in ELEMENT_COLORS.items():
            if elem_type == 'default':
                continue
            draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=color)
            draw.text((legend_x + 18, legend_y), elem_type, fill=(0, 0, 0), font=font)
            legend_y += 18

        # Add raw OCR legend entry
        draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=RAW_OCR_COLOR)
        draw.text((legend_x + 18, legend_y), "raw_ocr", fill=(0, 0, 0), font=font)

    def _normalize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
        """
        Normalize bbox to (x0, y0, x1, y1) format.

        Accepted inputs: a list of points ``[[x, y], ...]``, a flat
        ``[x0, y0, x1, y1]``, a flat polygon ``[x1, y1, x2, y2, ...]`` with at
        least eight values, a dict with ``x0``/``y0``/``x1``/``y1`` (or
        ``x_min``/``y_min``/``x_max``/``y_max``) keys, or any array-like
        exposing ``tolist()`` that converts to one of the list forms.

        Returns:
            The box as four floats, or None when the input is empty,
            unsupported or malformed (malformed input is logged).
        """
        if bbox is None:
            return None

        try:
            # Array-likes (e.g. numpy) are not list/tuple and have an
            # ambiguous truth value, so convert them before any other check.
            if hasattr(bbox, 'tolist'):
                bbox = bbox.tolist()
            if not bbox:
                return None

            # Handle dict format
            if isinstance(bbox, dict):
                return (
                    float(bbox.get('x0', bbox.get('x_min', 0))),
                    float(bbox.get('y0', bbox.get('y_min', 0))),
                    float(bbox.get('x1', bbox.get('x_max', 0))),
                    float(bbox.get('y1', bbox.get('y_max', 0)))
                )

            if not isinstance(bbox, (list, tuple)):
                return None

            # A plain list may still hold array-like points or scalars.
            items = [v.tolist() if hasattr(v, 'tolist') else v for v in bbox]

            # Handle nested list format [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            if isinstance(items[0], (list, tuple)):
                points = [pt for pt in items if len(pt) >= 2]
                if not points:
                    return None
                xs = [float(pt[0]) for pt in points]
                ys = [float(pt[1]) for pt in points]
                return (min(xs), min(ys), max(xs), max(ys))

            # Handle flat list [x0, y0, x1, y1]
            if len(items) == 4:
                x0, y0, x1, y1 = (float(v) for v in items)
                return (x0, y0, x1, y1)

            # Handle flat polygon [x1, y1, x2, y2, ...]
            if len(items) >= 8:
                xs = [float(v) for v in items[0::2]]
                ys = [float(v) for v in items[1::2]]
                return (min(xs), min(ys), max(xs), max(ys))

        except (TypeError, ValueError, IndexError) as e:
            logger.warning(f"Failed to normalize bbox {bbox}: {e}")

        return None

    def _generate_summary(
        self,
        pp_structure_results: Dict[str, Any],
        raw_ocr_regions: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Generate summary comparing PP-Structure and raw OCR.

        Areas are plain sums of bounding-box areas, so overlapping boxes are
        counted more than once.

        Args:
            pp_structure_results: Results dict; its ``'elements'`` list is
                summarized.
            raw_ocr_regions: Raw OCR text regions.

        Returns:
            Dict with ``timestamp``, ``pp_structure``, ``raw_ocr`` and
            ``comparison`` sections.
        """
        pp_elements = pp_structure_results.get('elements', [])

        # Count element types
        type_counts = {}
        for elem in pp_elements:
            elem_type = elem.get('type', 'unknown')
            if hasattr(elem_type, 'value'):
                elem_type = elem_type.value
            type_counts[str(elem_type)] = type_counts.get(str(elem_type), 0) + 1

        # Sum bounding box areas; entries without a usable bbox add nothing
        pp_bbox_area = 0
        ocr_bbox_area = 0

        for elem in pp_elements:
            bbox = self._normalize_bbox(elem.get('bbox'))
            if bbox:
                pp_bbox_area += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

        for region in raw_ocr_regions:
            bbox = self._normalize_bbox(region.get('bbox'))
            if bbox:
                ocr_bbox_area += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

        return {
            'timestamp': datetime.now().isoformat(),
            'pp_structure': {
                'total_elements': len(pp_elements),
                'element_types': type_counts,
                'total_bbox_area': pp_bbox_area,
                'has_parsing_res_list': pp_structure_results.get('has_parsing_res_list', False)
            },
            'raw_ocr': {
                'total_regions': len(raw_ocr_regions),
                'total_bbox_area': ocr_bbox_area,
                'avg_confidence': sum(r.get('confidence', 0) for r in raw_ocr_regions) / len(raw_ocr_regions) if raw_ocr_regions else 0
            },
            'comparison': {
                'element_count_ratio': len(pp_elements) / len(raw_ocr_regions) if raw_ocr_regions else 0,
                'area_ratio': pp_bbox_area / ocr_bbox_area if ocr_bbox_area > 0 else 0,
                'potential_gap': len(raw_ocr_regions) - len(pp_elements) if raw_ocr_regions else 0
            }
        }

    def _make_serializable(self, obj: Any) -> Any:
        """
        Convert object to JSON-serializable format.

        Containers are converted recursively (dict keys become strings).
        Array-likes use ``tolist()``, enum-style objects their ``value``,
        other objects their ``__dict__``; anything else falls back to
        ``str(obj)``.
        """
        if obj is None:
            return None
        if isinstance(obj, (str, int, float, bool)):
            return obj
        if isinstance(obj, (list, tuple)):
            return [self._make_serializable(item) for item in obj]
        if isinstance(obj, dict):
            return {str(k): self._make_serializable(v) for k, v in obj.items()}
        # Checked before ``value``/``__dict__`` so arrays are dumped as their
        # data rather than as internal attributes.
        if hasattr(obj, 'tolist'):  # numpy array
            return self._make_serializable(obj.tolist())
        if hasattr(obj, 'value'):
            # The wrapped value may itself need conversion (e.g. a tuple).
            return self._make_serializable(obj.value)
        if hasattr(obj, '__dict__'):
            return self._make_serializable(obj.__dict__)
        return str(obj)
|
||||||
332
backend/tests/api/test_layout_model_api.py
Normal file
332
backend/tests/api/test_layout_model_api.py
Normal file
@@ -0,0 +1,332 @@
|
|||||||
|
"""
|
||||||
|
API integration tests for Layout Model Selection feature.
|
||||||
|
|
||||||
|
This replaces the deprecated PP-StructureV3 parameter tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from unittest.mock import patch
|
||||||
|
from app.main import app
|
||||||
|
from app.core.database import get_db
|
||||||
|
from app.models.user import User
|
||||||
|
from app.models.task import Task, TaskStatus, TaskFile
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client():
    """Provide a ``TestClient`` bound to the FastAPI application under test."""
    test_client = TestClient(app)
    return test_client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def test_user(db_session):
    """Persist one active user and return the refreshed instance."""
    user_fields = {
        'email': "test@example.com",
        'hashed_password': "test_hash",
        'is_active': True,
    }
    account = User(**user_fields)

    db_session.add(account)
    db_session.commit()
    # Reload so attributes populated during the commit are available.
    db_session.refresh(account)
    return account
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def test_task(db_session, test_user):
    """Persist a pending task owned by ``test_user`` with one attached file record."""
    pending_task = Task(
        **{
            'user_id': test_user.id,
            'task_id': "test-task-123",
            'filename': "test.pdf",
            'status': TaskStatus.PENDING,
        }
    )
    db_session.add(pending_task)
    db_session.commit()
    db_session.refresh(pending_task)

    # Attach the uploaded-file record to the freshly stored task.
    attachment = TaskFile(
        **{
            'task_id': pending_task.id,
            'original_name': "test.pdf",
            'stored_path': "/tmp/test.pdf",
            'file_size': 1024,
            'mime_type': "application/pdf",
        }
    )
    db_session.add(attachment)
    db_session.commit()

    return pending_task
|
||||||
|
|
||||||
|
|
||||||
|
class TestLayoutModelSchema:
    """Schema-level validation of LayoutModelEnum and ProcessingOptions"""

    def test_processing_options_accepts_layout_model(self):
        """An explicit enum member is stored on the options object"""
        from app.schemas.task import ProcessingOptions, LayoutModelEnum

        option_kwargs = {
            'use_dual_track': True,
            'language': 'ch',
            'layout_model': LayoutModelEnum.CHINESE,
        }
        options = ProcessingOptions(**option_kwargs)

        assert options.layout_model == LayoutModelEnum.CHINESE

    def test_layout_model_enum_values(self):
        """Each enum member carries the expected wire value"""
        from app.schemas.task import LayoutModelEnum

        chinese_value = LayoutModelEnum.CHINESE.value
        default_value = LayoutModelEnum.DEFAULT.value
        cdla_value = LayoutModelEnum.CDLA.value

        assert chinese_value == "chinese"
        assert default_value == "default"
        assert cdla_value == "cdla"

    def test_default_layout_model_is_chinese(self):
        """Options built without arguments select the 'chinese' layout model"""
        from app.schemas.task import ProcessingOptions

        selected = ProcessingOptions().layout_model

        # Default should be chinese
        assert selected.value == "chinese"

    def test_layout_model_string_values_accepted(self):
        """Plain strings are coerced to the matching enum member"""
        from app.schemas.task import ProcessingOptions

        # String values should be converted to enum
        for raw_value in ("default", "cdla"):
            options = ProcessingOptions(layout_model=raw_value)
            assert options.layout_model.value == raw_value

    def test_invalid_layout_model_rejected(self):
        """Unknown layout model names fail validation"""
        from app.schemas.task import ProcessingOptions
        from pydantic import ValidationError

        with pytest.raises(ValidationError):
            ProcessingOptions(layout_model="invalid_model")
|
||||||
|
|
||||||
|
|
||||||
|
class TestStartTaskEndpoint:
    """Test /tasks/{task_id}/start endpoint with layout_model parameter.

    The database and authentication dependencies are replaced for every test
    by the autouse fixture below. The overrides are removed in the fixture's
    teardown, so a failing assertion cannot leak them into later tests.
    """

    @pytest.fixture(autouse=True)
    def _override_dependencies(self, test_task, db_session):
        """Install the get_db / get_current_user overrides and always remove them."""
        from app.core.deps import get_current_user

        def override_get_db():
            yield db_session

        def override_get_current_user():
            # NOTE(review): relies on Task exposing a ``user`` attribute — confirm.
            return test_task.user

        app.dependency_overrides[get_db] = override_get_db
        app.dependency_overrides[get_current_user] = override_get_current_user
        try:
            yield
        finally:
            app.dependency_overrides.clear()

    @staticmethod
    def _post_start(client, test_task, request_body):
        """POST ``request_body`` to the start endpoint of ``test_task``."""
        return client.post(
            f"/api/v2/tasks/{test_task.task_id}/start",
            json=request_body
        )

    @patch('app.routers.tasks.process_task_ocr')
    def test_start_task_with_layout_model(self, mock_process_ocr, client, test_task, db_session):
        """Verify layout_model is accepted and passed to OCR service"""
        request_body = {
            "use_dual_track": True,
            "language": "ch",
            "layout_model": "chinese"
        }

        response = self._post_start(client, test_task, request_body)

        # Verify response
        assert response.status_code == 200
        data = response.json()
        assert data['status'] == 'processing'

        # Verify background task was called with layout_model
        mock_process_ocr.assert_called_once()
        call_kwargs = mock_process_ocr.call_args.kwargs

        assert 'layout_model' in call_kwargs
        assert call_kwargs['layout_model'] == 'chinese'

    @patch('app.routers.tasks.process_task_ocr')
    def test_start_task_with_default_model(self, mock_process_ocr, client, test_task, db_session):
        """Verify 'default' layout model is accepted"""
        request_body = {
            "use_dual_track": True,
            "layout_model": "default"
        }

        response = self._post_start(client, test_task, request_body)

        assert response.status_code == 200

        mock_process_ocr.assert_called_once()
        call_kwargs = mock_process_ocr.call_args.kwargs
        assert call_kwargs['layout_model'] == 'default'

    @patch('app.routers.tasks.process_task_ocr')
    def test_start_task_with_cdla_model(self, mock_process_ocr, client, test_task, db_session):
        """Verify 'cdla' layout model is accepted"""
        request_body = {
            "use_dual_track": True,
            "layout_model": "cdla"
        }

        response = self._post_start(client, test_task, request_body)

        assert response.status_code == 200

        mock_process_ocr.assert_called_once()
        call_kwargs = mock_process_ocr.call_args.kwargs
        assert call_kwargs['layout_model'] == 'cdla'

    @patch('app.routers.tasks.process_task_ocr')
    def test_start_task_without_layout_model_uses_default(self, mock_process_ocr, client, test_task, db_session):
        """Verify task can start without layout_model (uses 'chinese' as default)"""
        # Request without layout_model
        request_body = {
            "use_dual_track": True,
            "language": "ch"
        }

        response = self._post_start(client, test_task, request_body)

        assert response.status_code == 200

        mock_process_ocr.assert_called_once()
        call_kwargs = mock_process_ocr.call_args.kwargs

        # layout_model should default to 'chinese'
        assert call_kwargs['layout_model'] == 'chinese'

    def test_start_task_with_invalid_layout_model(self, client, test_task, db_session):
        """Verify invalid layout_model returns 422 validation error"""
        # Request with invalid layout_model
        request_body = {
            "use_dual_track": True,
            "layout_model": "invalid_model"
        }

        response = self._post_start(client, test_task, request_body)

        # Should return validation error
        assert response.status_code == 422
|
||||||
|
|
||||||
|
|
||||||
|
class TestOpenAPISchema:
    """Checks that the generated OpenAPI document describes layout_model"""

    def test_openapi_schema_includes_layout_model(self, client):
        """The enum and the ProcessingOptions property both appear in /openapi.json"""
        response = client.get("/openapi.json")
        assert response.status_code == 200

        schema = response.json()

        # Check LayoutModelEnum schema exists
        assert 'LayoutModelEnum' in schema['components']['schemas']

        documented_values = schema['components']['schemas']['LayoutModelEnum']['enum']

        # Verify all 3 model options are documented
        for option in ('chinese', 'default', 'cdla'):
            assert option in documented_values

        # Verify ProcessingOptions includes layout_model
        option_properties = schema['components']['schemas']['ProcessingOptions']['properties']
        assert 'layout_model' in option_properties
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly as a script (verbose output).
if __name__ == '__main__':
    pytest_args = [__file__, '-v']
    pytest.main(pytest_args)
|
||||||
244
backend/tests/services/test_layout_model.py
Normal file
244
backend/tests/services/test_layout_model.py
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for Layout Model Selection feature in OCR Service.
|
||||||
|
|
||||||
|
This replaces the deprecated PP-StructureV3 parameter tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import Mock, patch, MagicMock
|
||||||
|
|
||||||
|
# Register stand-ins for the heavy external packages in sys.modules so that
# importing OCRService below does not require them to be installed.
sys.modules.update({
    'paddleocr': MagicMock(),
    'PIL': MagicMock(),
    'pdf2image': MagicMock(),
})

# paddle stub: exposes a version string and reports the CPU device
paddle_mock = MagicMock()
paddle_mock.__version__ = '2.5.0'
paddle_mock.configure_mock(**{
    'device.get_device.return_value': 'cpu',
    'device.get_available_device.return_value': 'cpu',
})
sys.modules['paddle'] = paddle_mock

# torch stub: reports that CUDA is unavailable
torch_mock = MagicMock()
torch_mock.configure_mock(**{'cuda.is_available.return_value': False})
sys.modules['torch'] = torch_mock
|
||||||
|
|
||||||
|
from app.services.ocr_service import OCRService, LAYOUT_MODEL_MAPPING, _USE_PUBLAYNET_DEFAULT
|
||||||
|
from app.core.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
class TestLayoutModelMapping:
    """Checks on the LAYOUT_MODEL_MAPPING lookup table"""

    def test_layout_model_mapping_exists(self):
        """All three user-facing model keys are present in the mapping"""
        for model_key in ('chinese', 'default', 'cdla'):
            assert model_key in LAYOUT_MODEL_MAPPING

    def test_chinese_model_maps_to_pp_doclayout(self):
        """'chinese' resolves to PP-DocLayout-S"""
        resolved = LAYOUT_MODEL_MAPPING['chinese']
        assert resolved == 'PP-DocLayout-S'

    def test_default_model_maps_to_publaynet_sentinel(self):
        """'default' resolves to the PubLayNet sentinel value"""
        # The sentinel signals "use PubLayNet default (no custom model)"
        resolved = LAYOUT_MODEL_MAPPING['default']
        assert resolved == _USE_PUBLAYNET_DEFAULT

    def test_cdla_model_maps_to_picodet(self):
        """'cdla' resolves to picodet_lcnet_x1_0_fgd_layout_cdla"""
        resolved = LAYOUT_MODEL_MAPPING['cdla']
        assert resolved == 'picodet_lcnet_x1_0_fgd_layout_cdla'
|
||||||
|
|
||||||
|
|
||||||
|
class TestLayoutModelEngine:
    """Engine construction for each supported layout model"""

    @staticmethod
    def _engine_kwargs_for(layout_model):
        """Build an engine for ``layout_model`` and return the PPStructureV3 kwargs.

        A fresh OCRService is used with ``structure_engine`` patched to None and
        PPStructureV3 replaced by a mock; exactly one construction is asserted.
        """
        ocr_service = OCRService()

        with patch.object(ocr_service, 'structure_engine', None), \
                patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            ocr_service._ensure_structure_engine(layout_model=layout_model)

            mock_ppstructure.assert_called_once()
            return mock_ppstructure.call_args[1]

    def test_chinese_model_creates_engine_with_pp_doclayout(self):
        """'chinese' builds the engine with PP-DocLayout-S"""
        call_kwargs = self._engine_kwargs_for('chinese')

        assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'

    def test_default_model_creates_engine_without_model_name(self):
        """'default' builds the engine without an explicit model name"""
        call_kwargs = self._engine_kwargs_for('default')

        # For 'default', layout_detection_model_name should be None or not set
        assert call_kwargs.get('layout_detection_model_name') is None

    def test_cdla_model_creates_engine_with_picodet(self):
        """'cdla' builds the engine with picodet_lcnet_x1_0_fgd_layout_cdla"""
        call_kwargs = self._engine_kwargs_for('cdla')

        assert call_kwargs.get('layout_detection_model_name') == 'picodet_lcnet_x1_0_fgd_layout_cdla'

    def test_none_layout_model_uses_chinese_default(self):
        """Passing None falls back to the 'chinese' model"""
        call_kwargs = self._engine_kwargs_for(None)

        # Should use 'chinese' model as default
        assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'
|
||||||
|
|
||||||
|
|
||||||
|
class TestLayoutModelCaching:
    """Engine reuse and re-creation across layout model requests"""

    def test_same_layout_model_uses_cached_engine(self):
        """Two requests for the same model construct the engine only once"""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            # Both requests ask for 'chinese'; the second should hit the cache
            first, second = [
                ocr_service._ensure_structure_engine(layout_model='chinese')
                for _ in range(2)
            ]

            # Verify only one engine was created
            assert mock_ppstructure.call_count == 1
            assert first is second

    def test_different_layout_model_creates_new_engine(self):
        """Switching the model constructs a second, distinct engine"""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.side_effect = [Mock(), Mock()]

            # 'chinese' first, then 'cdla' which should create a new engine
            chinese_engine = ocr_service._ensure_structure_engine(layout_model='chinese')
            cdla_engine = ocr_service._ensure_structure_engine(layout_model='cdla')

            # Verify two engines were created
            assert mock_ppstructure.call_count == 2
            assert chinese_engine is not cdla_engine
|
||||||
|
|
||||||
|
|
||||||
|
class TestLayoutModelFlow:
    """How the layout_model argument reaches engine construction"""

    @staticmethod
    def _constructor_kwargs(layout_model):
        """Request an engine for ``layout_model`` and return the PPStructureV3 kwargs."""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
            mock_ppstructure.return_value = Mock()

            ocr_service._ensure_structure_engine(layout_model=layout_model)

            mock_ppstructure.assert_called_once()
            return mock_ppstructure.call_args[1]

    def test_layout_model_passed_to_engine_creation(self):
        """A specific layout_model selects the matching detection model"""
        call_kwargs = self._constructor_kwargs('cdla')

        # Verify correct model was requested
        assert call_kwargs.get('layout_detection_model_name') == 'picodet_lcnet_x1_0_fgd_layout_cdla'

    def test_layout_model_default_behavior(self):
        """None selects the model name configured in settings"""
        call_kwargs = self._constructor_kwargs(None)

        # Should use config default (PP-DocLayout-S)
        assert call_kwargs.get('layout_detection_model_name') == settings.layout_detection_model_name

    def test_layout_model_unknown_value_falls_back(self):
        """An unrecognised name falls back to the configured model"""
        call_kwargs = self._constructor_kwargs('unknown_model')

        # Should use config default
        assert call_kwargs.get('layout_detection_model_name') == settings.layout_detection_model_name
|
||||||
|
|
||||||
|
|
||||||
|
class TestLayoutModelLogging:
    """Logging emitted while selecting a layout model"""

    def test_layout_model_is_logged(self):
        """At least one info record mentions the model or the layout"""
        ocr_service = OCRService()

        with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure, \
                patch('app.services.ocr_service.logger') as mock_logger:
            mock_ppstructure.return_value = Mock()

            # Call with specific layout_model
            ocr_service._ensure_structure_engine(layout_model='cdla')

            # Verify logging occurred
            assert mock_logger.info.call_count >= 1

            # Check that model name was logged
            rendered = [str(record).lower() for record in mock_logger.info.call_args_list]
            assert any('cdla' in text or 'layout' in text for text in rendered)
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly as a script (verbose output).
if __name__ == '__main__':
    pytest_args = [__file__, '-v']
    pytest.main(pytest_args)
|
||||||
503
backend/tests/test_gap_filling.py
Normal file
503
backend/tests/test_gap_filling.py
Normal file
@@ -0,0 +1,503 @@
|
|||||||
|
"""
|
||||||
|
Tests for Gap Filling Service
|
||||||
|
|
||||||
|
Tests the detection and filling of gaps in PP-StructureV3 output
|
||||||
|
using raw OCR text regions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
|
from app.services.gap_filling_service import GapFillingService, TextRegion, SKIP_ELEMENT_TYPES
|
||||||
|
from app.models.unified_document import DocumentElement, BoundingBox, ElementType, Dimensions
|
||||||
|
|
||||||
|
|
||||||
|
class TestGapFillingService:
|
||||||
|
"""Tests for GapFillingService class."""
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def service(self) -> GapFillingService:
|
||||||
|
"""Create a GapFillingService instance with default settings."""
|
||||||
|
return GapFillingService(
|
||||||
|
coverage_threshold=0.7,
|
||||||
|
iou_threshold=0.15,
|
||||||
|
confidence_threshold=0.3,
|
||||||
|
dedup_iou_threshold=0.5,
|
||||||
|
enabled=True
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def disabled_service(self) -> GapFillingService:
|
||||||
|
"""Create a disabled GapFillingService instance."""
|
||||||
|
return GapFillingService(enabled=False)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_raw_regions(self) -> List[TextRegion]:
|
||||||
|
"""Create sample raw OCR text regions."""
|
||||||
|
return [
|
||||||
|
TextRegion(text="Header text", bbox=[100, 50, 300, 80], confidence=0.95, page=1),
|
||||||
|
TextRegion(text="Title of document", bbox=[100, 100, 500, 150], confidence=0.92, page=1),
|
||||||
|
TextRegion(text="First paragraph", bbox=[100, 200, 500, 250], confidence=0.90, page=1),
|
||||||
|
TextRegion(text="Second paragraph", bbox=[100, 300, 500, 350], confidence=0.88, page=1),
|
||||||
|
TextRegion(text="Footer note", bbox=[100, 900, 300, 930], confidence=0.85, page=1),
|
||||||
|
# Low confidence region (should be filtered)
|
||||||
|
TextRegion(text="Noise", bbox=[50, 50, 80, 80], confidence=0.1, page=1),
|
||||||
|
]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def sample_pp_elements(self) -> List[DocumentElement]:
|
||||||
|
"""Create sample PP-StructureV3 elements that cover only some regions."""
|
||||||
|
return [
|
||||||
|
DocumentElement(
|
||||||
|
element_id="pp_1",
|
||||||
|
type=ElementType.TITLE,
|
||||||
|
content="Title of document",
|
||||||
|
bbox=BoundingBox(x0=100, y0=100, x1=500, y1=150),
|
||||||
|
confidence=0.95
|
||||||
|
),
|
||||||
|
DocumentElement(
|
||||||
|
element_id="pp_2",
|
||||||
|
type=ElementType.TEXT,
|
||||||
|
content="First paragraph",
|
||||||
|
bbox=BoundingBox(x0=100, y0=200, x1=500, y1=250),
|
||||||
|
confidence=0.90
|
||||||
|
),
|
||||||
|
# Note: Header, Second paragraph, and Footer are NOT covered
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_service_initialization(self, service: GapFillingService):
|
||||||
|
"""Test service initializes with correct parameters."""
|
||||||
|
assert service.enabled is True
|
||||||
|
assert service.coverage_threshold == 0.7
|
||||||
|
assert service.iou_threshold == 0.15
|
||||||
|
assert service.confidence_threshold == 0.3
|
||||||
|
assert service.dedup_iou_threshold == 0.5
|
||||||
|
|
||||||
|
def test_disabled_service(self, disabled_service: GapFillingService):
|
||||||
|
"""Test disabled service does not activate."""
|
||||||
|
regions = [TextRegion(text="Test", bbox=[0, 0, 100, 100], confidence=0.9, page=1)]
|
||||||
|
elements = []
|
||||||
|
|
||||||
|
should_activate, coverage = disabled_service.should_activate(regions, elements)
|
||||||
|
assert should_activate is False
|
||||||
|
assert coverage == 1.0
|
||||||
|
|
||||||
|
def test_should_activate_low_coverage(
|
||||||
|
self,
|
||||||
|
service: GapFillingService,
|
||||||
|
sample_raw_regions: List[TextRegion],
|
||||||
|
sample_pp_elements: List[DocumentElement]
|
||||||
|
):
|
||||||
|
"""Test activation when coverage is below threshold."""
|
||||||
|
# Filter out low confidence regions
|
||||||
|
valid_regions = [r for r in sample_raw_regions if r.confidence >= 0.3]
|
||||||
|
|
||||||
|
should_activate, coverage = service.should_activate(valid_regions, sample_pp_elements)
|
||||||
|
|
||||||
|
# Only 2 out of 5 valid regions are covered (Title, First paragraph)
|
||||||
|
assert should_activate is True
|
||||||
|
assert coverage < 0.7 # Below threshold
|
||||||
|
|
||||||
|
def test_should_not_activate_high_coverage(self, service: GapFillingService):
|
||||||
|
"""Test no activation when coverage is above threshold."""
|
||||||
|
# All regions covered
|
||||||
|
regions = [
|
||||||
|
TextRegion(text="Text 1", bbox=[100, 100, 200, 150], confidence=0.9, page=1),
|
||||||
|
TextRegion(text="Text 2", bbox=[100, 200, 200, 250], confidence=0.9, page=1),
|
||||||
|
]
|
||||||
|
|
||||||
|
elements = [
|
||||||
|
DocumentElement(
|
||||||
|
element_id="pp_1",
|
||||||
|
type=ElementType.TEXT,
|
||||||
|
content="Text 1",
|
||||||
|
bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200), # Covers first region
|
||||||
|
confidence=0.95
|
||||||
|
),
|
||||||
|
DocumentElement(
|
||||||
|
element_id="pp_2",
|
||||||
|
type=ElementType.TEXT,
|
||||||
|
content="Text 2",
|
||||||
|
bbox=BoundingBox(x0=50, y0=180, x1=250, y1=300), # Covers second region
|
||||||
|
confidence=0.95
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
should_activate, coverage = service.should_activate(regions, elements)
|
||||||
|
|
||||||
|
assert should_activate is False
|
||||||
|
assert coverage >= 0.7
|
||||||
|
|
||||||
|
def test_find_uncovered_regions(
|
||||||
|
self,
|
||||||
|
service: GapFillingService,
|
||||||
|
sample_raw_regions: List[TextRegion],
|
||||||
|
sample_pp_elements: List[DocumentElement]
|
||||||
|
):
|
||||||
|
"""Test finding uncovered regions."""
|
||||||
|
uncovered = service.find_uncovered_regions(sample_raw_regions, sample_pp_elements)
|
||||||
|
|
||||||
|
# Should find Header, Second paragraph, Footer (not Title, First paragraph, or low-confidence Noise)
|
||||||
|
assert len(uncovered) == 3
|
||||||
|
|
||||||
|
uncovered_texts = [r.text for r in uncovered]
|
||||||
|
assert "Header text" in uncovered_texts
|
||||||
|
assert "Second paragraph" in uncovered_texts
|
||||||
|
assert "Footer note" in uncovered_texts
|
||||||
|
assert "Title of document" not in uncovered_texts # Covered
|
||||||
|
assert "First paragraph" not in uncovered_texts # Covered
|
||||||
|
assert "Noise" not in uncovered_texts # Low confidence
|
||||||
|
|
||||||
|
def test_coverage_by_center_point(self, service: GapFillingService):
|
||||||
|
"""Test coverage detection via center point."""
|
||||||
|
region = TextRegion(text="Test", bbox=[150, 150, 250, 200], confidence=0.9, page=1)
|
||||||
|
|
||||||
|
element = DocumentElement(
|
||||||
|
element_id="pp_1",
|
||||||
|
type=ElementType.TEXT,
|
||||||
|
content="Container",
|
||||||
|
bbox=BoundingBox(x0=100, y0=100, x1=300, y1=250), # Contains region's center
|
||||||
|
confidence=0.95
|
||||||
|
)
|
||||||
|
|
||||||
|
is_covered = service._is_region_covered(region, [element])
|
||||||
|
assert is_covered is True
|
||||||
|
|
||||||
|
def test_coverage_by_iou(self, service: GapFillingService):
|
||||||
|
"""Test coverage detection via IoU threshold."""
|
||||||
|
region = TextRegion(text="Test", bbox=[100, 100, 200, 150], confidence=0.9, page=1)
|
||||||
|
|
||||||
|
element = DocumentElement(
|
||||||
|
element_id="pp_1",
|
||||||
|
type=ElementType.TEXT,
|
||||||
|
content="Overlap",
|
||||||
|
bbox=BoundingBox(x0=150, y0=100, x1=250, y1=150), # Partial overlap
|
||||||
|
confidence=0.95
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate expected IoU
|
||||||
|
# Intersection: (150-200) x (100-150) = 50 x 50 = 2500
|
||||||
|
# Union: 100x50 + 100x50 - 2500 = 7500
|
||||||
|
# IoU = 2500/7500 = 0.33 > 0.15 threshold
|
||||||
|
|
||||||
|
is_covered = service._is_region_covered(region, [element])
|
||||||
|
assert is_covered is True
|
||||||
|
|
||||||
|
def test_deduplication(
    self,
    service: GapFillingService,
    sample_pp_elements: List[DocumentElement]
):
    """Check that regions heavily overlapping PP-Structure elements are dropped."""
    # Overlaps pp_2 (First paragraph) almost entirely.
    duplicate = TextRegion(
        text="First paragraph variant",
        bbox=[100, 200, 500, 250],
        confidence=0.9,
        page=1,
    )
    # Does not overlap any sample element.
    unique = TextRegion(
        text="Unique region",
        bbox=[100, 500, 300, 550],
        confidence=0.9,
        page=1,
    )

    survivors = service.deduplicate_regions([duplicate, unique], sample_pp_elements)

    assert len(survivors) == 1
    assert survivors[0].text == "Unique region"
|
||||||
|
|
||||||
|
def test_convert_regions_to_elements(self, service: GapFillingService):
    """Check that TextRegions become gap-fill DocumentElements with matching fields."""
    source_regions = [
        TextRegion(text="Test text 1", bbox=[100, 100, 200, 150], confidence=0.85, page=1),
        TextRegion(text="Test text 2", bbox=[100, 200, 200, 250], confidence=0.90, page=1),
    ]

    converted = service.convert_regions_to_elements(
        source_regions,
        page_number=1,
        start_element_id=0,
    )

    assert len(converted) == 2

    first = converted[0]
    assert first.element_id == "gap_fill_1_0"
    assert first.type == ElementType.TEXT
    assert first.content == "Test text 1"
    assert first.confidence == 0.85
    assert first.metadata.get('source') == 'gap_filling'

    second = converted[1]
    assert second.element_id == "gap_fill_1_1"
    assert second.content == "Test text 2"
|
||||||
|
|
||||||
|
def test_recalculate_reading_order(self, service: GapFillingService):
    """Check that reading order lists element indices from top to bottom."""
    # (element_id, content, y0) deliberately given out of vertical order.
    layout = [
        ("e3", "Bottom", 300),
        ("e1", "Top", 100),
        ("e2", "Middle", 200),
    ]
    elements = [
        DocumentElement(
            element_id=element_id,
            type=ElementType.TEXT,
            content=label,
            bbox=BoundingBox(x0=100, y0=top, x1=200, y1=top + 50),
            confidence=0.9,
        )
        for element_id, label, top in layout
    ]

    order = service.recalculate_reading_order(elements)

    # Sorted by y0: Top (index 1), Middle (index 2), Bottom (index 0).
    assert order == [1, 2, 0]
|
||||||
|
|
||||||
|
def test_fill_gaps_integration(
    self,
    service: GapFillingService,
):
    """Run fill_gaps end to end when PP-Structure misses most OCR regions."""
    # (text, bbox, confidence) per raw OCR hit; expanded below into the
    # dict format received from the OCR service.
    ocr_hits = [
        ('Header', [100, 50, 300, 80], 0.95),
        ('Title', [100, 100, 500, 150], 0.92),
        ('Paragraph 1', [100, 200, 500, 250], 0.90),
        ('Paragraph 2', [100, 300, 500, 350], 0.88),
        ('Paragraph 3', [100, 400, 500, 450], 0.86),
        ('Footer', [100, 900, 300, 930], 0.85),
    ]
    raw_regions = [
        {'text': text, 'bbox': box, 'confidence': score, 'page': 1}
        for text, box, score in ocr_hits
    ]

    # PP-StructureV3 found only the Title: 1 of 6 regions (16.7% coverage).
    title_element = DocumentElement(
        element_id="pp_1",
        type=ElementType.TITLE,
        content="Title",
        bbox=BoundingBox(x0=100, y0=100, x1=500, y1=150),
        confidence=0.95,
    )

    supplemented, stats = service.fill_gaps(
        raw_ocr_regions=raw_regions,
        pp_structure_elements=[title_element],
        page_number=1,
    )

    # Gap filling must activate and add the five uncovered regions:
    # Header, Paragraph 1, 2, 3 and Footer.
    assert stats['activated'] is True
    assert stats['coverage_ratio'] < 0.7
    assert len(supplemented) == 5
|
||||||
|
|
||||||
|
def test_fill_gaps_no_activation_when_coverage_high(self, service: GapFillingService):
    """Check that fill_gaps stays inactive when every OCR region is covered."""
    only_region = {
        'text': 'Text 1',
        'bbox': [100, 100, 200, 150],
        'confidence': 0.9,
        'page': 1,
    }
    # This element's bbox fully encloses the single OCR region.
    enclosing_element = DocumentElement(
        element_id="pp_1",
        type=ElementType.TEXT,
        content="Text 1",
        bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200),
        confidence=0.95,
    )

    supplemented, stats = service.fill_gaps(
        raw_ocr_regions=[only_region],
        pp_structure_elements=[enclosing_element],
        page_number=1,
    )

    assert stats['activated'] is False
    assert len(supplemented) == 0
|
||||||
|
|
||||||
|
def test_skip_element_types_not_supplemented(self, service: GapFillingService):
    """Check that OCR text lying under a TABLE element is not supplemented."""
    cell_text = {
        'text': 'Table cell text',
        'bbox': [100, 100, 200, 150],
        'confidence': 0.9,
        'page': 1,
    }
    # PP-StructureV3 reported a table whose bbox encloses the OCR region.
    table = DocumentElement(
        element_id="pp_1",
        type=ElementType.TABLE,
        content="<table>...</table>",
        bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200),
        confidence=0.95,
    )

    supplemented, stats = service.fill_gaps(
        raw_ocr_regions=[cell_text],
        pp_structure_elements=[table],
        page_number=1,
    )

    # The table counts as covering the region, so nothing is added.
    assert len(supplemented) == 0
|
||||||
|
|
||||||
|
def test_coordinate_scaling(self, service: GapFillingService):
    """Test coordinate alignment with different dimensions.

    OCR coordinates are given at twice the PP-Structure resolution. After
    scaling, the single OCR region coincides with the PP-Structure element,
    so coverage must be high AND nothing may be supplemented.
    """
    # OCR was done at 2000x3000, PP-Structure at 1000x1500
    ocr_dimensions = {'width': 2000, 'height': 3000}
    pp_dimensions = Dimensions(width=1000, height=1500)

    raw_regions = [
        # At OCR scale: (200, 300) to (400, 450) -> at PP scale: (100, 150) to (200, 225)
        {'text': 'Scaled text', 'bbox': [200, 300, 400, 450], 'confidence': 0.9, 'page': 1},
    ]

    pp_elements = [
        DocumentElement(
            element_id="pp_1",
            type=ElementType.TEXT,
            content="Scaled text",
            bbox=BoundingBox(x0=100, y0=150, x1=200, y1=225),  # Should cover after scaling
            confidence=0.95
        ),
    ]

    supplemented, stats = service.fill_gaps(
        raw_ocr_regions=raw_regions,
        pp_structure_elements=pp_elements,
        page_number=1,
        ocr_dimensions=ocr_dimensions,
        pp_dimensions=pp_dimensions
    )

    # After scaling, the region should be covered. Both conditions are
    # asserted separately: the earlier `A or B` form passed when only one
    # of them held and hid which one had failed.
    assert stats['coverage_ratio'] >= 0.7
    assert len(supplemented) == 0
|
||||||
|
|
||||||
|
def test_iou_calculation(self, service: GapFillingService):
    """Check _calculate_iou for identical, disjoint and half-shifted boxes."""
    unit_box = (0, 0, 100, 100)

    # Identical boxes overlap completely.
    same_box = (0, 0, 100, 100)
    assert service._calculate_iou(unit_box, same_box) == 1.0

    # Disjoint boxes share no area.
    distant_box = (200, 200, 300, 300)
    assert service._calculate_iou(unit_box, distant_box) == 0.0

    # Box shifted right by half its width:
    #   intersection = 50 x 100 = 5000
    #   union        = 10000 + 10000 - 5000 = 15000
    #   IoU          = 5000 / 15000 = 0.333...
    shifted_box = (50, 0, 150, 100)
    measured = service._calculate_iou(unit_box, shifted_box)
    assert abs(measured - 1/3) < 0.01
|
||||||
|
|
||||||
|
def test_point_in_bbox(self, service: GapFillingService):
    """Check _point_in_bbox for interior, corner and exterior points."""
    box = (100, 100, 200, 200)

    # Interior point, then both corners (edges count as inside).
    for px, py in [(150, 150), (100, 100), (200, 200)]:
        assert service._point_in_bbox(px, py, box) is True

    # Points left of and right of the box.
    for px, py in [(50, 150), (250, 150)]:
        assert service._point_in_bbox(px, py, box) is False
|
||||||
|
|
||||||
|
def test_merge_adjacent_regions(self, service: GapFillingService):
    """Check that horizontally adjacent regions merge and distant ones do not."""
    hello = TextRegion(text="Hello", bbox=[100, 100, 150, 130], confidence=0.9, page=1)
    # Starts 10px right of "Hello" on the same line: adjacent.
    world = TextRegion(text="World", bbox=[160, 100, 210, 130], confidence=0.85, page=1)
    # 200px further down the page: not adjacent.
    far_away = TextRegion(text="Far away", bbox=[100, 300, 200, 330], confidence=0.9, page=1)

    merged = service.merge_adjacent_regions(
        [hello, world, far_away],
        max_horizontal_gap=20,
        max_vertical_gap=10,
    )

    assert len(merged) == 2
    # The first two inputs collapse into one region holding both words.
    combined_text = merged[0].text
    assert "Hello" in combined_text
    assert "World" in combined_text
    assert merged[1].text == "Far away"
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextRegion:
    """Tests for the bbox normalization and center helpers of TextRegion."""

    def test_normalized_bbox_4_values(self):
        """A 4-value bbox is returned as the same (x0, y0, x1, y1) tuple."""
        rect_region = TextRegion(text="Test", bbox=[100, 200, 300, 400], confidence=0.9, page=1)
        assert rect_region.normalized_bbox == (100, 200, 300, 400)

    def test_normalized_bbox_polygon_flat(self):
        """A flat 8-value polygon is reduced to its bounding rectangle."""
        # Four corner points flattened as [x1, y1, x2, y2, x3, y3, x4, y4].
        flat_polygon = [100, 200, 300, 200, 300, 400, 100, 400]
        poly_region = TextRegion(text="Test", bbox=flat_polygon, confidence=0.9, page=1)
        assert poly_region.normalized_bbox == (100, 200, 300, 400)

    def test_normalized_bbox_polygon_nested(self):
        """A nested polygon (PaddleOCR format) is reduced to its bounding rectangle."""
        # PaddleOCR format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
        corner_points = [[100, 200], [300, 200], [300, 400], [100, 400]]
        poly_region = TextRegion(text="Test", bbox=corner_points, confidence=0.9, page=1)
        assert poly_region.normalized_bbox == (100, 200, 300, 400)

    def test_normalized_bbox_numpy_polygon(self):
        """A nested polygon with float coordinates keeps its float values."""
        # PaddleOCR sometimes returns numpy arrays converted to lists.
        float_points = [[100.5, 200.5], [300.5, 200.5], [300.5, 400.5], [100.5, 400.5]]
        poly_region = TextRegion(text="Test", bbox=float_points, confidence=0.9, page=1)

        normalized = poly_region.normalized_bbox
        for index, expected in enumerate((100.5, 200.5, 300.5, 400.5)):
            assert normalized[index] == expected

    def test_center_calculation(self):
        """The center of a 4-value bbox is the midpoint of its corners."""
        rect_region = TextRegion(text="Test", bbox=[100, 200, 300, 400], confidence=0.9, page=1)
        assert rect_region.center == (200, 300)

    def test_center_calculation_nested_bbox(self):
        """The center of a nested polygon bbox matches its bounding rectangle's midpoint."""
        corner_points = [[100, 200], [300, 200], [300, 400], [100, 400]]
        poly_region = TextRegion(text="Test", bbox=corner_points, confidence=0.9, page=1)
        assert poly_region.center == (200, 300)
|
||||||
|
|
||||||
|
|
||||||
|
class TestOCRToUnifiedConverterIntegration:
    """Checks how OCRToUnifiedConverter wires up the gap filling service."""

    def test_converter_with_gap_filling_enabled(self):
        """With enable_gap_filling=True the converter holds a gap filling service."""
        # Imported inside the test, as in the sibling test below.
        from app.services.ocr_to_unified_converter import OCRToUnifiedConverter

        enabled_converter = OCRToUnifiedConverter(enable_gap_filling=True)

        assert enabled_converter.gap_filling_service is not None

    def test_converter_with_gap_filling_disabled(self):
        """With enable_gap_filling=False the converter has no gap filling service."""
        from app.services.ocr_to_unified_converter import OCRToUnifiedConverter

        disabled_converter = OCRToUnifiedConverter(enable_gap_filling=False)

        assert disabled_converter.gap_filling_service is None
|
||||||
31
docs/commit-history-report.md
Normal file
31
docs/commit-history-report.md
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
# Tool_OCR Commit History Review (2025-11-12 ~ 2025-11-26)
|
||||||
|
|
||||||
|
本報告依 `git log` 全量 97 筆提交整理,涵蓋開發脈絡、里程碑、測試/品質信號與後續風險。提交類型統計:35 `feat` / 37 `fix` / 9 `chore` / 5 `test` / 4 `docs` / 2 `refactor`,主要集中於 2025-11-18、11-19、11-20 與 11-24 的密集開發。
|
||||||
|
|
||||||
|
## 時間軸與里程碑
|
||||||
|
- **前期基礎與前端現代化 (11-12~11-13)**:`21bc2f9`, `57cf912` 將前端改為 Tailwind v4 + 專業 UI,`0f81d5e` 單容器 Docker 化、`d7e6473` 建立 WSL Ubuntu 開發環境。
|
||||||
|
- **GPU 加速與相容性 (11-14)**:`6452797` 提案 + `7536f43` 實作 GPU OCR,`d80d60f`/`3694411`/`80c091b` 修正 Paddle 3.x API 與安裝來源,`b048f2d` 暫停圖表識別以避免 API 缺口。
|
||||||
|
- **外部 Auth V2 與管理後台 (11-14~11-16)**:`28e419f`~`fd98018` 完成外部認證 V2、資料表前綴與架構移除 V1;`8f94191` 新增後台/稽核/Token 檢查;`90fca50`/`6bb5b76` 讓 18/18 測試全過。
|
||||||
|
- **V2 UI 串接與初版版面保持 PDF (11-16~11-18)**:前端/後端全面切換 V2 API (`ad5c8be` 之後),`fa1abcd` 版面保持 PDF + 多次座標/重疊修正 (`d33f605`~`0edc56b`),強化 logging (`d99d37d`)。
|
||||||
|
- **雙軌處理架構 (11-18~11-20)**:`2d50c12` + `82139c8` 導入 OCR/Direct 雙軌與 UnifiedDocument;`a3a6fbe`/`ab89a40`/`ecdce96` 完成轉換、JSON 匯出與 PDF 支援;`1d0b638` 後端 API,`c2288ba` 前端支援,`c50a5e9` 單元/整合測試;`0974fc3` E2E 修復,`ef335cf` Office 直抽,`b997f93`/`9f449e8` GPU 記憶體管理與文件化,`2ecd022` E2E 測試完成。
|
||||||
|
- **PDF 版面復原計畫 (11-20 提案,11-24 實作高峰)**:`cf894b0` 提案後,`0aff468` Phase1 圖片/表格修復,`3fc32bc` Phase2 風格保存,`77fe4cc`/`ad879d4`/`75c194f` 等完成 Alignment、List、Span 級渲染與多欄位;一系列 `93bd9f5`~`3358d97` 針對位置/重疊/缺圖修正,`4325d02` 專案清理並封存提案。
|
||||||
|
- **PP-Structure V3 調校 (11-25)**:`a659e7a` 改善複雜圖示結構保留,`2312b4c` 前端可調 `pp_structure` 參數 + 測試,`0999898` 多頁 PDF 座標校正。
|
||||||
|
- **記憶體管理與混合抽圖 (11-25~11-26)**:`ba8ddf2` 提案,`1afdb82` 混合圖片抽取+記憶體管理落地,`b997f93` 系列 GPU 釋放/可選 torch,引入 ModelManager、ServicePool、MemoryGuard(詳見 `openspec/changes/archive/2025-11-26-enhance-memory-management`);`a227311` 封存提案但僅完成 75/80 任務(剩餘文件化);隨後多筆修復(`79cffe6`~`fa9b542`)處理 PDF 回歸與文字渲染,`6e050eb` 為最新 OCR 軌表格格式/裁剪修正。
|
||||||
|
|
||||||
|
## 品質與測試信號
|
||||||
|
- 11-16 完成 V2 API 測試 18/18 (`6bb5b76`),建立初步信心。
|
||||||
|
- 雙軌導入時新增單元/整合/E2E 測試 (`0fcb249`, `c50a5e9`, `2ecd022`),但後續 PDF 版面復原大量依賴人工驗證,Phase 4 測試仍未完成(見下)。
|
||||||
|
- 記憶體管理變更伴隨 57+18+10 測試檔(任務 8.1 完成),但文件化缺失可能影響交接與調參。
|
||||||
|
- 11-24 大量 PDF 修復連續提交顯示迭代式修 bug,建議增加回歸測試覆蓋(特別是表格/多欄/列表與跨軌道 PDF)。
|
||||||
|
|
||||||
|
## 未盡事項與風險
|
||||||
|
- **記憶體管理文件化缺口**:`openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md` 未完成 Section 8.2(架構說明、調校指南、疑難排解、監控、遷移指南),可能影響部署可操作性。
|
||||||
|
- **PDF 版面復原驗證不足**:同一變更的 Phase 4 測試/效能/文件與多類文件驗證均未勾選,現階段品質依賴手測。
|
||||||
|
- **近期修正集中於 PDF 與表格**(`79cffe6`, `5c561f4`, `19bd5fd`, `fa9b542`, `6e050eb`),顯示 Direct/OCR 軌 PDF 路徑仍脆弱;缺乏自動化回歸測試,容易再次出現回歸問題。
|
||||||
|
- **主分支狀態**:`main` 比 `origin/main` 超前 1 提交(`6e050eb`),請推送前確認 CI/測試。
|
||||||
|
|
||||||
|
## 建議後續行動
|
||||||
|
1) 完成記憶體管理文件(架構、調參、故障排除、Prometheus 監控指南)並加入 sanity check。
|
||||||
|
2) 為 PDF 版面復原建立最小回歸集:多欄文檔、含圖表/表格的 Direct/OCR 軌、列表與 span 混排。
|
||||||
|
3) 圍繞 `processing_track` 分流與 UnifiedDocument/PDF 生成的邊界條件增加測試(LOGO/未知元素、跨頁表格、OCR/Direct 混合圖片)。
|
||||||
|
4) 推送前跑現有單元/整合/E2E 測試,補上近兩週新增場景的腳本以降低回歸風險。
|
||||||
24
docs/project-risk-assessment.md
Normal file
24
docs/project-risk-assessment.md
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Project Risk & Issue Outlook
|
||||||
|
|
||||||
|
本文件整理當前專案的可預見問題、潛在問題與建議修復方向(依風險與可行性排序)。依據來源:`git log`(97 commits, 2025-11-12~11-26)、`docs/architecture-overview.md`、`openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md` 等。
|
||||||
|
|
||||||
|
## 可預見的問題項目
|
||||||
|
- **記憶體管理文件缺口**:`openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md` 的 8.2 文檔未完成,ModelManager/ServicePool/MemoryGuard 的調參與故障處置缺乏 runbook,部署或擴容時易踩坑。方向:補完架構說明、調參指南、故障排解與監控落地範例(Prometheus 指標與警戒值)。
|
||||||
|
- **PDF 生成回歸風險高**:版面保持與表格/圖片渲染在 `fa1abcd` 之後多次修正(例如 `d33f605`→`92e326b`、`108784a`→`3358d97`、`6e050eb`),顯示缺少自動回歸。方向:建立最小回歸集(多欄文本、含圖表/表格、列表/Span 混排)與 golden PDF/JSON 比對,覆蓋 Direct/OCR 雙軌。
|
||||||
|
- **最新 OCR 表格格式修復未經回歸**:`6e050eb` 修正 OCR 軌表格資料格式與裁剪,無對應測試。方向:為 OCR 軌加表格解析/PDF 出圖的整合測試,確保與前端下載/展示一致。
|
||||||
|
- **PP-Structure 參數調校可能影響資源**:前端支援調整 `pp_structure_params`(`2312b4c`),若後端缺乏防護(guard),可能放大 GPU/記憶體壓力。方向:在後端對超參數做白名單與上限檢查,並納入 MemoryGuard 預估。
|
||||||
|
- **Chart 能力啟停策略缺少驗證**:`b048f2d` 禁用 → `7e12f16` 重新啟用;缺少覆蓋率與性能數據。方向:為 chart 模型啟用/關閉建立健康檢查與 A/B 測試數據收集。
|
||||||
|
|
||||||
|
## 潛在的問題項目
|
||||||
|
- **UnifiedDocument 結構漂移風險**:雙軌共用輸出,近期多次調整(列表、Span、多欄、LOGO 元素),缺少結構驗證或 schema 鎖定。可能導致前端/匯出器/PDF 生成不一致。方向:定義 JSON Schema 或 pydantic 驗證,建立 contract 測試。
|
||||||
|
- **服務池與記憶體守護的長時間行為未驗證**:雖有單元/整合測試,缺乏長時間 soak/stress(GPU 記憶碎片、模型 unload/reload、信號處理)。方向:加入 24h soak 測試與記憶體走勢告警,驗證 SIGTERM/SIGINT 清理。
|
||||||
|
- **LibreOffice 轉檔鏈低觀測性**:Office 直抽與轉 PDF (`ef335cf`) 依賴系統 LibreOffice,缺少失敗監控與重試策略。方向:為轉檔階段增加 metrics/告警,並提供 fallback/重試。
|
||||||
|
- **前端/後端 API 契約缺少檢查**:多次 V1→V2 遷移與新增參數(`pp_structure_params` 等),目前僅靠 E2E,缺少型別/契約檢查。方向:加入 OpenAPI 契約測試或生成型別校驗(ts-sdk 對齊 FastAPI schema)。
|
||||||
|
- **混合抽圖/圖片保存路徑邊界**:Direct/OCR 混合抽圖與 `_save_image` 實作曾多次修復,仍缺少對 None/缺檔路徑的防禦。方向:為缺檔/無圖的 PDF 生成加強斷言與 fallback。
|
||||||
|
|
||||||
|
## 建議修復與方向
|
||||||
|
1) **完成記憶體管理文檔與樣板設定**:在 `docs/` 新增 MemoryGuard/ServicePool 調參與故障排除指南,附 `.env` 範例與 Prometheus 規則,對應 tasks 8.2 清單。
|
||||||
|
2) **建立 PDF/UnifiedDocument 回歸套件**:收集代表性樣本(多欄、表格、列表、含圖/LOGO、OCR/Direct 雙軌),產生 golden JSON/PDF,加入 CI 比對,並為 `6e050eb` 相關表格路徑新增測試。
|
||||||
|
3) **加入 UnifiedDocument Schema 驗證**:定義 schema(pydantic/JSON Schema),在匯出/PDF 生成前驗證;同時讓前端型別由 OpenAPI 生成以防 drift。
|
||||||
|
4) **PP-Structure 參數防護與資源估算**:後端實作白名單/上限與 MemoryGuard 預估,避免前端自由調參造成 GPU OOM;增加拒絕/降級回饋。
|
||||||
|
5) **長時間穩定性與轉檔可觀測性**:增加 soak/stress pipeline,追蹤 GPU/CPU/記憶碎片;為 LibreOffice/轉檔階段加 metrics、重試與錯誤分類告警。
|
||||||
110
frontend/src/components/LayoutModelSelector.tsx
Normal file
110
frontend/src/components/LayoutModelSelector.tsx
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
import { cn } from '@/lib/utils'
|
||||||
|
import { Check, FileText, Globe, BookOpen } from 'lucide-react'
|
||||||
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import type { LayoutModel } from '@/types/apiV2'
|
||||||
|
|
||||||
|
/** Props accepted by LayoutModelSelector. */
interface LayoutModelSelectorProps {
  /** Currently selected layout model. */
  value: LayoutModel
  /** Invoked with the model whose option button was clicked. */
  onChange: (model: LayoutModel) => void
  /** When true, every option button is disabled. Defaults to false. */
  disabled?: boolean
  /** Extra class names merged onto the root container. */
  className?: string
}

/** Icon rendered next to each layout model option. */
const MODEL_ICONS: Record<LayoutModel, React.ReactNode> = {
  chinese: <FileText className="w-5 h-5" />,
  default: <Globe className="w-5 h-5" />,
  cdla: <BookOpen className="w-5 h-5" />,
}

/**
 * Options in display order. Hoisted to module scope so the array is not
 * reallocated on every render.
 */
const LAYOUT_MODELS: readonly LayoutModel[] = ['chinese', 'default', 'cdla']

/**
 * Single-choice selector for the layout detection model.
 *
 * Renders one button per model with a translated label and description.
 * The `chinese` option additionally shows a "recommended" badge. Labels are
 * looked up under the `processing.layoutModel.*` translation keys.
 */
export default function LayoutModelSelector({
  value,
  onChange,
  disabled = false,
  className,
}: LayoutModelSelectorProps) {
  const { t } = useTranslation()

  // Translation keys are derived from the model id: `<model>` and `<model>Desc`.
  const getModelInfo = (model: LayoutModel) => ({
    label: t(`processing.layoutModel.${model}`),
    description: t(`processing.layoutModel.${model}Desc`),
  })

  return (
    <div className={cn('border rounded-lg p-4 bg-white', className)}>
      {/* Header */}
      <div className="flex items-center gap-2 mb-4">
        <FileText className="w-5 h-5 text-gray-600" />
        <h3 className="text-lg font-semibold text-gray-900">{t('processing.layoutModel.title')}</h3>
      </div>

      {/* Model Options */}
      <div className="space-y-3">
        {LAYOUT_MODELS.map((model) => {
          const info = getModelInfo(model)
          const isSelected = value === model

          return (
            <button
              key={model}
              type="button"
              disabled={disabled}
              // Expose the selected state to assistive technology; without
              // this the selection is conveyed by colour and icon only.
              aria-pressed={isSelected}
              onClick={() => onChange(model)}
              className={cn(
                'w-full flex items-start gap-4 p-4 rounded-lg border-2 transition-all text-left',
                isSelected
                  ? 'border-blue-500 bg-blue-50'
                  : 'border-gray-200 hover:border-gray-300 hover:bg-gray-50',
                disabled && 'opacity-50 cursor-not-allowed'
              )}
            >
              {/* Icon */}
              <div
                className={cn(
                  'p-2 rounded-lg flex-shrink-0',
                  isSelected ? 'bg-blue-100 text-blue-600' : 'bg-gray-100 text-gray-500'
                )}
              >
                {MODEL_ICONS[model]}
              </div>

              {/* Content */}
              <div className="flex-1 min-w-0">
                <div className="flex items-center gap-2">
                  <span
                    className={cn(
                      'font-medium',
                      isSelected ? 'text-blue-700' : 'text-gray-900'
                    )}
                  >
                    {info.label}
                  </span>
                  {model === 'chinese' && (
                    <span className="text-xs bg-green-100 text-green-700 px-2 py-0.5 rounded-full">
                      {t('processing.layoutModel.recommended')}
                    </span>
                  )}
                </div>
                <p className="text-sm text-gray-500 mt-1">{info.description}</p>
              </div>

              {/* Check mark */}
              {isSelected && (
                <div className="flex-shrink-0">
                  <Check className="w-5 h-5 text-blue-600" />
                </div>
              )}
            </button>
          )
        })}
      </div>

      {/* Info Note */}
      <div className="mt-4 p-3 bg-blue-50 border border-blue-200 rounded-md">
        <p className="text-sm text-blue-800">
          {t('processing.layoutModel.note')}
        </p>
      </div>
    </div>
  )
}
|
||||||
@@ -1,408 +0,0 @@
|
|||||||
import { useState, useEffect } from 'react'
|
|
||||||
import { Settings, RotateCcw, HelpCircle, Save, Upload, Download, Check, AlertCircle } from 'lucide-react'
|
|
||||||
import { cn } from '@/lib/utils'
|
|
||||||
import type { PPStructureV3Params } from '@/types/apiV2'
|
|
||||||
|
|
||||||
const STORAGE_KEY = 'pp_structure_params_presets'
|
|
||||||
const LAST_USED_KEY = 'pp_structure_params_last_used'
|
|
||||||
|
|
||||||
interface PPStructureParamsProps {
|
|
||||||
value: PPStructureV3Params
|
|
||||||
onChange: (params: PPStructureV3Params) => void
|
|
||||||
disabled?: boolean
|
|
||||||
className?: string
|
|
||||||
}
|
|
||||||
|
|
||||||
interface ParamConfig {
|
|
||||||
key: keyof PPStructureV3Params
|
|
||||||
label: string
|
|
||||||
description: string
|
|
||||||
min: number
|
|
||||||
max: number
|
|
||||||
step: number
|
|
||||||
default: number
|
|
||||||
type: 'slider'
|
|
||||||
}
|
|
||||||
|
|
||||||
interface SelectParamConfig {
|
|
||||||
key: keyof PPStructureV3Params
|
|
||||||
label: string
|
|
||||||
description: string
|
|
||||||
options: Array<{ value: string; label: string }>
|
|
||||||
default: string
|
|
||||||
type: 'select'
|
|
||||||
}
|
|
||||||
|
|
||||||
// Preset configurations
|
|
||||||
const PRESETS = {
|
|
||||||
default: {} as PPStructureV3Params,
|
|
||||||
'high-quality': {
|
|
||||||
layout_detection_threshold: 0.1,
|
|
||||||
layout_nms_threshold: 0.15,
|
|
||||||
text_det_thresh: 0.1,
|
|
||||||
text_det_box_thresh: 0.2,
|
|
||||||
layout_merge_bboxes_mode: 'small' as const,
|
|
||||||
} as PPStructureV3Params,
|
|
||||||
fast: {
|
|
||||||
layout_detection_threshold: 0.3,
|
|
||||||
layout_nms_threshold: 0.3,
|
|
||||||
text_det_thresh: 0.3,
|
|
||||||
text_det_box_thresh: 0.4,
|
|
||||||
layout_merge_bboxes_mode: 'large' as const,
|
|
||||||
} as PPStructureV3Params,
|
|
||||||
}
|
|
||||||
|
|
||||||
const PARAM_CONFIGS: Array<ParamConfig | SelectParamConfig> = [
|
|
||||||
{
|
|
||||||
key: 'layout_detection_threshold',
|
|
||||||
label: 'Layout Detection Threshold',
|
|
||||||
description: 'Lower = detect more blocks (including weak signals), Higher = only high-confidence blocks',
|
|
||||||
min: 0,
|
|
||||||
max: 1,
|
|
||||||
step: 0.05,
|
|
||||||
default: 0.2,
|
|
||||||
type: 'slider' as const,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: 'layout_nms_threshold',
|
|
||||||
label: 'Layout NMS Threshold',
|
|
||||||
description: 'Lower = aggressive overlap removal, Higher = allow more overlapping boxes',
|
|
||||||
min: 0,
|
|
||||||
max: 1,
|
|
||||||
step: 0.05,
|
|
||||||
default: 0.2,
|
|
||||||
type: 'slider' as const,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: 'layout_merge_bboxes_mode',
|
|
||||||
label: 'Layout Merge Mode',
|
|
||||||
description: 'Bounding box merging strategy',
|
|
||||||
options: [
|
|
||||||
{ value: 'small', label: 'Small (Conservative)' },
|
|
||||||
{ value: 'union', label: 'Union (Balanced)' },
|
|
||||||
{ value: 'large', label: 'Large (Aggressive)' },
|
|
||||||
],
|
|
||||||
default: 'small',
|
|
||||||
type: 'select' as const,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: 'layout_unclip_ratio',
|
|
||||||
label: 'Layout Unclip Ratio',
|
|
||||||
description: 'Larger = looser bounding boxes, Smaller = tighter bounding boxes',
|
|
||||||
min: 0.5,
|
|
||||||
max: 3.0,
|
|
||||||
step: 0.1,
|
|
||||||
default: 1.2,
|
|
||||||
type: 'slider' as const,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: 'text_det_thresh',
|
|
||||||
label: 'Text Detection Threshold',
|
|
||||||
description: 'Lower = detect more small/low-contrast text, Higher = cleaner but may miss text',
|
|
||||||
min: 0,
|
|
||||||
max: 1,
|
|
||||||
step: 0.05,
|
|
||||||
default: 0.2,
|
|
||||||
type: 'slider' as const,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: 'text_det_box_thresh',
|
|
||||||
label: 'Text Box Threshold',
|
|
||||||
description: 'Lower = more text boxes retained, Higher = fewer false positives',
|
|
||||||
min: 0,
|
|
||||||
max: 1,
|
|
||||||
step: 0.05,
|
|
||||||
default: 0.3,
|
|
||||||
type: 'slider' as const,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
key: 'text_det_unclip_ratio',
|
|
||||||
label: 'Text Unclip Ratio',
|
|
||||||
description: 'Larger = looser text boxes, Smaller = tighter text boxes',
|
|
||||||
min: 0.5,
|
|
||||||
max: 3.0,
|
|
||||||
step: 0.1,
|
|
||||||
default: 1.2,
|
|
||||||
type: 'slider' as const,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
export default function PPStructureParams({
|
|
||||||
value,
|
|
||||||
onChange,
|
|
||||||
disabled = false,
|
|
||||||
className,
|
|
||||||
}: PPStructureParamsProps) {
|
|
||||||
const [showTooltip, setShowTooltip] = useState<string | null>(null)
|
|
||||||
const [isExpanded, setIsExpanded] = useState(false)
|
|
||||||
const [selectedPreset, setSelectedPreset] = useState<string>('custom')
|
|
||||||
const [showSaveSuccess, setShowSaveSuccess] = useState(false)
|
|
||||||
|
|
||||||
// Load last used parameters on mount
|
|
||||||
useEffect(() => {
|
|
||||||
try {
|
|
||||||
const lastUsed = localStorage.getItem(LAST_USED_KEY)
|
|
||||||
if (lastUsed && Object.keys(value).length === 0) {
|
|
||||||
const params = JSON.parse(lastUsed)
|
|
||||||
onChange(params)
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to load last used parameters:', error)
|
|
||||||
}
|
|
||||||
}, [])
|
|
||||||
|
|
||||||
// Save to localStorage when parameters change
|
|
||||||
useEffect(() => {
|
|
||||||
if (Object.keys(value).length > 0) {
|
|
||||||
try {
|
|
||||||
localStorage.setItem(LAST_USED_KEY, JSON.stringify(value))
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to save parameters:', error)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}, [value])
|
|
||||||
|
|
||||||
const handleReset = () => {
|
|
||||||
onChange({})
|
|
||||||
setSelectedPreset('default')
|
|
||||||
setShowSaveSuccess(false)
|
|
||||||
}
|
|
||||||
|
|
||||||
const handlePresetChange = (presetKey: string) => {
|
|
||||||
setSelectedPreset(presetKey)
|
|
||||||
if (presetKey === 'custom') return
|
|
||||||
|
|
||||||
const preset = PRESETS[presetKey as keyof typeof PRESETS]
|
|
||||||
if (preset) {
|
|
||||||
onChange(preset)
|
|
||||||
setShowSaveSuccess(false)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const handleChange = (key: keyof PPStructureV3Params, newValue: any) => {
|
|
||||||
const newParams = {
|
|
||||||
...value,
|
|
||||||
[key]: newValue,
|
|
||||||
}
|
|
||||||
onChange(newParams)
|
|
||||||
setSelectedPreset('custom')
|
|
||||||
}
|
|
||||||
|
|
||||||
const handleExport = () => {
|
|
||||||
const dataStr = JSON.stringify(value, null, 2)
|
|
||||||
const dataUri = 'data:application/json;charset=utf-8,' + encodeURIComponent(dataStr)
|
|
||||||
const exportFileDefaultName = 'pp_structure_params.json'
|
|
||||||
|
|
||||||
const linkElement = document.createElement('a')
|
|
||||||
linkElement.setAttribute('href', dataUri)
|
|
||||||
linkElement.setAttribute('download', exportFileDefaultName)
|
|
||||||
linkElement.click()
|
|
||||||
}
|
|
||||||
|
|
||||||
const handleImport = () => {
|
|
||||||
const input = document.createElement('input')
|
|
||||||
input.type = 'file'
|
|
||||||
input.accept = 'application/json'
|
|
||||||
input.onchange = (e) => {
|
|
||||||
const file = (e.target as HTMLInputElement).files?.[0]
|
|
||||||
if (file) {
|
|
||||||
const reader = new FileReader()
|
|
||||||
reader.onload = (event) => {
|
|
||||||
try {
|
|
||||||
const params = JSON.parse(event.target?.result as string)
|
|
||||||
onChange(params)
|
|
||||||
setSelectedPreset('custom')
|
|
||||||
setShowSaveSuccess(true)
|
|
||||||
setTimeout(() => setShowSaveSuccess(false), 3000)
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to import parameters:', error)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
reader.readAsText(file)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
input.click()
|
|
||||||
}
|
|
||||||
|
|
||||||
const hasCustomValues = Object.keys(value).length > 0
|
|
||||||
|
|
||||||
return (
|
|
||||||
<div className={cn('border rounded-lg p-4 bg-white', className)}>
|
|
||||||
{/* Header */}
|
|
||||||
<div className="flex items-center justify-between mb-4">
|
|
||||||
<div className="flex items-center gap-2">
|
|
||||||
<Settings className="w-5 h-5 text-gray-600" />
|
|
||||||
<h3 className="text-lg font-semibold text-gray-900">PP-StructureV3 Parameters</h3>
|
|
||||||
{hasCustomValues && (
|
|
||||||
<span className="text-xs bg-blue-100 text-blue-700 px-2 py-1 rounded">Custom</span>
|
|
||||||
)}
|
|
||||||
{showSaveSuccess && (
|
|
||||||
<span className="flex items-center gap-1 text-xs bg-green-100 text-green-700 px-2 py-1 rounded animate-in fade-in">
|
|
||||||
<Check className="w-3 h-3" />
|
|
||||||
Saved
|
|
||||||
</span>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
<div className="flex items-center gap-2">
|
|
||||||
<button
|
|
||||||
type="button"
|
|
||||||
onClick={() => setIsExpanded(!isExpanded)}
|
|
||||||
className="text-sm text-blue-600 hover:text-blue-700 px-3 py-1.5 rounded-md hover:bg-blue-50"
|
|
||||||
>
|
|
||||||
{isExpanded ? 'Hide' : 'Show'} Parameters
|
|
||||||
</button>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{/* Preset Selector & Actions */}
|
|
||||||
{isExpanded && (
|
|
||||||
<div className="mb-4 p-3 bg-gray-50 rounded-md space-y-3">
|
|
||||||
<div className="flex items-center gap-3">
|
|
||||||
<label className="text-sm font-medium text-gray-700">Preset:</label>
|
|
||||||
<select
|
|
||||||
value={selectedPreset}
|
|
||||||
onChange={(e) => handlePresetChange(e.target.value)}
|
|
||||||
disabled={disabled}
|
|
||||||
className="flex-1 px-3 py-1.5 text-sm border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100"
|
|
||||||
>
|
|
||||||
<option value="default">Default (Backend Settings)</option>
|
|
||||||
<option value="high-quality">High Quality (Lower Thresholds)</option>
|
|
||||||
<option value="fast">Fast (Higher Thresholds)</option>
|
|
||||||
<option value="custom">Custom</option>
|
|
||||||
</select>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div className="flex items-center gap-2">
|
|
||||||
<button
|
|
||||||
type="button"
|
|
||||||
onClick={handleReset}
|
|
||||||
disabled={disabled || !hasCustomValues}
|
|
||||||
className={cn(
|
|
||||||
'flex items-center gap-1 px-3 py-1.5 text-sm rounded-md transition-colors',
|
|
||||||
disabled || !hasCustomValues
|
|
||||||
? 'bg-gray-200 text-gray-400 cursor-not-allowed'
|
|
||||||
: 'bg-white border border-gray-300 text-gray-700 hover:bg-gray-50'
|
|
||||||
)}
|
|
||||||
>
|
|
||||||
<RotateCcw className="w-4 h-4" />
|
|
||||||
Reset
|
|
||||||
</button>
|
|
||||||
<button
|
|
||||||
type="button"
|
|
||||||
onClick={handleExport}
|
|
||||||
disabled={disabled || !hasCustomValues}
|
|
||||||
className={cn(
|
|
||||||
'flex items-center gap-1 px-3 py-1.5 text-sm rounded-md transition-colors',
|
|
||||||
disabled || !hasCustomValues
|
|
||||||
? 'bg-gray-200 text-gray-400 cursor-not-allowed'
|
|
||||||
: 'bg-white border border-gray-300 text-gray-700 hover:bg-gray-50'
|
|
||||||
)}
|
|
||||||
>
|
|
||||||
<Download className="w-4 h-4" />
|
|
||||||
Export
|
|
||||||
</button>
|
|
||||||
<button
|
|
||||||
type="button"
|
|
||||||
onClick={handleImport}
|
|
||||||
disabled={disabled}
|
|
||||||
className={cn(
|
|
||||||
'flex items-center gap-1 px-3 py-1.5 text-sm rounded-md transition-colors',
|
|
||||||
disabled
|
|
||||||
? 'bg-gray-200 text-gray-400 cursor-not-allowed'
|
|
||||||
: 'bg-white border border-gray-300 text-gray-700 hover:bg-gray-50'
|
|
||||||
)}
|
|
||||||
>
|
|
||||||
<Upload className="w-4 h-4" />
|
|
||||||
Import
|
|
||||||
</button>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
|
|
||||||
{/* Expanded Parameters */}
|
|
||||||
{isExpanded && (
|
|
||||||
<div className="space-y-6 pt-4 border-t">
|
|
||||||
{PARAM_CONFIGS.map((config) => (
|
|
||||||
<div key={config.key} className="space-y-2">
|
|
||||||
<div className="flex items-center justify-between">
|
|
||||||
<div className="flex items-center gap-2">
|
|
||||||
<label htmlFor={config.key} className="text-sm font-medium text-gray-700">
|
|
||||||
{config.label}
|
|
||||||
</label>
|
|
||||||
<button
|
|
||||||
type="button"
|
|
||||||
onMouseEnter={() => setShowTooltip(config.key)}
|
|
||||||
onMouseLeave={() => setShowTooltip(null)}
|
|
||||||
className="text-gray-400 hover:text-gray-600 relative"
|
|
||||||
>
|
|
||||||
<HelpCircle className="w-4 h-4" />
|
|
||||||
{showTooltip === config.key && (
|
|
||||||
<div className="absolute left-6 top-0 w-64 p-2 bg-gray-900 text-white text-xs rounded shadow-lg z-10">
|
|
||||||
{config.description}
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
</button>
|
|
||||||
</div>
|
|
||||||
{config.type === 'slider' && (
|
|
||||||
<div className="flex items-center gap-2">
|
|
||||||
<span className="text-sm font-semibold text-blue-600">
|
|
||||||
{value[config.key] ?? config.default}
|
|
||||||
</span>
|
|
||||||
{value[config.key] !== undefined && value[config.key] !== config.default && (
|
|
||||||
<span className="text-xs text-gray-500">
|
|
||||||
(default: {config.default})
|
|
||||||
</span>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{config.type === 'slider' ? (
|
|
||||||
<input
|
|
||||||
type="range"
|
|
||||||
id={config.key}
|
|
||||||
min={config.min}
|
|
||||||
max={config.max}
|
|
||||||
step={config.step}
|
|
||||||
value={value[config.key] ?? config.default}
|
|
||||||
onChange={(e) => handleChange(config.key, parseFloat(e.target.value))}
|
|
||||||
disabled={disabled}
|
|
||||||
className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed disabled:opacity-50"
|
|
||||||
/>
|
|
||||||
) : (
|
|
||||||
<select
|
|
||||||
id={config.key}
|
|
||||||
value={(value[config.key] as string) ?? config.default}
|
|
||||||
onChange={(e) => handleChange(config.key, e.target.value)}
|
|
||||||
disabled={disabled}
|
|
||||||
className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
|
|
||||||
>
|
|
||||||
{config.options.map((option) => (
|
|
||||||
<option key={option.value} value={option.value}>
|
|
||||||
{option.label}
|
|
||||||
</option>
|
|
||||||
))}
|
|
||||||
</select>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
))}
|
|
||||||
|
|
||||||
{/* Info Note */}
|
|
||||||
<div className="mt-4 p-3 bg-blue-50 border border-blue-200 rounded-md">
|
|
||||||
<p className="text-sm text-blue-800">
|
|
||||||
<strong>Note:</strong> These parameters only apply when using the OCR track. Adjusting them
|
|
||||||
can help improve accuracy for specific document types.
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
|
|
||||||
{/* Collapsed Summary */}
|
|
||||||
{!isExpanded && hasCustomValues && (
|
|
||||||
<div className="text-sm text-gray-600">
|
|
||||||
{Object.keys(value).length} parameter(s) customized
|
|
||||||
</div>
|
|
||||||
)}
|
|
||||||
</div>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
@@ -52,6 +52,17 @@
|
|||||||
"language": "識別語言",
|
"language": "識別語言",
|
||||||
"threshold": "信心度閾值",
|
"threshold": "信心度閾值",
|
||||||
"layoutDetection": "版面偵測"
|
"layoutDetection": "版面偵測"
|
||||||
|
},
|
||||||
|
"layoutModel": {
|
||||||
|
"title": "版面偵測模型",
|
||||||
|
"chinese": "中文文件模型",
|
||||||
|
"chineseDesc": "PP-DocLayout-S - 適用於中文表單、合約、發票(推薦)",
|
||||||
|
"default": "標準模型",
|
||||||
|
"defaultDesc": "PubLayNet 模型 - 適用於英文學術論文、報告",
|
||||||
|
"cdla": "CDLA 模型",
|
||||||
|
"cdlaDesc": "專用中文版面分析模型 - 適用於複雜中文版面",
|
||||||
|
"recommended": "推薦",
|
||||||
|
"note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"results": {
|
"results": {
|
||||||
|
|||||||
@@ -9,10 +9,10 @@ import { Badge } from '@/components/ui/badge'
|
|||||||
import { useToast } from '@/components/ui/toast'
|
import { useToast } from '@/components/ui/toast'
|
||||||
import { apiClientV2 } from '@/services/apiV2'
|
import { apiClientV2 } from '@/services/apiV2'
|
||||||
import { Play, CheckCircle, FileText, AlertCircle, Clock, Activity, Loader2 } from 'lucide-react'
|
import { Play, CheckCircle, FileText, AlertCircle, Clock, Activity, Loader2 } from 'lucide-react'
|
||||||
import PPStructureParams from '@/components/PPStructureParams'
|
import LayoutModelSelector from '@/components/LayoutModelSelector'
|
||||||
import TaskNotFound from '@/components/TaskNotFound'
|
import TaskNotFound from '@/components/TaskNotFound'
|
||||||
import { useTaskValidation } from '@/hooks/useTaskValidation'
|
import { useTaskValidation } from '@/hooks/useTaskValidation'
|
||||||
import type { PPStructureV3Params, ProcessingOptions } from '@/types/apiV2'
|
import type { LayoutModel, ProcessingOptions } from '@/types/apiV2'
|
||||||
|
|
||||||
export default function ProcessingPage() {
|
export default function ProcessingPage() {
|
||||||
const { t } = useTranslation()
|
const { t } = useTranslation()
|
||||||
@@ -31,8 +31,8 @@ export default function ProcessingPage() {
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
// PP-StructureV3 parameters state
|
// Layout model state (default to 'chinese' for best Chinese document support)
|
||||||
const [ppStructureParams, setPpStructureParams] = useState<PPStructureV3Params>({})
|
const [layoutModel, setLayoutModel] = useState<LayoutModel>('chinese')
|
||||||
|
|
||||||
// Start OCR processing
|
// Start OCR processing
|
||||||
const processOCRMutation = useMutation({
|
const processOCRMutation = useMutation({
|
||||||
@@ -40,11 +40,7 @@ export default function ProcessingPage() {
|
|||||||
const options: ProcessingOptions = {
|
const options: ProcessingOptions = {
|
||||||
use_dual_track: true,
|
use_dual_track: true,
|
||||||
language: 'ch',
|
language: 'ch',
|
||||||
}
|
layout_model: layoutModel,
|
||||||
|
|
||||||
// Only include pp_structure_params if user has customized them
|
|
||||||
if (Object.keys(ppStructureParams).length > 0) {
|
|
||||||
options.pp_structure_params = ppStructureParams
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return apiClientV2.startTask(taskId!, options)
|
return apiClientV2.startTask(taskId!, options)
|
||||||
@@ -346,11 +342,11 @@ export default function ProcessingPage() {
|
|||||||
</Card>
|
</Card>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{/* PP-StructureV3 Parameters (only show when task is pending) */}
|
{/* Layout Model Selection (only show when task is pending) */}
|
||||||
{isPending && (
|
{isPending && (
|
||||||
<PPStructureParams
|
<LayoutModelSelector
|
||||||
value={ppStructureParams}
|
value={layoutModel}
|
||||||
onChange={setPpStructureParams}
|
onChange={setLayoutModel}
|
||||||
disabled={processOCRMutation.isPending}
|
disabled={processOCRMutation.isPending}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
|
|||||||
@@ -73,15 +73,14 @@ export interface DocumentAnalysisResponse {
|
|||||||
page_count: number | null
|
page_count: number | null
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface PPStructureV3Params {
|
/**
|
||||||
layout_detection_threshold?: number // 0-1: Lower=more blocks, Higher=high confidence only
|
* Layout detection model selection for OCR track.
|
||||||
layout_nms_threshold?: number // 0-1: Lower=aggressive overlap removal, Higher=allow more overlap
|
* Different models are optimized for different document types:
|
||||||
layout_merge_bboxes_mode?: 'union' | 'large' | 'small' // small=conservative, large=aggressive, union=middle
|
* - chinese: PP-DocLayout-S - Best for Chinese forms, contracts, invoices
|
||||||
layout_unclip_ratio?: number // >0: Larger=looser boxes, Smaller=tighter boxes
|
* - default: PubLayNet-based - Best for English academic papers
|
||||||
text_det_thresh?: number // 0-1: Lower=detect more small/low-contrast text, Higher=cleaner
|
* - cdla: Specialized for Chinese document layout analysis
|
||||||
text_det_box_thresh?: number // 0-1: Lower=more text boxes, Higher=fewer false positives
|
*/
|
||||||
text_det_unclip_ratio?: number // >0: Larger=looser text boxes, Smaller=tighter boxes
|
export type LayoutModel = 'chinese' | 'default' | 'cdla'
|
||||||
}
|
|
||||||
|
|
||||||
export interface ProcessingOptions {
|
export interface ProcessingOptions {
|
||||||
use_dual_track?: boolean
|
use_dual_track?: boolean
|
||||||
@@ -89,7 +88,7 @@ export interface ProcessingOptions {
|
|||||||
language?: string
|
language?: string
|
||||||
include_layout?: boolean
|
include_layout?: boolean
|
||||||
include_images?: boolean
|
include_images?: boolean
|
||||||
pp_structure_params?: PPStructureV3Params // Fine-tuning parameters for PP-StructureV3 (OCR track only)
|
layout_model?: LayoutModel // Layout detection model selection (OCR track only)
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TaskCreate {
|
export interface TaskCreate {
|
||||||
|
|||||||
@@ -0,0 +1,28 @@
|
|||||||
|
# Change: Fix OCR Track Table Empty Columns and Alignment
|
||||||
|
|
||||||
|
## Why
|
||||||
|
|
||||||
|
PP-Structure 生成的表格經常包含空白欄位(所有 row 該欄皆為空/空白),導致轉換後的 UnifiedDocument 表格出現空欄與欄位錯位。目前 OCR Track 直接使用原始資料,未進行清理,影響 PDF/JSON/Markdown 輸出品質。
|
||||||
|
|
||||||
|
## What Changes
|
||||||
|
|
||||||
|
- 新增 `trim_empty_columns()` 函數,清理 OCR Track 表格的空欄
|
||||||
|
- 在 `_convert_table_data` 入口調用清洗邏輯,確保 TableData 乾淨
|
||||||
|
- 處理 col_span 重算:若 span 跨過被移除欄位,縮小 span
|
||||||
|
- 更新 columns/cols 數值、調整各 cell 的 col 索引
|
||||||
|
- 可選:依 bbox x0 進行欄對齊排序
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
- Affected specs: `ocr-processing`
|
||||||
|
- Affected code:
|
||||||
|
- `backend/app/services/ocr_to_unified_converter.py` (主要修改)
|
||||||
|
- 不影響 Direct/HYBRID 路徑
|
||||||
|
- PDF/JSON/Markdown 輸出將更乾淨
|
||||||
|
|
||||||
|
## Constraints
|
||||||
|
|
||||||
|
- 保持表格 bbox、頁面座標不變
|
||||||
|
- 不修改 Direct/HYBRID 路徑
|
||||||
|
- 只移除「所有行皆空」的欄;若表頭空但數據有值,不應移除
|
||||||
|
- 保留原 bbox,避免 PDF 版面漂移
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
## ADDED Requirements
|
||||||
|
|
||||||
|
### Requirement: OCR Table Empty Column Cleanup
|
||||||
|
|
||||||
|
The OCR Track converter SHALL clean up PP-Structure generated tables by removing columns where all rows have empty or whitespace-only content.
|
||||||
|
|
||||||
|
The system SHALL:
|
||||||
|
1. Identify columns where every cell's content is empty or contains only whitespace (using `.strip()` to determine emptiness)
|
||||||
|
2. Remove identified empty columns from the table structure
|
||||||
|
3. Update the `columns`/`cols` value to reflect the new column count
|
||||||
|
4. Recalculate each cell's `col` index to maintain continuity
|
||||||
|
5. Adjust `col_span` values when spans cross removed columns (shrink span size)
|
||||||
|
6. Remove cells entirely when their complete span falls within removed columns
|
||||||
|
7. Preserve original bbox and page coordinates (no layout drift)
|
||||||
|
8. If `columns` is 0 or missing after cleanup, fill with the calculated column count
|
||||||
|
|
||||||
|
The cleanup SHALL NOT:
|
||||||
|
- Remove columns where the header is empty but data rows contain values
|
||||||
|
- Modify tables in Direct or HYBRID track
|
||||||
|
- Alter the original bbox coordinates
|
||||||
|
|
||||||
|
#### Scenario: All rows in column are empty
|
||||||
|
- **WHEN** a table has a column where all cells contain only empty or whitespace content
|
||||||
|
- **THEN** that column is removed
|
||||||
|
- **AND** remaining cells have their `col` indices decremented appropriately
|
||||||
|
- **AND** `cols` count is reduced by 1
|
||||||
|
|
||||||
|
#### Scenario: Column has empty header but data has values
|
||||||
|
- **WHEN** a table has a column where the header cell is empty
|
||||||
|
- **AND** at least one data row cell in that column contains non-whitespace content
|
||||||
|
- **THEN** that column is NOT removed
|
||||||
|
|
||||||
|
#### Scenario: Cell span crosses removed column
|
||||||
|
- **WHEN** a cell has `col_span > 1`
|
||||||
|
- **AND** one or more columns within the span are removed
|
||||||
|
- **THEN** the `col_span` is reduced by the number of removed columns within the span
|
||||||
|
|
||||||
|
#### Scenario: Cell span entirely within removed columns
|
||||||
|
- **WHEN** a cell's entire span falls within columns that are all removed
|
||||||
|
- **THEN** that cell is removed from the table
|
||||||
|
|
||||||
|
#### Scenario: Missing columns metadata
|
||||||
|
- **WHEN** the table dict has `columns` set to 0 or missing
|
||||||
|
- **AND** cleanup is performed
|
||||||
|
- **THEN** `columns` is set to the calculated number of remaining columns
|
||||||
|
|
||||||
|
### Requirement: OCR Table Column Alignment by Bbox
|
||||||
|
|
||||||
|
(Optional Enhancement) When bbox coordinates are available for table cells, the OCR Track converter SHALL use cell bbox x0 coordinates to improve column alignment accuracy.
|
||||||
|
|
||||||
|
The system SHALL:
|
||||||
|
1. Sort cells by bbox `x0` coordinate before assigning column indices
|
||||||
|
2. Reassign `col` indices based on spatial position rather than HTML order
|
||||||
|
|
||||||
|
This requirement is optional and implementation MAY be deferred if bbox data is not reliably available.
|
||||||
|
|
||||||
|
#### Scenario: Cells reordered by bbox position
|
||||||
|
- **WHEN** bbox coordinates are available for table cells
|
||||||
|
- **AND** the original HTML order does not match spatial order
|
||||||
|
- **THEN** cells are reordered by `x0` coordinate
|
||||||
|
- **AND** `col` indices are reassigned to reflect spatial positioning
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
# Tasks: Fix OCR Track Table Empty Columns
|
||||||
|
|
||||||
|
## 1. Core Implementation
|
||||||
|
|
||||||
|
- [x] 1.1 在 `ocr_to_unified_converter.py` 實作 `trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]`
|
||||||
|
- 依據 cells 陣列計算每一欄是否「所有 row 的內容皆為空/空白」
|
||||||
|
- 使用 `.strip()` 判斷空白字元
|
||||||
|
- [x] 1.2 實作欄位移除邏輯
|
||||||
|
- 更新 columns/cols 數值
|
||||||
|
- 調整各 cell 的 col 索引
|
||||||
|
- [x] 1.3 實作 col_span 重算邏輯
|
||||||
|
- 若 span 跨過被移除欄位,縮小 span
|
||||||
|
- 若整個 span 落在被刪欄位上,移除該 cell
|
||||||
|
- [x] 1.4 在 `_convert_table_data` 入口呼叫 `trim_empty_columns`
|
||||||
|
- 在建 TableData 之前執行清洗
|
||||||
|
- 同時也在 `_extract_table_data` (HTML 表格解析) 中加入清洗
|
||||||
|
- [ ] 1.5 (可選) 依 bbox x0/x1 進行欄對齊排序
|
||||||
|
- 若可取得 bbox 網格,先依 x0 排序再重排 col index
|
||||||
|
- 此功能延後實作,待 bbox 資料確認可用性後進行
|
||||||
|
|
||||||
|
## 2. Testing & Validation
|
||||||
|
|
||||||
|
- [x] 2.1 單元測試通過
|
||||||
|
- 測試基本空欄移除
|
||||||
|
- 測試表頭空但數據有值(不移除)
|
||||||
|
- 測試 col_span 跨越被移除欄位(縮小 span)
|
||||||
|
- 測試 cell 完全落在被移除欄位(移除 cell)
|
||||||
|
- 測試無空欄情況(不變更)
|
||||||
|
- [x] 2.2 檢查現有 OCR 結果
|
||||||
|
- 現有結果中無「整欄為空」的表格
|
||||||
|
- 實作已就緒,遇到空欄時會正確清理
|
||||||
|
- [x] 2.3 確認 Direct/HYBRID 表格不變
|
||||||
|
- `OCRToUnifiedConverter` 僅在 `ocr_service.py` 中使用
|
||||||
|
- Direct 軌使用 `DirectExtractionEngine`,不受影響
|
||||||
|
|
||||||
|
## 3. Edge Cases & Validation
|
||||||
|
|
||||||
|
- [x] 3.1 處理 columns 欄位為 0/缺失的情況
|
||||||
|
- 以計算後的欄數回填,避免 downstream 依賴出錯
|
||||||
|
- [x] 3.2 處理表頭為空但數據有值的情況
|
||||||
|
- 只移除「所有行皆空」的欄
|
||||||
|
- [x] 3.3 確保不直接修改 `backend/storage/results/...`
|
||||||
|
- 修改 converter,需重新跑任務驗證
|
||||||
@@ -0,0 +1,183 @@
|
|||||||
|
# Design: OCR Track Gap Filling
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
PP-StructureV3 版面分析模型在處理某些掃描文件時會嚴重漏檢。實測顯示 Raw PaddleOCR 能偵測 56 個文字區域,但 PP-StructureV3 僅輸出 9 個元素(遺失 84%)。
|
||||||
|
|
||||||
|
問題發生在 PP-StructureV3 內部的 Layout Detection Model,這是 PaddleOCR 函式庫的限制,無法從外部修復。但 Raw OCR 的 `text_regions` 資料仍然完整可用。
|
||||||
|
|
||||||
|
### Stakeholders
|
||||||
|
- **End users**: 需要完整的 OCR 輸出,不能有大量文字遺失
|
||||||
|
- **OCR track**: 需要整合 Raw OCR 與 PP-StructureV3 結果
|
||||||
|
- **Direct/Hybrid track**: 不應受此變更影響
|
||||||
|
|
||||||
|
## Goals / Non-Goals
|
||||||
|
|
||||||
|
### Goals
|
||||||
|
- 偵測 PP-StructureV3 漏檢區域並以 Raw OCR 結果補回
|
||||||
|
- 確保補回的文字不會與現有元素重複
|
||||||
|
- 維持正確的閱讀順序
|
||||||
|
- 僅影響 OCR track,不改變其他 track 的行為
|
||||||
|
|
||||||
|
### Non-Goals
|
||||||
|
- 不修改 PP-StructureV3 或 PaddleOCR 內部邏輯
|
||||||
|
- 不處理圖片/表格/圖表等非文字元素的補漏
|
||||||
|
- 不實作複雜的版面分析(僅做 gap filling)
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
### Decision 1: 覆蓋判定策略
|
||||||
|
**選擇**: 優先使用「中心點落入」判定,輔以 IoU 閾值
|
||||||
|
|
||||||
|
**理由**:
|
||||||
|
- 中心點判定計算簡單,效能好
|
||||||
|
- IoU 閾值作為補充,處理邊界情況
|
||||||
|
- 建議 IoU 閾值 0.1~0.2,避免低 IoU 被誤判為未覆蓋
|
||||||
|
|
||||||
|
**替代方案**:
|
||||||
|
- 純 IoU 判定:計算量較大,且對部分重疊的處理較複雜
|
||||||
|
- 面積比例判定:對不同大小的區域不夠公平
|
||||||
|
|
||||||
|
### Decision 2: 補漏觸發條件
|
||||||
|
**選擇**: 當 PP-Structure 覆蓋率 < 70% 或元素數顯著低於 Raw OCR
|
||||||
|
|
||||||
|
**理由**:
|
||||||
|
- 避免正常文件出現重複文字
|
||||||
|
- 70% 閾值經驗值,可透過設定調整
|
||||||
|
- 元素數比較作為快速判斷條件
|
||||||
|
|
||||||
|
### Decision 3: 補漏元素類型
|
||||||
|
**選擇**: 僅補 TEXT 類型,跳過 TABLE/IMAGE/FIGURE/FLOWCHART/HEADER/FOOTER
|
||||||
|
|
||||||
|
**理由**:
|
||||||
|
- PP-StructureV3 對結構化元素(表格、圖片)的識別通常較準確
|
||||||
|
- 補回原始 OCR 文字可能破壞表格結構
|
||||||
|
- 這些元素需要保持結構完整性
|
||||||
|
|
||||||
|
### Decision 4: 重複判定與去重
|
||||||
|
**選擇**: IoU > 0.5 的 Raw OCR 區域視為與 PP-Structure TEXT 重複,跳過
|
||||||
|
|
||||||
|
**理由**:
|
||||||
|
- 0.5 是常見的重疊閾值
|
||||||
|
- 避免同一文字出現兩次
|
||||||
|
- 對細碎的 Raw OCR 框可考慮輕量合併
|
||||||
|
|
||||||
|
### Decision 5: 座標對齊
|
||||||
|
**選擇**: 使用 `ocr_dimensions` 進行 bbox 換算
|
||||||
|
|
||||||
|
**理由**:
|
||||||
|
- OCR 可能有 resize 處理
|
||||||
|
- 確保 Raw OCR 與 PP-Structure 的座標在同一空間
|
||||||
|
- 避免因尺寸不一致導致覆蓋誤判
|
||||||
|
|
||||||
|
## Data Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐ ┌──────────────────────┐
|
||||||
|
│ Raw OCR Result │ │ PP-StructureV3 Result│
|
||||||
|
│ (56 regions) │ │ (9 elements) │
|
||||||
|
└────────┬────────┘ └──────────┬───────────┘
|
||||||
|
│ │
|
||||||
|
└────────────┬────────────┘
|
||||||
|
│
|
||||||
|
┌───────▼───────┐
|
||||||
|
│ GapFillingService │
|
||||||
|
│ 1. Calculate coverage
|
||||||
|
│ 2. Find uncovered regions
|
||||||
|
│ 3. Filter by confidence
|
||||||
|
│ 4. Deduplicate
|
||||||
|
│ 5. Merge if needed
|
||||||
|
└───────┬───────┘
|
||||||
|
│
|
||||||
|
┌───────▼───────┐
|
||||||
|
│ OCRToUnifiedConverter │
|
||||||
|
│ - Combine elements
|
||||||
|
│ - Recalculate reading order
|
||||||
|
└───────┬───────┘
|
||||||
|
│
|
||||||
|
┌───────▼───────┐
|
||||||
|
│ UnifiedDocument │
|
||||||
|
│ (complete content)
|
||||||
|
└───────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Algorithm: Gap Detection
|
||||||
|
|
||||||
|
```python
|
||||||
|
def find_uncovered_regions(
|
||||||
|
raw_ocr_regions: List[TextRegion],
|
||||||
|
pp_structure_elements: List[Element],
|
||||||
|
iou_threshold: float = 0.15
|
||||||
|
) -> List[TextRegion]:
|
||||||
|
"""
|
||||||
|
Find Raw OCR regions not covered by PP-Structure elements.
|
||||||
|
|
||||||
|
Coverage criteria (either one):
|
||||||
|
1. Center point of raw region falls inside any PP-Structure bbox
|
||||||
|
2. IoU with any PP-Structure bbox > iou_threshold
|
||||||
|
"""
|
||||||
|
uncovered = []
|
||||||
|
|
||||||
|
# Filter PP-Structure elements: only consider TEXT, skip TABLE/IMAGE/etc.
|
||||||
|
text_elements = [e for e in pp_structure_elements
|
||||||
|
if e.type not in SKIP_TYPES]
|
||||||
|
|
||||||
|
for region in raw_ocr_regions:
|
||||||
|
center = get_center(region.bbox)
|
||||||
|
is_covered = False
|
||||||
|
|
||||||
|
for element in text_elements:
|
||||||
|
# Check center point
|
||||||
|
if point_in_bbox(center, element.bbox):
|
||||||
|
is_covered = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# Check IoU
|
||||||
|
if calculate_iou(region.bbox, element.bbox) > iou_threshold:
|
||||||
|
is_covered = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not is_covered:
|
||||||
|
uncovered.append(region)
|
||||||
|
|
||||||
|
return uncovered
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration Parameters
|
||||||
|
|
||||||
|
| Parameter | Type | Default | Description |
|
||||||
|
|-----------|------|---------|-------------|
|
||||||
|
| `gap_filling_enabled` | bool | True | 是否啟用 gap filling |
|
||||||
|
| `gap_filling_coverage_threshold` | float | 0.7 | 覆蓋率低於此值時啟用 |
|
||||||
|
| `gap_filling_iou_threshold` | float | 0.15 | 覆蓋判定 IoU 閾值 |
|
||||||
|
| `gap_filling_confidence_threshold` | float | 0.3 | Raw OCR 信心度門檻 |
|
||||||
|
| `gap_filling_dedup_iou_threshold` | float | 0.5 | 去重 IoU 閾值 |
|
||||||
|
|
||||||
|
## Risks / Trade-offs
|
||||||
|
|
||||||
|
### Risk 1: 補漏造成文字重複
|
||||||
|
**Mitigation**: 設定 dedup_iou_threshold,對高重疊區域進行去重
|
||||||
|
|
||||||
|
### Risk 2: 閱讀順序錯亂
|
||||||
|
**Mitigation**: 補回元素後重新計算整頁的 reading_order(依 y0, x0 排序)
|
||||||
|
|
||||||
|
### Risk 3: 效能影響
|
||||||
|
**Mitigation**:
|
||||||
|
- 先做快速的覆蓋率檢查,若 ≥ 70% 則跳過 gap filling
|
||||||
|
- 使用 R-tree 或 interval tree 加速 bbox 查詢(若效能成為瓶頸)
|
||||||
|
|
||||||
|
### Risk 4: 座標不對齊
|
||||||
|
**Mitigation**: 使用 `ocr_dimensions` 確保座標空間一致
|
||||||
|
|
||||||
|
## Migration Plan
|
||||||
|
|
||||||
|
1. 新增功能為可選(預設啟用)
|
||||||
|
2. 可透過設定關閉 gap filling
|
||||||
|
3. 不影響現有 API 介面
|
||||||
|
4. 向後相容:不傳參數時使用預設行為
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
1. 是否需要 UI 開關讓使用者選擇啟用/停用 gap filling?
|
||||||
|
2. 對於細碎的 Raw OCR 框,是否需要實作合併邏輯?(同行、相鄰且間距很小)
|
||||||
|
3. 是否需要在輸出中標記哪些元素是補漏來的?(debug 用途)
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# Change: Add OCR Track Gap Filling with Raw OCR Text Regions
|
||||||
|
|
||||||
|
## Why
|
||||||
|
|
||||||
|
PP-StructureV3 的版面分析模型在處理某些掃描文件時會嚴重漏檢,導致大量文字內容遺失。實測 scan.pdf 顯示:
|
||||||
|
- Raw PaddleOCR 文字識別:偵測到 **56 個文字區域**
|
||||||
|
- PP-StructureV3 版面分析:僅輸出 **9 個元素**
|
||||||
|
- 遺失比例:約 **84%** 的內容未被 PP-StructureV3 識別
|
||||||
|
|
||||||
|
問題根源在於 PP-StructureV3 內部的 Layout Detection Model 對掃描文件類型支援不足,而非我們的程式碼問題。Raw OCR 能正確偵測所有文字區域,但這些資訊在 PP-StructureV3 的結構化處理過程中被遺失。
|
||||||
|
|
||||||
|
## What Changes
|
||||||
|
|
||||||
|
實作「混合式處理」(Hybrid Approach):使用 Raw OCR 的文字區域來補充 PP-StructureV3 遺失的內容。
|
||||||
|
|
||||||
|
- **新增** `GapFillingService` 類別,負責偵測並補回 PP-StructureV3 遺漏的文字區域
|
||||||
|
- **新增** 覆蓋率計算邏輯(中心點落入或 IoU 閾值判斷)
|
||||||
|
- **新增** 自動啟用條件:當 PP-Structure 覆蓋率 < 70% 或元素數顯著低於 Raw OCR 框數
|
||||||
|
- **修改** `OCRToUnifiedConverter` 整合 gap filling 邏輯
|
||||||
|
- **新增** 重新計算 reading_order 邏輯(依 y0, x0 排序)
|
||||||
|
- **新增** 測試案例:PP-Structure 嚴重漏檢案例、無漏檢正常文件驗證
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
- **Affected specs**: `ocr-processing`
|
||||||
|
- **Affected code**:
|
||||||
|
- `backend/app/services/ocr_to_unified_converter.py` - 整合 gap filling
|
||||||
|
- `backend/app/services/gap_filling_service.py` - 新增 (核心邏輯)
|
||||||
|
- `backend/tests/test_gap_filling.py` - 新增 (測試)
|
||||||
|
- **Track isolation**: 僅作用於 OCR track;Direct/Hybrid track 不受影響
|
||||||
@@ -0,0 +1,111 @@
|
|||||||
|
## ADDED Requirements
|
||||||
|
|
||||||
|
### Requirement: OCR Track Gap Filling with Raw OCR Regions
|
||||||
|
|
||||||
|
The system SHALL detect and fill gaps in PP-StructureV3 output by supplementing with Raw OCR text regions when significant content loss is detected.
|
||||||
|
|
||||||
|
#### Scenario: Gap filling activates when coverage is low
|
||||||
|
- **GIVEN** an OCR track processing task
|
||||||
|
- **WHEN** PP-StructureV3 outputs elements that cover less than 70% of Raw OCR text regions
|
||||||
|
- **THEN** the system SHALL activate gap filling
|
||||||
|
- **AND** identify Raw OCR regions not covered by any PP-StructureV3 element
|
||||||
|
- **AND** supplement these regions as TEXT elements in the output
|
||||||
|
|
||||||
|
#### Scenario: Coverage is determined by center-point and IoU
|
||||||
|
- **GIVEN** a Raw OCR text region with bounding box
|
||||||
|
- **WHEN** checking if the region is covered by PP-StructureV3
|
||||||
|
- **THEN** the region SHALL be considered covered if its center point falls inside any PP-StructureV3 element bbox
|
||||||
|
- **OR** if its IoU with any PP-StructureV3 element exceeds the 0.15 threshold
|
||||||
|
- **AND** regions not meeting either criterion SHALL be marked as uncovered
|
||||||
|
|
||||||
|
#### Scenario: Only TEXT elements are supplemented
|
||||||
|
- **GIVEN** uncovered Raw OCR regions identified for supplementation
|
||||||
|
- **WHEN** PP-StructureV3 has detected TABLE, IMAGE, FIGURE, FLOWCHART, HEADER, or FOOTER elements
|
||||||
|
- **THEN** the system SHALL NOT supplement regions that overlap with these structural elements
|
||||||
|
- **AND** only supplement regions as TEXT type to preserve structural integrity
|
||||||
|
|
||||||
|
#### Scenario: Supplemented regions meet confidence threshold
|
||||||
|
- **GIVEN** Raw OCR regions to be supplemented
|
||||||
|
- **WHEN** a region has confidence score below 0.3
|
||||||
|
- **THEN** the system SHALL skip that region
|
||||||
|
- **AND** only supplement regions with confidence >= 0.3
|
||||||
|
|
||||||
|
#### Scenario: Deduplication prevents repeated text
|
||||||
|
- **GIVEN** a Raw OCR region being considered for supplementation
|
||||||
|
- **WHEN** the region has IoU > 0.5 with any existing PP-StructureV3 TEXT element
|
||||||
|
- **THEN** the system SHALL skip that region to prevent duplicate text
|
||||||
|
- **AND** the original PP-StructureV3 element SHALL be preserved
|
||||||
|
|
||||||
|
#### Scenario: Reading order is recalculated after gap filling
|
||||||
|
- **GIVEN** supplemented elements have been added to the page
|
||||||
|
- **WHEN** assembling the final element list
|
||||||
|
- **THEN** the system SHALL recalculate reading order for the entire page
|
||||||
|
- **AND** sort elements by y0 coordinate (top to bottom) then x0 (left to right)
|
||||||
|
- **AND** ensure logical document flow is maintained
|
||||||
|
|
||||||
|
#### Scenario: Coordinate alignment with ocr_dimensions
|
||||||
|
- **GIVEN** Raw OCR processing may involve image resizing
|
||||||
|
- **WHEN** comparing Raw OCR bbox with PP-StructureV3 bbox
|
||||||
|
- **THEN** the system SHALL use ocr_dimensions to normalize coordinates
|
||||||
|
- **AND** ensure both sources reference the same coordinate space
|
||||||
|
- **AND** prevent coverage misdetection due to scale differences
|
||||||
|
|
||||||
|
#### Scenario: Supplemented elements have complete metadata
|
||||||
|
- **GIVEN** a Raw OCR region being added as supplemented element
|
||||||
|
- **WHEN** creating the DocumentElement
|
||||||
|
- **THEN** the element SHALL include page_number
|
||||||
|
- **AND** include confidence score from Raw OCR
|
||||||
|
- **AND** include original bbox coordinates
|
||||||
|
- **AND** optionally include source indicator for debugging
|
||||||
|
|
||||||
|
### Requirement: Gap Filling Track Isolation
|
||||||
|
|
||||||
|
The gap filling feature SHALL only apply to OCR track processing and SHALL NOT affect Direct or Hybrid track outputs.
|
||||||
|
|
||||||
|
#### Scenario: Gap filling only activates for OCR track
|
||||||
|
- **GIVEN** a document processing task
|
||||||
|
- **WHEN** the processing track is OCR
|
||||||
|
- **THEN** the system SHALL evaluate and apply gap filling as needed
|
||||||
|
- **AND** produce enhanced output with supplemented content
|
||||||
|
|
||||||
|
#### Scenario: Direct track is unaffected
|
||||||
|
- **GIVEN** a document processing task with Direct track
|
||||||
|
- **WHEN** the task is processed
|
||||||
|
- **THEN** the system SHALL NOT invoke any gap filling logic
|
||||||
|
- **AND** produce output identical to current Direct track behavior
|
||||||
|
|
||||||
|
#### Scenario: Hybrid track is unaffected
|
||||||
|
- **GIVEN** a document processing task with Hybrid track
|
||||||
|
- **WHEN** the task is processed
|
||||||
|
- **THEN** the system SHALL NOT invoke gap filling logic
|
||||||
|
- **AND** use existing Hybrid track processing pipeline
|
||||||
|
|
||||||
|
### Requirement: Gap Filling Configuration
|
||||||
|
|
||||||
|
The system SHALL provide configurable parameters for gap filling behavior.
|
||||||
|
|
||||||
|
#### Scenario: Gap filling can be disabled via configuration
|
||||||
|
- **GIVEN** gap_filling_enabled is set to false in configuration
|
||||||
|
- **WHEN** OCR track processing runs
|
||||||
|
- **THEN** the system SHALL skip all gap filling logic
|
||||||
|
- **AND** output only PP-StructureV3 results as before
|
||||||
|
|
||||||
|
#### Scenario: Coverage threshold is configurable
|
||||||
|
- **GIVEN** gap_filling_coverage_threshold is set to 0.8
|
||||||
|
- **WHEN** PP-StructureV3 coverage is 75%
|
||||||
|
- **THEN** the system SHALL activate gap filling
|
||||||
|
- **AND** supplement uncovered regions
|
||||||
|
|
||||||
|
#### Scenario: IoU thresholds are configurable
|
||||||
|
- **GIVEN** custom IoU thresholds configured:
|
||||||
|
- gap_filling_iou_threshold: 0.2
|
||||||
|
- gap_filling_dedup_iou_threshold: 0.6
|
||||||
|
- **WHEN** evaluating coverage and deduplication
|
||||||
|
- **THEN** the system SHALL use the configured values
|
||||||
|
- **AND** apply them consistently throughout gap filling process
|
||||||
|
|
||||||
|
#### Scenario: Confidence threshold is configurable
|
||||||
|
- **GIVEN** gap_filling_confidence_threshold is set to 0.5
|
||||||
|
- **WHEN** supplementing Raw OCR regions
|
||||||
|
- **THEN** the system SHALL only include regions with confidence >= 0.5
|
||||||
|
- **AND** filter out lower confidence regions
|
||||||
@@ -0,0 +1,44 @@
|
|||||||
|
# Tasks: Add OCR Track Gap Filling
|
||||||
|
|
||||||
|
## 1. Core Implementation
|
||||||
|
|
||||||
|
- [x] 1.1 Create `gap_filling_service.py` with `GapFillingService` class
|
||||||
|
- [x] 1.2 Implement bbox coverage calculation (center-point and IoU methods)
|
||||||
|
- [x] 1.3 Implement gap detection logic (find uncovered raw OCR regions)
|
||||||
|
- [x] 1.4 Implement confidence threshold filtering for supplemented regions
|
||||||
|
- [x] 1.5 Implement element type filtering (only supplement TEXT, skip TABLE/IMAGE/FIGURE/etc.)
|
||||||
|
- [x] 1.6 Implement reading order recalculation (sort by y0, x0)
|
||||||
|
- [x] 1.7 Implement deduplication logic (skip high IoU overlaps with PP-Structure TEXT)
|
||||||
|
- [x] 1.8 Implement optional text merging for fragmented adjacent regions
|
||||||
|
|
||||||
|
## 2. Integration
|
||||||
|
|
||||||
|
- [x] 2.1 Modify `OCRToUnifiedConverter` to accept raw OCR text_regions
|
||||||
|
- [x] 2.2 Add gap filling activation condition check (coverage < 70% or element count disparity)
|
||||||
|
- [x] 2.3 Ensure coordinate alignment between raw OCR and PP-Structure (ocr_dimensions handling)
|
||||||
|
- [x] 2.4 Add page metadata (page_number, confidence, bbox) to supplemented elements
|
||||||
|
- [x] 2.5 Ensure track isolation (only OCR track, not Direct/Hybrid)
|
||||||
|
|
||||||
|
## 3. Configuration
|
||||||
|
|
||||||
|
- [x] 3.1 Add configurable parameters to settings:
|
||||||
|
- `gap_filling_enabled`: bool (default: True)
|
||||||
|
- `gap_filling_coverage_threshold`: float (default: 0.7)
|
||||||
|
- `gap_filling_iou_threshold`: float (default: 0.15)
|
||||||
|
- `gap_filling_confidence_threshold`: float (default: 0.3)
|
||||||
|
- `gap_filling_dedup_iou_threshold`: float (default: 0.5)
|
||||||
|
|
||||||
|
## 4. Testing (with env)
|
||||||
|
|
||||||
|
- [x] 4.1 Create test fixtures for a severe PP-Structure missed-detection case (using scan.pdf / scan2.pdf)
|
||||||
|
- [x] 4.2 Test gap detection correctly identifies uncovered regions
|
||||||
|
- [x] 4.3 Test supplemented elements have correct metadata
|
||||||
|
- [x] 4.4 Test reading order is correctly recalculated
|
||||||
|
- [x] 4.5 Test deduplication prevents duplicate text
|
||||||
|
- [x] 4.6 Test that a normal document without missed detections has no duplicated or inflated content
|
||||||
|
- [x] 4.7 Test track isolation (Direct track unaffected)
|
||||||
|
|
||||||
|
## 5. Documentation
|
||||||
|
|
||||||
|
- [x] 5.1 Add inline documentation to GapFillingService
|
||||||
|
- [x] 5.2 Update configuration documentation with new settings
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
# Change: Simplify PP-StructureV3 Configuration with Layout Model Selection
|
||||||
|
|
||||||
|
## Why
|
||||||
|
|
||||||
|
The current PP-StructureV3 parameter adjustment UI exposes 7 technical ML parameters (thresholds, ratios, merge modes) that are difficult for end users to understand. Meanwhile, switching to a different layout detection model (e.g., CDLA-trained models for Chinese documents) would have a much greater impact on OCR quality than fine-tuning these parameters.
|
||||||
|
|
||||||
|
**Problems with current approach:**
|
||||||
|
- Users don't understand what `layout_detection_threshold` or `text_det_unclip_ratio` mean
|
||||||
|
- Wrong parameter values can make OCR results worse
|
||||||
|
- The default model (PubLayNet-based) is optimized for English academic papers, not Chinese business documents
|
||||||
|
- Model selection is far more impactful than parameter tuning
|
||||||
|
|
||||||
|
## What Changes
|
||||||
|
|
||||||
|
### Backend Changes
|
||||||
|
- **REMOVED**: API parameter `pp_structure_params` from task start endpoint
|
||||||
|
- **ADDED**: New API parameter `layout_model` with predefined options:
|
||||||
|
- `"default"` - Standard model (PubLayNet-based, for English documents)
|
||||||
|
- `"chinese"` - PP-DocLayout-S model (for Chinese documents, forms, contracts)
|
||||||
|
- `"cdla"` - CDLA model (alternative Chinese document layout model)
|
||||||
|
- **MODIFIED**: PP-StructureV3 initialization uses `layout_detection_model_name` based on selection
|
||||||
|
- Keep fine-tuning parameters in backend `config.py` with optimized defaults
|
||||||
|
|
||||||
|
### Frontend Changes
|
||||||
|
- **REMOVED**: `PPStructureParams.tsx` component (slider/dropdown UI for 7 parameters)
|
||||||
|
- **ADDED**: Simple radio button/dropdown for layout model selection with clear descriptions
|
||||||
|
- **MODIFIED**: Task start request body to send `layout_model` instead of `pp_structure_params`
|
||||||
|
|
||||||
|
### API Changes
|
||||||
|
- **BREAKING**: Remove `pp_structure_params` from `POST /api/v2/tasks/{task_id}/start`
|
||||||
|
- **ADDED**: New optional parameter `layout_model: "default" | "chinese" | "cdla"`
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
- Affected specs: `ocr-processing`
|
||||||
|
- Affected code:
|
||||||
|
- Backend: `app/routers/tasks.py`, `app/services/ocr_service.py`, `app/core/config.py`
|
||||||
|
- Frontend: `src/components/PPStructureParams.tsx` (remove), `src/types/apiV2.ts`, task start form
|
||||||
|
- Breaking change: Clients using `pp_structure_params` will need to migrate to `layout_model`
|
||||||
|
- User impact: Simpler UI, better default OCR quality for Chinese documents
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
# ocr-processing Specification Delta
|
||||||
|
|
||||||
|
## REMOVED Requirements
|
||||||
|
|
||||||
|
### Requirement: Frontend-Adjustable PP-StructureV3 Parameters
|
||||||
|
**Reason**: Complex ML parameters are difficult for end users to understand and tune. Model selection provides better UX and more significant quality improvements.
|
||||||
|
**Migration**: Replace `pp_structure_params` API parameter with `layout_model` parameter.
|
||||||
|
|
||||||
|
### Requirement: PP-StructureV3 Parameter UI Controls
|
||||||
|
**Reason**: Slider/dropdown UI for 7 technical parameters adds complexity without proportional benefit. Simple model selection is more user-friendly.
|
||||||
|
**Migration**: Remove `PPStructureParams.tsx` component, add `LayoutModelSelector.tsx` component.
|
||||||
|
|
||||||
|
## ADDED Requirements
|
||||||
|
|
||||||
|
### Requirement: Layout Model Selection
|
||||||
|
The system SHALL allow users to select a layout detection model optimized for their document type, providing a simple choice between pre-configured models instead of manual parameter tuning.
|
||||||
|
|
||||||
|
#### Scenario: User selects Chinese document model
|
||||||
|
- **GIVEN** a user is processing Chinese business documents (forms, contracts, invoices)
|
||||||
|
- **WHEN** the user selects "Chinese Document Model" (PP-DocLayout-S)
|
||||||
|
- **THEN** the OCR engine SHALL use the PP-DocLayout-S layout detection model
|
||||||
|
- **AND** the model SHALL be optimized for 23 Chinese document element types
|
||||||
|
- **AND** table and form detection accuracy SHALL be improved over the default model
|
||||||
|
|
||||||
|
#### Scenario: User selects standard model for English documents
|
||||||
|
- **GIVEN** a user is processing English academic papers or reports
|
||||||
|
- **WHEN** the user selects "Standard Model" (PubLayNet-based)
|
||||||
|
- **THEN** the OCR engine SHALL use the default PubLayNet-based layout detection model
|
||||||
|
- **AND** the model SHALL be optimized for English document layouts
|
||||||
|
|
||||||
|
#### Scenario: User selects CDLA model for specialized Chinese layout
|
||||||
|
- **GIVEN** a user is processing Chinese documents with complex layouts
|
||||||
|
- **WHEN** the user selects "CDLA Model"
|
||||||
|
- **THEN** the OCR engine SHALL use the picodet_lcnet_x1_0_fgd_layout_cdla model
|
||||||
|
- **AND** the model SHALL provide specialized Chinese document layout analysis
|
||||||
|
|
||||||
|
#### Scenario: Layout model is sent via API request
|
||||||
|
- **GIVEN** a frontend application with model selection UI
|
||||||
|
- **WHEN** the user starts task processing with a selected model
|
||||||
|
- **THEN** the frontend SHALL send the model choice in the request body:
|
||||||
|
```json
|
||||||
|
POST /api/v2/tasks/{task_id}/start
|
||||||
|
{
|
||||||
|
"use_dual_track": true,
|
||||||
|
"force_track": "ocr",
|
||||||
|
"language": "ch",
|
||||||
|
"layout_model": "chinese"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
- **AND** the backend SHALL configure PP-StructureV3 with the corresponding model
|
||||||
|
|
||||||
|
#### Scenario: Default model when not specified
|
||||||
|
- **GIVEN** an API request without `layout_model` parameter
|
||||||
|
- **WHEN** the task is started
|
||||||
|
- **THEN** the system SHALL use "chinese" (PP-DocLayout-S) as the default model
|
||||||
|
- **AND** processing SHALL work correctly without requiring model selection
|
||||||
|
|
||||||
|
#### Scenario: Invalid model name is rejected
|
||||||
|
- **GIVEN** a request with an invalid `layout_model` value
|
||||||
|
- **WHEN** the user sends `layout_model: "invalid_model"`
|
||||||
|
- **THEN** the API SHALL return 422 Validation Error
|
||||||
|
- **AND** provide a clear error message listing valid model options
|
||||||
|
|
||||||
|
### Requirement: Layout Model Selection UI
|
||||||
|
The frontend SHALL provide a simple, user-friendly interface for selecting layout detection models with clear descriptions of each option.
|
||||||
|
|
||||||
|
#### Scenario: Model options are displayed with descriptions
|
||||||
|
- **GIVEN** the model selection UI is displayed
|
||||||
|
- **WHEN** the user views the available options
|
||||||
|
- **THEN** the UI SHALL show the following options:
|
||||||
|
- "Chinese Document Model (Recommended)" - for Chinese forms, contracts, invoices
|
||||||
|
- "Standard Model" - for English academic papers, reports
|
||||||
|
- "CDLA Model" - for specialized Chinese layout analysis
|
||||||
|
- **AND** each option SHALL have a brief description of its use case
|
||||||
|
|
||||||
|
#### Scenario: Chinese model is selected by default
|
||||||
|
- **GIVEN** the user opens the task processing interface
|
||||||
|
- **WHEN** the model selection is displayed
|
||||||
|
- **THEN** "Chinese Document Model" SHALL be pre-selected as the default
|
||||||
|
- **AND** the user MAY change the selection before starting processing
|
||||||
|
|
||||||
|
#### Scenario: Model selection is visible only for OCR track
|
||||||
|
- **GIVEN** a document processing interface
|
||||||
|
- **WHEN** the user selects processing track
|
||||||
|
- **THEN** layout model selection SHALL be shown ONLY when OCR track is selected or auto-detected
|
||||||
|
- **AND** SHALL be hidden for Direct track (which does not use PP-StructureV3)
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
# Implementation Tasks
|
||||||
|
|
||||||
|
## 1. Backend API Changes
|
||||||
|
|
||||||
|
- [x] 1.1 Update `app/schemas/task.py` to add `layout_model` enum type
|
||||||
|
- [x] 1.2 Update `app/routers/tasks.py` to replace `pp_structure_params` with `layout_model` parameter
|
||||||
|
- [x] 1.3 Update `app/services/ocr_service.py` to map `layout_model` to `layout_detection_model_name`
|
||||||
|
- [x] 1.4 Remove custom PP-Structure engine creation logic (use model selection instead)
|
||||||
|
- [x] 1.5 Add backward compatibility: default to "chinese" if no model specified
|
||||||
|
|
||||||
|
## 2. Backend Configuration
|
||||||
|
|
||||||
|
- [x] 2.1 Keep `layout_detection_model_name` in `config.py` as fallback default
|
||||||
|
- [x] 2.2 Keep fine-tuning parameters in `config.py` (not exposed to API)
|
||||||
|
- [x] 2.3 Document available layout models in config comments
|
||||||
|
|
||||||
|
## 3. Frontend Changes
|
||||||
|
|
||||||
|
- [x] 3.1 Remove `PPStructureParams.tsx` component
|
||||||
|
- [x] 3.2 Update `src/types/apiV2.ts`:
|
||||||
|
- Remove `PPStructureV3Params` interface
|
||||||
|
- Add `LayoutModel` type: `"default" | "chinese" | "cdla"`
|
||||||
|
- Update `ProcessingOptions` to use `layout_model` instead of `pp_structure_params`
|
||||||
|
- [x] 3.3 Create `LayoutModelSelector.tsx` component with:
|
||||||
|
- Radio buttons or dropdown for model selection
|
||||||
|
- Clear descriptions for each model option
|
||||||
|
- Default selection: "chinese"
|
||||||
|
- [x] 3.4 Update task start form to use new `LayoutModelSelector`
|
||||||
|
- [x] 3.5 Update API calls to send `layout_model` instead of `pp_structure_params`
|
||||||
|
|
||||||
|
## 4. Internationalization
|
||||||
|
|
||||||
|
- [x] 4.1 Add i18n strings for layout model options:
|
||||||
|
- `layoutModel.default`: "Standard Model (English documents)"
|
||||||
|
- `layoutModel.chinese`: "Chinese Document Model (Recommended)"
|
||||||
|
- `layoutModel.cdla`: "CDLA Model (Chinese layout analysis)"
|
||||||
|
- [x] 4.2 Add i18n strings for model descriptions
|
||||||
|
|
||||||
|
## 5. Testing
|
||||||
|
|
||||||
|
- [x] 5.1 Create new tests for `layout_model` parameter (`test_layout_model_api.py`, `test_layout_model.py`)
|
||||||
|
- [x] 5.2 Archive tests for `pp_structure_params` validation (moved to `tests/archived/`)
|
||||||
|
- [x] 5.3 Add tests for layout model selection (19 tests passing)
|
||||||
|
- [x] 5.4 Test backward compatibility (no model specified → use chinese default)
|
||||||
|
|
||||||
|
## 6. Documentation
|
||||||
|
|
||||||
|
- [ ] 6.1 Update API documentation for task start endpoint
|
||||||
|
- [ ] 6.2 Remove PP-Structure parameter documentation
|
||||||
|
- [ ] 6.3 Add layout model selection documentation
|
||||||
|
|
||||||
|
## 7. Cleanup
|
||||||
|
|
||||||
|
- [x] 7.1 Remove localStorage keys for PP-Structure params (`pp_structure_params_presets`, `pp_structure_params_last_used`)
|
||||||
|
- [x] 7.2 Remove any unused imports/types related to PP-Structure params
|
||||||
|
- [x] 7.3 Archive old PP-Structure params test files
|
||||||
@@ -3,100 +3,186 @@
|
|||||||
## Purpose
|
## Purpose
|
||||||
TBD - created by archiving change frontend-adjustable-ppstructure-params. Update Purpose after archive.
|
TBD - created by archiving change frontend-adjustable-ppstructure-params. Update Purpose after archive.
|
||||||
## Requirements
|
## Requirements
|
||||||
### Requirement: Frontend-Adjustable PP-StructureV3 Parameters
|
### Requirement: OCR Track Gap Filling with Raw OCR Regions
|
||||||
The system SHALL allow frontend users to dynamically adjust PP-StructureV3 OCR parameters for fine-tuning document processing without backend configuration changes.
|
|
||||||
|
|
||||||
#### Scenario: User adjusts layout detection threshold
|
The system SHALL detect and fill gaps in PP-StructureV3 output by supplementing with Raw OCR text regions when significant content loss is detected.
|
||||||
- **GIVEN** a user is processing a document with OCR track
|
|
||||||
- **WHEN** the user sets `layout_detection_threshold` to 0.1 (lower than default 0.2)
|
|
||||||
- **THEN** the OCR engine SHALL detect more layout blocks including weak signals
|
|
||||||
- **AND** the processing SHALL use the custom parameter instead of backend defaults
|
|
||||||
- **AND** the custom parameter SHALL NOT be cached for reuse
|
|
||||||
|
|
||||||
#### Scenario: User selects high-quality preset configuration
|
#### Scenario: Gap filling activates when coverage is low
|
||||||
- **GIVEN** a user wants to process a complex document with many small text elements
|
- **GIVEN** an OCR track processing task
|
||||||
- **WHEN** the user selects "High Quality" preset mode
|
- **WHEN** PP-StructureV3 outputs elements that cover less than 70% of Raw OCR text regions
|
||||||
- **THEN** the system SHALL automatically set:
|
- **THEN** the system SHALL activate gap filling
|
||||||
- `layout_detection_threshold` to 0.1
|
- **AND** identify Raw OCR regions not covered by any PP-StructureV3 element
|
||||||
- `layout_nms_threshold` to 0.15
|
- **AND** supplement these regions as TEXT elements in the output
|
||||||
- `text_det_thresh` to 0.1
|
|
||||||
- `text_det_box_thresh` to 0.2
|
|
||||||
- **AND** process the document with these optimized parameters
|
|
||||||
|
|
||||||
#### Scenario: User adjusts text detection parameters
|
#### Scenario: Coverage is determined by center-point and IoU
|
||||||
- **GIVEN** a document with low-contrast text
|
- **GIVEN** a Raw OCR text region with bounding box
|
||||||
- **WHEN** the user sets:
|
- **WHEN** checking if the region is covered by PP-StructureV3
|
||||||
- `text_det_thresh` to 0.05 (very low)
|
- **THEN** the region SHALL be considered covered if its center point falls inside any PP-StructureV3 element bbox
|
||||||
- `text_det_unclip_ratio` to 1.5 (larger boxes)
|
- **OR** if its IoU with any PP-StructureV3 element exceeds the 0.15 threshold
|
||||||
- **THEN** the OCR SHALL detect more small and low-contrast text
|
- **AND** regions not meeting either criterion SHALL be marked as uncovered
|
||||||
- **AND** text bounding boxes SHALL be expanded by the specified ratio
|
|
||||||
|
|
||||||
#### Scenario: Parameters are sent via API request body
|
#### Scenario: Only TEXT elements are supplemented
|
||||||
- **GIVEN** a frontend application with parameter adjustment UI
|
- **GIVEN** uncovered Raw OCR regions identified for supplementation
|
||||||
- **WHEN** the user starts task processing with custom parameters
|
- **WHEN** PP-StructureV3 has detected TABLE, IMAGE, FIGURE, FLOWCHART, HEADER, or FOOTER elements
|
||||||
- **THEN** the frontend SHALL send parameters in the request body (not query params):
|
- **THEN** the system SHALL NOT supplement regions that overlap with these structural elements
|
||||||
|
- **AND** only supplement regions as TEXT type to preserve structural integrity
|
||||||
|
|
||||||
|
#### Scenario: Supplemented regions meet confidence threshold
|
||||||
|
- **GIVEN** Raw OCR regions to be supplemented
|
||||||
|
- **WHEN** a region has confidence score below 0.3
|
||||||
|
- **THEN** the system SHALL skip that region
|
||||||
|
- **AND** only supplement regions with confidence >= 0.3
|
||||||
|
|
||||||
|
#### Scenario: Deduplication prevents repeated text
|
||||||
|
- **GIVEN** a Raw OCR region being considered for supplementation
|
||||||
|
- **WHEN** the region has IoU > 0.5 with any existing PP-StructureV3 TEXT element
|
||||||
|
- **THEN** the system SHALL skip that region to prevent duplicate text
|
||||||
|
- **AND** the original PP-StructureV3 element SHALL be preserved
|
||||||
|
|
||||||
|
#### Scenario: Reading order is recalculated after gap filling
|
||||||
|
- **GIVEN** supplemented elements have been added to the page
|
||||||
|
- **WHEN** assembling the final element list
|
||||||
|
- **THEN** the system SHALL recalculate reading order for the entire page
|
||||||
|
- **AND** sort elements by y0 coordinate (top to bottom) then x0 (left to right)
|
||||||
|
- **AND** ensure logical document flow is maintained
|
||||||
|
|
||||||
|
#### Scenario: Coordinate alignment with ocr_dimensions
|
||||||
|
- **GIVEN** Raw OCR processing may involve image resizing
|
||||||
|
- **WHEN** comparing Raw OCR bbox with PP-StructureV3 bbox
|
||||||
|
- **THEN** the system SHALL use ocr_dimensions to normalize coordinates
|
||||||
|
- **AND** ensure both sources reference the same coordinate space
|
||||||
|
- **AND** prevent coverage misdetection due to scale differences
|
||||||
|
|
||||||
|
#### Scenario: Supplemented elements have complete metadata
|
||||||
|
- **GIVEN** a Raw OCR region being added as supplemented element
|
||||||
|
- **WHEN** creating the DocumentElement
|
||||||
|
- **THEN** the element SHALL include page_number
|
||||||
|
- **AND** include confidence score from Raw OCR
|
||||||
|
- **AND** include original bbox coordinates
|
||||||
|
- **AND** optionally include source indicator for debugging
|
||||||
|
|
||||||
|
### Requirement: Gap Filling Track Isolation
|
||||||
|
|
||||||
|
The gap filling feature SHALL only apply to OCR track processing and SHALL NOT affect Direct or Hybrid track outputs.
|
||||||
|
|
||||||
|
#### Scenario: Gap filling only activates for OCR track
|
||||||
|
- **GIVEN** a document processing task
|
||||||
|
- **WHEN** the processing track is OCR
|
||||||
|
- **THEN** the system SHALL evaluate and apply gap filling as needed
|
||||||
|
- **AND** produce enhanced output with supplemented content
|
||||||
|
|
||||||
|
#### Scenario: Direct track is unaffected
|
||||||
|
- **GIVEN** a document processing task with Direct track
|
||||||
|
- **WHEN** the task is processed
|
||||||
|
- **THEN** the system SHALL NOT invoke any gap filling logic
|
||||||
|
- **AND** produce output identical to current Direct track behavior
|
||||||
|
|
||||||
|
#### Scenario: Hybrid track is unaffected
|
||||||
|
- **GIVEN** a document processing task with Hybrid track
|
||||||
|
- **WHEN** the task is processed
|
||||||
|
- **THEN** the system SHALL NOT invoke gap filling logic
|
||||||
|
- **AND** use existing Hybrid track processing pipeline
|
||||||
|
|
||||||
|
### Requirement: Gap Filling Configuration
|
||||||
|
|
||||||
|
The system SHALL provide configurable parameters for gap filling behavior.
|
||||||
|
|
||||||
|
#### Scenario: Gap filling can be disabled via configuration
|
||||||
|
- **GIVEN** gap_filling_enabled is set to false in configuration
|
||||||
|
- **WHEN** OCR track processing runs
|
||||||
|
- **THEN** the system SHALL skip all gap filling logic
|
||||||
|
- **AND** output only PP-StructureV3 results as before
|
||||||
|
|
||||||
|
#### Scenario: Coverage threshold is configurable
|
||||||
|
- **GIVEN** gap_filling_coverage_threshold is set to 0.8
|
||||||
|
- **WHEN** PP-StructureV3 coverage is 75%
|
||||||
|
- **THEN** the system SHALL activate gap filling
|
||||||
|
- **AND** supplement uncovered regions
|
||||||
|
|
||||||
|
#### Scenario: IoU thresholds are configurable
|
||||||
|
- **GIVEN** custom IoU thresholds configured:
|
||||||
|
- gap_filling_iou_threshold: 0.2
|
||||||
|
- gap_filling_dedup_iou_threshold: 0.6
|
||||||
|
- **WHEN** evaluating coverage and deduplication
|
||||||
|
- **THEN** the system SHALL use the configured values
|
||||||
|
- **AND** apply them consistently throughout gap filling process
|
||||||
|
|
||||||
|
#### Scenario: Confidence threshold is configurable
|
||||||
|
- **GIVEN** gap_filling_confidence_threshold is set to 0.5
|
||||||
|
- **WHEN** supplementing Raw OCR regions
|
||||||
|
- **THEN** the system SHALL only include regions with confidence >= 0.5
|
||||||
|
- **AND** filter out lower confidence regions
|
||||||
|
|
||||||
|
### Requirement: Layout Model Selection
|
||||||
|
The system SHALL allow users to select a layout detection model optimized for their document type, providing a simple choice between pre-configured models instead of manual parameter tuning.
|
||||||
|
|
||||||
|
#### Scenario: User selects Chinese document model
|
||||||
|
- **GIVEN** a user is processing Chinese business documents (forms, contracts, invoices)
|
||||||
|
- **WHEN** the user selects "Chinese Document Model" (PP-DocLayout-S)
|
||||||
|
- **THEN** the OCR engine SHALL use the PP-DocLayout-S layout detection model
|
||||||
|
- **AND** the model SHALL be optimized for 23 Chinese document element types
|
||||||
|
- **AND** table and form detection accuracy SHALL be improved over the default model
|
||||||
|
|
||||||
|
#### Scenario: User selects standard model for English documents
|
||||||
|
- **GIVEN** a user is processing English academic papers or reports
|
||||||
|
- **WHEN** the user selects "Standard Model" (PubLayNet-based)
|
||||||
|
- **THEN** the OCR engine SHALL use the default PubLayNet-based layout detection model
|
||||||
|
- **AND** the model SHALL be optimized for English document layouts
|
||||||
|
|
||||||
|
#### Scenario: User selects CDLA model for specialized Chinese layout
|
||||||
|
- **GIVEN** a user is processing Chinese documents with complex layouts
|
||||||
|
- **WHEN** the user selects "CDLA Model"
|
||||||
|
- **THEN** the OCR engine SHALL use the picodet_lcnet_x1_0_fgd_layout_cdla model
|
||||||
|
- **AND** the model SHALL provide specialized Chinese document layout analysis
|
||||||
|
|
||||||
|
#### Scenario: Layout model is sent via API request
|
||||||
|
- **GIVEN** a frontend application with model selection UI
|
||||||
|
- **WHEN** the user starts task processing with a selected model
|
||||||
|
- **THEN** the frontend SHALL send the model choice in the request body:
|
||||||
```json
|
```json
|
||||||
POST /api/v2/tasks/{task_id}/start
|
POST /api/v2/tasks/{task_id}/start
|
||||||
{
|
{
|
||||||
"use_dual_track": true,
|
"use_dual_track": true,
|
||||||
"force_track": "ocr",
|
"force_track": "ocr",
|
||||||
"language": "ch",
|
"language": "ch",
|
||||||
"pp_structure_params": {
|
"layout_model": "chinese"
|
||||||
"layout_detection_threshold": 0.15,
|
|
||||||
"layout_merge_bboxes_mode": "small",
|
|
||||||
"text_det_thresh": 0.1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
- **AND** the backend SHALL parse and apply these parameters
|
- **AND** the backend SHALL configure PP-StructureV3 with the corresponding model
|
||||||
|
|
||||||
#### Scenario: Backward compatibility is maintained
|
#### Scenario: Default model when not specified
|
||||||
- **GIVEN** existing API clients without PP-StructureV3 parameter support
|
- **GIVEN** an API request without `layout_model` parameter
|
||||||
- **WHEN** a task is started without `pp_structure_params`
|
- **WHEN** the task is started
|
||||||
- **THEN** the system SHALL use backend default settings
|
- **THEN** the system SHALL use "chinese" (PP-DocLayout-S) as the default model
|
||||||
- **AND** processing SHALL work exactly as before
|
- **AND** processing SHALL work correctly without requiring model selection
|
||||||
- **AND** no errors SHALL occur
|
|
||||||
|
|
||||||
#### Scenario: Invalid parameters are rejected
|
#### Scenario: Invalid model name is rejected
|
||||||
- **GIVEN** a request with invalid parameter values
|
- **GIVEN** a request with an invalid `layout_model` value
|
||||||
- **WHEN** the user sends:
|
- **WHEN** the user sends `layout_model: "invalid_model"`
|
||||||
- `layout_detection_threshold` = 1.5 (exceeds max 1.0)
|
|
||||||
- `layout_merge_bboxes_mode` = "invalid" (not in allowed values)
|
|
||||||
- **THEN** the API SHALL return 422 Validation Error
|
- **THEN** the API SHALL return 422 Validation Error
|
||||||
- **AND** provide clear error messages about invalid parameters
|
- **AND** provide a clear error message listing valid model options
|
||||||
|
|
||||||
#### Scenario: Custom parameters affect only current processing
|
### Requirement: Layout Model Selection UI
|
||||||
- **GIVEN** multiple concurrent OCR processing tasks
|
The frontend SHALL provide a simple, user-friendly interface for selecting layout detection models with clear descriptions of each option.
|
||||||
- **WHEN** Task A uses custom parameters and Task B uses defaults
|
|
||||||
- **THEN** Task A SHALL process with its custom parameters
|
|
||||||
- **AND** Task B SHALL process with default parameters
|
|
||||||
- **AND** no parameter interference SHALL occur between tasks
|
|
||||||
|
|
||||||
### Requirement: PP-StructureV3 Parameter UI Controls
|
#### Scenario: Model options are displayed with descriptions
|
||||||
The frontend SHALL provide intuitive UI controls for adjusting PP-StructureV3 parameters with appropriate constraints and help text.
|
- **GIVEN** the model selection UI is displayed
|
||||||
|
- **WHEN** the user views the available options
|
||||||
|
- **THEN** the UI SHALL show the following options:
|
||||||
|
- "Chinese Document Model (Recommended)" - for Chinese forms, contracts, invoices
|
||||||
|
- "Standard Model" - for English academic papers, reports
|
||||||
|
- "CDLA Model" - for specialized Chinese layout analysis
|
||||||
|
- **AND** each option SHALL have a brief description of its use case
|
||||||
|
|
||||||
#### Scenario: Slider controls for numeric parameters
|
#### Scenario: Chinese model is selected by default
|
||||||
- **GIVEN** the parameter adjustment UI is displayed
|
- **GIVEN** the user opens the task processing interface
|
||||||
- **WHEN** the user adjusts a numeric parameter slider
|
- **WHEN** the model selection is displayed
|
||||||
- **THEN** the slider SHALL enforce min/max constraints:
|
- **THEN** "Chinese Document Model" SHALL be pre-selected as the default
|
||||||
- Threshold parameters: 0.0 to 1.0
|
- **AND** the user MAY change the selection before starting processing
|
||||||
- Ratio parameters: > 0 (typically 0.5 to 3.0)
|
|
||||||
- **AND** display current value in real-time
|
|
||||||
- **AND** show help text explaining the parameter effect
|
|
||||||
|
|
||||||
#### Scenario: Dropdown for merge mode selection
|
#### Scenario: Model selection is visible only for OCR track
|
||||||
- **GIVEN** the layout merge mode parameter
|
|
||||||
- **WHEN** the user clicks the dropdown
|
|
||||||
- **THEN** the UI SHALL show exactly three options:
|
|
||||||
- "small" (conservative merging)
|
|
||||||
- "large" (aggressive merging)
|
|
||||||
- "union" (middle ground)
|
|
||||||
- **AND** display description for each option
|
|
||||||
|
|
||||||
#### Scenario: Parameters shown only for OCR track
|
|
||||||
- **GIVEN** a document processing interface
|
- **GIVEN** a document processing interface
|
||||||
- **WHEN** the user selects processing track
|
- **WHEN** the user selects processing track
|
||||||
- **THEN** PP-StructureV3 parameters SHALL be shown ONLY when OCR track is selected
|
- **THEN** layout model selection SHALL be shown ONLY when OCR track is selected or auto-detected
|
||||||
- **AND** SHALL be hidden for Direct track
|
- **AND** SHALL be hidden for Direct track (which does not use PP-StructureV3)
|
||||||
- **AND** SHALL be disabled for Auto track until track is determined
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user