feat: simplify layout model selection and archive proposals

Changes:
- Replace PP-Structure 7-slider parameter UI with simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: egg
Date: 2025-11-27 13:27:00 +08:00
Parent: c65df754cf
Commit: 59206a6ab8
35 changed files with 3621 additions and 658 deletions

View File

@@ -3,7 +3,7 @@ Tool_OCR - Configuration Management
Loads environment variables and provides centralized configuration
"""
from typing import List
from typing import List, Optional
from pydantic_settings import BaseSettings
from pydantic import Field
from pathlib import Path
@@ -99,6 +99,33 @@ class Settings(BaseSettings):
text_det_box_thresh: float = Field(default=0.3) # Lower box threshold for better detection
text_det_unclip_ratio: float = Field(default=1.2) # Smaller unclip for tighter text boxes
# Layout Detection Model Configuration
# Available models:
# - None: Use PP-StructureV3's built-in default model (PubLayNet-based)
# - "PP-DocLayout-S": Better for Chinese docs, papers, contracts, exams (23 categories)
# - "picodet_lcnet_x1_0_fgd_layout_cdla": CDLA-based model for Chinese document layout
layout_detection_model_name: Optional[str] = Field(
default="PP-DocLayout-S",
description="Layout detection model name. Set to 'PP-DocLayout-S' for better Chinese document support."
)
layout_detection_model_dir: Optional[str] = Field(
default=None,
description="Custom layout detection model directory. If None, downloads official model."
)
# ===== Gap Filling Configuration =====
# Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete
gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track
gap_filling_coverage_threshold: float = Field(default=0.7) # Activate when coverage < 70%
gap_filling_iou_threshold: float = Field(default=0.15) # IoU threshold for coverage detection
gap_filling_confidence_threshold: float = Field(default=0.3) # Min confidence for raw OCR regions
gap_filling_dedup_iou_threshold: float = Field(default=0.5) # IoU threshold for deduplication
# ===== Debug Configuration =====
# Enable debug outputs for PP-StructureV3 analysis
pp_structure_debug_enabled: bool = Field(default=True) # Save debug files for PP-StructureV3
pp_structure_debug_visualization: bool = Field(default=True) # Generate visualization images
# Performance tuning
use_fp16_inference: bool = Field(default=False) # Half-precision (if supported)
enable_cudnn_benchmark: bool = Field(default=True) # Optimize convolution algorithms
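Since these fields live on a pydantic-settings BaseSettings class, they can be overridden through environment variables without code changes. A hypothetical override, assuming pydantic-settings' default case-insensitive mapping from field names to variable names:

import os

# Hypothetical overrides; set before app.core.config is first imported
os.environ["LAYOUT_DETECTION_MODEL_NAME"] = "picodet_lcnet_x1_0_fgd_layout_cdla"
os.environ["GAP_FILLING_COVERAGE_THRESHOLD"] = "0.6"
os.environ["PP_STRUCTURE_DEBUG_ENABLED"] = "false"

from app.core.config import settings

print(settings.layout_detection_model_name)     # picodet_lcnet_x1_0_fgd_layout_cdla
print(settings.gap_filling_coverage_threshold)  # 0.6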

View File

@@ -68,7 +68,7 @@ def process_task_ocr(
use_dual_track: bool = True,
force_track: Optional[str] = None,
language: str = 'ch',
pp_structure_params: Optional[dict] = None
layout_model: Optional[str] = "chinese"
):
"""
Background task to process OCR for a task with dual-track support.
@@ -84,7 +84,7 @@ def process_task_ocr(
use_dual_track: Enable dual-track processing
force_track: Force specific track ('ocr' or 'direct')
language: OCR language code
pp_structure_params: Optional custom PP-StructureV3 parameters (dict)
layout_model: Layout detection model ('chinese', 'default', 'cdla')
"""
from app.core.database import SessionLocal
from app.models.task import Task
@@ -143,7 +143,7 @@ def process_task_ocr(
output_dir=result_dir,
use_dual_track=use_dual_track,
force_track=force_track,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
else:
# Fall back to traditional processing (no force_track support)
@@ -152,7 +152,7 @@ def process_task_ocr(
lang=language,
detect_layout=True,
output_dir=result_dir,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
# Calculate processing time
@@ -717,14 +717,14 @@ async def start_task(
current_user: User = Depends(get_current_user)
):
"""
Start processing a pending task with dual-track support and optional PP-StructureV3 parameter tuning
Start processing a pending task with dual-track support and layout model selection
- **task_id**: Task UUID
- **options**: Processing options (in request body):
- **use_dual_track**: Enable intelligent track selection (default: true)
- **force_track**: Force specific processing track ('ocr' or 'direct')
- **language**: OCR language code (default: 'ch')
- **pp_structure_params**: Fine-tuning parameters for PP-StructureV3 (OCR track only)
- **layout_model**: Layout detection model ('chinese', 'default', 'cdla')
"""
try:
# Parse processing options with defaults
@@ -735,11 +735,9 @@ async def start_task(
force_track = options.force_track.value if options.force_track else None
language = options.language
# Extract and convert PP-StructureV3 parameters to dict
pp_structure_params = None
if options.pp_structure_params:
pp_structure_params = options.pp_structure_params.model_dump(exclude_none=True)
logger.info(f"Using custom PP-StructureV3 parameters: {pp_structure_params}")
# Extract layout model (default to 'chinese' for best Chinese document support)
layout_model = options.layout_model.value if options.layout_model else "chinese"
logger.info(f"Using layout model: {layout_model}")
# Get task details
task = task_service.get_task_by_id(
@@ -777,7 +775,7 @@ async def start_task(
status=TaskStatus.PROCESSING
)
# Start OCR processing in background with dual-track parameters and custom PP-StructureV3 params
# Start OCR processing in background with dual-track parameters and layout model
background_tasks.add_task(
process_task_ocr,
task_id=task_id,
@@ -787,13 +785,11 @@ async def start_task(
use_dual_track=use_dual_track,
force_track=force_track,
language=language,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")
if pp_structure_params:
logger.info(f"Custom PP-StructureV3 params: {pp_structure_params}")
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}")
return task
except HTTPException:
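For reference, a client call with the new option might look like this; the route prefix, auth header, and placeholders are illustrative, and only layout_model changed compared to the old pp_structure_params payload:

import httpx

task_id = "<task-uuid>"   # placeholder
token = "<access-token>"  # placeholder

# Illustrative request; the actual route prefix may differ in this project
resp = httpx.post(
    f"http://localhost:8000/api/v1/tasks/{task_id}/start",
    json={
        "use_dual_track": True,
        "language": "ch",
        "layout_model": "chinese",  # 'chinese' | 'default' | 'cdla'
    },
    headers={"Authorization": f"Bearer {token}"},
)
resp.raise_for_status()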

View File

@@ -24,6 +24,19 @@ class ProcessingTrackEnum(str, Enum):
AUTO = "auto" # Auto-detect best track
class LayoutModelEnum(str, Enum):
"""Layout detection model selection for OCR track.
Different models are optimized for different document types:
- CHINESE: PP-DocLayout-S, optimized for Chinese documents (forms, contracts, invoices)
- DEFAULT: PubLayNet-based, optimized for English academic papers
- CDLA: CDLA model, specialized Chinese document layout analysis
"""
CHINESE = "chinese" # PP-DocLayout-S - Best for Chinese documents (recommended)
DEFAULT = "default" # PubLayNet-based - Best for English documents
CDLA = "cdla" # CDLA model - Alternative for Chinese layout
class TaskCreate(BaseModel):
"""Task creation request"""
filename: Optional[str] = Field(None, description="Original filename")
@@ -132,7 +145,11 @@ class UploadResponse(BaseModel):
# ===== Dual-Track Processing Schemas =====
class PPStructureV3Params(BaseModel):
"""PP-StructureV3 fine-tuning parameters for OCR track"""
"""PP-StructureV3 fine-tuning parameters for OCR track.
DEPRECATED: This class is deprecated and will be removed in a future version.
Use `layout_model` parameter in ProcessingOptions instead.
"""
layout_detection_threshold: Optional[float] = Field(
None, ge=0, le=1,
description="Layout block detection score threshold (lower=more blocks, higher=high confidence only)"
@@ -172,10 +189,10 @@ class ProcessingOptions(BaseModel):
include_images: bool = Field(default=True, description="Extract and save images")
confidence_threshold: Optional[float] = Field(None, ge=0, le=1, description="OCR confidence threshold")
# PP-StructureV3 fine-tuning parameters (OCR track only)
pp_structure_params: Optional[PPStructureV3Params] = Field(
None,
description="Fine-tuning parameters for PP-StructureV3 (OCR track only)"
# Layout model selection (OCR track only)
layout_model: Optional[LayoutModelEnum] = Field(
default=LayoutModelEnum.CHINESE,
description="Layout detection model: 'chinese' (recommended for Chinese docs), 'default' (English docs), 'cdla' (Chinese layout)"
)
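A minimal use of the new option (module path and the remaining field defaults are assumed from the diff above):

from app.models.schemas import ProcessingOptions, LayoutModelEnum  # assumed module path

opts = ProcessingOptions()                        # defaults to the Chinese layout model
assert opts.layout_model == LayoutModelEnum.CHINESE

opts = ProcessingOptions(layout_model="default")  # pydantic coerces the string to the enum
assert opts.layout_model == LayoutModelEnum.DEFAULT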

View File

@@ -0,0 +1,649 @@
"""
Gap Filling Service for OCR Track
This service detects and fills gaps in PP-StructureV3 output by supplementing
with Raw OCR text regions when significant content loss is detected.
The hybrid approach uses Raw OCR's comprehensive text detection to compensate
for PP-StructureV3's layout model limitations on certain document types.
"""
import logging
from typing import Dict, List, Optional, Tuple, Set, Any
from dataclasses import dataclass
from app.models.unified_document import (
DocumentElement, BoundingBox, ElementType, Dimensions
)
from app.core.config import settings
logger = logging.getLogger(__name__)
# Element types that should NOT be supplemented (preserve structural integrity)
SKIP_ELEMENT_TYPES: Set[ElementType] = {
ElementType.TABLE,
ElementType.IMAGE,
ElementType.FIGURE,
ElementType.CHART,
ElementType.DIAGRAM,
ElementType.HEADER,
ElementType.FOOTER,
ElementType.FORMULA,
ElementType.CODE,
ElementType.BARCODE,
ElementType.QR_CODE,
ElementType.LOGO,
ElementType.STAMP,
ElementType.SIGNATURE,
}
@dataclass
class TextRegion:
"""Represents a raw OCR text region."""
text: str
bbox: List[Any]  # [x0, y0, x1, y1], flat polygon [x1, y1, ...], or nested [[x, y], ...] points
confidence: float
page: int = 0
@property
def normalized_bbox(self) -> Tuple[float, float, float, float]:
"""Get normalized bbox as (x0, y0, x1, y1)."""
if not self.bbox:
return (0, 0, 0, 0)
# Check if bbox is nested list format [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
# This is common PaddleOCR polygon format
if len(self.bbox) >= 1 and isinstance(self.bbox[0], (list, tuple)):
# Nested format: extract all x and y coordinates
xs = [pt[0] for pt in self.bbox if len(pt) >= 2]
ys = [pt[1] for pt in self.bbox if len(pt) >= 2]
if xs and ys:
return (min(xs), min(ys), max(xs), max(ys))
return (0, 0, 0, 0)
# Flat format
if len(self.bbox) == 4:
# Simple [x0, y0, x1, y1] format
return (float(self.bbox[0]), float(self.bbox[1]),
float(self.bbox[2]), float(self.bbox[3]))
elif len(self.bbox) >= 8:
# Flat polygon format: [x1, y1, x2, y2, x3, y3, x4, y4]
xs = [self.bbox[i] for i in range(0, len(self.bbox), 2)]
ys = [self.bbox[i] for i in range(1, len(self.bbox), 2)]
return (min(xs), min(ys), max(xs), max(ys))
return (0, 0, 0, 0)
@property
def center(self) -> Tuple[float, float]:
"""Get center point of the bbox."""
x0, y0, x1, y1 = self.normalized_bbox
return ((x0 + x1) / 2, (y0 + y1) / 2)
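The property accepts the bbox shapes PaddleOCR commonly emits; a quick check of the normalization, assuming the class above:

# Same region expressed as a 4-point polygon and as a flat [x0, y0, x1, y1] box
poly = TextRegion(text="hi", bbox=[[10, 5], [50, 5], [50, 20], [10, 20]], confidence=0.9)
flat = TextRegion(text="hi", bbox=[10, 5, 50, 20], confidence=0.9)
assert poly.normalized_bbox == flat.normalized_bbox == (10, 5, 50, 20)
assert poly.center == (30, 12.5)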
class GapFillingService:
"""
Service for detecting and filling gaps in PP-StructureV3 output.
This service:
1. Calculates coverage of PP-StructureV3 elements over raw OCR regions
2. Identifies uncovered raw OCR regions
3. Supplements uncovered regions as TEXT elements
4. Deduplicates against existing PP-StructureV3 TEXT elements
5. Recalculates reading order for the combined result
"""
def __init__(
self,
coverage_threshold: Optional[float] = None,
iou_threshold: Optional[float] = None,
confidence_threshold: Optional[float] = None,
dedup_iou_threshold: Optional[float] = None,
enabled: Optional[bool] = None
):
"""
Initialize the gap filling service.
Args:
coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7)
iou_threshold: IoU threshold for coverage detection (default: 0.15)
confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3)
dedup_iou_threshold: IoU threshold for deduplication (default: 0.5)
enabled: Whether gap filling is enabled (default: True)
"""
self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr(
settings, 'gap_filling_coverage_threshold', 0.7
)
self.iou_threshold = iou_threshold if iou_threshold is not None else getattr(
settings, 'gap_filling_iou_threshold', 0.15
)
self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr(
settings, 'gap_filling_confidence_threshold', 0.3
)
self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr(
settings, 'gap_filling_dedup_iou_threshold', 0.5
)
self.enabled = enabled if enabled is not None else getattr(
settings, 'gap_filling_enabled', True
)
def should_activate(
self,
raw_ocr_regions: List[TextRegion],
pp_structure_elements: List[DocumentElement]
) -> Tuple[bool, float]:
"""
Determine if gap filling should be activated.
Gap filling activates when the coverage ratio of PP-StructureV3
elements over the raw OCR regions falls below the configured
threshold (default: 70%).
Args:
raw_ocr_regions: List of raw OCR text regions
pp_structure_elements: List of PP-StructureV3 elements
Returns:
Tuple of (should_activate, coverage_ratio)
"""
if not self.enabled:
return False, 1.0
if not raw_ocr_regions:
return False, 1.0
# Calculate coverage
covered_count = 0
for region in raw_ocr_regions:
if self._is_region_covered(region, pp_structure_elements):
covered_count += 1
coverage_ratio = covered_count / len(raw_ocr_regions)
# Check activation conditions
should_activate = coverage_ratio < self.coverage_threshold
if should_activate:
logger.info(
f"Gap filling activated: coverage={coverage_ratio:.2%} < threshold={self.coverage_threshold:.0%}, "
f"raw_regions={len(raw_ocr_regions)}, pp_elements={len(pp_structure_elements)}"
)
else:
logger.debug(
f"Gap filling not needed: coverage={coverage_ratio:.2%} >= threshold={self.coverage_threshold:.0%}"
)
return should_activate, coverage_ratio
def find_uncovered_regions(
self,
raw_ocr_regions: List[TextRegion],
pp_structure_elements: List[DocumentElement]
) -> List[TextRegion]:
"""
Find raw OCR regions not covered by PP-StructureV3 elements.
A region is considered covered if:
1. Its center point falls inside any PP-StructureV3 element bbox, OR
2. IoU with any PP-StructureV3 element exceeds iou_threshold
Args:
raw_ocr_regions: List of raw OCR text regions
pp_structure_elements: List of PP-StructureV3 elements
Returns:
List of uncovered raw OCR regions
"""
uncovered = []
for region in raw_ocr_regions:
# Skip low confidence regions
if region.confidence < self.confidence_threshold:
continue
if not self._is_region_covered(region, pp_structure_elements):
uncovered.append(region)
logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}")
return uncovered
def _is_region_covered(
self,
region: TextRegion,
pp_structure_elements: List[DocumentElement]
) -> bool:
"""
Check if a raw OCR region is covered by any PP-StructureV3 element.
Args:
region: Raw OCR text region
pp_structure_elements: List of PP-StructureV3 elements
Returns:
True if the region is covered
"""
center_x, center_y = region.center
region_bbox = region.normalized_bbox
for element in pp_structure_elements:
elem_bbox = (
element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1
)
# Check 1: Center point falls inside element bbox
if self._point_in_bbox(center_x, center_y, elem_bbox):
return True
# Check 2: IoU exceeds threshold
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.iou_threshold:
return True
return False
def deduplicate_regions(
self,
uncovered_regions: List[TextRegion],
pp_structure_elements: List[DocumentElement]
) -> List[TextRegion]:
"""
Remove regions that highly overlap with existing PP-StructureV3 TEXT elements.
Args:
uncovered_regions: List of uncovered raw OCR regions
pp_structure_elements: List of PP-StructureV3 elements
Returns:
Deduplicated list of regions
"""
# Deduplicate only against text-like elements; structural types in SKIP_ELEMENT_TYPES are excluded
text_elements = [
e for e in pp_structure_elements
if e.type not in SKIP_ELEMENT_TYPES
]
deduplicated = []
for region in uncovered_regions:
region_bbox = region.normalized_bbox
is_duplicate = False
for element in text_elements:
elem_bbox = (
element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1
)
iou = self._calculate_iou(region_bbox, elem_bbox)
if iou > self.dedup_iou_threshold:
logger.debug(
f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'"
)
is_duplicate = True
break
if not is_duplicate:
deduplicated.append(region)
removed_count = len(uncovered_regions) - len(deduplicated)
if removed_count > 0:
logger.debug(f"Removed {removed_count} duplicate regions")
return deduplicated
def convert_regions_to_elements(
self,
regions: List[TextRegion],
page_number: int,
start_element_id: int = 0
) -> List[DocumentElement]:
"""
Convert raw OCR regions to DocumentElement objects.
Args:
regions: List of raw OCR regions to convert
page_number: Page number for the elements
start_element_id: Starting ID counter for elements
Returns:
List of DocumentElement objects
"""
elements = []
for idx, region in enumerate(regions):
x0, y0, x1, y1 = region.normalized_bbox
element = DocumentElement(
element_id=f"gap_fill_{page_number}_{start_element_id + idx}",
type=ElementType.TEXT,
content=region.text,
bbox=BoundingBox(x0=x0, y0=y0, x1=x1, y1=y1),
confidence=region.confidence,
metadata={
'source': 'gap_filling',
'original_confidence': region.confidence
}
)
elements.append(element)
return elements
def recalculate_reading_order(
self,
elements: List[DocumentElement]
) -> List[int]:
"""
Recalculate reading order for elements based on position.
Sorts elements by y0 (top to bottom) then x0 (left to right).
Args:
elements: List of DocumentElement objects
Returns:
List of element indices in reading order
"""
# Create indexed list with position info
indexed_elements = [
(idx, e.bbox.y0, e.bbox.x0)
for idx, e in enumerate(elements)
]
# Sort by y0 then x0
indexed_elements.sort(key=lambda x: (x[1], x[2]))
# Return indices in reading order
return [idx for idx, _, _ in indexed_elements]
def merge_adjacent_regions(
self,
regions: List[TextRegion],
max_horizontal_gap: float = 20.0,
max_vertical_gap: float = 5.0
) -> List[TextRegion]:
"""
Merge fragmented adjacent regions on the same line.
This is optional and can reduce fragmentation from raw OCR.
Args:
regions: List of raw OCR regions
max_horizontal_gap: Maximum horizontal gap to merge (pixels)
max_vertical_gap: Maximum vertical gap to merge (pixels)
Returns:
List of merged regions
"""
if not regions:
return regions
# Sort by y0, then x0
sorted_regions = sorted(
regions,
key=lambda r: (r.normalized_bbox[1], r.normalized_bbox[0])
)
merged = []
current = sorted_regions[0]
for next_region in sorted_regions[1:]:
curr_bbox = current.normalized_bbox
next_bbox = next_region.normalized_bbox
# Check if on same line (vertical overlap)
curr_y_center = (curr_bbox[1] + curr_bbox[3]) / 2
next_y_center = (next_bbox[1] + next_bbox[3]) / 2
vertical_distance = abs(curr_y_center - next_y_center)
# Check horizontal gap
horizontal_gap = next_bbox[0] - curr_bbox[2]
if (vertical_distance < max_vertical_gap and
0 <= horizontal_gap <= max_horizontal_gap):
# Merge regions
merged_bbox = [
min(curr_bbox[0], next_bbox[0]),
min(curr_bbox[1], next_bbox[1]),
max(curr_bbox[2], next_bbox[2]),
max(curr_bbox[3], next_bbox[3])
]
current = TextRegion(
text=current.text + " " + next_region.text,
bbox=merged_bbox,
confidence=min(current.confidence, next_region.confidence),
page=current.page
)
else:
merged.append(current)
current = next_region
merged.append(current)
if len(merged) < len(regions):
logger.debug(f"Merged {len(regions)} regions into {len(merged)}")
return merged
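A toy merge of two fragments that sit on the same text line, using the default gap thresholds above:

svc = GapFillingService()
frags = [
    TextRegion(text="Hello", bbox=[10, 10, 60, 25], confidence=0.95),
    TextRegion(text="world", bbox=[65, 11, 120, 26], confidence=0.90),
]
merged = svc.merge_adjacent_regions(frags)
assert len(merged) == 1
assert merged[0].text == "Hello world"
assert merged[0].confidence == 0.90  # keeps the lower of the two confidences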
def fill_gaps(
self,
raw_ocr_regions: List[Dict[str, Any]],
pp_structure_elements: List[DocumentElement],
page_number: int,
ocr_dimensions: Optional[Dict[str, Any]] = None,
pp_dimensions: Optional[Dimensions] = None
) -> Tuple[List[DocumentElement], Dict[str, Any]]:
"""
Main entry point: detect gaps and fill with raw OCR regions.
Args:
raw_ocr_regions: Raw OCR results (list of dicts with text, bbox, confidence)
pp_structure_elements: PP-StructureV3 elements
page_number: Current page number
ocr_dimensions: OCR image dimensions for coordinate alignment
pp_dimensions: PP-Structure dimensions for coordinate alignment
Returns:
Tuple of (supplemented_elements, statistics)
"""
statistics = {
'enabled': self.enabled,
'activated': False,
'coverage_ratio': 1.0,
'raw_ocr_count': len(raw_ocr_regions),
'pp_structure_count': len(pp_structure_elements),
'uncovered_count': 0,
'deduplicated_count': 0,
'supplemented_count': 0
}
if not self.enabled:
logger.debug("Gap filling is disabled")
return [], statistics
# Convert raw OCR regions to TextRegion objects
text_regions = self._convert_raw_ocr_regions(
raw_ocr_regions, page_number, ocr_dimensions, pp_dimensions
)
if not text_regions:
logger.debug("No valid text regions to process")
return [], statistics
# Check if gap filling should activate
should_activate, coverage_ratio = self.should_activate(
text_regions, pp_structure_elements
)
statistics['coverage_ratio'] = coverage_ratio
statistics['activated'] = should_activate
if not should_activate:
return [], statistics
# Find uncovered regions
uncovered = self.find_uncovered_regions(text_regions, pp_structure_elements)
statistics['uncovered_count'] = len(uncovered)
if not uncovered:
logger.debug("No uncovered regions found")
return [], statistics
# Deduplicate against existing TEXT elements
deduplicated = self.deduplicate_regions(uncovered, pp_structure_elements)
statistics['deduplicated_count'] = len(deduplicated)
if not deduplicated:
logger.debug("All uncovered regions were duplicates")
return [], statistics
# Optional: Merge adjacent regions
# merged = self.merge_adjacent_regions(deduplicated)
# Convert to DocumentElements
start_id = len(pp_structure_elements)
supplemented = self.convert_regions_to_elements(
deduplicated, page_number, start_id
)
statistics['supplemented_count'] = len(supplemented)
logger.info(
f"Gap filling complete: supplemented {len(supplemented)} elements "
f"(coverage: {coverage_ratio:.2%} -> estimated "
f"{coverage_ratio + len(supplemented) / len(text_regions):.2%})"
)
return supplemented, statistics
def _convert_raw_ocr_regions(
self,
raw_regions: List[Dict[str, Any]],
page_number: int,
ocr_dimensions: Optional[Dict[str, Any]] = None,
pp_dimensions: Optional[Dimensions] = None
) -> List[TextRegion]:
"""
Convert raw OCR region dicts to TextRegion objects.
Handles coordinate alignment if dimensions are provided.
Args:
raw_regions: List of raw OCR region dictionaries
page_number: Current page number
ocr_dimensions: OCR image dimensions
pp_dimensions: PP-Structure dimensions
Returns:
List of TextRegion objects
"""
text_regions = []
# Calculate scale factors if needed
scale_x, scale_y = 1.0, 1.0
if ocr_dimensions and pp_dimensions:
ocr_width = ocr_dimensions.get('width', 0)
ocr_height = ocr_dimensions.get('height', 0)
if ocr_width > 0 and pp_dimensions.width > 0:
scale_x = pp_dimensions.width / ocr_width
if ocr_height > 0 and pp_dimensions.height > 0:
scale_y = pp_dimensions.height / ocr_height
if scale_x != 1.0 or scale_y != 1.0:
logger.debug(f"Coordinate scaling: x={scale_x:.3f}, y={scale_y:.3f}")
for region in raw_regions:
text = region.get('text', '')
if not text or not text.strip():
continue
confidence = region.get('confidence', 0.0)
bbox_raw = region.get('bbox', [])
# Normalize bbox
if isinstance(bbox_raw, dict):
# Dict format: {x_min, y_min, x_max, y_max}
bbox = [
bbox_raw.get('x_min', 0),
bbox_raw.get('y_min', 0),
bbox_raw.get('x_max', 0),
bbox_raw.get('y_max', 0)
]
elif isinstance(bbox_raw, (list, tuple)):
bbox = list(bbox_raw)
else:
continue
# Apply scaling if needed
if scale_x != 1.0 or scale_y != 1.0:
# Check if nested list format [[x1,y1], [x2,y2], ...]
if len(bbox) >= 1 and isinstance(bbox[0], (list, tuple)):
bbox = [
[pt[0] * scale_x, pt[1] * scale_y]
for pt in bbox if len(pt) >= 2
]
elif len(bbox) == 4 and not isinstance(bbox[0], (list, tuple)):
# Simple [x0, y0, x1, y1] format
bbox = [
bbox[0] * scale_x, bbox[1] * scale_y,
bbox[2] * scale_x, bbox[3] * scale_y
]
elif len(bbox) >= 8:
# Flat polygon format [x1, y1, x2, y2, ...]
bbox = [
bbox[i] * (scale_x if i % 2 == 0 else scale_y)
for i in range(len(bbox))
]
text_regions.append(TextRegion(
text=text,
bbox=bbox,
confidence=confidence,
page=page_number
))
return text_regions
@staticmethod
def _point_in_bbox(
x: float, y: float,
bbox: Tuple[float, float, float, float]
) -> bool:
"""Check if point (x, y) is inside bbox (x0, y0, x1, y1)."""
x0, y0, x1, y1 = bbox
return x0 <= x <= x1 and y0 <= y <= y1
@staticmethod
def _calculate_iou(
bbox1: Tuple[float, float, float, float],
bbox2: Tuple[float, float, float, float]
) -> float:
"""
Calculate Intersection over Union (IoU) of two bboxes.
Args:
bbox1: First bbox (x0, y0, x1, y1)
bbox2: Second bbox (x0, y0, x1, y1)
Returns:
IoU value between 0 and 1
"""
# Calculate intersection
x0 = max(bbox1[0], bbox2[0])
y0 = max(bbox1[1], bbox2[1])
x1 = min(bbox1[2], bbox2[2])
y1 = min(bbox1[3], bbox2[3])
if x1 <= x0 or y1 <= y0:
return 0.0
intersection = (x1 - x0) * (y1 - y0)
# Calculate union
area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
union = area1 + area2 - intersection
if union <= 0:
return 0.0
return intersection / union
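Putting the pieces together, a minimal single-page run of the service. The DocumentElement and BoundingBox keyword arguments are assumed from their usage in convert_regions_to_elements above; coordinates and text are illustrative:

svc = GapFillingService(coverage_threshold=0.7, confidence_threshold=0.3)

pp_elements = [
    DocumentElement(
        element_id="pp_0", type=ElementType.TEXT, content="Invoice No. 123",
        bbox=BoundingBox(x0=50, y0=40, x1=400, y1=70), confidence=0.98,
        metadata={},
    ),
]
raw_regions = [
    {"text": "Invoice No. 123", "bbox": [50, 40, 400, 70], "confidence": 0.98},
    {"text": "Total: 1,250.00", "bbox": [50, 700, 300, 730], "confidence": 0.91},
]

supplemented, stats = svc.fill_gaps(
    raw_ocr_regions=raw_regions,
    pp_structure_elements=pp_elements,
    page_number=1,
)
# Coverage is 1/2 = 50% < 70%, so the missed "Total" line comes back as a TEXT element
assert stats["activated"] is True
assert len(supplemented) == 1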

View File

@@ -46,6 +46,19 @@ except ImportError as e:
logger = logging.getLogger(__name__)
# Sentinel value for "use PubLayNet default" - explicitly NO model specification
_USE_PUBLAYNET_DEFAULT = "__USE_PUBLAYNET_DEFAULT__"
# Layout model mapping: user-friendly names to actual model names
# - "chinese": PP-DocLayout-S - Best for Chinese documents (forms, contracts, invoices)
# - "default": PubLayNet-based default model - Best for English documents
# - "cdla": picodet_lcnet_x1_0_fgd_layout_cdla - Alternative for Chinese layout
LAYOUT_MODEL_MAPPING = {
"chinese": "PP-DocLayout-S",
"default": _USE_PUBLAYNET_DEFAULT, # Uses default PubLayNet-based model (no custom model)
"cdla": "picodet_lcnet_x1_0_fgd_layout_cdla",
}
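To make the sentinel concrete, here is the resolution behaviour in isolation: a hypothetical helper that mirrors, in simplified form, what _ensure_structure_engine does below (the real method additionally falls back to the configured default and logs a warning for unknown names):

def _resolve_layout_model(name):
    """Return (model_name_to_pass, use_publaynet_default) for a user-facing selection."""
    if not name:
        return None, False            # no selection -> caller uses the config default
    resolved = LAYOUT_MODEL_MAPPING.get(name)
    if resolved is None:
        return None, False            # unknown name -> caller uses the config default
    if resolved == _USE_PUBLAYNET_DEFAULT:
        return None, True             # "default" -> pass no model name to PPStructureV3
    return resolved, False            # "chinese"/"cdla" -> concrete model name

assert _resolve_layout_model("default") == (None, True)
assert _resolve_layout_model("chinese") == ("PP-DocLayout-S", False)
assert _resolve_layout_model("cdla") == ("picodet_lcnet_x1_0_fgd_layout_cdla", False)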
class OCRService:
"""
@@ -436,77 +449,45 @@ class OCRService:
return self.ocr_engines[lang]
def _ensure_structure_engine(self, custom_params: Optional[Dict[str, any]] = None) -> PPStructureV3:
def _ensure_structure_engine(self, layout_model: Optional[str] = None) -> PPStructureV3:
"""
Get or create PP-Structure engine for layout analysis with GPU support.
Supports custom parameters that override default settings.
Supports layout model selection for different document types.
Args:
custom_params: Optional dictionary of custom PP-StructureV3 parameters.
If provided, creates a new engine instance (not cached).
Supported keys: layout_detection_threshold, layout_nms_threshold,
layout_merge_bboxes_mode, layout_unclip_ratio, text_det_thresh,
text_det_box_thresh, text_det_unclip_ratio
layout_model: Layout detection model selection:
- "chinese": PP-DocLayout-S (best for Chinese documents)
- "default": PubLayNet-based (best for English documents)
- "cdla": CDLA model (alternative for Chinese layout)
- None: Use config default
Returns:
PPStructureV3 engine instance
"""
# If custom params provided, create a new engine instance (don't use cache)
if custom_params:
logger.info(f"Creating PP-StructureV3 engine with custom parameters (GPU: {self.use_gpu})")
logger.info(f"Custom params: {custom_params}")
# Resolve layout model name from user-friendly name
resolved_model_name = None
use_publaynet_default = False # Flag to explicitly use PubLayNet default (no model param)
try:
# Base configuration from settings
use_chart = settings.enable_chart_recognition
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
if layout_model:
resolved_model_name = LAYOUT_MODEL_MAPPING.get(layout_model)
if layout_model not in LAYOUT_MODEL_MAPPING:
logger.warning(f"Unknown layout model '{layout_model}', using config default")
resolved_model_name = settings.layout_detection_model_name
elif resolved_model_name == _USE_PUBLAYNET_DEFAULT:
# User explicitly selected "default" - use PubLayNet without custom model
use_publaynet_default = True
resolved_model_name = None
logger.info(f"Using layout model: {layout_model} -> PubLayNet default (no custom model)")
else:
logger.info(f"Using layout model: {layout_model} -> {resolved_model_name}")
# Parameter priority: custom > settings default
layout_threshold = custom_params.get('layout_detection_threshold', settings.layout_detection_threshold)
layout_nms = custom_params.get('layout_nms_threshold', settings.layout_nms_threshold)
layout_merge = custom_params.get('layout_merge_bboxes_mode', settings.layout_merge_mode)
layout_unclip = custom_params.get('layout_unclip_ratio', settings.layout_unclip_ratio)
text_thresh = custom_params.get('text_det_thresh', settings.text_det_thresh)
text_box_thresh = custom_params.get('text_det_box_thresh', settings.text_det_box_thresh)
text_unclip = custom_params.get('text_det_unclip_ratio', settings.text_det_unclip_ratio)
# Check if we need to recreate the engine due to different model
current_model = getattr(self, '_current_layout_model', None)
if self.structure_engine is not None and layout_model and layout_model != current_model:
logger.info(f"Layout model changed from {current_model} to {layout_model}, recreating engine")
self.structure_engine = None # Force recreation
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
# Create temporary engine with custom params (not cached)
custom_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=use_table,
use_formula_recognition=use_formula,
use_chart_recognition=use_chart,
layout_threshold=layout_threshold,
layout_nms=layout_nms,
layout_unclip_ratio=layout_unclip,
layout_merge_bboxes_mode=layout_merge,
text_det_thresh=text_thresh,
text_det_box_thresh=text_box_thresh,
text_det_unclip_ratio=text_unclip,
)
logger.info(f"PP-StructureV3 engine with custom params ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
# Check GPU memory after loading
if self.use_gpu and settings.enable_memory_optimization:
self._check_gpu_memory_usage()
return custom_engine
except Exception as e:
logger.error(f"Failed to create PP-StructureV3 engine with custom params: {e}")
# Fall back to default cached engine
logger.warning("Falling back to default cached engine")
custom_params = None # Clear custom params to use cached engine
# Use cached default engine
# Use cached engine or create new one
if self.structure_engine is None:
logger.info(f"Initializing PP-StructureV3 engine (GPU: {self.use_gpu})")
@@ -524,28 +505,51 @@ class OCRService:
text_box_thresh = settings.text_det_box_thresh
text_unclip = settings.text_det_unclip_ratio
# Layout model configuration:
# - If use_publaynet_default: don't specify any model (use PubLayNet default)
# - If resolved_model_name: use the specified model
# - Otherwise: use config default
if use_publaynet_default:
layout_model_name = None # Explicitly no model = PubLayNet default
elif resolved_model_name:
layout_model_name = resolved_model_name
else:
layout_model_name = settings.layout_detection_model_name
layout_model_dir = settings.layout_detection_model_dir
logger.info(f"PP-StructureV3 config: table={use_table}, formula={use_formula}, chart={use_chart}")
logger.info(f"Layout model: name={layout_model_name}, dir={layout_model_dir}")
logger.info(f"Layout config: threshold={layout_threshold}, nms={layout_nms}, merge={layout_merge}, unclip={layout_unclip}")
logger.info(f"Text detection: thresh={text_thresh}, box_thresh={text_box_thresh}, unclip={text_unclip}")
self.structure_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=use_table,
use_formula_recognition=use_formula,
use_chart_recognition=use_chart,
layout_threshold=layout_threshold,
layout_nms=layout_nms,
layout_unclip_ratio=layout_unclip,
layout_merge_bboxes_mode=layout_merge, # Use 'small' to minimize merging
text_det_thresh=text_thresh,
text_det_box_thresh=text_box_thresh,
text_det_unclip_ratio=text_unclip,
)
# Build PPStructureV3 kwargs
pp_kwargs = {
'use_doc_orientation_classify': False,
'use_doc_unwarping': False,
'use_textline_orientation': False,
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
'layout_threshold': layout_threshold,
'layout_nms': layout_nms,
'layout_unclip_ratio': layout_unclip,
'layout_merge_bboxes_mode': layout_merge,
'text_det_thresh': text_thresh,
'text_det_box_thresh': text_box_thresh,
'text_det_unclip_ratio': text_unclip,
}
# Add layout model configuration if specified
if layout_model_name:
pp_kwargs['layout_detection_model_name'] = layout_model_name
if layout_model_dir:
pp_kwargs['layout_detection_model_dir'] = layout_model_dir
self.structure_engine = PPStructureV3(**pp_kwargs)
# Track model loading for cache management
self._model_last_used['structure'] = datetime.now()
self._current_layout_model = layout_model # Track current model for recreation check
logger.info(f"PP-StructureV3 engine ready (PaddlePaddle {paddle.__version__}, {'GPU' if self.use_gpu else 'CPU'} mode)")
@@ -565,17 +569,27 @@ class OCRService:
use_formula = settings.enable_formula_recognition
use_table = settings.enable_table_recognition
layout_threshold = settings.layout_detection_threshold
layout_model_name = settings.layout_detection_model_name
layout_model_dir = settings.layout_detection_model_dir
self.structure_engine = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
use_table_recognition=use_table,
use_formula_recognition=use_formula,
use_chart_recognition=use_chart,
layout_threshold=layout_threshold,
)
logger.info("PP-StructureV3 engine ready (CPU mode - fallback)")
# Build CPU fallback kwargs
cpu_kwargs = {
'use_doc_orientation_classify': False,
'use_doc_unwarping': False,
'use_textline_orientation': False,
'use_table_recognition': use_table,
'use_formula_recognition': use_formula,
'use_chart_recognition': use_chart,
'layout_threshold': layout_threshold,
}
if layout_model_name:
cpu_kwargs['layout_detection_model_name'] = layout_model_name
if layout_model_dir:
cpu_kwargs['layout_detection_model_dir'] = layout_model_dir
self.structure_engine = PPStructureV3(**cpu_kwargs)
self._current_layout_model = layout_model # Track current model for recreation check
logger.info(f"PP-StructureV3 engine ready (CPU mode - fallback, layout_model={layout_model_name})")
else:
raise
@@ -813,7 +827,7 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
current_page: int = 0,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Dict:
"""
Process single image with OCR and layout analysis
@@ -825,7 +839,7 @@ class OCRService:
confidence_threshold: Minimum confidence threshold (uses default if None)
output_dir: Optional output directory for saving extracted images
current_page: Current page number (0-based) for multi-page documents
pp_structure_params: Optional custom PP-StructureV3 parameters
layout_model: Layout detection model ('chinese', 'default', 'cdla')
Returns:
Dictionary with OCR results and metadata
@@ -894,7 +908,7 @@ class OCRService:
confidence_threshold=confidence_threshold,
output_dir=output_dir,
current_page=page_num - 1, # Convert to 0-based page number for layout data
pp_structure_params=pp_structure_params
layout_model=layout_model
)
# Accumulate results
@@ -1040,7 +1054,7 @@ class OCRService:
image_path,
output_dir=output_dir,
current_page=current_page,
pp_structure_params=pp_structure_params
layout_model=layout_model
)
# Generate Markdown
@@ -1078,6 +1092,38 @@ class OCRService:
'height': ocr_height
}]
# Generate PP-StructureV3 debug outputs if enabled
if settings.pp_structure_debug_enabled and output_dir:
try:
from app.services.pp_structure_debug import PPStructureDebug
debug_service = PPStructureDebug(output_dir)
# Save raw results as JSON
debug_service.save_raw_results(
pp_structure_results={
'elements': layout_data.get('elements', []),
'total_elements': layout_data.get('total_elements', 0),
'element_types': layout_data.get('element_types', {}),
'reading_order': layout_data.get('reading_order', []),
'enhanced': True,
'has_parsing_res_list': True
},
raw_ocr_regions=text_regions,
filename_prefix=image_path.stem
)
# Generate visualization if enabled
if settings.pp_structure_debug_visualization:
debug_service.generate_visualization(
image_path=image_path,
pp_structure_elements=layout_data.get('elements', []),
raw_ocr_regions=text_regions,
filename_prefix=image_path.stem
)
logger.info(f"Generated PP-StructureV3 debug outputs for {image_path.name}")
except Exception as debug_error:
logger.warning(f"Failed to generate debug outputs: {debug_error}")
logger.info(
f"OCR completed: {image_path.name} - "
f"{len(text_regions)} regions, "
@@ -1164,7 +1210,7 @@ class OCRService:
image_path: Path,
output_dir: Optional[Path] = None,
current_page: int = 0,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1173,7 +1219,7 @@ class OCRService:
image_path: Path to image file
output_dir: Optional output directory for saving extracted images (defaults to image_path.parent)
current_page: Current page number (0-based) for multi-page documents
pp_structure_params: Optional custom PP-StructureV3 parameters
layout_model: Layout detection model ('chinese', 'default', 'cdla')
Returns:
Tuple of (layout_data, images_metadata)
@@ -1191,7 +1237,7 @@ class OCRService:
f"Mode: {'CPU fallback' if self._cpu_fallback_active else 'GPU'}"
)
structure_engine = self._ensure_structure_engine(pp_structure_params)
structure_engine = self._ensure_structure_engine(layout_model)
# Try enhanced processing first
try:
@@ -1425,7 +1471,7 @@ class OCRService:
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
force_track: Optional[str] = None,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Union[UnifiedDocument, Dict]:
"""
Process document using dual-track approach.
@@ -1437,7 +1483,7 @@ class OCRService:
confidence_threshold: Minimum confidence threshold
output_dir: Optional output directory for extracted images
force_track: Force specific track ("ocr" or "direct"), None for auto-detection
pp_structure_params: Optional custom PP-StructureV3 parameters (used for OCR track only)
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
Returns:
UnifiedDocument if dual-track is enabled, Dict otherwise
@@ -1445,7 +1491,7 @@ class OCRService:
if not self.dual_track_enabled:
# Fallback to traditional OCR processing
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
start_time = datetime.now()
@@ -1517,7 +1563,7 @@ class OCRService:
ocr_result = self.process_file_traditional(
actual_file_path, lang, detect_layout=True,
confidence_threshold=confidence_threshold,
output_dir=output_dir, pp_structure_params=pp_structure_params
output_dir=output_dir, layout_model=layout_model
)
# Convert OCR result to extract images
@@ -1550,7 +1596,7 @@ class OCRService:
# Use OCR for scanned documents, images, etc.
logger.info("Using OCR track (PaddleOCR)")
ocr_result = self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
# Convert OCR result to UnifiedDocument using the converter
@@ -1580,7 +1626,7 @@ class OCRService:
logger.error(f"Error in dual-track processing: {e}")
# Fallback to traditional OCR
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
def _merge_ocr_images_into_direct(
@@ -1659,7 +1705,7 @@ class OCRService:
detect_layout: bool = True,
confidence_threshold: Optional[float] = None,
output_dir: Optional[Path] = None,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Dict:
"""
Traditional OCR processing (legacy method).
@@ -1670,7 +1716,7 @@ class OCRService:
detect_layout: Whether to perform layout analysis
confidence_threshold: Minimum confidence threshold
output_dir: Optional output directory
pp_structure_params: Optional custom PP-StructureV3 parameters
layout_model: Layout detection model ('chinese', 'default', 'cdla')
Returns:
Dictionary with OCR results in legacy format
@@ -1683,7 +1729,7 @@ class OCRService:
all_results = []
for i, image_path in enumerate(image_paths):
result = self.process_image(
image_path, lang, detect_layout, confidence_threshold, output_dir, i, pp_structure_params
image_path, lang, detect_layout, confidence_threshold, output_dir, i, layout_model
)
all_results.append(result)
@@ -1699,7 +1745,7 @@ class OCRService:
else:
# Single image or other file
return self.process_image(
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, 0, layout_model
)
def _combine_results(self, results: List[Dict]) -> Dict:
@@ -1784,7 +1830,7 @@ class OCRService:
output_dir: Optional[Path] = None,
use_dual_track: bool = True,
force_track: Optional[str] = None,
pp_structure_params: Optional[Dict[str, any]] = None
layout_model: Optional[str] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -1797,7 +1843,7 @@ class OCRService:
output_dir: Optional output directory
use_dual_track: Whether to use dual-track processing (default True)
force_track: Force specific track ("ocr" or "direct")
pp_structure_params: Optional custom PP-StructureV3 parameters (used for OCR track only)
layout_model: Layout detection model ('chinese', 'default', 'cdla') (used for OCR track only)
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
@@ -1809,12 +1855,12 @@ class OCRService:
if (use_dual_track or force_track) and self.dual_track_enabled:
# Use dual-track processing (or forced track)
return self.process_with_dual_track(
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, force_track, layout_model
)
else:
# Use traditional OCR processing (no force_track support)
return self.process_file_traditional(
file_path, lang, detect_layout, confidence_threshold, output_dir, pp_structure_params
file_path, lang, detect_layout, confidence_threshold, output_dir, layout_model
)
def process_legacy(

View File

@@ -3,6 +3,9 @@ OCR to UnifiedDocument Converter
Converts PP-StructureV3 OCR results to UnifiedDocument format, preserving
all structure information and metadata.
Includes gap filling support to supplement PP-StructureV3 output with raw OCR
regions when significant content loss is detected.
"""
import logging
@@ -16,10 +19,165 @@ from app.models.unified_document import (
BoundingBox, StyleInfo, TableData, ElementType,
ProcessingTrack, TableCell, Dimensions
)
from app.services.gap_filling_service import GapFillingService
logger = logging.getLogger(__name__)
def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
"""
Remove empty columns from a table dictionary.
A column is considered empty if ALL cells in that column have content that is
empty or whitespace-only (using .strip() to determine emptiness).
This function:
1. Identifies columns where every cell's content is empty/whitespace
2. Removes identified empty columns
3. Updates cols/columns value
4. Recalculates each cell's col index
5. Adjusts col_span when spans cross removed columns
6. Removes cells entirely when their complete span falls within removed columns
7. Preserves original bbox (no layout drift)
Args:
table_dict: Table dictionary with keys: rows, cols/columns, cells
Returns:
Cleaned table dictionary with empty columns removed
"""
cells = table_dict.get('cells', [])
if not cells:
return table_dict
# Get original column count
original_cols = table_dict.get('cols', table_dict.get('columns', 0))
if original_cols == 0:
# Calculate from cells if not provided
max_col = 0
for cell in cells:
cell_col = cell.get('col', 0) if isinstance(cell, dict) else getattr(cell, 'col', 0)
cell_span = cell.get('col_span', 1) if isinstance(cell, dict) else getattr(cell, 'col_span', 1)
max_col = max(max_col, cell_col + cell_span)
original_cols = max_col
if original_cols == 0:
return table_dict
# Build a map: column_index -> list of cell contents
# For cells with col_span > 1, we only check their primary column
column_contents: Dict[int, List[str]] = {i: [] for i in range(original_cols)}
for cell in cells:
if isinstance(cell, dict):
col = cell.get('col', 0)
col_span = cell.get('col_span', 1)
content = cell.get('content', '')
else:
col = getattr(cell, 'col', 0)
col_span = getattr(cell, 'col_span', 1)
content = getattr(cell, 'content', '')
# Mark content for each column this cell spans
for c in range(col, min(col + col_span, original_cols)):
if c in column_contents:
column_contents[c].append(str(content).strip() if content else '')
# Identify empty columns (all content is empty/whitespace)
empty_columns = set()
for col_idx, contents in column_contents.items():
# A column is empty if ALL cells in it have empty content
# Note: If a column has no cells at all, it's considered empty
if all(c == '' for c in contents):
empty_columns.add(col_idx)
if not empty_columns:
# No empty columns to remove, just ensure cols is set
result = dict(table_dict)
if result.get('cols', result.get('columns', 0)) == 0:
result['cols'] = original_cols
if 'columns' in result:
result['columns'] = original_cols
return result
logger.debug(f"Removing empty columns: {sorted(empty_columns)} from table with {original_cols} cols")
# Build column mapping: old_col -> new_col (or None if removed)
col_mapping: Dict[int, Optional[int]] = {}
new_col = 0
for old_col in range(original_cols):
if old_col in empty_columns:
col_mapping[old_col] = None
else:
col_mapping[old_col] = new_col
new_col += 1
new_cols = new_col
# Process cells
new_cells = []
for cell in cells:
if isinstance(cell, dict):
old_col = cell.get('col', 0)
old_col_span = cell.get('col_span', 1)
else:
old_col = getattr(cell, 'col', 0)
old_col_span = getattr(cell, 'col_span', 1)
# Calculate new col and col_span
# Find the first non-removed column in this cell's span
new_start_col = None
new_end_col = None
for c in range(old_col, min(old_col + old_col_span, original_cols)):
mapped = col_mapping.get(c)
if mapped is not None:
if new_start_col is None:
new_start_col = mapped
new_end_col = mapped
# If entire span falls within removed columns, skip this cell
if new_start_col is None:
logger.debug(f"Removing cell at row={cell.get('row', 0) if isinstance(cell, dict) else cell.row}, "
f"col={old_col} (entire span in removed columns)")
continue
new_col_span = new_end_col - new_start_col + 1
# Create new cell
if isinstance(cell, dict):
new_cell = dict(cell)
new_cell['col'] = new_start_col
new_cell['col_span'] = new_col_span
else:
# Handle TableCell objects
new_cell = {
'row': cell.row,
'col': new_start_col,
'row_span': cell.row_span,
'col_span': new_col_span,
'content': cell.content
}
if hasattr(cell, 'bbox') and cell.bbox:
new_cell['bbox'] = cell.bbox
if hasattr(cell, 'style') and cell.style:
new_cell['style'] = cell.style
new_cells.append(new_cell)
# Build result
result = dict(table_dict)
result['cells'] = new_cells
result['cols'] = new_cols
if 'columns' in result:
result['columns'] = new_cols
logger.info(f"Trimmed table: {original_cols} -> {new_cols} columns, "
f"{len(cells)} -> {len(new_cells)} cells")
return result
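A toy table with one all-whitespace column, run through the function above:

table = {
    'rows': 2, 'cols': 3,
    'cells': [
        {'row': 0, 'col': 0, 'col_span': 1, 'content': 'Item'},
        {'row': 0, 'col': 1, 'col_span': 1, 'content': '  '},   # whitespace-only column
        {'row': 0, 'col': 2, 'col_span': 1, 'content': 'Qty'},
        {'row': 1, 'col': 0, 'col_span': 1, 'content': 'Pen'},
        {'row': 1, 'col': 1, 'col_span': 1, 'content': ''},
        {'row': 1, 'col': 2, 'col_span': 1, 'content': '3'},
    ],
}
cleaned = trim_empty_columns(table)
assert cleaned['cols'] == 2                                    # middle column dropped
assert sorted({c['col'] for c in cleaned['cells']}) == [0, 1]  # remaining columns reindexed
assert len(cleaned['cells']) == 4                              # the two empty cells are removed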
class OCRToUnifiedConverter:
"""
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
@@ -30,11 +188,19 @@ class OCRToUnifiedConverter:
- Multi-page document assembly
- Metadata preservation
- Structure relationship mapping
- Gap filling with raw OCR regions (when PP-StructureV3 misses content)
"""
def __init__(self):
"""Initialize the converter."""
def __init__(self, enable_gap_filling: bool = True):
"""
Initialize the converter.
Args:
enable_gap_filling: Whether to enable gap filling with raw OCR regions
"""
self.element_counter = 0
self.gap_filling_service = GapFillingService() if enable_gap_filling else None
self.gap_filling_stats: Dict[str, Any] = {}
def convert(
self,
@@ -120,13 +286,21 @@ class OCRToUnifiedConverter:
Extract pages from OCR results.
Handles both enhanced PP-StructureV3 results (with parsing_res_list)
and traditional markdown results.
and traditional markdown results. Applies gap filling when enabled.
"""
pages = []
# Extract raw OCR text regions for gap filling
raw_text_regions = ocr_results.get('text_regions', [])
ocr_dimensions = ocr_results.get('ocr_dimensions', {})
# Check if we have enhanced results from PPStructureEnhanced
if 'enhanced_results' in ocr_results:
pages = self._extract_from_enhanced_results(ocr_results['enhanced_results'])
pages = self._extract_from_enhanced_results(
ocr_results['enhanced_results'],
raw_text_regions=raw_text_regions,
ocr_dimensions=ocr_dimensions
)
# Check for traditional OCR results with text_regions at top level (from process_file_traditional)
elif 'text_regions' in ocr_results:
pages = self._extract_from_traditional_ocr(ocr_results)
@@ -143,9 +317,21 @@ class OCRToUnifiedConverter:
def _extract_from_enhanced_results(
self,
enhanced_results: List[Dict[str, Any]]
enhanced_results: List[Dict[str, Any]],
raw_text_regions: Optional[List[Dict[str, Any]]] = None,
ocr_dimensions: Optional[Dict[str, Any]] = None
) -> List[Page]:
"""Extract pages from enhanced PP-StructureV3 results."""
"""
Extract pages from enhanced PP-StructureV3 results.
Applies gap filling when enabled to supplement PP-StructureV3 output
with raw OCR regions that were not detected by the layout model.
Args:
enhanced_results: PP-StructureV3 enhanced results
raw_text_regions: Raw OCR text regions for gap filling
ocr_dimensions: OCR image dimensions for coordinate alignment
"""
pages = []
for page_idx, page_result in enumerate(enhanced_results):
@@ -158,15 +344,52 @@ class OCRToUnifiedConverter:
if element:
elements.append(element)
# Get page dimensions
pp_dimensions = Dimensions(
width=page_result.get('width', 0),
height=page_result.get('height', 0)
)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for the current page (tolerates both 0-based and 1-based page indices)
page_raw_regions = [
r for r in raw_text_regions
if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1
]
if page_raw_regions:
supplemented, stats = self.gap_filling_service.fill_gaps(
raw_ocr_regions=page_raw_regions,
pp_structure_elements=elements,
page_number=page_idx + 1,
ocr_dimensions=ocr_dimensions,
pp_dimensions=pp_dimensions
)
# Store statistics
self.gap_filling_stats[f'page_{page_idx + 1}'] = stats
if supplemented:
logger.info(
f"Page {page_idx + 1}: Gap filling added {len(supplemented)} elements "
f"(coverage: {stats.get('coverage_ratio', 0):.2%})"
)
elements.extend(supplemented)
# Recalculate reading order for combined elements
reading_order = self.gap_filling_service.recalculate_reading_order(elements)
page_result['reading_order'] = reading_order
# Create page
page = Page(
page_number=page_idx + 1,
dimensions=Dimensions(
width=page_result.get('width', 0),
height=page_result.get('height', 0)
),
dimensions=pp_dimensions,
elements=elements,
metadata={'reading_order': page_result.get('reading_order', [])}
metadata={
'reading_order': page_result.get('reading_order', []),
'gap_filling': self.gap_filling_stats.get(f'page_{page_idx + 1}', {})
}
)
pages.append(page)
@@ -500,6 +723,9 @@ class OCRToUnifiedConverter:
) -> Optional[DocumentElement]:
"""Convert table data to DocumentElement."""
try:
# Clean up empty columns before building TableData
table_dict = trim_empty_columns(table_dict)
# Extract bbox
bbox_data = table_dict.get('bbox', [0, 0, 0, 0])
bbox = BoundingBox(
@@ -587,14 +813,22 @@ class OCRToUnifiedConverter:
cells = []
headers = []
rows = table.find_all('tr')
num_rows = len(rows)
# Track actual column positions accounting for rowspan/colspan
# This is a simplified approach - complex spanning may need enhancement
# First pass: calculate total columns by finding max column extent
# Track cells that span multiple rows: occupied[row][col] = True
occupied: Dict[int, Dict[int, bool]] = {r: {} for r in range(num_rows)}
# Parse all cells with proper rowspan/colspan handling
for row_idx, row in enumerate(rows):
row_cells = row.find_all(['td', 'th'])
col_idx = 0
for cell in row_cells:
# Skip columns that are occupied by rowspan from previous rows
while occupied[row_idx].get(col_idx, False):
col_idx += 1
cell_content = cell.get_text(strip=True)
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
@@ -611,26 +845,66 @@ class OCRToUnifiedConverter:
if cell.name == 'th' or row_idx == 0:
headers.append(cell_content)
# Mark cells as occupied for rowspan/colspan
for r in range(row_idx, min(row_idx + rowspan, num_rows)):
for c in range(col_idx, col_idx + colspan):
if r not in occupied:
occupied[r] = {}
occupied[r][c] = True
# Advance column index by colspan
col_idx += colspan
# Calculate actual dimensions
num_rows = len(rows)
num_cols = max(
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
for row in rows
) if rows else 0
# Calculate actual column count from occupied cells
num_cols = 0
for r in range(num_rows):
if occupied[r]:
max_col_in_row = max(occupied[r].keys()) + 1
num_cols = max(num_cols, max_col_in_row)
logger.debug(
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
)
# Build table dict for cleanup
table_dict = {
'rows': num_rows,
'cols': num_cols,
'cells': [
{
'row': c.row,
'col': c.col,
'row_span': c.row_span,
'col_span': c.col_span,
'content': c.content
}
for c in cells
],
'headers': headers if headers else None,
'caption': extracted_text if extracted_text else None
}
# Clean up empty columns
table_dict = trim_empty_columns(table_dict)
# Convert cleaned cells back to TableCell objects
cleaned_cells = [
TableCell(
row=c['row'],
col=c['col'],
row_span=c.get('row_span', 1),
col_span=c.get('col_span', 1),
content=c.get('content', '')
)
for c in table_dict.get('cells', [])
]
return TableData(
rows=num_rows,
cols=num_cols,
cells=cells,
headers=headers if headers else None,
caption=extracted_text if extracted_text else None
rows=table_dict.get('rows', num_rows),
cols=table_dict.get('cols', num_cols),
cells=cleaned_cells,
headers=table_dict.get('headers'),
caption=table_dict.get('caption')
)
except ImportError:
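The occupancy-grid bookkeeping above can be illustrated without bs4: a cell that spans two rows in column 0 forces the single cell of the next row over to column 1 (standalone sketch, not project code):

# spans[row] = list of (content, rowspan, colspan) in source order
spans = {0: [("A", 2, 1), ("B", 1, 1)], 1: [("C", 1, 1)]}
num_rows = 2
occupied = {r: {} for r in range(num_rows)}
placed = []
for row_idx in range(num_rows):
    col_idx = 0
    for content, rowspan, colspan in spans[row_idx]:
        while occupied[row_idx].get(col_idx, False):   # skip columns blocked by rowspans
            col_idx += 1
        placed.append((content, row_idx, col_idx))
        for r in range(row_idx, min(row_idx + rowspan, num_rows)):
            for c in range(col_idx, col_idx + colspan):
                occupied[r][c] = True
        col_idx += colspan
assert placed == [("A", 0, 0), ("B", 0, 1), ("C", 1, 1)]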

View File

@@ -0,0 +1,344 @@
"""
PP-StructureV3 Debug Service
Provides debugging tools for visualizing and saving PP-StructureV3 results:
- Save raw results as JSON for inspection
- Generate visualization images showing detected bboxes
- Compare raw OCR regions with PP-StructureV3 elements
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
from PIL import Image, ImageDraw, ImageFont
logger = logging.getLogger(__name__)
# Color palette for different element types (RGB)
ELEMENT_COLORS: Dict[str, Tuple[int, int, int]] = {
'text': (0, 128, 0), # Green
'title': (0, 0, 255), # Blue
'table': (255, 0, 0), # Red
'figure': (255, 165, 0), # Orange
'image': (255, 165, 0), # Orange
'header': (128, 0, 128), # Purple
'footer': (128, 0, 128), # Purple
'equation': (0, 255, 255), # Cyan
'chart': (255, 192, 203), # Pink
'list': (139, 69, 19), # Brown
'reference': (128, 128, 128), # Gray
'default': (255, 0, 255), # Magenta for unknown types
}
# Color for raw OCR regions
RAW_OCR_COLOR = (255, 215, 0) # Gold
class PPStructureDebug:
"""Debug service for PP-StructureV3 analysis results."""
def __init__(self, output_dir: Path):
"""
Initialize debug service.
Args:
output_dir: Directory to save debug outputs
"""
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
def save_raw_results(
self,
pp_structure_results: Dict[str, Any],
raw_ocr_regions: List[Dict[str, Any]],
filename_prefix: str = "debug"
) -> Dict[str, Path]:
"""
Save raw PP-StructureV3 results and OCR regions as JSON files.
Args:
pp_structure_results: Raw PP-StructureV3 analysis results
raw_ocr_regions: Raw OCR text regions
filename_prefix: Prefix for output files
Returns:
Dictionary with paths to saved files
"""
saved_files = {}
# Save PP-StructureV3 results
pp_json_path = self.output_dir / f"{filename_prefix}_pp_structure_raw.json"
try:
# Convert any non-serializable types
serializable_results = self._make_serializable(pp_structure_results)
with open(pp_json_path, 'w', encoding='utf-8') as f:
json.dump(serializable_results, f, ensure_ascii=False, indent=2)
saved_files['pp_structure'] = pp_json_path
logger.info(f"Saved PP-StructureV3 raw results to {pp_json_path}")
except Exception as e:
logger.error(f"Failed to save PP-StructureV3 results: {e}")
# Save raw OCR regions
ocr_json_path = self.output_dir / f"{filename_prefix}_raw_ocr_regions.json"
try:
serializable_ocr = self._make_serializable(raw_ocr_regions)
with open(ocr_json_path, 'w', encoding='utf-8') as f:
json.dump(serializable_ocr, f, ensure_ascii=False, indent=2)
saved_files['raw_ocr'] = ocr_json_path
logger.info(f"Saved raw OCR regions to {ocr_json_path}")
except Exception as e:
logger.error(f"Failed to save raw OCR regions: {e}")
# Save summary comparison
summary_path = self.output_dir / f"{filename_prefix}_debug_summary.json"
try:
summary = self._generate_summary(pp_structure_results, raw_ocr_regions)
with open(summary_path, 'w', encoding='utf-8') as f:
json.dump(summary, f, ensure_ascii=False, indent=2)
saved_files['summary'] = summary_path
logger.info(f"Saved debug summary to {summary_path}")
except Exception as e:
logger.error(f"Failed to save debug summary: {e}")
return saved_files
def generate_visualization(
self,
image_path: Path,
pp_structure_elements: List[Dict[str, Any]],
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None,
filename_prefix: str = "debug",
show_labels: bool = True,
show_raw_ocr: bool = True
) -> Optional[Path]:
"""
Generate visualization image showing detected elements.
Args:
image_path: Path to original image
pp_structure_elements: PP-StructureV3 detected elements
raw_ocr_regions: Optional raw OCR regions to overlay
filename_prefix: Prefix for output file
show_labels: Whether to show element type labels
show_raw_ocr: Whether to show raw OCR regions
Returns:
Path to generated visualization image
"""
try:
# Load original image
img = Image.open(image_path)
if img.mode != 'RGB':
img = img.convert('RGB')
# Create copy for drawing
viz_img = img.copy()
draw = ImageDraw.Draw(viz_img)
# Try to load a font, fall back to default
try:
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14)
small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 10)
except (IOError, OSError):
try:
font = ImageFont.truetype("/home/egg/project/Tool_OCR/backend/fonts/NotoSansSC-Regular.ttf", 14)
small_font = ImageFont.truetype("/home/egg/project/Tool_OCR/backend/fonts/NotoSansSC-Regular.ttf", 10)
except (IOError, OSError):
font = ImageFont.load_default()
small_font = font
# Draw raw OCR regions first (so PP-Structure boxes are on top)
if show_raw_ocr and raw_ocr_regions:
for idx, region in enumerate(raw_ocr_regions):
bbox = self._normalize_bbox(region.get('bbox', []))
if bbox:
# Draw with dashed style simulation (draw thin lines)
x0, y0, x1, y1 = bbox
draw.rectangle([x0, y0, x1, y1], outline=RAW_OCR_COLOR, width=1)
# Add small label
if show_labels:
confidence = region.get('confidence', 0)
label = f"OCR:{confidence:.2f}"
draw.text((x0, y0 - 12), label, fill=RAW_OCR_COLOR, font=small_font)
# Draw PP-StructureV3 elements
for idx, elem in enumerate(pp_structure_elements):
elem_type = elem.get('type', 'default')
if hasattr(elem_type, 'value'):
elem_type = elem_type.value
elem_type = str(elem_type).lower()
color = ELEMENT_COLORS.get(elem_type, ELEMENT_COLORS['default'])
bbox = self._normalize_bbox(elem.get('bbox', []))
if bbox:
x0, y0, x1, y1 = bbox
# Draw thicker rectangle for PP-Structure elements
draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
# Add label
if show_labels:
label = f"{idx}:{elem_type}"
# Draw label background
text_bbox = draw.textbbox((x0, y0 - 18), label, font=font)
draw.rectangle(text_bbox, fill=(255, 255, 255, 200))
draw.text((x0, y0 - 18), label, fill=color, font=font)
# Add legend
self._draw_legend(draw, img.width, font)
# Add image info
info_text = f"PP-Structure: {len(pp_structure_elements)} elements"
if raw_ocr_regions:
info_text += f" | Raw OCR: {len(raw_ocr_regions)} regions"
info_text += f" | Size: {img.width}x{img.height}"
draw.text((10, img.height - 25), info_text, fill=(0, 0, 0), font=font)
# Save visualization
viz_path = self.output_dir / f"{filename_prefix}_pp_structure_viz.png"
viz_img.save(viz_path, 'PNG')
logger.info(f"Saved visualization to {viz_path}")
return viz_path
except Exception as e:
logger.error(f"Failed to generate visualization: {e}")
import traceback
traceback.print_exc()
return None
def _draw_legend(self, draw: ImageDraw, img_width: int, font: ImageFont):
"""Draw a legend showing element type colors."""
legend_x = img_width - 150
legend_y = 10
# Draw legend background
draw.rectangle(
[legend_x - 5, legend_y - 5, img_width - 5, legend_y + len(ELEMENT_COLORS) * 18 + 25],
fill=(255, 255, 255, 230),
outline=(0, 0, 0)
)
draw.text((legend_x, legend_y), "Legend:", fill=(0, 0, 0), font=font)
legend_y += 20
for elem_type, color in ELEMENT_COLORS.items():
if elem_type == 'default':
continue
draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=color)
draw.text((legend_x + 18, legend_y), elem_type, fill=(0, 0, 0), font=font)
legend_y += 18
# Add raw OCR legend entry
draw.rectangle([legend_x, legend_y + 2, legend_x + 12, legend_y + 14], fill=RAW_OCR_COLOR)
draw.text((legend_x + 18, legend_y), "raw_ocr", fill=(0, 0, 0), font=font)
def _normalize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
"""Normalize bbox to (x0, y0, x1, y1) format."""
if not bbox:
return None
try:
# Handle nested list format [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
if isinstance(bbox, (list, tuple)) and len(bbox) >= 1:
if isinstance(bbox[0], (list, tuple)):
xs = [pt[0] for pt in bbox if len(pt) >= 2]
ys = [pt[1] for pt in bbox if len(pt) >= 2]
if xs and ys:
return (min(xs), min(ys), max(xs), max(ys))
# Handle flat list [x0, y0, x1, y1]
if isinstance(bbox, (list, tuple)) and len(bbox) == 4:
return (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]))
# Handle flat polygon [x1, y1, x2, y2, ...]
if isinstance(bbox, (list, tuple)) and len(bbox) >= 8:
xs = [bbox[i] for i in range(0, len(bbox), 2)]
ys = [bbox[i] for i in range(1, len(bbox), 2)]
return (min(xs), min(ys), max(xs), max(ys))
# Handle dict format
if isinstance(bbox, dict):
return (
float(bbox.get('x0', bbox.get('x_min', 0))),
float(bbox.get('y0', bbox.get('y_min', 0))),
float(bbox.get('x1', bbox.get('x_max', 0))),
float(bbox.get('y1', bbox.get('y_max', 0)))
)
except (TypeError, ValueError, IndexError) as e:
logger.warning(f"Failed to normalize bbox {bbox}: {e}")
return None
def _generate_summary(
self,
pp_structure_results: Dict[str, Any],
raw_ocr_regions: List[Dict[str, Any]]
) -> Dict[str, Any]:
"""Generate summary comparing PP-Structure and raw OCR."""
pp_elements = pp_structure_results.get('elements', [])
# Count element types
type_counts = {}
for elem in pp_elements:
elem_type = elem.get('type', 'unknown')
if hasattr(elem_type, 'value'):
elem_type = elem_type.value
type_counts[str(elem_type)] = type_counts.get(str(elem_type), 0) + 1
# Calculate bounding box coverage
pp_bbox_area = 0
ocr_bbox_area = 0
for elem in pp_elements:
bbox = self._normalize_bbox(elem.get('bbox'))
if bbox:
pp_bbox_area += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
for region in raw_ocr_regions:
bbox = self._normalize_bbox(region.get('bbox'))
if bbox:
ocr_bbox_area += (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
return {
'timestamp': datetime.now().isoformat(),
'pp_structure': {
'total_elements': len(pp_elements),
'element_types': type_counts,
'total_bbox_area': pp_bbox_area,
'has_parsing_res_list': pp_structure_results.get('has_parsing_res_list', False)
},
'raw_ocr': {
'total_regions': len(raw_ocr_regions),
'total_bbox_area': ocr_bbox_area,
'avg_confidence': sum(r.get('confidence', 0) for r in raw_ocr_regions) / len(raw_ocr_regions) if raw_ocr_regions else 0
},
'comparison': {
'element_count_ratio': len(pp_elements) / len(raw_ocr_regions) if raw_ocr_regions else 0,
'area_ratio': pp_bbox_area / ocr_bbox_area if ocr_bbox_area > 0 else 0,
'potential_gap': len(raw_ocr_regions) - len(pp_elements) if raw_ocr_regions else 0
}
}
def _make_serializable(self, obj: Any) -> Any:
"""Convert object to JSON-serializable format."""
if obj is None:
return None
if isinstance(obj, (str, int, float, bool)):
return obj
if isinstance(obj, (list, tuple)):
return [self._make_serializable(item) for item in obj]
if isinstance(obj, dict):
return {str(k): self._make_serializable(v) for k, v in obj.items()}
if hasattr(obj, 'value'):
return obj.value
if hasattr(obj, '__dict__'):
return self._make_serializable(obj.__dict__)
if hasattr(obj, 'tolist'): # numpy array
return obj.tolist()
return str(obj)
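# Usage sketch (illustrative only): assuming the OCR pipeline exposes PP-StructureV3
# results as a dict with an 'elements' list and raw OCR regions as dicts carrying
# 'bbox'/'confidence' keys, the debug service could be wired in roughly like this:
#
#     debug = PPStructureDebug(output_dir=Path("debug_output/task_123"))
#     saved = debug.save_raw_results(pp_results, raw_regions, filename_prefix="page_1")
#     viz_path = debug.generate_visualization(
#         image_path=Path("pages/page_1.png"),
#         pp_structure_elements=pp_results.get("elements", []),
#         raw_ocr_regions=raw_regions,
#         filename_prefix="page_1",
#     )
# The paths and variable names above are placeholders, not part of the service itself.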

View File

@@ -0,0 +1,332 @@
"""
API integration tests for Layout Model Selection feature.
This replaces the deprecated PP-StructureV3 parameter tests.
"""
import pytest
from fastapi.testclient import TestClient
from unittest.mock import patch
from app.main import app
from app.core.database import get_db
from app.models.user import User
from app.models.task import Task, TaskStatus, TaskFile
@pytest.fixture
def client():
"""Create test client"""
return TestClient(app)
@pytest.fixture
def test_user(db_session):
"""Create test user"""
user = User(
email="test@example.com",
hashed_password="test_hash",
is_active=True
)
db_session.add(user)
db_session.commit()
db_session.refresh(user)
return user
@pytest.fixture
def test_task(db_session, test_user):
"""Create test task with uploaded file"""
task = Task(
user_id=test_user.id,
task_id="test-task-123",
filename="test.pdf",
status=TaskStatus.PENDING
)
db_session.add(task)
db_session.commit()
db_session.refresh(task)
# Add task file
task_file = TaskFile(
task_id=task.id,
original_name="test.pdf",
stored_path="/tmp/test.pdf",
file_size=1024,
mime_type="application/pdf"
)
db_session.add(task_file)
db_session.commit()
return task
class TestLayoutModelSchema:
"""Test LayoutModel and ProcessingOptions schema validation"""
def test_processing_options_accepts_layout_model(self):
"""Verify ProcessingOptions schema accepts layout_model parameter"""
from app.schemas.task import ProcessingOptions, LayoutModelEnum
options = ProcessingOptions(
use_dual_track=True,
language='ch',
layout_model=LayoutModelEnum.CHINESE
)
assert options.layout_model == LayoutModelEnum.CHINESE
def test_layout_model_enum_values(self):
"""Verify all layout model enum values are valid"""
from app.schemas.task import LayoutModelEnum
assert LayoutModelEnum.CHINESE.value == "chinese"
assert LayoutModelEnum.DEFAULT.value == "default"
assert LayoutModelEnum.CDLA.value == "cdla"
def test_default_layout_model_is_chinese(self):
"""Verify default layout model is 'chinese' for best Chinese document support"""
from app.schemas.task import ProcessingOptions
options = ProcessingOptions()
# Default should be chinese
assert options.layout_model.value == "chinese"
def test_layout_model_string_values_accepted(self):
"""Verify string values are accepted for layout_model"""
from app.schemas.task import ProcessingOptions
# String values should be converted to enum
options = ProcessingOptions(layout_model="default")
assert options.layout_model.value == "default"
options = ProcessingOptions(layout_model="cdla")
assert options.layout_model.value == "cdla"
def test_invalid_layout_model_rejected(self):
"""Verify invalid layout model values are rejected"""
from app.schemas.task import ProcessingOptions
from pydantic import ValidationError
with pytest.raises(ValidationError):
ProcessingOptions(layout_model="invalid_model")
class TestStartTaskEndpoint:
"""Test /tasks/{task_id}/start endpoint with layout_model parameter"""
@patch('app.routers.tasks.process_task_ocr')
def test_start_task_with_layout_model(self, mock_process_ocr, client, test_task, db_session):
"""Verify layout_model is accepted and passed to OCR service"""
# Override get_db dependency
def override_get_db():
try:
yield db_session
finally:
pass
# Override auth dependency
def override_get_current_user():
return test_task.user
app.dependency_overrides[get_db] = override_get_db
from app.core.deps import get_current_user
app.dependency_overrides[get_current_user] = override_get_current_user
# Request body with layout_model
request_body = {
"use_dual_track": True,
"language": "ch",
"layout_model": "chinese"
}
# Make API call
response = client.post(
f"/api/v2/tasks/{test_task.task_id}/start",
json=request_body
)
# Verify response
assert response.status_code == 200
data = response.json()
assert data['status'] == 'processing'
# Verify background task was called with layout_model
mock_process_ocr.assert_called_once()
call_kwargs = mock_process_ocr.call_args[1]
assert 'layout_model' in call_kwargs
assert call_kwargs['layout_model'] == 'chinese'
# Clean up
app.dependency_overrides.clear()
@patch('app.routers.tasks.process_task_ocr')
def test_start_task_with_default_model(self, mock_process_ocr, client, test_task, db_session):
"""Verify 'default' layout model is accepted"""
def override_get_db():
try:
yield db_session
finally:
pass
def override_get_current_user():
return test_task.user
app.dependency_overrides[get_db] = override_get_db
from app.core.deps import get_current_user
app.dependency_overrides[get_current_user] = override_get_current_user
request_body = {
"use_dual_track": True,
"layout_model": "default"
}
response = client.post(
f"/api/v2/tasks/{test_task.task_id}/start",
json=request_body
)
assert response.status_code == 200
mock_process_ocr.assert_called_once()
call_kwargs = mock_process_ocr.call_args[1]
assert call_kwargs['layout_model'] == 'default'
app.dependency_overrides.clear()
@patch('app.routers.tasks.process_task_ocr')
def test_start_task_with_cdla_model(self, mock_process_ocr, client, test_task, db_session):
"""Verify 'cdla' layout model is accepted"""
def override_get_db():
try:
yield db_session
finally:
pass
def override_get_current_user():
return test_task.user
app.dependency_overrides[get_db] = override_get_db
from app.core.deps import get_current_user
app.dependency_overrides[get_current_user] = override_get_current_user
request_body = {
"use_dual_track": True,
"layout_model": "cdla"
}
response = client.post(
f"/api/v2/tasks/{test_task.task_id}/start",
json=request_body
)
assert response.status_code == 200
mock_process_ocr.assert_called_once()
call_kwargs = mock_process_ocr.call_args[1]
assert call_kwargs['layout_model'] == 'cdla'
app.dependency_overrides.clear()
@patch('app.routers.tasks.process_task_ocr')
def test_start_task_without_layout_model_uses_default(self, mock_process_ocr, client, test_task, db_session):
"""Verify task can start without layout_model (uses 'chinese' as default)"""
def override_get_db():
try:
yield db_session
finally:
pass
def override_get_current_user():
return test_task.user
app.dependency_overrides[get_db] = override_get_db
from app.core.deps import get_current_user
app.dependency_overrides[get_current_user] = override_get_current_user
# Request without layout_model
request_body = {
"use_dual_track": True,
"language": "ch"
}
response = client.post(
f"/api/v2/tasks/{test_task.task_id}/start",
json=request_body
)
assert response.status_code == 200
mock_process_ocr.assert_called_once()
call_kwargs = mock_process_ocr.call_args[1]
# layout_model should default to 'chinese'
assert call_kwargs['layout_model'] == 'chinese'
app.dependency_overrides.clear()
def test_start_task_with_invalid_layout_model(self, client, test_task, db_session):
"""Verify invalid layout_model returns 422 validation error"""
def override_get_db():
try:
yield db_session
finally:
pass
def override_get_current_user():
return test_task.user
app.dependency_overrides[get_db] = override_get_db
from app.core.deps import get_current_user
app.dependency_overrides[get_current_user] = override_get_current_user
# Request with invalid layout_model
request_body = {
"use_dual_track": True,
"layout_model": "invalid_model"
}
response = client.post(
f"/api/v2/tasks/{test_task.task_id}/start",
json=request_body
)
# Should return validation error
assert response.status_code == 422
app.dependency_overrides.clear()
class TestOpenAPISchema:
"""Test OpenAPI schema includes layout_model parameter"""
def test_openapi_schema_includes_layout_model(self, client):
"""Verify OpenAPI schema documents layout_model parameter"""
response = client.get("/openapi.json")
assert response.status_code == 200
schema = response.json()
# Check LayoutModelEnum schema exists
assert 'LayoutModelEnum' in schema['components']['schemas']
model_schema = schema['components']['schemas']['LayoutModelEnum']
# Verify all 3 model options are documented
assert 'chinese' in model_schema['enum']
assert 'default' in model_schema['enum']
assert 'cdla' in model_schema['enum']
# Verify ProcessingOptions includes layout_model
options_schema = schema['components']['schemas']['ProcessingOptions']
assert 'layout_model' in options_schema['properties']
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,244 @@
"""
Unit tests for Layout Model Selection feature in OCR Service.
This replaces the deprecated PP-StructureV3 parameter tests.
"""
import pytest
import sys
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
# Mock all external dependencies before importing OCRService
sys.modules['paddleocr'] = MagicMock()
sys.modules['PIL'] = MagicMock()
sys.modules['pdf2image'] = MagicMock()
# Mock paddle with version attribute
paddle_mock = MagicMock()
paddle_mock.__version__ = '2.5.0'
paddle_mock.device.get_device.return_value = 'cpu'
paddle_mock.device.get_available_device.return_value = 'cpu'
sys.modules['paddle'] = paddle_mock
# Mock torch
torch_mock = MagicMock()
torch_mock.cuda.is_available.return_value = False
sys.modules['torch'] = torch_mock
from app.services.ocr_service import OCRService, LAYOUT_MODEL_MAPPING, _USE_PUBLAYNET_DEFAULT
from app.core.config import settings
class TestLayoutModelMapping:
"""Test layout model name mapping"""
def test_layout_model_mapping_exists(self):
"""Verify LAYOUT_MODEL_MAPPING constant exists and has correct values"""
assert 'chinese' in LAYOUT_MODEL_MAPPING
assert 'default' in LAYOUT_MODEL_MAPPING
assert 'cdla' in LAYOUT_MODEL_MAPPING
def test_chinese_model_maps_to_pp_doclayout(self):
"""Verify 'chinese' maps to PP-DocLayout-S"""
assert LAYOUT_MODEL_MAPPING['chinese'] == 'PP-DocLayout-S'
def test_default_model_maps_to_publaynet_sentinel(self):
"""Verify 'default' maps to sentinel value for PubLayNet default"""
# The 'default' model uses a sentinel value that signals "use PubLayNet default (no custom model)"
assert LAYOUT_MODEL_MAPPING['default'] == _USE_PUBLAYNET_DEFAULT
def test_cdla_model_maps_to_picodet(self):
"""Verify 'cdla' maps to picodet_lcnet_x1_0_fgd_layout_cdla"""
assert LAYOUT_MODEL_MAPPING['cdla'] == 'picodet_lcnet_x1_0_fgd_layout_cdla'
class TestLayoutModelEngine:
"""Test engine creation with different layout models"""
def test_chinese_model_creates_engine_with_pp_doclayout(self):
"""Verify 'chinese' layout model uses PP-DocLayout-S"""
ocr_service = OCRService()
with patch.object(ocr_service, 'structure_engine', None):
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
engine = ocr_service._ensure_structure_engine(layout_model='chinese')
mock_ppstructure.assert_called_once()
call_kwargs = mock_ppstructure.call_args[1]
assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'
def test_default_model_creates_engine_without_model_name(self):
"""Verify 'default' layout model does not specify model name (uses default)"""
ocr_service = OCRService()
with patch.object(ocr_service, 'structure_engine', None):
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
engine = ocr_service._ensure_structure_engine(layout_model='default')
mock_ppstructure.assert_called_once()
call_kwargs = mock_ppstructure.call_args[1]
# For 'default', layout_detection_model_name should be None or not set
assert call_kwargs.get('layout_detection_model_name') is None
def test_cdla_model_creates_engine_with_picodet(self):
"""Verify 'cdla' layout model uses picodet_lcnet_x1_0_fgd_layout_cdla"""
ocr_service = OCRService()
with patch.object(ocr_service, 'structure_engine', None):
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
engine = ocr_service._ensure_structure_engine(layout_model='cdla')
mock_ppstructure.assert_called_once()
call_kwargs = mock_ppstructure.call_args[1]
assert call_kwargs.get('layout_detection_model_name') == 'picodet_lcnet_x1_0_fgd_layout_cdla'
def test_none_layout_model_uses_chinese_default(self):
"""Verify None layout_model defaults to 'chinese' model"""
ocr_service = OCRService()
with patch.object(ocr_service, 'structure_engine', None):
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
# Pass None for layout_model
engine = ocr_service._ensure_structure_engine(layout_model=None)
mock_ppstructure.assert_called_once()
call_kwargs = mock_ppstructure.call_args[1]
# Should use 'chinese' model as default
assert call_kwargs.get('layout_detection_model_name') == 'PP-DocLayout-S'
class TestLayoutModelCaching:
"""Test engine caching behavior with layout models"""
def test_same_layout_model_uses_cached_engine(self):
"""Verify same layout model reuses cached engine"""
ocr_service = OCRService()
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
# First call with 'chinese'
engine1 = ocr_service._ensure_structure_engine(layout_model='chinese')
# Second call with same model should use cache
engine2 = ocr_service._ensure_structure_engine(layout_model='chinese')
# Verify only one engine was created
assert mock_ppstructure.call_count == 1
assert engine1 is engine2
def test_different_layout_model_creates_new_engine(self):
"""Verify different layout model creates new engine"""
ocr_service = OCRService()
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine1 = Mock()
mock_engine2 = Mock()
mock_ppstructure.side_effect = [mock_engine1, mock_engine2]
# First call with 'chinese'
engine1 = ocr_service._ensure_structure_engine(layout_model='chinese')
# Second call with 'cdla' should create new engine
engine2 = ocr_service._ensure_structure_engine(layout_model='cdla')
# Verify two engines were created
assert mock_ppstructure.call_count == 2
assert engine1 is not engine2
class TestLayoutModelFlow:
"""Test layout model parameter flow through processing pipeline"""
def test_layout_model_passed_to_engine_creation(self):
"""Verify layout_model is passed through to _ensure_structure_engine"""
ocr_service = OCRService()
# Test that _ensure_structure_engine accepts layout_model parameter
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
# Call with specific layout_model
engine = ocr_service._ensure_structure_engine(layout_model='cdla')
# Verify correct model was requested
mock_ppstructure.assert_called_once()
call_kwargs = mock_ppstructure.call_args[1]
assert call_kwargs.get('layout_detection_model_name') == 'picodet_lcnet_x1_0_fgd_layout_cdla'
def test_layout_model_default_behavior(self):
"""Verify default layout model behavior when None is passed"""
ocr_service = OCRService()
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
# Call without layout_model (None)
engine = ocr_service._ensure_structure_engine(layout_model=None)
# Should use config default (PP-DocLayout-S)
mock_ppstructure.assert_called_once()
call_kwargs = mock_ppstructure.call_args[1]
assert call_kwargs.get('layout_detection_model_name') == settings.layout_detection_model_name
def test_layout_model_unknown_value_falls_back(self):
"""Verify unknown layout model falls back to config default"""
ocr_service = OCRService()
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
# Call with unknown layout_model
engine = ocr_service._ensure_structure_engine(layout_model='unknown_model')
# Should use config default
mock_ppstructure.assert_called_once()
call_kwargs = mock_ppstructure.call_args[1]
assert call_kwargs.get('layout_detection_model_name') == settings.layout_detection_model_name
class TestLayoutModelLogging:
"""Test layout model logging"""
def test_layout_model_is_logged(self):
"""Verify layout model selection is logged"""
ocr_service = OCRService()
with patch('app.services.ocr_service.PPStructureV3') as mock_ppstructure:
with patch('app.services.ocr_service.logger') as mock_logger:
mock_engine = Mock()
mock_ppstructure.return_value = mock_engine
# Call with specific layout_model
ocr_service._ensure_structure_engine(layout_model='cdla')
# Verify logging occurred
assert mock_logger.info.call_count >= 1
# Check that model name was logged
log_calls = [str(call) for call in mock_logger.info.call_args_list]
assert any('cdla' in str(call).lower() or 'layout' in str(call).lower() for call in log_calls)
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,503 @@
"""
Tests for Gap Filling Service
Tests the detection and filling of gaps in PP-StructureV3 output
using raw OCR text regions.
"""
import pytest
from typing import List, Dict, Any
from app.services.gap_filling_service import GapFillingService, TextRegion, SKIP_ELEMENT_TYPES
from app.models.unified_document import DocumentElement, BoundingBox, ElementType, Dimensions
class TestGapFillingService:
"""Tests for GapFillingService class."""
@pytest.fixture
def service(self) -> GapFillingService:
"""Create a GapFillingService instance with default settings."""
return GapFillingService(
coverage_threshold=0.7,
iou_threshold=0.15,
confidence_threshold=0.3,
dedup_iou_threshold=0.5,
enabled=True
)
@pytest.fixture
def disabled_service(self) -> GapFillingService:
"""Create a disabled GapFillingService instance."""
return GapFillingService(enabled=False)
@pytest.fixture
def sample_raw_regions(self) -> List[TextRegion]:
"""Create sample raw OCR text regions."""
return [
TextRegion(text="Header text", bbox=[100, 50, 300, 80], confidence=0.95, page=1),
TextRegion(text="Title of document", bbox=[100, 100, 500, 150], confidence=0.92, page=1),
TextRegion(text="First paragraph", bbox=[100, 200, 500, 250], confidence=0.90, page=1),
TextRegion(text="Second paragraph", bbox=[100, 300, 500, 350], confidence=0.88, page=1),
TextRegion(text="Footer note", bbox=[100, 900, 300, 930], confidence=0.85, page=1),
# Low confidence region (should be filtered)
TextRegion(text="Noise", bbox=[50, 50, 80, 80], confidence=0.1, page=1),
]
@pytest.fixture
def sample_pp_elements(self) -> List[DocumentElement]:
"""Create sample PP-StructureV3 elements that cover only some regions."""
return [
DocumentElement(
element_id="pp_1",
type=ElementType.TITLE,
content="Title of document",
bbox=BoundingBox(x0=100, y0=100, x1=500, y1=150),
confidence=0.95
),
DocumentElement(
element_id="pp_2",
type=ElementType.TEXT,
content="First paragraph",
bbox=BoundingBox(x0=100, y0=200, x1=500, y1=250),
confidence=0.90
),
# Note: Header, Second paragraph, and Footer are NOT covered
]
def test_service_initialization(self, service: GapFillingService):
"""Test service initializes with correct parameters."""
assert service.enabled is True
assert service.coverage_threshold == 0.7
assert service.iou_threshold == 0.15
assert service.confidence_threshold == 0.3
assert service.dedup_iou_threshold == 0.5
def test_disabled_service(self, disabled_service: GapFillingService):
"""Test disabled service does not activate."""
regions = [TextRegion(text="Test", bbox=[0, 0, 100, 100], confidence=0.9, page=1)]
elements = []
should_activate, coverage = disabled_service.should_activate(regions, elements)
assert should_activate is False
assert coverage == 1.0
def test_should_activate_low_coverage(
self,
service: GapFillingService,
sample_raw_regions: List[TextRegion],
sample_pp_elements: List[DocumentElement]
):
"""Test activation when coverage is below threshold."""
# Filter out low confidence regions
valid_regions = [r for r in sample_raw_regions if r.confidence >= 0.3]
should_activate, coverage = service.should_activate(valid_regions, sample_pp_elements)
# Only 2 out of 5 valid regions are covered (Title, First paragraph)
assert should_activate is True
assert coverage < 0.7 # Below threshold
def test_should_not_activate_high_coverage(self, service: GapFillingService):
"""Test no activation when coverage is above threshold."""
# All regions covered
regions = [
TextRegion(text="Text 1", bbox=[100, 100, 200, 150], confidence=0.9, page=1),
TextRegion(text="Text 2", bbox=[100, 200, 200, 250], confidence=0.9, page=1),
]
elements = [
DocumentElement(
element_id="pp_1",
type=ElementType.TEXT,
content="Text 1",
bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200), # Covers first region
confidence=0.95
),
DocumentElement(
element_id="pp_2",
type=ElementType.TEXT,
content="Text 2",
bbox=BoundingBox(x0=50, y0=180, x1=250, y1=300), # Covers second region
confidence=0.95
),
]
should_activate, coverage = service.should_activate(regions, elements)
assert should_activate is False
assert coverage >= 0.7
def test_find_uncovered_regions(
self,
service: GapFillingService,
sample_raw_regions: List[TextRegion],
sample_pp_elements: List[DocumentElement]
):
"""Test finding uncovered regions."""
uncovered = service.find_uncovered_regions(sample_raw_regions, sample_pp_elements)
# Should find Header, Second paragraph, Footer (not Title, First paragraph, or low-confidence Noise)
assert len(uncovered) == 3
uncovered_texts = [r.text for r in uncovered]
assert "Header text" in uncovered_texts
assert "Second paragraph" in uncovered_texts
assert "Footer note" in uncovered_texts
assert "Title of document" not in uncovered_texts # Covered
assert "First paragraph" not in uncovered_texts # Covered
assert "Noise" not in uncovered_texts # Low confidence
def test_coverage_by_center_point(self, service: GapFillingService):
"""Test coverage detection via center point."""
region = TextRegion(text="Test", bbox=[150, 150, 250, 200], confidence=0.9, page=1)
element = DocumentElement(
element_id="pp_1",
type=ElementType.TEXT,
content="Container",
bbox=BoundingBox(x0=100, y0=100, x1=300, y1=250), # Contains region's center
confidence=0.95
)
is_covered = service._is_region_covered(region, [element])
assert is_covered is True
def test_coverage_by_iou(self, service: GapFillingService):
"""Test coverage detection via IoU threshold."""
region = TextRegion(text="Test", bbox=[100, 100, 200, 150], confidence=0.9, page=1)
element = DocumentElement(
element_id="pp_1",
type=ElementType.TEXT,
content="Overlap",
bbox=BoundingBox(x0=150, y0=100, x1=250, y1=150), # Partial overlap
confidence=0.95
)
# Calculate expected IoU
# Intersection: (150-200) x (100-150) = 50 x 50 = 2500
# Union: 100x50 + 100x50 - 2500 = 7500
# IoU = 2500/7500 = 0.33 > 0.15 threshold
is_covered = service._is_region_covered(region, [element])
assert is_covered is True
def test_deduplication(
self,
service: GapFillingService,
sample_pp_elements: List[DocumentElement]
):
"""Test deduplication removes high-overlap regions."""
uncovered = [
# High overlap with pp_2 (First paragraph)
TextRegion(text="First paragraph variant", bbox=[100, 200, 500, 250], confidence=0.9, page=1),
# No overlap
TextRegion(text="Unique region", bbox=[100, 500, 300, 550], confidence=0.9, page=1),
]
deduplicated = service.deduplicate_regions(uncovered, sample_pp_elements)
assert len(deduplicated) == 1
assert deduplicated[0].text == "Unique region"
def test_convert_regions_to_elements(self, service: GapFillingService):
"""Test conversion of TextRegions to DocumentElements."""
regions = [
TextRegion(text="Test text 1", bbox=[100, 100, 200, 150], confidence=0.85, page=1),
TextRegion(text="Test text 2", bbox=[100, 200, 200, 250], confidence=0.90, page=1),
]
elements = service.convert_regions_to_elements(regions, page_number=1, start_element_id=0)
assert len(elements) == 2
assert elements[0].element_id == "gap_fill_1_0"
assert elements[0].type == ElementType.TEXT
assert elements[0].content == "Test text 1"
assert elements[0].confidence == 0.85
assert elements[0].metadata.get('source') == 'gap_filling'
assert elements[1].element_id == "gap_fill_1_1"
assert elements[1].content == "Test text 2"
def test_recalculate_reading_order(self, service: GapFillingService):
"""Test reading order recalculation."""
elements = [
DocumentElement(
element_id="e3",
type=ElementType.TEXT,
content="Bottom",
bbox=BoundingBox(x0=100, y0=300, x1=200, y1=350),
confidence=0.9
),
DocumentElement(
element_id="e1",
type=ElementType.TEXT,
content="Top",
bbox=BoundingBox(x0=100, y0=100, x1=200, y1=150),
confidence=0.9
),
DocumentElement(
element_id="e2",
type=ElementType.TEXT,
content="Middle",
bbox=BoundingBox(x0=100, y0=200, x1=200, y1=250),
confidence=0.9
),
]
reading_order = service.recalculate_reading_order(elements)
# Should be sorted by y0: Top (100), Middle (200), Bottom (300)
assert reading_order == [1, 2, 0] # Indices of elements in reading order
def test_fill_gaps_integration(
self,
service: GapFillingService,
):
"""Integration test for fill_gaps method."""
# Raw OCR regions (dict format as received from OCR service)
raw_regions = [
{'text': 'Header', 'bbox': [100, 50, 300, 80], 'confidence': 0.95, 'page': 1},
{'text': 'Title', 'bbox': [100, 100, 500, 150], 'confidence': 0.92, 'page': 1},
{'text': 'Paragraph 1', 'bbox': [100, 200, 500, 250], 'confidence': 0.90, 'page': 1},
{'text': 'Paragraph 2', 'bbox': [100, 300, 500, 350], 'confidence': 0.88, 'page': 1},
{'text': 'Paragraph 3', 'bbox': [100, 400, 500, 450], 'confidence': 0.86, 'page': 1},
{'text': 'Footer', 'bbox': [100, 900, 300, 930], 'confidence': 0.85, 'page': 1},
]
# PP-StructureV3 only detected Title (missing 5 out of 6 regions = 16.7% coverage)
pp_elements = [
DocumentElement(
element_id="pp_1",
type=ElementType.TITLE,
content="Title",
bbox=BoundingBox(x0=100, y0=100, x1=500, y1=150),
confidence=0.95
),
]
supplemented, stats = service.fill_gaps(
raw_ocr_regions=raw_regions,
pp_structure_elements=pp_elements,
page_number=1
)
# Should have activated and supplemented missing regions
assert stats['activated'] is True
assert stats['coverage_ratio'] < 0.7
assert len(supplemented) == 5 # Header, Paragraph 1, 2, 3, Footer
def test_fill_gaps_no_activation_when_coverage_high(self, service: GapFillingService):
"""Test fill_gaps does not activate when coverage is high."""
raw_regions = [
{'text': 'Text 1', 'bbox': [100, 100, 200, 150], 'confidence': 0.9, 'page': 1},
]
pp_elements = [
DocumentElement(
element_id="pp_1",
type=ElementType.TEXT,
content="Text 1",
bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200), # Fully covers
confidence=0.95
),
]
supplemented, stats = service.fill_gaps(
raw_ocr_regions=raw_regions,
pp_structure_elements=pp_elements,
page_number=1
)
assert stats['activated'] is False
assert len(supplemented) == 0
def test_skip_element_types_not_supplemented(self, service: GapFillingService):
"""Test that TABLE/IMAGE/etc. elements are not supplemented over."""
raw_regions = [
{'text': 'Table cell text', 'bbox': [100, 100, 200, 150], 'confidence': 0.9, 'page': 1},
]
# PP-StructureV3 has a table covering this region
pp_elements = [
DocumentElement(
element_id="pp_1",
type=ElementType.TABLE,
content="<table>...</table>",
bbox=BoundingBox(x0=50, y0=50, x1=250, y1=200),
confidence=0.95
),
]
# The region should be considered covered by the table
supplemented, stats = service.fill_gaps(
raw_ocr_regions=raw_regions,
pp_structure_elements=pp_elements,
page_number=1
)
# Should not supplement because the table covers it
assert len(supplemented) == 0
def test_coordinate_scaling(self, service: GapFillingService):
"""Test coordinate alignment with different dimensions."""
# OCR was done at 2000x3000, PP-Structure at 1000x1500
ocr_dimensions = {'width': 2000, 'height': 3000}
pp_dimensions = Dimensions(width=1000, height=1500)
raw_regions = [
# At OCR scale: (200, 300) to (400, 450) -> at PP scale: (100, 150) to (200, 225)
{'text': 'Scaled text', 'bbox': [200, 300, 400, 450], 'confidence': 0.9, 'page': 1},
]
pp_elements = [
DocumentElement(
element_id="pp_1",
type=ElementType.TEXT,
content="Scaled text",
bbox=BoundingBox(x0=100, y0=150, x1=200, y1=225), # Should cover after scaling
confidence=0.95
),
]
supplemented, stats = service.fill_gaps(
raw_ocr_regions=raw_regions,
pp_structure_elements=pp_elements,
page_number=1,
ocr_dimensions=ocr_dimensions,
pp_dimensions=pp_dimensions
)
# After scaling, the region should be covered
assert stats['coverage_ratio'] >= 0.7 or len(supplemented) == 0
def test_iou_calculation(self, service: GapFillingService):
"""Test IoU calculation accuracy."""
# Two identical boxes
bbox1 = (0, 0, 100, 100)
bbox2 = (0, 0, 100, 100)
assert service._calculate_iou(bbox1, bbox2) == 1.0
# No overlap
bbox1 = (0, 0, 100, 100)
bbox2 = (200, 200, 300, 300)
assert service._calculate_iou(bbox1, bbox2) == 0.0
# 50% overlap
bbox1 = (0, 0, 100, 100)
bbox2 = (50, 0, 150, 100) # Shifted right by 50
# Intersection: 50x100 = 5000
# Union: 10000 + 10000 - 5000 = 15000
# IoU = 5000/15000 = 0.333...
iou = service._calculate_iou(bbox1, bbox2)
assert abs(iou - 1/3) < 0.01
def test_point_in_bbox(self, service: GapFillingService):
"""Test point-in-bbox check."""
bbox = (100, 100, 200, 200)
# Inside
assert service._point_in_bbox(150, 150, bbox) is True
# On edge
assert service._point_in_bbox(100, 100, bbox) is True
assert service._point_in_bbox(200, 200, bbox) is True
# Outside
assert service._point_in_bbox(50, 150, bbox) is False
assert service._point_in_bbox(250, 150, bbox) is False
def test_merge_adjacent_regions(self, service: GapFillingService):
"""Test merging of adjacent text regions."""
regions = [
TextRegion(text="Hello", bbox=[100, 100, 150, 130], confidence=0.9, page=1),
TextRegion(text="World", bbox=[160, 100, 210, 130], confidence=0.85, page=1), # Adjacent
TextRegion(text="Far away", bbox=[100, 300, 200, 330], confidence=0.9, page=1), # Not adjacent
]
merged = service.merge_adjacent_regions(regions, max_horizontal_gap=20, max_vertical_gap=10)
assert len(merged) == 2
# First two should be merged
assert "Hello" in merged[0].text and "World" in merged[0].text
assert merged[1].text == "Far away"
class TestTextRegion:
"""Tests for TextRegion dataclass."""
def test_normalized_bbox_4_values(self):
"""Test bbox normalization with 4 values."""
region = TextRegion(text="Test", bbox=[100, 200, 300, 400], confidence=0.9, page=1)
assert region.normalized_bbox == (100, 200, 300, 400)
def test_normalized_bbox_polygon_flat(self):
"""Test bbox normalization with flat polygon format (8 values)."""
# Polygon: 4 points as flat list [x1, y1, x2, y2, x3, y3, x4, y4]
region = TextRegion(
text="Test",
bbox=[100, 200, 300, 200, 300, 400, 100, 400],
confidence=0.9,
page=1
)
assert region.normalized_bbox == (100, 200, 300, 400)
def test_normalized_bbox_polygon_nested(self):
"""Test bbox normalization with nested polygon format (PaddleOCR format)."""
# PaddleOCR format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
region = TextRegion(
text="Test",
bbox=[[100, 200], [300, 200], [300, 400], [100, 400]],
confidence=0.9,
page=1
)
assert region.normalized_bbox == (100, 200, 300, 400)
def test_normalized_bbox_numpy_polygon(self):
"""Test bbox normalization with numpy-like nested format."""
# Sometimes PaddleOCR returns numpy arrays converted to lists
region = TextRegion(
text="Test",
bbox=[[100.5, 200.5], [300.5, 200.5], [300.5, 400.5], [100.5, 400.5]],
confidence=0.9,
page=1
)
bbox = region.normalized_bbox
assert bbox[0] == 100.5
assert bbox[1] == 200.5
assert bbox[2] == 300.5
assert bbox[3] == 400.5
def test_center_calculation(self):
"""Test center point calculation."""
region = TextRegion(text="Test", bbox=[100, 200, 300, 400], confidence=0.9, page=1)
assert region.center == (200, 300)
def test_center_calculation_nested_bbox(self):
"""Test center point calculation with nested bbox format."""
region = TextRegion(
text="Test",
bbox=[[100, 200], [300, 200], [300, 400], [100, 400]],
confidence=0.9,
page=1
)
assert region.center == (200, 300)
class TestOCRToUnifiedConverterIntegration:
"""Integration tests for OCRToUnifiedConverter with gap filling."""
def test_converter_with_gap_filling_enabled(self):
"""Test converter initializes with gap filling enabled."""
from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
converter = OCRToUnifiedConverter(enable_gap_filling=True)
assert converter.gap_filling_service is not None
def test_converter_with_gap_filling_disabled(self):
"""Test converter initializes without gap filling."""
from app.services.ocr_to_unified_converter import OCRToUnifiedConverter
converter = OCRToUnifiedConverter(enable_gap_filling=False)
assert converter.gap_filling_service is None

View File

@@ -0,0 +1,31 @@
# Tool_OCR Commit History Review (2025-11-12 ~ 2025-11-26)
This report is compiled from the full 97 commits in `git log`, covering the development narrative, milestones, test/quality signals, and outstanding risks. Commit type breakdown: 35 `feat` / 37 `fix` / 9 `chore` / 5 `test` / 4 `docs` / 2 `refactor`, with development concentrated on 2025-11-18, 11-19, 11-20, and 11-24.
## Timeline and Milestones
- **Early foundations and frontend modernization (11-12~11-13)**: `21bc2f9`, `57cf912` moved the frontend to Tailwind v4 with a professional UI; `0f81d5e` single-container Dockerization; `d7e6473` WSL Ubuntu development environment.
- **GPU acceleration and compatibility (11-14)**: `6452797` proposal + `7536f43` GPU OCR implementation; `d80d60f`/`3694411`/`80c091b` fixed the Paddle 3.x API and install sources; `b048f2d` temporarily disabled chart recognition to avoid an API gap.
- **External Auth V2 and admin console (11-14~11-16)**: `28e419f`~`fd98018` delivered external authentication V2, table prefixes, and removal of the V1 architecture; `8f94191` added admin/audit/token checks; `90fca50`/`6bb5b76` brought the test suite to 18/18 passing.
- **V2 UI integration and first layout-preserving PDF (11-16~11-18)**: frontend and backend fully switched to the V2 API (from `ad5c8be` onward); `fa1abcd` added layout-preserving PDF plus repeated coordinate/overlap fixes (`d33f605`~`0edc56b`); logging strengthened (`d99d37d`).
- **Dual-track processing architecture (11-18~11-20)**: `2d50c12` + `82139c8` introduced the OCR/Direct dual track and UnifiedDocument; `a3a6fbe`/`ab89a40`/`ecdce96` completed conversion, JSON export, and PDF support; `1d0b638` backend API, `c2288ba` frontend support, `c50a5e9` unit/integration tests; `0974fc3` E2E fixes, `ef335cf` direct Office extraction, `b997f93`/`9f449e8` GPU memory management and documentation, `2ecd022` E2E tests completed.
- **PDF layout restoration initiative (proposed 11-20, implementation peak 11-24)**: after the `cf894b0` proposal, `0aff468` delivered Phase 1 image/table fixes, `3fc32bc` Phase 2 style preservation, and `77fe4cc`/`ad879d4`/`75c194f` added alignment, lists, span-level rendering, and multi-column support; a series of commits `93bd9f5`~`3358d97` fixed position/overlap/missing-image issues, and `4325d02` cleaned up the project and archived the proposal.
- **PP-Structure V3 tuning (11-25)**: `a659e7a` improved preservation of complex figure structure, `2312b4c` made `pp_structure` parameters adjustable from the frontend with tests, `0999898` corrected multi-page PDF coordinates.
- **Memory management and hybrid image extraction (11-25~11-26)**: `ba8ddf2` proposal, `1afdb82` landed hybrid image extraction plus memory management, and the `b997f93` series handled GPU release/optional torch; ModelManager, ServicePool, and MemoryGuard were introduced (see `openspec/changes/archive/2025-11-26-enhance-memory-management`); `a227311` archived the proposal with only 75/80 tasks complete (documentation remains); subsequent fixes (`79cffe6`~`fa9b542`) addressed PDF regressions and text rendering, and `6e050eb` is the latest OCR-track table format/cropping fix.
## Quality and Test Signals
- On 11-16 the V2 API tests reached 18/18 (`6bb5b76`), establishing initial confidence.
- The dual-track rollout added unit/integration/E2E tests (`0fcb249`, `c50a5e9`, `2ecd022`), but the later PDF layout restoration work relied heavily on manual verification; its Phase 4 testing is still incomplete (see below).
- The memory management change shipped with 57+18+10 test files (task 8.1 complete), but the missing documentation may hinder handover and tuning.
- The burst of consecutive PDF fixes on 11-24 points to iterative bug-fixing; regression test coverage should be expanded (especially tables, multi-column layouts, lists, and cross-track PDFs).
## Outstanding Items and Risks
- **Memory management documentation gap**: Section 8.2 of `openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md` (architecture notes, tuning guide, troubleshooting, monitoring, migration guide) is unfinished, which may affect how operable the deployment is.
- **Insufficient validation of PDF layout restoration**: the same change's Phase 4 testing/performance/documentation and multi-document-type verification are all unchecked, so quality currently rests on manual testing.
- **Recent fixes concentrate on PDF and tables** (`79cffe6`, `5c561f4`, `19bd5fd`, `fa9b542`, `6e050eb`), showing the Direct/OCR-track PDF paths remain fragile; without automated regression tests they can easily regress again.
- **Main branch status**: `main` is 1 commit ahead of `origin/main` (`6e050eb`); confirm CI/tests before pushing.
## Recommended Next Steps
1) Finish the memory management documentation (architecture, tuning, troubleshooting, Prometheus monitoring guide) and add a sanity check.
2) Build a minimal regression set for PDF layout restoration: multi-column documents, Direct/OCR-track documents with charts/tables, and mixed list/span layouts.
3) Add tests around `processing_track` routing and UnifiedDocument/PDF-generation edge cases (LOGO/unknown elements, cross-page tables, mixed OCR/Direct images).
4) Run the existing unit/integration/E2E tests before pushing, and add scripts for the scenarios introduced over the past two weeks to reduce regression risk.

View File

@@ -0,0 +1,24 @@
# Project Risk & Issue Outlook
This document consolidates the project's foreseeable issues, latent issues, and suggested remediation directions (ordered by risk and feasibility). Sources: `git log` (97 commits, 2025-11-12~11-26), `docs/architecture-overview.md`, `openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md`, among others.
## Foreseeable Issues
- **Memory management documentation gap**: the 8.2 documentation in `openspec/changes/archive/2025-11-26-enhance-memory-management/tasks.md` is unfinished; ModelManager/ServicePool/MemoryGuard lack a runbook for tuning and incident handling, making deployment and scaling error-prone. Direction: complete the architecture notes, tuning guide, troubleshooting, and concrete monitoring examples (Prometheus metrics and alert thresholds).
- **High PDF-generation regression risk**: layout preservation and table/image rendering were patched repeatedly after `fa1abcd` (e.g. `d33f605`, `92e326b`, `108784a`, `3358d97`, `6e050eb`), which points to missing automated regression. Direction: build a minimal regression set (multi-column text, documents with charts/tables, mixed list/span layouts) with golden PDF/JSON comparison covering both the Direct and OCR tracks.
- **Latest OCR table-format fix lacks regression coverage**: `6e050eb` fixed the OCR-track table data format and cropping with no corresponding tests. Direction: add integration tests for OCR-track table parsing and PDF rendering to keep them consistent with frontend download/display.
- **PP-Structure parameter tuning can strain resources**: the frontend exposes adjustable `pp_structure_params` (`2312b4c`); without a guard this can amplify GPU/memory pressure. Direction: whitelist and cap the hyperparameters on the backend and feed them into MemoryGuard estimates.
- **Chart capability on/off strategy lacks validation**: disabled in `b048f2d`, re-enabled in `7e12f16`, with no coverage or performance data. Direction: add health checks and A/B data collection around enabling/disabling the chart model.
## Latent Issues
- **UnifiedDocument structure drift risk**: the shared dual-track output has been adjusted repeatedly of late (lists, spans, multi-column, LOGO elements) without structural validation or schema locking, which can leave the frontend, exporters, and PDF generation inconsistent. Direction: define a JSON Schema or pydantic validation and establish contract tests.
- **Long-run behavior of the service pool and memory guard is unverified**: unit/integration tests exist, but there is no long-duration soak/stress coverage (GPU memory fragmentation, model unload/reload, signal handling). Direction: add a 24h soak test with memory-trend alerts and verify SIGTERM/SIGINT cleanup.
- **Low observability in the LibreOffice conversion chain**: direct Office extraction and PDF conversion (`ef335cf`) depend on the system LibreOffice, with no failure monitoring or retry strategy. Direction: add metrics/alerts for the conversion stage and provide fallback/retry.
- **Missing frontend/backend API contract checks**: after repeated V1→V2 migrations and new parameters (`pp_structure_params` etc.), coverage relies on E2E tests alone, with no type/contract checks. Direction: add OpenAPI contract tests or generated type validation (a ts-sdk aligned with the FastAPI schema).
- **Hybrid image extraction / image-saving path edge cases**: the Direct/OCR hybrid extraction and `_save_image` implementation have been patched several times and still lack defenses for None/missing-file paths. Direction: strengthen assertions and fallbacks for PDF generation when files or images are missing.
## Suggested Fixes and Directions
1) **Finish the memory management docs and template configuration**: add a MemoryGuard/ServicePool tuning and troubleshooting guide under `docs/`, with a sample `.env` and Prometheus rules, mapped to the tasks 8.2 checklist.
2) **Build a PDF/UnifiedDocument regression suite**: collect representative samples (multi-column, tables, lists, with images/LOGO, both OCR/Direct tracks), generate golden JSON/PDF, compare them in CI, and add tests for the table paths touched by `6e050eb`.
3) **Add UnifiedDocument schema validation**: define a schema (pydantic/JSON Schema) and validate before export/PDF generation; also generate frontend types from OpenAPI to prevent drift (a minimal sketch follows this list).
4) **PP-Structure parameter guarding and resource estimation**: implement backend whitelisting/upper bounds plus MemoryGuard estimates so free-form frontend tuning cannot cause GPU OOM, and add rejection/degradation feedback.
5) **Long-run stability and conversion observability**: add a soak/stress pipeline tracking GPU/CPU/memory fragmentation, and add metrics, retries, and error-classified alerts for the LibreOffice/conversion stage.
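To make item 3 concrete, below is a minimal sketch of such a contract check, assuming a pydantic v2 environment. The `UnifiedElement`/`Box` models and their field names are illustrative stand-ins, not the project's actual `UnifiedDocument` schema; in practice they should be replaced by, or generated from, the real models.

```python
# Illustrative contract check; the field names here are assumptions, align with the real schema before use.
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, ValidationError


class Box(BaseModel):
    x0: float
    y0: float
    x1: float
    y1: float


class UnifiedElement(BaseModel):
    element_id: str
    type: str
    content: str
    bbox: Box
    confidence: Optional[float] = None


def validate_exported_elements(elements: List[Dict[str, Any]]) -> List[str]:
    """Return validation errors for an exported element list; an empty list means it matches the contract."""
    errors: List[str] = []
    for idx, raw in enumerate(elements):
        try:
            UnifiedElement.model_validate(raw)
        except ValidationError as exc:
            errors.append(f"element[{idx}]: {exc}")
    return errors
```

A CI contract test could feed golden JSON exports through a validator like this and fail on any drift in the shared structure.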

View File

@@ -0,0 +1,110 @@
import { cn } from '@/lib/utils'
import { Check, FileText, Globe, BookOpen } from 'lucide-react'
import { useTranslation } from 'react-i18next'
import type { LayoutModel } from '@/types/apiV2'
interface LayoutModelSelectorProps {
value: LayoutModel
onChange: (model: LayoutModel) => void
disabled?: boolean
className?: string
}
const MODEL_ICONS: Record<LayoutModel, React.ReactNode> = {
chinese: <FileText className="w-5 h-5" />,
default: <Globe className="w-5 h-5" />,
cdla: <BookOpen className="w-5 h-5" />,
}
export default function LayoutModelSelector({
value,
onChange,
disabled = false,
className,
}: LayoutModelSelectorProps) {
const { t } = useTranslation()
const models: LayoutModel[] = ['chinese', 'default', 'cdla']
const getModelInfo = (model: LayoutModel) => ({
label: t(`processing.layoutModel.${model}`),
description: t(`processing.layoutModel.${model}Desc`),
})
return (
<div className={cn('border rounded-lg p-4 bg-white', className)}>
{/* Header */}
<div className="flex items-center gap-2 mb-4">
<FileText className="w-5 h-5 text-gray-600" />
<h3 className="text-lg font-semibold text-gray-900">{t('processing.layoutModel.title')}</h3>
</div>
{/* Model Options */}
<div className="space-y-3">
{models.map((model) => {
const info = getModelInfo(model)
const isSelected = value === model
return (
<button
key={model}
type="button"
disabled={disabled}
onClick={() => onChange(model)}
className={cn(
'w-full flex items-start gap-4 p-4 rounded-lg border-2 transition-all text-left',
isSelected
? 'border-blue-500 bg-blue-50'
: 'border-gray-200 hover:border-gray-300 hover:bg-gray-50',
disabled && 'opacity-50 cursor-not-allowed'
)}
>
{/* Icon */}
<div
className={cn(
'p-2 rounded-lg flex-shrink-0',
isSelected ? 'bg-blue-100 text-blue-600' : 'bg-gray-100 text-gray-500'
)}
>
{MODEL_ICONS[model]}
</div>
{/* Content */}
<div className="flex-1 min-w-0">
<div className="flex items-center gap-2">
<span
className={cn(
'font-medium',
isSelected ? 'text-blue-700' : 'text-gray-900'
)}
>
{info.label}
</span>
{model === 'chinese' && (
<span className="text-xs bg-green-100 text-green-700 px-2 py-0.5 rounded-full">
{t('processing.layoutModel.recommended')}
</span>
)}
</div>
<p className="text-sm text-gray-500 mt-1">{info.description}</p>
</div>
{/* Check mark */}
{isSelected && (
<div className="flex-shrink-0">
<Check className="w-5 h-5 text-blue-600" />
</div>
)}
</button>
)
})}
</div>
{/* Info Note */}
<div className="mt-4 p-3 bg-blue-50 border border-blue-200 rounded-md">
<p className="text-sm text-blue-800">
{t('processing.layoutModel.note')}
</p>
</div>
</div>
)
}

View File

@@ -1,408 +0,0 @@
import { useState, useEffect } from 'react'
import { Settings, RotateCcw, HelpCircle, Save, Upload, Download, Check, AlertCircle } from 'lucide-react'
import { cn } from '@/lib/utils'
import type { PPStructureV3Params } from '@/types/apiV2'
const STORAGE_KEY = 'pp_structure_params_presets'
const LAST_USED_KEY = 'pp_structure_params_last_used'
interface PPStructureParamsProps {
value: PPStructureV3Params
onChange: (params: PPStructureV3Params) => void
disabled?: boolean
className?: string
}
interface ParamConfig {
key: keyof PPStructureV3Params
label: string
description: string
min: number
max: number
step: number
default: number
type: 'slider'
}
interface SelectParamConfig {
key: keyof PPStructureV3Params
label: string
description: string
options: Array<{ value: string; label: string }>
default: string
type: 'select'
}
// Preset configurations
const PRESETS = {
default: {} as PPStructureV3Params,
'high-quality': {
layout_detection_threshold: 0.1,
layout_nms_threshold: 0.15,
text_det_thresh: 0.1,
text_det_box_thresh: 0.2,
layout_merge_bboxes_mode: 'small' as const,
} as PPStructureV3Params,
fast: {
layout_detection_threshold: 0.3,
layout_nms_threshold: 0.3,
text_det_thresh: 0.3,
text_det_box_thresh: 0.4,
layout_merge_bboxes_mode: 'large' as const,
} as PPStructureV3Params,
}
const PARAM_CONFIGS: Array<ParamConfig | SelectParamConfig> = [
{
key: 'layout_detection_threshold',
label: 'Layout Detection Threshold',
description: 'Lower = detect more blocks (including weak signals), Higher = only high-confidence blocks',
min: 0,
max: 1,
step: 0.05,
default: 0.2,
type: 'slider' as const,
},
{
key: 'layout_nms_threshold',
label: 'Layout NMS Threshold',
description: 'Lower = aggressive overlap removal, Higher = allow more overlapping boxes',
min: 0,
max: 1,
step: 0.05,
default: 0.2,
type: 'slider' as const,
},
{
key: 'layout_merge_bboxes_mode',
label: 'Layout Merge Mode',
description: 'Bounding box merging strategy',
options: [
{ value: 'small', label: 'Small (Conservative)' },
{ value: 'union', label: 'Union (Balanced)' },
{ value: 'large', label: 'Large (Aggressive)' },
],
default: 'small',
type: 'select' as const,
},
{
key: 'layout_unclip_ratio',
label: 'Layout Unclip Ratio',
description: 'Larger = looser bounding boxes, Smaller = tighter bounding boxes',
min: 0.5,
max: 3.0,
step: 0.1,
default: 1.2,
type: 'slider' as const,
},
{
key: 'text_det_thresh',
label: 'Text Detection Threshold',
description: 'Lower = detect more small/low-contrast text, Higher = cleaner but may miss text',
min: 0,
max: 1,
step: 0.05,
default: 0.2,
type: 'slider' as const,
},
{
key: 'text_det_box_thresh',
label: 'Text Box Threshold',
description: 'Lower = more text boxes retained, Higher = fewer false positives',
min: 0,
max: 1,
step: 0.05,
default: 0.3,
type: 'slider' as const,
},
{
key: 'text_det_unclip_ratio',
label: 'Text Unclip Ratio',
description: 'Larger = looser text boxes, Smaller = tighter text boxes',
min: 0.5,
max: 3.0,
step: 0.1,
default: 1.2,
type: 'slider' as const,
},
]
export default function PPStructureParams({
value,
onChange,
disabled = false,
className,
}: PPStructureParamsProps) {
const [showTooltip, setShowTooltip] = useState<string | null>(null)
const [isExpanded, setIsExpanded] = useState(false)
const [selectedPreset, setSelectedPreset] = useState<string>('custom')
const [showSaveSuccess, setShowSaveSuccess] = useState(false)
// Load last used parameters on mount
useEffect(() => {
try {
const lastUsed = localStorage.getItem(LAST_USED_KEY)
if (lastUsed && Object.keys(value).length === 0) {
const params = JSON.parse(lastUsed)
onChange(params)
}
} catch (error) {
console.error('Failed to load last used parameters:', error)
}
}, [])
// Save to localStorage when parameters change
useEffect(() => {
if (Object.keys(value).length > 0) {
try {
localStorage.setItem(LAST_USED_KEY, JSON.stringify(value))
} catch (error) {
console.error('Failed to save parameters:', error)
}
}
}, [value])
const handleReset = () => {
onChange({})
setSelectedPreset('default')
setShowSaveSuccess(false)
}
const handlePresetChange = (presetKey: string) => {
setSelectedPreset(presetKey)
if (presetKey === 'custom') return
const preset = PRESETS[presetKey as keyof typeof PRESETS]
if (preset) {
onChange(preset)
setShowSaveSuccess(false)
}
}
const handleChange = (key: keyof PPStructureV3Params, newValue: any) => {
const newParams = {
...value,
[key]: newValue,
}
onChange(newParams)
setSelectedPreset('custom')
}
const handleExport = () => {
const dataStr = JSON.stringify(value, null, 2)
const dataUri = 'data:application/json;charset=utf-8,' + encodeURIComponent(dataStr)
const exportFileDefaultName = 'pp_structure_params.json'
const linkElement = document.createElement('a')
linkElement.setAttribute('href', dataUri)
linkElement.setAttribute('download', exportFileDefaultName)
linkElement.click()
}
const handleImport = () => {
const input = document.createElement('input')
input.type = 'file'
input.accept = 'application/json'
input.onchange = (e) => {
const file = (e.target as HTMLInputElement).files?.[0]
if (file) {
const reader = new FileReader()
reader.onload = (event) => {
try {
const params = JSON.parse(event.target?.result as string)
onChange(params)
setSelectedPreset('custom')
setShowSaveSuccess(true)
setTimeout(() => setShowSaveSuccess(false), 3000)
} catch (error) {
console.error('Failed to import parameters:', error)
}
}
reader.readAsText(file)
}
}
input.click()
}
const hasCustomValues = Object.keys(value).length > 0
return (
<div className={cn('border rounded-lg p-4 bg-white', className)}>
{/* Header */}
<div className="flex items-center justify-between mb-4">
<div className="flex items-center gap-2">
<Settings className="w-5 h-5 text-gray-600" />
<h3 className="text-lg font-semibold text-gray-900">PP-StructureV3 Parameters</h3>
{hasCustomValues && (
<span className="text-xs bg-blue-100 text-blue-700 px-2 py-1 rounded">Custom</span>
)}
{showSaveSuccess && (
<span className="flex items-center gap-1 text-xs bg-green-100 text-green-700 px-2 py-1 rounded animate-in fade-in">
<Check className="w-3 h-3" />
Saved
</span>
)}
</div>
<div className="flex items-center gap-2">
<button
type="button"
onClick={() => setIsExpanded(!isExpanded)}
className="text-sm text-blue-600 hover:text-blue-700 px-3 py-1.5 rounded-md hover:bg-blue-50"
>
{isExpanded ? 'Hide' : 'Show'} Parameters
</button>
</div>
</div>
{/* Preset Selector & Actions */}
{isExpanded && (
<div className="mb-4 p-3 bg-gray-50 rounded-md space-y-3">
<div className="flex items-center gap-3">
<label className="text-sm font-medium text-gray-700">Preset:</label>
<select
value={selectedPreset}
onChange={(e) => handlePresetChange(e.target.value)}
disabled={disabled}
className="flex-1 px-3 py-1.5 text-sm border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100"
>
<option value="default">Default (Backend Settings)</option>
<option value="high-quality">High Quality (Lower Thresholds)</option>
<option value="fast">Fast (Higher Thresholds)</option>
<option value="custom">Custom</option>
</select>
</div>
<div className="flex items-center gap-2">
<button
type="button"
onClick={handleReset}
disabled={disabled || !hasCustomValues}
className={cn(
'flex items-center gap-1 px-3 py-1.5 text-sm rounded-md transition-colors',
disabled || !hasCustomValues
? 'bg-gray-200 text-gray-400 cursor-not-allowed'
: 'bg-white border border-gray-300 text-gray-700 hover:bg-gray-50'
)}
>
<RotateCcw className="w-4 h-4" />
Reset
</button>
<button
type="button"
onClick={handleExport}
disabled={disabled || !hasCustomValues}
className={cn(
'flex items-center gap-1 px-3 py-1.5 text-sm rounded-md transition-colors',
disabled || !hasCustomValues
? 'bg-gray-200 text-gray-400 cursor-not-allowed'
: 'bg-white border border-gray-300 text-gray-700 hover:bg-gray-50'
)}
>
<Download className="w-4 h-4" />
Export
</button>
<button
type="button"
onClick={handleImport}
disabled={disabled}
className={cn(
'flex items-center gap-1 px-3 py-1.5 text-sm rounded-md transition-colors',
disabled
? 'bg-gray-200 text-gray-400 cursor-not-allowed'
: 'bg-white border border-gray-300 text-gray-700 hover:bg-gray-50'
)}
>
<Upload className="w-4 h-4" />
Import
</button>
</div>
</div>
)}
{/* Expanded Parameters */}
{isExpanded && (
<div className="space-y-6 pt-4 border-t">
{PARAM_CONFIGS.map((config) => (
<div key={config.key} className="space-y-2">
<div className="flex items-center justify-between">
<div className="flex items-center gap-2">
<label htmlFor={config.key} className="text-sm font-medium text-gray-700">
{config.label}
</label>
<button
type="button"
onMouseEnter={() => setShowTooltip(config.key)}
onMouseLeave={() => setShowTooltip(null)}
className="text-gray-400 hover:text-gray-600 relative"
>
<HelpCircle className="w-4 h-4" />
{showTooltip === config.key && (
<div className="absolute left-6 top-0 w-64 p-2 bg-gray-900 text-white text-xs rounded shadow-lg z-10">
{config.description}
</div>
)}
</button>
</div>
{config.type === 'slider' && (
<div className="flex items-center gap-2">
<span className="text-sm font-semibold text-blue-600">
{value[config.key] ?? config.default}
</span>
{value[config.key] !== undefined && value[config.key] !== config.default && (
<span className="text-xs text-gray-500">
(default: {config.default})
</span>
)}
</div>
)}
</div>
{config.type === 'slider' ? (
<input
type="range"
id={config.key}
min={config.min}
max={config.max}
step={config.step}
value={value[config.key] ?? config.default}
onChange={(e) => handleChange(config.key, parseFloat(e.target.value))}
disabled={disabled}
className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed disabled:opacity-50"
/>
) : (
<select
id={config.key}
value={(value[config.key] as string) ?? config.default}
onChange={(e) => handleChange(config.key, e.target.value)}
disabled={disabled}
className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
>
{config.options.map((option) => (
<option key={option.value} value={option.value}>
{option.label}
</option>
))}
</select>
)}
</div>
))}
{/* Info Note */}
<div className="mt-4 p-3 bg-blue-50 border border-blue-200 rounded-md">
<p className="text-sm text-blue-800">
<strong>Note:</strong> These parameters only apply when using the OCR track. Adjusting them
can help improve accuracy for specific document types.
</p>
</div>
</div>
)}
{/* Collapsed Summary */}
{!isExpanded && hasCustomValues && (
<div className="text-sm text-gray-600">
{Object.keys(value).length} parameter(s) customized
</div>
)}
</div>
)
}

View File

@@ -52,6 +52,17 @@
"language": "識別語言",
"threshold": "信心度閾值",
"layoutDetection": "版面偵測"
},
"layoutModel": {
"title": "版面偵測模型",
"chinese": "中文文件模型",
"chineseDesc": "PP-DocLayout-S - 適用於中文表單、合約、發票(推薦)",
"default": "標準模型",
"defaultDesc": "PubLayNet 模型 - 適用於英文學術論文、報告",
"cdla": "CDLA 模型",
"cdlaDesc": "專用中文版面分析模型 - 適用於複雜中文版面",
"recommended": "推薦",
"note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。"
}
},
"results": {

View File

@@ -9,10 +9,10 @@ import { Badge } from '@/components/ui/badge'
import { useToast } from '@/components/ui/toast'
import { apiClientV2 } from '@/services/apiV2'
import { Play, CheckCircle, FileText, AlertCircle, Clock, Activity, Loader2 } from 'lucide-react'
import PPStructureParams from '@/components/PPStructureParams'
import LayoutModelSelector from '@/components/LayoutModelSelector'
import TaskNotFound from '@/components/TaskNotFound'
import { useTaskValidation } from '@/hooks/useTaskValidation'
import type { PPStructureV3Params, ProcessingOptions } from '@/types/apiV2'
import type { LayoutModel, ProcessingOptions } from '@/types/apiV2'
export default function ProcessingPage() {
const { t } = useTranslation()
@@ -31,8 +31,8 @@ export default function ProcessingPage() {
},
})
// PP-StructureV3 parameters state
const [ppStructureParams, setPpStructureParams] = useState<PPStructureV3Params>({})
// Layout model state (default to 'chinese' for best Chinese document support)
const [layoutModel, setLayoutModel] = useState<LayoutModel>('chinese')
// Start OCR processing
const processOCRMutation = useMutation({
@@ -40,11 +40,7 @@ export default function ProcessingPage() {
const options: ProcessingOptions = {
use_dual_track: true,
language: 'ch',
}
// Only include pp_structure_params if user has customized them
if (Object.keys(ppStructureParams).length > 0) {
options.pp_structure_params = ppStructureParams
layout_model: layoutModel,
}
return apiClientV2.startTask(taskId!, options)
@@ -346,11 +342,11 @@ export default function ProcessingPage() {
</Card>
)}
{/* PP-StructureV3 Parameters (only show when task is pending) */}
{/* Layout Model Selection (only show when task is pending) */}
{isPending && (
<PPStructureParams
value={ppStructureParams}
onChange={setPpStructureParams}
<LayoutModelSelector
value={layoutModel}
onChange={setLayoutModel}
disabled={processOCRMutation.isPending}
/>
)}

View File

@@ -73,15 +73,14 @@ export interface DocumentAnalysisResponse {
page_count: number | null
}
export interface PPStructureV3Params {
layout_detection_threshold?: number // 0-1: Lower=more blocks, Higher=high confidence only
layout_nms_threshold?: number // 0-1: Lower=aggressive overlap removal, Higher=allow more overlap
layout_merge_bboxes_mode?: 'union' | 'large' | 'small' // small=conservative, large=aggressive, union=middle
layout_unclip_ratio?: number // >0: Larger=looser boxes, Smaller=tighter boxes
text_det_thresh?: number // 0-1: Lower=detect more small/low-contrast text, Higher=cleaner
text_det_box_thresh?: number // 0-1: Lower=more text boxes, Higher=fewer false positives
text_det_unclip_ratio?: number // >0: Larger=looser text boxes, Smaller=tighter boxes
}
/**
* Layout detection model selection for OCR track.
* Different models are optimized for different document types:
* - chinese: PP-DocLayout-S - Best for Chinese forms, contracts, invoices
* - default: PubLayNet-based - Best for English academic papers
* - cdla: Specialized for Chinese document layout analysis
*/
export type LayoutModel = 'chinese' | 'default' | 'cdla'
export interface ProcessingOptions {
use_dual_track?: boolean
@@ -89,7 +88,7 @@ export interface ProcessingOptions {
language?: string
include_layout?: boolean
include_images?: boolean
pp_structure_params?: PPStructureV3Params // Fine-tuning parameters for PP-StructureV3 (OCR track only)
layout_model?: LayoutModel // Layout detection model selection (OCR track only)
}
export interface TaskCreate {

View File

@@ -0,0 +1,28 @@
# Change: Fix OCR Track Table Empty Columns and Alignment
## Why
Tables produced by PP-Structure frequently contain blank columns (every row's cell in that column is empty or whitespace-only), which leaves the converted UnifiedDocument tables with empty columns and misaligned cells. The OCR track currently passes the raw data through without any cleanup, degrading PDF/JSON/Markdown output quality.
## What Changes
- Add a `trim_empty_columns()` function that strips empty columns from OCR track tables (sketched below)
- Call the cleanup at the `_convert_table_data` entry point so every TableData is clean
- Recalculate col_span: when a span crosses a removed column, shrink the span
- Update the columns/cols count and adjust each cell's col index
- Optional: sort columns by bbox x0 for better alignment
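A minimal Python sketch of the cleanup described above, assuming the table dict carries a flat `cells` list whose entries expose `col`, `col_span`, and `content` keys (the exact field names used by `ocr_to_unified_converter.py` may differ):

```python
from typing import Any, Dict, List

def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Drop columns in which every cell's content is empty or whitespace-only."""
    cells: List[Dict[str, Any]] = table_dict.get("cells", [])
    if not cells:
        return table_dict

    num_cols = max(c["col"] + c.get("col_span", 1) for c in cells)

    # A column is removable only when *all* cells touching it are blank.
    empty_cols = []
    for col in range(num_cols):
        in_col = [c for c in cells
                  if c["col"] <= col < c["col"] + c.get("col_span", 1)]
        if in_col and all(not (c.get("content") or "").strip() for c in in_col):
            empty_cols.append(col)

    if not empty_cols:
        table_dict["columns"] = table_dict.get("columns") or num_cols
        return table_dict

    # Map surviving old column indices to new, contiguous indices.
    remap = {}
    for old in range(num_cols):
        if old not in empty_cols:
            remap[old] = len(remap)

    new_cells = []
    for c in cells:
        kept = [col for col in range(c["col"], c["col"] + c.get("col_span", 1))
                if col in remap]
        if not kept:
            continue  # the whole span fell inside removed columns
        new_cells.append(dict(c, col=remap[kept[0]], col_span=len(kept)))

    table_dict["cells"] = new_cells
    table_dict["columns"] = num_cols - len(empty_cols)
    return table_dict
```

The header constraint is satisfied implicitly: a column whose header is blank but whose data rows have content still contains non-blank cells, so it is never collected into `empty_cols`.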
## Impact
- Affected specs: `ocr-processing`
- Affected code:
- `backend/app/services/ocr_to_unified_converter.py` (primary change)
- Direct/HYBRID paths are not affected
- PDF/JSON/Markdown output becomes cleaner
## Constraints
- Keep table bbox and page coordinates unchanged
- Do not modify the Direct/HYBRID paths
- Only remove columns where every row is empty; if the header is empty but data rows have values, the column must not be removed
- Preserve the original bbox to avoid layout drift in the PDF

View File

@@ -0,0 +1,61 @@
## ADDED Requirements
### Requirement: OCR Table Empty Column Cleanup
The OCR Track converter SHALL clean up PP-Structure generated tables by removing columns where all rows have empty or whitespace-only content.
The system SHALL:
1. Identify columns where every cell's content is empty or contains only whitespace (using `.strip()` to determine emptiness)
2. Remove identified empty columns from the table structure
3. Update the `columns`/`cols` value to reflect the new column count
4. Recalculate each cell's `col` index to maintain continuity
5. Adjust `col_span` values when spans cross removed columns (shrink span size)
6. Remove cells entirely when their complete span falls within removed columns
7. Preserve original bbox and page coordinates (no layout drift)
8. If `columns` is 0 or missing after cleanup, fill with the calculated column count
The cleanup SHALL NOT:
- Remove columns where the header is empty but data rows contain values
- Modify tables in Direct or HYBRID track
- Alter the original bbox coordinates
#### Scenario: All rows in column are empty
- **WHEN** a table has a column where all cells contain only empty or whitespace content
- **THEN** that column is removed
- **AND** remaining cells have their `col` indices decremented appropriately
- **AND** `cols` count is reduced by 1
#### Scenario: Column has empty header but data has values
- **WHEN** a table has a column where the header cell is empty
- **AND** at least one data row cell in that column contains non-whitespace content
- **THEN** that column is NOT removed
#### Scenario: Cell span crosses removed column
- **WHEN** a cell has `col_span > 1`
- **AND** one or more columns within the span are removed
- **THEN** the `col_span` is reduced by the number of removed columns within the span
#### Scenario: Cell span entirely within removed columns
- **WHEN** a cell's entire span falls within columns that are all removed
- **THEN** that cell is removed from the table
#### Scenario: Missing columns metadata
- **WHEN** the table dict has `columns` set to 0 or missing
- **AFTER** cleanup is performed
- **THEN** `columns` is set to the calculated number of remaining columns
### Requirement: OCR Table Column Alignment by Bbox
(Optional Enhancement) When bbox coordinates are available for table cells, the OCR Track converter SHALL use cell bbox x0 coordinates to improve column alignment accuracy.
The system SHALL:
1. Sort cells by bbox `x0` coordinate before assigning column indices
2. Reassign `col` indices based on spatial position rather than HTML order
This requirement is optional and implementation MAY be deferred if bbox data is not reliably available.
#### Scenario: Cells reordered by bbox position
- **WHEN** bbox coordinates are available for table cells
- **AND** the original HTML order does not match spatial order
- **THEN** cells are reordered by `x0` coordinate
- **AND** `col` indices are reassigned to reflect spatial positioning
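A possible shape for this optional reassignment, assuming each cell carries a `bbox` laid out as `[x0, y0, x1, y1]` (a hypothetical field layout, not confirmed by the converter code):

```python
from typing import Any, Dict, List

def reassign_columns_by_x0(row_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Reorder the cells of one row by bbox x0 and renumber their col indices spatially."""
    ordered = sorted(row_cells, key=lambda c: c["bbox"][0])  # x0 is the leftmost coordinate
    for new_col, cell in enumerate(ordered):
        cell["col"] = new_col
    return ordered
```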

View File

@@ -0,0 +1,43 @@
# Tasks: Fix OCR Track Table Empty Columns
## 1. Core Implementation
- [x] 1.1 Implement `trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]` in `ocr_to_unified_converter.py`
  - Use the cells array to decide, per column, whether every row's content is empty/whitespace
  - Use `.strip()` to test for whitespace
- [x] 1.2 Implement column removal
  - Update the columns/cols count
  - Adjust each cell's col index
- [x] 1.3 Implement col_span recalculation
  - Shrink a span when it crosses a removed column
  - Remove the cell when its entire span falls on removed columns
- [x] 1.4 Call `trim_empty_columns` at the `_convert_table_data` entry point
  - Run the cleanup before building TableData
  - Also apply the cleanup in `_extract_table_data` (HTML table parsing)
- [ ] 1.5 (Optional) Align columns by bbox x0/x1
  - If a bbox grid is available, sort by x0 first and then reassign col indices
  - Deferred until bbox data availability is confirmed
## 2. Testing & Validation
- [x] 2.1 Unit tests pass
  - Basic empty-column removal
  - Empty header with data values (column kept)
  - col_span crossing removed columns (span shrunk)
  - Cell entirely inside removed columns (cell removed)
  - Table with no empty columns (unchanged)
- [x] 2.2 Review existing OCR results
  - No existing results contain fully empty columns
  - The implementation is in place and will clean such columns when they appear
- [x] 2.3 Confirm Direct/HYBRID tables are unchanged
  - `OCRToUnifiedConverter` is only used in `ocr_service.py`
  - The Direct track uses `DirectExtractionEngine` and is unaffected
## 3. Edge Cases & Validation
- [x] 3.1 Handle `columns` being 0 or missing
  - Backfill with the calculated column count so downstream consumers do not break
- [x] 3.2 Handle an empty header with data values
  - Only remove columns where every row is empty
- [x] 3.3 Ensure `backend/storage/results/...` is not modified directly
  - The converter is changed; tasks must be re-run to verify

View File

@@ -0,0 +1,183 @@
# Design: OCR Track Gap Filling
## Context
The PP-StructureV3 layout analysis model severely under-detects content on some scanned documents. In testing, raw PaddleOCR detected 56 text regions while PP-StructureV3 emitted only 9 elements (about 84% lost).
The problem lies in PP-StructureV3's internal layout detection model, a limitation of the PaddleOCR library that cannot be fixed from outside. The raw OCR `text_regions` data, however, remains complete and usable.
### Stakeholders
- **End users**: need complete OCR output without large amounts of missing text
- **OCR track**: needs to combine raw OCR and PP-StructureV3 results
- **Direct/Hybrid track**: must not be affected by this change
## Goals / Non-Goals
### Goals
- Detect regions missed by PP-StructureV3 and fill them back in from raw OCR results
- Ensure the supplemented text does not duplicate existing elements
- Maintain correct reading order
- Affect only the OCR track; other tracks keep their current behavior
### Non-Goals
- No changes to PP-StructureV3 or PaddleOCR internals
- No gap filling for non-text elements such as images, tables, or figures
- No complex layout analysis (gap filling only)
## Decisions
### Decision 1: Coverage Determination Strategy
**Choice**: Prefer a "center point falls inside" test, supplemented by an IoU threshold
**Rationale**:
- The center-point test is cheap to compute and performs well
- The IoU threshold handles boundary cases the center-point test misses
- A threshold of roughly 0.1-0.2 is suggested so that low-IoU overlaps are not misclassified as uncovered
**Alternatives**:
- Pure IoU test: more computation and harder to reason about for partial overlaps
- Area-ratio test: unfair to regions of very different sizes
### Decision 2: Gap Filling Trigger Condition
**Choice**: Activate when PP-Structure coverage is below 70% or its element count is significantly lower than the raw OCR region count (illustrated below)
**Rationale**:
- Prevents duplicated text on documents that are already well covered
- The 70% threshold is an empirical value and is configurable
- The element-count comparison serves as a quick pre-check
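As a rough illustration of this trigger (function and parameter names are placeholders, not the service's actual interface):

```python
def should_activate_gap_filling(
    covered_regions: int,
    total_raw_regions: int,
    pp_element_count: int,
    coverage_threshold: float = 0.7,
) -> bool:
    """Activate gap filling when coverage is low or PP-StructureV3 returns far fewer elements."""
    if total_raw_regions == 0:
        return False  # nothing to supplement
    coverage = covered_regions / total_raw_regions
    low_coverage = coverage < coverage_threshold
    # "Significantly fewer elements" is approximated here with the same configurable ratio.
    large_disparity = pp_element_count < total_raw_regions * coverage_threshold
    return low_coverage or large_disparity
```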
### Decision 3: Element Types to Supplement
**Choice**: Supplement TEXT only; skip TABLE/IMAGE/FIGURE/FLOWCHART/HEADER/FOOTER
**Rationale**:
- PP-StructureV3 is usually accurate on structured elements (tables, images)
- Injecting raw OCR text into them could break table structure
- These elements must keep their structural integrity
### Decision 4: Duplicate Detection and Deduplication
**Choice**: Treat a raw OCR region with IoU > 0.5 against a PP-Structure TEXT element as a duplicate and skip it
**Rationale**:
- 0.5 is a common overlap threshold
- Prevents the same text from appearing twice
- Lightweight merging can be considered for fragmented raw OCR boxes
### Decision 5: Coordinate Alignment
**Choice**: Use `ocr_dimensions` to convert bboxes (sketched below)
**Rationale**:
- The OCR step may resize the image
- Raw OCR and PP-Structure coordinates must live in the same space
- Prevents coverage misdetection caused by mismatched dimensions
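A minimal sketch of the rescaling this decision implies, assuming `ocr_dimensions` holds the (width, height) the raw OCR boxes refer to and `target_dimensions` the PP-Structure coordinate space (both parameter names are assumptions):

```python
from typing import Tuple

Bbox = Tuple[float, float, float, float]  # (x0, y0, x1, y1)

def rescale_bbox(bbox: Bbox, ocr_dimensions: Tuple[int, int],
                 target_dimensions: Tuple[int, int]) -> Bbox:
    """Scale a bbox from raw-OCR image space into PP-Structure image space."""
    sx = target_dimensions[0] / ocr_dimensions[0]
    sy = target_dimensions[1] / ocr_dimensions[1]
    x0, y0, x1, y1 = bbox
    return (x0 * sx, y0 * sy, x1 * sx, y1 * sy)
```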
## Data Flow
```
┌──────────────────┐      ┌───────────────────────┐
│ Raw OCR Result   │      │ PP-StructureV3 Result │
│ (56 regions)     │      │ (9 elements)          │
└────────┬─────────┘      └───────────┬───────────┘
         │                            │
         └─────────────┬──────────────┘
                       │
         ┌─────────────▼──────────────┐
         │      GapFillingService     │
         │  1. Calculate coverage     │
         │  2. Find uncovered regions │
         │  3. Filter by confidence   │
         │  4. Deduplicate            │
         │  5. Merge if needed        │
         └─────────────┬──────────────┘
                       │
         ┌─────────────▼──────────────┐
         │   OCRToUnifiedConverter    │
         │  - Combine elements        │
         │  - Recalculate reading     │
         │    order                   │
         └─────────────┬──────────────┘
                       │
         ┌─────────────▼──────────────┐
         │      UnifiedDocument       │
         │    (complete content)      │
         └────────────────────────────┘
```
## Algorithm: Gap Detection
```python
def find_uncovered_regions(
raw_ocr_regions: List[TextRegion],
pp_structure_elements: List[Element],
iou_threshold: float = 0.15
) -> List[TextRegion]:
"""
Find Raw OCR regions not covered by PP-Structure elements.
Coverage criteria (either one):
1. Center point of raw region falls inside any PP-Structure bbox
2. IoU with any PP-Structure bbox > iou_threshold
"""
uncovered = []
# Filter PP-Structure elements: only consider TEXT, skip TABLE/IMAGE/etc.
text_elements = [e for e in pp_structure_elements
if e.type not in SKIP_TYPES]
for region in raw_ocr_regions:
center = get_center(region.bbox)
is_covered = False
for element in text_elements:
# Check center point
if point_in_bbox(center, element.bbox):
is_covered = True
break
# Check IoU
if calculate_iou(region.bbox, element.bbox) > iou_threshold:
is_covered = True
break
if not is_covered:
uncovered.append(region)
return uncovered
```
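The helpers referenced above are not defined in this design note; assuming bboxes are `(x0, y0, x1, y1)` tuples, they could be sketched as:

```python
from typing import Tuple

Bbox = Tuple[float, float, float, float]  # (x0, y0, x1, y1)

def get_center(bbox: Bbox) -> Tuple[float, float]:
    """Geometric center of an axis-aligned box."""
    x0, y0, x1, y1 = bbox
    return ((x0 + x1) / 2, (y0 + y1) / 2)

def point_in_bbox(point: Tuple[float, float], bbox: Bbox) -> bool:
    """True when the point lies inside (or on the edge of) the box."""
    x, y = point
    x0, y0, x1, y1 = bbox
    return x0 <= x <= x1 and y0 <= y <= y1

def calculate_iou(a: Bbox, b: Bbox) -> float:
    """Intersection-over-union of two axis-aligned boxes."""
    ix0, iy0 = max(a[0], b[0]), max(a[1], b[1])
    ix1, iy1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union > 0 else 0.0
```

The same `calculate_iou` also serves Decision 4: a raw OCR region whose IoU against an existing PP-Structure TEXT element exceeds `gap_filling_dedup_iou_threshold` is skipped as a duplicate.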
## Configuration Parameters
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `gap_filling_enabled` | bool | True | Enable or disable gap filling |
| `gap_filling_coverage_threshold` | float | 0.7 | Activate when coverage falls below this value |
| `gap_filling_iou_threshold` | float | 0.15 | IoU threshold for coverage determination |
| `gap_filling_confidence_threshold` | float | 0.3 | Minimum confidence for raw OCR regions |
| `gap_filling_dedup_iou_threshold` | float | 0.5 | IoU threshold for deduplication |
## Risks / Trade-offs
### Risk 1: Gap filling introduces duplicate text
**Mitigation**: Deduplicate high-overlap regions using `gap_filling_dedup_iou_threshold`
### Risk 2: Reading order becomes scrambled
**Mitigation**: After supplementing elements, recalculate the page-wide reading_order (sort by y0, then x0; see the sketch below)
### Risk 3: Performance impact
**Mitigation**:
- Run the cheap coverage check first and skip gap filling when coverage exceeds 70%
- Use an R-tree or interval tree to accelerate bbox lookups if performance becomes a bottleneck
### Risk 4: Misaligned coordinates
**Mitigation**: Use `ocr_dimensions` to keep both sources in the same coordinate space
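For Risk 2, the recalculation amounts to a sort on each element's top-left corner; a sketch assuming elements expose a `bbox` as `(x0, y0, x1, y1)` and a `reading_order` field:

```python
from typing import Any, Dict, List

def recalculate_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Sort page elements top-to-bottom, then left-to-right, and renumber reading_order."""
    ordered = sorted(elements, key=lambda e: (e["bbox"][1], e["bbox"][0]))  # (y0, x0)
    for idx, element in enumerate(ordered):
        element["reading_order"] = idx
    return ordered
```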
## Migration Plan
1. The new capability is optional (enabled by default)
2. Gap filling can be turned off via configuration
3. Existing API interfaces are unchanged
4. Backward compatible: omitting parameters keeps the default behavior
## Open Questions
1. Should the UI expose a toggle for enabling/disabling gap filling?
2. Do fragmented raw OCR boxes need merge logic (same line, adjacent, with very small gaps)?
3. Should the output flag which elements were added by gap filling (for debugging)?

View File

@@ -0,0 +1,30 @@
# Change: Add OCR Track Gap Filling with Raw OCR Text Regions
## Why
PP-StructureV3's layout analysis model severely under-detects content on some scanned documents, losing large amounts of text. Testing with scan.pdf showed:
- Raw PaddleOCR text recognition: detected **56 text regions**
- PP-StructureV3 layout analysis: emitted only **9 elements**
- Loss ratio: about **84%** of the content was not recognized by PP-StructureV3
The root cause is weak support for this kind of scanned document in PP-StructureV3's internal layout detection model, not a defect in our code. Raw OCR detects all text regions correctly, but that information is lost during PP-StructureV3's structuring step.
## What Changes
Implement a hybrid approach: use raw OCR text regions to supplement the content PP-StructureV3 misses.
- **Add** a `GapFillingService` class that detects and fills in text regions missed by PP-StructureV3
- **Add** coverage calculation logic (center-point containment or IoU threshold)
- **Add** an automatic activation condition: when PP-Structure coverage is below 70% or its element count is significantly lower than the raw OCR box count
- **Modify** `OCRToUnifiedConverter` to integrate the gap filling logic
- **Add** reading_order recalculation (sort by y0, then x0)
- **Add** test cases: a severe PP-Structure under-detection case, plus a normal document with no under-detection for validation
## Impact
- **Affected specs**: `ocr-processing`
- **Affected code**:
- `backend/app/services/ocr_to_unified_converter.py` - integrates gap filling
- `backend/app/services/gap_filling_service.py` - new (core logic)
- `backend/tests/test_gap_filling.py` - new (tests)
- **Track isolation**: applies only to the OCR track; the Direct/Hybrid tracks are unaffected

View File

@@ -0,0 +1,111 @@
## ADDED Requirements
### Requirement: OCR Track Gap Filling with Raw OCR Regions
The system SHALL detect and fill gaps in PP-StructureV3 output by supplementing with Raw OCR text regions when significant content loss is detected.
#### Scenario: Gap filling activates when coverage is low
- **GIVEN** an OCR track processing task
- **WHEN** PP-StructureV3 outputs elements that cover less than 70% of Raw OCR text regions
- **THEN** the system SHALL activate gap filling
- **AND** identify Raw OCR regions not covered by any PP-StructureV3 element
- **AND** supplement these regions as TEXT elements in the output
#### Scenario: Coverage is determined by center-point and IoU
- **GIVEN** a Raw OCR text region with bounding box
- **WHEN** checking if the region is covered by PP-StructureV3
- **THEN** the region SHALL be considered covered if its center point falls inside any PP-StructureV3 element bbox
- **OR** if IoU with any PP-StructureV3 element exceeds 0.15 threshold
- **AND** regions not meeting either criterion SHALL be marked as uncovered
#### Scenario: Only TEXT elements are supplemented
- **GIVEN** uncovered Raw OCR regions identified for supplementation
- **WHEN** PP-StructureV3 has detected TABLE, IMAGE, FIGURE, FLOWCHART, HEADER, or FOOTER elements
- **THEN** the system SHALL NOT supplement regions that overlap with these structural elements
- **AND** only supplement regions as TEXT type to preserve structural integrity
#### Scenario: Supplemented regions meet confidence threshold
- **GIVEN** Raw OCR regions to be supplemented
- **WHEN** a region has confidence score below 0.3
- **THEN** the system SHALL skip that region
- **AND** only supplement regions with confidence >= 0.3
#### Scenario: Deduplication prevents repeated text
- **GIVEN** a Raw OCR region being considered for supplementation
- **WHEN** the region has IoU > 0.5 with any existing PP-StructureV3 TEXT element
- **THEN** the system SHALL skip that region to prevent duplicate text
- **AND** the original PP-StructureV3 element SHALL be preserved
#### Scenario: Reading order is recalculated after gap filling
- **GIVEN** supplemented elements have been added to the page
- **WHEN** assembling the final element list
- **THEN** the system SHALL recalculate reading order for the entire page
- **AND** sort elements by y0 coordinate (top to bottom) then x0 (left to right)
- **AND** ensure logical document flow is maintained
#### Scenario: Coordinate alignment with ocr_dimensions
- **GIVEN** Raw OCR processing may involve image resizing
- **WHEN** comparing Raw OCR bbox with PP-StructureV3 bbox
- **THEN** the system SHALL use ocr_dimensions to normalize coordinates
- **AND** ensure both sources reference the same coordinate space
- **AND** prevent coverage misdetection due to scale differences
#### Scenario: Supplemented elements have complete metadata
- **GIVEN** a Raw OCR region being added as supplemented element
- **WHEN** creating the DocumentElement
- **THEN** the element SHALL include page_number
- **AND** include confidence score from Raw OCR
- **AND** include original bbox coordinates
- **AND** optionally include source indicator for debugging
### Requirement: Gap Filling Track Isolation
The gap filling feature SHALL only apply to OCR track processing and SHALL NOT affect Direct or Hybrid track outputs.
#### Scenario: Gap filling only activates for OCR track
- **GIVEN** a document processing task
- **WHEN** the processing track is OCR
- **THEN** the system SHALL evaluate and apply gap filling as needed
- **AND** produce enhanced output with supplemented content
#### Scenario: Direct track is unaffected
- **GIVEN** a document processing task with Direct track
- **WHEN** the task is processed
- **THEN** the system SHALL NOT invoke any gap filling logic
- **AND** produce output identical to current Direct track behavior
#### Scenario: Hybrid track is unaffected
- **GIVEN** a document processing task with Hybrid track
- **WHEN** the task is processed
- **THEN** the system SHALL NOT invoke gap filling logic
- **AND** use existing Hybrid track processing pipeline
### Requirement: Gap Filling Configuration
The system SHALL provide configurable parameters for gap filling behavior.
#### Scenario: Gap filling can be disabled via configuration
- **GIVEN** gap_filling_enabled is set to false in configuration
- **WHEN** OCR track processing runs
- **THEN** the system SHALL skip all gap filling logic
- **AND** output only PP-StructureV3 results as before
#### Scenario: Coverage threshold is configurable
- **GIVEN** gap_filling_coverage_threshold is set to 0.8
- **WHEN** PP-StructureV3 coverage is 75%
- **THEN** the system SHALL activate gap filling
- **AND** supplement uncovered regions
#### Scenario: IoU thresholds are configurable
- **GIVEN** custom IoU thresholds configured:
- gap_filling_iou_threshold: 0.2
- gap_filling_dedup_iou_threshold: 0.6
- **WHEN** evaluating coverage and deduplication
- **THEN** the system SHALL use the configured values
- **AND** apply them consistently throughout gap filling process
#### Scenario: Confidence threshold is configurable
- **GIVEN** gap_filling_confidence_threshold is set to 0.5
- **WHEN** supplementing Raw OCR regions
- **THEN** the system SHALL only include regions with confidence >= 0.5
- **AND** filter out lower confidence regions

View File

@@ -0,0 +1,44 @@
# Tasks: Add OCR Track Gap Filling
## 1. Core Implementation
- [x] 1.1 Create `gap_filling_service.py` with `GapFillingService` class
- [x] 1.2 Implement bbox coverage calculation (center-point and IoU methods)
- [x] 1.3 Implement gap detection logic (find uncovered raw OCR regions)
- [x] 1.4 Implement confidence threshold filtering for supplemented regions
- [x] 1.5 Implement element type filtering (only supplement TEXT, skip TABLE/IMAGE/FIGURE/etc.)
- [x] 1.6 Implement reading order recalculation (sort by y0, x0)
- [x] 1.7 Implement deduplication logic (skip high IoU overlaps with PP-Structure TEXT)
- [x] 1.8 Implement optional text merging for fragmented adjacent regions
## 2. Integration
- [x] 2.1 Modify `OCRToUnifiedConverter` to accept raw OCR text_regions
- [x] 2.2 Add gap filling activation condition check (coverage < 70% or element count disparity)
- [x] 2.3 Ensure coordinate alignment between raw OCR and PP-Structure (ocr_dimensions handling)
- [x] 2.4 Add page metadata (page_number, confidence, bbox) to supplemented elements
- [x] 2.5 Ensure track isolation (only OCR track, not Direct/Hybrid)
## 3. Configuration
- [x] 3.1 Add configurable parameters to settings:
- `gap_filling_enabled`: bool (default: True)
- `gap_filling_coverage_threshold`: float (default: 0.7)
- `gap_filling_iou_threshold`: float (default: 0.15)
- `gap_filling_confidence_threshold`: float (default: 0.3)
- `gap_filling_dedup_iou_threshold`: float (default: 0.5)
## 4. Testing (with env)
- [x] 4.1 Create test fixtures for the severe PP-Structure under-detection case (scan.pdf / scan2.pdf)
- [x] 4.2 Test gap detection correctly identifies uncovered regions
- [x] 4.3 Test supplemented elements have correct metadata
- [x] 4.4 Test reading order is correctly recalculated
- [x] 4.5 Test deduplication prevents duplicate text
- [x] 4.6 Test that a normal document without under-detection produces no duplicated or inflated content
- [x] 4.7 Test track isolation (Direct track unaffected)
## 5. Documentation
- [x] 5.1 Add inline documentation to GapFillingService
- [x] 5.2 Update configuration documentation with new settings

View File

@@ -0,0 +1,40 @@
# Change: Simplify PP-StructureV3 Configuration with Layout Model Selection
## Why
Current PP-StructureV3 parameter adjustment UI exposes 7 technical ML parameters (thresholds, ratios, merge modes) that are difficult for end users to understand. Meanwhile, switching to a different layout detection model (e.g., CDLA-trained models for Chinese documents) would have a much greater impact on OCR quality than fine-tuning these parameters.
**Problems with current approach:**
- Users don't understand what `layout_detection_threshold` or `text_det_unclip_ratio` mean
- Wrong parameter values can make OCR results worse
- The default model (PubLayNet-based) is optimized for English academic papers, not Chinese business documents
- Model selection is far more impactful than parameter tuning
## What Changes
### Backend Changes
- **REMOVED**: API parameter `pp_structure_params` from task start endpoint
- **ADDED**: New API parameter `layout_model` with predefined options:
- `"default"` - Standard model (PubLayNet-based, for English documents)
- `"chinese"` - PP-DocLayout-S model (for Chinese documents, forms, contracts)
- `"cdla"` - CDLA model (alternative Chinese document layout model)
- **MODIFIED**: PP-StructureV3 initialization uses `layout_detection_model_name` based on selection
- Keep fine-tuning parameters in backend `config.py` with optimized defaults
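A hedged sketch of the mapping this implies (the actual resolution logic in `ocr_service.py` may differ; `resolve_layout_model_name` is an illustrative name). `None` stands for "fall back to the built-in PubLayNet-based model":

```python
from typing import Optional

# API-level option -> PaddleOCR layout_detection_model_name
LAYOUT_MODEL_MAP = {
    "chinese": "PP-DocLayout-S",
    "cdla": "picodet_lcnet_x1_0_fgd_layout_cdla",
    "default": None,  # use PP-StructureV3's built-in PubLayNet-based model
}

def resolve_layout_model_name(layout_model: Optional[str]) -> Optional[str]:
    """Translate the user-facing layout_model option into a concrete model name."""
    if layout_model is None:
        layout_model = "chinese"  # documented default
    if layout_model not in LAYOUT_MODEL_MAP:
        raise ValueError(f"Unknown layout_model: {layout_model!r}")
    return LAYOUT_MODEL_MAP[layout_model]
```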
### Frontend Changes
- **REMOVED**: `PPStructureParams.tsx` component (slider/dropdown UI for 7 parameters)
- **ADDED**: Simple radio button/dropdown for layout model selection with clear descriptions
- **MODIFIED**: Task start request body to send `layout_model` instead of `pp_structure_params`
### API Changes
- **BREAKING**: Remove `pp_structure_params` from `POST /api/v2/tasks/{task_id}/start`
- **ADDED**: New optional parameter `layout_model: "default" | "chinese" | "cdla"`
## Impact
- Affected specs: `ocr-processing`
- Affected code:
- Backend: `app/routers/tasks.py`, `app/services/ocr_service.py`, `app/core/config.py`
- Frontend: `src/components/PPStructureParams.tsx` (remove), `src/types/apiV2.ts`, task start form
- Breaking change: Clients using `pp_structure_params` will need to migrate to `layout_model`
- User impact: Simpler UI, better default OCR quality for Chinese documents

View File

@@ -0,0 +1,86 @@
# ocr-processing Specification Delta
## REMOVED Requirements
### Requirement: Frontend-Adjustable PP-StructureV3 Parameters
**Reason**: Complex ML parameters are difficult for end users to understand and tune. Model selection provides better UX and more significant quality improvements.
**Migration**: Replace `pp_structure_params` API parameter with `layout_model` parameter.
### Requirement: PP-StructureV3 Parameter UI Controls
**Reason**: Slider/dropdown UI for 7 technical parameters adds complexity without proportional benefit. Simple model selection is more user-friendly.
**Migration**: Remove `PPStructureParams.tsx` component, add `LayoutModelSelector.tsx` component.
## ADDED Requirements
### Requirement: Layout Model Selection
The system SHALL allow users to select a layout detection model optimized for their document type, providing a simple choice between pre-configured models instead of manual parameter tuning.
#### Scenario: User selects Chinese document model
- **GIVEN** a user is processing Chinese business documents (forms, contracts, invoices)
- **WHEN** the user selects "Chinese Document Model" (PP-DocLayout-S)
- **THEN** the OCR engine SHALL use the PP-DocLayout-S layout detection model
- **AND** the model SHALL be optimized for 23 Chinese document element types
- **AND** table and form detection accuracy SHALL be improved over the default model
#### Scenario: User selects standard model for English documents
- **GIVEN** a user is processing English academic papers or reports
- **WHEN** the user selects "Standard Model" (PubLayNet-based)
- **THEN** the OCR engine SHALL use the default PubLayNet-based layout detection model
- **AND** the model SHALL be optimized for English document layouts
#### Scenario: User selects CDLA model for specialized Chinese layout
- **GIVEN** a user is processing Chinese documents with complex layouts
- **WHEN** the user selects "CDLA Model"
- **THEN** the OCR engine SHALL use the picodet_lcnet_x1_0_fgd_layout_cdla model
- **AND** the model SHALL provide specialized Chinese document layout analysis
#### Scenario: Layout model is sent via API request
- **GIVEN** a frontend application with model selection UI
- **WHEN** the user starts task processing with a selected model
- **THEN** the frontend SHALL send the model choice in the request body:
```json
POST /api/v2/tasks/{task_id}/start
{
"use_dual_track": true,
"force_track": "ocr",
"language": "ch",
"layout_model": "chinese"
}
```
- **AND** the backend SHALL configure PP-StructureV3 with the corresponding model
#### Scenario: Default model when not specified
- **GIVEN** an API request without `layout_model` parameter
- **WHEN** the task is started
- **THEN** the system SHALL use "chinese" (PP-DocLayout-S) as the default model
- **AND** processing SHALL work correctly without requiring model selection
#### Scenario: Invalid model name is rejected
- **GIVEN** a request with an invalid `layout_model` value
- **WHEN** the user sends `layout_model: "invalid_model"`
- **THEN** the API SHALL return 422 Validation Error
- **AND** provide a clear error message listing valid model options
### Requirement: Layout Model Selection UI
The frontend SHALL provide a simple, user-friendly interface for selecting layout detection models with clear descriptions of each option.
#### Scenario: Model options are displayed with descriptions
- **GIVEN** the model selection UI is displayed
- **WHEN** the user views the available options
- **THEN** the UI SHALL show the following options:
- "Chinese Document Model (Recommended)" - for Chinese forms, contracts, invoices
- "Standard Model" - for English academic papers, reports
- "CDLA Model" - for specialized Chinese layout analysis
- **AND** each option SHALL have a brief description of its use case
#### Scenario: Chinese model is selected by default
- **GIVEN** the user opens the task processing interface
- **WHEN** the model selection is displayed
- **THEN** "Chinese Document Model" SHALL be pre-selected as the default
- **AND** the user MAY change the selection before starting processing
#### Scenario: Model selection is visible only for OCR track
- **GIVEN** a document processing interface
- **WHEN** the user selects processing track
- **THEN** layout model selection SHALL be shown ONLY when OCR track is selected or auto-detected
- **AND** SHALL be hidden for Direct track (which does not use PP-StructureV3)

View File

@@ -0,0 +1,56 @@
# Implementation Tasks
## 1. Backend API Changes
- [x] 1.1 Update `app/schemas/task.py` to add `layout_model` enum type
- [x] 1.2 Update `app/routers/tasks.py` to replace `pp_structure_params` with `layout_model` parameter
- [x] 1.3 Update `app/services/ocr_service.py` to map `layout_model` to `layout_detection_model_name`
- [x] 1.4 Remove custom PP-Structure engine creation logic (use model selection instead)
- [x] 1.5 Add backward compatibility: default to "chinese" if no model specified
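A sketch of how the enum constraint behind tasks 1.1-1.5 might look with Pydantic; the class and field names here are illustrative, not the actual contents of `app/schemas/task.py`:

```python
from typing import Literal, Optional
from pydantic import BaseModel

LayoutModel = Literal["chinese", "default", "cdla"]

class ProcessingOptions(BaseModel):
    use_dual_track: bool = True
    force_track: Optional[str] = None
    language: str = "ch"
    # Values outside the Literal are rejected with a 422 validation error.
    layout_model: LayoutModel = "chinese"
```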
## 2. Backend Configuration
- [x] 2.1 Keep `layout_detection_model_name` in `config.py` as fallback default
- [x] 2.2 Keep fine-tuning parameters in `config.py` (not exposed to API)
- [x] 2.3 Document available layout models in config comments
## 3. Frontend Changes
- [x] 3.1 Remove `PPStructureParams.tsx` component
- [x] 3.2 Update `src/types/apiV2.ts`:
- Remove `PPStructureV3Params` interface
- Add `LayoutModel` type: `"default" | "chinese" | "cdla"`
- Update `ProcessingOptions` to use `layout_model` instead of `pp_structure_params`
- [x] 3.3 Create `LayoutModelSelector.tsx` component with:
- Radio buttons or dropdown for model selection
- Clear descriptions for each model option
- Default selection: "chinese"
- [x] 3.4 Update task start form to use new `LayoutModelSelector`
- [x] 3.5 Update API calls to send `layout_model` instead of `pp_structure_params`
## 4. Internationalization
- [x] 4.1 Add i18n strings for layout model options:
- `layoutModel.default`: "Standard Model (English documents)"
- `layoutModel.chinese`: "Chinese Document Model (Recommended)"
- `layoutModel.cdla`: "CDLA Model (Chinese layout analysis)"
- [x] 4.2 Add i18n strings for model descriptions
## 5. Testing
- [x] 5.1 Create new tests for `layout_model` parameter (`test_layout_model_api.py`, `test_layout_model.py`)
- [x] 5.2 Archive tests for `pp_structure_params` validation (moved to `tests/archived/`)
- [x] 5.3 Add tests for layout model selection (19 tests passing)
- [x] 5.4 Test backward compatibility (no model specified → use chinese default)
## 6. Documentation
- [ ] 6.1 Update API documentation for task start endpoint
- [ ] 6.2 Remove PP-Structure parameter documentation
- [ ] 6.3 Add layout model selection documentation
## 7. Cleanup
- [x] 7.1 Remove localStorage keys for PP-Structure params (`pp_structure_params_presets`, `pp_structure_params_last_used`)
- [x] 7.2 Remove any unused imports/types related to PP-Structure params
- [x] 7.3 Archive old PP-Structure params test files

View File

@@ -3,100 +3,186 @@
## Purpose
TBD - created by archiving change frontend-adjustable-ppstructure-params. Update Purpose after archive.
## Requirements
### Requirement: Frontend-Adjustable PP-StructureV3 Parameters
The system SHALL allow frontend users to dynamically adjust PP-StructureV3 OCR parameters for fine-tuning document processing without backend configuration changes.
### Requirement: OCR Track Gap Filling with Raw OCR Regions
#### Scenario: User adjusts layout detection threshold
- **GIVEN** a user is processing a document with OCR track
- **WHEN** the user sets `layout_detection_threshold` to 0.1 (lower than default 0.2)
- **THEN** the OCR engine SHALL detect more layout blocks including weak signals
- **AND** the processing SHALL use the custom parameter instead of backend defaults
- **AND** the custom parameter SHALL NOT be cached for reuse
The system SHALL detect and fill gaps in PP-StructureV3 output by supplementing with Raw OCR text regions when significant content loss is detected.
#### Scenario: User selects high-quality preset configuration
- **GIVEN** a user wants to process a complex document with many small text elements
- **WHEN** the user selects "High Quality" preset mode
- **THEN** the system SHALL automatically set:
- `layout_detection_threshold` to 0.1
- `layout_nms_threshold` to 0.15
- `text_det_thresh` to 0.1
- `text_det_box_thresh` to 0.2
- **AND** process the document with these optimized parameters
#### Scenario: Gap filling activates when coverage is low
- **GIVEN** an OCR track processing task
- **WHEN** PP-StructureV3 outputs elements that cover less than 70% of Raw OCR text regions
- **THEN** the system SHALL activate gap filling
- **AND** identify Raw OCR regions not covered by any PP-StructureV3 element
- **AND** supplement these regions as TEXT elements in the output
#### Scenario: User adjusts text detection parameters
- **GIVEN** a document with low-contrast text
- **WHEN** the user sets:
- `text_det_thresh` to 0.05 (very low)
- `text_det_unclip_ratio` to 1.5 (larger boxes)
- **THEN** the OCR SHALL detect more small and low-contrast text
- **AND** text bounding boxes SHALL be expanded by the specified ratio
#### Scenario: Coverage is determined by center-point and IoU
- **GIVEN** a Raw OCR text region with bounding box
- **WHEN** checking if the region is covered by PP-StructureV3
- **THEN** the region SHALL be considered covered if its center point falls inside any PP-StructureV3 element bbox
- **OR** if IoU with any PP-StructureV3 element exceeds 0.15 threshold
- **AND** regions not meeting either criterion SHALL be marked as uncovered
#### Scenario: Parameters are sent via API request body
- **GIVEN** a frontend application with parameter adjustment UI
- **WHEN** the user starts task processing with custom parameters
- **THEN** the frontend SHALL send parameters in the request body (not query params):
#### Scenario: Only TEXT elements are supplemented
- **GIVEN** uncovered Raw OCR regions identified for supplementation
- **WHEN** PP-StructureV3 has detected TABLE, IMAGE, FIGURE, FLOWCHART, HEADER, or FOOTER elements
- **THEN** the system SHALL NOT supplement regions that overlap with these structural elements
- **AND** only supplement regions as TEXT type to preserve structural integrity
#### Scenario: Supplemented regions meet confidence threshold
- **GIVEN** Raw OCR regions to be supplemented
- **WHEN** a region has confidence score below 0.3
- **THEN** the system SHALL skip that region
- **AND** only supplement regions with confidence >= 0.3
#### Scenario: Deduplication prevents repeated text
- **GIVEN** a Raw OCR region being considered for supplementation
- **WHEN** the region has IoU > 0.5 with any existing PP-StructureV3 TEXT element
- **THEN** the system SHALL skip that region to prevent duplicate text
- **AND** the original PP-StructureV3 element SHALL be preserved
#### Scenario: Reading order is recalculated after gap filling
- **GIVEN** supplemented elements have been added to the page
- **WHEN** assembling the final element list
- **THEN** the system SHALL recalculate reading order for the entire page
- **AND** sort elements by y0 coordinate (top to bottom) then x0 (left to right)
- **AND** ensure logical document flow is maintained
#### Scenario: Coordinate alignment with ocr_dimensions
- **GIVEN** Raw OCR processing may involve image resizing
- **WHEN** comparing Raw OCR bbox with PP-StructureV3 bbox
- **THEN** the system SHALL use ocr_dimensions to normalize coordinates
- **AND** ensure both sources reference the same coordinate space
- **AND** prevent coverage misdetection due to scale differences
#### Scenario: Supplemented elements have complete metadata
- **GIVEN** a Raw OCR region being added as supplemented element
- **WHEN** creating the DocumentElement
- **THEN** the element SHALL include page_number
- **AND** include confidence score from Raw OCR
- **AND** include original bbox coordinates
- **AND** optionally include source indicator for debugging
### Requirement: Gap Filling Track Isolation
The gap filling feature SHALL only apply to OCR track processing and SHALL NOT affect Direct or Hybrid track outputs.
#### Scenario: Gap filling only activates for OCR track
- **GIVEN** a document processing task
- **WHEN** the processing track is OCR
- **THEN** the system SHALL evaluate and apply gap filling as needed
- **AND** produce enhanced output with supplemented content
#### Scenario: Direct track is unaffected
- **GIVEN** a document processing task with Direct track
- **WHEN** the task is processed
- **THEN** the system SHALL NOT invoke any gap filling logic
- **AND** produce output identical to current Direct track behavior
#### Scenario: Hybrid track is unaffected
- **GIVEN** a document processing task with Hybrid track
- **WHEN** the task is processed
- **THEN** the system SHALL NOT invoke gap filling logic
- **AND** use existing Hybrid track processing pipeline
### Requirement: Gap Filling Configuration
The system SHALL provide configurable parameters for gap filling behavior.
#### Scenario: Gap filling can be disabled via configuration
- **GIVEN** gap_filling_enabled is set to false in configuration
- **WHEN** OCR track processing runs
- **THEN** the system SHALL skip all gap filling logic
- **AND** output only PP-StructureV3 results as before
#### Scenario: Coverage threshold is configurable
- **GIVEN** gap_filling_coverage_threshold is set to 0.8
- **WHEN** PP-StructureV3 coverage is 75%
- **THEN** the system SHALL activate gap filling
- **AND** supplement uncovered regions
#### Scenario: IoU thresholds are configurable
- **GIVEN** custom IoU thresholds configured:
- gap_filling_iou_threshold: 0.2
- gap_filling_dedup_iou_threshold: 0.6
- **WHEN** evaluating coverage and deduplication
- **THEN** the system SHALL use the configured values
- **AND** apply them consistently throughout gap filling process
#### Scenario: Confidence threshold is configurable
- **GIVEN** gap_filling_confidence_threshold is set to 0.5
- **WHEN** supplementing Raw OCR regions
- **THEN** the system SHALL only include regions with confidence >= 0.5
- **AND** filter out lower confidence regions
### Requirement: Layout Model Selection
The system SHALL allow users to select a layout detection model optimized for their document type, providing a simple choice between pre-configured models instead of manual parameter tuning.
#### Scenario: User selects Chinese document model
- **GIVEN** a user is processing Chinese business documents (forms, contracts, invoices)
- **WHEN** the user selects "Chinese Document Model" (PP-DocLayout-S)
- **THEN** the OCR engine SHALL use the PP-DocLayout-S layout detection model
- **AND** the model SHALL be optimized for 23 Chinese document element types
- **AND** table and form detection accuracy SHALL be improved over the default model
#### Scenario: User selects standard model for English documents
- **GIVEN** a user is processing English academic papers or reports
- **WHEN** the user selects "Standard Model" (PubLayNet-based)
- **THEN** the OCR engine SHALL use the default PubLayNet-based layout detection model
- **AND** the model SHALL be optimized for English document layouts
#### Scenario: User selects CDLA model for specialized Chinese layout
- **GIVEN** a user is processing Chinese documents with complex layouts
- **WHEN** the user selects "CDLA Model"
- **THEN** the OCR engine SHALL use the picodet_lcnet_x1_0_fgd_layout_cdla model
- **AND** the model SHALL provide specialized Chinese document layout analysis
#### Scenario: Layout model is sent via API request
- **GIVEN** a frontend application with model selection UI
- **WHEN** the user starts task processing with a selected model
- **THEN** the frontend SHALL send the model choice in the request body:
```json
POST /api/v2/tasks/{task_id}/start
{
"use_dual_track": true,
"force_track": "ocr",
"language": "ch",
"pp_structure_params": {
"layout_detection_threshold": 0.15,
"layout_merge_bboxes_mode": "small",
"text_det_thresh": 0.1
}
"layout_model": "chinese"
}
```
- **AND** the backend SHALL parse and apply these parameters
- **AND** the backend SHALL configure PP-StructureV3 with the corresponding model
#### Scenario: Backward compatibility is maintained
- **GIVEN** existing API clients without PP-StructureV3 parameter support
- **WHEN** a task is started without `pp_structure_params`
- **THEN** the system SHALL use backend default settings
- **AND** processing SHALL work exactly as before
- **AND** no errors SHALL occur
#### Scenario: Default model when not specified
- **GIVEN** an API request without `layout_model` parameter
- **WHEN** the task is started
- **THEN** the system SHALL use "chinese" (PP-DocLayout-S) as the default model
- **AND** processing SHALL work correctly without requiring model selection
#### Scenario: Invalid parameters are rejected
- **GIVEN** a request with invalid parameter values
- **WHEN** the user sends:
- `layout_detection_threshold` = 1.5 (exceeds max 1.0)
- `layout_merge_bboxes_mode` = "invalid" (not in allowed values)
#### Scenario: Invalid model name is rejected
- **GIVEN** a request with an invalid `layout_model` value
- **WHEN** the user sends `layout_model: "invalid_model"`
- **THEN** the API SHALL return 422 Validation Error
- **AND** provide clear error messages about invalid parameters
- **AND** provide a clear error message listing valid model options
#### Scenario: Custom parameters affect only current processing
- **GIVEN** multiple concurrent OCR processing tasks
- **WHEN** Task A uses custom parameters and Task B uses defaults
- **THEN** Task A SHALL process with its custom parameters
- **AND** Task B SHALL process with default parameters
- **AND** no parameter interference SHALL occur between tasks
### Requirement: Layout Model Selection UI
The frontend SHALL provide a simple, user-friendly interface for selecting layout detection models with clear descriptions of each option.
### Requirement: PP-StructureV3 Parameter UI Controls
The frontend SHALL provide intuitive UI controls for adjusting PP-StructureV3 parameters with appropriate constraints and help text.
#### Scenario: Model options are displayed with descriptions
- **GIVEN** the model selection UI is displayed
- **WHEN** the user views the available options
- **THEN** the UI SHALL show the following options:
- "Chinese Document Model (Recommended)" - for Chinese forms, contracts, invoices
- "Standard Model" - for English academic papers, reports
- "CDLA Model" - for specialized Chinese layout analysis
- **AND** each option SHALL have a brief description of its use case
#### Scenario: Slider controls for numeric parameters
- **GIVEN** the parameter adjustment UI is displayed
- **WHEN** the user adjusts a numeric parameter slider
- **THEN** the slider SHALL enforce min/max constraints:
- Threshold parameters: 0.0 to 1.0
- Ratio parameters: > 0 (typically 0.5 to 3.0)
- **AND** display current value in real-time
- **AND** show help text explaining the parameter effect
#### Scenario: Chinese model is selected by default
- **GIVEN** the user opens the task processing interface
- **WHEN** the model selection is displayed
- **THEN** "Chinese Document Model" SHALL be pre-selected as the default
- **AND** the user MAY change the selection before starting processing
#### Scenario: Dropdown for merge mode selection
- **GIVEN** the layout merge mode parameter
- **WHEN** the user clicks the dropdown
- **THEN** the UI SHALL show exactly three options:
- "small" (conservative merging)
- "large" (aggressive merging)
- "union" (middle ground)
- **AND** display description for each option
#### Scenario: Parameters shown only for OCR track
#### Scenario: Model selection is visible only for OCR track
- **GIVEN** a document processing interface
- **WHEN** the user selects processing track
- **THEN** PP-StructureV3 parameters SHALL be shown ONLY when OCR track is selected
- **AND** SHALL be hidden for Direct track
- **AND** SHALL be disabled for Auto track until track is determined
- **THEN** layout model selection SHALL be shown ONLY when OCR track is selected or auto-detected
- **AND** SHALL be hidden for Direct track (which does not use PP-StructureV3)