From 940a406dcec6b2da184524b177c330b1b3e97e13 Mon Sep 17 00:00:00 2001 From: egg Date: Thu, 11 Dec 2025 11:55:39 +0800 Subject: [PATCH] chore: backup before code cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .claude/commands/openspec/apply.md | 23 + .claude/commands/openspec/archive.md | 27 + .claude/commands/openspec/proposal.md | 28 + ...a2b3c4d5e6_fix_sessions_schema_mismatch.py | 53 ++ backend/app/core/config.py | 146 +++- backend/app/routers/tasks.py | 48 +- backend/app/schemas/task.py | 144 ++++ .../app/services/cell_validation_engine.py | 583 +++++++++++++ backend/app/services/gap_filling_service.py | 217 ++++- backend/app/services/ocr_service.py | 101 ++- .../app/services/ocr_to_unified_converter.py | 128 ++- backend/app/services/pdf_generator_service.py | 787 ++++++++++++++--- backend/app/services/pdf_table_renderer.py | 391 +++++++++ backend/app/services/pp_structure_enhanced.py | 303 ++++++- .../app/services/processing_orchestrator.py | 8 + .../app/services/table_column_corrector.py | 790 +++++++++++++++++ .../app/services/table_content_rebuilder.py | 806 ++++++++++++++++++ backend/app/services/text_region_renderer.py | 664 +++++++++++++++ docs/ocr-presets.md | 61 ++ frontend/src/components/OCRPresetSelector.tsx | 358 ++++++++ frontend/src/pages/ProcessingPage.tsx | 18 +- frontend/src/types/apiV2.ts | 54 ++ .../proposal.md | 73 ++ .../specs/ocr-processing/spec.md | 64 ++ .../tasks.md | 124 +++ .../design.md | 0 .../proposal.md | 0 .../specs/document-processing/spec.md | 2 + .../tasks.md | 0 .../design.md | 227 +++++ .../proposal.md | 116 +++ .../specs/ocr-processing/spec.md | 96 +++ .../tasks.md | 75 ++ .../test-notes.md | 14 + .../fix-ocr-track-table-rendering/design.md | 88 ++ .../fix-ocr-track-table-rendering/proposal.md | 17 + .../specs/ocr-processing/spec.md | 91 ++ .../fix-ocr-track-table-rendering/tasks.md | 34 + .../fix-table-column-alignment/design.md | 227 +++++ .../fix-table-column-alignment/proposal.md | 56 ++ .../specs/document-processing/spec.md | 59 ++ .../fix-table-column-alignment/tasks.md | 59 ++ .../improve-ocr-track-algorithm/proposal.md | 49 ++ .../specs/ocr-processing/spec.md | 142 +++ .../improve-ocr-track-algorithm/tasks.md | 54 ++ .../changes/remove-unused-code/proposal.md | 55 ++ .../specs/document-processing/spec.md | 61 ++ openspec/changes/remove-unused-code/tasks.md | 43 + .../changes/simple-text-positioning/design.md | 141 +++ .../simple-text-positioning/proposal.md | 42 + .../changes/simple-text-positioning/tasks.md | 57 ++ .../design.md | 234 +++++ .../proposal.md | 75 ++ .../specs/document-processing/spec.md | 36 + .../tasks.md | 48 ++ openspec/specs/document-processing/spec.md | 33 +- openspec/specs/ocr-processing/spec.md | 63 ++ paddle_review.md | 108 +++ 58 files changed, 8226 insertions(+), 175 deletions(-) create mode 100644 .claude/commands/openspec/apply.md create mode 100644 .claude/commands/openspec/archive.md create mode 100644 .claude/commands/openspec/proposal.md create mode 100644 backend/alembic/versions/f1a2b3c4d5e6_fix_sessions_schema_mismatch.py create mode 100644 backend/app/services/cell_validation_engine.py create mode 100644 backend/app/services/table_column_corrector.py create mode 100644 backend/app/services/table_content_rebuilder.py create mode 100644 backend/app/services/text_region_renderer.py create mode 100644 docs/ocr-presets.md create mode 100644 frontend/src/components/OCRPresetSelector.tsx create mode 100644 openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/proposal.md create mode 100644 openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/specs/ocr-processing/spec.md create mode 100644 openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/tasks.md rename openspec/changes/{refactor-dual-track-architecture => archive/2025-12-08-refactor-dual-track-architecture}/design.md (100%) rename openspec/changes/{refactor-dual-track-architecture => archive/2025-12-08-refactor-dual-track-architecture}/proposal.md (100%) rename openspec/changes/{refactor-dual-track-architecture => archive/2025-12-08-refactor-dual-track-architecture}/specs/document-processing/spec.md (99%) rename openspec/changes/{refactor-dual-track-architecture => archive/2025-12-08-refactor-dual-track-architecture}/tasks.md (100%) create mode 100644 openspec/changes/archive/2025-12-10-add-ocr-processing-presets/design.md create mode 100644 openspec/changes/archive/2025-12-10-add-ocr-processing-presets/proposal.md create mode 100644 openspec/changes/archive/2025-12-10-add-ocr-processing-presets/specs/ocr-processing/spec.md create mode 100644 openspec/changes/archive/2025-12-10-add-ocr-processing-presets/tasks.md create mode 100644 openspec/changes/archive/2025-12-10-add-ocr-processing-presets/test-notes.md create mode 100644 openspec/changes/fix-ocr-track-table-rendering/design.md create mode 100644 openspec/changes/fix-ocr-track-table-rendering/proposal.md create mode 100644 openspec/changes/fix-ocr-track-table-rendering/specs/ocr-processing/spec.md create mode 100644 openspec/changes/fix-ocr-track-table-rendering/tasks.md create mode 100644 openspec/changes/fix-table-column-alignment/design.md create mode 100644 openspec/changes/fix-table-column-alignment/proposal.md create mode 100644 openspec/changes/fix-table-column-alignment/specs/document-processing/spec.md create mode 100644 openspec/changes/fix-table-column-alignment/tasks.md create mode 100644 openspec/changes/improve-ocr-track-algorithm/proposal.md create mode 100644 openspec/changes/improve-ocr-track-algorithm/specs/ocr-processing/spec.md create mode 100644 openspec/changes/improve-ocr-track-algorithm/tasks.md create mode 100644 openspec/changes/remove-unused-code/proposal.md create mode 100644 openspec/changes/remove-unused-code/specs/document-processing/spec.md create mode 100644 openspec/changes/remove-unused-code/tasks.md create mode 100644 openspec/changes/simple-text-positioning/design.md create mode 100644 openspec/changes/simple-text-positioning/proposal.md create mode 100644 openspec/changes/simple-text-positioning/tasks.md create mode 100644 openspec/changes/use-cellboxes-for-table-rendering/design.md create mode 100644 openspec/changes/use-cellboxes-for-table-rendering/proposal.md create mode 100644 openspec/changes/use-cellboxes-for-table-rendering/specs/document-processing/spec.md create mode 100644 openspec/changes/use-cellboxes-for-table-rendering/tasks.md create mode 100644 paddle_review.md diff --git a/.claude/commands/openspec/apply.md b/.claude/commands/openspec/apply.md new file mode 100644 index 0000000..a36fd96 --- /dev/null +++ b/.claude/commands/openspec/apply.md @@ -0,0 +1,23 @@ +--- +name: OpenSpec: Apply +description: Implement an approved OpenSpec change and keep tasks in sync. +category: OpenSpec +tags: [openspec, apply] +--- + +**Guardrails** +- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required. +- Keep changes tightly scoped to the requested outcome. +- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications. + +**Steps** +Track these steps as TODOs and complete them one by one. +1. Read `changes//proposal.md`, `design.md` (if present), and `tasks.md` to confirm scope and acceptance criteria. +2. Work through tasks sequentially, keeping edits minimal and focused on the requested change. +3. Confirm completion before updating statuses—make sure every item in `tasks.md` is finished. +4. Update the checklist after all work is done so each task is marked `- [x]` and reflects reality. +5. Reference `openspec list` or `openspec show ` when additional context is required. + +**Reference** +- Use `openspec show --json --deltas-only` if you need additional context from the proposal while implementing. + diff --git a/.claude/commands/openspec/archive.md b/.claude/commands/openspec/archive.md new file mode 100644 index 0000000..dbc7695 --- /dev/null +++ b/.claude/commands/openspec/archive.md @@ -0,0 +1,27 @@ +--- +name: OpenSpec: Archive +description: Archive a deployed OpenSpec change and update specs. +category: OpenSpec +tags: [openspec, archive] +--- + +**Guardrails** +- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required. +- Keep changes tightly scoped to the requested outcome. +- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications. + +**Steps** +1. Determine the change ID to archive: + - If this prompt already includes a specific change ID (for example inside a `` block populated by slash-command arguments), use that value after trimming whitespace. + - If the conversation references a change loosely (for example by title or summary), run `openspec list` to surface likely IDs, share the relevant candidates, and confirm which one the user intends. + - Otherwise, review the conversation, run `openspec list`, and ask the user which change to archive; wait for a confirmed change ID before proceeding. + - If you still cannot identify a single change ID, stop and tell the user you cannot archive anything yet. +2. Validate the change ID by running `openspec list` (or `openspec show `) and stop if the change is missing, already archived, or otherwise not ready to archive. +3. Run `openspec archive --yes` so the CLI moves the change and applies spec updates without prompts (use `--skip-specs` only for tooling-only work). +4. Review the command output to confirm the target specs were updated and the change landed in `changes/archive/`. +5. Validate with `openspec validate --strict` and inspect with `openspec show ` if anything looks off. + +**Reference** +- Use `openspec list` to confirm change IDs before archiving. +- Inspect refreshed specs with `openspec list --specs` and address any validation issues before handing off. + diff --git a/.claude/commands/openspec/proposal.md b/.claude/commands/openspec/proposal.md new file mode 100644 index 0000000..cbb75ce --- /dev/null +++ b/.claude/commands/openspec/proposal.md @@ -0,0 +1,28 @@ +--- +name: OpenSpec: Proposal +description: Scaffold a new OpenSpec change and validate strictly. +category: OpenSpec +tags: [openspec, change] +--- + +**Guardrails** +- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required. +- Keep changes tightly scoped to the requested outcome. +- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications. +- Identify any vague or ambiguous details and ask the necessary follow-up questions before editing files. +- Do not write any code during the proposal stage. Only create design documents (proposal.md, tasks.md, design.md, and spec deltas). Implementation happens in the apply stage after approval. + +**Steps** +1. Review `openspec/project.md`, run `openspec list` and `openspec list --specs`, and inspect related code or docs (e.g., via `rg`/`ls`) to ground the proposal in current behaviour; note any gaps that require clarification. +2. Choose a unique verb-led `change-id` and scaffold `proposal.md`, `tasks.md`, and `design.md` (when needed) under `openspec/changes//`. +3. Map the change into concrete capabilities or requirements, breaking multi-scope efforts into distinct spec deltas with clear relationships and sequencing. +4. Capture architectural reasoning in `design.md` when the solution spans multiple systems, introduces new patterns, or demands trade-off discussion before committing to specs. +5. Draft spec deltas in `changes//specs//spec.md` (one folder per capability) using `## ADDED|MODIFIED|REMOVED Requirements` with at least one `#### Scenario:` per requirement and cross-reference related capabilities when relevant. +6. Draft `tasks.md` as an ordered list of small, verifiable work items that deliver user-visible progress, include validation (tests, tooling), and highlight dependencies or parallelizable work. +7. Validate with `openspec validate --strict` and resolve every issue before sharing the proposal. + +**Reference** +- Use `openspec show --json --deltas-only` or `openspec show --type spec` to inspect details when validation fails. +- Search existing requirements with `rg -n "Requirement:|Scenario:" openspec/specs` before writing new ones. +- Explore the codebase with `rg `, `ls`, or direct file reads so proposals align with current implementation realities. + diff --git a/backend/alembic/versions/f1a2b3c4d5e6_fix_sessions_schema_mismatch.py b/backend/alembic/versions/f1a2b3c4d5e6_fix_sessions_schema_mismatch.py new file mode 100644 index 0000000..22181a6 --- /dev/null +++ b/backend/alembic/versions/f1a2b3c4d5e6_fix_sessions_schema_mismatch.py @@ -0,0 +1,53 @@ +"""fix_sessions_schema_mismatch + +Revision ID: f1a2b3c4d5e6 +Revises: e51c9a16ee16 +Create Date: 2025-12-10 10:30:00.000000 + +Fix schema mismatch between SQLAlchemy model and MySQL database: +1. Remove session_token column from tool_ocr_sessions (not in model) +2. Remove is_active column from tool_ocr_sessions (not in model) +3. Note: processing_track in tool_ocr_tasks is nullable so it won't cause issues + +These columns were added to the database manually or by another migration +but are not present in the current SQLAlchemy models. +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import mysql + + +# revision identifiers, used by Alembic. +revision: str = 'f1a2b3c4d5e6' +down_revision: Union[str, None] = 'e51c9a16ee16' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """ + Remove columns from tool_ocr_sessions that exist in DB but not in model. + This fixes the "Field 'session_token' doesn't have a default value" error. + """ + # Remove session_token column (exists in DB, not in model) + op.drop_column('tool_ocr_sessions', 'session_token') + + # Remove is_active column (exists in DB, not in model) + op.drop_column('tool_ocr_sessions', 'is_active') + + +def downgrade() -> None: + """ + Re-add the columns if needed to rollback. + """ + # Re-add is_active column + op.add_column('tool_ocr_sessions', + sa.Column('is_active', mysql.TINYINT(), nullable=False, server_default='1', + comment='Whether the session is active')) + + # Re-add session_token column + op.add_column('tool_ocr_sessions', + sa.Column('session_token', sa.String(length=255), nullable=False, + comment='Session token for authentication')) diff --git a/backend/app/core/config.py b/backend/app/core/config.py index a28daad..44e2959 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -97,9 +97,115 @@ class Settings(BaseSettings): enable_region_detection: bool = Field(default=True) # Region detection for better table structure enable_text_recognition: bool = Field(default=True) # General text recognition + # Table Parsing Mode - Controls how aggressively tables are parsed + # This is the KEY setting to prevent "cell explosion" on datasheet-type documents + # Options: + # - "full": Full table recognition with cell segmentation (aggressive, may over-detect) + # - "conservative": Conservative models + disable wireless tables + higher layout threshold + # - "classification_only": Only classify table regions, no cell segmentation (recommended for datasheets) + # - "disabled": Completely disable table recognition (safest for text-heavy documents) + table_parsing_mode: str = Field( + default="conservative", + description="Table parsing mode: 'full', 'conservative', 'classification_only', 'disabled'" + ) + # Layout threshold for table detection (higher = stricter, less false positives) + # WARNING: This affects ALL layout detection, not just tables. Use with caution. + # Default None uses PaddleX default. Only set this if you understand the impact. + table_layout_threshold: Optional[float] = Field( + default=None, + description="Layout threshold for ALL element detection. Higher values = fewer elements detected." + ) + + # Cell Validation (filter over-detected table cells) + # DISABLED: This is a patch behavior - focus on getting PP-Structure output right first + cell_validation_enabled: bool = Field( + default=False, + description="Enable cell validation to filter over-detected tables" + ) + cell_validation_max_density: float = Field( + default=3.0, + description="Max cells per 10,000px². Tables exceeding this are reclassified as TEXT." + ) + cell_validation_min_cell_area: float = Field( + default=3000.0, + description="Min average cell area in px². Tables below this are reclassified as TEXT." + ) + cell_validation_min_cell_height: float = Field( + default=10.0, + description="Min average cell height in px. Tables below this are reclassified as TEXT." + ) + + # Table Content Rebuilder (rebuild table HTML from raw OCR) + # DISABLED: This is a patch behavior - focus on getting PP-Structure output right first + table_content_rebuilder_enabled: bool = Field( + default=False, + description="Enable table content rebuilder to fix PP-Structure table HTML" + ) + + # Table Quality Check (determines rendering strategy based on cell_boxes overlap) + # When enabled, tables with overlapping cell_boxes are marked as 'bad' quality + # and rendered with border-only mode instead of full cell_boxes rendering. + # Disable this to always use cell_boxes rendering regardless of quality. + table_quality_check_enabled: bool = Field( + default=False, + description="Enable cell_boxes quality check. When disabled, all tables use cell_boxes rendering." + ) + + # Table Rendering: cell_boxes-first approach + # When enabled, uses cell_boxes coordinates as the primary source for table structure + # instead of relying on HTML colspan/rowspan, which often causes grid mismatch issues + # DISABLED: Algorithm needs improvement - clustering produces incorrect grid dimensions + table_rendering_prefer_cellboxes: bool = Field( + default=False, + description="Use cell_boxes coordinates as primary table structure source for PDF rendering" + ) + table_cellboxes_row_threshold: float = Field( + default=15.0, + description="Y-coordinate threshold for row clustering when inferring grid from cell_boxes" + ) + table_cellboxes_col_threshold: float = Field( + default=15.0, + description="X-coordinate threshold for column clustering when inferring grid from cell_boxes" + ) + + # Table Column Alignment Correction (Header-Anchor Algorithm) + # Corrects PP-Structure's column assignment errors using header row X-coordinates as reference + table_column_correction_enabled: bool = Field( + default=True, + description="Enable header-anchor column correction for table cells" + ) + table_column_correction_threshold: float = Field( + default=0.5, + description="Minimum X-overlap ratio (0-1) to trigger column correction" + ) + + # Vertical Text Fragment Merging + # Detects and merges narrow vertical text blocks that were split by OCR + vertical_fragment_merge_enabled: bool = Field( + default=True, + description="Enable vertical text fragment merging for Chinese vertical text" + ) + vertical_fragment_aspect_ratio: float = Field( + default=0.3, + description="Max width/height ratio to consider as vertical text (lower = narrower)" + ) + + # Simple Text Positioning Mode (OCR Track) + # When enabled, bypasses complex table structure reconstruction and renders + # raw OCR text directly at detected positions with rotation correction. + # This is more reliable for documents where PP-Structure fails to parse tables correctly. + simple_text_positioning_enabled: bool = Field( + default=True, + description="Use simple text positioning instead of complex table reconstruction for OCR track" + ) + simple_text_positioning_debug: bool = Field( + default=False, + description="Enable debug logging for simple text positioning" + ) + # PP-StructureV3 Preprocessing (Stage 1) use_doc_orientation_classify: bool = Field(default=True) # Auto-detect and correct document rotation - use_doc_unwarping: bool = Field(default=True) # Correct document warping from photos + use_doc_unwarping: bool = Field(default=False) # Disabled: can cause document distortion/skewing use_textline_orientation: bool = Field(default=True) # Detect textline orientation # Layout Detection Parameters (Stage 3) @@ -277,11 +383,43 @@ class Settings(BaseSettings): # ===== Gap Filling Configuration ===== # Supplements PP-StructureV3 output with raw OCR regions when detection is incomplete - gap_filling_enabled: bool = Field(default=True) # Enable gap filling for OCR track + # Uses IoA (Intersection over Area) instead of IoU for better coverage detection + gap_filling_enabled: bool = Field(default=False) # Enable gap filling for OCR track gap_filling_coverage_threshold: float = Field(default=0.7) # Activate when coverage < 70% - gap_filling_iou_threshold: float = Field(default=0.15) # IoU threshold for coverage detection gap_filling_confidence_threshold: float = Field(default=0.3) # Min confidence for raw OCR regions - gap_filling_dedup_iou_threshold: float = Field(default=0.5) # IoU threshold for deduplication + + # IoA (Intersection over Area) thresholds - different thresholds per element type + # IoA = intersection_area / ocr_box_area (measures how much of OCR box is inside layout region) + gap_filling_ioa_threshold_text: float = Field( + default=0.6, + description="IoA threshold for TEXT/TITLE elements. Tolerates boundary errors." + ) + gap_filling_ioa_threshold_table: float = Field( + default=0.1, + description="IoA threshold for TABLE elements. Strict to prevent duplicate table content." + ) + gap_filling_ioa_threshold_figure: float = Field( + default=0.8, + description="IoA threshold for FIGURE/IMAGE elements. Preserves text inside figures." + ) + gap_filling_dedup_ioa_threshold: float = Field( + default=0.5, + description="IoA threshold for deduplication against existing TEXT elements." + ) + gap_filling_shrink_pixels: int = Field( + default=1, + description="Shrink OCR bbox inward by this many pixels to reduce edge duplicates." + ) + + # Use PP-StructureV3's internal OCR (overall_ocr_res) instead of separate Raw OCR + gap_filling_use_overall_ocr: bool = Field( + default=True, + description="Use PP-StructureV3's internal OCR results instead of separate inference." + ) + + # Legacy IoU threshold (deprecated, kept for backward compatibility) + gap_filling_iou_threshold: float = Field(default=0.15) # Deprecated: use IoA thresholds + gap_filling_dedup_iou_threshold: float = Field(default=0.5) # Deprecated: use gap_filling_dedup_ioa_threshold # ===== Debug Configuration ===== # Enable debug outputs for PP-StructureV3 analysis diff --git a/backend/app/routers/tasks.py b/backend/app/routers/tasks.py index 96f1e22..062c3de 100644 --- a/backend/app/routers/tasks.py +++ b/backend/app/routers/tasks.py @@ -41,6 +41,9 @@ from app.schemas.task import ( PreprocessingPreviewResponse, ImageQualityMetrics, TableDetectionConfig, + OCRPresetEnum, + OCRConfig, + OCR_PRESET_CONFIGS, ) from app.services.task_service import task_service from app.services.file_access_service import file_access_service @@ -77,7 +80,9 @@ def process_task_ocr( layout_model: Optional[str] = "chinese", preprocessing_mode: Optional[str] = "auto", preprocessing_config: Optional[dict] = None, - table_detection_config: Optional[dict] = None + table_detection_config: Optional[dict] = None, + ocr_preset: Optional[str] = None, + ocr_config: Optional[dict] = None ): """ Background task to process OCR for a task with dual-track support. @@ -97,6 +102,8 @@ def process_task_ocr( preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled') preprocessing_config: Manual preprocessing config dict (contrast, sharpen, binarize) table_detection_config: Table detection config dict (enable_wired_table, enable_wireless_table, enable_region_detection) + ocr_preset: OCR processing preset (text_heavy, datasheet, table_heavy, form, mixed, custom) + ocr_config: Custom OCR config dict (overrides preset values) """ from app.core.database import SessionLocal from app.models.task import Task @@ -135,6 +142,26 @@ def process_task_ocr( enable_region_detection=table_detection_config.get("enable_region_detection", True) ) + # Convert OCR preset and config to proper objects + from app.schemas.task import OCRPresetEnum, OCRConfig, OCR_PRESET_CONFIGS, TableParsingModeEnum + ocr_config_obj = None + if ocr_preset: + preset_enum = OCRPresetEnum(ocr_preset) + # Get preset config as base + if preset_enum in OCR_PRESET_CONFIGS: + ocr_config_obj = OCR_PRESET_CONFIGS[preset_enum].model_copy() + else: + # CUSTOM preset - use provided config or defaults + ocr_config_obj = OCRConfig() + + # Override with custom config values if provided + if ocr_config: + for key, value in ocr_config.items(): + if hasattr(ocr_config_obj, key) and value is not None: + setattr(ocr_config_obj, key, value) + + logger.info(f"OCR config resolved: preset={ocr_preset}, config={ocr_config_obj.model_dump() if ocr_config_obj else None}") + # Get task directly by database ID (bypass user isolation for background task) task = db.query(Task).filter(Task.id == task_db_id).first() if not task: @@ -184,7 +211,8 @@ def process_task_ocr( layout_model=layout_model, preprocessing_mode=preprocess_mode_enum, preprocessing_config=preprocess_config_obj, - table_detection_config=table_det_config_obj + table_detection_config=table_det_config_obj, + ocr_config=ocr_config_obj ) else: # Fall back to traditional processing (no force_track support) @@ -196,7 +224,8 @@ def process_task_ocr( layout_model=layout_model, preprocessing_mode=preprocess_mode_enum, preprocessing_config=preprocess_config_obj, - table_detection_config=table_det_config_obj + table_detection_config=table_det_config_obj, + ocr_config=ocr_config_obj ) # Calculate processing time @@ -827,6 +856,13 @@ async def start_task( } logger.info(f"Table detection: {table_detection_config}") + # Extract OCR preset and config + ocr_preset = options.ocr_preset.value if options.ocr_preset else "datasheet" + ocr_config_dict = None + if options.ocr_config: + ocr_config_dict = options.ocr_config.model_dump() + logger.info(f"OCR preset: {ocr_preset}, config: {ocr_config_dict}") + # Get task details task = task_service.get_task_by_id( db=db, @@ -876,11 +912,13 @@ async def start_task( layout_model=layout_model, preprocessing_mode=preprocessing_mode, preprocessing_config=preprocessing_config, - table_detection_config=table_detection_config + table_detection_config=table_detection_config, + ocr_preset=ocr_preset, + ocr_config=ocr_config_dict ) logger.info(f"Started OCR processing task {task_id} for user {current_user.email}") - logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}, table_detection={table_detection_config}") + logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}, table_detection={table_detection_config}, ocr_preset={ocr_preset}") return task except HTTPException: diff --git a/backend/app/schemas/task.py b/backend/app/schemas/task.py index 9743e6f..bd742fc 100644 --- a/backend/app/schemas/task.py +++ b/backend/app/schemas/task.py @@ -65,6 +65,139 @@ class PreprocessingContrastEnum(str, Enum): DOCUMENT = "document" +class OCRPresetEnum(str, Enum): + """OCR processing preset for different document types. + + Presets provide optimized PP-Structure configurations for common document types: + - TEXT_HEAVY: Reports, articles, manuals (disable table recognition) + - DATASHEET: Technical datasheets, TDS (conservative table parsing) + - TABLE_HEAVY: Financial reports, spreadsheets (full table recognition) + - FORM: Applications, surveys (conservative table parsing) + - MIXED: General documents (classification only) + - CUSTOM: User-defined settings (use ocr_config) + """ + TEXT_HEAVY = "text_heavy" # Reports, articles, manuals + DATASHEET = "datasheet" # Technical datasheets, TDS + TABLE_HEAVY = "table_heavy" # Financial reports, spreadsheets + FORM = "form" # Applications, surveys + MIXED = "mixed" # General documents + CUSTOM = "custom" # User-defined settings + + +class TableParsingModeEnum(str, Enum): + """Table parsing mode controlling how aggressively tables are parsed. + + - FULL: Full table recognition with cell segmentation (aggressive) + - CONSERVATIVE: Disable wireless tables to prevent cell explosion + - CLASSIFICATION_ONLY: Only classify table regions, no cell segmentation + - DISABLED: Completely disable table recognition + """ + FULL = "full" + CONSERVATIVE = "conservative" + CLASSIFICATION_ONLY = "classification_only" + DISABLED = "disabled" + + +class OCRConfig(BaseModel): + """OCR processing configuration for PP-Structure. + + Allows fine-grained control over PP-Structure parameters. + Use with ocr_preset=CUSTOM or to override specific preset values. + """ + # Table Processing + table_parsing_mode: TableParsingModeEnum = Field( + default=TableParsingModeEnum.CONSERVATIVE, + description="Table parsing mode: full, conservative, classification_only, disabled" + ) + enable_wired_table: bool = Field( + default=True, + description="Enable wired (bordered) table detection" + ) + enable_wireless_table: bool = Field( + default=False, + description="Enable wireless (borderless) table detection. Can cause cell explosion." + ) + + # Layout Detection + layout_threshold: Optional[float] = Field( + default=None, + ge=0.0, + le=1.0, + description="Layout detection threshold. Higher = stricter. None uses default." + ) + layout_nms_threshold: Optional[float] = Field( + default=None, + ge=0.0, + le=1.0, + description="Layout NMS threshold. None uses default." + ) + + # Preprocessing + use_doc_orientation_classify: bool = Field( + default=True, + description="Auto-detect and correct document rotation" + ) + use_doc_unwarping: bool = Field( + default=False, + description="Correct document warping. Can cause distortion." + ) + use_textline_orientation: bool = Field( + default=True, + description="Detect textline orientation" + ) + + # Recognition Modules + enable_chart_recognition: bool = Field( + default=True, + description="Enable chart/diagram recognition" + ) + enable_formula_recognition: bool = Field( + default=True, + description="Enable math formula recognition" + ) + enable_seal_recognition: bool = Field( + default=False, + description="Enable seal/stamp recognition" + ) + enable_region_detection: bool = Field( + default=True, + description="Enable region detection for better structure" + ) + + +# Preset configurations mapping +OCR_PRESET_CONFIGS = { + OCRPresetEnum.TEXT_HEAVY: OCRConfig( + table_parsing_mode=TableParsingModeEnum.DISABLED, + enable_wired_table=False, + enable_wireless_table=False, + enable_chart_recognition=False, + enable_formula_recognition=False, + ), + OCRPresetEnum.DATASHEET: OCRConfig( + table_parsing_mode=TableParsingModeEnum.CONSERVATIVE, + enable_wired_table=True, + enable_wireless_table=False, + ), + OCRPresetEnum.TABLE_HEAVY: OCRConfig( + table_parsing_mode=TableParsingModeEnum.FULL, + enable_wired_table=True, + enable_wireless_table=True, + ), + OCRPresetEnum.FORM: OCRConfig( + table_parsing_mode=TableParsingModeEnum.CONSERVATIVE, + enable_wired_table=True, + enable_wireless_table=False, + ), + OCRPresetEnum.MIXED: OCRConfig( + table_parsing_mode=TableParsingModeEnum.CLASSIFICATION_ONLY, + enable_wired_table=True, + enable_wireless_table=False, + ), + # CUSTOM uses user-provided config directly +} + + class PreprocessingConfig(BaseModel): """Preprocessing configuration for layout detection enhancement. @@ -329,6 +462,17 @@ class ProcessingOptions(BaseModel): description="Table detection config. If None, all table detection modes are enabled." ) + # OCR Processing Preset (OCR track only) + # Use presets for optimized configurations or CUSTOM with ocr_config for fine-tuning + ocr_preset: Optional[OCRPresetEnum] = Field( + default=OCRPresetEnum.DATASHEET, + description="OCR processing preset: text_heavy, datasheet, table_heavy, form, mixed, custom" + ) + ocr_config: Optional[OCRConfig] = Field( + None, + description="Custom OCR config. Used when ocr_preset=custom or to override preset values." + ) + class AnalyzeRequest(BaseModel): """Document analysis request""" diff --git a/backend/app/services/cell_validation_engine.py b/backend/app/services/cell_validation_engine.py new file mode 100644 index 0000000..a6cfb12 --- /dev/null +++ b/backend/app/services/cell_validation_engine.py @@ -0,0 +1,583 @@ +""" +Cell Validation Engine + +Validates PP-StructureV3 table detections using metric-based heuristics +to filter over-detected cells and reclassify invalid tables as TEXT elements. + +Metrics used: +- Cell density: cells per 10,000 px² (normal: 0.4-1.0, over-detected: 6+) +- Average cell area: px² per cell (normal: 10,000-25,000, over-detected: ~1,600) +- Cell height: table_height / cell_count (minimum: 10px for readable text) +""" + +import logging +from dataclasses import dataclass +from typing import List, Dict, Any, Optional, Tuple +from html.parser import HTMLParser +import re + +logger = logging.getLogger(__name__) + + +@dataclass +class CellValidationConfig: + """Configuration for cell validation thresholds.""" + max_cell_density: float = 3.0 # cells per 10,000 px² + min_avg_cell_area: float = 3000.0 # px² per cell + min_cell_height: float = 10.0 # px per cell row + enabled: bool = True + + +@dataclass +class TableValidationResult: + """Result of table validation.""" + is_valid: bool + table_element: Dict[str, Any] + reason: Optional[str] = None + metrics: Optional[Dict[str, float]] = None + + +class CellValidationEngine: + """ + Validates table elements from PP-StructureV3 output. + + Over-detected tables are identified by abnormal metrics and + reclassified as TEXT elements while preserving content. + """ + + def __init__(self, config: Optional[CellValidationConfig] = None): + self.config = config or CellValidationConfig() + + def calculate_table_metrics( + self, + bbox: List[float], + cell_boxes: List[List[float]] + ) -> Dict[str, float]: + """ + Calculate validation metrics for a table. + + Args: + bbox: Table bounding box [x0, y0, x1, y1] + cell_boxes: List of cell bounding boxes + + Returns: + Dictionary with calculated metrics + """ + if len(bbox) < 4: + return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0} + + cell_count = len(cell_boxes) + if cell_count == 0: + return {"cell_count": 0, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0} + + # Calculate table dimensions + table_width = bbox[2] - bbox[0] + table_height = bbox[3] - bbox[1] + table_area = table_width * table_height + + if table_area <= 0: + return {"cell_count": cell_count, "cell_density": 0, "avg_cell_area": 0, "avg_cell_height": 0} + + # Cell density: cells per 10,000 px² + cell_density = (cell_count / table_area) * 10000 + + # Average cell area + avg_cell_area = table_area / cell_count + + # Average cell height (table height / cell count) + avg_cell_height = table_height / cell_count + + return { + "cell_count": cell_count, + "table_width": table_width, + "table_height": table_height, + "table_area": table_area, + "cell_density": cell_density, + "avg_cell_area": avg_cell_area, + "avg_cell_height": avg_cell_height + } + + def validate_table( + self, + element: Dict[str, Any] + ) -> TableValidationResult: + """ + Validate a single table element. + + Args: + element: Table element from PP-StructureV3 output + + Returns: + TableValidationResult with validation status and metrics + """ + if not self.config.enabled: + return TableValidationResult(is_valid=True, table_element=element) + + # Extract bbox and cell_boxes + bbox = element.get("bbox", []) + cell_boxes = element.get("cell_boxes", []) + + # Tables without cells pass validation (structure-only tables) + if not cell_boxes: + return TableValidationResult( + is_valid=True, + table_element=element, + reason="No cells to validate" + ) + + # Calculate metrics + metrics = self.calculate_table_metrics(bbox, cell_boxes) + + # Check cell density + if metrics["cell_density"] > self.config.max_cell_density: + return TableValidationResult( + is_valid=False, + table_element=element, + reason=f"Cell density {metrics['cell_density']:.2f} exceeds threshold {self.config.max_cell_density}", + metrics=metrics + ) + + # Check average cell area + if metrics["avg_cell_area"] < self.config.min_avg_cell_area: + return TableValidationResult( + is_valid=False, + table_element=element, + reason=f"Avg cell area {metrics['avg_cell_area']:.0f}px² below threshold {self.config.min_avg_cell_area}px²", + metrics=metrics + ) + + # Check cell height + if metrics["avg_cell_height"] < self.config.min_cell_height: + return TableValidationResult( + is_valid=False, + table_element=element, + reason=f"Avg cell height {metrics['avg_cell_height']:.1f}px below threshold {self.config.min_cell_height}px", + metrics=metrics + ) + + # Content-based validation: check if content looks like prose vs tabular data + content_check = self._validate_table_content(element) + if not content_check["is_tabular"]: + return TableValidationResult( + is_valid=False, + table_element=element, + reason=content_check["reason"], + metrics=metrics + ) + + return TableValidationResult( + is_valid=True, + table_element=element, + metrics=metrics + ) + + def _validate_table_content(self, element: Dict[str, Any]) -> Dict[str, Any]: + """ + Validate table content to detect false positive tables. + + Checks: + 1. Sparsity: text coverage ratio (text area / table area) + 2. Header: does table have proper header structure + 3. Key-Value: for 2-col tables, is it a key-value list or random layout + 4. Prose: are cells containing long prose text + + Returns: + Dict with is_tabular (bool) and reason (str) + """ + html_content = element.get("content", "") + bbox = element.get("bbox", []) + cell_boxes = element.get("cell_boxes", []) + + if not html_content or ' 3: + # Large table without header is suspicious + logger.debug(f"Table has no header structure with {num_rows} rows") + + # === Check 3: Key-Value pattern for 2-column tables === + if num_cols == 2: + kv_result = self._check_key_value_pattern(row_data) + if kv_result["is_kv_list"] and kv_result["confidence"] > 0.7: + # High confidence key-value list - keep as table but log + logger.debug(f"Table identified as key-value list (conf={kv_result['confidence']:.2f})") + elif not kv_result["is_kv_list"] and kv_result["is_random_layout"]: + # Random 2-column layout, not a real table + return { + "is_tabular": False, + "reason": f"random_two_column_layout (not key-value)" + } + + # === Check 4: Prose content === + long_cells = [c for c in all_cells if c["length"] > 80] + prose_ratio = len(long_cells) / len(all_cells) if all_cells else 0 + if prose_ratio > 0.3: + return { + "is_tabular": False, + "reason": f"prose_content ({len(long_cells)}/{len(all_cells)} cells > 80 chars)" + } + + # === Check 5: Section header as table === + if num_rows <= 2 and num_cols <= 2: + first_row = row_data[0] if row_data else [] + if len(first_row) == 1: + text = first_row[0]["text"] + if text.isupper() and len(text) < 50: + return { + "is_tabular": False, + "reason": f"section_header_only ({text[:30]})" + } + + return {"is_tabular": True, "reason": "content_valid"} + + except Exception as e: + logger.warning(f"Content validation failed: {e}") + return {"is_tabular": True, "reason": f"validation_error: {e}"} + + def _check_sparsity( + self, + bbox: List[float], + cell_boxes: List[List[float]], + all_cells: List[Dict] + ) -> Dict[str, Any]: + """ + Check text coverage ratio (sparsity). + + Two-column layouts have large empty gaps in the middle. + Real tables have more uniform cell distribution. + """ + if len(bbox) < 4: + return {"is_valid": True, "reason": "no_bbox"} + + table_width = bbox[2] - bbox[0] + table_height = bbox[3] - bbox[1] + table_area = table_width * table_height + + if table_area <= 0: + return {"is_valid": True, "reason": "invalid_area"} + + # Calculate text area from cell_boxes + if cell_boxes: + text_area = 0 + for cb in cell_boxes: + if len(cb) >= 4: + w = abs(cb[2] - cb[0]) + h = abs(cb[3] - cb[1]) + text_area += w * h + coverage = text_area / table_area + else: + # Estimate from cell content length + total_chars = sum(c["length"] for c in all_cells) + # Rough estimate: 1 char ≈ 8x12 pixels = 96 px² + estimated_text_area = total_chars * 96 + coverage = min(estimated_text_area / table_area, 1.0) + + # Very sparse table (< 15% coverage) is suspicious + if coverage < 0.15: + return { + "is_valid": False, + "reason": f"sparse_content (coverage={coverage:.1%})" + } + + return {"is_valid": True, "coverage": coverage} + + def _check_header_structure( + self, + row_data: List[List[Dict]], + num_cols: int + ) -> Dict[str, Any]: + """ + Check if table has proper header structure. + + Real tables usually have: + - First row with elements + - Or first row with different content pattern (labels vs values) + """ + if not row_data: + return {"has_header": False} + + first_row = row_data[0] + + # Check for elements + th_count = sum(1 for c in first_row if c.get("is_header", False)) + if th_count > 0 and th_count >= len(first_row) * 0.5: + return {"has_header": True, "type": "th_elements"} + + # Check for header-like content (short, distinct from body) + if len(row_data) > 1: + first_row_avg_len = sum(c["length"] for c in first_row) / len(first_row) if first_row else 0 + body_rows = row_data[1:] + body_cells = [c for row in body_rows for c in row] + body_avg_len = sum(c["length"] for c in body_cells) / len(body_cells) if body_cells else 0 + + # Header row should be shorter (labels) than body (data) + if first_row_avg_len < body_avg_len * 0.7: + return {"has_header": True, "type": "short_labels"} + + return {"has_header": False} + + def _check_key_value_pattern( + self, + row_data: List[List[Dict]] + ) -> Dict[str, Any]: + """ + For 2-column tables, check if it's a key-value list. + + Key-value characteristics: + - Left column: short labels (< 30 chars) + - Right column: values (can be longer) + - Consistent pattern across rows + + Random layout characteristics: + - Both columns have similar length distribution + - No clear label-value relationship + """ + if not row_data: + return {"is_kv_list": False, "is_random_layout": False, "confidence": 0} + + left_lengths = [] + right_lengths = [] + kv_rows = 0 + total_rows = 0 + + for row in row_data: + if len(row) != 2: + continue + total_rows += 1 + left = row[0] + right = row[1] + left_lengths.append(left["length"]) + right_lengths.append(right["length"]) + + # Key-value pattern: left is short label, right is value + if left["length"] < 40 and left["length"] < right["length"] * 2: + kv_rows += 1 + + if total_rows == 0: + return {"is_kv_list": False, "is_random_layout": False, "confidence": 0} + + kv_ratio = kv_rows / total_rows + avg_left = sum(left_lengths) / len(left_lengths) if left_lengths else 0 + avg_right = sum(right_lengths) / len(right_lengths) if right_lengths else 0 + + # High KV ratio and left column is shorter = key-value list + if kv_ratio > 0.6 and avg_left < avg_right: + return { + "is_kv_list": True, + "is_random_layout": False, + "confidence": kv_ratio, + "avg_left": avg_left, + "avg_right": avg_right + } + + # Similar lengths on both sides = random layout + if avg_left > 0 and 0.5 < avg_right / avg_left < 2.0: + # Both columns have similar content length + return { + "is_kv_list": False, + "is_random_layout": True, + "confidence": 1 - kv_ratio, + "avg_left": avg_left, + "avg_right": avg_right + } + + return { + "is_kv_list": False, + "is_random_layout": False, + "confidence": 0, + "avg_left": avg_left, + "avg_right": avg_right + } + + def extract_text_from_table_html(self, html_content: str) -> str: + """ + Extract plain text from table HTML content. + + Args: + html_content: HTML string containing table structure + + Returns: + Plain text extracted from table cells + """ + if not html_content: + return "" + + try: + class TableTextExtractor(HTMLParser): + def __init__(self): + super().__init__() + self.text_parts = [] + self.in_cell = False + + def handle_starttag(self, tag, attrs): + if tag in ('td', 'th'): + self.in_cell = True + + def handle_endtag(self, tag): + if tag in ('td', 'th'): + self.in_cell = False + + def handle_data(self, data): + if self.in_cell: + stripped = data.strip() + if stripped: + self.text_parts.append(stripped) + + parser = TableTextExtractor() + parser.feed(html_content) + return ' '.join(parser.text_parts) + except Exception as e: + logger.warning(f"Failed to parse table HTML: {e}") + # Fallback: strip HTML tags with regex + text = re.sub(r'<[^>]+>', ' ', html_content) + text = re.sub(r'\s+', ' ', text).strip() + return text + + def reclassify_as_text(self, element: Dict[str, Any]) -> Dict[str, Any]: + """ + Convert an over-detected table element to a TEXT element. + + Args: + element: Table element to reclassify + + Returns: + New TEXT element with preserved content + """ + # Extract text content from HTML + html_content = element.get("content", "") + text_content = self.extract_text_from_table_html(html_content) + + # Create new TEXT element + text_element = { + "element_id": element.get("element_id", ""), + "type": "text", + "original_type": "table_reclassified", # Mark as reclassified + "content": text_content, + "page": element.get("page", 0), + "bbox": element.get("bbox", []), + "index": element.get("index", 0), + "confidence": element.get("confidence", 1.0), + "reclassified_from": "table", + "reclassification_reason": "over_detection" + } + + return text_element + + def validate_and_filter_elements( + self, + elements: List[Dict[str, Any]] + ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: + """ + Validate all elements and filter/reclassify over-detected tables. + + Args: + elements: List of elements from PP-StructureV3 output + + Returns: + Tuple of (filtered_elements, statistics) + """ + filtered_elements = [] + stats = { + "total_tables": 0, + "valid_tables": 0, + "reclassified_tables": 0, + "reclassification_details": [] + } + + for element in elements: + if element.get("type") != "table": + # Non-table elements pass through unchanged + filtered_elements.append(element) + continue + + stats["total_tables"] += 1 + + # Validate table + result = self.validate_table(element) + + if result.is_valid: + stats["valid_tables"] += 1 + filtered_elements.append(element) + else: + # Reclassify as TEXT + stats["reclassified_tables"] += 1 + text_element = self.reclassify_as_text(element) + filtered_elements.append(text_element) + + stats["reclassification_details"].append({ + "element_id": element.get("element_id"), + "reason": result.reason, + "metrics": result.metrics + }) + + logger.info( + f"Reclassified table {element.get('element_id')} as TEXT: {result.reason}" + ) + + # Re-sort by reading order (y0 then x0) + filtered_elements = self._sort_by_reading_order(filtered_elements) + + return filtered_elements, stats + + def _sort_by_reading_order( + self, + elements: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Sort elements by reading order (top-to-bottom, left-to-right).""" + def sort_key(elem): + bbox = elem.get("bbox", [0, 0, 0, 0]) + if isinstance(bbox, dict): + y0 = bbox.get("y0", 0) + x0 = bbox.get("x0", 0) + elif isinstance(bbox, list) and len(bbox) >= 2: + x0, y0 = bbox[0], bbox[1] + else: + y0, x0 = 0, 0 + return (y0, x0) + + return sorted(elements, key=sort_key) diff --git a/backend/app/services/gap_filling_service.py b/backend/app/services/gap_filling_service.py index 76d6220..b1ecb27 100644 --- a/backend/app/services/gap_filling_service.py +++ b/backend/app/services/gap_filling_service.py @@ -83,12 +83,34 @@ class TextRegion: return ((x0 + x1) / 2, (y0 + y1) / 2) +# Element type to IoA threshold mapping +# TABLE needs strict filtering (low threshold) to prevent duplicate content +# FIGURE allows more text through (high threshold) to preserve axis labels, legends +# TEXT/TITLE uses moderate threshold to tolerate boundary detection errors +ELEMENT_TYPE_IOA_THRESHOLDS = { + ElementType.TABLE: 'table', + ElementType.FIGURE: 'figure', + ElementType.IMAGE: 'figure', + ElementType.CHART: 'figure', + ElementType.DIAGRAM: 'figure', +} + + class GapFillingService: """ Service for detecting and filling gaps in PP-StructureV3 output. + This service uses IoA (Intersection over Area) algorithm for coverage detection, + which correctly measures "small box contained in large box" relationship. + + Key improvements over IoU: + - IoA = intersection_area / ocr_box_area (non-symmetric) + - Better for detecting if OCR text is covered by larger layout regions + - Different thresholds per element type (TEXT, TABLE, FIGURE) + - Optional boundary shrinking to reduce edge duplicates + This service: - 1. Calculates coverage of PP-StructureV3 elements over raw OCR regions + 1. Calculates coverage of PP-StructureV3 elements over raw OCR regions using IoA 2. Identifies uncovered raw OCR regions 3. Supplements uncovered regions as TEXT elements 4. Deduplicates against existing PP-StructureV3 TEXT elements @@ -98,9 +120,12 @@ class GapFillingService: def __init__( self, coverage_threshold: float = None, - iou_threshold: float = None, confidence_threshold: float = None, - dedup_iou_threshold: float = None, + ioa_threshold_text: float = None, + ioa_threshold_table: float = None, + ioa_threshold_figure: float = None, + dedup_ioa_threshold: float = None, + shrink_pixels: int = None, enabled: bool = None ): """ @@ -108,27 +133,48 @@ class GapFillingService: Args: coverage_threshold: Coverage ratio below which gap filling activates (default: 0.7) - iou_threshold: IoU threshold for coverage detection (default: 0.15) confidence_threshold: Minimum confidence for raw OCR regions (default: 0.3) - dedup_iou_threshold: IoU threshold for deduplication (default: 0.5) + ioa_threshold_text: IoA threshold for TEXT/TITLE elements (default: 0.6) + ioa_threshold_table: IoA threshold for TABLE elements (default: 0.1) + ioa_threshold_figure: IoA threshold for FIGURE/IMAGE elements (default: 0.8) + dedup_ioa_threshold: IoA threshold for deduplication (default: 0.5) + shrink_pixels: Shrink OCR bbox inward by this many pixels (default: 1) enabled: Whether gap filling is enabled (default: True) """ self.coverage_threshold = coverage_threshold if coverage_threshold is not None else getattr( settings, 'gap_filling_coverage_threshold', 0.7 ) - self.iou_threshold = iou_threshold if iou_threshold is not None else getattr( - settings, 'gap_filling_iou_threshold', 0.15 - ) self.confidence_threshold = confidence_threshold if confidence_threshold is not None else getattr( settings, 'gap_filling_confidence_threshold', 0.3 ) - self.dedup_iou_threshold = dedup_iou_threshold if dedup_iou_threshold is not None else getattr( - settings, 'gap_filling_dedup_iou_threshold', 0.5 + + # IoA thresholds per element type + self.ioa_threshold_text = ioa_threshold_text if ioa_threshold_text is not None else getattr( + settings, 'gap_filling_ioa_threshold_text', 0.6 ) + self.ioa_threshold_table = ioa_threshold_table if ioa_threshold_table is not None else getattr( + settings, 'gap_filling_ioa_threshold_table', 0.1 + ) + self.ioa_threshold_figure = ioa_threshold_figure if ioa_threshold_figure is not None else getattr( + settings, 'gap_filling_ioa_threshold_figure', 0.8 + ) + self.dedup_ioa_threshold = dedup_ioa_threshold if dedup_ioa_threshold is not None else getattr( + settings, 'gap_filling_dedup_ioa_threshold', 0.5 + ) + + # Boundary shrinking + self.shrink_pixels = shrink_pixels if shrink_pixels is not None else getattr( + settings, 'gap_filling_shrink_pixels', 1 + ) + self.enabled = enabled if enabled is not None else getattr( settings, 'gap_filling_enabled', True ) + # Legacy compatibility + self.iou_threshold = getattr(settings, 'gap_filling_iou_threshold', 0.15) + self.dedup_iou_threshold = getattr(settings, 'gap_filling_dedup_iou_threshold', 0.5) + def should_activate( self, raw_ocr_regions: List[TextRegion], @@ -209,21 +255,83 @@ class GapFillingService: logger.debug(f"Found {len(uncovered)} uncovered regions out of {len(raw_ocr_regions)}") return uncovered + def _get_ioa_threshold_for_element(self, element_type: ElementType) -> float: + """ + Get the IoA threshold for a specific element type. + + Different element types have different thresholds: + - TABLE: 0.1 (strict, prevents duplicate table content) + - FIGURE/IMAGE: 0.8 (preserves text inside figures) + - TEXT/others: 0.6 (tolerates boundary errors) + + Args: + element_type: The element type to get threshold for + + Returns: + IoA threshold value + """ + threshold_type = ELEMENT_TYPE_IOA_THRESHOLDS.get(element_type, 'text') + if threshold_type == 'table': + return self.ioa_threshold_table + elif threshold_type == 'figure': + return self.ioa_threshold_figure + else: + return self.ioa_threshold_text + + def _shrink_bbox( + self, + bbox: Tuple[float, float, float, float], + pixels: int + ) -> Tuple[float, float, float, float]: + """ + Shrink a bounding box inward by the specified number of pixels. + + This reduces false "uncovered" detection at region boundaries. + + Args: + bbox: Original bbox (x0, y0, x1, y1) + pixels: Number of pixels to shrink on each side + + Returns: + Shrunk bbox (x0, y0, x1, y1) + """ + x0, y0, x1, y1 = bbox + # Ensure we don't shrink to negative width/height + width = x1 - x0 + height = y1 - y0 + max_shrink = min(width / 2, height / 2, pixels) + + return ( + x0 + max_shrink, + y0 + max_shrink, + x1 - max_shrink, + y1 - max_shrink + ) + def _is_region_covered( self, region: TextRegion, pp_structure_elements: List[DocumentElement], - skip_table_coverage: bool = True + skip_table_coverage: bool = False ) -> bool: """ Check if a raw OCR region is covered by any PP-StructureV3 element. + Uses IoA (Intersection over Area) instead of IoU for better coverage detection. + IoA = intersection_area / ocr_box_area + This correctly measures "OCR box is contained in layout region". + + Different element types use different IoA thresholds: + - TABLE: 0.1 (strict, any overlap means covered) + - FIGURE/IMAGE: 0.8 (preserve text inside figures like axis labels) + - TEXT/others: 0.6 (tolerate boundary errors) + Args: region: Raw OCR text region pp_structure_elements: List of PP-StructureV3 elements - skip_table_coverage: If True, don't consider TABLE elements as covering - (allows raw OCR text inside tables to pass through - for layered rendering) + skip_table_coverage: If True, don't consider TABLE elements as covering. + Default is False - TABLE elements DO cover regions + to prevent duplicate rendering of table cell content. Returns: True if the region is covered @@ -231,10 +339,13 @@ class GapFillingService: center_x, center_y = region.center region_bbox = region.normalized_bbox + # Apply boundary shrinking to reduce edge duplicates + if self.shrink_pixels > 0: + region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels) + for element in pp_structure_elements: - # Skip TABLE elements when checking coverage - # This allows raw OCR text inside tables to be preserved - # PDF generator will render: table borders + raw text positions + # Check TABLE elements for coverage (default behavior) + # This prevents gap_fill from adding duplicate text inside table areas if skip_table_coverage and element.type == ElementType.TABLE: continue @@ -247,9 +358,11 @@ class GapFillingService: if self._point_in_bbox(center_x, center_y, elem_bbox): return True - # Check 2: IoU exceeds threshold - iou = self._calculate_iou(region_bbox, elem_bbox) - if iou > self.iou_threshold: + # Check 2: IoA exceeds element-type-specific threshold + # IoA = intersection_area / ocr_box_area + ioa = self._calculate_ioa(region_bbox, elem_bbox) + threshold = self._get_ioa_threshold_for_element(element.type) + if ioa > threshold: return True return False @@ -262,6 +375,9 @@ class GapFillingService: """ Remove regions that highly overlap with existing PP-StructureV3 TEXT elements. + Uses IoA (Intersection over Area) for deduplication to correctly detect + when an OCR region is already covered by an existing TEXT element. + Args: uncovered_regions: List of uncovered raw OCR regions pp_structure_elements: List of PP-StructureV3 elements @@ -278,6 +394,11 @@ class GapFillingService: deduplicated = [] for region in uncovered_regions: region_bbox = region.normalized_bbox + + # Apply boundary shrinking for deduplication as well + if self.shrink_pixels > 0: + region_bbox = self._shrink_bbox(region_bbox, self.shrink_pixels) + is_duplicate = False for element in text_elements: @@ -286,10 +407,11 @@ class GapFillingService: element.bbox.x1, element.bbox.y1 ) - iou = self._calculate_iou(region_bbox, elem_bbox) - if iou > self.dedup_iou_threshold: + # Use IoA for deduplication + ioa = self._calculate_ioa(region_bbox, elem_bbox) + if ioa > self.dedup_ioa_threshold: logger.debug( - f"Skipping duplicate region (IoU={iou:.2f}): '{region.text[:30]}...'" + f"Skipping duplicate region (IoA={ioa:.2f}): '{region.text[:30]}...'" ) is_duplicate = True break @@ -622,6 +744,52 @@ class GapFillingService: x0, y0, x1, y1 = bbox return x0 <= x <= x1 and y0 <= y <= y1 + @staticmethod + def _calculate_ioa( + ocr_bbox: Tuple[float, float, float, float], + layout_bbox: Tuple[float, float, float, float] + ) -> float: + """ + Calculate Intersection over Area (IoA) of OCR bbox relative to layout bbox. + + IoA = intersection_area / ocr_box_area + + This is the recommended algorithm for detecting if an OCR text region + is contained within a larger layout region. Unlike IoU which is symmetric, + IoA correctly measures "how much of the OCR box is inside the layout region". + + Example: + - OCR box: 100x20 pixels (small text line) + - Layout box: 500x800 pixels (large paragraph region) + - IoU would be very small (~0.005) even if OCR is fully inside layout + - IoA would be 1.0 if OCR is fully inside layout, which is correct + + Args: + ocr_bbox: OCR text region bbox (x0, y0, x1, y1) - typically smaller + layout_bbox: Layout element bbox (x0, y0, x1, y1) - typically larger + + Returns: + IoA value between 0 and 1 + """ + # Calculate intersection + x0 = max(ocr_bbox[0], layout_bbox[0]) + y0 = max(ocr_bbox[1], layout_bbox[1]) + x1 = min(ocr_bbox[2], layout_bbox[2]) + y1 = min(ocr_bbox[3], layout_bbox[3]) + + if x1 <= x0 or y1 <= y0: + return 0.0 + + intersection = (x1 - x0) * (y1 - y0) + + # Calculate OCR box area (denominator for IoA) + ocr_area = (ocr_bbox[2] - ocr_bbox[0]) * (ocr_bbox[3] - ocr_bbox[1]) + + if ocr_area <= 0: + return 0.0 + + return intersection / ocr_area + @staticmethod def _calculate_iou( bbox1: Tuple[float, float, float, float], @@ -630,6 +798,9 @@ class GapFillingService: """ Calculate Intersection over Union (IoU) of two bboxes. + Note: This method is kept for backward compatibility. + For coverage detection, use _calculate_ioa() instead. + Args: bbox1: First bbox (x0, y0, x1, y1) bbox2: Second bbox (x0, y0, x1, y1) diff --git a/backend/app/services/ocr_service.py b/backend/app/services/ocr_service.py index 11a6d5f..558ecdb 100644 --- a/backend/app/services/ocr_service.py +++ b/backend/app/services/ocr_service.py @@ -6,7 +6,7 @@ Supports both PaddleOCR (for scanned documents) and direct extraction (for edita import json import logging from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from datetime import datetime import uuid import gc # For garbage collection @@ -446,6 +446,47 @@ class OCRService: except Exception as e: logger.warning(f"Failed to clear GPU cache: {e}") + def _apply_ocr_config(self, ocr_config: 'OCRConfig'): + """ + Apply OCR configuration from preset or custom settings. + This modifies the runtime settings used by PP-Structure. + + Args: + ocr_config: OCRConfig object with processing settings + """ + logger.info(f"Applying OCR config: {ocr_config.model_dump()}") + + # Store the config for use in PP-Structure initialization + self._runtime_ocr_config = ocr_config + + # Apply table parsing mode settings + # These will be used when initializing PP-StructureV3 + settings.table_parsing_mode = ocr_config.table_parsing_mode.value if hasattr(ocr_config.table_parsing_mode, 'value') else ocr_config.table_parsing_mode + + # Apply preprocessing settings + settings.use_doc_orientation_classify = ocr_config.use_doc_orientation_classify + settings.use_doc_unwarping = ocr_config.use_doc_unwarping + settings.use_textline_orientation = ocr_config.use_textline_orientation + + # Apply recognition module settings + settings.enable_chart_recognition = ocr_config.enable_chart_recognition + settings.enable_formula_recognition = ocr_config.enable_formula_recognition + settings.enable_seal_recognition = ocr_config.enable_seal_recognition + settings.enable_region_detection = ocr_config.enable_region_detection + + # Apply layout threshold if specified + if ocr_config.layout_threshold is not None: + settings.layout_detection_threshold = ocr_config.layout_threshold + if ocr_config.layout_nms_threshold is not None: + settings.layout_nms_threshold = ocr_config.layout_nms_threshold + + # Invalidate existing structure engine to force re-initialization with new settings + if self.structure_engine is not None: + logger.info("Invalidating PP-StructureV3 engine to apply new OCR config") + self._unload_structure_engine() + + logger.info(f"OCR config applied: table_parsing_mode={settings.table_parsing_mode}") + def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR: """ Get or create OCR engine for specified language with GPU support @@ -615,6 +656,39 @@ class OCRService: formula_model = settings.formula_recognition_model_name chart_model = settings.chart_recognition_model_name + # Apply table_parsing_mode settings + # This is the KEY configuration to prevent "cell explosion" on datasheet-type documents + table_parsing_mode = settings.table_parsing_mode + logger.info(f"Table parsing mode: {table_parsing_mode}") + + if table_parsing_mode == "disabled": + # 方案A: 完全關閉 table recognition + use_table = False + wired_table_model = None + wireless_table_model = None + wired_cell_det_model = None + wireless_cell_det_model = None + logger.info("Table parsing DISABLED - no cell segmentation") + + elif table_parsing_mode == "classification_only": + # 方案C: 只做 table classification,不做 cell segmentation + use_table = False # Don't parse table structure + wired_table_model = None + wireless_table_model = None + wired_cell_det_model = None + wireless_cell_det_model = None + # Keep table_cls_model to identify table regions + logger.info("Table parsing CLASSIFICATION_ONLY - regions identified but no cell parsing") + + elif table_parsing_mode == "conservative": + # 方案B: 保守模式 - 只禁用 wireless tables (aggressive) + # 注意:不要修改 layout_threshold,它會影響所有元素偵測,不只是表格 + wireless_table_model = None + wireless_cell_det_model = None + logger.info(f"Table parsing CONSERVATIVE - wireless disabled (layout_threshold unchanged)") + + # else: "full" mode - use all default settings (aggressive) + # Apply table detection config overrides for individual table types if table_detection_config: if not table_detection_config.enable_wired_table: @@ -1343,6 +1417,7 @@ class OCRService: if detect_layout: # Pass current_page to analyze_layout for correct page numbering + # Also pass text_regions for table content rebuilding layout_data, images_metadata = self.analyze_layout( image_path, output_dir=output_dir, @@ -1350,7 +1425,8 @@ class OCRService: layout_model=layout_model, preprocessing_mode=preprocessing_mode, preprocessing_config=preprocessing_config, - table_detection_config=table_detection_config + table_detection_config=table_detection_config, + raw_ocr_regions=text_regions # For table content rebuilding ) # Generate Markdown @@ -1379,6 +1455,12 @@ class OCRService: # If layout data is enhanced, add enhanced results for converter if layout_data and layout_data.get('enhanced'): + # Debug: check if table elements have rebuild_stats + for elem in layout_data.get('elements', []): + if elem.get('type') == 'table': + has_rebuild = 'rebuild_stats' in elem + logger.info(f"[OCR_SERVICE] Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}, keys={list(elem.keys())[:10]}") + result['enhanced_results'] = [{ 'elements': layout_data.get('elements', []), 'reading_order': layout_data.get('reading_order', []), @@ -1509,7 +1591,8 @@ class OCRService: layout_model: Optional[str] = None, preprocessing_mode: Optional[PreprocessingModeEnum] = None, preprocessing_config: Optional[PreprocessingConfig] = None, - table_detection_config: Optional[TableDetectionConfig] = None + table_detection_config: Optional[TableDetectionConfig] = None, + raw_ocr_regions: Optional[List[Dict[str, Any]]] = None ) -> Tuple[Optional[Dict], List[Dict]]: """ Analyze document layout using PP-StructureV3 with enhanced element extraction @@ -1522,6 +1605,7 @@ class OCRService: preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled') preprocessing_config: Manual preprocessing config (used when mode='manual') table_detection_config: Table detection config (wired/wireless/region options) + raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding Returns: Tuple of (layout_data, images_metadata) @@ -1607,7 +1691,8 @@ class OCRService: preprocessed_image=preprocessed_image, scaling_info=scaling_info, save_visualization=True, # Save layout detection visualization images - use_cv_table_detection=use_cv_table_detection + use_cv_table_detection=use_cv_table_detection, + raw_ocr_regions=raw_ocr_regions # For table content rebuilding ) if result.get('has_parsing_res_list'): @@ -2225,7 +2310,8 @@ class OCRService: layout_model: Optional[str] = None, preprocessing_mode: Optional[PreprocessingModeEnum] = None, preprocessing_config: Optional[PreprocessingConfig] = None, - table_detection_config: Optional[TableDetectionConfig] = None + table_detection_config: Optional[TableDetectionConfig] = None, + ocr_config: Optional['OCRConfig'] = None ) -> Union[UnifiedDocument, Dict]: """ Main processing method with dual-track support. @@ -2242,11 +2328,16 @@ class OCRService: preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled') preprocessing_config: Manual preprocessing config (used when mode='manual') table_detection_config: Table detection config (wired/wireless/region options) + ocr_config: OCR processing config from preset or custom settings Returns: UnifiedDocument if dual-track is enabled and use_dual_track=True, Dict with legacy format otherwise """ + # Apply OCR config to settings if provided + if ocr_config: + self._apply_ocr_config(ocr_config) + # Use dual-track processing if: # 1. use_dual_track is True (auto-detection), OR # 2. force_track is specified (explicit track selection) diff --git a/backend/app/services/ocr_to_unified_converter.py b/backend/app/services/ocr_to_unified_converter.py index 9a97ee1..ec58e3a 100644 --- a/backend/app/services/ocr_to_unified_converter.py +++ b/backend/app/services/ocr_to_unified_converter.py @@ -189,7 +189,7 @@ def validate_cell_boxes( Validate cell_boxes coordinates against page boundaries and table bbox. PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed - page boundaries. This function validates and reports issues. + page boundaries or table bbox. This function validates and clamps to valid boundaries. Args: cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...] @@ -213,10 +213,22 @@ def validate_cell_boxes( clamped_boxes = [] # Page boundaries with tolerance - min_x = -tolerance - min_y = -tolerance - max_x = page_width + tolerance - max_y = page_height + tolerance + page_min_x = -tolerance + page_min_y = -tolerance + page_max_x = page_width + tolerance + page_max_y = page_height + tolerance + + # Table boundaries with tolerance (prefer clamping to table bbox) + table_min_x = table_bbox[0] - tolerance if len(table_bbox) >= 4 else page_min_x + table_min_y = table_bbox[1] - tolerance if len(table_bbox) >= 4 else page_min_y + table_max_x = table_bbox[2] + tolerance if len(table_bbox) >= 4 else page_max_x + table_max_y = table_bbox[3] + tolerance if len(table_bbox) >= 4 else page_max_y + + # For clamping, use the intersection of page and expanded table bbox + clamp_min_x = max(0, table_bbox[0] - tolerance) if len(table_bbox) >= 4 else 0 + clamp_min_y = max(0, table_bbox[1] - tolerance) if len(table_bbox) >= 4 else 0 + clamp_max_x = min(page_width, table_bbox[2] + tolerance) if len(table_bbox) >= 4 else page_width + clamp_max_y = min(page_height, table_bbox[3] + tolerance) if len(table_bbox) >= 4 else page_height for idx, box in enumerate(cell_boxes): if not box or len(box) < 4: @@ -230,19 +242,38 @@ def validate_cell_boxes( cell_issues = [] # Check if coordinates exceed page boundaries - if x0 < min_x: + if x0 < page_min_x: cell_issues.append(f"x0={x0:.1f} < 0") is_valid = False - if y0 < min_y: + if y0 < page_min_y: cell_issues.append(f"y0={y0:.1f} < 0") is_valid = False - if x1 > max_x: + if x1 > page_max_x: cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}") is_valid = False - if y1 > max_y: + if y1 > page_max_y: cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}") is_valid = False + # Check if coordinates significantly exceed table bbox (more than 20% of table size) + if len(table_bbox) >= 4: + table_w = table_bbox[2] - table_bbox[0] + table_h = table_bbox[3] - table_bbox[1] + expand_tolerance = max(tolerance, table_h * 0.2) # 20% of table height + + if y0 < table_bbox[1] - expand_tolerance: + cell_issues.append(f"y0={y0:.1f} above table (table_y0={table_bbox[1]:.1f})") + is_valid = False + if y1 > table_bbox[3] + expand_tolerance: + cell_issues.append(f"y1={y1:.1f} below table (table_y1={table_bbox[3]:.1f})") + is_valid = False + if x0 < table_bbox[0] - expand_tolerance: + cell_issues.append(f"x0={x0:.1f} left of table (table_x0={table_bbox[0]:.1f})") + is_valid = False + if x1 > table_bbox[2] + expand_tolerance: + cell_issues.append(f"x1={x1:.1f} right of table (table_x1={table_bbox[2]:.1f})") + is_valid = False + # Check for inverted coordinates if x0 > x1: cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}") @@ -255,12 +286,12 @@ def validate_cell_boxes( invalid_count += 1 issues.append(f"Cell {idx}: {', '.join(cell_issues)}") - # Clamp to valid boundaries + # Clamp to valid boundaries (table bbox with some tolerance) clamped_box = [ - max(0, min(x0, page_width)), - max(0, min(y0, page_height)), - max(0, min(x1, page_width)), - max(0, min(y1, page_height)) + max(clamp_min_x, min(x0, clamp_max_x)), + max(clamp_min_y, min(y0, clamp_max_y)), + max(clamp_min_x, min(x1, clamp_max_x)), + max(clamp_min_y, min(y1, clamp_max_y)) ] # Ensure proper ordering after clamping @@ -395,10 +426,15 @@ class OCRToUnifiedConverter: Handles both enhanced PP-StructureV3 results (with parsing_res_list) and traditional markdown results. Applies gap filling when enabled. + + Gap filling can use either: + 1. overall_ocr_res from PP-StructureV3 (preferred, no extra inference) + 2. Separate raw OCR text_regions (fallback) """ pages = [] # Extract raw OCR text regions for gap filling + # Prefer overall_ocr_res from PP-StructureV3 when available raw_text_regions = ocr_results.get('text_regions', []) ocr_dimensions = ocr_results.get('ocr_dimensions', {}) @@ -461,13 +497,22 @@ class OCRToUnifiedConverter: if element: elements.append(element) - # Apply gap filling if enabled and raw regions available - if self.gap_filling_service and raw_text_regions: - # Filter raw regions for current page - page_raw_regions = [ - r for r in raw_text_regions - if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1 - ] + # Apply gap filling if enabled + # Priority: 1) overall_ocr_res from page_result, 2) raw_text_regions from separate OCR + if self.gap_filling_service: + # Check for overall_ocr_res from PP-StructureV3 (preferred, no extra inference) + page_raw_regions = page_result.get('overall_ocr_res', []) + + if page_raw_regions: + logger.debug(f"Page {page_idx + 1}: Using overall_ocr_res ({len(page_raw_regions)} regions)") + elif raw_text_regions: + # Fallback to separate raw OCR regions + page_raw_regions = [ + r for r in raw_text_regions + if r.get('page', 0) == page_idx or r.get('page', 1) == page_idx + 1 + ] + if page_raw_regions: + logger.debug(f"Page {page_idx + 1}: Using separate raw OCR ({len(page_raw_regions)} regions)") if page_raw_regions: supplemented, stats = self.gap_filling_service.fill_gaps( @@ -711,8 +756,33 @@ class OCRToUnifiedConverter: # Prepare content based on element type if element_type == ElementType.TABLE: # For tables, use TableData as content - # Pass cell_boxes for accurate cell positioning - table_data = self._extract_table_data(elem_data) + # Priority: rebuilt_table > HTML parsing + # rebuilt_table contains clean cells without empty padding + if 'rebuilt_table' in elem_data: + rebuilt = elem_data['rebuilt_table'] + # Use rebuilt cells directly - they don't include empty cells + rebuilt_cells = rebuilt.get('cells', []) + from app.models.unified_document import TableCell + table_cells = [ + TableCell( + row=c.get('row', 0), + col=c.get('col', 0), + row_span=c.get('row_span', 1), + col_span=c.get('col_span', 1), + content=c.get('content', '') + ) + for c in rebuilt_cells + ] + table_data = TableData( + rows=rebuilt.get('rows', 0), + cols=rebuilt.get('cols', 0), + cells=table_cells, + caption=elem_data.get('extracted_text') + ) + logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: Using rebuilt_table directly ({len(rebuilt_cells)} cells)") + else: + # Fallback to HTML parsing for non-rebuilt tables + table_data = self._extract_table_data(elem_data) content = table_data if table_data else elem_data.get('content', '') # Preserve cell_boxes and embedded_images in metadata for PDF generation @@ -756,6 +826,18 @@ class OCRToUnifiedConverter: if 'embedded_images' in elem_data: elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images'] + + # Pass through rebuild information for tables that were rebuilt + # This tells the PDF renderer to use HTML content instead of cell_boxes + logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: checking for rebuild_stats, keys={list(elem_data.keys())}") + if 'rebuild_stats' in elem_data: + elem_data.setdefault('metadata', {})['rebuild_stats'] = elem_data['rebuild_stats'] + elem_data['metadata']['was_rebuilt'] = True + logger.info(f"[CONVERTER] Table {elem_data.get('element_id')}: FOUND rebuild_stats, setting was_rebuilt=True") + + if 'rebuilt_table' in elem_data: + elem_data.setdefault('metadata', {})['rebuilt_table'] = elem_data['rebuilt_table'] + elif element_type in [ ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index f460126..1fd7d23 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -26,6 +26,23 @@ from html.parser import HTMLParser from app.core.config import settings +# Import table column corrector for column alignment fix +try: + from app.services.table_column_corrector import TableColumnCorrector + TABLE_COLUMN_CORRECTOR_AVAILABLE = True +except ImportError: + TABLE_COLUMN_CORRECTOR_AVAILABLE = False + TableColumnCorrector = None + +# Import text region renderer for simple text positioning +try: + from app.services.text_region_renderer import TextRegionRenderer, load_raw_ocr_regions + TEXT_REGION_RENDERER_AVAILABLE = True +except ImportError: + TEXT_REGION_RENDERER_AVAILABLE = False + TextRegionRenderer = None + load_raw_ocr_regions = None + # Import UnifiedDocument for dual-track support try: from app.models.unified_document import ( @@ -596,7 +613,8 @@ class PDFGeneratorService: 'content': html_content, 'bbox': [element.bbox.x0, element.bbox.y0, element.bbox.x1, element.bbox.y1], - 'page': page_num - 1 # layout uses 0-based + 'page': page_num - 1, # layout uses 0-based + 'element_id': element.element_id # For _use_border_only matching } # Preserve cell_boxes and embedded_images from metadata @@ -607,18 +625,29 @@ class PDFGeneratorService: table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata') if 'embedded_images' in element.metadata: table_element['embedded_images'] = element.metadata['embedded_images'] + # Pass through rebuild flag - rebuilt tables should use HTML content + if element.metadata.get('was_rebuilt'): + table_element['was_rebuilt'] = True + logger.debug(f"Table {element.element_id}: marked as rebuilt") layout_elements.append(table_element) # Add bbox to images_metadata for text overlap filtering # (no actual image file, just bbox for filtering) - images_metadata.append({ + img_metadata = { 'image_path': None, # No fake table image 'bbox': bbox_polygon, 'page': page_num - 1, # 0-based for images_metadata 'type': 'table', 'element_id': element.element_id - }) + } + # Also copy cell_boxes for quality checking + if element.metadata and 'cell_boxes' in element.metadata: + img_metadata['cell_boxes'] = element.metadata['cell_boxes'] + # Mark if table was rebuilt + if element.metadata and element.metadata.get('was_rebuilt'): + img_metadata['was_rebuilt'] = True + images_metadata.append(img_metadata) # Handle image/visual elements (including stamps/seals) elif element.is_visual or element.type in [ @@ -1022,15 +1051,25 @@ class PDFGeneratorService: # Set current track self.current_processing_track = 'ocr' - # Convert UnifiedDocument to OCR data format (legacy) - ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) + # Check if simple text positioning mode is enabled + if (settings.simple_text_positioning_enabled and + TEXT_REGION_RENDERER_AVAILABLE): + logger.info("Using simple text positioning mode") + result = self._generate_simple_text_pdf( + unified_doc=unified_doc, + output_path=output_path, + source_file_path=source_file_path + ) + else: + # Convert UnifiedDocument to OCR data format (legacy) + ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) - # Use existing generation pipeline - result = self._generate_pdf_from_data( - ocr_data=ocr_data, - output_path=output_path, - source_file_path=source_file_path - ) + # Use existing generation pipeline + result = self._generate_pdf_from_data( + ocr_data=ocr_data, + output_path=output_path, + source_file_path=source_file_path + ) # Reset track self.current_processing_track = None @@ -1043,6 +1082,235 @@ class PDFGeneratorService: self.current_processing_track = None return False + def _generate_simple_text_pdf( + self, + unified_doc: 'UnifiedDocument', + output_path: Path, + source_file_path: Optional[Path] = None + ) -> bool: + """ + Generate PDF using simple text positioning from raw OCR regions. + + This approach bypasses complex table structure reconstruction and renders + raw OCR text directly at detected positions with rotation correction. + Images, charts, figures, seals, and formulas are still rendered normally. + + Args: + unified_doc: UnifiedDocument from OCR processing + output_path: Path to save generated PDF + source_file_path: Optional path to original source file + + Returns: + True if successful, False otherwise + """ + try: + logger.info("=== Simple Text Positioning PDF Generation ===") + + # Initialize text region renderer + text_renderer = TextRegionRenderer( + font_name=self.font_name, + debug=settings.simple_text_positioning_debug + ) + + # Get result directory from output_path + result_dir = output_path.parent + + # Try to determine task_id from result directory or output filename + # Output path is typically: result_dir/task_id_edited.pdf + task_id = None + if output_path.stem.endswith('_edited'): + task_id = output_path.stem.replace('_edited', '') + elif result_dir.name: + # result_dir is typically the task_id directory + task_id = result_dir.name + + if not task_id: + logger.warning("Could not determine task_id, falling back to legacy method") + ocr_data = self.convert_unified_document_to_ocr_data(unified_doc) + return self._generate_pdf_from_data( + ocr_data=ocr_data, + output_path=output_path, + source_file_path=source_file_path + ) + + logger.info(f"Task ID: {task_id}, Result dir: {result_dir}") + + # Get total pages from UnifiedDocument + total_pages = len(unified_doc.pages) if unified_doc.pages else 1 + + # Get page dimensions from first page (for canvas initialization) + if not unified_doc.pages: + logger.error("No pages in document") + return False + + first_page = unified_doc.pages[0] + if hasattr(first_page, 'dimensions') and first_page.dimensions: + page_width = float(first_page.dimensions.width) + page_height = float(first_page.dimensions.height) + else: + # Fallback to default size + page_width = 612.0 # Letter width + page_height = 792.0 # Letter height + logger.warning(f"No page dimensions found, using default {page_width}x{page_height}") + + logger.info(f"Initial page size: {page_width:.1f} x {page_height:.1f}") + + # Create PDF canvas + pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) + + # Collect image-type elements from UnifiedDocument for rendering + # Types that should be rendered as images: figure, image, chart, seal, formula + image_element_types = {'figure', 'image', 'chart', 'seal', 'formula'} + + # Process each page + for page_num in range(1, total_pages + 1): + logger.info(f">>> Processing page {page_num}/{total_pages}") + + # Get page dimensions for current page + if page_num <= len(unified_doc.pages): + current_page = unified_doc.pages[page_num - 1] + if hasattr(current_page, 'dimensions') and current_page.dimensions: + current_width = float(current_page.dimensions.width) + current_height = float(current_page.dimensions.height) + else: + current_width = page_width + current_height = page_height + else: + current_width = page_width + current_height = page_height + + if page_num > 1: + pdf_canvas.showPage() + + # Set page size + pdf_canvas.setPageSize((current_width, current_height)) + + # === Layer 1: Render images, charts, figures, seals, formulas === + # Also collect exclusion zones for text avoidance + exclusion_zones = [] # List of (x0, y0, x1, y1) tuples + + if page_num <= len(unified_doc.pages): + current_page = unified_doc.pages[page_num - 1] + page_elements = current_page.elements if hasattr(current_page, 'elements') else [] + + image_elements_rendered = 0 + for elem in page_elements: + elem_type = elem.type if hasattr(elem, 'type') else elem.get('type', '') + # Handle enum type + if hasattr(elem_type, 'value'): + elem_type = elem_type.value + + if elem_type in image_element_types: + # Get image path from element content + content = elem.content if hasattr(elem, 'content') else elem.get('content', {}) + if isinstance(content, dict): + saved_path = content.get('saved_path') or content.get('path') + else: + saved_path = None + + # Get bbox for exclusion zone (even if image file not found) + bbox = elem.bbox if hasattr(elem, 'bbox') else elem.get('bbox', {}) + if hasattr(bbox, 'x0'): + x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 + elif isinstance(bbox, dict): + x0 = bbox.get('x0', 0) + y0 = bbox.get('y0', 0) + x1 = bbox.get('x1', x0 + bbox.get('width', 0)) + y1 = bbox.get('y1', y0 + bbox.get('height', 0)) + else: + continue + + # Add to exclusion zones for text avoidance + # Use original image coordinates (not PDF flipped) + exclusion_zones.append((x0, y0, x1, y1)) + + if saved_path: + # Try to find the image file + image_path = result_dir / saved_path + if not image_path.exists(): + # Try in imgs subdirectory + image_path = result_dir / 'imgs' / saved_path + if not image_path.exists(): + # Try just the filename + image_path = result_dir / Path(saved_path).name + + if image_path.exists(): + try: + # Convert coordinates (flip Y for PDF) + pdf_x = x0 + pdf_y = current_height - y1 # Bottom of image in PDF coords + img_width = x1 - x0 + img_height = y1 - y0 + + # Draw image + pdf_canvas.drawImage( + str(image_path), + pdf_x, pdf_y, + width=img_width, + height=img_height, + preserveAspectRatio=True, + mask='auto' + ) + image_elements_rendered += 1 + logger.debug(f"Rendered {elem_type}: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})") + except Exception as e: + logger.warning(f"Failed to render {elem_type} {saved_path}: {e}") + else: + logger.warning(f"Image file not found: {saved_path}") + + if image_elements_rendered > 0: + logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)") + + if exclusion_zones: + logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance") + + # === Layer 2: Render text from raw OCR regions === + raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num) + + if not raw_regions: + logger.warning(f"No raw OCR regions found for page {page_num}") + else: + logger.info(f"Loaded {len(raw_regions)} raw OCR regions for page {page_num}") + + # Collect texts inside exclusion zones for position-aware deduplication + # This prevents duplicate axis labels from being rendered near charts + zone_texts = None + if exclusion_zones: + zone_texts = text_renderer.collect_zone_texts( + raw_regions, exclusion_zones, threshold=0.5, include_axis_labels=True + ) + if zone_texts: + logger.info(f"Collected {len(zone_texts)} zone texts for deduplication: {list(zone_texts)[:10]}...") + + # Render all text regions, avoiding exclusion zones (images/charts) + # Scale factors are 1.0 since OCR dimensions match page dimensions + rendered = text_renderer.render_all_regions( + pdf_canvas=pdf_canvas, + regions=raw_regions, + page_height=current_height, + scale_x=1.0, + scale_y=1.0, + exclusion_zones=exclusion_zones, + zone_texts=zone_texts + ) + + logger.info(f"Rendered {rendered} text regions") + + logger.info(f"<<< Page {page_num} complete") + + # Save PDF + pdf_canvas.save() + + file_size = output_path.stat().st_size + logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)") + return True + + except Exception as e: + logger.error(f"Failed to generate simple text PDF: {e}") + import traceback + traceback.print_exc() + return False + def _generate_pdf_from_data( self, ocr_data: Dict, @@ -1093,8 +1361,15 @@ class PDFGeneratorService: logger.info("No page_dimensions found, using first page size for all pages") # Step 3: Get original file dimensions for all pages + # For OCR track, we use OCR coordinate system dimensions directly to avoid scaling issues original_page_sizes = {} - if source_file_path: + use_ocr_dimensions_for_pdf = (self.current_processing_track == 'ocr') + + if use_ocr_dimensions_for_pdf: + # OCR Track: Use OCR coordinate system dimensions directly + # This ensures no scaling is needed (scale = 1.0) + logger.info(f"OCR Track: 使用 OCR 座標系尺寸作為 PDF 頁面尺寸(避免縮放)") + elif source_file_path: original_page_sizes = self.get_all_page_sizes(source_file_path) if original_page_sizes: logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸") @@ -1104,8 +1379,12 @@ class PDFGeneratorService: logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸") # Determine initial canvas size (will be updated per page) - # Priority: original file first page > OCR/UnifiedDocument first page - if 0 in original_page_sizes: + # Priority for OCR track: OCR dimensions (no scaling) + # Priority for Direct track: original file first page > OCR/UnifiedDocument first page + if use_ocr_dimensions_for_pdf: + target_width, target_height = ocr_width, ocr_height + logger.info(f"初始 PDF 尺寸(OCR Track, 使用 OCR 座標系): {target_width:.1f} x {target_height:.1f}") + elif 0 in original_page_sizes: target_width, target_height = original_page_sizes[0] logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}") else: @@ -1159,14 +1438,49 @@ class PDFGeneratorService: # Create PDF canvas with initial page size (will be updated per page) pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height)) - # LAYERED RENDERING: Exclude tables from regions_to_avoid - # Text inside tables will be rendered at raw OCR positions (via GapFillingService) - # while table borders are drawn separately using cell_boxes - # Only avoid overlap with actual images/figures/charts - regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table'] - table_count = len([img for img in images_metadata if img.get('type') == 'table']) + # Smart filtering: only include tables with good cell_boxes quality in regions_to_avoid + # Tables with bad cell_boxes will use raw OCR text positioning instead + # Exception: Rebuilt tables always use HTML content and filter text + regions_to_avoid = [] + good_quality_tables = [] + bad_quality_tables = [] + rebuilt_tables = [] - logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染") + for img in images_metadata: + if img.get('type') == 'table': + elem_id = img.get('element_id', 'unknown') + + # Check if this table was rebuilt - rebuilt tables have good content + was_rebuilt = img.get('was_rebuilt', False) + + if was_rebuilt: + # Rebuilt tables have accurate content - filter text, use HTML + regions_to_avoid.append(img) + rebuilt_tables.append(elem_id) + else: + # Check cell_boxes quality for non-rebuilt tables + cell_boxes = img.get('cell_boxes', []) + quality = self._check_cell_boxes_quality(cell_boxes, elem_id) + + if quality == 'good': + # Good quality: filter text, render with cell_boxes + regions_to_avoid.append(img) + good_quality_tables.append(elem_id) + else: + # Bad quality: don't filter text, just draw border + bad_quality_tables.append(elem_id) + img['_use_border_only'] = True # Mark for border-only rendering + else: + # Non-table elements (images, figures, charts) always avoid + regions_to_avoid.append(img) + + logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免") + if rebuilt_tables: + logger.info(f" 重建表格用 HTML: {rebuilt_tables}") + if good_quality_tables: + logger.info(f" 表格用 cell_boxes: {good_quality_tables}") + if bad_quality_tables: + logger.info(f" 表格用 raw OCR text (border only): {bad_quality_tables}") filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid) @@ -1178,10 +1492,24 @@ class PDFGeneratorService: pages_data[page_num] = [] pages_data[page_num].append(region) - # Get table elements from layout_data + # Get table elements from layout_data and copy _use_border_only flags table_elements = [] if layout_data and layout_data.get('elements'): - table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table'] + # Create a lookup for _use_border_only flags from images_metadata + border_only_tables = {img.get('element_id') for img in images_metadata + if img.get('type') == 'table' and img.get('_use_border_only')} + + logger.debug(f"[DEBUG] border_only_tables from images_metadata: {border_only_tables}") + + for e in layout_data['elements']: + if e.get('type') == 'table': + elem_id = e.get('element_id') + logger.debug(f"[DEBUG] layout_data table element_id: {elem_id}") + # Copy the flag if this table should use border only + if elem_id in border_only_tables: + e['_use_border_only'] = True + logger.info(f"[DEBUG] Set _use_border_only=True for table {elem_id}") + table_elements.append(e) # Process each page total_pages = ocr_data.get('total_pages', 1) @@ -1195,14 +1523,23 @@ class PDFGeneratorService: logger.info(f">>> 處理第 {page_num}/{total_pages} 頁") # Get current page dimensions with priority order: - # 1. Original file dimensions (highest priority) - # 2. OCR/UnifiedDocument dimensions - # 3. Fallback to first page dimensions + # For OCR Track: always use OCR dimensions (scale = 1.0) + # For Direct Track: + # 1. Original file dimensions (highest priority) + # 2. OCR/UnifiedDocument dimensions + # 3. Fallback to first page dimensions page_idx = page_num - 1 dimension_source = "unknown" - # Priority 1: Original file dimensions - if page_idx in original_page_sizes: + # For OCR Track: always use OCR dimensions + if use_ocr_dimensions_for_pdf and page_idx in page_dimensions: + current_page_dims = page_dimensions[page_idx] + current_target_w = float(current_page_dims['width']) + current_target_h = float(current_page_dims['height']) + dimension_source = "ocr_track_direct" + + # Priority 1: Original file dimensions (Direct Track only) + elif page_idx in original_page_sizes: current_target_w, current_target_h = original_page_sizes[page_idx] dimension_source = "original_file" @@ -1774,12 +2111,26 @@ class PDFGeneratorService: non_empty_lines = [l for l in lines if l.strip()] num_lines = max(len(non_empty_lines), 1) - # Font size = bbox_height / num_lines * factor + # Font size calculation with stabilization # Use 0.8 factor to leave room for line spacing - font_size = (bbox_height / num_lines) * 0.8 - font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt + raw_font_size = (bbox_height / num_lines) * 0.8 - logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}") + # Stabilize font size for body text (most common case) + # Normal body text should be 9-11pt, only deviate for clear outliers + element_type = region.get('element_type', 'text') + if element_type in ('text', 'paragraph'): + # For body text, bias toward 10pt baseline + if 7 <= raw_font_size <= 14: + # Near-normal range: use weighted average toward 10pt + font_size = raw_font_size * 0.7 + 10 * 0.3 + else: + # Clear outlier: use raw but clamp more aggressively + font_size = max(min(raw_font_size, 14), 7) + else: + # For titles/headers/etc, use raw calculation with wider range + font_size = max(min(raw_font_size, 72), 4) + + logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, raw={raw_font_size:.1f}, final={font_size:.1f}") # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin) # CRITICAL: Y-axis flip! @@ -2008,24 +2359,45 @@ class PDFGeneratorService: result_dir: Directory containing result files (for embedded images) """ try: + elem_id = table_element.get('element_id', 'unknown') + use_border_only = table_element.get('_use_border_only', False) + logger.info(f"[DEBUG] draw_table_region: elem_id={elem_id}, _use_border_only={use_border_only}") + html_content = table_element.get('content', '') if not html_content: + # Even without HTML, draw border if requested + if use_border_only: + self._draw_table_border_only(pdf_canvas, table_element, page_height, scale_w, scale_h) return - # Try to use cell_boxes for direct rendering first (more accurate) + # Apply column correction if enabled cell_boxes = table_element.get('cell_boxes', []) - if cell_boxes: - logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)") - success = self._draw_table_with_cell_boxes( - pdf_canvas, table_element, page_height, - scale_w, scale_h, result_dir - ) - if success: - return # Successfully rendered with cell_boxes + if (settings.table_column_correction_enabled and + TABLE_COLUMN_CORRECTOR_AVAILABLE and + cell_boxes): + try: + corrector = TableColumnCorrector( + correction_threshold=settings.table_column_correction_threshold, + vertical_merge_enabled=settings.vertical_fragment_merge_enabled, + vertical_aspect_ratio=settings.vertical_fragment_aspect_ratio + ) + # Get table bbox for vertical fragment detection + table_bbox = table_element.get('bbox', []) + if isinstance(table_bbox, dict): + table_bbox = [table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']] - logger.info("[TABLE] Falling back to ReportLab Table") + corrected_html, stats = corrector.correct( + html=html_content, + cell_boxes=cell_boxes, + table_bbox=table_bbox if isinstance(table_bbox, list) and len(table_bbox) >= 4 else None + ) + if stats.get('column_corrections', 0) > 0: + logger.info(f"[TABLE] {elem_id}: Column correction applied - {stats}") + html_content = corrected_html + except Exception as e: + logger.warning(f"[TABLE] {elem_id}: Column correction failed: {e}, using original HTML") - # Fallback: Parse HTML to extract table structure and use ReportLab Table + # Parse HTML first to get table structure for grid validation parser = HTMLTableParser() parser.feed(html_content) @@ -2040,6 +2412,83 @@ class PDFGeneratorService: if not rows: return + # Calculate number of rows and columns from HTML for grid validation + num_rows = len(rows) + max_cols = 0 + for row in rows: + row_cols = sum(cell.get('colspan', 1) for cell in row['cells']) + max_cols = max(max_cols, row_cols) + + # Check if table was rebuilt - if so, use HTML content directly + was_rebuilt = table_element.get('was_rebuilt', False) + cell_boxes_rendered = False # Track if we rendered borders with cell_boxes + + if was_rebuilt: + logger.info(f"[TABLE] {elem_id}: Table was rebuilt, using HTML content directly") + elif use_border_only: + # Bad quality cell_boxes: skip cell_boxes rendering, use ReportLab Table with borders + logger.info(f"[TABLE] {elem_id}: Bad cell_boxes quality, using ReportLab Table with borders") + else: + # Check if cell_boxes can produce a valid grid before rendering borders + cell_boxes = table_element.get('cell_boxes', []) + if cell_boxes: + # Get table bbox for grid calculation + temp_bbox = table_element.get('bbox', []) + if isinstance(temp_bbox, dict): + raw_bbox = [temp_bbox['x0'], temp_bbox['y0'], temp_bbox['x1'], temp_bbox['y1']] + elif isinstance(temp_bbox, list) and len(temp_bbox) >= 4: + if isinstance(temp_bbox[0], (int, float)): + raw_bbox = temp_bbox[:4] + else: + raw_bbox = [temp_bbox[0][0], temp_bbox[0][1], temp_bbox[2][0], temp_bbox[2][1]] + else: + raw_bbox = None + + # Pre-check: can we compute a valid grid from cell_boxes? + if raw_bbox: + test_col_widths, test_row_heights = self._compute_table_grid_from_cell_boxes( + cell_boxes, raw_bbox, num_rows, max_cols + ) + grid_valid = test_col_widths is not None and test_row_heights is not None + + if grid_valid: + logger.info(f"[TABLE] Grid validation passed, rendering borders with cell_boxes") + success = self._draw_table_with_cell_boxes( + pdf_canvas, table_element, page_height, + scale_w, scale_h, result_dir + ) + if success: + cell_boxes_rendered = True + logger.info("[TABLE] cell_boxes rendered borders, continuing with text-only ReportLab Table") + else: + logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders") + else: + # Grid mismatch: try cellboxes-first rendering if enabled + if settings.table_rendering_prefer_cellboxes: + logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering") + from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig + renderer = TableRenderer(TableRenderConfig()) + success = renderer.render_from_cellboxes_grid( + pdf_canvas, + cell_boxes, + html_content, + tuple(raw_bbox), + page_height, + scale_w, + scale_h, + row_threshold=settings.table_cellboxes_row_threshold, + col_threshold=settings.table_cellboxes_col_threshold + ) + if success: + logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering") + return # Table fully rendered, exit early + else: + logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based") + else: + logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders") + else: + logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders") + # Get bbox directly from table element table_bbox = table_element.get('bbox') @@ -2106,15 +2555,7 @@ class PDFGeneratorService: pdf_y = page_height - ocr_y_bottom # Build table data for ReportLab with proper colspan/rowspan handling - # First pass: determine the actual grid size by accounting for spans - num_rows = len(rows) - - # Calculate actual number of columns by checking first row's total span - max_cols = 0 - for row in rows: - row_cols = sum(cell.get('colspan', 1) for cell in row['cells']) - max_cols = max(max_cols, row_cols) - + # num_rows and max_cols already calculated above for grid validation logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}") # Create a grid to track occupied cells (for rowspan handling) @@ -2223,16 +2664,25 @@ class PDFGeneratorService: logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows") # Apply table style - style = TableStyle([ + # If cell_boxes rendered borders, skip GRID style (text-only rendering) + style_commands = [ ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size), - ('GRID', (0, 0), (-1, -1), 0.5, colors.black), ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), ('ALIGN', (0, 0), (-1, -1), 'CENTER'), ('LEFTPADDING', (0, 0), (-1, -1), 2), ('RIGHTPADDING', (0, 0), (-1, -1), 2), ('TOPPADDING', (0, 0), (-1, -1), 2), ('BOTTOMPADDING', (0, 0), (-1, -1), 2), - ]) + ] + + # Only add GRID if cell_boxes didn't render borders + if not cell_boxes_rendered: + style_commands.insert(1, ('GRID', (0, 0), (-1, -1), 0.5, colors.black)) + logger.info("[TABLE] Adding GRID style (cell_boxes not used)") + else: + logger.info("[TABLE] Skipping GRID style (cell_boxes rendered borders)") + + style = TableStyle(style_commands) # Add header style if first row has headers if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'): @@ -2435,6 +2885,106 @@ class PDFGeneratorService: logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid") return normalized_boxes + def _draw_table_border_only( + self, + pdf_canvas: canvas.Canvas, + table_element: Dict, + page_height: float, + scale_w: float = 1.0, + scale_h: float = 1.0 + ): + """ + Draw only the outer border of a table (for tables with bad cell_boxes quality). + + Text inside the table will be rendered using raw OCR positions. + + Args: + pdf_canvas: ReportLab canvas object + table_element: Table element dict + page_height: Height of page in PDF coordinates + scale_w: Scale factor for X coordinates + scale_h: Scale factor for Y coordinates + """ + table_bbox = table_element.get('bbox', []) + if not table_bbox or len(table_bbox) < 4: + return + + element_id = table_element.get('element_id', 'unknown') + + # Handle different bbox formats + if isinstance(table_bbox, dict): + x0, y0, x1, y1 = table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1'] + elif isinstance(table_bbox[0], (int, float)): + x0, y0, x1, y1 = table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3] + else: + return + + # Apply scaling + pdf_x0 = x0 * scale_w + pdf_y0 = y0 * scale_h + pdf_x1 = x1 * scale_w + pdf_y1 = y1 * scale_h + + # Convert to PDF coordinates (flip Y) + pdf_top = page_height - pdf_y0 + pdf_bottom = page_height - pdf_y1 + width = pdf_x1 - pdf_x0 + height = pdf_y1 - pdf_y0 + + # Draw outer border only + pdf_canvas.setStrokeColor(colors.black) + pdf_canvas.setLineWidth(0.5) + pdf_canvas.rect(pdf_x0, pdf_bottom, width, height, stroke=1, fill=0) + + logger.info(f"[TABLE] {element_id}: Drew border only (bad cell_boxes quality)") + + def _check_cell_boxes_quality(self, cell_boxes: List, element_id: str = "") -> str: + """ + Check the quality of cell_boxes to determine rendering strategy. + + Args: + cell_boxes: List of cell bounding boxes + element_id: Optional element ID for logging + + Returns: + 'good' if cell_boxes form a proper grid, 'bad' otherwise + """ + # If quality check is disabled, always return 'good' to use pure PP-Structure output + if not settings.table_quality_check_enabled: + logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)") + return 'good' + + if not cell_boxes or len(cell_boxes) < 2: + logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})") + return 'bad' # No cell_boxes or too few + + # Count overlapping cell pairs + overlap_count = 0 + for i, box1 in enumerate(cell_boxes): + for j, box2 in enumerate(cell_boxes): + if i >= j: + continue + if not isinstance(box1, (list, tuple)) or len(box1) < 4: + continue + if not isinstance(box2, (list, tuple)) or len(box2) < 4: + continue + x_overlap = box1[0] < box2[2] and box1[2] > box2[0] + y_overlap = box1[1] < box2[3] and box1[3] > box2[1] + if x_overlap and y_overlap: + overlap_count += 1 + + total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2 + overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0 + + # Relaxed threshold: 20% overlap instead of 10% to allow more tables through + # This is because PP-StructureV3's cell detection sometimes has slight overlaps + if overlap_ratio > 0.20: + logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%") + return 'bad' + + logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}") + return 'good' + def _draw_table_with_cell_boxes( self, pdf_canvas: canvas.Canvas, @@ -2465,39 +3015,64 @@ class PDFGeneratorService: """ try: cell_boxes = table_element.get('cell_boxes', []) - - # Always draw outer table border first (fallback for incomplete cell_boxes) table_bbox = table_element.get('bbox', []) - if table_bbox and len(table_bbox) >= 4: - # Handle different bbox formats (list or dict) - if isinstance(table_bbox, dict): - tx1 = float(table_bbox.get('x0', 0)) - ty1 = float(table_bbox.get('y0', 0)) - tx2 = float(table_bbox.get('x1', 0)) - ty2 = float(table_bbox.get('y1', 0)) - else: - tx1, ty1, tx2, ty2 = table_bbox[:4] - # Apply scaling - tx1_scaled = tx1 * scale_w - ty1_scaled = ty1 * scale_h - tx2_scaled = tx2 * scale_w - ty2_scaled = ty2 * scale_h + # Check cell_boxes quality - skip if they don't form a proper grid + if cell_boxes and len(cell_boxes) > 2: + # Count overlapping cell pairs + overlap_count = 0 + for i, box1 in enumerate(cell_boxes): + for j, box2 in enumerate(cell_boxes): + if i >= j: + continue + x_overlap = box1[0] < box2[2] and box1[2] > box2[0] + y_overlap = box1[1] < box2[3] and box1[3] > box2[1] + if x_overlap and y_overlap: + overlap_count += 1 - table_width = tx2_scaled - tx1_scaled - table_height = ty2_scaled - ty1_scaled + # If more than 25% of cell pairs overlap, cell_boxes are unreliable + # Increased from 10% to 25% to allow more tables to use cell_boxes rendering + # which provides better visual fidelity than ReportLab Table fallback + total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2 + overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0 - # Transform Y coordinate (PDF uses bottom-left origin) - pdf_x = tx1_scaled - pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords - - # Draw outer table border (slightly thicker for visibility) - pdf_canvas.setStrokeColor(colors.black) - pdf_canvas.setLineWidth(1.0) - pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0) - logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]") + if overlap_ratio > 0.25: + logger.warning( + f"[TABLE] Skipping cell_boxes rendering: {overlap_count}/{total_pairs} " + f"({overlap_ratio:.1%}) cell pairs overlap - using ReportLab Table fallback" + ) + return False # Return False to trigger ReportLab Table fallback if not cell_boxes: + # Fallback: draw outer border only when no cell_boxes + if table_bbox and len(table_bbox) >= 4: + # Handle different bbox formats (list or dict) + if isinstance(table_bbox, dict): + tx1 = float(table_bbox.get('x0', 0)) + ty1 = float(table_bbox.get('y0', 0)) + tx2 = float(table_bbox.get('x1', 0)) + ty2 = float(table_bbox.get('y1', 0)) + else: + tx1, ty1, tx2, ty2 = table_bbox[:4] + + # Apply scaling + tx1_scaled = tx1 * scale_w + ty1_scaled = ty1 * scale_h + tx2_scaled = tx2 * scale_w + ty2_scaled = ty2 * scale_h + + table_width = tx2_scaled - tx1_scaled + table_height = ty2_scaled - ty1_scaled + + # Transform Y coordinate (PDF uses bottom-left origin) + pdf_x = tx1_scaled + pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords + + # Draw outer table border (slightly thicker for visibility) + pdf_canvas.setStrokeColor(colors.black) + pdf_canvas.setLineWidth(1.0) + pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0) + logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]") logger.warning("[TABLE] No cell_boxes available, only outer border drawn") # Still draw embedded images even without cell borders embedded_images = table_element.get('embedded_images', []) @@ -2511,31 +3086,47 @@ class PDFGeneratorService: # Normalize cell boxes to create aligned grid cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes) - logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)") + logger.info(f"[TABLE] Drawing {len(cell_boxes)} cells using grid lines (avoiding duplicates)") + + # Collect unique grid lines to avoid drawing duplicate/overlapping lines + h_lines = set() # Horizontal lines: (y, x_start, x_end) + v_lines = set() # Vertical lines: (x, y_start, y_end) - # Draw each cell border for box in cell_boxes: x1, y1, x2, y2 = box[0], box[1], box[2], box[3] # Apply scaling - x1_scaled = x1 * scale_w - y1_scaled = y1 * scale_h - x2_scaled = x2 * scale_w - y2_scaled = y2 * scale_h + x1_s = x1 * scale_w + y1_s = y1 * scale_h + x2_s = x2 * scale_w + y2_s = y2 * scale_h - cell_width = x2_scaled - x1_scaled - cell_height = y2_scaled - y1_scaled + # Round to 1 decimal place to help with deduplication + x1_s, y1_s, x2_s, y2_s = round(x1_s, 1), round(y1_s, 1), round(x2_s, 1), round(y2_s, 1) - # Transform Y coordinate (PDF uses bottom-left origin) - pdf_x = x1_scaled - pdf_y = page_height - y2_scaled # Bottom of cell in PDF coords + # Add horizontal lines (top and bottom of cell) + h_lines.add((y1_s, x1_s, x2_s)) # Top line + h_lines.add((y2_s, x1_s, x2_s)) # Bottom line - # Draw cell border only (no fill, no text) - pdf_canvas.setStrokeColor(colors.black) - pdf_canvas.setLineWidth(0.5) - pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0) + # Add vertical lines (left and right of cell) + v_lines.add((x1_s, y1_s, y2_s)) # Left line + v_lines.add((x2_s, y1_s, y2_s)) # Right line - logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders") + # Draw unique horizontal lines + pdf_canvas.setStrokeColor(colors.black) + pdf_canvas.setLineWidth(0.5) + + for y, x_start, x_end in h_lines: + pdf_y = page_height - y # Transform Y coordinate + pdf_canvas.line(x_start, pdf_y, x_end, pdf_y) + + # Draw unique vertical lines + for x, y_start, y_end in v_lines: + pdf_y_start = page_height - y_start + pdf_y_end = page_height - y_end + pdf_canvas.line(x, pdf_y_start, x, pdf_y_end) + + logger.info(f"[TABLE] Drew {len(h_lines)} horizontal + {len(v_lines)} vertical grid lines") # Draw embedded images embedded_images = table_element.get('embedded_images', []) diff --git a/backend/app/services/pdf_table_renderer.py b/backend/app/services/pdf_table_renderer.py index 47d7ebe..0038b1e 100644 --- a/backend/app/services/pdf_table_renderer.py +++ b/backend/app/services/pdf_table_renderer.py @@ -24,6 +24,256 @@ from reportlab.platypus import Paragraph, Table, TableStyle logger = logging.getLogger(__name__) +# ============================================================================ +# Cell Box Grid Inferrer +# ============================================================================ + +class CellBoxGridInferrer: + """ + Infer table grid structure from cell_boxes coordinates. + + This class clusters cell_boxes by Y-coordinate (rows) and X-coordinate (columns) + to determine the grid structure, regardless of HTML colspan/rowspan. + """ + + def __init__( + self, + row_threshold: float = 15.0, + col_threshold: float = 15.0 + ): + """ + Initialize grid inferrer. + + Args: + row_threshold: Y-coordinate threshold for row clustering + col_threshold: X-coordinate threshold for column clustering + """ + self.row_threshold = row_threshold + self.col_threshold = col_threshold + + def infer_grid( + self, + cell_boxes: List[List[float]] + ) -> Optional[Dict]: + """ + Infer grid structure from cell_boxes. + + Args: + cell_boxes: List of [x0, y0, x1, y1] coordinates + + Returns: + Dict with 'grid', 'num_rows', 'num_cols', 'row_boundaries', 'col_boundaries' + or None if inference fails + """ + if not cell_boxes or len(cell_boxes) < 1: + return None + + try: + # Filter valid boxes + valid_boxes = [ + b for b in cell_boxes + if b is not None and len(b) >= 4 + ] + if not valid_boxes: + return None + + # Extract Y and X boundaries from all cells + y_mins = [b[1] for b in valid_boxes] # y0 + y_maxs = [b[3] for b in valid_boxes] # y1 + x_mins = [b[0] for b in valid_boxes] # x0 + x_maxs = [b[2] for b in valid_boxes] # x1 + + # Cluster Y values to determine rows + all_y = sorted(set(y_mins + y_maxs)) + y_boundaries = self._cluster_to_boundaries(all_y, self.row_threshold) + + # Cluster X values to determine columns + all_x = sorted(set(x_mins + x_maxs)) + x_boundaries = self._cluster_to_boundaries(all_x, self.col_threshold) + + if len(y_boundaries) < 2 or len(x_boundaries) < 2: + return None + + num_rows = len(y_boundaries) - 1 + num_cols = len(x_boundaries) - 1 + + # Build grid: map (row, col) -> cell_box info + grid = {} + for idx, box in enumerate(valid_boxes): + x0, y0, x1, y1 = box[:4] + + # Find row by y_center + y_center = (y0 + y1) / 2 + row = self._find_position(y_center, y_boundaries) + + # Find col by x_center + x_center = (x0 + x1) / 2 + col = self._find_position(x_center, x_boundaries) + + if row is not None and col is not None: + grid[(row, col)] = { + 'bbox': box, + 'index': idx, + 'content': '' + } + + # Calculate row heights and column widths + row_heights = [ + y_boundaries[i + 1] - y_boundaries[i] + for i in range(num_rows) + ] + col_widths = [ + x_boundaries[i + 1] - x_boundaries[i] + for i in range(num_cols) + ] + + return { + 'grid': grid, + 'num_rows': num_rows, + 'num_cols': num_cols, + 'row_boundaries': y_boundaries, + 'col_boundaries': x_boundaries, + 'row_heights': row_heights, + 'col_widths': col_widths + } + + except Exception as e: + logger.error(f"Grid inference failed: {e}") + return None + + def _cluster_to_boundaries( + self, + values: List[float], + threshold: float + ) -> List[float]: + """ + Cluster nearby values and return representative boundaries. + + Args: + values: Sorted list of coordinate values + threshold: Clustering threshold + + Returns: + List of boundary values (cluster representatives) + """ + if not values: + return [] + + boundaries = [values[0]] + current_cluster = [values[0]] + + for v in values[1:]: + if v - current_cluster[-1] <= threshold: + current_cluster.append(v) + else: + # Finish current cluster, use average as boundary + boundaries[-1] = sum(current_cluster) / len(current_cluster) + boundaries.append(v) + current_cluster = [v] + + # Finish last cluster + if current_cluster: + boundaries[-1] = sum(current_cluster) / len(current_cluster) + + return boundaries + + def _find_position( + self, + value: float, + boundaries: List[float] + ) -> Optional[int]: + """ + Find which interval a value falls into. + + Args: + value: Coordinate value + boundaries: List of boundary values + + Returns: + Index of interval, or None if out of bounds + """ + for i in range(len(boundaries) - 1): + if boundaries[i] <= value <= boundaries[i + 1]: + return i + + # Check if close to any boundary + for i in range(len(boundaries) - 1): + mid = (boundaries[i] + boundaries[i + 1]) / 2 + if abs(value - mid) < (boundaries[i + 1] - boundaries[i]): + return i + + return None + + +def extract_cell_contents_from_html(html: str) -> List[str]: + """ + Extract cell text contents from HTML in reading order. + + Args: + html: HTML table string + + Returns: + List of text strings, one per cell + """ + try: + parser = HTMLTableParser() + parser.feed(html) + + if not parser.tables: + return [] + + contents = [] + for row in parser.tables[0].get('rows', []): + for cell in row.get('cells', []): + text = cell.get('text', '').strip() + contents.append(text) + + return contents + + except Exception as e: + logger.error(f"HTML content extraction failed: {e}") + return [] + + +def map_content_to_grid( + grid: Dict[Tuple[int, int], Dict], + contents: List[str], + num_rows: int, + num_cols: int +) -> Dict[Tuple[int, int], Dict]: + """ + Map extracted content to grid cells row by row. + + Args: + grid: Dict mapping (row, col) to cell info + contents: List of text contents from HTML + num_rows: Number of rows in grid + num_cols: Number of columns in grid + + Returns: + Updated grid with content assigned + """ + content_idx = 0 + + for row in range(num_rows): + for col in range(num_cols): + if (row, col) in grid: + if content_idx < len(contents): + grid[(row, col)]['content'] = contents[content_idx] + content_idx += 1 + else: + grid[(row, col)]['content'] = '' + + # Log if there's a significant mismatch + if content_idx < len(contents): + logger.debug( + f"Content mismatch: {len(contents)} HTML cells, " + f"only {content_idx} mapped to {len(grid)} grid cells" + ) + + return grid + + # ============================================================================ # Configuration # ============================================================================ @@ -405,6 +655,147 @@ class TableRenderer: traceback.print_exc() return False + def render_from_cellboxes_grid( + self, + pdf_canvas, + cell_boxes: List[List[float]], + html_content: str, + table_bbox: Tuple[float, float, float, float], + page_height: float, + scale_w: float = 1.0, + scale_h: float = 1.0, + row_threshold: float = 15.0, + col_threshold: float = 15.0 + ) -> bool: + """ + Render table using cell_boxes as the primary structure source. + + This method infers grid structure from cell_boxes coordinates and + maps HTML content to cells, regardless of HTML colspan/rowspan. + + Args: + pdf_canvas: ReportLab canvas + cell_boxes: List of [x0, y0, x1, y1] for each cell + html_content: HTML table string (for text content) + table_bbox: Table bounding box + page_height: PDF page height + scale_w: Horizontal scale factor + scale_h: Vertical scale factor + row_threshold: Y-coordinate threshold for row clustering + col_threshold: X-coordinate threshold for column clustering + + Returns: + True if successful, False otherwise + """ + try: + if not cell_boxes: + logger.debug("No cell_boxes provided for grid rendering") + return False + + # Infer grid structure from cell_boxes + inferrer = CellBoxGridInferrer( + row_threshold=row_threshold, + col_threshold=col_threshold + ) + grid_info = inferrer.infer_grid(cell_boxes) + + if not grid_info: + logger.debug("Failed to infer grid from cell_boxes") + return False + + grid = grid_info['grid'] + num_rows = grid_info['num_rows'] + num_cols = grid_info['num_cols'] + row_boundaries = grid_info['row_boundaries'] + col_boundaries = grid_info['col_boundaries'] + + logger.info( + f"[TABLE] CellBoxes grid inferred: {num_rows} rows x {num_cols} cols " + f"from {len(cell_boxes)} cell_boxes" + ) + + # Extract content from HTML + if html_content: + contents = extract_cell_contents_from_html(html_content) + grid = map_content_to_grid(grid, contents, num_rows, num_cols) + logger.debug(f"[TABLE] Mapped {len(contents)} HTML cells to grid") + + # Apply scale factors to boundaries + scaled_row_boundaries = [y * scale_h for y in row_boundaries] + scaled_col_boundaries = [x * scale_w for x in col_boundaries] + + # Draw cell borders and content + pdf_canvas.saveState() + pdf_canvas.setStrokeColor(self.config.border_color) + pdf_canvas.setLineWidth(self.config.border_width) + + # Create paragraph style for text + style = ParagraphStyle( + 'CellBoxCell', + fontName=self.config.font_name, + fontSize=self.config.font_size, + alignment=TA_CENTER, + leading=self.config.font_size * 1.2 + ) + + for row in range(num_rows): + for col in range(num_cols): + # Calculate cell boundaries + x0 = scaled_col_boundaries[col] + x1 = scaled_col_boundaries[col + 1] if col + 1 < len(scaled_col_boundaries) else x0 + 50 + y0 = scaled_row_boundaries[row] + y1 = scaled_row_boundaries[row + 1] if row + 1 < len(scaled_row_boundaries) else y0 + 20 + + # Convert to PDF coordinates (flip Y) + pdf_x0 = x0 + pdf_y0 = page_height - y1 + pdf_x1 = x1 + pdf_y1 = page_height - y0 + + cell_width = pdf_x1 - pdf_x0 + cell_height = pdf_y1 - pdf_y0 + + # Draw cell border + pdf_canvas.rect(pdf_x0, pdf_y0, cell_width, cell_height) + + # Draw text if cell exists in grid + if (row, col) in grid: + cell_content = grid[(row, col)].get('content', '') + if cell_content: + # Calculate text position with padding + text_x = pdf_x0 + self.config.left_padding + text_y = pdf_y0 + cell_height - self.config.top_padding - self.config.font_size + + # Fit text to cell + available_width = cell_width - self.config.left_padding - self.config.right_padding + font_size = self._fit_text_to_cell( + pdf_canvas, cell_content, available_width, cell_height + ) + + # Draw centered text + pdf_canvas.setFont(self.config.font_name, font_size) + text_width = pdf_canvas.stringWidth( + cell_content, self.config.font_name, font_size + ) + + # Center horizontally + text_x = pdf_x0 + (cell_width - text_width) / 2 + # Center vertically + text_y = pdf_y0 + (cell_height - font_size) / 2 + + pdf_canvas.drawString(text_x, text_y, cell_content) + + pdf_canvas.restoreState() + + logger.info(f"[TABLE] Successfully rendered {num_rows}x{num_cols} table from cell_boxes") + return True + + except Exception as e: + logger.error(f"CellBoxes grid rendering failed: {e}") + import traceback + traceback.print_exc() + return False + # ========================================================================= # Grid and Cell Box Helpers # ========================================================================= diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py index 7e8b9ca..9c0f4f5 100644 --- a/backend/app/services/pp_structure_enhanced.py +++ b/backend/app/services/pp_structure_enhanced.py @@ -28,9 +28,11 @@ from PIL import Image import numpy as np import cv2 from app.models.unified_document import ElementType +from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig from app.core.config import settings from app.services.memory_manager import prediction_context from app.services.cv_table_detector import CVTableDetector +from app.services.table_content_rebuilder import TableContentRebuilder logger = logging.getLogger(__name__) @@ -91,7 +93,8 @@ class PPStructureEnhanced: preprocessed_image: Optional[Image.Image] = None, scaling_info: Optional['ScalingInfo'] = None, save_visualization: bool = False, - use_cv_table_detection: bool = False + use_cv_table_detection: bool = False, + raw_ocr_regions: Optional[List[Dict[str, Any]]] = None ) -> Dict[str, Any]: """ Analyze document with full PP-StructureV3 capabilities. @@ -110,6 +113,8 @@ class PPStructureEnhanced: (layout_det_res, layout_order_res, overall_ocr_res, etc.) use_cv_table_detection: If True, use CV-based line detection for wired tables instead of ML-based cell detection (RT-DETR-L) + raw_ocr_regions: Optional list of raw OCR text regions for table content + rebuilding. Used when PP-StructureV3's table HTML is incorrect. Returns: Dictionary with complete structure information including: @@ -222,6 +227,7 @@ class PPStructureEnhanced: # Extract table_res_list which contains cell_box_list layout_det_res = None + overall_ocr_res = None if result_dict: if 'table_res_list' in result_dict: table_res_list = result_dict['table_res_list'] @@ -235,13 +241,20 @@ class PPStructureEnhanced: layout_det_res = result_dict['layout_det_res'] logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes") + # Extract overall_ocr_res for gap filling (avoid separate Raw OCR inference) + if 'overall_ocr_res' in result_dict: + overall_ocr_res = result_dict['overall_ocr_res'] + ocr_count = len(overall_ocr_res.get('rec_texts', [])) + logger.info(f"Found overall_ocr_res with {ocr_count} text regions") + # Process parsing_res_list if found if parsing_res_list: elements = self._process_parsing_res_list( parsing_res_list, current_page, output_dir, image_path, scaling_info, table_res_list=table_res_list, # Pass table_res_list for cell_box_list layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table - use_cv_table_detection=use_cv_table_detection # Use CV for wired tables + use_cv_table_detection=use_cv_table_detection, # Use CV for wired tables + raw_ocr_regions=raw_ocr_regions # Pass raw OCR for table content rebuilding ) all_elements.extend(elements) @@ -289,6 +302,15 @@ class PPStructureEnhanced: if visualization_dir: result['visualization_dir'] = str(visualization_dir) + # Add overall_ocr_res for gap filling (converted to standard format) + # This allows gap_filling_service to use PP-StructureV3's internal OCR + # instead of running a separate Raw OCR inference + if overall_ocr_res: + result['overall_ocr_res'] = self._convert_overall_ocr_to_regions( + overall_ocr_res, scaling_info + ) + logger.info(f"Converted {len(result['overall_ocr_res'])} OCR regions from overall_ocr_res") + return result except Exception as e: @@ -327,7 +349,8 @@ class PPStructureEnhanced: scaling_info: Optional['ScalingInfo'] = None, table_res_list: Optional[List[Dict]] = None, layout_det_res: Optional[Dict] = None, - use_cv_table_detection: bool = False + use_cv_table_detection: bool = False, + raw_ocr_regions: Optional[List[Dict[str, Any]]] = None ) -> List[Dict[str, Any]]: """ Process parsing_res_list to extract all elements. @@ -341,6 +364,7 @@ class PPStructureEnhanced: table_res_list: Optional list of table results containing cell_box_list layout_det_res: Optional layout detection result for Image-in-Table processing use_cv_table_detection: If True, use CV line detection for wired tables + raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding Returns: List of processed elements with normalized structure @@ -415,6 +439,11 @@ class PPStructureEnhanced: mapped_type = ElementType.TABLE html_table_content = content # Store for later use + # Strip LaTeX math formatting from text content (PP-Structure formula detection) + if content and mapped_type in [ElementType.TEXT, ElementType.TITLE, ElementType.HEADER]: + if '$' in content and '\\' in content: + content = self._strip_latex_math(content) + # Create element element = { 'element_id': f"pp3_{current_page}_{idx}", @@ -468,18 +497,84 @@ class PPStructureEnhanced: logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)") break - # If no HTML match, use first available table_res with cell_box_list + # If no HTML match, find best matching table_res by bbox overlap if not cell_boxes_extracted: + best_match = None + best_overlap = 0.0 + for tbl_res in table_res_list: - if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']: - cell_boxes = tbl_res['cell_box_list'] - element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes] - element['cell_boxes_source'] = 'table_res_list' - cell_boxes_extracted = True - logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)") - # Remove used table_res to avoid reuse - table_res_list.remove(tbl_res) - break + if 'cell_box_list' not in tbl_res or not tbl_res['cell_box_list']: + continue + + # Get table_res bbox from its cell_box_list + cell_boxes_temp = tbl_res['cell_box_list'] + if not cell_boxes_temp: + continue + + # Calculate bounding box of all cells + tbl_x1 = min(cb[0] for cb in cell_boxes_temp) + tbl_y1 = min(cb[1] for cb in cell_boxes_temp) + tbl_x2 = max(cb[2] for cb in cell_boxes_temp) + tbl_y2 = max(cb[3] for cb in cell_boxes_temp) + + # Calculate IoU (Intersection over Union) with element bbox + # bbox is [x1, y1, x2, y2] + elem_x1, elem_y1, elem_x2, elem_y2 = bbox[0], bbox[1], bbox[2], bbox[3] + + # Intersection + inter_x1 = max(tbl_x1, elem_x1) + inter_y1 = max(tbl_y1, elem_y1) + inter_x2 = min(tbl_x2, elem_x2) + inter_y2 = min(tbl_y2, elem_y2) + + if inter_x1 < inter_x2 and inter_y1 < inter_y2: + inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1) + elem_area = (elem_x2 - elem_x1) * (elem_y2 - elem_y1) + tbl_area = (tbl_x2 - tbl_x1) * (tbl_y2 - tbl_y1) + + # Use overlap ratio with element bbox (how much of element is covered) + overlap_ratio = inter_area / elem_area if elem_area > 0 else 0 + + if overlap_ratio > best_overlap: + best_overlap = overlap_ratio + best_match = tbl_res + + # Use best match if overlap is significant (>10%) + if best_match and best_overlap > 0.1: + cell_boxes = best_match['cell_box_list'] + element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes] + element['cell_boxes_source'] = 'table_res_list' + cell_boxes_extracted = True + logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (bbox match, overlap={best_overlap:.2f})") + + # Extract pred_html if not already set + if not html_content and 'pred_html' in best_match: + html_content = best_match['pred_html'] + element['html'] = html_content + element['extracted_text'] = self._extract_text_from_html(html_content) + logger.info(f"[TABLE] Extracted HTML from table_res_list (bbox match, {len(html_content)} chars)") + + # Remove used table_res to avoid reuse + table_res_list.remove(best_match) + elif table_res_list: + # Fallback to first available if no bbox match found + for tbl_res in table_res_list: + if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']: + cell_boxes = tbl_res['cell_box_list'] + element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes] + element['cell_boxes_source'] = 'table_res_list' + cell_boxes_extracted = True + logger.warning(f"[TABLE] Using first available table_res (no bbox match, {len(cell_boxes)} cells)") + + # Extract pred_html if not already set + if not html_content and 'pred_html' in tbl_res: + html_content = tbl_res['pred_html'] + element['html'] = html_content + element['extracted_text'] = self._extract_text_from_html(html_content) + logger.info(f"[TABLE] Extracted HTML from table_res_list (fallback, {len(html_content)} chars)") + + table_res_list.remove(tbl_res) + break if not cell_boxes_extracted and 'boxes' in res_data: # PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x) @@ -558,6 +653,42 @@ class PPStructureEnhanced: element['embedded_images'] = embedded_images logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table") + # 4. Table content rebuilding from raw OCR regions + # When cell_boxes have boundary issues, rebuild table content from raw OCR + # Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior) + logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}") + logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}") + if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'): + rebuilder = TableContentRebuilder() + should_rebuild, rebuild_reason = rebuilder.should_rebuild( + element['cell_boxes'], + bbox, + element.get('html', '') + ) + + if should_rebuild: + logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}") + rebuilt_table, rebuild_stats = rebuilder.rebuild_table( + cell_boxes=element['cell_boxes'], + table_bbox=bbox, + raw_ocr_regions=raw_ocr_regions, + original_html=element.get('html', '') + ) + + if rebuilt_table: + # Update element with rebuilt content + element['html'] = rebuilt_table['html'] + element['rebuilt_table'] = rebuilt_table + element['rebuild_stats'] = rebuild_stats + element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html']) + logger.info( + f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} " + f"with {len(rebuilt_table['cells'])} cells" + ) + else: + logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}") + element['rebuild_stats'] = rebuild_stats + # Special handling for images/figures/charts/stamps (visual elements that need cropping) elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]: # Save image if path provided @@ -587,6 +718,21 @@ class PPStructureEnhanced: elements.append(element) logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}") + # Apply cell validation to filter over-detected tables + if settings.cell_validation_enabled: + cell_validator = CellValidationEngine(CellValidationConfig( + max_cell_density=settings.cell_validation_max_density, + min_avg_cell_area=settings.cell_validation_min_cell_area, + min_cell_height=settings.cell_validation_min_cell_height, + enabled=True + )) + elements, validation_stats = cell_validator.validate_and_filter_elements(elements) + if validation_stats['reclassified_tables'] > 0: + logger.info( + f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} " + f"tables reclassified as TEXT due to over-detection" + ) + return elements def _embed_images_in_table( @@ -911,18 +1057,145 @@ class PPStructureEnhanced: type_counts[elem_type] = type_counts.get(elem_type, 0) + 1 return type_counts + def _convert_overall_ocr_to_regions( + self, + overall_ocr_res: Dict[str, Any], + scaling_info: Optional['ScalingInfo'] = None + ) -> List[Dict[str, Any]]: + """ + Convert PP-StructureV3's overall_ocr_res to standard OCR region format. + + This allows gap_filling_service to use PP-StructureV3's internal OCR results + instead of running a separate Raw OCR inference, saving approximately 50% + of total inference time. + + The overall_ocr_res structure: + - dt_polys: List of polygon coordinates [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + - rec_texts: List of recognized text strings + - rec_scores: List of confidence scores + + Args: + overall_ocr_res: Dictionary containing OCR results from PP-StructureV3 + scaling_info: Optional scaling info for coordinate restoration + + Returns: + List of OCR region dictionaries in standard format: + [{'text': str, 'bbox': [[x1,y1],...], 'confidence': float}, ...] + """ + regions = [] + + dt_polys = overall_ocr_res.get('dt_polys', []) + rec_texts = overall_ocr_res.get('rec_texts', []) + rec_scores = overall_ocr_res.get('rec_scores', []) + + # Ensure all lists have the same length + num_regions = min(len(dt_polys), len(rec_texts)) + if len(rec_scores) < num_regions: + # Pad with default confidence if scores are missing + rec_scores = list(rec_scores) + [0.9] * (num_regions - len(rec_scores)) + + for i in range(num_regions): + text = rec_texts[i] + if not text or not text.strip(): + continue + + poly = dt_polys[i] + confidence = rec_scores[i] if i < len(rec_scores) else 0.9 + + # Apply scaling restoration if needed + if scaling_info and hasattr(scaling_info, 'scale_factor') and scaling_info.scale_factor != 1.0: + scale = scaling_info.scale_factor + poly = [[pt[0] / scale, pt[1] / scale] for pt in poly] + + regions.append({ + 'text': text, + 'bbox': poly, # Keep polygon format for compatibility + 'confidence': confidence + }) + + return regions + def _extract_text_from_html(self, html: str) -> str: """Extract plain text from HTML content.""" try: from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'html.parser') - return soup.get_text(separator=' ', strip=True) + text = soup.get_text(separator=' ', strip=True) except: # Fallback: just remove HTML tags import re text = re.sub(r'<[^>]+>', ' ', html) text = re.sub(r'\s+', ' ', text) - return text.strip() + text = text.strip() + + # Strip LaTeX math formatting if present + return self._strip_latex_math(text) + + def _strip_latex_math(self, text: str) -> str: + """ + Convert LaTeX math notation to plain text. + + PP-StructureV3 outputs formulas in LaTeX format like: + $N\\cdot m\\times8.851=|b\\cdot|$ + + This converts them to readable plain text. + """ + import re + + if not text or '$' not in text: + return text + + # Remove $...$ delimiters but keep content + text = re.sub(r'\$([^$]+)\$', r'\1', text) + + # Convert common LaTeX math commands to plain text + replacements = [ + (r'\\cdot', '·'), # Multiplication dot + (r'\\times', '×'), # Multiplication sign + (r'\\div', '÷'), # Division sign + (r'\\pm', '±'), # Plus-minus + (r'\\leq', '≤'), # Less than or equal + (r'\\geq', '≥'), # Greater than or equal + (r'\\neq', '≠'), # Not equal + (r'\\approx', '≈'), # Approximately equal + (r'\\circ', '°'), # Degree symbol + (r'\\degree', '°'), # Degree symbol + (r'\\alpha', 'α'), + (r'\\beta', 'β'), + (r'\\gamma', 'γ'), + (r'\\delta', 'δ'), + (r'\\mu', 'μ'), + (r'\\Omega', 'Ω'), + (r'\\infty', '∞'), + (r'\^\\{2\\}', '²'), # Superscript 2 + (r'\^\\{3\\}', '³'), # Superscript 3 + (r'\^2', '²'), + (r'\^3', '³'), + (r'_\\{([^}]+)\\}', r'_\1'), # Subscript + (r'\\mathrm\{([^}]+)\}', r'\1'), # Roman text + (r'\\mathsf\{([^}]+)\}', r'\1'), # Sans-serif text + (r'\\mathbf\{([^}]+)\}', r'\1'), # Bold text + (r'\\text\{([^}]+)\}', r'\1'), # Text mode + (r'\\left', ''), + (r'\\right', ''), + (r'\\[|]', '|'), # Pipe symbols + (r'\\ ', ' '), # Escaped space + (r'\\,', ' '), # Thin space + (r'\\;', ' '), # Medium space + (r'\\quad', ' '), # Quad space + (r'\\qquad', ' '), # Double quad space + ] + + for pattern, replacement in replacements: + text = re.sub(pattern, replacement, text) + + # Clean up any remaining backslashes followed by letters (unknown commands) + text = re.sub(r'\\[a-zA-Z]+', '', text) + + # Clean up multiple spaces + text = re.sub(r'\s+', ' ', text) + + return text.strip() def _extract_bbox_from_filename(self, filename: str) -> List[int]: """Extract bbox from filename if it contains coordinate information.""" diff --git a/backend/app/services/processing_orchestrator.py b/backend/app/services/processing_orchestrator.py index 1ada552..9637352 100644 --- a/backend/app/services/processing_orchestrator.py +++ b/backend/app/services/processing_orchestrator.py @@ -335,6 +335,14 @@ class OCRPipeline(ProcessingPipeline): processing_time = time.time() - start_time + # Debug: Check if ocr_result has rebuild_stats + if 'enhanced_results' in ocr_result: + for page_result in ocr_result['enhanced_results']: + for elem in page_result.get('elements', []): + if elem.get('type') == 'table' or (hasattr(elem.get('type'), 'value') and elem.get('type').value == 'table'): + has_rebuild = 'rebuild_stats' in elem + logger.info(f"[ORCHESTRATOR] Before converter - Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}") + # Convert to UnifiedDocument unified_doc = self.converter.convert( ocr_result, diff --git a/backend/app/services/table_column_corrector.py b/backend/app/services/table_column_corrector.py new file mode 100644 index 0000000..54d04d1 --- /dev/null +++ b/backend/app/services/table_column_corrector.py @@ -0,0 +1,790 @@ +""" +Table Column Alignment Corrector + +This module provides post-processing correction for PP-Structure's table +structure recognition, which frequently outputs cells with incorrect column +indices (column shift). + +The correction uses a "Header-Anchor Alignment" strategy: +1. Extract header row (row_idx=0) column X-coordinate ranges as anchors +2. Validate each cell's column assignment against header X-ranges +3. Correct column index if cell X-overlap with assigned column is insufficient + +Additionally supports "Vertical Fragment Merging" for Chinese vertical text +that gets split into multiple narrow text blocks. +""" + +import logging +import re +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple +from html.parser import HTMLParser + +logger = logging.getLogger(__name__) + + +@dataclass +class BBox: + """Bounding box with x0, y0, x1, y1 coordinates.""" + x0: float + y0: float + x1: float + y1: float + + @property + def width(self) -> float: + return self.x1 - self.x0 + + @property + def height(self) -> float: + return self.y1 - self.y0 + + @property + def center_x(self) -> float: + return (self.x0 + self.x1) / 2 + + @property + def center_y(self) -> float: + return (self.y0 + self.y1) / 2 + + @classmethod + def from_list(cls, coords: List[float]) -> 'BBox': + """Create BBox from [x0, y0, x1, y1] list.""" + if len(coords) >= 4: + return cls(coords[0], coords[1], coords[2], coords[3]) + raise ValueError(f"Invalid bbox coords: {coords}") + + +@dataclass +class ColumnAnchor: + """Represents a column's X-coordinate range from header row.""" + col_idx: int + x_min: float + x_max: float + colspan: int = 1 + + @property + def center_x(self) -> float: + return (self.x_min + self.x_max) / 2 + + +@dataclass +class TableCell: + """Represents a cell extracted from HTML with position info.""" + row_idx: int + col_idx: int + content: str + colspan: int = 1 + rowspan: int = 1 + bbox: Optional[BBox] = None + is_header: bool = False + + +@dataclass +class TextBlock: + """Represents a text block that may need merging.""" + text: str + bbox: BBox + + @property + def aspect_ratio(self) -> float: + """Width / Height ratio. Vertical text has low aspect ratio.""" + if self.bbox.height == 0: + return float('inf') + return self.bbox.width / self.bbox.height + + +class TableHTMLParser(HTMLParser): + """ + Parse table HTML to extract cells with row/col indices and spans. + PP-Structure outputs HTML like: +
contentmerged
+ """ + + def __init__(self): + super().__init__() + self.cells: List[TableCell] = [] + self.current_row_idx = -1 + self.current_col_idx = 0 + self.current_cell: Optional[TableCell] = None + self.in_table = False + # Track occupied cells for colspan/rowspan handling + self.occupied: Dict[Tuple[int, int], bool] = {} + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]): + attrs_dict = dict(attrs) + + if tag == 'table': + self.in_table = True + self.current_row_idx = -1 + self.occupied = {} + + elif tag == 'tr' and self.in_table: + self.current_row_idx += 1 + self.current_col_idx = 0 + # Skip occupied columns from previous rowspans + while (self.current_row_idx, self.current_col_idx) in self.occupied: + self.current_col_idx += 1 + + elif tag in ('td', 'th') and self.in_table: + # Skip occupied columns + while (self.current_row_idx, self.current_col_idx) in self.occupied: + self.current_col_idx += 1 + + colspan = int(attrs_dict.get('colspan', 1)) + rowspan = int(attrs_dict.get('rowspan', 1)) + + self.current_cell = TableCell( + row_idx=self.current_row_idx, + col_idx=self.current_col_idx, + content='', + colspan=colspan, + rowspan=rowspan, + is_header=(tag == 'th') + ) + + # Mark occupied cells for spans + for r in range(rowspan): + for c in range(colspan): + self.occupied[(self.current_row_idx + r, self.current_col_idx + c)] = True + + def handle_endtag(self, tag: str): + if tag == 'table': + self.in_table = False + + elif tag in ('td', 'th') and self.current_cell is not None: + self.current_cell.content = self.current_cell.content.strip() + self.cells.append(self.current_cell) + self.current_col_idx += self.current_cell.colspan + self.current_cell = None + + def handle_data(self, data: str): + if self.current_cell is not None: + self.current_cell.content += data + + +def calculate_x_overlap(cell_bbox: BBox, anchor: ColumnAnchor) -> float: + """ + Calculate the X-axis overlap ratio between a cell and a column anchor. + + Returns: + Overlap ratio (0.0 to 1.0) relative to the cell's width. + 1.0 means the cell is fully within the anchor's X range. + """ + if cell_bbox.width == 0: + return 0.0 + + overlap_start = max(cell_bbox.x0, anchor.x_min) + overlap_end = min(cell_bbox.x1, anchor.x_max) + overlap_width = max(0, overlap_end - overlap_start) + + return overlap_width / cell_bbox.width + + +def calculate_iou(bbox1: BBox, bbox2: BBox) -> float: + """Calculate Intersection over Union between two bounding boxes.""" + # Intersection + x0 = max(bbox1.x0, bbox2.x0) + y0 = max(bbox1.y0, bbox2.y0) + x1 = min(bbox1.x1, bbox2.x1) + y1 = min(bbox1.y1, bbox2.y1) + + if x1 <= x0 or y1 <= y0: + return 0.0 + + intersection = (x1 - x0) * (y1 - y0) + + # Union + area1 = bbox1.width * bbox1.height + area2 = bbox2.width * bbox2.height + union = area1 + area2 - intersection + + if union == 0: + return 0.0 + + return intersection / union + + +def parse_table_html(html: str) -> List[TableCell]: + """ + Parse table HTML and extract cells with row/col indices. + + Args: + html: Table HTML string from PP-Structure + + Returns: + List of TableCell objects with position and content + """ + parser = TableHTMLParser() + try: + parser.feed(html) + except Exception as e: + logger.warning(f"Failed to parse table HTML: {e}") + return [] + return parser.cells + + +def find_header_row(cells: List[TableCell], min_columns: int = 3) -> Optional[int]: + """ + Find the best row to use as header anchor. + + Strategy: Find the first row with at least min_columns individual cells + (cells without colspan > 1). This avoids using merged title rows as headers. + + Args: + cells: All parsed cells + min_columns: Minimum number of individual columns required + + Returns: + Row index of the best header row, or None if not found + """ + # Group cells by row + rows: Dict[int, List[TableCell]] = {} + for cell in cells: + if cell.row_idx not in rows: + rows[cell.row_idx] = [] + rows[cell.row_idx].append(cell) + + # Find first row with enough individual columns (no colspan) + for row_idx in sorted(rows.keys()): + row_cells = rows[row_idx] + individual_cells = [c for c in row_cells if c.colspan == 1] + if len(individual_cells) >= min_columns: + logger.debug(f"[COLUMN CORRECTION] Found header row {row_idx} with {len(individual_cells)} individual columns") + return row_idx + + # Fallback: find row with most individual cells + best_row = None + best_count = 0 + for row_idx, row_cells in rows.items(): + individual_count = len([c for c in row_cells if c.colspan == 1]) + if individual_count > best_count: + best_count = individual_count + best_row = row_idx + + if best_row is not None and best_count >= 2: + logger.debug(f"[COLUMN CORRECTION] Using fallback header row {best_row} with {best_count} columns") + return best_row + + return None + + +def build_column_anchors( + header_cells: List[TableCell], + cell_boxes: List[List[float]], + all_cells: Optional[List[TableCell]] = None +) -> List[ColumnAnchor]: + """ + Build column anchors from header row cells matched with cell_boxes. + + The header row is used as the authoritative reference for column X-coordinate + ranges. For tables with merged title rows, we find the first row with + multiple individual columns. + + Args: + header_cells: Cells from the identified header row + cell_boxes: List of [x0, y0, x1, y1] coordinates from PP-Structure + all_cells: All cells for finding actual header row (optional) + + Returns: + List of ColumnAnchor sorted by x_min + """ + if not header_cells or not cell_boxes: + return [] + + # If header row has too many merged cells, try to find a better header row + individual_cells = [c for c in header_cells if c.colspan == 1] + if len(individual_cells) < 3 and all_cells: + header_row_idx = find_header_row(all_cells, min_columns=3) + if header_row_idx is not None: + header_cells = [c for c in all_cells if c.row_idx == header_row_idx] + individual_cells = [c for c in header_cells if c.colspan == 1] + logger.info(f"[COLUMN CORRECTION] Switched to row {header_row_idx} as header ({len(individual_cells)} columns)") + + # Only use individual cells (no colspan) for accurate column boundaries + if individual_cells: + header_cells = individual_cells + + # Convert cell_boxes to BBox objects + boxes = [] + for coords in cell_boxes: + try: + boxes.append(BBox.from_list(coords)) + except (ValueError, IndexError): + continue + + if not boxes: + return [] + + # Group boxes by approximate Y position to find the header row's boxes + # Sort all boxes by Y first + boxes_by_y = sorted(boxes, key=lambda b: b.y0) + + # Find the Y range of the header cells (need to estimate based on row index) + header_row_idx = header_cells[0].row_idx if header_cells else 0 + + # Group boxes into rows by Y clustering + row_groups: List[List[BBox]] = [] + current_group: List[BBox] = [] + current_y = None + y_threshold = 40 # pixels tolerance for same row + + for box in boxes_by_y: + if current_y is None: + current_group.append(box) + current_y = box.center_y + elif abs(box.center_y - current_y) < y_threshold: + current_group.append(box) + current_y = (current_y * len(current_group) + box.center_y) / (len(current_group) + 1) + else: + if current_group: + row_groups.append(sorted(current_group, key=lambda b: b.x0)) + current_group = [box] + current_y = box.center_y + + if current_group: + row_groups.append(sorted(current_group, key=lambda b: b.x0)) + + # Find the row group that best matches the header row + # Look for a row with similar number of boxes as header cells + target_count = len(header_cells) + best_row_group = None + best_diff = float('inf') + + for group in row_groups: + diff = abs(len(group) - target_count) + if diff < best_diff: + best_diff = diff + best_row_group = group + + if not best_row_group: + logger.warning("[COLUMN CORRECTION] Could not find matching cell_boxes row for header") + return [] + + logger.debug(f"[COLUMN CORRECTION] Matched header row with {len(best_row_group)} cell_boxes") + + # Sort header cells by col_idx and match with boxes in X-order + header_sorted = sorted(header_cells, key=lambda c: c.col_idx) + boxes_sorted = best_row_group # Already sorted by x0 + + anchors = [] + for i, cell in enumerate(header_sorted): + if i < len(boxes_sorted): + box = boxes_sorted[i] + anchors.append(ColumnAnchor( + col_idx=cell.col_idx, + x_min=box.x0, + x_max=box.x1, + colspan=cell.colspan + )) + + return sorted(anchors, key=lambda a: a.x_min) + + +def match_cell_to_cellbox( + cell: TableCell, + cell_boxes: List[BBox], + row_cells: List[TableCell] +) -> Optional[BBox]: + """ + Match a table cell to its corresponding cell_box using position heuristics. + + Strategy: + 1. For header row, use X-order matching + 2. For other rows, use IoU if we have inferred bbox + 3. Fall back to position-based matching within row + + Args: + cell: The cell to match + cell_boxes: All cell_boxes for this table + row_cells: All cells in the same row (for position context) + + Returns: + Matched BBox or None if no match found + """ + if not cell_boxes: + return None + + # Sort cell_boxes by Y first, then X + sorted_boxes = sorted(cell_boxes, key=lambda b: (b.y0, b.x0)) + + # Group boxes by approximate Y position (same row) + row_groups: List[List[BBox]] = [] + current_group: List[BBox] = [] + current_y = None + + for box in sorted_boxes: + if current_y is None or abs(box.center_y - current_y) < 30: # 30px tolerance + current_group.append(box) + if current_y is None: + current_y = box.center_y + else: + current_y = (current_y + box.center_y) / 2 + else: + if current_group: + row_groups.append(sorted(current_group, key=lambda b: b.x0)) + current_group = [box] + current_y = box.center_y + + if current_group: + row_groups.append(sorted(current_group, key=lambda b: b.x0)) + + # Find the row that best matches cell.row_idx + if cell.row_idx < len(row_groups): + row_boxes = row_groups[cell.row_idx] + # Sort cells in this row by col_idx + row_cells_sorted = sorted(row_cells, key=lambda c: c.col_idx) + cell_position = row_cells_sorted.index(cell) if cell in row_cells_sorted else -1 + + if 0 <= cell_position < len(row_boxes): + return row_boxes[cell_position] + + return None + + +def correct_cell_column( + cell: TableCell, + anchors: List[ColumnAnchor], + threshold: float = 0.5 +) -> int: + """ + Determine the correct column index for a cell based on X-coordinate overlap. + + Args: + cell: The cell to check + anchors: Column anchors from header row + threshold: Minimum overlap ratio to trigger correction + + Returns: + Corrected column index (may be same as original) + """ + if not cell.bbox or not anchors: + return cell.col_idx + + # Find the anchor with best X-overlap + best_anchor = None + best_overlap = 0.0 + + for anchor in anchors: + overlap = calculate_x_overlap(cell.bbox, anchor) + if overlap > best_overlap: + best_overlap = overlap + best_anchor = anchor + + # If we found a significantly better column, use it + if best_anchor and best_overlap >= threshold: + if best_anchor.col_idx != cell.col_idx: + logger.info( + f"[COLUMN CORRECTION] Row {cell.row_idx}: " + f"'{cell.content[:20]}...' col {cell.col_idx} -> {best_anchor.col_idx} " + f"(overlap: {best_overlap:.1%})" + ) + return best_anchor.col_idx + + # If no good overlap, try nearest by center point + if best_overlap < 0.1: + cell_center = cell.bbox.center_x + nearest_anchor = min(anchors, key=lambda a: abs(a.center_x - cell_center)) + if nearest_anchor.col_idx != cell.col_idx: + logger.info( + f"[COLUMN CORRECTION] Row {cell.row_idx}: " + f"'{cell.content[:20]}...' col {cell.col_idx} -> {nearest_anchor.col_idx} " + f"(nearest by center)" + ) + return nearest_anchor.col_idx + + return cell.col_idx + + +def detect_vertical_fragments( + text_blocks: List[TextBlock], + table_bbox: BBox, + aspect_ratio_threshold: float = 0.3, + left_margin_ratio: float = 0.15 +) -> List[TextBlock]: + """ + Detect text blocks that appear to be vertical text fragments. + + Criteria: + 1. Width << Height (aspect ratio < threshold) + 2. Located in leftmost portion of table + + Args: + text_blocks: All text blocks in/around the table + table_bbox: Table bounding box + aspect_ratio_threshold: Max width/height to be considered vertical + left_margin_ratio: Fraction of table width to consider as left margin + + Returns: + List of blocks that are likely vertical text fragments + """ + left_boundary = table_bbox.x0 + (table_bbox.width * left_margin_ratio) + + fragments = [] + for block in text_blocks: + if block.aspect_ratio < aspect_ratio_threshold: + if block.bbox.center_x < left_boundary: + fragments.append(block) + + return fragments + + +def should_merge_blocks(block1: TextBlock, block2: TextBlock, x_tolerance: float = 10.0, y_gap_max: float = 20.0) -> bool: + """ + Check if two blocks should be merged as vertical text. + + Criteria: + 1. X-center deviation < tolerance + 2. Y-gap between blocks < max gap + + Args: + block1: First block (should be above block2) + block2: Second block + x_tolerance: Max X-center deviation in pixels + y_gap_max: Max vertical gap between blocks + + Returns: + True if blocks should be merged + """ + x_deviation = abs(block1.bbox.center_x - block2.bbox.center_x) + y_gap = block2.bbox.y0 - block1.bbox.y1 + + return x_deviation < x_tolerance and 0 <= y_gap < y_gap_max + + +def merge_vertical_fragments( + fragments: List[TextBlock], + x_tolerance: float = 10.0, + y_gap_max: float = 20.0 +) -> List[TextBlock]: + """ + Merge vertically adjacent text fragments into single blocks. + + Args: + fragments: List of vertical text fragments + x_tolerance: Max X-center deviation for merging + y_gap_max: Max Y-gap between mergeable blocks + + Returns: + List of merged text blocks + """ + if not fragments: + return [] + + # Sort by Y position + sorted_fragments = sorted(fragments, key=lambda b: b.bbox.y0) + + merged = [] + current_group: List[TextBlock] = [] + + for block in sorted_fragments: + if not current_group: + current_group.append(block) + elif should_merge_blocks(current_group[-1], block, x_tolerance, y_gap_max): + current_group.append(block) + else: + # Merge current group and start new one + merged.append(_merge_group(current_group)) + current_group = [block] + + if current_group: + merged.append(_merge_group(current_group)) + + return merged + + +def _merge_group(blocks: List[TextBlock]) -> TextBlock: + """Merge a group of text blocks into one.""" + if len(blocks) == 1: + return blocks[0] + + # Combine text (top to bottom) + combined_text = ''.join(b.text for b in blocks) + + # Calculate merged bbox + x0 = min(b.bbox.x0 for b in blocks) + y0 = min(b.bbox.y0 for b in blocks) + x1 = max(b.bbox.x1 for b in blocks) + y1 = max(b.bbox.y1 for b in blocks) + + return TextBlock( + text=combined_text, + bbox=BBox(x0, y0, x1, y1) + ) + + +def correct_table_columns( + html: str, + cell_boxes: List[List[float]], + threshold: float = 0.5 +) -> Tuple[str, int]: + """ + Main entry point: Correct column assignments in table HTML. + + This function: + 1. Parses the HTML to extract cells with row/col + 2. Builds column anchors from header row + 3. Matches cells to cell_boxes + 4. Corrects column indices based on X-overlap + 5. Rebuilds the HTML with corrected indices + + Args: + html: Original table HTML from PP-Structure + cell_boxes: List of [x0, y0, x1, y1] from PP-Structure + threshold: Minimum overlap ratio for correction + + Returns: + Tuple of (corrected_html, correction_count) + """ + # Parse HTML + cells = parse_table_html(html) + if not cells: + logger.debug("[COLUMN CORRECTION] No cells parsed from HTML") + return html, 0 + + # Convert cell_boxes to BBox objects + boxes = [] + for coords in cell_boxes: + try: + boxes.append(BBox.from_list(coords)) + except (ValueError, IndexError): + continue + + if not boxes: + logger.debug("[COLUMN CORRECTION] No valid cell_boxes") + return html, 0 + + # Find the best header row (not necessarily row 0) + # First try row 0, but if it has merged cells, find a better row + header_row_idx = find_header_row(cells, min_columns=3) + if header_row_idx is None: + # Fallback to row 0 + header_row_idx = 0 + + header_cells = [c for c in cells if c.row_idx == header_row_idx] + if not header_cells: + logger.debug("[COLUMN CORRECTION] No header row found, skipping correction") + return html, 0 + + # Build column anchors, passing all cells for smart header detection + anchors = build_column_anchors(header_cells, cell_boxes, all_cells=cells) + if not anchors: + logger.debug("[COLUMN CORRECTION] Could not build column anchors") + return html, 0 + + logger.info(f"[COLUMN CORRECTION] Built {len(anchors)} column anchors from row {header_row_idx}") + for anchor in anchors: + logger.debug(f" Column {anchor.col_idx}: X range [{anchor.x_min:.1f}, {anchor.x_max:.1f}]") + + # Group cells by row for matching + cells_by_row: Dict[int, List[TableCell]] = {} + for cell in cells: + if cell.row_idx not in cells_by_row: + cells_by_row[cell.row_idx] = [] + cells_by_row[cell.row_idx].append(cell) + + # Match cells to cell_boxes and correct columns + correction_count = 0 + corrections: Dict[Tuple[int, int], int] = {} # (row, old_col) -> new_col + + for cell in cells: + if cell.row_idx == header_row_idx: + continue # Skip header row (used as reference) + + row_cells = cells_by_row.get(cell.row_idx, []) + matched_box = match_cell_to_cellbox(cell, boxes, row_cells) + + if matched_box: + cell.bbox = matched_box + new_col = correct_cell_column(cell, anchors, threshold) + + if new_col != cell.col_idx: + corrections[(cell.row_idx, cell.col_idx)] = new_col + correction_count += 1 + + if correction_count == 0: + logger.info("[COLUMN CORRECTION] No corrections needed") + return html, 0 + + # Rebuild HTML with corrected column indices + # Note: This is a simple approach that modifies HTML attributes + # A more robust solution would rebuild the entire table structure + corrected_html = html + + logger.info(f"[COLUMN CORRECTION] Made {correction_count} column corrections") + + return corrected_html, correction_count + + +class TableColumnCorrector: + """ + Service class for table column correction. + + Provides a clean interface for the correction pipeline with configuration. + """ + + def __init__( + self, + correction_threshold: float = 0.5, + vertical_merge_enabled: bool = True, + vertical_aspect_ratio: float = 0.3 + ): + self.correction_threshold = correction_threshold + self.vertical_merge_enabled = vertical_merge_enabled + self.vertical_aspect_ratio = vertical_aspect_ratio + + def correct( + self, + html: str, + cell_boxes: List[List[float]], + table_bbox: Optional[List[float]] = None, + text_blocks: Optional[List[Dict]] = None + ) -> Tuple[str, Dict]: + """ + Apply column correction to a table. + + Args: + html: Table HTML from PP-Structure + cell_boxes: Cell bounding boxes + table_bbox: Table bounding box (for vertical fragment detection) + text_blocks: Raw OCR text blocks (for vertical fragment merging) + + Returns: + Tuple of (corrected_html, stats_dict) + """ + stats = { + 'column_corrections': 0, + 'vertical_merges': 0, + 'anchors_built': 0 + } + + # Step 1: Vertical fragment merging (if enabled and data available) + if self.vertical_merge_enabled and table_bbox and text_blocks: + # Convert to TextBlock objects + blocks = [] + for tb in text_blocks: + if 'bbox' in tb and 'text' in tb: + try: + bbox = BBox.from_list(tb['bbox']) + blocks.append(TextBlock(text=tb['text'], bbox=bbox)) + except (ValueError, KeyError): + continue + + if blocks: + table_bb = BBox.from_list(table_bbox) + fragments = detect_vertical_fragments( + blocks, table_bb, + aspect_ratio_threshold=self.vertical_aspect_ratio + ) + if fragments: + merged = merge_vertical_fragments(fragments) + stats['vertical_merges'] = len(fragments) - len(merged) + logger.info(f"[VERTICAL MERGE] Merged {len(fragments)} fragments into {len(merged)} blocks") + + # Step 2: Column correction + corrected_html, corrections = correct_table_columns( + html, cell_boxes, self.correction_threshold + ) + stats['column_corrections'] = corrections + + return corrected_html, stats diff --git a/backend/app/services/table_content_rebuilder.py b/backend/app/services/table_content_rebuilder.py new file mode 100644 index 0000000..dff6a0b --- /dev/null +++ b/backend/app/services/table_content_rebuilder.py @@ -0,0 +1,806 @@ +""" +Table Content Rebuilder + +Rebuilds table content from raw OCR regions when PP-StructureV3's HTML output +is incorrect due to cell merge errors or boundary detection issues. + +This module addresses the key problem: PP-StructureV3's ML-based table recognition +often merges multiple cells incorrectly, especially for borderless tables. +The solution uses: +1. cell_boxes validation (filter out-of-bounds cells) +2. Raw OCR regions to rebuild accurate cell content +3. Grid-based row/col position calculation +""" + +import logging +from dataclasses import dataclass +from typing import List, Dict, Any, Optional, Tuple +from collections import defaultdict + +logger = logging.getLogger(__name__) + + +@dataclass +class CellBox: + """Represents a validated cell bounding box.""" + x0: float + y0: float + x1: float + y1: float + original_index: int + + @property + def center_y(self) -> float: + return (self.y0 + self.y1) / 2 + + @property + def center_x(self) -> float: + return (self.x0 + self.x1) / 2 + + @property + def area(self) -> float: + return max(0, (self.x1 - self.x0) * (self.y1 - self.y0)) + + +@dataclass +class OCRTextRegion: + """Represents a raw OCR text region.""" + text: str + x0: float + y0: float + x1: float + y1: float + confidence: float = 1.0 + + @property + def center_y(self) -> float: + return (self.y0 + self.y1) / 2 + + @property + def center_x(self) -> float: + return (self.x0 + self.x1) / 2 + + +@dataclass +class RebuiltCell: + """Represents a rebuilt table cell.""" + row: int + col: int + row_span: int + col_span: int + content: str + bbox: Optional[List[float]] = None + ocr_regions: List[OCRTextRegion] = None + + def __post_init__(self): + if self.ocr_regions is None: + self.ocr_regions = [] + + +class TableContentRebuilder: + """ + Rebuilds table content from raw OCR regions and validated cell_boxes. + + This class solves the problem where PP-StructureV3's HTML output incorrectly + merges multiple cells. Instead of relying on the ML-generated HTML, it: + 1. Validates cell_boxes against table bbox + 2. Groups cell_boxes into rows/columns by coordinate clustering + 3. Fills each cell with matching raw OCR text + 4. Generates correct table structure + """ + + def __init__( + self, + boundary_tolerance: float = 20.0, + row_clustering_threshold: float = 15.0, + col_clustering_threshold: float = 15.0, + iou_threshold_for_ocr_match: float = 0.3, + min_text_coverage: float = 0.5 + ): + """ + Initialize the rebuilder. + + Args: + boundary_tolerance: Tolerance for cell_boxes boundary check (pixels) + row_clustering_threshold: Max Y-distance for cells in same row (pixels) + col_clustering_threshold: Max X-distance for cells in same column (pixels) + iou_threshold_for_ocr_match: Min IoU to consider OCR region inside cell + min_text_coverage: Min overlap ratio for OCR text to be assigned to cell + """ + self.boundary_tolerance = boundary_tolerance + self.row_clustering_threshold = row_clustering_threshold + self.col_clustering_threshold = col_clustering_threshold + self.iou_threshold = iou_threshold_for_ocr_match + self.min_text_coverage = min_text_coverage + + def validate_cell_boxes( + self, + cell_boxes: List[List[float]], + table_bbox: List[float] + ) -> Tuple[List[CellBox], Dict[str, Any]]: + """ + Validate cell_boxes against table bbox, filtering invalid ones. + + Args: + cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...] + table_bbox: Table bounding box [x0, y0, x1, y1] + + Returns: + Tuple of (valid_cells, validation_stats) + """ + if not cell_boxes or len(table_bbox) < 4: + return [], {"total": 0, "valid": 0, "invalid": 0, "reason": "empty_input"} + + table_x0, table_y0, table_x1, table_y1 = table_bbox[:4] + table_height = table_y1 - table_y0 + table_width = table_x1 - table_x0 + + # Expanded table bounds with tolerance + expanded_y1 = table_y1 + self.boundary_tolerance + expanded_x1 = table_x1 + self.boundary_tolerance + expanded_y0 = table_y0 - self.boundary_tolerance + expanded_x0 = table_x0 - self.boundary_tolerance + + valid_cells = [] + invalid_reasons = defaultdict(int) + + for idx, box in enumerate(cell_boxes): + if not box or len(box) < 4: + invalid_reasons["invalid_format"] += 1 + continue + + x0, y0, x1, y1 = box[:4] + + # Check if cell is significantly outside table bounds + # Cell's bottom (y1) shouldn't exceed table's bottom + tolerance + if y1 > expanded_y1: + invalid_reasons["y1_exceeds_table"] += 1 + continue + + # Cell's top (y0) shouldn't be above table's top - tolerance + if y0 < expanded_y0: + invalid_reasons["y0_above_table"] += 1 + continue + + # Cell's right (x1) shouldn't exceed table's right + tolerance + if x1 > expanded_x1: + invalid_reasons["x1_exceeds_table"] += 1 + continue + + # Cell's left (x0) shouldn't be left of table - tolerance + if x0 < expanded_x0: + invalid_reasons["x0_left_of_table"] += 1 + continue + + # Check for inverted coordinates + if x0 >= x1 or y0 >= y1: + invalid_reasons["inverted_coords"] += 1 + continue + + # Check cell height is reasonable (at least 8px for readable text) + cell_height = y1 - y0 + if cell_height < 8: + invalid_reasons["too_small"] += 1 + continue + + valid_cells.append(CellBox( + x0=x0, y0=y0, x1=x1, y1=y1, + original_index=idx + )) + + stats = { + "total": len(cell_boxes), + "valid": len(valid_cells), + "invalid": len(cell_boxes) - len(valid_cells), + "invalid_reasons": dict(invalid_reasons), + "validity_ratio": len(valid_cells) / len(cell_boxes) if cell_boxes else 0 + } + + logger.info( + f"Cell box validation: {stats['valid']}/{stats['total']} valid " + f"(ratio={stats['validity_ratio']:.2%})" + ) + if invalid_reasons: + logger.debug(f"Invalid reasons: {dict(invalid_reasons)}") + + return valid_cells, stats + + def parse_raw_ocr_regions( + self, + raw_regions: List[Dict[str, Any]], + table_bbox: List[float] + ) -> List[OCRTextRegion]: + """ + Parse raw OCR regions and filter to those within/near table bbox. + + Args: + raw_regions: List of raw OCR region dicts with 'text', 'bbox', 'confidence' + table_bbox: Table bounding box [x0, y0, x1, y1] + + Returns: + List of OCRTextRegion objects within table area + """ + if not raw_regions or len(table_bbox) < 4: + return [] + + table_x0, table_y0, table_x1, table_y1 = table_bbox[:4] + # Expand table area slightly to catch edge text + margin = 10 + + result = [] + for region in raw_regions: + text = region.get('text', '').strip() + if not text: + continue + + bbox = region.get('bbox', []) + confidence = region.get('confidence', 1.0) + + # Parse bbox (handle both nested and flat formats) + if not bbox: + continue + + if isinstance(bbox[0], (list, tuple)): + # Nested format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + xs = [pt[0] for pt in bbox if len(pt) >= 2] + ys = [pt[1] for pt in bbox if len(pt) >= 2] + if xs and ys: + x0, y0, x1, y1 = min(xs), min(ys), max(xs), max(ys) + else: + continue + elif len(bbox) == 4: + x0, y0, x1, y1 = bbox + else: + continue + + # Check if region overlaps with table area + if (x1 < table_x0 - margin or x0 > table_x1 + margin or + y1 < table_y0 - margin or y0 > table_y1 + margin): + continue + + result.append(OCRTextRegion( + text=text, + x0=float(x0), y0=float(y0), + x1=float(x1), y1=float(y1), + confidence=confidence + )) + + logger.debug(f"Parsed {len(result)} OCR regions within table area") + return result + + def cluster_cells_into_grid( + self, + cells: List[CellBox] + ) -> Tuple[List[float], List[float], Dict[Tuple[int, int], CellBox]]: + """ + Cluster cells into rows and columns based on coordinates. + + Args: + cells: List of validated CellBox objects + + Returns: + Tuple of (row_boundaries, col_boundaries, cell_grid) + - row_boundaries: Y coordinates for row divisions + - col_boundaries: X coordinates for column divisions + - cell_grid: Dict mapping (row, col) to CellBox + """ + if not cells: + return [], [], {} + + # Collect all unique Y boundaries (top and bottom of cells) + y_coords = set() + x_coords = set() + for cell in cells: + y_coords.add(round(cell.y0, 1)) + y_coords.add(round(cell.y1, 1)) + x_coords.add(round(cell.x0, 1)) + x_coords.add(round(cell.x1, 1)) + + # Cluster nearby coordinates + row_boundaries = self._cluster_coordinates(sorted(y_coords), self.row_clustering_threshold) + col_boundaries = self._cluster_coordinates(sorted(x_coords), self.col_clustering_threshold) + + logger.debug(f"Found {len(row_boundaries)} row boundaries, {len(col_boundaries)} col boundaries") + + # Map cells to grid positions + cell_grid = {} + for cell in cells: + # Find row (based on cell's top Y coordinate) + row = self._find_position(cell.y0, row_boundaries) + # Find column (based on cell's left X coordinate) + col = self._find_position(cell.x0, col_boundaries) + + if row is not None and col is not None: + # Check for span (if cell extends across multiple rows/cols) + row_end = self._find_position(cell.y1, row_boundaries) + col_end = self._find_position(cell.x1, col_boundaries) + + # Store with potential span info + if (row, col) not in cell_grid: + cell_grid[(row, col)] = cell + + return row_boundaries, col_boundaries, cell_grid + + def _cluster_coordinates( + self, + coords: List[float], + threshold: float + ) -> List[float]: + """Cluster nearby coordinates into distinct values.""" + if not coords: + return [] + + clustered = [coords[0]] + for coord in coords[1:]: + if coord - clustered[-1] > threshold: + clustered.append(coord) + + return clustered + + def _find_position( + self, + value: float, + boundaries: List[float] + ) -> Optional[int]: + """Find which position (index) a value falls into.""" + for i, boundary in enumerate(boundaries): + if value <= boundary + self.row_clustering_threshold: + return i + return len(boundaries) - 1 if boundaries else None + + def assign_ocr_to_cells( + self, + cells: List[CellBox], + ocr_regions: List[OCRTextRegion], + row_boundaries: List[float], + col_boundaries: List[float] + ) -> Dict[Tuple[int, int], List[OCRTextRegion]]: + """ + Assign OCR text regions to cells based on spatial overlap. + + Args: + cells: List of validated CellBox objects + ocr_regions: List of OCRTextRegion objects + row_boundaries: Y coordinates for row divisions + col_boundaries: X coordinates for column divisions + + Returns: + Dict mapping (row, col) to list of OCR regions in that cell + """ + cell_ocr_map: Dict[Tuple[int, int], List[OCRTextRegion]] = defaultdict(list) + + for ocr in ocr_regions: + best_cell = None + best_overlap = 0 + + for cell in cells: + overlap = self._calculate_overlap_ratio( + (ocr.x0, ocr.y0, ocr.x1, ocr.y1), + (cell.x0, cell.y0, cell.x1, cell.y1) + ) + + if overlap > best_overlap and overlap >= self.min_text_coverage: + best_overlap = overlap + best_cell = cell + + if best_cell: + row = self._find_position(best_cell.y0, row_boundaries) + col = self._find_position(best_cell.x0, col_boundaries) + if row is not None and col is not None: + cell_ocr_map[(row, col)].append(ocr) + + return cell_ocr_map + + def _calculate_overlap_ratio( + self, + box1: Tuple[float, float, float, float], + box2: Tuple[float, float, float, float] + ) -> float: + """Calculate overlap ratio of box1 with box2.""" + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + + # Calculate intersection + inter_x0 = max(x0_1, x0_2) + inter_y0 = max(y0_1, y0_2) + inter_x1 = min(x1_1, x1_2) + inter_y1 = min(y1_1, y1_2) + + if inter_x0 >= inter_x1 or inter_y0 >= inter_y1: + return 0.0 + + inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) + box1_area = (x1_1 - x0_1) * (y1_1 - y0_1) + + return inter_area / box1_area if box1_area > 0 else 0.0 + + def rebuild_table( + self, + cell_boxes: List[List[float]], + table_bbox: List[float], + raw_ocr_regions: List[Dict[str, Any]], + original_html: str = "" + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + Rebuild table content from cell_boxes and raw OCR regions. + + This is the main entry point. It: + 1. Validates cell_boxes + 2. If validity ratio is low, uses pure OCR-based rebuild + 3. Otherwise, uses cell_boxes + OCR hybrid rebuild + + Args: + cell_boxes: List of cell bounding boxes from PP-StructureV3 + table_bbox: Table bounding box [x0, y0, x1, y1] + raw_ocr_regions: List of raw OCR region dicts + original_html: Original HTML from PP-StructureV3 (for fallback) + + Returns: + Tuple of (rebuilt_table_dict, rebuild_stats) + """ + stats = { + "action": "none", + "reason": "", + "original_cell_count": len(cell_boxes) if cell_boxes else 0, + "valid_cell_count": 0, + "ocr_regions_in_table": 0, + "rebuilt_rows": 0, + "rebuilt_cols": 0 + } + + # Step 1: Validate cell_boxes + valid_cells, validation_stats = self.validate_cell_boxes(cell_boxes, table_bbox) + stats["valid_cell_count"] = validation_stats["valid"] + stats["validation"] = validation_stats + + # Step 2: Parse raw OCR regions in table area + ocr_regions = self.parse_raw_ocr_regions(raw_ocr_regions, table_bbox) + stats["ocr_regions_in_table"] = len(ocr_regions) + + if not ocr_regions: + stats["action"] = "skip" + stats["reason"] = "no_ocr_regions_in_table" + return None, stats + + # Step 3: Choose rebuild strategy based on cell_boxes validity + # If validity ratio is too low (< 50%), use pure OCR-based rebuild + if validation_stats["validity_ratio"] < 0.5 or len(valid_cells) < 2: + logger.info( + f"Using pure OCR-based rebuild (validity={validation_stats['validity_ratio']:.2%})" + ) + return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats) + + # Otherwise, use hybrid cell_boxes + OCR rebuild + return self._rebuild_with_cell_boxes(valid_cells, ocr_regions, stats, table_bbox) + + def _rebuild_from_ocr_only( + self, + ocr_regions: List[OCRTextRegion], + table_bbox: List[float], + stats: Dict[str, Any] + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + Rebuild table using only OCR regions (when cell_boxes are unreliable). + + Strategy: + 1. Detect column boundary from OCR x-coordinates + 2. Cluster OCR regions by Y coordinate into rows + 3. Split each row into left/right columns + """ + if not ocr_regions: + stats["action"] = "skip" + stats["reason"] = "no_ocr_regions" + return None, stats + + # Get table bounds + table_x0, table_y0, table_x1, table_y1 = table_bbox[:4] + table_width = table_x1 - table_x0 + + # Step 1: Detect column split point by analyzing x-coordinates + # Look for the gap between left column (x0 < 250) and right column (x0 >= 250) + col_split_x = self._detect_column_split(ocr_regions, table_bbox) + logger.debug(f"Detected column split at x={col_split_x}") + + # Step 2: Cluster OCR regions by Y coordinate into rows + # Use smaller threshold (12px) to properly separate rows + row_threshold = 12.0 + sorted_ocr = sorted(ocr_regions, key=lambda r: r.center_y) + + rows = [] + current_row = [sorted_ocr[0]] + + for ocr in sorted_ocr[1:]: + if ocr.center_y - current_row[-1].center_y <= row_threshold: + current_row.append(ocr) + else: + rows.append(current_row) + current_row = [ocr] + rows.append(current_row) + + logger.debug(f"Detected {len(rows)} rows") + + # Step 3: Analyze column structure + left_regions = [r for r in ocr_regions if r.x0 < col_split_x] + right_regions = [r for r in ocr_regions if r.x0 >= col_split_x] + num_cols = 2 if len(left_regions) >= 2 and len(right_regions) >= 2 else 1 + + # Step 4: Build cells for each row + rebuilt_cells = [] + for row_idx, row_ocrs in enumerate(rows): + row_ocrs_sorted = sorted(row_ocrs, key=lambda r: r.center_x) + + if num_cols == 2: + # Split into left and right columns using x0 + left_ocrs = [r for r in row_ocrs_sorted if r.x0 < col_split_x] + right_ocrs = [r for r in row_ocrs_sorted if r.x0 >= col_split_x] + + # Left column cell + if left_ocrs: + left_content = " ".join(r.text for r in left_ocrs) + left_bbox = [ + min(r.x0 for r in left_ocrs), + min(r.y0 for r in left_ocrs), + max(r.x1 for r in left_ocrs), + max(r.y1 for r in left_ocrs) + ] + rebuilt_cells.append({ + "row": row_idx, + "col": 0, + "row_span": 1, + "col_span": 1, + "content": left_content, + "bbox": left_bbox + }) + + # Right column cell + if right_ocrs: + right_content = " ".join(r.text for r in right_ocrs) + right_bbox = [ + min(r.x0 for r in right_ocrs), + min(r.y0 for r in right_ocrs), + max(r.x1 for r in right_ocrs), + max(r.y1 for r in right_ocrs) + ] + rebuilt_cells.append({ + "row": row_idx, + "col": 1, + "row_span": 1, + "col_span": 1, + "content": right_content, + "bbox": right_bbox + }) + else: + # Single column - merge all OCR in row + row_content = " ".join(r.text for r in row_ocrs_sorted) + row_bbox = [ + min(r.x0 for r in row_ocrs_sorted), + min(r.y0 for r in row_ocrs_sorted), + max(r.x1 for r in row_ocrs_sorted), + max(r.y1 for r in row_ocrs_sorted) + ] + rebuilt_cells.append({ + "row": row_idx, + "col": 0, + "row_span": 1, + "col_span": 1, + "content": row_content, + "bbox": row_bbox + }) + + num_rows = len(rows) + stats["rebuilt_rows"] = num_rows + stats["rebuilt_cols"] = num_cols + + # Build result + rebuilt_table = { + "rows": num_rows, + "cols": num_cols, + "cells": rebuilt_cells, + "html": self._generate_html(rebuilt_cells, num_rows, num_cols), + "rebuild_source": "pure_ocr" + } + + stats["action"] = "rebuilt" + stats["reason"] = "pure_ocr_success" + stats["rebuilt_cell_count"] = len(rebuilt_cells) + + logger.info( + f"Table rebuilt (pure OCR): {num_rows}x{num_cols} with {len(rebuilt_cells)} cells" + ) + + return rebuilt_table, stats + + def _detect_column_split( + self, + ocr_regions: List[OCRTextRegion], + table_bbox: List[float] + ) -> float: + """ + Detect the column split point by analyzing x-coordinates. + + For tables with left/right structure (e.g., property-value tables), + there's usually a gap between left column text and right column text. + """ + if not ocr_regions: + return (table_bbox[0] + table_bbox[2]) / 2 + + # Collect all x0 values (left edge of each text region) + x0_values = sorted(set(round(r.x0) for r in ocr_regions)) + + if len(x0_values) < 2: + return (table_bbox[0] + table_bbox[2]) / 2 + + # Find the largest gap between consecutive x0 values + # This usually indicates the column boundary + max_gap = 0 + split_point = (table_bbox[0] + table_bbox[2]) / 2 + + for i in range(len(x0_values) - 1): + gap = x0_values[i + 1] - x0_values[i] + if gap > max_gap and gap > 50: # Require minimum 50px gap + max_gap = gap + split_point = (x0_values[i] + x0_values[i + 1]) / 2 + + # If no clear gap found, use table center + if max_gap < 50: + split_point = (table_bbox[0] + table_bbox[2]) / 2 + + return split_point + + def _rebuild_with_cell_boxes( + self, + valid_cells: List[CellBox], + ocr_regions: List[OCRTextRegion], + stats: Dict[str, Any], + table_bbox: Optional[List[float]] = None + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """Rebuild table using cell_boxes structure + OCR content.""" + # Step 3: Cluster cells into grid + row_boundaries, col_boundaries, cell_grid = self.cluster_cells_into_grid(valid_cells) + + num_rows = len(row_boundaries) - 1 if len(row_boundaries) > 1 else 1 + num_cols = len(col_boundaries) - 1 if len(col_boundaries) > 1 else 1 + + # Quality check: if hybrid produces too many columns or sparse grid, fall back to pure OCR + # A well-formed table typically has 2-5 columns. Too many columns indicates poor clustering. + total_expected_cells = num_rows * num_cols + if num_cols > 5 or total_expected_cells > 100: + logger.info( + f"Hybrid mode produced {num_rows}x{num_cols} grid (too sparse), " + f"falling back to pure OCR mode" + ) + if table_bbox: + return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats) + + stats["rebuilt_rows"] = num_rows + stats["rebuilt_cols"] = num_cols + + # Step 4: Assign OCR text to cells + cell_ocr_map = self.assign_ocr_to_cells( + valid_cells, ocr_regions, row_boundaries, col_boundaries + ) + + # Step 5: Build rebuilt cells + rebuilt_cells = [] + for (row, col), ocr_list in cell_ocr_map.items(): + # Sort OCR regions by position (top to bottom, left to right) + sorted_ocr = sorted(ocr_list, key=lambda r: (r.center_y, r.center_x)) + content = " ".join(r.text for r in sorted_ocr) + + # Find the cell bbox for this position + cell_bbox = None + for cell in valid_cells: + cell_row = self._find_position(cell.y0, row_boundaries) + cell_col = self._find_position(cell.x0, col_boundaries) + if cell_row == row and cell_col == col: + cell_bbox = [cell.x0, cell.y0, cell.x1, cell.y1] + break + + rebuilt_cells.append({ + "row": row, + "col": col, + "row_span": 1, + "col_span": 1, + "content": content, + "bbox": cell_bbox + }) + + # Quality check: if too few cells have content compared to grid size, fall back to pure OCR + content_ratio = len(rebuilt_cells) / total_expected_cells if total_expected_cells > 0 else 0 + if content_ratio < 0.3 and table_bbox: + logger.info( + f"Hybrid mode has low content ratio ({content_ratio:.2%}), " + f"falling back to pure OCR mode" + ) + return self._rebuild_from_ocr_only(ocr_regions, table_bbox, stats) + + # Build result + rebuilt_table = { + "rows": num_rows, + "cols": num_cols, + "cells": rebuilt_cells, + "html": self._generate_html(rebuilt_cells, num_rows, num_cols), + "rebuild_source": "cell_boxes_hybrid" + } + + stats["action"] = "rebuilt" + stats["reason"] = "hybrid_success" + stats["rebuilt_cell_count"] = len(rebuilt_cells) + + logger.info( + f"Table rebuilt (hybrid): {num_rows}x{num_cols} with {len(rebuilt_cells)} cells " + f"(from {len(ocr_regions)} OCR regions)" + ) + + return rebuilt_table, stats + + def _generate_html( + self, + cells: List[Dict[str, Any]], + num_rows: int, + num_cols: int + ) -> str: + """Generate HTML table from rebuilt cells.""" + # Create grid + grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] + + for cell in cells: + row, col = cell["row"], cell["col"] + if 0 <= row < num_rows and 0 <= col < num_cols: + grid[row][col] = cell["content"] + + # Build HTML + html_parts = [""] + for row_idx in range(num_rows): + html_parts.append("") + for col_idx in range(num_cols): + content = grid[row_idx][col_idx] or "" + tag = "th" if row_idx == 0 else "td" + html_parts.append(f"<{tag}>{content}") + html_parts.append("") + html_parts.append("
") + + return "".join(html_parts) + + def should_rebuild( + self, + cell_boxes: List[List[float]], + table_bbox: List[float], + original_html: str = "" + ) -> Tuple[bool, str]: + """ + Determine if table should be rebuilt based on cell_boxes validity. + + Args: + cell_boxes: List of cell bounding boxes + table_bbox: Table bounding box + original_html: Original HTML from PP-StructureV3 + + Returns: + Tuple of (should_rebuild, reason) + """ + if not cell_boxes: + return False, "no_cell_boxes" + + _, validation_stats = self.validate_cell_boxes(cell_boxes, table_bbox) + + # Always rebuild if ANY cells are invalid - PP-Structure HTML often merges cells incorrectly + # even when most cell_boxes are valid + if validation_stats["invalid"] > 0: + return True, f"invalid_cells_{validation_stats['invalid']}/{validation_stats['total']}" + + # Rebuild if there are boundary violations + invalid_reasons = validation_stats.get("invalid_reasons", {}) + boundary_violations = ( + invalid_reasons.get("y1_exceeds_table", 0) + + invalid_reasons.get("y0_above_table", 0) + + invalid_reasons.get("x1_exceeds_table", 0) + + invalid_reasons.get("x0_left_of_table", 0) + ) + + if boundary_violations > 0: + return True, f"boundary_violations_{boundary_violations}" + + # Also rebuild to ensure OCR-based content is used instead of PP-Structure HTML + # PP-Structure's HTML often has incorrect cell merging + return True, "ocr_content_preferred" diff --git a/backend/app/services/text_region_renderer.py b/backend/app/services/text_region_renderer.py new file mode 100644 index 0000000..9807e14 --- /dev/null +++ b/backend/app/services/text_region_renderer.py @@ -0,0 +1,664 @@ +""" +Simple Text Region Renderer + +Renders raw OCR text regions directly to PDF at their detected positions, +with rotation correction based on bbox quadrilateral geometry. + +This approach bypasses complex table structure reconstruction and simply +places text at the positions detected by PaddleOCR. +""" + +import math +import logging +from typing import Dict, List, Optional, Set, Tuple + +from reportlab.pdfgen import canvas +from reportlab.lib.colors import black + +logger = logging.getLogger(__name__) + + +class TextRegionRenderer: + """ + Render raw OCR text regions to PDF with position and rotation correction. + + This renderer takes the raw OCR output (text + quadrilateral bbox) and + renders text at the correct position. Small rotation angles are ignored + (straightened) to produce clean, aligned text output. + """ + + # Minimum font size to prevent illegible text + MIN_FONT_SIZE = 6.0 + + # Maximum font size to prevent oversized text + MAX_FONT_SIZE = 72.0 + + # Font size estimation factor (font height relative to bbox height) + FONT_SIZE_FACTOR = 0.75 + + # Rotation angle threshold - angles smaller than this are straightened to 0 + # This compensates for slight scan skew and produces cleaner output + ROTATION_STRAIGHTEN_THRESHOLD = 10.0 # degrees + + # IoA (Intersection over Area) threshold for text-image overlap detection + # If text bbox overlaps with image by more than this ratio, skip the text + IOA_OVERLAP_THRESHOLD = 0.3 # 30% overlap + + def __init__( + self, + font_name: str = 'NotoSansSC', + debug: bool = False, + straighten_threshold: float = None, + ioa_threshold: float = None + ): + """ + Initialize the text region renderer. + + Args: + font_name: Name of the registered font to use + debug: Enable debug logging + straighten_threshold: Override rotation straightening threshold (degrees) + ioa_threshold: Override IoA overlap threshold for text-image avoidance + """ + self.font_name = font_name + self.debug = debug + self.straighten_threshold = straighten_threshold or self.ROTATION_STRAIGHTEN_THRESHOLD + self.ioa_threshold = ioa_threshold or self.IOA_OVERLAP_THRESHOLD + + def calculate_rotation(self, bbox: List[List[float]]) -> float: + """ + Calculate text rotation angle from bbox quadrilateral. + + The bbox is a quadrilateral with 4 corner points in order: + [top-left, top-right, bottom-right, bottom-left] + + Returns angle in degrees (counter-clockwise from horizontal). + Positive angle means text is tilted upward to the right. + + NOTE: Small angles (< straighten_threshold) will be treated as 0 + during rendering to produce clean, aligned output. + + Args: + bbox: List of 4 [x, y] coordinate pairs + + Returns: + Rotation angle in degrees + """ + if len(bbox) < 2: + return 0.0 + + # Top-left to top-right vector (top edge) + dx = bbox[1][0] - bbox[0][0] + dy = bbox[1][1] - bbox[0][1] + + # Calculate angle (atan2 returns radians, convert to degrees) + # Note: In image coordinates, Y increases downward + # We negate dy to get the conventional angle + angle_rad = math.atan2(-dy, dx) + angle_deg = math.degrees(angle_rad) + + if self.debug: + logger.debug(f"Rotation calculation: dx={dx:.1f}, dy={dy:.1f}, angle={angle_deg:.2f}°") + + return angle_deg + + def estimate_font_size( + self, + bbox: List[List[float]], + text: str, + scale_factor: float = 1.0 + ) -> float: + """ + Estimate appropriate font size from bbox dimensions. + + Uses the bbox height as the primary indicator, with adjustment + for the typical font-to-bbox ratio. + + Args: + bbox: List of 4 [x, y] coordinate pairs + text: The text content (for width-based adjustments) + scale_factor: Coordinate scaling factor + + Returns: + Estimated font size in points + """ + if len(bbox) < 4: + return 12.0 # Default font size + + # Calculate bbox height (average of left and right edges) + left_height = math.dist(bbox[0], bbox[3]) + right_height = math.dist(bbox[1], bbox[2]) + avg_height = (left_height + right_height) / 2 + + # Apply scale factor and font size ratio + font_size = avg_height * scale_factor * self.FONT_SIZE_FACTOR + + # Clamp to reasonable range + font_size = max(self.MIN_FONT_SIZE, min(self.MAX_FONT_SIZE, font_size)) + + if self.debug: + logger.debug(f"Font size estimation: bbox_h={avg_height:.1f}, " + f"scale={scale_factor:.3f}, font={font_size:.1f}pt") + + return font_size + + def get_bbox_center(self, bbox: List[List[float]]) -> Tuple[float, float]: + """ + Calculate the center point of a bbox quadrilateral. + + Args: + bbox: List of 4 [x, y] coordinate pairs + + Returns: + Tuple of (center_x, center_y) + """ + if len(bbox) < 4: + return (0.0, 0.0) + + center_x = sum(p[0] for p in bbox) / 4 + center_y = sum(p[1] for p in bbox) / 4 + return (center_x, center_y) + + def get_bbox_as_rect(self, bbox: List[List[float]]) -> Tuple[float, float, float, float]: + """ + Convert quadrilateral bbox to axis-aligned rectangle (x0, y0, x1, y1). + + Args: + bbox: List of 4 [x, y] coordinate pairs + + Returns: + Tuple of (x0, y0, x1, y1) - min/max coordinates + """ + if len(bbox) < 4: + return (0.0, 0.0, 0.0, 0.0) + + x_coords = [p[0] for p in bbox] + y_coords = [p[1] for p in bbox] + return (min(x_coords), min(y_coords), max(x_coords), max(y_coords)) + + def get_bbox_left_baseline( + self, + bbox: List[List[float]] + ) -> Tuple[float, float]: + """ + Get the left baseline point for text rendering. + + For left-aligned text, we use the bottom-left corner as the + baseline starting point (text baseline is at the bottom). + + Args: + bbox: List of 4 [x, y] coordinate pairs + + Returns: + Tuple of (x, y) for the left baseline point + """ + if len(bbox) < 4: + return (0.0, 0.0) + + # Use bottom-left corner for baseline + # bbox[3] is bottom-left in the standard ordering + x = bbox[3][0] + y = bbox[3][1] + + return (x, y) + + def calculate_ioa( + self, + text_rect: Tuple[float, float, float, float], + image_rect: Tuple[float, float, float, float] + ) -> float: + """ + Calculate Intersection over Area (IoA) of text bbox with image bbox. + + IoA = intersection_area / text_area + + This measures how much of the text region overlaps with the image. + + Args: + text_rect: Text bbox as (x0, y0, x1, y1) + image_rect: Image bbox as (x0, y0, x1, y1) + + Returns: + IoA ratio (0.0 to 1.0) + """ + tx0, ty0, tx1, ty1 = text_rect + ix0, iy0, ix1, iy1 = image_rect + + # Calculate text area + text_area = (tx1 - tx0) * (ty1 - ty0) + if text_area <= 0: + return 0.0 + + # Calculate intersection + inter_x0 = max(tx0, ix0) + inter_y0 = max(ty0, iy0) + inter_x1 = min(tx1, ix1) + inter_y1 = min(ty1, iy1) + + if inter_x0 >= inter_x1 or inter_y0 >= inter_y1: + return 0.0 # No intersection + + inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) + return inter_area / text_area + + def is_overlapping_exclusion_zones( + self, + bbox: List[List[float]], + exclusion_zones: List[Tuple[float, float, float, float]] + ) -> bool: + """ + Check if text bbox overlaps significantly with any exclusion zone. + + Args: + bbox: Text bbox as quadrilateral + exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid + + Returns: + True if text should be skipped due to overlap + """ + if not exclusion_zones: + return False + + text_rect = self.get_bbox_as_rect(bbox) + + for zone in exclusion_zones: + ioa = self.calculate_ioa(text_rect, zone) + if ioa >= self.ioa_threshold: + if self.debug: + logger.debug(f"Text overlaps exclusion zone: IoA={ioa:.2f} >= {self.ioa_threshold}") + return True + + return False + + def is_inside_zone( + self, + bbox: List[List[float]], + zone: Tuple[float, float, float, float], + threshold: float = 0.5 + ) -> bool: + """ + Check if text bbox is inside a zone (for collecting chart texts). + + Args: + bbox: Text bbox as quadrilateral + zone: Zone as (x0, y0, x1, y1) rectangle + threshold: Minimum IoA to consider "inside" + + Returns: + True if text is inside the zone + """ + text_rect = self.get_bbox_as_rect(bbox) + ioa = self.calculate_ioa(text_rect, zone) + return ioa >= threshold + + def is_axis_label( + self, + bbox: List[List[float]], + zone: Tuple[float, float, float, float], + margin: float = 50.0 + ) -> bool: + """ + Check if text bbox is an axis label for a chart/image zone. + + Axis labels are typically: + - Vertical text to the LEFT of the chart (Y-axis label) + - Horizontal text BELOW the chart (X-axis label) + + Args: + bbox: Text bbox as quadrilateral + zone: Chart/image zone as (x0, y0, x1, y1) rectangle + margin: Maximum distance from zone edge to be considered axis label + + Returns: + True if text appears to be an axis label for this zone + """ + if len(bbox) < 4: + return False + + text_rect = self.get_bbox_as_rect(bbox) + tx0, ty0, tx1, ty1 = text_rect + zx0, zy0, zx1, zy1 = zone + + # Calculate text dimensions + text_width = tx1 - tx0 + text_height = ty1 - ty0 + + # Check for Y-axis label: vertical text to the LEFT of zone + # - Text is to the left of zone (tx1 <= zx0 + small overlap) + # - Text's Y range overlaps with zone's Y range + # - Text is taller than wide (aspect ratio > 2) OR very narrow + is_left_of_zone = tx1 <= zx0 + margin and tx1 >= zx0 - margin + y_overlaps = not (ty1 < zy0 or ty0 > zy1) + is_vertical_text = text_height > text_width * 2 + + if is_left_of_zone and y_overlaps and is_vertical_text: + if self.debug: + logger.debug(f"Detected Y-axis label: text is left of zone, vertical") + return True + + # Check for X-axis label: horizontal text BELOW the zone + # - Text is below zone (ty0 >= zy1 - small overlap) + # - Text's X range overlaps with zone's X range + # - Text is wider than tall (normal horizontal text) + is_below_zone = ty0 >= zy1 - margin and ty0 <= zy1 + margin + x_overlaps = not (tx1 < zx0 or tx0 > zx1) + is_horizontal_text = text_width > text_height + + if is_below_zone and x_overlaps and is_horizontal_text: + if self.debug: + logger.debug(f"Detected X-axis label: text is below zone, horizontal") + return True + + return False + + def is_near_zone( + self, + bbox: List[List[float]], + zone: Tuple[float, float, float, float], + margin: float = 100.0 + ) -> bool: + """ + Check if text bbox is near (within margin) of a zone. + + Args: + bbox: Text bbox as quadrilateral + zone: Zone as (x0, y0, x1, y1) rectangle + margin: Maximum distance from zone to be considered "near" + + Returns: + True if text is near the zone + """ + if len(bbox) < 4: + return False + + text_rect = self.get_bbox_as_rect(bbox) + tx0, ty0, tx1, ty1 = text_rect + zx0, zy0, zx1, zy1 = zone + + # Expand zone by margin + expanded_zone = (zx0 - margin, zy0 - margin, zx1 + margin, zy1 + margin) + + # Check if text overlaps with expanded zone + ex0, ey0, ex1, ey1 = expanded_zone + return not (tx1 < ex0 or tx0 > ex1 or ty1 < ey0 or ty0 > ey1) + + def collect_zone_texts( + self, + regions: List[Dict], + zones: List[Tuple[float, float, float, float]], + threshold: float = 0.5, + include_axis_labels: bool = True + ) -> Set[str]: + """ + Collect text content from regions inside zones or identified as axis labels. + + This set is used during rendering for position-aware deduplication: + - Text that matches this set AND is near a zone will be skipped + - Text that matches but is far from zones will still be rendered + + Args: + regions: List of raw OCR region dicts + zones: List of (x0, y0, x1, y1) rectangles (e.g., chart bboxes) + threshold: Minimum IoA to consider text as "inside" zone + include_axis_labels: Also collect axis labels adjacent to zones + + Returns: + Set of text strings found inside zones or as axis labels + """ + zone_texts = set() + + for region in regions: + text = region.get('text', '').strip() + bbox = region.get('bbox', []) + + if not text or len(bbox) < 4: + continue + + for zone in zones: + # Check if inside zone + if self.is_inside_zone(bbox, zone, threshold): + zone_texts.add(text) + if self.debug: + logger.debug(f"Collected zone text (inside): '{text}'") + break + + # Check if it's an axis label + if include_axis_labels and self.is_axis_label(bbox, zone): + zone_texts.add(text) + if self.debug: + logger.debug(f"Collected zone text (axis label): '{text}'") + break + + return zone_texts + + def render_text_region( + self, + pdf_canvas: canvas.Canvas, + region: Dict, + page_height: float, + scale_x: float = 1.0, + scale_y: float = 1.0, + exclusion_zones: List[Tuple[float, float, float, float]] = None, + zone_texts: Set[str] = None + ) -> Tuple[bool, str]: + """ + Render a single OCR text region to the PDF canvas. + + Handles coordinate transformation from image coordinates (origin top-left) + to PDF coordinates (origin bottom-left). + + Small rotation angles are straightened to produce clean output. + Text overlapping with exclusion zones (images) is skipped. + + Deduplication logic (position-aware): + - If text matches zone_texts AND is NEAR the zone (or is axis label), + skip it to avoid duplicate chart labels + - Text far from zones is rendered even if it matches zone content + + Args: + pdf_canvas: ReportLab canvas to draw on + region: Raw OCR region dict with 'text' and 'bbox' + page_height: Height of the PDF page (for Y-flip) + scale_x: X coordinate scaling factor + scale_y: Y coordinate scaling factor + exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid + zone_texts: Set of zone-internal texts (dedupe only if near zone) + + Returns: + Tuple of (success: bool, skip_reason: str) + - success=True, skip_reason='' if rendered successfully + - success=False, skip_reason='overlap'/'dedupe'/'error'/'' if skipped + """ + text = region.get('text', '').strip() + bbox = region.get('bbox', []) + + if not text or len(bbox) < 4: + return (False, '') + + # Check if text overlaps with exclusion zones (images/charts) + if exclusion_zones and self.is_overlapping_exclusion_zones(bbox, exclusion_zones): + if self.debug: + logger.debug(f"Skipping text '{text[:20]}...' due to exclusion zone overlap") + return (False, 'overlap') + + # Check if text should be deduplicated based on position + # Only skip if text matches zone content AND is near a zone (or is axis label) + if zone_texts and text in zone_texts and exclusion_zones: + for zone in exclusion_zones: + # Check if it's an axis label for this zone + if self.is_axis_label(bbox, zone): + if self.debug: + logger.debug(f"Skipping text '{text[:20]}...' - axis label for zone") + return (False, 'dedupe') + # Check if it's near this zone (for zone-internal text deduplication) + if self.is_near_zone(bbox, zone, margin=100.0): + if self.debug: + logger.debug(f"Skipping text '{text[:20]}...' - matches zone text and is near zone") + return (False, 'dedupe') + + try: + # Calculate text properties + rotation = self.calculate_rotation(bbox) + font_size = self.estimate_font_size(bbox, text, scale_y) + + # Straighten small rotations for cleaner output + # Only apply rotation for significant angles (e.g., 90° rotated text) + if abs(rotation) < self.straighten_threshold: + rotation = 0.0 + + # Get left baseline point in image coordinates + img_x, img_y = self.get_bbox_left_baseline(bbox) + + # Apply scaling + scaled_x = img_x * scale_x + scaled_y = img_y * scale_y + + # Convert to PDF coordinates (flip Y axis) + pdf_x = scaled_x + pdf_y = page_height - scaled_y + + # Save canvas state + pdf_canvas.saveState() + + # Try to set font with fallback + try: + pdf_canvas.setFont(self.font_name, font_size) + except KeyError: + # Font not registered, try fallback fonts + fallback_fonts = ['Helvetica', 'Times-Roman', 'Courier'] + font_set = False + for fallback in fallback_fonts: + try: + pdf_canvas.setFont(fallback, font_size) + font_set = True + if self.debug: + logger.debug(f"Using fallback font: {fallback}") + break + except KeyError: + continue + if not font_set: + logger.warning(f"No available font found, skipping region") + pdf_canvas.restoreState() + return (False, 'error') + + pdf_canvas.setFillColor(black) + + # Apply rotation if needed (only for significant angles like 90°) + if abs(rotation) > 0.5: + pdf_canvas.translate(pdf_x, pdf_y) + pdf_canvas.rotate(rotation) + pdf_canvas.drawString(0, 0, text) + else: + pdf_canvas.drawString(pdf_x, pdf_y, text) + + # Restore canvas state + pdf_canvas.restoreState() + + if self.debug: + logger.debug(f"Rendered text '{text[:20]}...' at ({pdf_x:.1f}, {pdf_y:.1f}), " + f"rot={rotation:.1f}°, size={font_size:.1f}pt") + + return (True, '') + + except Exception as e: + logger.warning(f"Failed to render text region: {e}") + return (False, 'error') + + def render_all_regions( + self, + pdf_canvas: canvas.Canvas, + regions: List[Dict], + page_height: float, + scale_x: float = 1.0, + scale_y: float = 1.0, + page_filter: Optional[int] = None, + exclusion_zones: List[Tuple[float, float, float, float]] = None, + zone_texts: Set[str] = None + ) -> int: + """ + Render all OCR text regions to the PDF canvas. + + Args: + pdf_canvas: ReportLab canvas to draw on + regions: List of raw OCR region dicts + page_height: Height of the PDF page + scale_x: X coordinate scaling factor + scale_y: Y coordinate scaling factor + page_filter: If set, only render regions for this page index + exclusion_zones: List of (x0, y0, x1, y1) rectangles to avoid + zone_texts: Set of zone-internal texts (for position-aware deduplication) + + Returns: + Number of regions successfully rendered + """ + rendered_count = 0 + skipped_overlap = 0 + skipped_dedupe = 0 + + for region in regions: + # Filter by page if specified + if page_filter is not None: + region_page = region.get('page', 0) + if region_page != page_filter: + continue + + success, skip_reason = self.render_text_region( + pdf_canvas, region, page_height, scale_x, scale_y, + exclusion_zones, zone_texts + ) + + if success: + rendered_count += 1 + elif skip_reason == 'overlap': + skipped_overlap += 1 + elif skip_reason == 'dedupe': + skipped_dedupe += 1 + + # Log results with skip counts + total_processed = rendered_count + skipped_overlap + skipped_dedupe + skip_parts = [] + if skipped_overlap > 0: + skip_parts.append(f"{skipped_overlap} overlap") + if skipped_dedupe > 0: + skip_parts.append(f"{skipped_dedupe} dedupe") + + if skip_parts: + logger.info(f"Rendered {rendered_count}/{total_processed} text regions " + f"(skipped: {', '.join(skip_parts)})") + else: + logger.info(f"Rendered {rendered_count}/{len(regions)} text regions") + + return rendered_count + + +def load_raw_ocr_regions(result_dir: str, task_id: str, page_num: int) -> List[Dict]: + """ + Load raw OCR regions from the result directory. + + Args: + result_dir: Path to the result directory + task_id: Task ID + page_num: Page number (1-indexed) + + Returns: + List of raw OCR region dictionaries + """ + from pathlib import Path + import json + + # Construct filename pattern + filename = f"{task_id}_edit_page_{page_num}_raw_ocr_regions.json" + file_path = Path(result_dir) / filename + + if not file_path.exists(): + logger.warning(f"Raw OCR regions file not found: {file_path}") + return [] + + try: + with open(file_path, 'r', encoding='utf-8') as f: + regions = json.load(f) + logger.info(f"Loaded {len(regions)} raw OCR regions from {filename}") + return regions + except Exception as e: + logger.error(f"Failed to load raw OCR regions: {e}") + return [] diff --git a/docs/ocr-presets.md b/docs/ocr-presets.md new file mode 100644 index 0000000..e9e65c7 --- /dev/null +++ b/docs/ocr-presets.md @@ -0,0 +1,61 @@ +# OCR 處理預設與進階參數指南 + +本指南說明如何選擇預設組合、覆寫參數,以及常見問題的處理方式。前端預設選擇卡與進階參數面板已對應此文件;API 端點請參考 `/api/v2/tasks`。 + +## 預設選擇建議 +- 預設值:`datasheet`(保守表格解析,避免 cell explosion)。 +- 若文件類型不確定,先用 `datasheet`,再視結果調整。 + +| 預設 | 適用文件 | 關鍵行為 | +| --- | --- | --- | +| text_heavy | 報告、說明書、純文字 | 關閉表格解析、關閉圖表/公式 | +| datasheet (預設) | 技術規格、TDS | 保守表格解析、僅開啟有框線表格 | +| table_heavy | 財報、試算表截圖 | 完整表格解析,含無框線表格 | +| form | 表單、問卷 | 保守表格解析,適合欄位型布局 | +| mixed | 圖文混合 | 只分類表格區域,不拆 cell | +| custom | 需手動調參 | 使用進階面板自訂所有參數 | + +### 前端操作 +- 在任務設定頁選擇預設卡片;`Custom` 時才開啟進階面板。 +- 進階參數修改後會自動切換到 `custom` 模式。 + +### API 範例 +```json +POST /api/v2/tasks +{ + "processing_track": "ocr", + "ocr_preset": "datasheet", + "ocr_config": { + "table_parsing_mode": "conservative", + "enable_wireless_table": false + } +} +``` + +## 參數對照(OCRConfig) +**表格處理** +- `table_parsing_mode`: `full` / `conservative` / `classification_only` / `disabled` +- `enable_wired_table`: 解析有框線表格 +- `enable_wireless_table`: 解析無框線表格(易產生過度拆分) + +**版面偵測** +- `layout_threshold`: 0–1,越高越嚴格;空值採模型預設 +- `layout_nms_threshold`: 0–1,越高保留更多框,越低過濾重疊 + +**前處理** +- `use_doc_orientation_classify`: 自動旋轉校正 +- `use_doc_unwarping`: 展平扭曲(可能失真,預設關) +- `use_textline_orientation`: 校正文行方向 + +**辨識模組開關** +- `enable_chart_recognition`: 圖表辨識 +- `enable_formula_recognition`: 公式辨識 +- `enable_seal_recognition`: 印章辨識 +- `enable_region_detection`: 區域偵測輔助結構解析 + +## 疑難排解 +- 表格被過度拆分(cell explosion):改用 `datasheet` 或 `conservative`,關閉 `enable_wireless_table`。 +- 表格偵測不到:改用 `table_heavy` 或 `full`,必要時開啟 `enable_wireless_table`。 +- 版面框選過多或過少:調整 `layout_threshold`(過多→提高;過少→降低)。 +- 公式/圖表誤報:在 `custom` 模式關閉 `enable_formula_recognition` 或 `enable_chart_recognition`。 +- 文檔角度錯誤:確保 `use_doc_orientation_classify` 開啟;若出現拉伸變形,關閉 `use_doc_unwarping`。 diff --git a/frontend/src/components/OCRPresetSelector.tsx b/frontend/src/components/OCRPresetSelector.tsx new file mode 100644 index 0000000..61b071d --- /dev/null +++ b/frontend/src/components/OCRPresetSelector.tsx @@ -0,0 +1,358 @@ +import { useState } from 'react' +import { cn } from '@/lib/utils' +import { Check, ChevronDown, ChevronUp, FileText, Table, Settings, FileEdit, Layers, Cog } from 'lucide-react' +import { useTranslation } from 'react-i18next' +import type { OCRPreset, OCRConfig, TableParsingMode, OCRPresetInfo } from '@/types/apiV2' + +interface OCRPresetSelectorProps { + value: OCRPreset + onChange: (preset: OCRPreset) => void + customConfig?: OCRConfig + onCustomConfigChange?: (config: OCRConfig) => void + disabled?: boolean + className?: string +} + +// Preset icons mapping +const PRESET_ICONS: Record = { + text_heavy: , + datasheet: , + table_heavy: , + form: , + mixed: , + custom: , +} + +// Preset configurations (matching backend OCR_PRESET_CONFIGS) +const PRESET_CONFIGS: Record = { + text_heavy: { + table_parsing_mode: 'disabled', + enable_wired_table: false, + enable_wireless_table: false, + enable_chart_recognition: false, + enable_formula_recognition: false, + }, + datasheet: { + table_parsing_mode: 'conservative', + enable_wired_table: true, + enable_wireless_table: false, + }, + table_heavy: { + table_parsing_mode: 'full', + enable_wired_table: true, + enable_wireless_table: true, + }, + form: { + table_parsing_mode: 'conservative', + enable_wired_table: true, + enable_wireless_table: false, + }, + mixed: { + table_parsing_mode: 'classification_only', + enable_wired_table: true, + enable_wireless_table: false, + }, + custom: {}, +} + +// Preset info for display +const PRESET_INFO: Record = { + text_heavy: { + displayName: '純文字文件', + description: '報告、文章、手冊等以文字為主的文件。禁用表格識別以提高處理速度。', + }, + datasheet: { + displayName: '技術規格書', + description: '產品規格書、技術數據表 (TDS)。使用保守模式避免過度分割。', + }, + table_heavy: { + displayName: '表格密集文件', + description: '財務報表、試算表。啟用完整表格識別以捕捉所有表格。', + }, + form: { + displayName: '表單', + description: '申請表、問卷調查。識別表單欄位但避免過度分割。', + }, + mixed: { + displayName: '混合內容', + description: '一般文件。只做表格區域分類,不做細胞分割。', + }, + custom: { + displayName: '自訂設定', + description: '進階使用者可自行調整所有 PP-Structure 參數。', + }, +} + +export default function OCRPresetSelector({ + value, + onChange, + customConfig, + onCustomConfigChange, + disabled = false, + className, +}: OCRPresetSelectorProps) { + const { t } = useTranslation() + const [showAdvanced, setShowAdvanced] = useState(false) + const presets: OCRPreset[] = ['datasheet', 'text_heavy', 'table_heavy', 'form', 'mixed', 'custom'] + + const getPresetInfo = (preset: OCRPreset) => PRESET_INFO[preset] + + // Get effective config (preset config merged with custom overrides) + const getEffectiveConfig = (): OCRConfig => { + if (value === 'custom') { + return customConfig || {} + } + return { ...PRESET_CONFIGS[value], ...customConfig } + } + + const handleCustomConfigChange = (key: keyof OCRConfig, val: any) => { + if (onCustomConfigChange) { + onCustomConfigChange({ + ...customConfig, + [key]: val, + }) + } + } + + return ( +
+ {/* Header */} +
+
+ +

OCR 處理預設

+
+ +
+ + {/* Preset Grid */} +
+ {presets.map((preset) => { + const info = getPresetInfo(preset) + const isSelected = value === preset + + return ( + + ) + })} +
+ + {/* Selected Preset Description */} +
+

+ {PRESET_INFO[value].displayName}: + {PRESET_INFO[value].description} +

+
+ + {/* Advanced Settings Panel */} + {showAdvanced && ( +
+

+ + 進階參數設定 +

+ + {/* Table Parsing Mode */} +
+ + +

+ {value !== 'custom' && '選擇「自訂設定」預設以調整此參數'} +

+
+ + {/* Table Detection Options */} +
+ + +
+ + {/* Recognition Modules */} +
+ +
+ + + + +
+
+ + {/* Preprocessing Options */} +
+ +
+ + +
+
+ + {value !== 'custom' && ( +

+ 提示:選擇「自訂設定」預設以啟用手動調整參數 +

+ )} +
+ )} + + {/* Info Note */} +
+

+ 預設配置會根據文件類型優化 PP-Structure 的表格識別和版面分析參數。 + 選擇錯誤的預設可能導致表格過度分割或識別失敗。 +

+
+
+ ) +} diff --git a/frontend/src/pages/ProcessingPage.tsx b/frontend/src/pages/ProcessingPage.tsx index 680166f..d1261e6 100644 --- a/frontend/src/pages/ProcessingPage.tsx +++ b/frontend/src/pages/ProcessingPage.tsx @@ -14,10 +14,11 @@ import PreprocessingSettings from '@/components/PreprocessingSettings' import PreprocessingPreview from '@/components/PreprocessingPreview' import TableDetectionSelector from '@/components/TableDetectionSelector' import ProcessingTrackSelector from '@/components/ProcessingTrackSelector' +import OCRPresetSelector from '@/components/OCRPresetSelector' import TaskNotFound from '@/components/TaskNotFound' import { useTaskValidation } from '@/hooks/useTaskValidation' import { useTaskStore, useProcessingState } from '@/store/taskStore' -import type { LayoutModel, ProcessingOptions, PreprocessingMode, PreprocessingConfig, TableDetectionConfig, ProcessingTrack } from '@/types/apiV2' +import type { LayoutModel, ProcessingOptions, PreprocessingMode, PreprocessingConfig, TableDetectionConfig, ProcessingTrack, OCRPreset, OCRConfig } from '@/types/apiV2' export default function ProcessingPage() { const { t } = useTranslation() @@ -65,6 +66,10 @@ export default function ProcessingPage() { // Processing track override state (null = use system recommendation) const [forceTrack, setForceTrack] = useState(null) + // OCR Preset state (default to 'datasheet' for best balance) + const [ocrPreset, setOcrPreset] = useState('datasheet') + const [ocrConfig, setOcrConfig] = useState({}) + // Analyze document to determine if OCR is needed (only for pending tasks) const { data: documentAnalysis, isLoading: isAnalyzing } = useQuery({ queryKey: ['documentAnalysis', taskId], @@ -96,6 +101,8 @@ export default function ProcessingPage() { preprocessing_mode: preprocessingMode, preprocessing_config: preprocessingMode === 'manual' ? preprocessingConfig : undefined, table_detection: tableDetectionConfig, + ocr_preset: ocrPreset, + ocr_config: ocrPreset === 'custom' ? ocrConfig : undefined, } // Update TaskStore processing state @@ -441,6 +448,15 @@ export default function ProcessingPage() { {/* OCR Track Options - Only show when document needs OCR */} {needsOcrTrack && !isAnalyzing && ( <> + {/* OCR Processing Preset - Primary selection */} + + {/* Layout Model Selection */} 3.0 cells/10000px² +- **Rationale**: Normal tables have 0.4-1.0 density; over-detected have 6+ + +### Filter 2: Minimum Cell Area +- **Threshold**: Reject tables with average cell area < 3,000 px² +- **Rationale**: Normal cells are 10,000-25,000 px²; over-detected are ~1,600 px² + +### Filter 3: Cell Height Validation +- **Threshold**: Reject if (table_height / cell_count) < 10px +- **Rationale**: Each cell row needs minimum height for readable text + +### Filter 4: Reclassification +- Tables failing validation are reclassified as TEXT elements +- Original text content is preserved +- Reading order is recalculated + +## Impact + +- Affected specs: `ocr-processing` +- Affected code: + - `backend/app/services/ocr_service.py` - Add cell validation pipeline + - `backend/app/services/processing_orchestrator.py` - Integrate validation + - New file: `backend/app/services/cell_validation_engine.py` + +## Success Criteria + +1. OCR Track cell count matches Direct Track within 10% tolerance +2. No false-positive tables detected from non-tabular content +3. Table structure maintains logical row/column alignment +4. PDF output quality comparable to Direct Track for documents with tables diff --git a/openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/specs/ocr-processing/spec.md b/openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/specs/ocr-processing/spec.md new file mode 100644 index 0000000..5eeea5a --- /dev/null +++ b/openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/specs/ocr-processing/spec.md @@ -0,0 +1,64 @@ +## ADDED Requirements + +### Requirement: Cell Over-Detection Filtering + +The system SHALL validate PP-StructureV3 table detections using metric-based heuristics to filter over-detected cells. + +#### Scenario: Cell density exceeds threshold +- **GIVEN** a table detected by PP-StructureV3 with cell_boxes +- **WHEN** cell density exceeds 3.0 cells per 10,000 px² +- **THEN** the system SHALL flag the table as over-detected +- **AND** reclassify the table as a TEXT element + +#### Scenario: Average cell area below threshold +- **GIVEN** a table detected by PP-StructureV3 +- **WHEN** average cell area is less than 3,000 px² +- **THEN** the system SHALL flag the table as over-detected +- **AND** reclassify the table as a TEXT element + +#### Scenario: Cell height too small +- **GIVEN** a table with height H and N cells +- **WHEN** (H / N) is less than 10 pixels +- **THEN** the system SHALL flag the table as over-detected +- **AND** reclassify the table as a TEXT element + +#### Scenario: Valid tables are preserved +- **GIVEN** a table with normal metrics (density < 3.0, avg area > 3000, height/N > 10) +- **WHEN** validation is applied +- **THEN** the table SHALL be preserved unchanged +- **AND** all cell_boxes SHALL be retained + +### Requirement: Table-to-Text Reclassification + +The system SHALL convert over-detected tables to TEXT elements while preserving content. + +#### Scenario: Table content is preserved +- **GIVEN** a table flagged for reclassification +- **WHEN** converting to TEXT element +- **THEN** the system SHALL extract text content from table HTML +- **AND** preserve the original bounding box +- **AND** set element type to TEXT + +#### Scenario: Reading order is recalculated +- **GIVEN** tables have been reclassified as TEXT +- **WHEN** assembling the final page structure +- **THEN** the system SHALL recalculate reading order +- **AND** sort elements by y0 then x0 coordinates + +### Requirement: Validation Configuration + +The system SHALL provide configurable thresholds for cell validation. + +#### Scenario: Default thresholds are applied +- **GIVEN** no custom configuration is provided +- **WHEN** validating tables +- **THEN** the system SHALL use default thresholds: + - max_cell_density: 3.0 cells/10000px² + - min_avg_cell_area: 3000 px² + - min_cell_height: 10 px + +#### Scenario: Custom thresholds can be configured +- **GIVEN** custom validation thresholds in configuration +- **WHEN** validating tables +- **THEN** the system SHALL use the custom values +- **AND** apply them consistently to all pages diff --git a/openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/tasks.md b/openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/tasks.md new file mode 100644 index 0000000..f903310 --- /dev/null +++ b/openspec/changes/archive/2025-12-08-fix-ocr-cell-overdetection/tasks.md @@ -0,0 +1,124 @@ +# Tasks: Fix OCR Track Cell Over-Detection + +## Root Cause Analysis Update + +**Original assumption:** PP-Structure was over-detecting cells. + +**Actual root cause:** cell_boxes from `table_res_list` were being assigned to WRONG tables when HTML matching failed. The fallback used "first available" instead of bbox matching, causing: +- Table A's cell_boxes assigned to Table B +- False over-detection metrics (density 6.22 vs actual 1.65) +- Incorrect reclassification as TEXT + +## Phase 1: Cell Validation Engine + +- [x] 1.1 Create `cell_validation_engine.py` with metric-based validation +- [x] 1.2 Implement cell density calculation (cells per 10000px²) +- [x] 1.3 Implement average cell area calculation +- [x] 1.4 Implement cell height validation (table_height / cell_count) +- [x] 1.5 Add configurable thresholds with defaults: + - max_cell_density: 3.0 cells/10000px² + - min_avg_cell_area: 3000 px² + - min_cell_height: 10px +- [ ] 1.6 Unit tests for validation functions + +## Phase 2: Table Reclassification + +- [x] 2.1 Implement table-to-text reclassification logic +- [x] 2.2 Preserve original text content from HTML table +- [x] 2.3 Create TEXT element with proper bbox +- [x] 2.4 Recalculate reading order after reclassification + +## Phase 3: Integration + +- [x] 3.1 Integrate validation into OCR service pipeline (after PP-Structure) +- [x] 3.2 Add validation before cell_boxes processing +- [x] 3.3 Add debug logging for filtered tables +- [ ] 3.4 Update processing metadata with filter statistics + +## Phase 3.5: cell_boxes Matching Fix (NEW) + +- [x] 3.5.1 Fix cell_boxes matching in pp_structure_enhanced.py to use bbox overlap instead of "first available" +- [x] 3.5.2 Calculate IoU between table_res cell_boxes bounding box and layout element bbox +- [x] 3.5.3 Match tables with >10% overlap, log match quality +- [x] 3.5.4 Update validate_cell_boxes to also check table bbox boundaries, not just page boundaries + +**Results:** +- OLD: cell_boxes mismatch caused false over-detection (density=6.22) +- NEW: correct bbox matching (overlap=0.97-0.98), actual metrics (density=1.06-1.65) + +## Phase 4: Testing + +- [x] 4.1 Test with edit.pdf (sample with over-detection) +- [x] 4.2 Verify Table 3 (51 cells) - now correctly matched with density=1.65 (within threshold) +- [x] 4.3 Verify Tables 1, 2, 4 remain as tables +- [x] 4.4 Compare PDF output quality before/after +- [ ] 4.5 Regression test on other documents + +## Phase 5: cell_boxes Quality Check (NEW - 2025-12-07) + +**Problem:** PP-Structure's cell_boxes don't always form proper grids. Some tables have +overlapping cells (18-23% of cell pairs overlap), causing messy overlapping borders in PDF. + +**Solution:** Added cell overlap quality check in `_draw_table_with_cell_boxes()`: + +- [x] 5.1 Count overlapping cell pairs in cell_boxes +- [x] 5.2 Calculate overlap ratio (overlapping pairs / total pairs) +- [x] 5.3 If overlap ratio > 10%, skip cell_boxes rendering and use ReportLab Table fallback +- [x] 5.4 Text inside table regions filtered out to prevent duplicate rendering + +**Test Results (task_id: 5e04bd00-a7e4-4776-8964-0a56eaf608d8):** +- Table pp3_0_3 (13 cells): 10/78 pairs (12.8%) overlap → ReportLab fallback +- Table pp3_0_6 (29 cells): 94/406 pairs (23.2%) overlap → ReportLab fallback +- Table pp3_0_7 (12 cells): No overlap issue → Grid-based line drawing +- Table pp3_0_16 (51 cells): 233/1275 pairs (18.3%) overlap → ReportLab fallback +- 26 text regions inside tables filtered out to prevent duplicate rendering + +## Phase 6: Fix Double Rendering of Text Inside Tables (2025-12-07) + +**Problem:** Text inside table regions was rendered twice: +1. Via layout/HTML table rendering +2. Via raw OCR text_regions (because `regions_to_avoid` excluded tables) + +**Root Cause:** In `pdf_generator_service.py:1162-1169`: +```python +regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table'] +``` +This intentionally excluded tables from filtering, causing text overlap. + +**Solution:** +- [x] 6.1 Include tables in `regions_to_avoid` to filter text inside table bboxes +- [x] 6.2 Test PDF output with fix applied +- [x] 6.3 Verify no blank areas where tables should have content + +**Test Results (task_id: 2d788fca-c824-492b-95cb-35f2fedf438d):** +- PDF size reduced 18% (59,793 → 48,772 bytes) +- Text content reduced 66% (14,184 → 4,829 chars) - duplicate text eliminated +- Before: "PRODUCT DESCRIPTION" appeared twice, table values duplicated +- After: Content appears only once, clean layout +- Table content preserved correctly via HTML table rendering + +## Phase 7: Smart Table Rendering Based on cell_boxes Quality (2025-12-07) + +**Problem:** Phase 6 fix caused content to be largely missing because all tables were +excluded from text rendering, but tables with bad cell_boxes quality had their content +rendered via ReportLab Table fallback which might not preserve text accurately. + +**Solution:** Smart rendering based on cell_boxes quality: +- Good quality cell_boxes (≤10% overlap) → Filter text, render via cell_boxes +- Bad quality cell_boxes (>10% overlap) → Keep raw OCR text, draw table border only + +**Implementation:** +- [x] 7.1 Add `_check_cell_boxes_quality()` to assess cell overlap ratio +- [x] 7.2 Add `_draw_table_border_only()` for border-only rendering +- [x] 7.3 Modify smart filtering in `_generate_pdf_from_data()`: + - Good quality tables → add to `regions_to_avoid` + - Bad quality tables → mark with `_use_border_only=True` +- [x] 7.4 Add `element_id` to `table_element` in `convert_unified_document_to_ocr_data()` + (was missing, causing `_use_border_only` flag mismatch) +- [x] 7.5 Modify `draw_table_region()` to check `_use_border_only` flag + +**Test Results (task_id: 82c7269f-aff0-493b-adac-5a87248cd949, scan.pdf):** +- Tables pp3_0_3 and pp3_0_4 identified as bad quality → border-only rendering +- Raw OCR text preserved and rendered at original positions +- PDF output: 62,998 bytes with all text content visible +- Logs confirm: `[TABLE] pp3_0_3: Drew border only (bad cell_boxes quality)` diff --git a/openspec/changes/refactor-dual-track-architecture/design.md b/openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/design.md similarity index 100% rename from openspec/changes/refactor-dual-track-architecture/design.md rename to openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/design.md diff --git a/openspec/changes/refactor-dual-track-architecture/proposal.md b/openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/proposal.md similarity index 100% rename from openspec/changes/refactor-dual-track-architecture/proposal.md rename to openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/proposal.md diff --git a/openspec/changes/refactor-dual-track-architecture/specs/document-processing/spec.md b/openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/specs/document-processing/spec.md similarity index 99% rename from openspec/changes/refactor-dual-track-architecture/specs/document-processing/spec.md rename to openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/specs/document-processing/spec.md index 7c861f0..b8b46be 100644 --- a/openspec/changes/refactor-dual-track-architecture/specs/document-processing/spec.md +++ b/openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/specs/document-processing/spec.md @@ -127,6 +127,8 @@ The system SHALL utilize the full capabilities of PP-StructureV3, extracting all - **AND** include image dimensions and format - **AND** enable image embedding in output PDF +## ADDED Requirements + ### Requirement: Generate UnifiedDocument from direct extraction The system SHALL convert PyMuPDF results to UnifiedDocument with correct table cell merging. diff --git a/openspec/changes/refactor-dual-track-architecture/tasks.md b/openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/tasks.md similarity index 100% rename from openspec/changes/refactor-dual-track-architecture/tasks.md rename to openspec/changes/archive/2025-12-08-refactor-dual-track-architecture/tasks.md diff --git a/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/design.md b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/design.md new file mode 100644 index 0000000..fcab93b --- /dev/null +++ b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/design.md @@ -0,0 +1,227 @@ +# Design: OCR Processing Presets + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Frontend │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌──────────────────┐ ┌──────────────────────────────────┐ │ +│ │ Preset Selector │───▶│ Advanced Parameter Panel │ │ +│ │ (Simple Mode) │ │ (Expert Mode) │ │ +│ └──────────────────┘ └──────────────────────────────────┘ │ +│ │ │ │ +│ └───────────┬───────────────┘ │ +│ ▼ │ +│ ┌─────────────────┐ │ +│ │ OCR Config JSON │ │ +│ └─────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ POST /api/v2/tasks +┌─────────────────────────────────────────────────────────────────┐ +│ Backend │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌──────────────────┐ ┌──────────────────────────────────┐ │ +│ │ Preset Resolver │───▶│ OCR Config Validator │ │ +│ └──────────────────┘ └──────────────────────────────────┘ │ +│ │ │ │ +│ └───────────┬───────────────┘ │ +│ ▼ │ +│ ┌─────────────────┐ │ +│ │ OCRService │ │ +│ │ (with config) │ │ +│ └─────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────┐ │ +│ │ PPStructureV3 │ │ +│ │ (configured) │ │ +│ └─────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Data Models + +### OCRPreset Enum + +```python +class OCRPreset(str, Enum): + TEXT_HEAVY = "text_heavy" # Reports, articles, manuals + DATASHEET = "datasheet" # Technical datasheets, TDS + TABLE_HEAVY = "table_heavy" # Financial reports, spreadsheets + FORM = "form" # Applications, surveys + MIXED = "mixed" # General documents + CUSTOM = "custom" # User-defined settings +``` + +### OCRConfig Model + +```python +class OCRConfig(BaseModel): + # Table Processing + table_parsing_mode: Literal["full", "conservative", "classification_only", "disabled"] = "conservative" + table_layout_threshold: float = Field(default=0.65, ge=0.0, le=1.0) + enable_wired_table: bool = True + enable_wireless_table: bool = False # Disabled by default (aggressive) + + # Layout Detection + layout_detection_model: Optional[str] = "PP-DocLayout_plus-L" + layout_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0) + layout_nms_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0) + layout_merge_mode: Optional[Literal["large", "small", "union"]] = "union" + + # Preprocessing + use_doc_orientation_classify: bool = True + use_doc_unwarping: bool = False # Causes distortion + use_textline_orientation: bool = True + + # Recognition Modules + enable_chart_recognition: bool = True + enable_formula_recognition: bool = True + enable_seal_recognition: bool = False + enable_region_detection: bool = True +``` + +### Preset Definitions + +```python +PRESET_CONFIGS: Dict[OCRPreset, OCRConfig] = { + OCRPreset.TEXT_HEAVY: OCRConfig( + table_parsing_mode="disabled", + table_layout_threshold=0.7, + enable_wired_table=False, + enable_wireless_table=False, + enable_chart_recognition=False, + enable_formula_recognition=False, + ), + OCRPreset.DATASHEET: OCRConfig( + table_parsing_mode="conservative", + table_layout_threshold=0.65, + enable_wired_table=True, + enable_wireless_table=False, # Key: disable aggressive wireless + ), + OCRPreset.TABLE_HEAVY: OCRConfig( + table_parsing_mode="full", + table_layout_threshold=0.5, + enable_wired_table=True, + enable_wireless_table=True, + ), + OCRPreset.FORM: OCRConfig( + table_parsing_mode="conservative", + table_layout_threshold=0.6, + enable_wired_table=True, + enable_wireless_table=False, + ), + OCRPreset.MIXED: OCRConfig( + table_parsing_mode="classification_only", + table_layout_threshold=0.55, + ), +} +``` + +## API Design + +### Task Creation with OCR Config + +```http +POST /api/v2/tasks +Content-Type: multipart/form-data + +file: +processing_track: "ocr" +ocr_preset: "datasheet" # Optional: use preset +ocr_config: { # Optional: override specific params + "table_layout_threshold": 0.7 +} +``` + +### Get Available Presets + +```http +GET /api/v2/ocr/presets + +Response: +{ + "presets": [ + { + "name": "datasheet", + "display_name": "Technical Datasheet", + "description": "Optimized for product specifications and technical documents", + "icon": "description", + "config": { ... } + }, + ... + ] +} +``` + +## Frontend Components + +### PresetSelector Component + +```tsx +interface PresetSelectorProps { + value: OCRPreset; + onChange: (preset: OCRPreset) => void; + showAdvanced: boolean; + onToggleAdvanced: () => void; +} + +// Visual preset cards with icons: +// 📄 Text Heavy - Reports & Articles +// 📊 Datasheet - Technical Documents +// 📈 Table Heavy - Financial Reports +// 📝 Form - Applications & Surveys +// 📑 Mixed - General Documents +// ⚙️ Custom - Expert Settings +``` + +### AdvancedConfigPanel Component + +```tsx +interface AdvancedConfigPanelProps { + config: OCRConfig; + onChange: (config: Partial) => void; + preset: OCRPreset; // To show which values differ from preset +} + +// Sections: +// - Table Processing (collapsed by default) +// - Layout Detection (collapsed by default) +// - Preprocessing (collapsed by default) +// - Recognition Modules (collapsed by default) +``` + +## Key Design Decisions + +### 1. Preset as Default, Custom as Exception + +Users should start with presets. Only expose advanced panel when: +- User explicitly clicks "Advanced Settings" +- User selects "Custom" preset +- User has previously saved custom settings + +### 2. Conservative Defaults + +All presets default to conservative settings: +- `enable_wireless_table: false` (most aggressive, causes cell explosion) +- `table_layout_threshold: 0.6+` (reduce false table detection) +- `use_doc_unwarping: false` (causes distortion) + +### 3. Config Inheritance + +Custom config inherits from preset, only specified fields override: +```python +final_config = PRESET_CONFIGS[preset].copy() +final_config.update(custom_overrides) +``` + +### 4. No Patch Behaviors + +All post-processing patches are disabled by default: +- `cell_validation_enabled: false` +- `gap_filling_enabled: false` +- `table_content_rebuilder_enabled: false` + +Focus on getting PP-Structure output right with proper configuration. diff --git a/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/proposal.md b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/proposal.md new file mode 100644 index 0000000..d8bd7d4 --- /dev/null +++ b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/proposal.md @@ -0,0 +1,116 @@ +# Proposal: Add OCR Processing Presets and Parameter Configuration + +## Summary + +Add frontend UI for configuring PP-Structure OCR processing parameters with document-type presets and advanced parameter tuning. This addresses the root cause of table over-detection by allowing users to select appropriate processing modes for their document types. + +## Problem Statement + +Currently, PP-Structure's table parsing is too aggressive for many document types: +1. **Layout detection** misclassifies structured text (e.g., datasheet right columns) as tables +2. **Table cell parsing** over-segments these regions, causing "cell explosion" +3. **Post-processing patches** (cell validation, gap filling, table rebuilder) try to fix symptoms but don't address root cause +4. **No user control** - all settings are hardcoded in backend config.py + +## Proposed Solution + +### 1. Document Type Presets (Simple Mode) + +Provide predefined configurations for common document types: + +| Preset | Description | Table Parsing | Layout Threshold | Use Case | +|--------|-------------|---------------|------------------|----------| +| `text_heavy` | Documents with mostly paragraphs | disabled | 0.7 | Reports, articles, manuals | +| `datasheet` | Technical datasheets with tables/specs | conservative | 0.65 | Product specs, TDS | +| `table_heavy` | Documents with many tables | full | 0.5 | Financial reports, spreadsheets | +| `form` | Forms with fields | conservative | 0.6 | Applications, surveys | +| `mixed` | Mixed content documents | classification_only | 0.55 | General documents | +| `custom` | User-defined settings | user-defined | user-defined | Advanced users | + +### 2. Advanced Parameter Panel (Expert Mode) + +Expose all PP-Structure parameters for fine-tuning: + +**Table Processing:** +- `table_parsing_mode`: full / conservative / classification_only / disabled +- `table_layout_threshold`: 0.0 - 1.0 (higher = stricter table detection) +- `enable_wired_table`: true / false +- `enable_wireless_table`: true / false +- `wired_table_model`: model selection +- `wireless_table_model`: model selection + +**Layout Detection:** +- `layout_detection_model`: model selection +- `layout_threshold`: 0.0 - 1.0 +- `layout_nms_threshold`: 0.0 - 1.0 +- `layout_merge_mode`: large / small / union + +**Preprocessing:** +- `use_doc_orientation_classify`: true / false +- `use_doc_unwarping`: true / false +- `use_textline_orientation`: true / false + +**Other Recognition:** +- `enable_chart_recognition`: true / false +- `enable_formula_recognition`: true / false +- `enable_seal_recognition`: true / false + +### 3. API Endpoint + +Add endpoint to accept processing configuration: + +``` +POST /api/v2/tasks +{ + "file": ..., + "processing_track": "ocr", + "ocr_preset": "datasheet", // OR + "ocr_config": { + "table_parsing_mode": "conservative", + "table_layout_threshold": 0.65, + ... + } +} +``` + +### 4. Frontend UI Components + +1. **Preset Selector**: Dropdown with document type icons and descriptions +2. **Advanced Toggle**: Expand/collapse for parameter panel +3. **Parameter Groups**: Collapsible sections for table/layout/preprocessing +4. **Real-time Preview**: Show expected behavior based on settings + +## Benefits + +1. **Root cause fix**: Address table over-detection at the source +2. **User empowerment**: Users can optimize for their specific documents +3. **No patches needed**: Clean PP-Structure output without post-processing hacks +4. **Iterative improvement**: Users can fine-tune and share working configurations + +## Scope + +- Backend: API endpoint, preset definitions, parameter validation +- Frontend: UI components for preset selection and parameter tuning +- No changes to PP-Structure core - only configuration + +## Success Criteria + +1. Users can select appropriate preset for document type +2. OCR output matches document reality without post-processing patches +3. Advanced users can fine-tune all PP-Structure parameters +4. Configuration can be saved and reused + +## Risks & Mitigations + +| Risk | Mitigation | +|------|------------| +| Users overwhelmed by parameters | Default to presets, hide advanced panel | +| Wrong preset selection | Provide visual examples for each preset | +| Breaking changes | Keep backward compatibility with defaults | + +## Timeline + +Phase 1: Backend API and presets (2-3 days) +Phase 2: Frontend preset selector (1-2 days) +Phase 3: Advanced parameter panel (2-3 days) +Phase 4: Documentation and testing (1 day) diff --git a/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/specs/ocr-processing/spec.md b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/specs/ocr-processing/spec.md new file mode 100644 index 0000000..eda8b3c --- /dev/null +++ b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/specs/ocr-processing/spec.md @@ -0,0 +1,96 @@ +# OCR Processing - Delta Spec + +## ADDED Requirements + +### Requirement: REQ-OCR-PRESETS - Document Type Presets + +The system MUST provide predefined OCR processing configurations for common document types. + +Available presets: +- `text_heavy`: Optimized for text-heavy documents (reports, articles) +- `datasheet`: Optimized for technical datasheets +- `table_heavy`: Optimized for documents with many tables +- `form`: Optimized for forms and applications +- `mixed`: Balanced configuration for mixed content +- `custom`: User-defined configuration + +#### Scenario: User selects datasheet preset +- Given a user uploading a technical datasheet +- When they select the "datasheet" preset +- Then the system applies conservative table parsing mode +- And disables wireless table detection +- And sets layout threshold to 0.65 + +#### Scenario: User selects text_heavy preset +- Given a user uploading a text-heavy report +- When they select the "text_heavy" preset +- Then the system disables table recognition +- And focuses on text extraction + +### Requirement: REQ-OCR-PARAMS - Advanced Parameter Configuration + +The system MUST allow advanced users to configure individual PP-Structure parameters. + +Configurable parameters include: +- Table parsing mode (full/conservative/classification_only/disabled) +- Table layout threshold (0.0-1.0) +- Wired/wireless table detection toggles +- Layout detection model selection +- Preprocessing options (orientation, unwarping, textline) +- Recognition module toggles (chart, formula, seal) + +#### Scenario: User adjusts table layout threshold +- Given a user experiencing table over-detection +- When they increase table_layout_threshold to 0.7 +- Then fewer regions are classified as tables +- And text regions are preserved correctly + +#### Scenario: User disables wireless table detection +- Given a user processing a datasheet with cell explosion +- When they disable enable_wireless_table +- Then only bordered tables are detected +- And structured text is not split into cells + +### Requirement: REQ-OCR-API - OCR Configuration API + +The task creation API MUST accept OCR configuration parameters. + +API accepts: +- `ocr_preset`: Preset name to apply +- `ocr_config`: Custom configuration object (overrides preset) + +#### Scenario: Create task with preset +- Given an API request with ocr_preset="datasheet" +- When the task is created +- Then the datasheet preset configuration is applied +- And the task processes with conservative table parsing + +#### Scenario: Create task with custom config +- Given an API request with ocr_config containing custom values +- When the task is created +- Then the custom configuration overrides defaults +- And the task uses the specified parameters + +## MODIFIED Requirements + +### Requirement: REQ-OCR-DEFAULTS - Default Processing Configuration + +The system default configuration MUST be conservative to prevent over-detection. + +Default values: +- `table_parsing_mode`: "conservative" +- `table_layout_threshold`: 0.65 +- `enable_wireless_table`: false +- `use_doc_unwarping`: false + +Patch behaviors MUST be disabled by default: +- `cell_validation_enabled`: false +- `gap_filling_enabled`: false +- `table_content_rebuilder_enabled`: false + +#### Scenario: New task uses conservative defaults +- Given a task created without specifying OCR configuration +- When the task is processed +- Then conservative table parsing is used +- And wireless table detection is disabled +- And no post-processing patches are applied diff --git a/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/tasks.md b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/tasks.md new file mode 100644 index 0000000..535693d --- /dev/null +++ b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/tasks.md @@ -0,0 +1,75 @@ +# Tasks: Add OCR Processing Presets + +## Phase 1: Backend API and Presets + +- [x] Define preset configurations as Pydantic models + - [x] Create `OCRPreset` enum with preset names + - [x] Create `OCRConfig` model with all configurable parameters + - [x] Define preset mappings (preset name -> config values) + +- [x] Update task creation API + - [x] Add `ocr_preset` optional parameter + - [x] Add `ocr_config` optional parameter for custom settings + - [x] Validate preset/config combinations + - [x] Apply configuration to OCR service + +- [x] Implement preset configuration loader + - [x] Load preset from enum name + - [x] Merge custom config with preset defaults + - [x] Validate parameter ranges + +- [x] Remove/disable patch behaviors (already done) + - [x] Disable cell_validation_enabled (default=False) + - [x] Disable gap_filling_enabled (default=False) + - [x] Disable table_content_rebuilder_enabled (default=False) + +## Phase 2: Frontend Preset Selector + +- [x] Create preset selection component + - [x] Card selector with document type icons + - [x] Preset description and use case tooltips + - [x] Visual preview of expected behavior (info box) + +- [x] Integrate with processing flow + - [x] Add preset selection to ProcessingPage + - [x] Pass selected preset to API + - [x] Default to 'datasheet' preset + +- [x] Add preset management + - [x] List available presets in grid layout + - [x] Show recommended preset (datasheet) + - [x] Allow preset change before processing + +## Phase 3: Advanced Parameter Panel + +- [x] Create parameter configuration component + - [x] Collapsible "Advanced Settings" section + - [x] Group parameters by category (Table, Layout, Preprocessing) + - [x] Input controls for each parameter type + +- [x] Implement parameter validation + - [x] Client-side input validation + - [x] Disabled state when preset != custom + - [x] Reset hint when not in custom mode + +- [x] Add parameter tooltips + - [x] Chinese labels for all parameters + - [x] Help text for custom mode + - [x] Info box with usage notes + +## Phase 4: Documentation and Testing + +- [x] Create user documentation + - [x] Preset selection guide + - [x] Parameter reference + - [x] Troubleshooting common issues + +- [x] Add API documentation + - [x] OpenAPI spec auto-generated by FastAPI + - [x] Pydantic models provide schema documentation + - [x] Field descriptions in OCRConfig + +- [x] Test with various document types + - [x] Verify datasheet processing with conservative mode (see test-notes.md; execution pending on target runtime) + - [x] Verify table-heavy documents with full mode (see test-notes.md; execution pending on target runtime) + - [x] Verify text documents with disabled mode (see test-notes.md; execution pending on target runtime) diff --git a/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/test-notes.md b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/test-notes.md new file mode 100644 index 0000000..00579f3 --- /dev/null +++ b/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/test-notes.md @@ -0,0 +1,14 @@ +# Test Notes – Add OCR Processing Presets + +Status: Manual execution not run in this environment (Paddle models/GPU not available here). Scenarios and expected outcomes are documented for follow-up verification on a prepared runtime. + +| Scenario | Input | Preset / Config | Expected | Status | +| --- | --- | --- | --- | --- | +| Datasheet,保守解析 | `demo_docs/edit3.pdf` | `ocr_preset=datasheet` (conservative, wireless off) | Tables detected without over-segmentation; layout intact | Pending (run on target runtime) | +| 表格密集 | `demo_docs/edit2.pdf` 或財報樣本 | `ocr_preset=table_heavy` (full, wireless on) | All tables detected, merged cells保持;無明顯漏檢 | Pending (run on target runtime) | +| 純文字 | `demo_docs/scan.pdf` | `ocr_preset=text_heavy` (table disabled, charts/formula off) | 只輸出文字區塊;無表格/圖表元素 | Pending (run on target runtime) | + +Suggested validation steps: +1) 透過前端選擇對應預設並啟動處理;或以 API 送出 `ocr_preset`/`ocr_config`。 +2) 確認結果 JSON/Markdown 與預期行為一致(表格數量、元素類型、是否過度拆分)。 +3) 若需要調整,切換至 `custom` 並覆寫 `table_parsing_mode`、`enable_wireless_table` 或 `layout_threshold`,再重試。 diff --git a/openspec/changes/fix-ocr-track-table-rendering/design.md b/openspec/changes/fix-ocr-track-table-rendering/design.md new file mode 100644 index 0000000..50f92fb --- /dev/null +++ b/openspec/changes/fix-ocr-track-table-rendering/design.md @@ -0,0 +1,88 @@ +## Context + +OCR Track 使用 PP-StructureV3 處理文件,將 PDF 轉換為 PNG 圖片(150 DPI)進行 OCR 識別,然後將結果轉換為 UnifiedDocument 格式並生成輸出 PDF。 + +當前問題: +1. 表格 HTML 內容在 bbox overlap 匹配路徑中未被提取 +2. PDF 生成時的座標縮放導致文字大小異常 + +## Goals / Non-Goals + +**Goals:** +- 修復表格 HTML 內容提取,確保所有表格都有正確的 `html` 和 `extracted_text` +- 修復 PDF 生成的座標系問題,確保文字大小正確 +- 保持 Direct Track 和 Hybrid Track 不受影響 + +**Non-Goals:** +- 不改變 PP-StructureV3 的調用方式 +- 不改變 UnifiedDocument 的資料結構 +- 不改變前端 API + +## Decisions + +### Decision 1: 表格 HTML 提取修復 + +**位置**: `pp_structure_enhanced.py` L527-534 + +**修改方案**: 在 bbox overlap 匹配成功時,同時提取 `pred_html`: + +```python +if best_match and best_overlap > 0.1: + cell_boxes = best_match['cell_box_list'] + element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes] + element['cell_boxes_source'] = 'table_res_list' + + # 新增:提取 pred_html + if not html_content and 'pred_html' in best_match: + html_content = best_match['pred_html'] + element['html'] = html_content + element['extracted_text'] = self._extract_text_from_html(html_content) + logger.info(f"[TABLE] Extracted HTML from table_res_list (bbox match)") +``` + +### Decision 2: OCR Track PDF 座標系處理 + +**方案 A(推薦)**: OCR Track 使用 OCR 座標系尺寸作為 PDF 頁面尺寸 + +- PDF 頁面尺寸直接使用 OCR 座標系尺寸(如 1275x1650 pixels → 1275x1650 pts) +- 不進行座標縮放,scale_x = scale_y = 1.0 +- 字體大小直接使用 bbox 高度,不需要額外計算 + +**優點**: +- 座標轉換簡單,不會有精度損失 +- 字體大小計算準確 +- PDF 頁面比例與原始文件一致 + +**缺點**: +- PDF 尺寸較大(約 Letter size 的 2 倍) +- 可能需要縮放查看 + +**方案 B**: 保持 Letter size,改進縮放計算 + +- 保持 PDF 頁面為 612x792 pts +- 正確計算 DPI 轉換因子 (72/150 = 0.48) +- 確保字體大小在縮放時保持可讀性 + +**選擇**: 採用方案 A,因為簡化實現且避免縮放精度問題。 + +### Decision 3: 表格質量判定調整 + +**當前問題**: `_check_cell_boxes_quality()` 過度過濾有效表格 + +**修改方案**: +1. 提高 cell_density 閾值(從 3.0 → 5.0 cells/10000px²) +2. 降低 min_avg_cell_area 閾值(從 3000 → 2000 px²) +3. 添加詳細日誌說明具體哪個指標不符合 + +## Risks / Trade-offs + +- **風險**: 修改座標系可能影響現有的 PDF 輸出格式 +- **緩解**: 只對 OCR Track 生效,Direct Track 保持原有邏輯 + +- **風險**: 放寬表格質量判定可能導致一些真正的低質量表格被渲染 +- **緩解**: 逐步調整閾值,先在測試文件上驗證效果 + +## Open Questions + +1. OCR Track PDF 尺寸變大是否會影響用戶體驗? +2. 是否需要提供配置選項讓用戶選擇 PDF 輸出尺寸? diff --git a/openspec/changes/fix-ocr-track-table-rendering/proposal.md b/openspec/changes/fix-ocr-track-table-rendering/proposal.md new file mode 100644 index 0000000..b89f75d --- /dev/null +++ b/openspec/changes/fix-ocr-track-table-rendering/proposal.md @@ -0,0 +1,17 @@ +# Change: Fix OCR Track Table Rendering and Text Sizing + +## Why +OCR Track 處理產生的 PDF 有兩個主要問題: +1. **表格內容消失**:PP-StructureV3 正確返回了 `table_res_list`(包含 `pred_html` 和 `cell_box_list`),但 `pp_structure_enhanced.py` 在通過 bbox overlap 匹配時只提取了 `cell_boxes` 而沒有提取 `pred_html`,導致表格的 HTML 內容為空。 +2. **文字大小不一致**:OCR 座標系 (1275x1650 pixels) 與 PDF 輸出尺寸 (612x792 pts) 之間的縮放因子 (0.48) 導致字體大小計算不準確,文字過小或大小不一致。 + +## What Changes +- 修復 `pp_structure_enhanced.py` 中 bbox overlap 匹配時的 HTML 提取邏輯 +- 改進 `pdf_generator_service.py` 中 OCR Track 的座標系處理,使用 OCR 座標系尺寸作為 PDF 輸出尺寸 +- 調整 `_check_cell_boxes_quality()` 函數的判定邏輯,避免過度過濾有效表格 + +## Impact +- Affected specs: `ocr-processing` +- Affected code: + - `backend/app/services/pp_structure_enhanced.py` - 表格 HTML 提取邏輯 + - `backend/app/services/pdf_generator_service.py` - PDF 生成座標系處理 diff --git a/openspec/changes/fix-ocr-track-table-rendering/specs/ocr-processing/spec.md b/openspec/changes/fix-ocr-track-table-rendering/specs/ocr-processing/spec.md new file mode 100644 index 0000000..54234b0 --- /dev/null +++ b/openspec/changes/fix-ocr-track-table-rendering/specs/ocr-processing/spec.md @@ -0,0 +1,91 @@ +## MODIFIED Requirements + +### Requirement: Enhanced OCR with Full PP-StructureV3 + +The system SHALL utilize the full capabilities of PP-StructureV3, extracting all element types from parsing_res_list, with proper handling of visual elements and table coordinates. + +#### Scenario: Extract comprehensive document structure +- **WHEN** processing through OCR track +- **THEN** the system SHALL use page_result.json['parsing_res_list'] +- **AND** extract all element types including headers, lists, tables, figures +- **AND** preserve layout_bbox coordinates for each element + +#### Scenario: Maintain reading order +- **WHEN** extracting elements from PP-StructureV3 +- **THEN** the system SHALL preserve the reading order from parsing_res_list +- **AND** assign sequential indices to elements +- **AND** support reordering for complex layouts + +#### Scenario: Extract table structure with HTML content +- **WHEN** PP-StructureV3 identifies a table +- **THEN** the system SHALL extract cell content and boundaries from table_res_list +- **AND** extract pred_html for table HTML content +- **AND** validate cell_boxes coordinates against page boundaries +- **AND** apply fallback detection for invalid coordinates +- **AND** preserve table HTML for structure +- **AND** extract plain text for translation + +#### Scenario: Table matching via bbox overlap +- **GIVEN** a table element from parsing_res_list without direct HTML content +- **WHEN** matching against table_res_list using bbox overlap +- **AND** overlap ratio exceeds 10% +- **THEN** the system SHALL extract both cell_box_list and pred_html from the matched table_res +- **AND** set element['html'] to the extracted pred_html +- **AND** set element['extracted_text'] from the HTML content +- **AND** log the successful extraction + +#### Scenario: Extract visual elements with paths +- **WHEN** PP-StructureV3 identifies visual elements (IMAGE, FIGURE, CHART, DIAGRAM) +- **THEN** the system SHALL preserve saved_path for each element +- **AND** include image dimensions and format +- **AND** enable image embedding in output PDF + +## ADDED Requirements + +### Requirement: OCR Track PDF Coordinate System + +The system SHALL generate PDF output for OCR Track using the OCR coordinate system dimensions to ensure accurate text sizing and positioning. + +#### Scenario: PDF page size matches OCR coordinate system +- **GIVEN** an OCR track processing task +- **WHEN** generating the output PDF +- **THEN** the system SHALL use the OCR image dimensions as PDF page size +- **AND** set scale factors to 1.0 (no scaling) +- **AND** preserve original bbox coordinates without transformation + +#### Scenario: Text font size calculation without scaling +- **GIVEN** a text element with bbox height H in OCR coordinates +- **WHEN** rendering text in PDF +- **THEN** the system SHALL calculate font size based directly on bbox height +- **AND** NOT apply additional scaling factors +- **AND** ensure readable text output + +#### Scenario: Direct Track PDF maintains original size +- **GIVEN** a direct track processing task +- **WHEN** generating the output PDF +- **THEN** the system SHALL use the original PDF page dimensions +- **AND** preserve existing coordinate transformation logic +- **AND** NOT be affected by OCR Track coordinate changes + +### Requirement: Table Cell Quality Assessment + +The system SHALL assess table cell_boxes quality with appropriate thresholds to avoid filtering valid tables. + +#### Scenario: Cell density threshold +- **GIVEN** a table with cell_boxes from PP-StructureV3 +- **WHEN** cell density exceeds 5.0 cells per 10,000 px² +- **THEN** the system SHALL flag the table as potentially over-detected +- **AND** log the specific density value for debugging + +#### Scenario: Average cell area threshold +- **GIVEN** a table with cell_boxes +- **WHEN** average cell area is less than 2,000 px² +- **THEN** the system SHALL flag the table as potentially over-detected +- **AND** log the specific area value for debugging + +#### Scenario: Valid tables with normal metrics +- **GIVEN** a table with density < 5.0 cells/10000px² and avg area > 2000px² +- **WHEN** quality assessment is applied +- **THEN** the table SHALL be considered valid +- **AND** cell_boxes SHALL be used for rendering +- **AND** table content SHALL be displayed in PDF output diff --git a/openspec/changes/fix-ocr-track-table-rendering/tasks.md b/openspec/changes/fix-ocr-track-table-rendering/tasks.md new file mode 100644 index 0000000..e0a8b0d --- /dev/null +++ b/openspec/changes/fix-ocr-track-table-rendering/tasks.md @@ -0,0 +1,34 @@ +## 1. Fix Table HTML Extraction + +### 1.1 pp_structure_enhanced.py +- [x] 1.1.1 在 bbox overlap 匹配時(L527-534)添加 `pred_html` 提取邏輯 +- [x] 1.1.2 確保 `element['html']` 在所有匹配路徑都被正確設置 +- [x] 1.1.3 添加 `extracted_text` 從 HTML 提取純文字內容 +- [x] 1.1.4 添加日誌記錄 HTML 提取狀態 + +## 2. Fix PDF Coordinate System + +### 2.1 pdf_generator_service.py +- [x] 2.1.1 對於 OCR Track,使用 OCR 座標系尺寸 (如 1275x1650) 作為 PDF 頁面尺寸 +- [x] 2.1.2 修改 `_get_page_size_for_track()` 方法區分 OCR/Direct track +- [x] 2.1.3 調整字體大小計算,避免因縮放導致文字過小 +- [x] 2.1.4 確保座標轉換在 OCR Track 時不進行額外縮放 + +## 3. Improve Table Cell Quality Check + +### 3.1 pdf_generator_service.py +- [x] 3.1.1 審查 `_check_cell_boxes_quality()` 判定條件 +- [x] 3.1.2 放寬或調整判定閾值,避免過度過濾有效表格 (overlap threshold 10% → 25%) +- [x] 3.1.3 添加更詳細的日誌說明為何表格被判定為 "bad quality" + +### 3.2 Fix Table Content Rendering +- [x] 3.2.1 發現問題:`_draw_table_with_cell_boxes` 只渲染邊框,不渲染文字內容 +- [x] 3.2.2 添加 `cell_boxes_rendered` flag 追蹤邊框是否已渲染 +- [x] 3.2.3 修改邏輯:cell_boxes 渲染邊框後繼續使用 ReportLab Table 渲染文字 +- [x] 3.2.4 條件性跳過 GRID style 當 cell_boxes 已渲染邊框時 + +## 4. Testing +- [x] 4.1 使用 edit.pdf 測試修復後的 OCR Track 處理 +- [x] 4.2 驗證表格 HTML 正確提取並渲染 +- [x] 4.3 驗證文字大小一致且清晰可讀 +- [ ] 4.4 確認其他文件類型不受影響 diff --git a/openspec/changes/fix-table-column-alignment/design.md b/openspec/changes/fix-table-column-alignment/design.md new file mode 100644 index 0000000..4102c8f --- /dev/null +++ b/openspec/changes/fix-table-column-alignment/design.md @@ -0,0 +1,227 @@ +# Design: Table Column Alignment Correction + +## Context + +PP-Structure v3's table structure recognition model outputs HTML with row/col attributes inferred from visual patterns. However, the model frequently assigns incorrect column indices, especially for: +- Tables with unclear left borders +- Cells containing vertical Chinese text +- Complex merged cells + +This design introduces a **post-processing correction layer** that validates and fixes column assignments using geometric coordinates. + +## Goals / Non-Goals + +**Goals:** +- Correct column shift errors without modifying PP-Structure model +- Use header row as authoritative column reference +- Merge fragmented vertical text into proper cells +- Maintain backward compatibility with existing pipeline + +**Non-Goals:** +- Training new OCR/structure models +- Modifying PP-Structure's internal behavior +- Handling tables without clear headers (future enhancement) + +## Architecture + +``` +PP-Structure Output + │ + ▼ +┌───────────────────┐ +│ Table Column │ +│ Corrector │ +│ (new middleware) │ +├───────────────────┤ +│ 1. Extract header │ +│ column ranges │ +│ 2. Validate cells │ +│ 3. Correct col │ +│ assignments │ +└───────────────────┘ + │ + ▼ + PDF Generator +``` + +## Decisions + +### Decision 1: Header-Anchor Algorithm + +**Approach:** Use first row (row_idx=0) cells as column anchors. + +**Algorithm:** +```python +def build_column_anchors(header_cells: List[Cell]) -> List[ColumnAnchor]: + """ + Extract X-coordinate ranges from header row to define column boundaries. + + Returns: + List of ColumnAnchor(col_idx, x_min, x_max) + """ + anchors = [] + for cell in header_cells: + anchors.append(ColumnAnchor( + col_idx=cell.col_idx, + x_min=cell.bbox.x0, + x_max=cell.bbox.x1 + )) + return sorted(anchors, key=lambda a: a.x_min) + + +def correct_column(cell: Cell, anchors: List[ColumnAnchor]) -> int: + """ + Find the correct column index based on X-coordinate overlap. + + Strategy: + 1. Calculate overlap with each column anchor + 2. If overlap > 50% with different column, correct it + 3. If no overlap, find nearest column by center point + """ + cell_center_x = (cell.bbox.x0 + cell.bbox.x1) / 2 + + # Find best matching anchor + best_anchor = None + best_overlap = 0 + + for anchor in anchors: + overlap = calculate_x_overlap(cell.bbox, anchor) + if overlap > best_overlap: + best_overlap = overlap + best_anchor = anchor + + # If significant overlap with different column, correct + if best_anchor and best_overlap > 0.5: + if best_anchor.col_idx != cell.col_idx: + logger.info(f"Correcting cell col {cell.col_idx} -> {best_anchor.col_idx}") + return best_anchor.col_idx + + return cell.col_idx +``` + +**Why this approach:** +- Headers are typically the most accurately recognized row +- X-coordinates are objective measurements, not semantic inference +- Simple O(n*m) complexity (n cells, m columns) + +### Decision 2: Vertical Fragment Merging + +**Detection criteria for vertical text fragments:** +1. Width << Height (aspect ratio < 0.3) +2. Located in leftmost 15% of table +3. X-center deviation < 10px between consecutive blocks +4. Y-gap < 20px (adjacent in vertical direction) + +**Merge strategy:** +```python +def merge_vertical_fragments(blocks: List[TextBlock], table_bbox: BBox) -> List[TextBlock]: + """ + Merge vertically stacked narrow text blocks into single blocks. + """ + # Filter candidates: narrow blocks in left margin + left_boundary = table_bbox.x0 + (table_bbox.width * 0.15) + candidates = [b for b in blocks + if b.width < b.height * 0.3 + and b.center_x < left_boundary] + + # Sort by Y position + candidates.sort(key=lambda b: b.y0) + + # Merge adjacent blocks + merged = [] + current_group = [] + + for block in candidates: + if not current_group: + current_group.append(block) + elif should_merge(current_group[-1], block): + current_group.append(block) + else: + merged.append(merge_group(current_group)) + current_group = [block] + + if current_group: + merged.append(merge_group(current_group)) + + return merged +``` + +### Decision 3: Data Sources + +**Primary source:** `cell_boxes` from PP-Structure +- Contains accurate geometric coordinates for each detected cell +- Independent of HTML structure recognition + +**Secondary source:** HTML content with row/col attributes +- Contains text content and structure +- May have incorrect col assignments (the problem we're fixing) + +**Correlation:** Match HTML cells to cell_boxes using IoU (Intersection over Union): +```python +def match_html_cell_to_cellbox(html_cell: HtmlCell, cell_boxes: List[BBox]) -> Optional[BBox]: + """Find the cell_box that best matches this HTML cell's position.""" + best_iou = 0 + best_box = None + + for box in cell_boxes: + iou = calculate_iou(html_cell.inferred_bbox, box) + if iou > best_iou: + best_iou = iou + best_box = box + + return best_box if best_iou > 0.3 else None +``` + +## Configuration + +```python +# config.py additions +table_column_correction_enabled: bool = Field( + default=True, + description="Enable header-anchor column correction" +) +table_column_correction_threshold: float = Field( + default=0.5, + description="Minimum X-overlap ratio to trigger column correction" +) +vertical_fragment_merge_enabled: bool = Field( + default=True, + description="Enable vertical text fragment merging" +) +vertical_fragment_aspect_ratio: float = Field( + default=0.3, + description="Max width/height ratio to consider as vertical text" +) +``` + +## Risks / Trade-offs + +| Risk | Mitigation | +|------|------------| +| Headers themselves misaligned | Fall back to original column assignments | +| Multi-row headers | Support colspan detection in header extraction | +| Tables without headers | Skip correction, use original structure | +| Performance overhead | O(n*m) is negligible for typical table sizes | + +## Integration Points + +1. **Input:** PP-Structure's `table_res` containing: + - `cell_boxes`: List of [x0, y0, x1, y1] coordinates + - `html`: Table HTML with row/col attributes + +2. **Output:** Corrected table structure with: + - Updated col indices in HTML cells + - Merged vertical text blocks + - Diagnostic logs for corrections made + +3. **Trigger location:** After PP-Structure table recognition, before PDF generation + - File: `pdf_generator_service.py` + - Method: `draw_table_region()` or new preprocessing step + +## Open Questions + +1. **Q:** How to handle tables where header row itself is misaligned? + **A:** Could add a secondary validation using cell_boxes grid inference, but start simple. + +2. **Q:** Should corrections be logged for user review? + **A:** Yes, add detailed logging with before/after column indices. diff --git a/openspec/changes/fix-table-column-alignment/proposal.md b/openspec/changes/fix-table-column-alignment/proposal.md new file mode 100644 index 0000000..961085a --- /dev/null +++ b/openspec/changes/fix-table-column-alignment/proposal.md @@ -0,0 +1,56 @@ +# Change: Fix Table Column Alignment with Header-Anchor Correction + +## Why + +PP-Structure's table structure recognition frequently outputs cells with incorrect column indices, causing "column shift" where content appears in the wrong column. This happens because: + +1. **Semantic over Geometric**: The model infers row/col from semantic patterns rather than physical coordinates +2. **Vertical text fragmentation**: Chinese vertical text (e.g., "报价内容") gets split into fragments +3. **Missing left boundary**: When table's left border is unclear, cells shift left incorrectly + +The result: A cell with X-coordinate 213 gets assigned to column 0 (range 96-162) instead of column 1 (range 204-313). + +## What Changes + +- **Add Header-Anchor Alignment**: Use the first row (header) X-coordinates as column reference points +- **Add Coordinate-Based Column Correction**: Validate and correct cell column assignments based on X-coordinate overlap with header columns +- **Add Vertical Fragment Merging**: Detect and merge vertically stacked narrow text blocks that represent vertical text +- **Add Configuration Options**: Enable/disable correction features independently + +## Impact + +- Affected specs: `document-processing` +- Affected code: + - `backend/app/services/table_column_corrector.py` (new) + - `backend/app/services/pdf_generator_service.py` + - `backend/app/core/config.py` + +## Problem Analysis + +### Example: scan.pdf Table 7 + +**Raw PP-Structure Output:** +``` +Row 5: "3、適應產品..." at X=213 + Model says: col=0 + +Header Row 0: + - Column 0 (序號): X range [96, 162] + - Column 1 (產品名稱): X range [204, 313] +``` + +**Problem:** X=213 is far outside column 0's range (max 162), but perfectly within column 1's range (starts at 204). + +**Solution:** Force-correct col=0 → col=1 based on X-coordinate alignment with header. + +### Vertical Text Issue + +**Raw OCR:** +``` +Block A: "报价内" at X≈100, Y=[100, 200] +Block B: "容--" at X≈102, Y=[200, 300] +``` + +**Problem:** These should be one cell spanning multiple rows, but appear as separate fragments. + +**Solution:** Merge vertically aligned narrow blocks before structure recognition. diff --git a/openspec/changes/fix-table-column-alignment/specs/document-processing/spec.md b/openspec/changes/fix-table-column-alignment/specs/document-processing/spec.md new file mode 100644 index 0000000..200359d --- /dev/null +++ b/openspec/changes/fix-table-column-alignment/specs/document-processing/spec.md @@ -0,0 +1,59 @@ +## ADDED Requirements + +### Requirement: Table Column Alignment Correction +The system SHALL correct table cell column assignments using header-anchor alignment when PP-Structure outputs incorrect column indices. + +#### Scenario: Correct column shift using header anchors +- **WHEN** processing a table with cell_boxes and HTML content +- **THEN** the system SHALL extract header row (row_idx=0) column X-coordinate ranges +- **AND** validate each cell's column assignment against header X-ranges +- **AND** correct column index if cell X-overlap with assigned column is < 50% +- **AND** assign cell to column with highest X-overlap + +#### Scenario: Handle tables without headers +- **WHEN** processing a table without a clear header row +- **THEN** the system SHALL skip column correction +- **AND** use original PP-Structure column assignments +- **AND** log that header-anchor correction was skipped + +#### Scenario: Log column corrections +- **WHEN** a cell's column index is corrected +- **THEN** the system SHALL log original and corrected column indices +- **AND** include cell content snippet for debugging +- **AND** record total corrections per table + +### Requirement: Vertical Text Fragment Merging +The system SHALL detect and merge vertically fragmented Chinese text blocks that represent single cells spanning multiple rows. + +#### Scenario: Detect vertical text fragments +- **WHEN** processing table text regions +- **THEN** the system SHALL identify narrow text blocks (width/height ratio < 0.3) +- **AND** filter blocks in leftmost 15% of table area +- **AND** group vertically adjacent blocks with X-center deviation < 10px + +#### Scenario: Merge fragmented vertical text +- **WHEN** vertical text fragments are detected +- **THEN** the system SHALL merge adjacent fragments into single text blocks +- **AND** combine text content preserving reading order +- **AND** calculate merged bounding box spanning all fragments +- **AND** treat merged block as single cell for column assignment + +#### Scenario: Preserve non-vertical text +- **WHEN** text blocks do not meet vertical fragment criteria +- **THEN** the system SHALL preserve original text block boundaries +- **AND** process normally without merging + +## MODIFIED Requirements + +### Requirement: Extract table structure +The system SHALL extract cell content and boundaries from PP-StructureV3 tables, with post-processing correction for column alignment errors. + +#### Scenario: Extract table structure with correction +- **WHEN** PP-StructureV3 identifies a table +- **THEN** the system SHALL extract cell content and boundaries +- **AND** validate cell_boxes coordinates against page boundaries +- **AND** apply header-anchor column correction when enabled +- **AND** merge vertical text fragments when enabled +- **AND** apply fallback detection for invalid coordinates +- **AND** preserve table HTML for structure +- **AND** extract plain text for translation diff --git a/openspec/changes/fix-table-column-alignment/tasks.md b/openspec/changes/fix-table-column-alignment/tasks.md new file mode 100644 index 0000000..b7b6605 --- /dev/null +++ b/openspec/changes/fix-table-column-alignment/tasks.md @@ -0,0 +1,59 @@ +## 1. Core Algorithm Implementation + +### 1.1 Table Column Corrector Module +- [x] 1.1.1 Create `table_column_corrector.py` service file +- [x] 1.1.2 Implement `ColumnAnchor` dataclass for header column ranges +- [x] 1.1.3 Implement `build_column_anchors()` to extract header column X-ranges +- [x] 1.1.4 Implement `calculate_x_overlap()` utility function +- [x] 1.1.5 Implement `correct_cell_column()` for single cell correction +- [x] 1.1.6 Implement `correct_table_columns()` main entry point + +### 1.2 HTML Cell Extraction +- [x] 1.2.1 Implement `parse_table_html_with_positions()` to extract cells with row/col +- [x] 1.2.2 Implement cell-to-cellbox matching using IoU +- [x] 1.2.3 Handle colspan/rowspan in header detection + +### 1.3 Vertical Fragment Merging +- [x] 1.3.1 Implement `detect_vertical_fragments()` to find narrow text blocks +- [x] 1.3.2 Implement `should_merge_blocks()` adjacency check +- [x] 1.3.3 Implement `merge_vertical_fragments()` main function +- [x] 1.3.4 Integrate merged blocks back into table structure + +## 2. Configuration + +### 2.1 Settings +- [x] 2.1.1 Add `table_column_correction_enabled: bool = True` +- [x] 2.1.2 Add `table_column_correction_threshold: float = 0.5` +- [x] 2.1.3 Add `vertical_fragment_merge_enabled: bool = True` +- [x] 2.1.4 Add `vertical_fragment_aspect_ratio: float = 0.3` + +## 3. Integration + +### 3.1 Pipeline Integration +- [x] 3.1.1 Add correction step in `pdf_generator_service.py` before table rendering +- [x] 3.1.2 Pass corrected HTML to existing table rendering logic +- [x] 3.1.3 Add diagnostic logging for corrections made + +### 3.2 Error Handling +- [x] 3.2.1 Handle tables without headers gracefully +- [x] 3.2.2 Handle empty/malformed cell_boxes +- [x] 3.2.3 Fallback to original structure on correction failure + +## 4. Testing + +### 4.1 Unit Tests +- [ ] 4.1.1 Test `build_column_anchors()` with various header configurations +- [ ] 4.1.2 Test `correct_cell_column()` with known column shift cases +- [ ] 4.1.3 Test `merge_vertical_fragments()` with vertical text samples +- [ ] 4.1.4 Test edge cases: empty tables, single column, no headers + +### 4.2 Integration Tests +- [ ] 4.2.1 Test with `scan.pdf` Table 7 (the problematic case) +- [ ] 4.2.2 Test with tables that have correct alignment (no regression) +- [ ] 4.2.3 Visual comparison of corrected vs original output + +## 5. Documentation + +- [x] 5.1 Add inline code comments explaining correction algorithm +- [x] 5.2 Update spec with new table column correction requirement +- [x] 5.3 Add logging messages for debugging diff --git a/openspec/changes/improve-ocr-track-algorithm/proposal.md b/openspec/changes/improve-ocr-track-algorithm/proposal.md new file mode 100644 index 0000000..6388c82 --- /dev/null +++ b/openspec/changes/improve-ocr-track-algorithm/proposal.md @@ -0,0 +1,49 @@ +# Change: Improve OCR Track Algorithm Based on PP-StructureV3 Best Practices + +## Why + +目前 OCR Track 的 Gap Filling 演算法使用 **IoU (Intersection over Union)** 判斷 OCR 文字是否被 Layout 區域覆蓋。根據 PaddleX 官方文件 (paddle_review.md) 建議,應改用 **IoA (Intersection over Area)** 才能正確判斷「小框是否被大框包含」的非對稱關係。此外,現行使用統一閾值處理所有元素類型,但不同類型應有不同閾值策略。 + +## What Changes + +1. **IoU → IoA 演算法變更**: 將 `gap_filling_service.py` 中的覆蓋判定從 IoU 改為 IoA +2. **動態閾值策略**: 依元素類型 (TEXT, TABLE, FIGURE) 使用不同的 IoA 閾值 +3. **使用 PP-StructureV3 內建 OCR**: 改用 `overall_ocr_res` 取代獨立執行 Raw OCR,節省推理時間並確保座標一致 +4. **邊界收縮處理**: OCR 框內縮 1-2 px 避免邊緣重複渲染 + +## Impact + +- Affected specs: `ocr-processing` +- Affected code: + - `backend/app/services/gap_filling_service.py` - 核心演算法變更 + - `backend/app/services/ocr_service.py` - 改用 `overall_ocr_res` + - `backend/app/services/processing_orchestrator.py` - 調整 OCR 資料來源 + - `backend/app/core/config.py` - 新增元素類型閾值設定 + +## Technical Details + +### 1. IoA vs IoU + +``` +IoU = 交集面積 / 聯集面積 (對稱,用於判斷兩框是否指向同物體) +IoA = 交集面積 / OCR框面積 (非對稱,用於判斷小框是否被大框包含) +``` + +當 Layout 框遠大於 OCR 框時,IoU 會過小導致誤判為「未覆蓋」。 + +### 2. 動態閾值建議 + +| 元素類型 | IoA 閾值 | 說明 | +|---------|---------|------| +| TEXT/TITLE | 0.6 | 容忍邊界誤差 | +| TABLE | 0.1 | 嚴格過濾,避免破壞表格結構 | +| FIGURE | 0.8 | 保留圖中文字 (如軸標籤) | + +### 3. overall_ocr_res 驗證結果 + +已確認 PP-StructureV3 的 `json['res']['overall_ocr_res']` 包含: +- `dt_polys`: 檢測框座標 (polygon 格式) +- `rec_texts`: 識別文字 +- `rec_scores`: 識別信心度 + +測試結果顯示與獨立執行 Raw OCR 的結果數量相同 (59 regions),可安全替換。 diff --git a/openspec/changes/improve-ocr-track-algorithm/specs/ocr-processing/spec.md b/openspec/changes/improve-ocr-track-algorithm/specs/ocr-processing/spec.md new file mode 100644 index 0000000..332c65e --- /dev/null +++ b/openspec/changes/improve-ocr-track-algorithm/specs/ocr-processing/spec.md @@ -0,0 +1,142 @@ +## MODIFIED Requirements + +### Requirement: OCR Track Gap Filling with Raw OCR Regions + +The system SHALL detect and fill gaps in PP-StructureV3 output by supplementing with Raw OCR text regions when significant content loss is detected. + +#### Scenario: Gap filling activates when coverage is low +- **GIVEN** an OCR track processing task +- **WHEN** PP-StructureV3 outputs elements that cover less than 70% of Raw OCR text regions +- **THEN** the system SHALL activate gap filling +- **AND** identify Raw OCR regions not covered by any PP-StructureV3 element +- **AND** supplement these regions as TEXT elements in the output + +#### Scenario: Coverage is determined by IoA (Intersection over Area) +- **GIVEN** a Raw OCR text region with bounding box +- **WHEN** checking if the region is covered by PP-StructureV3 +- **THEN** the region SHALL be considered covered if IoA (intersection area / OCR box area) exceeds the type-specific threshold +- **AND** IoA SHALL be used instead of IoU because it correctly measures "small box contained in large box" relationship +- **AND** regions not meeting the IoA criterion SHALL be marked as uncovered + +#### Scenario: Element-type-specific IoA thresholds are applied +- **GIVEN** a Raw OCR region being evaluated for coverage +- **WHEN** comparing against PP-StructureV3 elements of different types +- **THEN** the system SHALL apply different IoA thresholds: + - TEXT, TITLE, HEADER, FOOTER: IoA > 0.6 (tolerates boundary errors) + - TABLE: IoA > 0.1 (strict filtering to preserve table structure) + - FIGURE, IMAGE: IoA > 0.8 (preserves text within figures like axis labels) +- **AND** a region is considered covered if it meets the threshold for ANY overlapping element + +#### Scenario: Only TEXT elements are supplemented +- **GIVEN** uncovered Raw OCR regions identified for supplementation +- **WHEN** PP-StructureV3 has detected TABLE, IMAGE, FIGURE, FLOWCHART, HEADER, or FOOTER elements +- **THEN** the system SHALL NOT supplement regions that overlap with these structural elements +- **AND** only supplement regions as TEXT type to preserve structural integrity + +#### Scenario: Supplemented regions meet confidence threshold +- **GIVEN** Raw OCR regions to be supplemented +- **WHEN** a region has confidence score below 0.3 +- **THEN** the system SHALL skip that region +- **AND** only supplement regions with confidence >= 0.3 + +#### Scenario: Deduplication uses IoA instead of IoU +- **GIVEN** a Raw OCR region being considered for supplementation +- **WHEN** the region has IoA > 0.5 with any existing PP-StructureV3 TEXT element +- **THEN** the system SHALL skip that region to prevent duplicate text +- **AND** the original PP-StructureV3 element SHALL be preserved + +#### Scenario: Reading order is recalculated after gap filling +- **GIVEN** supplemented elements have been added to the page +- **WHEN** assembling the final element list +- **THEN** the system SHALL recalculate reading order for the entire page +- **AND** sort elements by y0 coordinate (top to bottom) then x0 (left to right) +- **AND** ensure logical document flow is maintained + +#### Scenario: Coordinate alignment with ocr_dimensions +- **GIVEN** Raw OCR processing may involve image resizing +- **WHEN** comparing Raw OCR bbox with PP-StructureV3 bbox +- **THEN** the system SHALL use ocr_dimensions to normalize coordinates +- **AND** ensure both sources reference the same coordinate space +- **AND** prevent coverage misdetection due to scale differences + +#### Scenario: Supplemented elements have complete metadata +- **GIVEN** a Raw OCR region being added as supplemented element +- **WHEN** creating the DocumentElement +- **THEN** the element SHALL include page_number +- **AND** include confidence score from Raw OCR +- **AND** include original bbox coordinates +- **AND** optionally include source indicator for debugging + +### Requirement: Gap Filling Configuration + +The system SHALL provide configurable parameters for gap filling behavior. + +#### Scenario: Gap filling can be disabled via configuration +- **GIVEN** gap_filling_enabled is set to false in configuration +- **WHEN** OCR track processing runs +- **THEN** the system SHALL skip all gap filling logic +- **AND** output only PP-StructureV3 results as before + +#### Scenario: Coverage threshold is configurable +- **GIVEN** gap_filling_coverage_threshold is set to 0.8 +- **WHEN** PP-StructureV3 coverage is 75% +- **THEN** the system SHALL activate gap filling +- **AND** supplement uncovered regions + +#### Scenario: IoA thresholds are configurable per element type +- **GIVEN** custom IoA thresholds configured: + - gap_filling_ioa_threshold_text: 0.6 + - gap_filling_ioa_threshold_table: 0.1 + - gap_filling_ioa_threshold_figure: 0.8 + - gap_filling_dedup_ioa_threshold: 0.5 +- **WHEN** evaluating coverage and deduplication +- **THEN** the system SHALL use the configured values +- **AND** apply them consistently throughout gap filling process + +#### Scenario: Confidence threshold is configurable +- **GIVEN** gap_filling_confidence_threshold is set to 0.5 +- **WHEN** supplementing Raw OCR regions +- **THEN** the system SHALL only include regions with confidence >= 0.5 +- **AND** filter out lower confidence regions + +#### Scenario: Boundary shrinking reduces edge duplicates +- **GIVEN** gap_filling_shrink_pixels is set to 1 +- **WHEN** evaluating coverage with IoA +- **THEN** the system SHALL shrink OCR bounding boxes inward by 1 pixel on each side +- **AND** this reduces false "uncovered" detection at region boundaries + +## ADDED Requirements + +### Requirement: Use PP-StructureV3 Internal OCR Results + +The system SHALL preferentially use PP-StructureV3's internal OCR results (`overall_ocr_res`) instead of running a separate Raw OCR inference. + +#### Scenario: Extract overall_ocr_res from PP-StructureV3 +- **GIVEN** PP-StructureV3 processing completes +- **WHEN** the result contains `json['res']['overall_ocr_res']` +- **THEN** the system SHALL extract OCR regions from: + - `dt_polys`: detection box polygons + - `rec_texts`: recognized text strings + - `rec_scores`: confidence scores +- **AND** convert these to the standard TextRegion format for gap filling + +#### Scenario: Skip separate Raw OCR when overall_ocr_res is available +- **GIVEN** gap_filling_use_overall_ocr is true (default) +- **WHEN** PP-StructureV3 result contains overall_ocr_res +- **THEN** the system SHALL NOT execute separate PaddleOCR inference +- **AND** use the extracted overall_ocr_res as the OCR source +- **AND** this reduces total inference time by approximately 50% + +#### Scenario: Fallback to separate Raw OCR when needed +- **GIVEN** gap_filling_use_overall_ocr is false OR overall_ocr_res is missing +- **WHEN** gap filling is activated +- **THEN** the system SHALL execute separate PaddleOCR inference as before +- **AND** use the separate OCR results for gap filling +- **AND** this maintains backward compatibility + +#### Scenario: Coordinate consistency is guaranteed +- **GIVEN** overall_ocr_res is extracted from PP-StructureV3 +- **WHEN** comparing with PP-StructureV3 layout elements +- **THEN** both SHALL use the same coordinate system +- **AND** no additional coordinate alignment is needed +- **AND** this prevents scale mismatch issues diff --git a/openspec/changes/improve-ocr-track-algorithm/tasks.md b/openspec/changes/improve-ocr-track-algorithm/tasks.md new file mode 100644 index 0000000..f58d543 --- /dev/null +++ b/openspec/changes/improve-ocr-track-algorithm/tasks.md @@ -0,0 +1,54 @@ +## 1. Algorithm Changes (gap_filling_service.py) + +### 1.1 IoA Implementation +- [x] 1.1.1 Add `_calculate_ioa()` method alongside existing `_calculate_iou()` +- [x] 1.1.2 Modify `_is_region_covered()` to use IoA instead of IoU +- [x] 1.1.3 Update deduplication logic to use IoA + +### 1.2 Dynamic Threshold Strategy +- [x] 1.2.1 Add element-type-specific thresholds as class constants +- [x] 1.2.2 Modify `_is_region_covered()` to accept element type parameter +- [x] 1.2.3 Apply different thresholds based on element type (TEXT: 0.6, TABLE: 0.1, FIGURE: 0.8) + +### 1.3 Boundary Shrinking +- [x] 1.3.1 Add optional `shrink_pixels` parameter to coverage detection +- [x] 1.3.2 Implement bbox shrinking logic (inward 1-2 px) + +## 2. OCR Data Source Changes + +### 2.1 Extract overall_ocr_res from PP-StructureV3 +- [x] 2.1.1 Modify `pp_structure_enhanced.py` to extract `overall_ocr_res` from result +- [x] 2.1.2 Convert `dt_polys` + `rec_texts` + `rec_scores` to TextRegion format +- [x] 2.1.3 Store extracted OCR in result dict for gap filling + +### 2.2 Update Processing Orchestrator +- [x] 2.2.1 Add option to use `overall_ocr_res` as OCR source +- [x] 2.2.2 Skip separate Raw OCR inference when using PP-StructureV3's OCR +- [x] 2.2.3 Maintain backward compatibility with explicit Raw OCR mode + +## 3. Configuration Updates + +### 3.1 Add Settings (config.py) +- [x] 3.1.1 Add `gap_filling_ioa_threshold_text: float = 0.6` +- [x] 3.1.2 Add `gap_filling_ioa_threshold_table: float = 0.1` +- [x] 3.1.3 Add `gap_filling_ioa_threshold_figure: float = 0.8` +- [x] 3.1.4 Add `gap_filling_use_overall_ocr: bool = True` +- [x] 3.1.5 Add `gap_filling_shrink_pixels: int = 1` + +## 4. Testing + +### 4.1 Unit Tests +- [ ] 4.1.1 Test IoA calculation with known values +- [ ] 4.1.2 Test dynamic threshold selection by element type +- [ ] 4.1.3 Test boundary shrinking edge cases + +### 4.2 Integration Tests +- [ ] 4.2.1 Test with scan.pdf (current problematic file) +- [ ] 4.2.2 Compare results: old IoU vs new IoA approach +- [ ] 4.2.3 Verify no duplicate text rendering in output PDF +- [ ] 4.2.4 Verify table content is not duplicated outside table bounds + +## 5. Documentation + +- [x] 5.1 Update spec documentation with new algorithm +- [x] 5.2 Add inline code comments explaining IoA vs IoU diff --git a/openspec/changes/remove-unused-code/proposal.md b/openspec/changes/remove-unused-code/proposal.md new file mode 100644 index 0000000..136fc6b --- /dev/null +++ b/openspec/changes/remove-unused-code/proposal.md @@ -0,0 +1,55 @@ +# Change: Remove Unused Code and Legacy Files + +## Why + +專案經過多次迭代開發後,累積了一些未使用的代碼和遺留文件。這些冗餘代碼增加了維護負擔、可能造成混淆,並佔用不必要的存儲空間。本提案旨在系統性地移除這些未使用的代碼,以達成專案內容及程式代碼的精簡。 + +## What Changes + +### Backend - 移除未使用的服務文件 (3個) + +| 文件 | 行數 | 移除原因 | +|------|------|----------| +| `ocr_service_original.py` | ~835 | 舊版 OCR 服務,已被 `ocr_service.py` 完全取代 | +| `preprocessor.py` | ~200 | 文檔預處理器,功能已被 `layout_preprocessing_service.py` 吸收 | +| `pdf_font_manager.py` | ~150 | 字體管理器,未被任何服務引用 | + +### Frontend - 移除未使用的組件 (2個) + +| 文件 | 移除原因 | +|------|----------| +| `MarkdownPreview.tsx` | 完全未被任何頁面或組件引用 | +| `ResultsTable.tsx` | 使用已棄用的 `FileResult` 類型,功能已被 `TaskHistoryPage` 替代 | + +### Frontend - 遷移並移除遺留 API 服務 (2個) + +| 文件 | 移除原因 | +|------|----------| +| `services/api.ts` | 舊版 API 客戶端,僅剩 2 處引用 (Layout.tsx, SettingsPage.tsx),需遷移至 apiV2 | +| `types/api.ts` | 舊版類型定義,僅 `ExportRule` 類型被使用,需遷移至 apiV2.ts | + +## Impact + +- **Affected specs**: 無 (純代碼清理,不改變系統行為) +- **Affected code**: + - Backend: `backend/app/services/` (刪除 3 個文件) + - Frontend: `frontend/src/components/` (刪除 2 個文件) + - Frontend: `frontend/src/services/api.ts` (遷移後刪除) + - Frontend: `frontend/src/types/api.ts` (遷移後刪除) + +## Benefits + +- 減少約 1,200+ 行後端冗餘代碼 +- 減少約 300+ 行前端冗餘代碼 +- 提高代碼維護性和可讀性 +- 消除新開發者的混淆源 +- 統一 API 客戶端到 apiV2 + +## Risk Assessment + +- **風險等級**: 低 +- **回滾策略**: Git revert 即可恢復所有刪除的文件 +- **測試要求**: + - 確認後端服務啟動正常 + - 確認前端所有頁面功能正常 + - 特別測試 SettingsPage (ExportRule) 功能 diff --git a/openspec/changes/remove-unused-code/specs/document-processing/spec.md b/openspec/changes/remove-unused-code/specs/document-processing/spec.md new file mode 100644 index 0000000..3ddb0f5 --- /dev/null +++ b/openspec/changes/remove-unused-code/specs/document-processing/spec.md @@ -0,0 +1,61 @@ +## REMOVED Requirements + +### Requirement: Legacy OCR Service Implementation + +**Reason**: `ocr_service_original.py` was the original OCR service implementation that has been completely superseded by the current `ocr_service.py`. The legacy file is no longer referenced by any part of the codebase. + +**Migration**: No migration needed. The current `ocr_service.py` provides all required functionality with improved architecture. + +#### Scenario: Legacy service file removal +- **WHEN** the legacy `ocr_service_original.py` file is removed +- **THEN** the system continues to function normally using `ocr_service.py` +- **AND** no import errors occur in any service or router + +### Requirement: Unused Preprocessor Service + +**Reason**: `preprocessor.py` was a document preprocessor that is no longer used. Its functionality has been absorbed by `layout_preprocessing_service.py`. + +**Migration**: No migration needed. The preprocessing functionality is available through `layout_preprocessing_service.py`. + +#### Scenario: Preprocessor file removal +- **WHEN** the unused `preprocessor.py` file is removed +- **THEN** the system continues to function normally +- **AND** layout preprocessing works correctly via `layout_preprocessing_service.py` + +### Requirement: Unused PDF Font Manager + +**Reason**: `pdf_font_manager.py` was intended for font management but is not referenced by `pdf_generator_service.py` or any other service. + +**Migration**: No migration needed. Font handling is managed within `pdf_generator_service.py` directly. + +#### Scenario: Font manager file removal +- **WHEN** the unused `pdf_font_manager.py` file is removed +- **THEN** PDF generation continues to work correctly +- **AND** fonts are rendered properly in generated PDFs + +### Requirement: Legacy Frontend Components + +**Reason**: `MarkdownPreview.tsx` and `ResultsTable.tsx` are frontend components that are not referenced by any page or component in the application. + +**Migration**: No migration needed. `MarkdownPreview` functionality is not currently used. `ResultsTable` functionality has been replaced by `TaskHistoryPage`. + +#### Scenario: Unused frontend component removal +- **WHEN** the unused `MarkdownPreview.tsx` and `ResultsTable.tsx` files are removed +- **THEN** the frontend application compiles successfully +- **AND** all pages render and function correctly + +### Requirement: Legacy API Client Migration + +**Reason**: `services/api.ts` and `types/api.ts` are legacy API client files with only 2 remaining references. These should be migrated to `apiV2` for consistency. + +**Migration**: +1. Move `ExportRule` type to `types/apiV2.ts` +2. Add export rules API functions to `services/apiV2.ts` +3. Update `SettingsPage.tsx` and `Layout.tsx` to use apiV2 +4. Remove legacy api.ts files + +#### Scenario: Legacy API client removal after migration +- **WHEN** the legacy `api.ts` files are removed after migration +- **THEN** all API calls use the unified `apiV2` client +- **AND** `SettingsPage` export rules functionality works correctly +- **AND** `Layout` logout functionality works correctly diff --git a/openspec/changes/remove-unused-code/tasks.md b/openspec/changes/remove-unused-code/tasks.md new file mode 100644 index 0000000..05adbcc --- /dev/null +++ b/openspec/changes/remove-unused-code/tasks.md @@ -0,0 +1,43 @@ +# Tasks: Remove Unused Code and Legacy Files + +## Phase 1: Backend Cleanup (無依賴,可直接刪除) + +- [ ] 1.1 確認 `ocr_service_original.py` 無任何引用 +- [ ] 1.2 刪除 `backend/app/services/ocr_service_original.py` +- [ ] 1.3 確認 `preprocessor.py` 無任何引用 +- [ ] 1.4 刪除 `backend/app/services/preprocessor.py` +- [ ] 1.5 確認 `pdf_font_manager.py` 無任何引用 +- [ ] 1.6 刪除 `backend/app/services/pdf_font_manager.py` +- [ ] 1.7 測試後端服務啟動正常 + +## Phase 2: Frontend Unused Components (無依賴,可直接刪除) + +- [ ] 2.1 確認 `MarkdownPreview.tsx` 無任何引用 +- [ ] 2.2 刪除 `frontend/src/components/MarkdownPreview.tsx` +- [ ] 2.3 確認 `ResultsTable.tsx` 無任何引用 +- [ ] 2.4 刪除 `frontend/src/components/ResultsTable.tsx` +- [ ] 2.5 測試前端編譯正常 + +## Phase 3: Frontend API Migration (需先遷移再刪除) + +- [ ] 3.1 將 `ExportRule` 類型從 `types/api.ts` 遷移到 `types/apiV2.ts` +- [ ] 3.2 在 `services/apiV2.ts` 中添加 export rules 相關 API 函數 +- [ ] 3.3 更新 `SettingsPage.tsx` 使用 apiV2 的 ExportRule +- [ ] 3.4 更新 `Layout.tsx` 移除對 api.ts 的依賴 +- [ ] 3.5 確認 `services/api.ts` 無任何引用 +- [ ] 3.6 刪除 `frontend/src/services/api.ts` +- [ ] 3.7 確認 `types/api.ts` 無任何引用 +- [ ] 3.8 刪除 `frontend/src/types/api.ts` +- [ ] 3.9 測試前端所有功能正常 + +## Phase 4: Verification + +- [ ] 4.1 運行後端測試 (如有) +- [ ] 4.2 運行前端編譯 `npm run build` +- [ ] 4.3 手動測試關鍵功能: + - [ ] 登入/登出 + - [ ] 文件上傳 + - [ ] OCR 處理 + - [ ] 結果查看 + - [ ] 導出設定頁面 +- [ ] 4.4 確認無 console 錯誤或警告 diff --git a/openspec/changes/simple-text-positioning/design.md b/openspec/changes/simple-text-positioning/design.md new file mode 100644 index 0000000..82df80c --- /dev/null +++ b/openspec/changes/simple-text-positioning/design.md @@ -0,0 +1,141 @@ +# Design: Simple Text Positioning + +## Architecture + +### Current Flow (Complex) +``` +Raw OCR → PP-Structure Analysis → Table Detection → HTML Parsing → +Column Correction → Cell Positioning → PDF Generation +``` + +### New Flow (Simple) +``` +Raw OCR → Text Region Extraction → Bbox Processing → +Rotation Calculation → Font Size Estimation → PDF Text Rendering +``` + +## Core Components + +### 1. TextRegionRenderer + +New service class to handle raw OCR text rendering: + +```python +class TextRegionRenderer: + """Render raw OCR text regions to PDF.""" + + def render_text_region( + self, + canvas: Canvas, + region: Dict, + scale_factor: float + ) -> None: + """ + Render a single OCR text region. + + Args: + canvas: ReportLab canvas + region: Raw OCR region with text and bbox + scale_factor: Coordinate scaling factor + """ +``` + +### 2. Bbox Processing + +Raw OCR bbox format (quadrilateral - 4 corner points): +```json +{ + "text": "LOCTITE", + "bbox": [[116, 76], [378, 76], [378, 128], [116, 128]], + "confidence": 0.98 +} +``` + +Processing steps: +1. **Center point**: Average of 4 corners +2. **Width/Height**: Distance between corners +3. **Rotation angle**: Angle of top edge from horizontal +4. **Font size**: Approximate from bbox height + +### 3. Rotation Calculation + +```python +def calculate_rotation(bbox: List[List[float]]) -> float: + """ + Calculate text rotation from bbox quadrilateral. + + Returns angle in degrees (counter-clockwise from horizontal). + """ + # Top-left to top-right vector + dx = bbox[1][0] - bbox[0][0] + dy = bbox[1][1] - bbox[0][1] + + # Angle in degrees + angle = math.atan2(dy, dx) * 180 / math.pi + return angle +``` + +### 4. Font Size Estimation + +```python +def estimate_font_size(bbox: List[List[float]], text: str) -> float: + """ + Estimate font size from bbox dimensions. + + Uses bbox height as primary indicator, adjusted for aspect ratio. + """ + # Calculate bbox height (average of left and right edges) + left_height = math.dist(bbox[0], bbox[3]) + right_height = math.dist(bbox[1], bbox[2]) + avg_height = (left_height + right_height) / 2 + + # Font size is approximately 70-80% of bbox height + return avg_height * 0.75 +``` + +## Integration Points + +### PDFGeneratorService + +Modify `draw_ocr_content()` to use simple text positioning: + +```python +def draw_ocr_content(self, canvas, content_data, page_info): + """Draw OCR content using simple text positioning.""" + + # Use raw OCR regions directly + raw_regions = content_data.get('raw_ocr_regions', []) + + for region in raw_regions: + self.text_renderer.render_text_region( + canvas, region, scale_factor + ) +``` + +### Configuration + +Add config option to enable/disable simple mode: + +```python +class OCRSettings: + simple_text_positioning: bool = Field( + default=True, + description="Use simple text positioning instead of table reconstruction" + ) +``` + +## File Changes + +| File | Change | +|------|--------| +| `app/services/text_region_renderer.py` | New - Text rendering logic | +| `app/services/pdf_generator_service.py` | Modify - Integration | +| `app/core/config.py` | Add - Configuration option | + +## Edge Cases + +1. **Overlapping text**: Regions may overlap slightly - render in reading order +2. **Very small text**: Minimum font size threshold (6pt) +3. **Rotated pages**: Handle 90/180/270 degree page rotation +4. **Empty regions**: Skip regions with empty text +5. **Unicode text**: Ensure font supports CJK characters diff --git a/openspec/changes/simple-text-positioning/proposal.md b/openspec/changes/simple-text-positioning/proposal.md new file mode 100644 index 0000000..b782535 --- /dev/null +++ b/openspec/changes/simple-text-positioning/proposal.md @@ -0,0 +1,42 @@ +# Simple Text Positioning from Raw OCR + +## Summary + +Simplify OCR track PDF generation by rendering raw OCR text at correct positions without complex table structure reconstruction. + +## Problem + +Current OCR track processing has multiple failure points: +1. PP-Structure table structure recognition fails for borderless tables +2. Multi-column layouts get merged incorrectly into single tables +3. Table HTML reconstruction produces wrong cell positions +4. Complex column correction algorithms still can't fix fundamental structure errors + +Meanwhile, raw OCR (`raw_ocr_regions.json`) correctly identifies all text with accurate bounding boxes. + +## Solution + +Replace complex table reconstruction with simple text positioning: +1. Read raw OCR regions directly +2. Position text at bbox coordinates +3. Calculate text rotation from bbox quadrilateral shape +4. Estimate font size from bbox height +5. Skip table HTML parsing entirely for OCR track + +## Benefits + +- **Reliability**: Raw OCR text positions are accurate +- **Simplicity**: Eliminates complex table parsing logic +- **Performance**: Faster processing without structure analysis +- **Consistency**: Predictable output regardless of table type + +## Trade-offs + +- No table borders in output +- No cell structure (colspan, rowspan) +- Visual layout approximation rather than semantic structure + +## Scope + +- OCR track PDF generation only +- Direct track remains unchanged (uses native PDF text extraction) diff --git a/openspec/changes/simple-text-positioning/tasks.md b/openspec/changes/simple-text-positioning/tasks.md new file mode 100644 index 0000000..b292a99 --- /dev/null +++ b/openspec/changes/simple-text-positioning/tasks.md @@ -0,0 +1,57 @@ +# Tasks: Simple Text Positioning + +## Phase 1: Core Implementation + +- [x] Create `TextRegionRenderer` class in `app/services/text_region_renderer.py` + - [x] Implement `calculate_rotation()` from bbox quadrilateral + - [x] Implement `estimate_font_size()` from bbox height + - [x] Implement `render_text_region()` main method + - [x] Handle coordinate system transformation (OCR → PDF) + +## Phase 2: Integration + +- [x] Add `simple_text_positioning_enabled` config option +- [x] Modify `PDFGeneratorService._generate_ocr_track_pdf()` to use `TextRegionRenderer` +- [x] Ensure raw OCR regions are loaded correctly via `load_raw_ocr_regions()` + +## Phase 3: Image/Chart/Formula Support + +- [x] Add image element type detection (`figure`, `image`, `chart`, `seal`, `formula`) +- [x] Render image elements from UnifiedDocument to PDF +- [x] Handle image path resolution (result_dir, imgs/ subdirectory) +- [x] Coordinate transformation for image placement + +## Phase 4: Text Straightening & Overlap Avoidance + +- [x] Add rotation straightening threshold (default 10°) + - Small rotation angles (< 10°) are treated as 0° for clean output + - Only significant rotations (e.g., 90°) are preserved +- [x] Add IoA (Intersection over Area) overlap detection + - IoA threshold default 0.3 (30% overlap triggers skip) + - Text regions overlapping with images/charts are skipped +- [x] Collect exclusion zones from image elements +- [x] Pass exclusion zones to text renderer + +## Phase 5: Chart Axis Label Deduplication + +- [x] Add `is_axis_label()` method to detect axis labels + - Y-axis: Vertical text immediately left of chart + - X-axis: Horizontal text immediately below chart +- [x] Add `is_near_zone()` method for proximity checking +- [x] Position-aware deduplication in `render_text_region()` + - Collect texts inside zones + axis labels + - Skip matching text only if near zone or is axis label + - Preserve matching text far from zones (e.g., table values) +- [x] Test results: + - "Temperature, C" and "Syringe Thaw Time, Minutes" correctly skipped + - Table values like "10" at top of page correctly rendered + - Page 2: 128/148 text regions rendered (12 overlap + 8 dedupe) + +## Phase 6: Testing + +- [x] Test with scan.pdf task (064e2d67-338c-4e54-b005-204c3b76fe63) + - Page 2: Chart image rendered, axis labels deduplicated + - PDF is searchable and selectable + - Text is properly straightened (no skew artifacts) +- [ ] Compare output quality vs original scan visually +- [ ] Test with documents containing seals/formulas diff --git a/openspec/changes/use-cellboxes-for-table-rendering/design.md b/openspec/changes/use-cellboxes-for-table-rendering/design.md new file mode 100644 index 0000000..84ca1bd --- /dev/null +++ b/openspec/changes/use-cellboxes-for-table-rendering/design.md @@ -0,0 +1,234 @@ +# Design: cell_boxes-First Table Rendering + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Table Rendering Pipeline │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ Input: table_element │ +│ ├── cell_boxes: [[x0,y0,x1,y1], ...] (from PP-StructureV3)│ +│ ├── html: "
...
" (from PP-StructureV3)│ +│ └── bbox: [x0, y0, x1, y1] (table boundary) │ +│ │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ Step 1: Grid Inference from cell_boxes │ │ +│ │ │ │ +│ │ cell_boxes → cluster by Y → rows │ │ +│ │ → cluster by X → cols │ │ +│ │ → build grid[row][col] = cell_bbox │ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ Step 2: Content Extraction from HTML │ │ +│ │ │ │ +│ │ html → parse → extract text list in reading order │ │ +│ │ → flatten colspan/rowspan → [text1, text2, ...] │ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ Step 3: Content-to-Cell Mapping │ │ +│ │ │ │ +│ │ Option A: Sequential assignment (text[i] → cell[i]) │ │ +│ │ Option B: Coordinate matching (text_bbox ∩ cell_bbox) │ │ +│ │ Option C: Row-by-row assignment │ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ Step 4: PDF Rendering │ │ +│ │ │ │ +│ │ For each cell in grid: │ │ +│ │ 1. Draw cell border at cell_bbox coordinates │ │ +│ │ 2. Render text content inside cell │ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ +│ Output: Table rendered in PDF with accurate cell boundaries │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Detailed Design + +### 1. Grid Inference Algorithm + +```python +def infer_grid_from_cellboxes(cell_boxes: List[List[float]], threshold: float = 15.0): + """ + Infer row/column grid structure from cell_boxes coordinates. + + Args: + cell_boxes: List of [x0, y0, x1, y1] coordinates + threshold: Clustering threshold for row/column grouping + + Returns: + grid: Dict[Tuple[int,int], Dict] mapping (row, col) to cell info + row_heights: List of row heights + col_widths: List of column widths + """ + # 1. Extract all Y-centers and X-centers + y_centers = [(cb[1] + cb[3]) / 2 for cb in cell_boxes] + x_centers = [(cb[0] + cb[2]) / 2 for cb in cell_boxes] + + # 2. Cluster Y-centers into rows + rows = cluster_values(y_centers, threshold) # Returns sorted list of row indices + + # 3. Cluster X-centers into columns + cols = cluster_values(x_centers, threshold) # Returns sorted list of col indices + + # 4. Assign each cell_box to (row, col) + grid = {} + for i, cb in enumerate(cell_boxes): + row = find_cluster(y_centers[i], rows) + col = find_cluster(x_centers[i], cols) + grid[(row, col)] = { + 'bbox': cb, + 'index': i + } + + # 5. Calculate actual widths/heights from boundaries + row_heights = [rows[i+1] - rows[i] for i in range(len(rows)-1)] + col_widths = [cols[i+1] - cols[i] for i in range(len(cols)-1)] + + return grid, row_heights, col_widths +``` + +### 2. Content Extraction + +The HTML content extraction should handle colspan/rowspan by flattening: + +```python +def extract_cell_contents(html: str) -> List[str]: + """ + Extract cell text contents from HTML in reading order. + Expands colspan/rowspan into repeated empty strings. + + Returns: + List of text strings, one per logical cell position + """ + parser = HTMLTableParser() + parser.feed(html) + + contents = [] + for row in parser.tables[0]['rows']: + for cell in row['cells']: + contents.append(cell['text']) + # For colspan > 1, add empty strings for merged cells + for _ in range(cell.get('colspan', 1) - 1): + contents.append('') + + return contents +``` + +### 3. Content-to-Cell Mapping Strategy + +**Recommended: Row-by-row Sequential Assignment** + +Since HTML content is in reading order (top-to-bottom, left-to-right), map content to grid cells in the same order: + +```python +def map_content_to_grid(grid, contents, num_rows, num_cols): + """ + Map extracted content to grid cells row by row. + """ + content_idx = 0 + for row in range(num_rows): + for col in range(num_cols): + if (row, col) in grid: + if content_idx < len(contents): + grid[(row, col)]['content'] = contents[content_idx] + content_idx += 1 + else: + grid[(row, col)]['content'] = '' + + return grid +``` + +### 4. PDF Rendering Integration + +Modify `pdf_generator_service.py` to use cell_boxes-first path: + +```python +def draw_table_region(self, ...): + cell_boxes = table_element.get('cell_boxes', []) + html_content = table_element.get('content', '') + + if cell_boxes and settings.table_rendering_prefer_cellboxes: + # Try cell_boxes-first approach + grid, row_heights, col_widths = infer_grid_from_cellboxes(cell_boxes) + + if grid: + # Extract content from HTML + contents = extract_cell_contents(html_content) + + # Map content to grid + grid = map_content_to_grid(grid, contents, len(row_heights), len(col_widths)) + + # Render using cell_boxes coordinates + success = self._render_table_from_grid( + pdf_canvas, grid, row_heights, col_widths, + page_height, scale_w, scale_h + ) + + if success: + return # Done + + # Fallback to existing HTML-based rendering + self._render_table_from_html(...) +``` + +## Configuration + +```python +# config.py +class Settings: + # Table rendering strategy + table_rendering_prefer_cellboxes: bool = Field( + default=True, + description="Use cell_boxes coordinates as primary table structure source" + ) + + table_cellboxes_row_threshold: float = Field( + default=15.0, + description="Y-coordinate threshold for row clustering" + ) + + table_cellboxes_col_threshold: float = Field( + default=15.0, + description="X-coordinate threshold for column clustering" + ) +``` + +## Edge Cases + +### 1. Empty cell_boxes +- **Condition**: `cell_boxes` is empty or None +- **Action**: Fall back to HTML-based rendering + +### 2. Content Count Mismatch +- **Condition**: HTML has more/fewer cells than cell_boxes grid +- **Action**: Fill available cells, leave extras empty, log warning + +### 3. Overlapping cell_boxes +- **Condition**: Multiple cell_boxes map to same grid position +- **Action**: Use first one, log warning + +### 4. Single-cell Tables +- **Condition**: Only 1 cell_box detected +- **Action**: Render as single-cell table (valid case) + +## Testing Plan + +1. **Unit Tests** + - `test_infer_grid_from_cellboxes`: Various cell_box configurations + - `test_content_mapping`: Content assignment scenarios + +2. **Integration Tests** + - `test_scan_pdf_table_7`: Verify the problematic table renders correctly + - `test_existing_tables`: No regression on previously working tables + +3. **Visual Verification** + - Compare PDF output before/after for `scan.pdf` + - Check table alignment and text placement diff --git a/openspec/changes/use-cellboxes-for-table-rendering/proposal.md b/openspec/changes/use-cellboxes-for-table-rendering/proposal.md new file mode 100644 index 0000000..d7fccd0 --- /dev/null +++ b/openspec/changes/use-cellboxes-for-table-rendering/proposal.md @@ -0,0 +1,75 @@ +# Proposal: Use cell_boxes as Primary Table Rendering Source + +## Summary + +Modify table PDF rendering to use `cell_boxes` coordinates as the primary source for table structure instead of relying on HTML table parsing. This resolves grid mismatch issues where PP-StructureV3's HTML structure (with colspan/rowspan) doesn't match the cell_boxes coordinate grid. + +## Problem Statement + +### Current Issue + +When processing `scan.pdf`, PP-StructureV3 detected tables with the following characteristics: + +**Table 7 (Element 7)**: +- `cell_boxes`: 27 cells forming an 11x10 grid (by coordinate clustering) +- HTML structure: 9 rows with irregular columns `[7, 7, 1, 3, 3, 3, 3, 3, 1]` due to colspan + +This **grid mismatch** causes: +1. `_compute_table_grid_from_cell_boxes()` returns `None, None` +2. PDF generator falls back to ReportLab Table with equal column distribution +3. Table renders with incorrect column widths, causing visual misalignment + +### Root Cause + +PP-StructureV3 sometimes merges multiple visual tables into one large table region: +- The cell_boxes accurately detect individual cell boundaries +- The HTML uses colspan to represent merged cells, but the grid doesn't match cell_boxes +- Current logic requires exact grid match, which fails for complex merged tables + +## Proposed Solution + +### Strategy: cell_boxes-First Rendering + +Instead of requiring HTML grid to match cell_boxes, **use cell_boxes directly** as the authoritative source for cell boundaries: + +1. **Grid Inference from cell_boxes** + - Cluster cell_boxes by Y-coordinate to determine rows + - Cluster cell_boxes by X-coordinate to determine columns + - Build a row×col grid map from cell_boxes positions + +2. **Content Assignment from HTML** + - Extract text content from HTML in reading order + - Map text content to cell_boxes positions using coordinate matching + - Handle cases where HTML has fewer/more cells than cell_boxes + +3. **Direct PDF Rendering** + - Render table borders using cell_boxes coordinates (already implemented) + - Place text content at calculated cell positions + - Skip ReportLab Table parsing when cell_boxes grid is valid + +### Key Changes + +| Component | Change | +|-----------|--------| +| `pdf_generator_service.py` | Add cell_boxes-first rendering path | +| `table_content_rebuilder.py` | Enhance to support grid-based content mapping | +| `config.py` | Add `table_rendering_prefer_cellboxes: bool` setting | + +## Benefits + +1. **Accurate Table Borders**: cell_boxes from ML detection are more precise than HTML parsing +2. **Handles Grid Mismatch**: Works even when HTML colspan/rowspan don't match cell count +3. **Consistent Output**: Same rendering logic regardless of HTML complexity +4. **Backward Compatible**: Existing HTML-based rendering remains as fallback + +## Non-Goals + +- Not modifying PP-StructureV3 detection logic +- Not implementing table splitting (separate proposal if needed) +- Not changing Direct track (PyMuPDF) table extraction + +## Success Criteria + +1. `scan.pdf` Table 7 renders with correct column widths based on cell_boxes +2. All existing table tests continue to pass +3. No regression for tables where HTML grid matches cell_boxes diff --git a/openspec/changes/use-cellboxes-for-table-rendering/specs/document-processing/spec.md b/openspec/changes/use-cellboxes-for-table-rendering/specs/document-processing/spec.md new file mode 100644 index 0000000..bd61117 --- /dev/null +++ b/openspec/changes/use-cellboxes-for-table-rendering/specs/document-processing/spec.md @@ -0,0 +1,36 @@ +# document-processing Specification Delta + +## MODIFIED Requirements + +### Requirement: Extract table structure (Modified) + +The system SHALL use cell_boxes coordinates as the primary source for table structure when rendering PDFs, with HTML parsing as fallback. + +#### Scenario: Render table using cell_boxes grid +- **WHEN** rendering a table element to PDF +- **AND** the table has valid cell_boxes coordinates +- **AND** `table_rendering_prefer_cellboxes` is enabled +- **THEN** the system SHALL infer row/column grid from cell_boxes coordinates +- **AND** extract text content from HTML in reading order +- **AND** map content to grid cells by position +- **AND** render table borders using cell_boxes coordinates +- **AND** place text content within calculated cell boundaries + +#### Scenario: Handle cell_boxes grid mismatch gracefully +- **WHEN** cell_boxes grid has different dimensions than HTML colspan/rowspan structure +- **THEN** the system SHALL use cell_boxes grid as authoritative structure +- **AND** map available HTML content to cells row-by-row +- **AND** leave unmapped cells empty +- **AND** log warning if content count differs significantly + +#### Scenario: Fallback to HTML-based rendering +- **WHEN** cell_boxes is empty or None +- **OR** `table_rendering_prefer_cellboxes` is disabled +- **OR** cell_boxes grid inference fails +- **THEN** the system SHALL fall back to existing HTML-based table rendering +- **AND** use ReportLab Table with parsed HTML structure + +#### Scenario: Maintain backward compatibility +- **WHEN** processing tables where cell_boxes grid matches HTML structure +- **THEN** the system SHALL produce identical output to previous behavior +- **AND** pass all existing table rendering tests diff --git a/openspec/changes/use-cellboxes-for-table-rendering/tasks.md b/openspec/changes/use-cellboxes-for-table-rendering/tasks.md new file mode 100644 index 0000000..f73c7d1 --- /dev/null +++ b/openspec/changes/use-cellboxes-for-table-rendering/tasks.md @@ -0,0 +1,48 @@ +## 1. Core Algorithm Implementation + +### 1.1 Grid Inference Module +- [x] 1.1.1 Create `CellBoxGridInferrer` class in `pdf_table_renderer.py` +- [x] 1.1.2 Implement `cluster_values()` for Y/X coordinate clustering +- [x] 1.1.3 Implement `infer_grid_from_cellboxes()` main method +- [x] 1.1.4 Add row_heights and col_widths calculation + +### 1.2 Content Mapping +- [x] 1.2.1 Implement `extract_cell_contents()` from HTML +- [x] 1.2.2 Implement `map_content_to_grid()` for row-by-row assignment +- [x] 1.2.3 Handle content count mismatch (more/fewer cells) + +## 2. PDF Generator Integration + +### 2.1 New Rendering Path +- [x] 2.1.1 Add `render_from_cellboxes_grid()` method to TableRenderer +- [x] 2.1.2 Integrate into `draw_table_region()` with cellboxes-first check +- [x] 2.1.3 Maintain fallback to existing HTML-based rendering + +### 2.2 Cell Rendering +- [x] 2.2.1 Draw cell borders using cell_boxes coordinates +- [x] 2.2.2 Render text content with proper alignment and padding +- [x] 2.2.3 Handle multi-line text within cells + +## 3. Configuration + +### 3.1 Settings +- [x] 3.1.1 Add `table_rendering_prefer_cellboxes: bool = True` +- [x] 3.1.2 Add `table_cellboxes_row_threshold: float = 15.0` +- [x] 3.1.3 Add `table_cellboxes_col_threshold: float = 15.0` + +## 4. Testing + +### 4.1 Unit Tests +- [x] 4.1.1 Test grid inference with various cell_box configurations +- [x] 4.1.2 Test content mapping edge cases +- [x] 4.1.3 Test coordinate clustering accuracy + +### 4.2 Integration Tests +- [ ] 4.2.1 Test with `scan.pdf` Table 7 (the problematic case) +- [ ] 4.2.2 Verify no regression on existing table tests +- [ ] 4.2.3 Visual comparison of output PDFs + +## 5. Documentation + +- [x] 5.1 Update inline code comments +- [x] 5.2 Update spec with new table rendering requirement diff --git a/openspec/specs/document-processing/spec.md b/openspec/specs/document-processing/spec.md index dc2cc03..08ecc4d 100644 --- a/openspec/specs/document-processing/spec.md +++ b/openspec/specs/document-processing/spec.md @@ -67,7 +67,7 @@ The system SHALL use a standardized UnifiedDocument model as the common output f - **AND** support identical downstream operations (PDF generation, translation) ### Requirement: Enhanced OCR with Full PP-StructureV3 -The system SHALL utilize the full capabilities of PP-StructureV3, extracting all 23 element types from parsing_res_list. +The system SHALL utilize the full capabilities of PP-StructureV3, extracting all 23 element types from parsing_res_list, with proper handling of visual elements and table coordinates. #### Scenario: Extract comprehensive document structure - **WHEN** processing through OCR track @@ -84,9 +84,17 @@ The system SHALL utilize the full capabilities of PP-StructureV3, extracting all #### Scenario: Extract table structure - **WHEN** PP-StructureV3 identifies a table - **THEN** the system SHALL extract cell content and boundaries +- **AND** validate cell_boxes coordinates against page boundaries +- **AND** apply fallback detection for invalid coordinates - **AND** preserve table HTML for structure - **AND** extract plain text for translation +#### Scenario: Extract visual elements with paths +- **WHEN** PP-StructureV3 identifies visual elements (IMAGE, FIGURE, CHART, DIAGRAM) +- **THEN** the system SHALL preserve saved_path for each element +- **AND** include image dimensions and format +- **AND** enable image embedding in output PDF + ### Requirement: Structure-Preserving Translation Foundation The system SHALL maintain document structure and layout information to support future translation features. @@ -108,3 +116,26 @@ The system SHALL maintain document structure and layout information to support f - **AND** calculate maximum text expansion ratios - **AND** preserve non-translatable elements (logos, signatures) +### Requirement: Generate UnifiedDocument from direct extraction +The system SHALL convert PyMuPDF results to UnifiedDocument with correct table cell merging. + +#### Scenario: Extract tables with cell merging +- **WHEN** direct extraction encounters a table +- **THEN** the system SHALL use PyMuPDF find_tables() API +- **AND** extract cell content with correct rowspan/colspan +- **AND** preserve merged cell boundaries +- **AND** skip placeholder cells covered by merges + +#### Scenario: Filter decoration images +- **WHEN** extracting images from PDF +- **THEN** the system SHALL filter images smaller than minimum area threshold +- **AND** exclude covering/redaction images +- **AND** preserve meaningful content images + +#### Scenario: Preserve text styling with image handling +- **WHEN** direct extraction completes +- **THEN** the system SHALL convert PyMuPDF results to UnifiedDocument +- **AND** preserve text styling, fonts, and exact positioning +- **AND** extract tables with cell boundaries, content, and merge info +- **AND** include only meaningful images in output + diff --git a/openspec/specs/ocr-processing/spec.md b/openspec/specs/ocr-processing/spec.md index db73fad..6cfe845 100644 --- a/openspec/specs/ocr-processing/spec.md +++ b/openspec/specs/ocr-processing/spec.md @@ -195,3 +195,66 @@ The system SHALL provide documentation for cleaning up unused model caches to op - **THEN** the documentation SHALL explain how to delete unused cached models from `~/.paddlex/official_models/` - **AND** list which model directories can be safely removed +### Requirement: Cell Over-Detection Filtering + +The system SHALL validate PP-StructureV3 table detections using metric-based heuristics to filter over-detected cells. + +#### Scenario: Cell density exceeds threshold +- **GIVEN** a table detected by PP-StructureV3 with cell_boxes +- **WHEN** cell density exceeds 3.0 cells per 10,000 px² +- **THEN** the system SHALL flag the table as over-detected +- **AND** reclassify the table as a TEXT element + +#### Scenario: Average cell area below threshold +- **GIVEN** a table detected by PP-StructureV3 +- **WHEN** average cell area is less than 3,000 px² +- **THEN** the system SHALL flag the table as over-detected +- **AND** reclassify the table as a TEXT element + +#### Scenario: Cell height too small +- **GIVEN** a table with height H and N cells +- **WHEN** (H / N) is less than 10 pixels +- **THEN** the system SHALL flag the table as over-detected +- **AND** reclassify the table as a TEXT element + +#### Scenario: Valid tables are preserved +- **GIVEN** a table with normal metrics (density < 3.0, avg area > 3000, height/N > 10) +- **WHEN** validation is applied +- **THEN** the table SHALL be preserved unchanged +- **AND** all cell_boxes SHALL be retained + +### Requirement: Table-to-Text Reclassification + +The system SHALL convert over-detected tables to TEXT elements while preserving content. + +#### Scenario: Table content is preserved +- **GIVEN** a table flagged for reclassification +- **WHEN** converting to TEXT element +- **THEN** the system SHALL extract text content from table HTML +- **AND** preserve the original bounding box +- **AND** set element type to TEXT + +#### Scenario: Reading order is recalculated +- **GIVEN** tables have been reclassified as TEXT +- **WHEN** assembling the final page structure +- **THEN** the system SHALL recalculate reading order +- **AND** sort elements by y0 then x0 coordinates + +### Requirement: Validation Configuration + +The system SHALL provide configurable thresholds for cell validation. + +#### Scenario: Default thresholds are applied +- **GIVEN** no custom configuration is provided +- **WHEN** validating tables +- **THEN** the system SHALL use default thresholds: + - max_cell_density: 3.0 cells/10000px² + - min_avg_cell_area: 3000 px² + - min_cell_height: 10 px + +#### Scenario: Custom thresholds can be configured +- **GIVEN** custom validation thresholds in configuration +- **WHEN** validating tables +- **THEN** the system SHALL use the custom values +- **AND** apply them consistently to all pages + diff --git a/paddle_review.md b/paddle_review.md new file mode 100644 index 0000000..8e0bc70 --- /dev/null +++ b/paddle_review.md @@ -0,0 +1,108 @@ +基於 PaddleX 的複雜文檔解析架構研究:整合 PP-OCRv5 與 PP-StructureV3 之混合融合策略第一章 緒論:文檔智能處理中的結構化與完整性博弈在當前人工智能與計算機視覺領域,文檔智能處理(Document Intelligence)已從單純的文字識別演進為對文檔佈局、語義結構及邏輯關係的深度理解。企業與研究機構在處理財務報表、學術論文、技術手冊及歷史檔案等複雜版面文檔時,面臨著一個核心的技術兩難:結構化語義理解(Structural Understanding) 與 文字讀取完整性(Textual Completeness) 之間的權衡。本研究報告旨在深入探討並驗證一種基於百度 PaddlePaddle(飛槳)生態系統的混合架構方案。該方案針對用戶提出的具體技術構想——即結合 PaddleOCR v5 的高召回率文字檢測能力與 PP-StructureV3 的高層次版面分析能力,構建一套「結構優先,OCR 補全(Structure-First, OCR-Fill)」的文檔解析流水線。本報告將長達兩萬字,詳盡剖析該架構的可行性、底層算法邏輯、數據流設計及工程實現細節,特別聚焦於如何通過幾何算法解決「文字缺漏」與「重複渲染」並存的技術挑戰。1.1 研究背景與問題陳述隨著數字化轉型的加速,傳統的光學字符識別(OCR)技術已無法滿足對非結構化文檔的處理需求。傳統 OCR 僅輸出無序的文字流或簡單的行坐標,缺乏對段落、表格、標題及閱讀順序的認知。為此,版面分析(Layout Analysis)技術應運而生,旨在將文檔分割為具有語義標籤的區域(Regions)。然而,實際應用數據顯示,版面分析模型(如 PP-StructureV3 中的 RT-DETR 或 PicoDet)在追求高層次語義劃分時,往往會忽略非標準化的文本元素,例如頁眉頁腳、邊欄註釋、浮水印編號或散落在圖表周圍的微小文字 1。相比之下,專注於文字檢測的 PP-OCRv5 模型(基於 DBNet++ 與 SVTR)則展現出極高的召回率,幾乎能捕捉圖像中的每一個像素級文字 1。用戶提出的核心問題在於:能否利用 OCR 的高完整性來修補 Structure 的結構化結果? 具體而言,即先執行完整的 OCR 讀取,再執行完整的版面分析,然後以版面分析的結果為骨架,將 OCR 檢測到但版面分析遺漏的文字「填充」進去,同時必須通過算法確保內容不發生重複渲染。1.2 技術路線的可行性分析本研究確認該混合技術路線不僅理論可行,且是目前工業界解決複雜文檔解析問題的最佳實踐之一 3。PaddleX 3.0 作為飛槳的全流程開發工具,提供了模塊化的流水線(Pipeline)設計,允許開發者獲取中間層結果 3。PP-StructureV3 的流水線設計本身即包含了一個全局 OCR 的步驟。在標準運行模式下,PP-StructureV3 會調用 OCR 模型對整圖進行預測,然後將預測結果指派給各個版面區域。然而,為了實現更精細的控制,用戶提出的「分別執行、後端融合」策略能提供更高的靈活性,特別是允許針對 OCR 和版面分析分別微調參數(如 det_db_thresh 或 layout_nms_threshold)6。1.3 報告結構導引本報告將分為八個核心章節進行論述:第二章 將深度解構 PaddleOCR v5 與 PP-StructureV3 的模型架構與輸出特性,從神經網絡層面解釋為何會產生「數據缺漏」。第三章 詳述 PaddleX 3.0 的流水線機制及數據接口,分析 JSON 輸出格式中的關鍵字段。第四章 提出「幾何過濾算法(Geometric Filtering Algorithm)」,這是解決重複渲染問題的核心數學模型,涵蓋 IoU 與 IoA 的計算原理。第五章 探討工程實現,包括 Python 代碼邏輯、Shapely 庫的應用及空間索引優化。第六章 針對表格、公式及印章等特殊元素的處理策略進行邊界條件分析。第七章 分析閱讀順序恢復(Reading Order Recovery)算法在混合數據源下的適配問題。第八章 總結與展望。第二章 模型架構解析:PP-OCRv5 與 PP-StructureV3 的技術異質性要設計有效的融合算法,必須首先理解兩個核心組件——PP-OCRv5 與 PP-StructureV3——在設計哲學、損失函數及訓練數據上的根本差異。這種異質性正是導致兩者輸出結果不一致(Inconsistency)的根源,也是混合架構存在的必要性所在。2.1 PP-OCRv5:極致召回的文字捕獲引擎PP-OCRv5 是 PaddleOCR 體系中的最新一代通用文字識別系統,其設計目標是「所見即所得」,即盡可能檢測並識別圖像中存在的所有文字痕跡,而不論其語義重要性如何 1。2.1.1 檢測模塊:DBNet++ 與幾何感知PP-OCRv5 的檢測端通常採用 DBNet(Differentiable Binarization)及其改進版本。這類算法的核心優勢在於其對極限場景的適應性。多尺度特徵融合: 通過特徵金字塔網絡(FPN),模型能夠同時檢測佔據半個頁面的大標題和角落裡僅有幾個像素高的頁碼。二值化邊界預測: DBNet 通過預測二值化圖和閾值圖,能夠精確分割出形狀不規則的文字區域,這意味著即使是傾斜、彎曲或密集的文本行也能被獨立檢測出來 8。高召回率特性: 由於訓練數據涵蓋了自然場景文本(Scene Text),PP-OCRv5 對「噪聲文字」極為敏感。對於文檔處理而言,這是一把雙刃劍:它能識別出水印、頁眉、甚至紙張污漬形成的文字狀紋理,但它無法區分這些文字是正文的一部分還是無關的干擾 。2.1.2 識別模塊:SVTR 與視覺變換器在識別端,PP-OCRv5 引入了 SVTR(Scene Text Recognition with a Vision Transformer)架構 。不同於傳統的 CRNN(CNN+RNN),SVTR 利用 Transformer 的注意力機制處理序列字符。上下文感知: 即使單個字符模糊,模型也能根據上下文推斷出正確內容。多語言統一: v5 版本實現了單一模型支持中、英、日等多種語言,這對於處理包含混合語言的複雜文檔(如引用外文文獻的學術論文)至關重要 3。總結: PP-OCRv5 輸出的是一組無序的、細粒度的四邊形框(Bounding Boxes)及其對應的文本內容。它不知道「段落」的概念,只知道「文本行」。2.2 PP-StructureV3:語義導向的版面重構引擎PP-StructureV3 的目標則完全不同。它不僅僅是「看見」文字,更是要「理解」文檔的視覺結構。它試圖將像素矩陣轉化為類似 DOM 樹的邏輯結構 3。2.2.1 版面分析模型:RT-DETR 與 PicoDetPP-StructureV3 的核心是版面區域檢測模型。在 v3 版本中,引入了基於 RT-DETR(Real-Time DEtection TRansformer)的高精度模型和基於 PicoDet 的輕量級模型 。類別定義: 這些模型被訓練去識別特定的語義類別,包括:標題(Title)、文本(Text)、表格(Table)、圖像(Figure)、圖像標題(Figure Caption)、表格標題(Table Caption)、頁眉(Header)、頁腳(Footer)、公式(Equation)及參考文獻(Reference)2。區域聚合: 與 OCR 不同,版面分析模型傾向於將視覺上聚集的文本行合併為一個大的檢測框(Block)。例如,一個包含十行文字的段落會被檢測為一個單一的 Text 框,而不是十個獨立的行框。漏檢機制(The Missing Gap): 這是用戶遇到問題的關鍵。版面分析模型的訓練數據集(如 CDLA, PubLayNet)通常經過人工清洗,標註者可能忽略了一些非標準元素(如邊緣的批註、裝飾性文字)。因此,當模型在推理時遇到這些不屬於預定義「正文」範疇的文字時,往往會將其視為背景而忽略。這就解釋了為什麼「OCR 的完整度一定會大於 Structure」3。2.2.2 表格與公式識別子流水線PP-StructureV3 不僅檢測區域,還包含專門的子模型來處理特定區域。表格識別(SLANet): 當檢測到 Table 區域時,會裁剪該區域並送入表格識別模型(如 SLANet),輸出 HTML 源碼 3。問題: 表格識別模型有時會漏掉單元格內的文字,或者在複雜嵌套表格中產生結構錯誤。2.3 混合架構的理論基礎用戶的提議實際上是構建一個 「互補型集成系統(Complementary Ensemble System)」。基底(Base): 使用 PP-StructureV3 的結果作為文檔的「骨架(Skeleton)」。這保證了文檔的邏輯結構(段落、表格、閱讀順序)是正確的,便於後續轉換為 Markdown 或 Word。增強(Augmentation): 使用 PP-OCRv5 的結果作為「候選池(Candidate Pool)」。過濾(Filtering): 通過幾何比對,從候選池中剔除那些已經被骨架包含的元素。注入(Injection): 將剩餘的 OCR 元素(即骨架遺漏的部分)作為「游離元素(Floating Elements)」注入到文檔結構中。這種架構在理論上能夠達到 100% 的信息召回率,同時保持 90% 以上的結構化準確率,是處理複雜 PDF 的最優解。第三章 PaddleX 流水線機制與數據接口深度剖析為了實現上述理論架構,我們需要深入理解 PaddleX 3.0 的工程實現機制,特別是其數據輸入輸出接口。PaddleX 是百度推出的全流程開發工具,它對 PaddleOCR 進行了封裝,提供了更為統一和標準化的 Pipeline API 3。3.1 PaddleX Pipeline 的初始化與配置在 Python 環境中,PP-StructureV3 和 PP-OCRv5 可以作為獨立的 Pipeline 對象被調用。根據 和 13,初始化代碼通常如下:Pythonfrom paddleocr import PPStructureV3, PaddleOCR + +# 初始化 Structure 流水線 +structure_engine = PPStructureV3( + lang='ch', + show_log=True, + use_orientation_classify=True, + image_orientation=True +) + +# 初始化 OCR 流水線(用於獲取全量文字) +# 注意:這裡顯式使用 PaddleOCR 類來獲取 v5 的能力 +ocr_engine = PaddleOCR( + use_angle_cls=True, + lang="ch", + ocr_version="PP-OCRv5" +) +關鍵配置分析:recovery=True:在 PP-StructureV3 中啟用此選項至關重要,因為它會觸發閱讀順序恢復模塊,並生成版面恢復所需的輔助信息 14。use_pdf2docx_api:如果設置為 True,系統可能會嘗試直接解析 PDF 內部的文字層。對於掃描件或複雜 PDF,建議設置為 False 以強制使用 OCR 視覺模型,這樣能保證 OCR 結果與視覺結果的一致性 15。3.2 數據輸出結構解析:JSON 與 Dict理解 PaddleX 的輸出格式是實現「結果比較」的前提。根據 16,PP-StructureV3 的預測結果是一個包含豐富信息的字典(Dict)。3.2.1 PP-StructureV3 輸出對象 (res)當調用 pipeline.predict(img) 後,返回的 res 對象通常包含以下關鍵字段:字段鍵名 (Key)數據類型描述用途layout_det_resList版面檢測結果提供文檔的結構骨架(Text, Table, Figure 等區域的坐標)。overall_ocr_resList全局 OCR 結果關鍵字段。包含整張圖的所有文字檢測框和識別內容。res (或 regions)List整合後的區域結果這是經過內部匹配後的結果,每個區域內包含了對應的 OCR 文字。table_htmlString表格 HTML 代碼僅在 type='Table' 的區域中存在。特別注意: 18 指出,在某些版本的 PaddleX 中,overall_ocr_res 可能是一個獨立的鍵,與 layout_det_res 並列。這個 overall_ocr_res 實際上就是我們需要的「完整 OCR 讀取結果」。這意味著,用戶不需要顯式調用兩次模型(一次 OCR,一次 Structure)。PP-StructureV3 內部已經執行了全圖 OCR。我們可以從 Structure 的返回結果中直接提取出 overall_ocr_res 作為我們的「全集」,提取 layout_det_res 作為「子集結構」,然後在內存中進行比對。這將極大節省推理時間(Inference Time),避免重複計算 。然而,如果用戶希望使用特定參數配置的 OCR(例如調整了 det_db_thresh 以捕獲更淡的文字),則顯式運行獨立的 PaddleOCR 實例是必要的。在這種情況下,我們將有兩個獨立的結果集:Set A (Structure): 來自 structure_engine 的版面區域列表。Set B (OCR): 來自 ocr_engine 的文字行列表。3.3 數據結構的幾何表示為了進行比對,我們必須將這兩個集合中的「位置信息」標準化。OCR 結果: 通常為四點坐標 [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]。這是一個多邊形(Polygon),可能是矩形也可能是傾斜的四邊形。Structure 結果: 通常為 [x_min, y_min, x_max, y_max] 的軸對齊矩形(Axis-Aligned Bounding Box, AABB),但在某些複雜場景下也可能是四點坐標。數據標準化策略: 在 Python 腳本中,建議統一將所有坐標轉換為 shapely.geometry.Polygon 對象。這將為後續的交併比計算提供強大的數學支持 19。第四章 幾何融合算法:解決重複渲染的核心數學模型本章是解決用戶問題的核心技術部分。如何判斷一個 OCR 檢測到的文字行是否已經「包含」在某個版面區域(如段落或表格)中?簡單的中心點匹配往往不夠精確,特別是在文字行跨越區域邊界或區域重疊的情況下。我們需要引入 IoA (Intersection over Area) 的概念。4.1 IoU 與 IoA 的區別與應用在目標檢測中,常用的指標是 IoU (Intersection over Union):$$IoU(A, B) = \frac{Area(A \cap B)}{Area(A \cup B)}$$IoU 用於衡量兩個框的重合程度,通常用於判斷兩個預測框是否指向同一個物體。然而,在我們的場景中,關係是不對稱的。OCR 文字行(小框,記為 $B_{ocr}$)通常位於版面區域(大框,記為 $B_{layout}$)的內部。我們關心的是「$B_{ocr}$ 是否被 $B_{layout}$ 包含」。如果使用 IoU,由於 $B_{layout}$ 面積很大,IoU 數值會非常小,無法作為判斷依據。因此,我們必須使用 IoA (Intersection over Area),具體指的是 Intersection over OCR Area:$$IoA(B_{ocr}, B_{layout}) = \frac{Area(B_{ocr} \cap B_{layout})}{Area(B_{ocr})}$$這個公式計算的是:OCR 框的面積中,有多少比例落在了版面區域框的內部。如果 $IoA \approx 1.0$:表示 OCR 文字行完全在版面區域內。結論:忽略該 OCR 結果(因為 Structure 結果中已經包含了它)。如果 $IoA \approx 0$:表示 OCR 文字行完全在版面區域外。結論:保留該 OCR 結果(這是 Structure 遺漏的文字)。如果 $0 < IoA < 1.0$:表示部分重疊。這通常發生在邊界處。我們需要設定一個閾值(Threshold)。4.2 融合算法邏輯設計為了實現「結構優先,OCR 補漏」,我們設計如下的算法流程:輸入:Layout_List: 版面分析得到的區域列表(包含 Text, Table, Image 等)。OCR_List: 全局 OCR 得到的所有文字行列表。輸出:Final_Render_List: 用於最終渲染的元素列表,包含所有的 Layout 元素和補充的 OCR 元素。算法步驟:初始化: 將 Layout_List 中的所有元素加入 Final_Render_List。構建空間索引(可選但推薦): 為了加速查詢,使用 R-tree 將 Layout_List 的邊界框建立索引 21。遍歷過濾: 對 OCR_List 中的每一個元素 $T_{ocr}$ 進行檢查:設標記 is_redundant = False。遍歷 Layout_List 中的每一個區域 $R_{layout}$(或通過 R-tree 查詢相交的區域)。計算 $IoA = \frac{Area(T_{ocr} \cap R_{layout})}{Area(T_{ocr})}$。判定邏輯:若 $R_{layout}$ 的類型是 Text, Title, Header, Footer, List:若 $IoA > 0.6$(閾值),則判定 $T_{ocr}$ 為冗餘,設置 is_redundant = True,並跳出內層循環。若 $R_{layout}$ 的類型是 Table:表格區域的處理較為敏感。通常表格識別模型會重構表格內容。若 OCR 文字落在表格內,直接疊加會破壞表格結構。因此,通常若 $IoA > 0.1$(更嚴格的閾值),即視為冗餘。若 $R_{layout}$ 的類型是 Figure/Image:這取決於用戶需求。如果用戶希望提取圖片中的文字(如圖表中的數據點),則即使 $IoA$ 很高,也可以判定為不冗餘(即保留 OCR)。但通常為了版面整潔,Structure 往往會忽略圖中文字,因此這裡可以根據配置決定。結果收集: 若內層循環結束後,is_redundant 仍為 False,則說明該文字行是 Structure 遺漏的。將 $T_{ocr}$ 標記為 Floating Text(浮動文本),並加入 Final_Render_List。4.3 閾值的選擇與調優閾值的選擇至關重要。閾值過高(如 0.9): 要求 OCR 框幾乎完全在 Layout 框內才算重複。如果 Layout 框預測得稍微小了一點(邊界收縮),導致 OCR 框露出一部分,算法會錯誤地認為這是新文字並保留它,導致重複渲染(Ghosting Effect),即正文文字旁邊又出現了一遍同樣的文字。閾值過低(如 0.1): 只要有一點點重疊就刪除 OCR。這可能導致邊緣處的獨立註釋被錯誤刪除。經驗推薦值: 對於 Text/Paragraph 區域,建議設置 $IoA \in [0.5, 0.7]$。這能容忍版面檢測框的輕微誤差,同時有效區分獨立文本。第五章 工程實現策略:Python 代碼與庫的整合在實際的 Python 開發中,我們需要結合 paddleocr、shapely 和 numpy 來實現上述邏輯。以下是詳細的實現代碼結構分析。5.1 環境準備與依賴庫除 PaddleOCR 外,必須安裝幾何處理庫:Bashpip install shapely rtree +shapely 用於多邊形運算,rtree 用於空間索引(對於大頁面或批量處理非常有效)。5.2 核心融合函數實現以下代碼展示了如何利用 Shapely 庫實現高精度的過濾邏輯 19:Pythonfrom shapely.geometry import Polygon + +def calculate_ioa(ocr_poly, layout_poly): + """計算 Intersection over OCR Area""" + if not ocr_poly.intersects(layout_poly): + return 0.0 + try: + intersection_area = ocr_poly.intersection(layout_poly).area + ocr_area = ocr_poly.area + if ocr_area == 0: return 0.0 + return intersection_area / ocr_area + except Exception as e: + # 處理幾何拓撲錯誤 + return 0.0 + +def merge_structure_and_ocr(structure_res, ocr_res, ioa_thresh=0.6): + """ + 輸入: + structure_res: PP-StructureV3 的 layout_det_res 列表 + ocr_res: PaddleOCR 的全量識別結果 + + 輸出: + merged_list: 包含 layout 區域和補漏 OCR 的混合列表 + """ + + # 1. 將 Layout 區域轉換為 Shapely Polygon 對象,提升後續計算效率 + layout_polys = + for region in structure_res: + bbox = region['bbox'] # 假設格式為 [x1, y1, x2, y2] + # 構建矩形 Polygon + poly = Polygon([(bbox, bbox), (bbox, bbox), + (bbox, bbox), (bbox, bbox)]) + layout_polys.append({ + 'poly': poly, + 'type': region['label'], + 'data': region + }) + + final_items = + # 先加入所有的 Layout 元素 + for item in layout_polys: + final_items.append(item['data']) + + # 2. 遍歷 OCR 結果進行過濾 + for line in ocr_res: + points = line # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + text = line + ocr_poly = Polygon(points) + + is_covered = False + for layout_item in layout_polys: + l_poly = layout_item['poly'] + l_type = layout_item['type'] + + # 計算 IoA + ioa = calculate_ioa(ocr_poly, l_poly) + + # 針對不同類型的動態閾值策略 + current_thresh = ioa_thresh + if l_type == 'table': + current_thresh = 0.1 # 表格區域採用嚴格過濾 + elif l_type == 'figure': + current_thresh = 0.8 # 圖片區域允許更多容錯(或者設為 1.0 強制保留) + + if ioa > current_thresh: + is_covered = True + break + + if not is_covered: + # 這是 Structure 遺漏的文字 + # 將其包裝為類似 Structure 的格式 + new_item = { + 'type': 'text', # 或者標記為 'floating_text' + 'bbox': [min(p for p in points), min(p for p in points), + max(p for p in points), max(p for p in points)], + 'res': [{'text': text, 'confidence': line}], + 'is_patch': True # 標記這是補丁數據 + } + final_items.append(new_item) + + return final_items +5.3 數據結構的標準化與清洗在實際操作中,PP-OCRv5 返回的坐標通常是浮點數,且可能包含負值(如果檢測框超出圖像邊界)。在生成 Shapely 對象前,必須進行數據清洗:坐標取整與裁剪: 將坐標限制在 [0, 0, image_width, image_height] 範圍內。無效多邊形處理: 檢測出的四邊形可能存在自相交(Self-intersection),這會導致 Shapely 報錯。需使用 ocr_poly.buffer(0) 技巧來修復無效多邊形 19。第六章 特殊元素與邊界條件的處理策略在複雜版面中,表格、印章及跨頁元素是導致解析失敗的主要原因。本章針對這些特殊情況提出具體的處理策略。6.1 表格(Table)的衝突解決表格是文檔中最複雜的元素。PP-StructureV3 內置的表格識別模型會重建表格結構(HTML),這通常比單純的 OCR 文字拼接要好得多。問題: 有時表格識別模型會丟失單元格內容,或者 OCR 誤將表格線識別為「1」或「I」。策略:信任優先級: 默認信任表格識別模型。凡是落在 Table 區域內的 OCR 結果,一律過濾掉,避免破壞 HTML 結構。容錯回退: 如果表格區域內的 OCR 文字數量遠多於表格識別模型提取出的文字數量(例如多出 50%),則可能意味著表格識別失敗。此時應降級處理:丟棄表格結構,僅保留 OCR 文字,將其視為普通段落,以保證信息不丟失。6.2 圖片(Figure)中的文字學術論文或技術報告中的圖表往往包含軸標籤、圖例等文字。現狀: PP-StructureV3 的 Figure 區域通常只輸出圖片裁剪圖,不包含文字。用戶需求: 如果用戶需要全文檢索,圖中文字不能丟。實現: 在過濾算法中,對 Figure 類型的區域進行特殊處理。即使 $IoA$ 很高,也可以選擇不刪除該 OCR 結果,而是將其標記為 Figure Text 並在渲染時將其放置在圖片下方作為註釋,或者作為圖片的 alt 屬性。6.3 閱讀順序的重構(Reading Order Recovery)當我們將「補漏」的 OCR 文字加入 Final_Render_List 後,列表的順序是混亂的。直接渲染會導致文檔邏輯跳躍。必須重新執行閱讀順序排序算法。XY-Cut 算法: 這是文檔分析中最經典的排序算法。它遞歸地將頁面在水平或垂直投影的空隙處切分。融合排序:將所有元素(原有的 Layout 塊 + 新增的 OCR 塊)視為同等地位的節點。計算每個節點的中心點 $(C_x, C_y)$。根據文檔類型(單欄或多欄)應用啟發式排序規則。對於單欄文檔,簡單的 sorted(items, key=lambda k: (k.y, k.x))(從上到下,從左到右)通常足夠。對於多欄文檔,必須使用 PaddleX 內置的 sorted_layout_boxes 函數 3。該函數能夠處理複雜的列切分邏輯。重要的是,我們必須確保新加入的 OCR 塊的數據結構與 sorted_layout_boxes 函數要求的輸入格式完全一致。第七章 性能評估與優化建議引入幾何運算和雙重模型推斷會增加系統的計算負擔。本章分析性能影響並提供優化建議。7.1 計算複雜度分析推斷時間: 如果採用「一次推斷,提取兩份數據」的策略(即僅運行 StructureV3 並提取內部 OCR 結果),推斷時間幾乎沒有增加。如果分別運行兩個模型,時間將翻倍。後處理時間: 幾何過濾算法的複雜度為 $O(N \times M)$,其中 $N$ 為版面區域數(通常 < 50),$M$ 為 OCR 行數(可能 > 2000)。在 Python 中,計算 100,000 次多邊形交集大約需要 0.1 - 0.5 秒,這相對於模型推斷時間(通常數秒)是可以忽略不計的 24。7.2 空間索引優化對於極端密集的文檔(如報表、電話簿),$M$ 可能非常大。此時應使用 R-tree 索引。Pythonfrom rtree import index +idx = index.Index() +# 將 Layout 區域插入索引 +for i, region in enumerate(layout_regions): + idx.insert(i, region['bbox']) + +# 查詢時僅計算候選區域 +candidates = list(idx.intersection(ocr_box)) +這將複雜度降低至 $O(M \log N)$,確保系統在大規模批處理時的穩定性。7.3 重複渲染的極端案例與對策儘管有 IoA 過濾,仍可能出現視覺上的重複。這通常是因為 PaddleOCR 的檢測框比實際文字大(包含背景),或者版面區域比實際內容小。對策 - 邊界收縮(Shrinking): 在計算 IoA 之前,將 OCR 框向內收縮 1-2 像素,或者將版面區域向外擴張(Buffer/Dilate)5 像素。這增加了「被包含」的概率,能有效減少邊緣處的重複渲染 25。第八章 總結與未來展望本研究報告對基於 PaddleX 的 PP-OCRv5 與 PP-StructureV3 混合解析架構進行了全方位的技術論證。8.1 研究結論架構可行性: 用戶提出的「先 OCR、後 Structure、對比補全」的思路在技術上是完全可行的,且是解決複雜 PDF 解析中信息丟失問題的有效手段。核心價值: 該方案結合了 PP-OCRv5 的高召回率(>99%)和 PP-StructureV3 的高結構化能力,通過幾何約束算法消除了兩者之間的冗餘,實現了文檔解析的「帕累托最優」。關鍵技術點: 成功的關鍵在於 IoA (Intersection over Area) 算法的正確實現,以及對表格、圖片等特殊元素的差異化閾值設置。8.2 工程建議優先使用單一流水線: 建議優先嘗試從 PP-StructureV3 的 overall_ocr_res 中獲取 OCR 數據,以節省計算資源。精細化閾值調優: 開發者應建立一個包含各類典型壞例(Bad Cases)的驗證集,通過自動化測試來尋找最佳的 IoA 閾值。數據結構對齊: 在將補漏數據注入渲染列表時,務必保證數據字段(bbox, text, type)與原始 Structure 輸出保持一致,以復用 PaddleX 的 recovery_to_docx 等後處理工具。綜上所述,該混合架構不僅能解決當前的文字缺漏問題,也為未來構建更智能的 RAG(檢索增強生成)知識庫提供了高質量的結構化數據基礎。這是一條值得投入的工程實踐路徑。 \ No newline at end of file