feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -28,11 +28,9 @@ from PIL import Image
 import numpy as np
 import cv2
 from app.models.unified_document import ElementType
-from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
 from app.core.config import settings
 from app.services.memory_manager import prediction_context
 from app.services.cv_table_detector import CVTableDetector
-from app.services.table_content_rebuilder import TableContentRebuilder

 logger = logging.getLogger(__name__)

@@ -159,6 +157,7 @@ class PPStructureEnhanced:
            all_images = []
            all_tables = []
            visualization_dir = None
+            detected_rotation = "0"  # Default: no rotation

            # Process each page result
            for page_idx, page_result in enumerate(results):
@@ -247,6 +246,56 @@ class PPStructureEnhanced:
                        ocr_count = len(overall_ocr_res.get('rec_texts', []))
                        logger.info(f"Found overall_ocr_res with {ocr_count} text regions")

+                    # Extract doc_preprocessor_res for orientation detection
+                    # When use_doc_orientation_classify=True, this contains the detected rotation angle
+                    # Note: doc_preprocessor_res may be at top-level result_json OR inside 'res'
+                    doc_preprocessor_res = None
+
+                    # First, check result_dict (might be result_json['res'])
+                    if 'doc_preprocessor_res' in result_dict:
+                        doc_preprocessor_res = result_dict['doc_preprocessor_res']
+                        logger.info("Found doc_preprocessor_res in result_dict")
+                    # Also check top-level result_json if it exists and differs from result_dict
+                    elif hasattr(page_result, 'json') and isinstance(page_result.json, dict):
+                        if 'doc_preprocessor_res' in page_result.json:
+                            doc_preprocessor_res = page_result.json['doc_preprocessor_res']
+                            logger.info("Found doc_preprocessor_res at top-level result_json")
+
+                    # Debug: Log available keys to help diagnose structure issues
+                    if doc_preprocessor_res is None:
+                        logger.warning(f"doc_preprocessor_res NOT found. result_dict keys: {list(result_dict.keys()) if result_dict else 'None'}")
+                        if hasattr(page_result, 'json') and isinstance(page_result.json, dict):
+                            logger.warning(f"result_json keys: {list(page_result.json.keys())}")
+
+                    if doc_preprocessor_res:
+                        # Debug: Log the complete structure of doc_preprocessor_res
+                        logger.info(f"doc_preprocessor_res keys: {list(doc_preprocessor_res.keys()) if isinstance(doc_preprocessor_res, dict) else type(doc_preprocessor_res)}")
+                        logger.info(f"doc_preprocessor_res content: {doc_preprocessor_res}")
+
+                        # Try multiple possible key names for rotation info
+                        # PaddleOCR may use different structures depending on version
+                        label_names = doc_preprocessor_res.get('label_names', [])
+                        class_ids = doc_preprocessor_res.get('class_ids', [])
+                        labels = doc_preprocessor_res.get('labels', [])
+                        angle = doc_preprocessor_res.get('angle', None)
+
+                        # Determine rotation from available data
+                        detected_rotation = "0"
+                        if label_names:
+                            detected_rotation = str(label_names[0])
+                        elif class_ids:
+                            # class_ids: 0=0°, 1=90°, 2=180°, 3=270°
+                            rotation_map = {0: "0", 1: "90", 2: "180", 3: "270"}
+                            detected_rotation = rotation_map.get(class_ids[0], "0")
+                        elif labels:
+                            detected_rotation = str(labels[0])
+                        elif angle is not None:
+                            detected_rotation = str(angle)
+
+                        logger.info(f"Document orientation detected: {detected_rotation}° (label_names={label_names}, class_ids={class_ids}, labels={labels}, angle={angle})")
+                    else:
+                        detected_rotation = "0"  # Default: no rotation
+
                # Process parsing_res_list if found
                if parsing_res_list:
                    elements = self._process_parsing_res_list(
@@ -295,7 +344,8 @@ class PPStructureEnhanced:
                'tables': all_tables,
                'images': all_images,
                'element_types': self._count_element_types(all_elements),
-                'has_parsing_res_list': parsing_res_list is not None
+                'has_parsing_res_list': parsing_res_list is not None,
+                'detected_rotation': detected_rotation  # Document orientation: "0", "90", "180", "270"
            }

            # Add visualization directory if available
@@ -653,42 +703,6 @@ class PPStructureEnhanced:
                        element['embedded_images'] = embedded_images
                        logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")

-                # 4. Table content rebuilding from raw OCR regions
-                # When cell_boxes have boundary issues, rebuild table content from raw OCR
-                # Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
-                logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
-                logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
-                if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
-                    rebuilder = TableContentRebuilder()
-                    should_rebuild, rebuild_reason = rebuilder.should_rebuild(
-                        element['cell_boxes'],
-                        bbox,
-                        element.get('html', '')
-                    )
-
-                    if should_rebuild:
-                        logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
-                        rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
-                            cell_boxes=element['cell_boxes'],
-                            table_bbox=bbox,
-                            raw_ocr_regions=raw_ocr_regions,
-                            original_html=element.get('html', '')
-                        )
-
-                        if rebuilt_table:
-                            # Update element with rebuilt content
-                            element['html'] = rebuilt_table['html']
-                            element['rebuilt_table'] = rebuilt_table
-                            element['rebuild_stats'] = rebuild_stats
-                            element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
-                            logger.info(
-                                f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
-                                f"with {len(rebuilt_table['cells'])} cells"
-                            )
-                        else:
-                            logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
-                            element['rebuild_stats'] = rebuild_stats
-
            # Special handling for images/figures/charts/stamps (visual elements that need cropping)
            elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
                # Save image if path provided
@@ -718,21 +732,6 @@ class PPStructureEnhanced:
            elements.append(element)
            logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")

-        # Apply cell validation to filter over-detected tables
-        if settings.cell_validation_enabled:
-            cell_validator = CellValidationEngine(CellValidationConfig(
-                max_cell_density=settings.cell_validation_max_density,
-                min_avg_cell_area=settings.cell_validation_min_cell_area,
-                min_cell_height=settings.cell_validation_min_cell_height,
-                enabled=True
-            ))
-            elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
-            if validation_stats['reclassified_tables'] > 0:
-                logger.info(
-                    f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
-                    f"tables reclassified as TEXT due to over-detection"
-                )
-
        return elements

    def _embed_images_in_table(