feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res (tries label_names, class_ids, labels, then angle)
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions

View File

@@ -28,11 +28,9 @@ from PIL import Image
import numpy as np
import cv2
from app.models.unified_document import ElementType
from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
from app.core.config import settings
from app.services.memory_manager import prediction_context
from app.services.cv_table_detector import CVTableDetector
from app.services.table_content_rebuilder import TableContentRebuilder
logger = logging.getLogger(__name__)
@@ -159,6 +157,7 @@ class PPStructureEnhanced:
all_images = []
all_tables = []
visualization_dir = None
detected_rotation = "0" # Default: no rotation
# Process each page result
for page_idx, page_result in enumerate(results):
@@ -247,6 +246,56 @@ class PPStructureEnhanced:
ocr_count = len(overall_ocr_res.get('rec_texts', []))
logger.info(f"Found overall_ocr_res with {ocr_count} text regions")
# Extract doc_preprocessor_res for orientation detection
# When use_doc_orientation_classify=True, this contains the detected rotation angle
# Note: doc_preprocessor_res may be at top-level result_json OR inside 'res'
doc_preprocessor_res = None
# First, check result_dict (might be result_json['res'])
if 'doc_preprocessor_res' in result_dict:
doc_preprocessor_res = result_dict['doc_preprocessor_res']
logger.info("Found doc_preprocessor_res in result_dict")
# Otherwise, fall back to the top-level result_json (page_result.json) when it is a dict
elif hasattr(page_result, 'json') and isinstance(page_result.json, dict):
if 'doc_preprocessor_res' in page_result.json:
doc_preprocessor_res = page_result.json['doc_preprocessor_res']
logger.info("Found doc_preprocessor_res at top-level result_json")
# Debug: Log available keys to help diagnose structure issues
if doc_preprocessor_res is None:
logger.warning(f"doc_preprocessor_res NOT found. result_dict keys: {list(result_dict.keys()) if result_dict else 'None'}")
if hasattr(page_result, 'json') and isinstance(page_result.json, dict):
logger.warning(f"result_json keys: {list(page_result.json.keys())}")
if doc_preprocessor_res:
# Debug: Log the complete structure of doc_preprocessor_res
logger.info(f"doc_preprocessor_res keys: {list(doc_preprocessor_res.keys()) if isinstance(doc_preprocessor_res, dict) else type(doc_preprocessor_res)}")
logger.info(f"doc_preprocessor_res content: {doc_preprocessor_res}")
# Try multiple possible key names for rotation info
# PaddleOCR may use different structures depending on version
label_names = doc_preprocessor_res.get('label_names', [])
class_ids = doc_preprocessor_res.get('class_ids', [])
labels = doc_preprocessor_res.get('labels', [])
angle = doc_preprocessor_res.get('angle', None)
# Determine rotation from available data
detected_rotation = "0"
if label_names:
detected_rotation = str(label_names[0])
elif class_ids:
# class_ids: 0=0°, 1=90°, 2=180°, 3=270°
rotation_map = {0: "0", 1: "90", 2: "180", 3: "270"}
detected_rotation = rotation_map.get(class_ids[0], "0")
elif labels:
detected_rotation = str(labels[0])
elif angle is not None:
detected_rotation = str(angle)
logger.info(f"Document orientation detected: {detected_rotation}° (label_names={label_names}, class_ids={class_ids}, labels={labels}, angle={angle})")
else:
detected_rotation = "0" # Default: no rotation
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
@@ -295,7 +344,8 @@ class PPStructureEnhanced:
'tables': all_tables,
'images': all_images,
'element_types': self._count_element_types(all_elements),
'has_parsing_res_list': parsing_res_list is not None
'has_parsing_res_list': parsing_res_list is not None,
'detected_rotation': detected_rotation # Document orientation: "0", "90", "180", "270"
}
# Add visualization directory if available
@@ -653,42 +703,6 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# 4. Table content rebuilding from raw OCR regions
# When cell_boxes have boundary issues, rebuild table content from raw OCR
# Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
rebuilder = TableContentRebuilder()
should_rebuild, rebuild_reason = rebuilder.should_rebuild(
element['cell_boxes'],
bbox,
element.get('html', '')
)
if should_rebuild:
logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
cell_boxes=element['cell_boxes'],
table_bbox=bbox,
raw_ocr_regions=raw_ocr_regions,
original_html=element.get('html', '')
)
if rebuilt_table:
# Update element with rebuilt content
element['html'] = rebuilt_table['html']
element['rebuilt_table'] = rebuilt_table
element['rebuild_stats'] = rebuild_stats
element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
logger.info(
f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
f"with {len(rebuilt_table['cells'])} cells"
)
else:
logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
element['rebuild_stats'] = rebuild_stats
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
@@ -718,21 +732,6 @@ class PPStructureEnhanced:
elements.append(element)
logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")
# Apply cell validation to filter over-detected tables
if settings.cell_validation_enabled:
cell_validator = CellValidationEngine(CellValidationConfig(
max_cell_density=settings.cell_validation_max_density,
min_avg_cell_area=settings.cell_validation_min_cell_area,
min_cell_height=settings.cell_validation_min_cell_height,
enabled=True
))
elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
if validation_stats['reclassified_tables'] > 0:
logger.info(
f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
f"tables reclassified as TEXT due to over-detection"
)
return elements
def _embed_images_in_table(