feat: enable document orientation detection for scanned PDFs
- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -28,11 +28,9 @@ from PIL import Image
|
||||
import numpy as np
|
||||
import cv2
|
||||
from app.models.unified_document import ElementType
|
||||
from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
|
||||
from app.core.config import settings
|
||||
from app.services.memory_manager import prediction_context
|
||||
from app.services.cv_table_detector import CVTableDetector
|
||||
from app.services.table_content_rebuilder import TableContentRebuilder
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -159,6 +157,7 @@ class PPStructureEnhanced:
|
||||
all_images = []
|
||||
all_tables = []
|
||||
visualization_dir = None
|
||||
detected_rotation = "0" # Default: no rotation
|
||||
|
||||
# Process each page result
|
||||
for page_idx, page_result in enumerate(results):
|
||||
@@ -247,6 +246,56 @@ class PPStructureEnhanced:
|
||||
ocr_count = len(overall_ocr_res.get('rec_texts', []))
|
||||
logger.info(f"Found overall_ocr_res with {ocr_count} text regions")
|
||||
|
||||
# Extract doc_preprocessor_res for orientation detection
|
||||
# When use_doc_orientation_classify=True, this contains the detected rotation angle
|
||||
# Note: doc_preprocessor_res may be at top-level result_json OR inside 'res'
|
||||
doc_preprocessor_res = None
|
||||
|
||||
# First, check result_dict (might be result_json['res'])
|
||||
if 'doc_preprocessor_res' in result_dict:
|
||||
doc_preprocessor_res = result_dict['doc_preprocessor_res']
|
||||
logger.info("Found doc_preprocessor_res in result_dict")
|
||||
# Also check top-level result_json if it exists and differs from result_dict
|
||||
elif hasattr(page_result, 'json') and isinstance(page_result.json, dict):
|
||||
if 'doc_preprocessor_res' in page_result.json:
|
||||
doc_preprocessor_res = page_result.json['doc_preprocessor_res']
|
||||
logger.info("Found doc_preprocessor_res at top-level result_json")
|
||||
|
||||
# Debug: Log available keys to help diagnose structure issues
|
||||
if doc_preprocessor_res is None:
|
||||
logger.warning(f"doc_preprocessor_res NOT found. result_dict keys: {list(result_dict.keys()) if result_dict else 'None'}")
|
||||
if hasattr(page_result, 'json') and isinstance(page_result.json, dict):
|
||||
logger.warning(f"result_json keys: {list(page_result.json.keys())}")
|
||||
|
||||
if doc_preprocessor_res:
|
||||
# Debug: Log the complete structure of doc_preprocessor_res
|
||||
logger.info(f"doc_preprocessor_res keys: {list(doc_preprocessor_res.keys()) if isinstance(doc_preprocessor_res, dict) else type(doc_preprocessor_res)}")
|
||||
logger.info(f"doc_preprocessor_res content: {doc_preprocessor_res}")
|
||||
|
||||
# Try multiple possible key names for rotation info
|
||||
# PaddleOCR may use different structures depending on version
|
||||
label_names = doc_preprocessor_res.get('label_names', [])
|
||||
class_ids = doc_preprocessor_res.get('class_ids', [])
|
||||
labels = doc_preprocessor_res.get('labels', [])
|
||||
angle = doc_preprocessor_res.get('angle', None)
|
||||
|
||||
# Determine rotation from available data
|
||||
detected_rotation = "0"
|
||||
if label_names:
|
||||
detected_rotation = str(label_names[0])
|
||||
elif class_ids:
|
||||
# class_ids: 0=0°, 1=90°, 2=180°, 3=270°
|
||||
rotation_map = {0: "0", 1: "90", 2: "180", 3: "270"}
|
||||
detected_rotation = rotation_map.get(class_ids[0], "0")
|
||||
elif labels:
|
||||
detected_rotation = str(labels[0])
|
||||
elif angle is not None:
|
||||
detected_rotation = str(angle)
|
||||
|
||||
logger.info(f"Document orientation detected: {detected_rotation}° (label_names={label_names}, class_ids={class_ids}, labels={labels}, angle={angle})")
|
||||
else:
|
||||
detected_rotation = "0" # Default: no rotation
|
||||
|
||||
# Process parsing_res_list if found
|
||||
if parsing_res_list:
|
||||
elements = self._process_parsing_res_list(
|
||||
@@ -295,7 +344,8 @@ class PPStructureEnhanced:
|
||||
'tables': all_tables,
|
||||
'images': all_images,
|
||||
'element_types': self._count_element_types(all_elements),
|
||||
'has_parsing_res_list': parsing_res_list is not None
|
||||
'has_parsing_res_list': parsing_res_list is not None,
|
||||
'detected_rotation': detected_rotation # Document orientation: "0", "90", "180", "270"
|
||||
}
|
||||
|
||||
# Add visualization directory if available
|
||||
@@ -653,42 +703,6 @@ class PPStructureEnhanced:
|
||||
element['embedded_images'] = embedded_images
|
||||
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
|
||||
|
||||
# 4. Table content rebuilding from raw OCR regions
|
||||
# When cell_boxes have boundary issues, rebuild table content from raw OCR
|
||||
# Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
|
||||
logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
|
||||
logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
|
||||
if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
|
||||
rebuilder = TableContentRebuilder()
|
||||
should_rebuild, rebuild_reason = rebuilder.should_rebuild(
|
||||
element['cell_boxes'],
|
||||
bbox,
|
||||
element.get('html', '')
|
||||
)
|
||||
|
||||
if should_rebuild:
|
||||
logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
|
||||
rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
|
||||
cell_boxes=element['cell_boxes'],
|
||||
table_bbox=bbox,
|
||||
raw_ocr_regions=raw_ocr_regions,
|
||||
original_html=element.get('html', '')
|
||||
)
|
||||
|
||||
if rebuilt_table:
|
||||
# Update element with rebuilt content
|
||||
element['html'] = rebuilt_table['html']
|
||||
element['rebuilt_table'] = rebuilt_table
|
||||
element['rebuild_stats'] = rebuild_stats
|
||||
element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
|
||||
logger.info(
|
||||
f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
|
||||
f"with {len(rebuilt_table['cells'])} cells"
|
||||
)
|
||||
else:
|
||||
logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
|
||||
element['rebuild_stats'] = rebuild_stats
|
||||
|
||||
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
|
||||
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
|
||||
# Save image if path provided
|
||||
@@ -718,21 +732,6 @@ class PPStructureEnhanced:
|
||||
elements.append(element)
|
||||
logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")
|
||||
|
||||
# Apply cell validation to filter over-detected tables
|
||||
if settings.cell_validation_enabled:
|
||||
cell_validator = CellValidationEngine(CellValidationConfig(
|
||||
max_cell_density=settings.cell_validation_max_density,
|
||||
min_avg_cell_area=settings.cell_validation_min_cell_area,
|
||||
min_cell_height=settings.cell_validation_min_cell_height,
|
||||
enabled=True
|
||||
))
|
||||
elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
|
||||
if validation_stats['reclassified_tables'] > 0:
|
||||
logger.info(
|
||||
f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
|
||||
f"tables reclassified as TEXT due to over-detection"
|
||||
)
|
||||
|
||||
return elements
|
||||
|
||||
def _embed_images_in_table(
|
||||
|
||||
Reference in New Issue
Block a user