feat: enable document orientation detection for scanned PDFs

- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
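- Code cleanup (bbox parsing delegated to the shared `normalize_bbox` utility; cellboxes-first table fallback and the table quality check removed) and frontend simplification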

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 17:13:46 +08:00
parent 57070af307
commit cfe65158a3
58 changed files with 1271 additions and 3048 deletions

View File

@@ -25,6 +25,7 @@ from PIL import Image
from html.parser import HTMLParser
from app.core.config import settings
from app.utils.bbox_utils import normalize_bbox
# Import table column corrector for column alignment fix
try:
@@ -1258,8 +1259,44 @@ class PDFGeneratorService:
else:
logger.warning(f"Image file not found: {saved_path}")
# Also check for embedded images in table elements
# These are images detected inside table regions by PP-Structure
elif elem_type == 'table':
metadata = elem.metadata if hasattr(elem, 'metadata') else elem.get('metadata', {})
embedded_images = metadata.get('embedded_images', []) if metadata else []
for emb_img in embedded_images:
emb_bbox = emb_img.get('bbox', [])
if emb_bbox and len(emb_bbox) >= 4:
ex0, ey0, ex1, ey1 = emb_bbox[0], emb_bbox[1], emb_bbox[2], emb_bbox[3]
exclusion_zones.append((ex0, ey0, ex1, ey1))
# Also render the embedded image
saved_path = emb_img.get('saved_path', '')
if saved_path:
image_path = result_dir / saved_path
if not image_path.exists():
image_path = result_dir / Path(saved_path).name
if image_path.exists():
try:
pdf_x = ex0
pdf_y = current_height - ey1
img_width = ex1 - ex0
img_height = ey1 - ey0
pdf_canvas.drawImage(
str(image_path),
pdf_x, pdf_y,
width=img_width,
height=img_height,
preserveAspectRatio=True,
mask='auto'
)
image_elements_rendered += 1
logger.debug(f"Rendered embedded image: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
except Exception as e:
logger.warning(f"Failed to render embedded image {saved_path}: {e}")
if image_elements_rendered > 0:
logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)")
logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas/embedded)")
if exclusion_zones:
logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")
@@ -1857,38 +1894,8 @@ class PDFGeneratorService:
return None
def _get_bbox_coords(self, bbox: Union[Dict, List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
"""將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
try:
if bbox is None:
return None
# Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
if isinstance(bbox, dict):
if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
return float(bbox['x0']), float(bbox['y0']), float(bbox['x1']), float(bbox['y1'])
else:
logger.warning(f"Dict bbox 缺少必要欄位: {bbox}")
return None
if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
return None
if isinstance(bbox[0], (list, tuple)):
# 處理多邊形 [[x, y], ...]
x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
if not x_coords or not y_coords:
return None
return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
# 處理 [x1, y1, x2, y2]
return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
else:
logger.warning(f"未知的 bbox 格式: {bbox}")
return None
except Exception as e:
logger.error(f"解析 bbox {bbox} 時出錯: {e}")
return None
"""將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]. Uses shared bbox utility."""
return normalize_bbox(bbox)
def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
"""
@@ -2463,29 +2470,7 @@ class PDFGeneratorService:
else:
logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
else:
# Grid mismatch: try cellboxes-first rendering if enabled
if settings.table_rendering_prefer_cellboxes:
logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
renderer = TableRenderer(TableRenderConfig())
success = renderer.render_from_cellboxes_grid(
pdf_canvas,
cell_boxes,
html_content,
tuple(raw_bbox),
page_height,
scale_w,
scale_h,
row_threshold=settings.table_cellboxes_row_threshold,
col_threshold=settings.table_cellboxes_col_threshold
)
if success:
logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
return # Table fully rendered, exit early
else:
logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
else:
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
else:
logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")
@@ -2942,47 +2927,16 @@ class PDFGeneratorService:
"""
Check the quality of cell_boxes to determine rendering strategy.
Always returns 'good' to use pure PP-Structure output (quality check removed).
Args:
cell_boxes: List of cell bounding boxes
element_id: Optional element ID for logging
Returns:
'good' if cell_boxes form a proper grid, 'bad' otherwise
'good' - always use cell_boxes rendering
"""
# If quality check is disabled, always return 'good' to use pure PP-Structure output
if not settings.table_quality_check_enabled:
logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
return 'good'
if not cell_boxes or len(cell_boxes) < 2:
logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})")
return 'bad' # No cell_boxes or too few
# Count overlapping cell pairs
overlap_count = 0
for i, box1 in enumerate(cell_boxes):
for j, box2 in enumerate(cell_boxes):
if i >= j:
continue
if not isinstance(box1, (list, tuple)) or len(box1) < 4:
continue
if not isinstance(box2, (list, tuple)) or len(box2) < 4:
continue
x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
if x_overlap and y_overlap:
overlap_count += 1
total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
# Relaxed threshold: 20% overlap instead of 10% to allow more tables through
# This is because PP-StructureV3's cell detection sometimes has slight overlaps
if overlap_ratio > 0.20:
logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
return 'bad'
logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}")
logger.debug(f"[TABLE QUALITY] {element_id}: good - pure PP-Structure mode")
return 'good'
def _draw_table_with_cell_boxes(