feat: enable document orientation detection for scanned PDFs
- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ from PIL import Image
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from app.core.config import settings
|
||||
from app.utils.bbox_utils import normalize_bbox
|
||||
|
||||
# Import table column corrector for column alignment fix
|
||||
try:
|
||||
@@ -1258,8 +1259,44 @@ class PDFGeneratorService:
|
||||
else:
|
||||
logger.warning(f"Image file not found: {saved_path}")
|
||||
|
||||
# Also check for embedded images in table elements
|
||||
# These are images detected inside table regions by PP-Structure
|
||||
elif elem_type == 'table':
|
||||
metadata = elem.metadata if hasattr(elem, 'metadata') else elem.get('metadata', {})
|
||||
embedded_images = metadata.get('embedded_images', []) if metadata else []
|
||||
for emb_img in embedded_images:
|
||||
emb_bbox = emb_img.get('bbox', [])
|
||||
if emb_bbox and len(emb_bbox) >= 4:
|
||||
ex0, ey0, ex1, ey1 = emb_bbox[0], emb_bbox[1], emb_bbox[2], emb_bbox[3]
|
||||
exclusion_zones.append((ex0, ey0, ex1, ey1))
|
||||
|
||||
# Also render the embedded image
|
||||
saved_path = emb_img.get('saved_path', '')
|
||||
if saved_path:
|
||||
image_path = result_dir / saved_path
|
||||
if not image_path.exists():
|
||||
image_path = result_dir / Path(saved_path).name
|
||||
if image_path.exists():
|
||||
try:
|
||||
pdf_x = ex0
|
||||
pdf_y = current_height - ey1
|
||||
img_width = ex1 - ex0
|
||||
img_height = ey1 - ey0
|
||||
pdf_canvas.drawImage(
|
||||
str(image_path),
|
||||
pdf_x, pdf_y,
|
||||
width=img_width,
|
||||
height=img_height,
|
||||
preserveAspectRatio=True,
|
||||
mask='auto'
|
||||
)
|
||||
image_elements_rendered += 1
|
||||
logger.debug(f"Rendered embedded image: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to render embedded image {saved_path}: {e}")
|
||||
|
||||
if image_elements_rendered > 0:
|
||||
logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)")
|
||||
logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas/embedded)")
|
||||
|
||||
if exclusion_zones:
|
||||
logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")
|
||||
@@ -1857,38 +1894,8 @@ class PDFGeneratorService:
|
||||
return None
|
||||
|
||||
def _get_bbox_coords(self, bbox: Union[Dict, List[List[float]], List[float]]) -> Optional[Tuple[float, float, float, float]]:
|
||||
"""將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]"""
|
||||
try:
|
||||
if bbox is None:
|
||||
return None
|
||||
|
||||
# Dict format from UnifiedDocument: {"x0": ..., "y0": ..., "x1": ..., "y1": ...}
|
||||
if isinstance(bbox, dict):
|
||||
if 'x0' in bbox and 'y0' in bbox and 'x1' in bbox and 'y1' in bbox:
|
||||
return float(bbox['x0']), float(bbox['y0']), float(bbox['x1']), float(bbox['y1'])
|
||||
else:
|
||||
logger.warning(f"Dict bbox 缺少必要欄位: {bbox}")
|
||||
return None
|
||||
|
||||
if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
|
||||
return None
|
||||
|
||||
if isinstance(bbox[0], (list, tuple)):
|
||||
# 處理多邊形 [[x, y], ...]
|
||||
x_coords = [p[0] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||||
y_coords = [p[1] for p in bbox if isinstance(p, (list, tuple)) and len(p) >= 2]
|
||||
if not x_coords or not y_coords:
|
||||
return None
|
||||
return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
|
||||
elif isinstance(bbox[0], (int, float)) and len(bbox) == 4:
|
||||
# 處理 [x1, y1, x2, y2]
|
||||
return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
|
||||
else:
|
||||
logger.warning(f"未知的 bbox 格式: {bbox}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"解析 bbox {bbox} 時出錯: {e}")
|
||||
return None
|
||||
"""將任何 bbox 格式 (dict, 多邊形或 [x1,y1,x2,y2]) 轉換為 [x_min, y_min, x_max, y_max]. Uses shared bbox utility."""
|
||||
return normalize_bbox(bbox)
|
||||
|
||||
def _is_bbox_inside(self, inner_bbox_data: Dict, outer_bbox_data: Dict, tolerance: float = 5.0) -> bool:
|
||||
"""
|
||||
@@ -2463,29 +2470,7 @@ class PDFGeneratorService:
|
||||
else:
|
||||
logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
|
||||
else:
|
||||
# Grid mismatch: try cellboxes-first rendering if enabled
|
||||
if settings.table_rendering_prefer_cellboxes:
|
||||
logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
|
||||
from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
|
||||
renderer = TableRenderer(TableRenderConfig())
|
||||
success = renderer.render_from_cellboxes_grid(
|
||||
pdf_canvas,
|
||||
cell_boxes,
|
||||
html_content,
|
||||
tuple(raw_bbox),
|
||||
page_height,
|
||||
scale_w,
|
||||
scale_h,
|
||||
row_threshold=settings.table_cellboxes_row_threshold,
|
||||
col_threshold=settings.table_cellboxes_col_threshold
|
||||
)
|
||||
if success:
|
||||
logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
|
||||
return # Table fully rendered, exit early
|
||||
else:
|
||||
logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
|
||||
else:
|
||||
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
|
||||
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
|
||||
else:
|
||||
logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")
|
||||
|
||||
@@ -2942,47 +2927,16 @@ class PDFGeneratorService:
|
||||
"""
|
||||
Check the quality of cell_boxes to determine rendering strategy.
|
||||
|
||||
Always returns 'good' to use pure PP-Structure output (quality check removed).
|
||||
|
||||
Args:
|
||||
cell_boxes: List of cell bounding boxes
|
||||
element_id: Optional element ID for logging
|
||||
|
||||
Returns:
|
||||
'good' if cell_boxes form a proper grid, 'bad' otherwise
|
||||
'good' - always use cell_boxes rendering
|
||||
"""
|
||||
# If quality check is disabled, always return 'good' to use pure PP-Structure output
|
||||
if not settings.table_quality_check_enabled:
|
||||
logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
|
||||
return 'good'
|
||||
|
||||
if not cell_boxes or len(cell_boxes) < 2:
|
||||
logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})")
|
||||
return 'bad' # No cell_boxes or too few
|
||||
|
||||
# Count overlapping cell pairs
|
||||
overlap_count = 0
|
||||
for i, box1 in enumerate(cell_boxes):
|
||||
for j, box2 in enumerate(cell_boxes):
|
||||
if i >= j:
|
||||
continue
|
||||
if not isinstance(box1, (list, tuple)) or len(box1) < 4:
|
||||
continue
|
||||
if not isinstance(box2, (list, tuple)) or len(box2) < 4:
|
||||
continue
|
||||
x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
|
||||
y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
|
||||
if x_overlap and y_overlap:
|
||||
overlap_count += 1
|
||||
|
||||
total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
|
||||
overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
|
||||
|
||||
# Relaxed threshold: 20% overlap instead of 10% to allow more tables through
|
||||
# This is because PP-StructureV3's cell detection sometimes has slight overlaps
|
||||
if overlap_ratio > 0.20:
|
||||
logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
|
||||
return 'bad'
|
||||
|
||||
logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}")
|
||||
logger.debug(f"[TABLE QUALITY] {element_id}: good - pure PP-Structure mode")
|
||||
return 'good'
|
||||
|
||||
def _draw_table_with_cell_boxes(
|
||||
|
||||
Reference in New Issue
Block a user