chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -26,6 +26,23 @@ from html.parser import HTMLParser
from app.core.config import settings
# Import table column corrector for column alignment fix
try:
from app.services.table_column_corrector import TableColumnCorrector
TABLE_COLUMN_CORRECTOR_AVAILABLE = True
except ImportError:
TABLE_COLUMN_CORRECTOR_AVAILABLE = False
TableColumnCorrector = None
# Import text region renderer for simple text positioning
try:
from app.services.text_region_renderer import TextRegionRenderer, load_raw_ocr_regions
TEXT_REGION_RENDERER_AVAILABLE = True
except ImportError:
TEXT_REGION_RENDERER_AVAILABLE = False
TextRegionRenderer = None
load_raw_ocr_regions = None
# Import UnifiedDocument for dual-track support
try:
from app.models.unified_document import (
@@ -596,7 +613,8 @@ class PDFGeneratorService:
'content': html_content,
'bbox': [element.bbox.x0, element.bbox.y0,
element.bbox.x1, element.bbox.y1],
'page': page_num - 1 # layout uses 0-based
'page': page_num - 1, # layout uses 0-based
'element_id': element.element_id # For _use_border_only matching
}
# Preserve cell_boxes and embedded_images from metadata
@@ -607,18 +625,29 @@ class PDFGeneratorService:
table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
if 'embedded_images' in element.metadata:
table_element['embedded_images'] = element.metadata['embedded_images']
# Pass through rebuild flag - rebuilt tables should use HTML content
if element.metadata.get('was_rebuilt'):
table_element['was_rebuilt'] = True
logger.debug(f"Table {element.element_id}: marked as rebuilt")
layout_elements.append(table_element)
# Add bbox to images_metadata for text overlap filtering
# (no actual image file, just bbox for filtering)
images_metadata.append({
img_metadata = {
'image_path': None, # No fake table image
'bbox': bbox_polygon,
'page': page_num - 1, # 0-based for images_metadata
'type': 'table',
'element_id': element.element_id
})
}
# Also copy cell_boxes for quality checking
if element.metadata and 'cell_boxes' in element.metadata:
img_metadata['cell_boxes'] = element.metadata['cell_boxes']
# Mark if table was rebuilt
if element.metadata and element.metadata.get('was_rebuilt'):
img_metadata['was_rebuilt'] = True
images_metadata.append(img_metadata)
# Handle image/visual elements (including stamps/seals)
elif element.is_visual or element.type in [
@@ -1022,15 +1051,25 @@ class PDFGeneratorService:
# Set current track
self.current_processing_track = 'ocr'
# Convert UnifiedDocument to OCR data format (legacy)
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
# Check if simple text positioning mode is enabled
if (settings.simple_text_positioning_enabled and
TEXT_REGION_RENDERER_AVAILABLE):
logger.info("Using simple text positioning mode")
result = self._generate_simple_text_pdf(
unified_doc=unified_doc,
output_path=output_path,
source_file_path=source_file_path
)
else:
# Convert UnifiedDocument to OCR data format (legacy)
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
# Use existing generation pipeline
result = self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
# Use existing generation pipeline
result = self._generate_pdf_from_data(
ocr_data=ocr_data,
output_path=output_path,
source_file_path=source_file_path
)
# Reset track
self.current_processing_track = None
@@ -1043,6 +1082,235 @@ class PDFGeneratorService:
self.current_processing_track = None
return False
def _generate_simple_text_pdf(
    self,
    unified_doc: 'UnifiedDocument',
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate PDF using simple text positioning from raw OCR regions.

    This approach bypasses complex table structure reconstruction and renders
    raw OCR text directly at detected positions. Images, charts, figures,
    seals, and formulas are still drawn from their saved image files, and
    their bounding boxes are passed to the text renderer as exclusion zones.

    The task id is derived from ``output_path``: a trailing ``_edited`` is
    stripped from the file stem, otherwise the parent directory name is used.
    When no task id can be derived, the legacy pipeline
    (``convert_unified_document_to_ocr_data`` + ``_generate_pdf_from_data``)
    is used instead.

    Args:
        unified_doc: UnifiedDocument from OCR processing
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file (only
            forwarded to the legacy fallback)

    Returns:
        True if successful, False otherwise (including when the document has
        no pages or any exception is raised during generation)
    """
    # Element types drawn as images; everything else is left to the text layer.
    image_element_types = {'figure', 'image', 'chart', 'seal', 'formula'}
    edited_suffix = '_edited'
    # Letter size, used only when the first page carries no dimensions.
    default_page_size = (612.0, 792.0)

    try:
        logger.info("=== Simple Text Positioning PDF Generation ===")

        # Initialize text region renderer
        text_renderer = TextRegionRenderer(
            font_name=self.font_name,
            debug=settings.simple_text_positioning_debug
        )

        # Output path is typically: result_dir/task_id_edited.pdf
        result_dir = output_path.parent
        stem = output_path.stem
        if stem.endswith(edited_suffix):
            # Strip only the trailing suffix; str.replace would also remove
            # any '_edited' occurring inside the task id itself.
            task_id = stem[:-len(edited_suffix)]
        else:
            # result_dir is typically the task_id directory
            task_id = result_dir.name

        if not task_id:
            logger.warning("Could not determine task_id, falling back to legacy method")
            ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
            return self._generate_pdf_from_data(
                ocr_data=ocr_data,
                output_path=output_path,
                source_file_path=source_file_path
            )

        logger.info(f"Task ID: {task_id}, Result dir: {result_dir}")

        pages = unified_doc.pages
        if not pages:
            logger.error("No pages in document")
            return False
        total_pages = len(pages)

        # First page size initializes the canvas and is the fallback for
        # later pages that carry no dimensions of their own.
        first_page_size = self._simple_pdf_page_size(pages[0], None)
        if first_page_size is None:
            first_page_size = default_page_size
            page_width, page_height = first_page_size
            logger.warning(f"No page dimensions found, using default {page_width}x{page_height}")
        page_width, page_height = first_page_size
        logger.info(f"Initial page size: {page_width:.1f} x {page_height:.1f}")

        # Create PDF canvas
        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

        for page_num, current_page in enumerate(pages, start=1):
            logger.info(f">>> Processing page {page_num}/{total_pages}")

            current_width, current_height = self._simple_pdf_page_size(
                current_page, first_page_size
            )

            if page_num > 1:
                pdf_canvas.showPage()
            pdf_canvas.setPageSize((current_width, current_height))

            # === Layer 1: Render images, charts, figures, seals, formulas ===
            exclusion_zones = self._simple_pdf_render_images(
                pdf_canvas, current_page, current_height, result_dir, image_element_types
            )
            if exclusion_zones:
                logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")

            # === Layer 2: Render text from raw OCR regions ===
            raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)
            if not raw_regions:
                logger.warning(f"No raw OCR regions found for page {page_num}")
            else:
                logger.info(f"Loaded {len(raw_regions)} raw OCR regions for page {page_num}")

                # Collect texts inside exclusion zones for position-aware deduplication
                # (prevents duplicate axis labels from being rendered near charts)
                zone_texts = None
                if exclusion_zones:
                    zone_texts = text_renderer.collect_zone_texts(
                        raw_regions, exclusion_zones, threshold=0.5, include_axis_labels=True
                    )
                    if zone_texts:
                        logger.info(f"Collected {len(zone_texts)} zone texts for deduplication: {list(zone_texts)[:10]}...")

                # Scale factors are 1.0: the page size is taken from the same
                # dimensions the OCR coordinates refer to.
                rendered = text_renderer.render_all_regions(
                    pdf_canvas=pdf_canvas,
                    regions=raw_regions,
                    page_height=current_height,
                    scale_x=1.0,
                    scale_y=1.0,
                    exclusion_zones=exclusion_zones,
                    zone_texts=zone_texts
                )
                logger.info(f"Rendered {rendered} text regions")

            logger.info(f"<<< Page {page_num} complete")

        # Save PDF
        pdf_canvas.save()

        file_size = output_path.stat().st_size
        logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
        return True

    except Exception as e:
        # logger.exception keeps the traceback in the application log instead
        # of printing it to stderr.
        logger.exception(f"Failed to generate simple text PDF: {e}")
        return False

def _simple_pdf_page_size(self, page, fallback):
    """Return ``(width, height)`` as floats from ``page.dimensions``, or ``fallback`` when absent/empty."""
    dimensions = getattr(page, 'dimensions', None)
    if dimensions:
        return float(dimensions.width), float(dimensions.height)
    return fallback

def _simple_pdf_element_bbox(self, elem):
    """Return ``(x0, y0, x1, y1)`` for an element (object or dict bbox), or None if the bbox format is unknown."""
    bbox = elem.bbox if hasattr(elem, 'bbox') else elem.get('bbox', {})
    if hasattr(bbox, 'x0'):
        return bbox.x0, bbox.y0, bbox.x1, bbox.y1
    if isinstance(bbox, dict):
        x0 = bbox.get('x0', 0)
        y0 = bbox.get('y0', 0)
        # Dict bboxes may carry width/height instead of x1/y1.
        x1 = bbox.get('x1', x0 + bbox.get('width', 0))
        y1 = bbox.get('y1', y0 + bbox.get('height', 0))
        return x0, y0, x1, y1
    return None

def _simple_pdf_find_image(self, result_dir, saved_path):
    """Return the first existing candidate location of ``saved_path`` under ``result_dir``, or None."""
    candidates = (
        result_dir / saved_path,                # path as stored
        result_dir / 'imgs' / saved_path,       # imgs subdirectory
        result_dir / Path(saved_path).name,     # bare filename
    )
    return next((candidate for candidate in candidates if candidate.exists()), None)

def _simple_pdf_render_images(self, pdf_canvas, page, page_height, result_dir, image_element_types):
    """
    Draw the image-type elements of one page and return their exclusion zones.

    Every image-type element with a usable bbox contributes an
    ``(x0, y0, x1, y1)`` zone in original (top-left origin) coordinates, even
    when its image file is missing or fails to draw.
    """
    exclusion_zones = []
    rendered_count = 0
    page_elements = page.elements if hasattr(page, 'elements') else []

    for elem in page_elements:
        elem_type = elem.type if hasattr(elem, 'type') else elem.get('type', '')
        # Handle enum type
        if hasattr(elem_type, 'value'):
            elem_type = elem_type.value
        if elem_type not in image_element_types:
            continue

        content = elem.content if hasattr(elem, 'content') else elem.get('content', {})
        if isinstance(content, dict):
            saved_path = content.get('saved_path') or content.get('path')
        else:
            saved_path = None

        corners = self._simple_pdf_element_bbox(elem)
        if corners is None:
            continue
        x0, y0, x1, y1 = corners
        exclusion_zones.append((x0, y0, x1, y1))

        if not saved_path:
            continue

        image_path = self._simple_pdf_find_image(result_dir, saved_path)
        if image_path is None:
            logger.warning(f"Image file not found: {saved_path}")
            continue

        try:
            # Flip Y for PDF: the image is anchored at its bottom-left corner.
            pdf_x = x0
            pdf_y = page_height - y1
            pdf_canvas.drawImage(
                str(image_path),
                pdf_x, pdf_y,
                width=x1 - x0,
                height=y1 - y0,
                preserveAspectRatio=True,
                mask='auto'
            )
            rendered_count += 1
            logger.debug(f"Rendered {elem_type}: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
        except Exception as e:
            # Best effort: one broken image must not abort the whole page.
            logger.warning(f"Failed to render {elem_type} {saved_path}: {e}")

    if rendered_count > 0:
        logger.info(f"Rendered {rendered_count} image elements (figures/charts/seals/formulas)")
    return exclusion_zones
def _generate_pdf_from_data(
self,
ocr_data: Dict,
@@ -1093,8 +1361,15 @@ class PDFGeneratorService:
logger.info("No page_dimensions found, using first page size for all pages")
# Step 3: Get original file dimensions for all pages
# For OCR track, we use OCR coordinate system dimensions directly to avoid scaling issues
original_page_sizes = {}
if source_file_path:
use_ocr_dimensions_for_pdf = (self.current_processing_track == 'ocr')
if use_ocr_dimensions_for_pdf:
# OCR Track: Use OCR coordinate system dimensions directly
# This ensures no scaling is needed (scale = 1.0)
logger.info(f"OCR Track: 使用 OCR 座標系尺寸作為 PDF 頁面尺寸(避免縮放)")
elif source_file_path:
original_page_sizes = self.get_all_page_sizes(source_file_path)
if original_page_sizes:
logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
@@ -1104,8 +1379,12 @@ class PDFGeneratorService:
logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸")
# Determine initial canvas size (will be updated per page)
# Priority: original file first page > OCR/UnifiedDocument first page
if 0 in original_page_sizes:
# Priority for OCR track: OCR dimensions (no scaling)
# Priority for Direct track: original file first page > OCR/UnifiedDocument first page
if use_ocr_dimensions_for_pdf:
target_width, target_height = ocr_width, ocr_height
logger.info(f"初始 PDF 尺寸OCR Track, 使用 OCR 座標系): {target_width:.1f} x {target_height:.1f}")
elif 0 in original_page_sizes:
target_width, target_height = original_page_sizes[0]
logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}")
else:
@@ -1159,14 +1438,49 @@ class PDFGeneratorService:
# Create PDF canvas with initial page size (will be updated per page)
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
# LAYERED RENDERING: Exclude tables from regions_to_avoid
# Text inside tables will be rendered at raw OCR positions (via GapFillingService)
# while table borders are drawn separately using cell_boxes
# Only avoid overlap with actual images/figures/charts
regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
table_count = len([img for img in images_metadata if img.get('type') == 'table'])
# Smart filtering: only include tables with good cell_boxes quality in regions_to_avoid
# Tables with bad cell_boxes will use raw OCR text positioning instead
# Exception: Rebuilt tables always use HTML content and filter text
regions_to_avoid = []
good_quality_tables = []
bad_quality_tables = []
rebuilt_tables = []
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")
for img in images_metadata:
if img.get('type') == 'table':
elem_id = img.get('element_id', 'unknown')
# Check if this table was rebuilt - rebuilt tables have good content
was_rebuilt = img.get('was_rebuilt', False)
if was_rebuilt:
# Rebuilt tables have accurate content - filter text, use HTML
regions_to_avoid.append(img)
rebuilt_tables.append(elem_id)
else:
# Check cell_boxes quality for non-rebuilt tables
cell_boxes = img.get('cell_boxes', [])
quality = self._check_cell_boxes_quality(cell_boxes, elem_id)
if quality == 'good':
# Good quality: filter text, render with cell_boxes
regions_to_avoid.append(img)
good_quality_tables.append(elem_id)
else:
# Bad quality: don't filter text, just draw border
bad_quality_tables.append(elem_id)
img['_use_border_only'] = True # Mark for border-only rendering
else:
# Non-table elements (images, figures, charts) always avoid
regions_to_avoid.append(img)
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免")
if rebuilt_tables:
logger.info(f" 重建表格用 HTML: {rebuilt_tables}")
if good_quality_tables:
logger.info(f" 表格用 cell_boxes: {good_quality_tables}")
if bad_quality_tables:
logger.info(f" 表格用 raw OCR text (border only): {bad_quality_tables}")
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
@@ -1178,10 +1492,24 @@ class PDFGeneratorService:
pages_data[page_num] = []
pages_data[page_num].append(region)
# Get table elements from layout_data
# Get table elements from layout_data and copy _use_border_only flags
table_elements = []
if layout_data and layout_data.get('elements'):
table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
# Create a lookup for _use_border_only flags from images_metadata
border_only_tables = {img.get('element_id') for img in images_metadata
if img.get('type') == 'table' and img.get('_use_border_only')}
logger.debug(f"[DEBUG] border_only_tables from images_metadata: {border_only_tables}")
for e in layout_data['elements']:
if e.get('type') == 'table':
elem_id = e.get('element_id')
logger.debug(f"[DEBUG] layout_data table element_id: {elem_id}")
# Copy the flag if this table should use border only
if elem_id in border_only_tables:
e['_use_border_only'] = True
logger.info(f"[DEBUG] Set _use_border_only=True for table {elem_id}")
table_elements.append(e)
# Process each page
total_pages = ocr_data.get('total_pages', 1)
@@ -1195,14 +1523,23 @@ class PDFGeneratorService:
logger.info(f">>> 處理第 {page_num}/{total_pages}")
# Get current page dimensions with priority order:
# 1. Original file dimensions (highest priority)
# 2. OCR/UnifiedDocument dimensions
# 3. Fallback to first page dimensions
# For OCR Track: always use OCR dimensions (scale = 1.0)
# For Direct Track:
# 1. Original file dimensions (highest priority)
# 2. OCR/UnifiedDocument dimensions
# 3. Fallback to first page dimensions
page_idx = page_num - 1
dimension_source = "unknown"
# Priority 1: Original file dimensions
if page_idx in original_page_sizes:
# For OCR Track: always use OCR dimensions
if use_ocr_dimensions_for_pdf and page_idx in page_dimensions:
current_page_dims = page_dimensions[page_idx]
current_target_w = float(current_page_dims['width'])
current_target_h = float(current_page_dims['height'])
dimension_source = "ocr_track_direct"
# Priority 1: Original file dimensions (Direct Track only)
elif page_idx in original_page_sizes:
current_target_w, current_target_h = original_page_sizes[page_idx]
dimension_source = "original_file"
@@ -1774,12 +2111,26 @@ class PDFGeneratorService:
non_empty_lines = [l for l in lines if l.strip()]
num_lines = max(len(non_empty_lines), 1)
# Font size = bbox_height / num_lines * factor
# Font size calculation with stabilization
# Use 0.8 factor to leave room for line spacing
font_size = (bbox_height / num_lines) * 0.8
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
raw_font_size = (bbox_height / num_lines) * 0.8
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
# Stabilize font size for body text (most common case)
# Normal body text should be 9-11pt, only deviate for clear outliers
element_type = region.get('element_type', 'text')
if element_type in ('text', 'paragraph'):
# For body text, bias toward 10pt baseline
if 7 <= raw_font_size <= 14:
# Near-normal range: use weighted average toward 10pt
font_size = raw_font_size * 0.7 + 10 * 0.3
else:
# Clear outlier: use raw but clamp more aggressively
font_size = max(min(raw_font_size, 14), 7)
else:
# For titles/headers/etc, use raw calculation with wider range
font_size = max(min(raw_font_size, 72), 4)
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, raw={raw_font_size:.1f}, final={font_size:.1f}")
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
# CRITICAL: Y-axis flip!
@@ -2008,24 +2359,45 @@ class PDFGeneratorService:
result_dir: Directory containing result files (for embedded images)
"""
try:
elem_id = table_element.get('element_id', 'unknown')
use_border_only = table_element.get('_use_border_only', False)
logger.info(f"[DEBUG] draw_table_region: elem_id={elem_id}, _use_border_only={use_border_only}")
html_content = table_element.get('content', '')
if not html_content:
# Even without HTML, draw border if requested
if use_border_only:
self._draw_table_border_only(pdf_canvas, table_element, page_height, scale_w, scale_h)
return
# Try to use cell_boxes for direct rendering first (more accurate)
# Apply column correction if enabled
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
return # Successfully rendered with cell_boxes
if (settings.table_column_correction_enabled and
TABLE_COLUMN_CORRECTOR_AVAILABLE and
cell_boxes):
try:
corrector = TableColumnCorrector(
correction_threshold=settings.table_column_correction_threshold,
vertical_merge_enabled=settings.vertical_fragment_merge_enabled,
vertical_aspect_ratio=settings.vertical_fragment_aspect_ratio
)
# Get table bbox for vertical fragment detection
table_bbox = table_element.get('bbox', [])
if isinstance(table_bbox, dict):
table_bbox = [table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']]
logger.info("[TABLE] Falling back to ReportLab Table")
corrected_html, stats = corrector.correct(
html=html_content,
cell_boxes=cell_boxes,
table_bbox=table_bbox if isinstance(table_bbox, list) and len(table_bbox) >= 4 else None
)
if stats.get('column_corrections', 0) > 0:
logger.info(f"[TABLE] {elem_id}: Column correction applied - {stats}")
html_content = corrected_html
except Exception as e:
logger.warning(f"[TABLE] {elem_id}: Column correction failed: {e}, using original HTML")
# Fallback: Parse HTML to extract table structure and use ReportLab Table
# Parse HTML first to get table structure for grid validation
parser = HTMLTableParser()
parser.feed(html_content)
@@ -2040,6 +2412,83 @@ class PDFGeneratorService:
if not rows:
return
# Calculate number of rows and columns from HTML for grid validation
num_rows = len(rows)
max_cols = 0
for row in rows:
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
max_cols = max(max_cols, row_cols)
# Check if table was rebuilt - if so, use HTML content directly
was_rebuilt = table_element.get('was_rebuilt', False)
cell_boxes_rendered = False # Track if we rendered borders with cell_boxes
if was_rebuilt:
logger.info(f"[TABLE] {elem_id}: Table was rebuilt, using HTML content directly")
elif use_border_only:
# Bad quality cell_boxes: skip cell_boxes rendering, use ReportLab Table with borders
logger.info(f"[TABLE] {elem_id}: Bad cell_boxes quality, using ReportLab Table with borders")
else:
# Check if cell_boxes can produce a valid grid before rendering borders
cell_boxes = table_element.get('cell_boxes', [])
if cell_boxes:
# Get table bbox for grid calculation
temp_bbox = table_element.get('bbox', [])
if isinstance(temp_bbox, dict):
raw_bbox = [temp_bbox['x0'], temp_bbox['y0'], temp_bbox['x1'], temp_bbox['y1']]
elif isinstance(temp_bbox, list) and len(temp_bbox) >= 4:
if isinstance(temp_bbox[0], (int, float)):
raw_bbox = temp_bbox[:4]
else:
raw_bbox = [temp_bbox[0][0], temp_bbox[0][1], temp_bbox[2][0], temp_bbox[2][1]]
else:
raw_bbox = None
# Pre-check: can we compute a valid grid from cell_boxes?
if raw_bbox:
test_col_widths, test_row_heights = self._compute_table_grid_from_cell_boxes(
cell_boxes, raw_bbox, num_rows, max_cols
)
grid_valid = test_col_widths is not None and test_row_heights is not None
if grid_valid:
logger.info(f"[TABLE] Grid validation passed, rendering borders with cell_boxes")
success = self._draw_table_with_cell_boxes(
pdf_canvas, table_element, page_height,
scale_w, scale_h, result_dir
)
if success:
cell_boxes_rendered = True
logger.info("[TABLE] cell_boxes rendered borders, continuing with text-only ReportLab Table")
else:
logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
else:
# Grid mismatch: try cellboxes-first rendering if enabled
if settings.table_rendering_prefer_cellboxes:
logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
renderer = TableRenderer(TableRenderConfig())
success = renderer.render_from_cellboxes_grid(
pdf_canvas,
cell_boxes,
html_content,
tuple(raw_bbox),
page_height,
scale_w,
scale_h,
row_threshold=settings.table_cellboxes_row_threshold,
col_threshold=settings.table_cellboxes_col_threshold
)
if success:
logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
return # Table fully rendered, exit early
else:
logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
else:
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
else:
logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")
# Get bbox directly from table element
table_bbox = table_element.get('bbox')
@@ -2106,15 +2555,7 @@ class PDFGeneratorService:
pdf_y = page_height - ocr_y_bottom
# Build table data for ReportLab with proper colspan/rowspan handling
# First pass: determine the actual grid size by accounting for spans
num_rows = len(rows)
# Calculate actual number of columns by checking first row's total span
max_cols = 0
for row in rows:
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
max_cols = max(max_cols, row_cols)
# num_rows and max_cols already calculated above for grid validation
logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
# Create a grid to track occupied cells (for rowspan handling)
@@ -2223,16 +2664,25 @@ class PDFGeneratorService:
logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")
# Apply table style
style = TableStyle([
# If cell_boxes rendered borders, skip GRID style (text-only rendering)
style_commands = [
('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
('LEFTPADDING', (0, 0), (-1, -1), 2),
('RIGHTPADDING', (0, 0), (-1, -1), 2),
('TOPPADDING', (0, 0), (-1, -1), 2),
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
])
]
# Only add GRID if cell_boxes didn't render borders
if not cell_boxes_rendered:
style_commands.insert(1, ('GRID', (0, 0), (-1, -1), 0.5, colors.black))
logger.info("[TABLE] Adding GRID style (cell_boxes not used)")
else:
logger.info("[TABLE] Skipping GRID style (cell_boxes rendered borders)")
style = TableStyle(style_commands)
# Add header style if first row has headers
if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
@@ -2435,6 +2885,106 @@ class PDFGeneratorService:
logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
return normalized_boxes
def _draw_table_border_only(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Draw only the outer border of a table (for tables with bad cell_boxes quality).

    Nothing else is drawn here; no cell lines and no text.

    Supported ``bbox`` formats:
        - dict with ``x0``/``y0``/``x1``/``y1`` keys
        - flat list ``[x0, y0, x1, y1]``
        - polygon ``[[x, y], ...]`` with at least four points, using point 0
          as top-left and point 2 as bottom-right (same convention as the
          grid validation in ``draw_table_region``)

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict (reads ``bbox`` and ``element_id``)
        page_height: Height of page in PDF coordinates
        scale_w: Scale factor for X coordinates
        scale_h: Scale factor for Y coordinates
    """
    table_bbox = table_element.get('bbox', [])
    if not table_bbox or len(table_bbox) < 4:
        return

    element_id = table_element.get('element_id', 'unknown')

    # Handle different bbox formats
    if isinstance(table_bbox, dict):
        x0, y0, x1, y1 = table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']
    elif isinstance(table_bbox[0], (int, float)):
        x0, y0, x1, y1 = table_bbox[0], table_bbox[1], table_bbox[2], table_bbox[3]
    elif (isinstance(table_bbox[0], (list, tuple)) and len(table_bbox[0]) >= 2 and
            isinstance(table_bbox[2], (list, tuple)) and len(table_bbox[2]) >= 2):
        # Polygon format: opposite corners are points 0 and 2.
        x0, y0 = table_bbox[0][0], table_bbox[0][1]
        x1, y1 = table_bbox[2][0], table_bbox[2][1]
    else:
        logger.debug(f"[TABLE] {element_id}: unsupported bbox format, border not drawn")
        return

    # Apply scaling
    pdf_x0 = x0 * scale_w
    pdf_y0 = y0 * scale_h
    pdf_x1 = x1 * scale_w
    pdf_y1 = y1 * scale_h

    # Convert to PDF coordinates (flip Y): rect() is anchored at the
    # bottom-left corner, which is the table's bottom edge after the flip.
    pdf_bottom = page_height - pdf_y1
    width = pdf_x1 - pdf_x0
    height = pdf_y1 - pdf_y0

    # Draw outer border only
    pdf_canvas.setStrokeColor(colors.black)
    pdf_canvas.setLineWidth(0.5)
    pdf_canvas.rect(pdf_x0, pdf_bottom, width, height, stroke=1, fill=0)

    logger.info(f"[TABLE] {element_id}: Drew border only (bad cell_boxes quality)")
def _check_cell_boxes_quality(self, cell_boxes: List, element_id: str = "") -> str:
    """
    Classify a table's cell_boxes as usable ('good') or not ('bad').

    The verdict is based on how many pairs of boxes overlap: the number of
    overlapping pairs is divided by the number of all pairs, and a ratio
    above 20% is treated as an unreliable grid. Entries that are not a
    list/tuple of at least four values never count as overlapping but are
    still included in the pair total.

    Args:
        cell_boxes: List of cell bounding boxes, each read as [x0, y0, x1, y1]
        element_id: Optional element ID, used only in log messages

    Returns:
        'good' when the quality check is disabled in settings or the overlap
        ratio is at most 20%; 'bad' when there are fewer than two boxes or
        the ratio exceeds 20%
    """
    # With the check switched off every table is accepted as-is.
    if not settings.table_quality_check_enabled:
        logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
        return 'good'

    box_total = len(cell_boxes) if cell_boxes else 0
    if box_total < 2:
        logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})")
        return 'bad'

    def _usable(candidate):
        # Only list/tuple boxes with four or more coordinates can be compared.
        return isinstance(candidate, (list, tuple)) and len(candidate) >= 4

    # Visit each unordered pair once: every box against the boxes after it.
    colliding_pairs = 0
    for position, first in enumerate(cell_boxes):
        if not _usable(first):
            continue
        for second in cell_boxes[position + 1:]:
            if not _usable(second):
                continue
            spans_meet_x = first[0] < second[2] and first[2] > second[0]
            spans_meet_y = first[1] < second[3] and first[3] > second[1]
            if spans_meet_x and spans_meet_y:
                colliding_pairs += 1

    pair_total = box_total * (box_total - 1) // 2
    if pair_total > 0:
        overlap_ratio = colliding_pairs / pair_total
    else:
        overlap_ratio = 0

    # Threshold is 20%: slight overlaps from cell detection are tolerated.
    if overlap_ratio > 0.20:
        logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
        return 'bad'

    logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}")
    return 'good'
def _draw_table_with_cell_boxes(
self,
pdf_canvas: canvas.Canvas,
@@ -2465,39 +3015,64 @@ class PDFGeneratorService:
"""
try:
cell_boxes = table_element.get('cell_boxes', [])
# Always draw outer table border first (fallback for incomplete cell_boxes)
table_bbox = table_element.get('bbox', [])
if table_bbox and len(table_bbox) >= 4:
# Handle different bbox formats (list or dict)
if isinstance(table_bbox, dict):
tx1 = float(table_bbox.get('x0', 0))
ty1 = float(table_bbox.get('y0', 0))
tx2 = float(table_bbox.get('x1', 0))
ty2 = float(table_bbox.get('y1', 0))
else:
tx1, ty1, tx2, ty2 = table_bbox[:4]
# Apply scaling
tx1_scaled = tx1 * scale_w
ty1_scaled = ty1 * scale_h
tx2_scaled = tx2 * scale_w
ty2_scaled = ty2 * scale_h
# Check cell_boxes quality - skip if they don't form a proper grid
if cell_boxes and len(cell_boxes) > 2:
# Count overlapping cell pairs
overlap_count = 0
for i, box1 in enumerate(cell_boxes):
for j, box2 in enumerate(cell_boxes):
if i >= j:
continue
x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
if x_overlap and y_overlap:
overlap_count += 1
table_width = tx2_scaled - tx1_scaled
table_height = ty2_scaled - ty1_scaled
# If more than 25% of cell pairs overlap, cell_boxes are unreliable
# Increased from 10% to 25% to allow more tables to use cell_boxes rendering
# which provides better visual fidelity than ReportLab Table fallback
total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = tx1_scaled
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
# Draw outer table border (slightly thicker for visibility)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
if overlap_ratio > 0.25:
logger.warning(
f"[TABLE] Skipping cell_boxes rendering: {overlap_count}/{total_pairs} "
f"({overlap_ratio:.1%}) cell pairs overlap - using ReportLab Table fallback"
)
return False # Return False to trigger ReportLab Table fallback
if not cell_boxes:
# Fallback: draw outer border only when no cell_boxes
if table_bbox and len(table_bbox) >= 4:
# Handle different bbox formats (list or dict)
if isinstance(table_bbox, dict):
tx1 = float(table_bbox.get('x0', 0))
ty1 = float(table_bbox.get('y0', 0))
tx2 = float(table_bbox.get('x1', 0))
ty2 = float(table_bbox.get('y1', 0))
else:
tx1, ty1, tx2, ty2 = table_bbox[:4]
# Apply scaling
tx1_scaled = tx1 * scale_w
ty1_scaled = ty1 * scale_h
tx2_scaled = tx2 * scale_w
ty2_scaled = ty2 * scale_h
table_width = tx2_scaled - tx1_scaled
table_height = ty2_scaled - ty1_scaled
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = tx1_scaled
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
# Draw outer table border (slightly thicker for visibility)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(1.0)
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
# Still draw embedded images even without cell borders
embedded_images = table_element.get('embedded_images', [])
@@ -2511,31 +3086,47 @@ class PDFGeneratorService:
# Normalize cell boxes to create aligned grid
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cells using grid lines (avoiding duplicates)")
# Collect unique grid lines to avoid drawing duplicate/overlapping lines
h_lines = set() # Horizontal lines: (y, x_start, x_end)
v_lines = set() # Vertical lines: (x, y_start, y_end)
# Draw each cell border
for box in cell_boxes:
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
# Apply scaling
x1_scaled = x1 * scale_w
y1_scaled = y1 * scale_h
x2_scaled = x2 * scale_w
y2_scaled = y2 * scale_h
x1_s = x1 * scale_w
y1_s = y1 * scale_h
x2_s = x2 * scale_w
y2_s = y2 * scale_h
cell_width = x2_scaled - x1_scaled
cell_height = y2_scaled - y1_scaled
# Round to 1 decimal place to help with deduplication
x1_s, y1_s, x2_s, y2_s = round(x1_s, 1), round(y1_s, 1), round(x2_s, 1), round(y2_s, 1)
# Transform Y coordinate (PDF uses bottom-left origin)
pdf_x = x1_scaled
pdf_y = page_height - y2_scaled # Bottom of cell in PDF coords
# Add horizontal lines (top and bottom of cell)
h_lines.add((y1_s, x1_s, x2_s)) # Top line
h_lines.add((y2_s, x1_s, x2_s)) # Bottom line
# Draw cell border only (no fill, no text)
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(0.5)
pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
# Add vertical lines (left and right of cell)
v_lines.add((x1_s, y1_s, y2_s)) # Left line
v_lines.add((x2_s, y1_s, y2_s)) # Right line
logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
# Draw unique horizontal lines
pdf_canvas.setStrokeColor(colors.black)
pdf_canvas.setLineWidth(0.5)
for y, x_start, x_end in h_lines:
pdf_y = page_height - y # Transform Y coordinate
pdf_canvas.line(x_start, pdf_y, x_end, pdf_y)
# Draw unique vertical lines
for x, y_start, y_end in v_lines:
pdf_y_start = page_height - y_start
pdf_y_end = page_height - y_end
pdf_canvas.line(x, pdf_y_start, x, pdf_y_end)
logger.info(f"[TABLE] Drew {len(h_lines)} horizontal + {len(v_lines)} vertical grid lines")
# Draw embedded images
embedded_images = table_element.get('embedded_images', [])