chore: backup before code cleanup
Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,23 @@ from html.parser import HTMLParser
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
# Import table column corrector for column alignment fix
|
||||
try:
|
||||
from app.services.table_column_corrector import TableColumnCorrector
|
||||
TABLE_COLUMN_CORRECTOR_AVAILABLE = True
|
||||
except ImportError:
|
||||
TABLE_COLUMN_CORRECTOR_AVAILABLE = False
|
||||
TableColumnCorrector = None
|
||||
|
||||
# Import text region renderer for simple text positioning
|
||||
try:
|
||||
from app.services.text_region_renderer import TextRegionRenderer, load_raw_ocr_regions
|
||||
TEXT_REGION_RENDERER_AVAILABLE = True
|
||||
except ImportError:
|
||||
TEXT_REGION_RENDERER_AVAILABLE = False
|
||||
TextRegionRenderer = None
|
||||
load_raw_ocr_regions = None
|
||||
|
||||
# Import UnifiedDocument for dual-track support
|
||||
try:
|
||||
from app.models.unified_document import (
|
||||
@@ -596,7 +613,8 @@ class PDFGeneratorService:
|
||||
'content': html_content,
|
||||
'bbox': [element.bbox.x0, element.bbox.y0,
|
||||
element.bbox.x1, element.bbox.y1],
|
||||
'page': page_num - 1 # layout uses 0-based
|
||||
'page': page_num - 1, # layout uses 0-based
|
||||
'element_id': element.element_id # For _use_border_only matching
|
||||
}
|
||||
|
||||
# Preserve cell_boxes and embedded_images from metadata
|
||||
@@ -607,18 +625,29 @@ class PDFGeneratorService:
|
||||
table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
|
||||
if 'embedded_images' in element.metadata:
|
||||
table_element['embedded_images'] = element.metadata['embedded_images']
|
||||
# Pass through rebuild flag - rebuilt tables should use HTML content
|
||||
if element.metadata.get('was_rebuilt'):
|
||||
table_element['was_rebuilt'] = True
|
||||
logger.debug(f"Table {element.element_id}: marked as rebuilt")
|
||||
|
||||
layout_elements.append(table_element)
|
||||
|
||||
# Add bbox to images_metadata for text overlap filtering
|
||||
# (no actual image file, just bbox for filtering)
|
||||
images_metadata.append({
|
||||
img_metadata = {
|
||||
'image_path': None, # No fake table image
|
||||
'bbox': bbox_polygon,
|
||||
'page': page_num - 1, # 0-based for images_metadata
|
||||
'type': 'table',
|
||||
'element_id': element.element_id
|
||||
})
|
||||
}
|
||||
# Also copy cell_boxes for quality checking
|
||||
if element.metadata and 'cell_boxes' in element.metadata:
|
||||
img_metadata['cell_boxes'] = element.metadata['cell_boxes']
|
||||
# Mark if table was rebuilt
|
||||
if element.metadata and element.metadata.get('was_rebuilt'):
|
||||
img_metadata['was_rebuilt'] = True
|
||||
images_metadata.append(img_metadata)
|
||||
|
||||
# Handle image/visual elements (including stamps/seals)
|
||||
elif element.is_visual or element.type in [
|
||||
@@ -1022,15 +1051,25 @@ class PDFGeneratorService:
|
||||
# Set current track
|
||||
self.current_processing_track = 'ocr'
|
||||
|
||||
# Convert UnifiedDocument to OCR data format (legacy)
|
||||
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
|
||||
# Check if simple text positioning mode is enabled
|
||||
if (settings.simple_text_positioning_enabled and
|
||||
TEXT_REGION_RENDERER_AVAILABLE):
|
||||
logger.info("Using simple text positioning mode")
|
||||
result = self._generate_simple_text_pdf(
|
||||
unified_doc=unified_doc,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path
|
||||
)
|
||||
else:
|
||||
# Convert UnifiedDocument to OCR data format (legacy)
|
||||
ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
|
||||
|
||||
# Use existing generation pipeline
|
||||
result = self._generate_pdf_from_data(
|
||||
ocr_data=ocr_data,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path
|
||||
)
|
||||
# Use existing generation pipeline
|
||||
result = self._generate_pdf_from_data(
|
||||
ocr_data=ocr_data,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path
|
||||
)
|
||||
|
||||
# Reset track
|
||||
self.current_processing_track = None
|
||||
@@ -1043,6 +1082,235 @@ class PDFGeneratorService:
|
||||
self.current_processing_track = None
|
||||
return False
|
||||
|
||||
def _generate_simple_text_pdf(
    self,
    unified_doc: 'UnifiedDocument',
    output_path: Path,
    source_file_path: Optional[Path] = None
) -> bool:
    """
    Generate a PDF by drawing raw OCR text regions at their detected positions.

    Each page is rendered in two layers:

    1. Image-like elements (figure, image, chart, seal, formula) are drawn
       from their saved image files; their bounding boxes are collected as
       exclusion zones.
    2. Raw OCR regions loaded via ``load_raw_ocr_regions`` are rendered by
       ``TextRegionRenderer``, which receives the exclusion zones.

    The task id used to locate the raw OCR regions is derived from
    ``output_path``: the file stem minus a trailing ``_edited`` suffix, or
    otherwise the name of the parent directory. When no task id can be
    derived, the legacy pipeline (``convert_unified_document_to_ocr_data`` +
    ``_generate_pdf_from_data``) is used instead.

    Args:
        unified_doc: UnifiedDocument from OCR processing
        output_path: Path to save generated PDF
        source_file_path: Optional path to original source file (only
            forwarded to the legacy fallback)

    Returns:
        True if successful, False otherwise (including when the document
        has no pages or any exception is raised during generation)
    """
    try:
        logger.info("=== Simple Text Positioning PDF Generation ===")

        # Initialize text region renderer
        text_renderer = TextRegionRenderer(
            font_name=self.font_name,
            debug=settings.simple_text_positioning_debug
        )

        # Result files (raw OCR regions, images) live next to the output PDF
        result_dir = output_path.parent

        # Output path is typically: result_dir/task_id_edited.pdf
        # Strip only the trailing suffix; str.replace() would also remove
        # any '_edited' occurring inside the task id itself.
        edited_suffix = '_edited'
        stem = output_path.stem
        if stem.endswith(edited_suffix):
            task_id = stem[:-len(edited_suffix)]
        elif result_dir.name:
            # result_dir is typically the task_id directory
            task_id = result_dir.name
        else:
            task_id = None

        if not task_id:
            logger.warning("Could not determine task_id, falling back to legacy method")
            ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
            return self._generate_pdf_from_data(
                ocr_data=ocr_data,
                output_path=output_path,
                source_file_path=source_file_path
            )

        logger.info(f"Task ID: {task_id}, Result dir: {result_dir}")

        pages = unified_doc.pages
        if not pages:
            logger.error("No pages in document")
            return False
        total_pages = len(pages)

        # First page dimensions initialize the canvas and act as the
        # fallback for any later page without its own dimensions.
        first_page_size = self._simple_text_page_size(pages[0], None)
        if first_page_size is None:
            page_width = 612.0   # Letter width
            page_height = 792.0  # Letter height
            logger.warning(f"No page dimensions found, using default {page_width}x{page_height}")
        else:
            page_width, page_height = first_page_size

        logger.info(f"Initial page size: {page_width:.1f} x {page_height:.1f}")

        # Create PDF canvas
        pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))

        for page_num, page in enumerate(pages, start=1):
            logger.info(f">>> Processing page {page_num}/{total_pages}")

            current_width, current_height = self._simple_text_page_size(
                page, (page_width, page_height)
            )

            if page_num > 1:
                pdf_canvas.showPage()

            # Set page size
            pdf_canvas.setPageSize((current_width, current_height))

            # === Layer 1: Render images, charts, figures, seals, formulas ===
            exclusion_zones = self._render_simple_text_images(
                pdf_canvas, page, current_height, result_dir
            )
            if exclusion_zones:
                logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")

            # === Layer 2: Render text from raw OCR regions ===
            raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)

            if not raw_regions:
                logger.warning(f"No raw OCR regions found for page {page_num}")
            else:
                logger.info(f"Loaded {len(raw_regions)} raw OCR regions for page {page_num}")

                # Collect texts inside exclusion zones for position-aware deduplication
                # This prevents duplicate axis labels from being rendered near charts
                zone_texts = None
                if exclusion_zones:
                    zone_texts = text_renderer.collect_zone_texts(
                        raw_regions, exclusion_zones, threshold=0.5, include_axis_labels=True
                    )
                    if zone_texts:
                        logger.info(f"Collected {len(zone_texts)} zone texts for deduplication: {list(zone_texts)[:10]}...")

                # Render all text regions, avoiding exclusion zones (images/charts)
                # Scale factors are 1.0 since OCR dimensions match page dimensions
                rendered = text_renderer.render_all_regions(
                    pdf_canvas=pdf_canvas,
                    regions=raw_regions,
                    page_height=current_height,
                    scale_x=1.0,
                    scale_y=1.0,
                    exclusion_zones=exclusion_zones,
                    zone_texts=zone_texts
                )

                logger.info(f"Rendered {rendered} text regions")

            logger.info(f"<<< Page {page_num} complete")

        # Save PDF
        pdf_canvas.save()

        file_size = output_path.stat().st_size
        logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
        return True

    except Exception as e:
        # Top-level boundary: report failure to the caller as False, but
        # keep the traceback in the log instead of printing it to stderr.
        logger.exception(f"Failed to generate simple text PDF: {e}")
        return False

def _simple_text_page_size(self, page, fallback):
    """
    Return ``(width, height)`` as floats from ``page.dimensions``.

    ``fallback`` is returned unchanged when the page has no ``dimensions``
    attribute or the attribute is falsy.
    """
    dimensions = getattr(page, 'dimensions', None)
    if dimensions:
        return float(dimensions.width), float(dimensions.height)
    return fallback

def _simple_text_element_bbox(self, elem):
    """
    Return ``(x0, y0, x1, y1)`` for an element, or None if unrecognized.

    Accepts an element (object or dict) whose ``bbox`` is either an object
    exposing ``x0/y0/x1/y1`` attributes or a dict with ``x0``/``y0`` and
    either ``x1``/``y1`` or ``width``/``height`` keys (missing keys count
    as 0).
    """
    bbox = elem.bbox if hasattr(elem, 'bbox') else elem.get('bbox', {})
    if hasattr(bbox, 'x0'):
        return bbox.x0, bbox.y0, bbox.x1, bbox.y1
    if isinstance(bbox, dict):
        x0 = bbox.get('x0', 0)
        y0 = bbox.get('y0', 0)
        x1 = bbox.get('x1', x0 + bbox.get('width', 0))
        y1 = bbox.get('y1', y0 + bbox.get('height', 0))
        return x0, y0, x1, y1
    return None

def _render_simple_text_images(self, pdf_canvas, page, page_height, result_dir):
    """
    Draw the image-like elements of one page and return their exclusion zones.

    Args:
        pdf_canvas: Canvas to draw on
        page: Page whose ``elements`` are scanned
        page_height: Height of the current page, used to flip the Y axis
        result_dir: Directory searched for the saved image files

    Returns:
        List of ``(x0, y0, x1, y1)`` tuples in the original (unflipped)
        coordinates, one per image-like element with a recognized bbox,
        whether or not its image file was found or drawn.
    """
    # Types that should be rendered as images
    image_element_types = {'figure', 'image', 'chart', 'seal', 'formula'}

    exclusion_zones = []  # List of (x0, y0, x1, y1) tuples
    image_elements_rendered = 0

    page_elements = page.elements if hasattr(page, 'elements') else []
    for elem in page_elements:
        elem_type = elem.type if hasattr(elem, 'type') else elem.get('type', '')
        # Handle enum type
        if hasattr(elem_type, 'value'):
            elem_type = elem_type.value

        if elem_type not in image_element_types:
            continue

        # Get image path from element content
        content = elem.content if hasattr(elem, 'content') else elem.get('content', {})
        if isinstance(content, dict):
            saved_path = content.get('saved_path') or content.get('path')
        else:
            saved_path = None

        zone = self._simple_text_element_bbox(elem)
        if zone is None:
            continue
        x0, y0, x1, y1 = zone

        # Recorded even if the image file is missing, so text still avoids
        # the area. Uses original image coordinates (not PDF flipped).
        exclusion_zones.append(zone)

        if not saved_path:
            continue

        # Look for the file as given, then under imgs/, then by bare filename
        candidates = (
            result_dir / saved_path,
            result_dir / 'imgs' / saved_path,
            result_dir / Path(saved_path).name,
        )
        image_path = next((c for c in candidates if c.exists()), None)
        if image_path is None:
            logger.warning(f"Image file not found: {saved_path}")
            continue

        try:
            # Convert coordinates (flip Y for PDF)
            pdf_x = x0
            pdf_y = page_height - y1  # Bottom of image in PDF coords

            pdf_canvas.drawImage(
                str(image_path),
                pdf_x, pdf_y,
                width=x1 - x0,
                height=y1 - y0,
                preserveAspectRatio=True,
                mask='auto'
            )
            image_elements_rendered += 1
            logger.debug(f"Rendered {elem_type}: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
        except Exception as e:
            # Best effort: one unreadable image must not abort the page
            logger.warning(f"Failed to render {elem_type} {saved_path}: {e}")

    if image_elements_rendered > 0:
        logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)")

    return exclusion_zones
|
||||
|
||||
def _generate_pdf_from_data(
|
||||
self,
|
||||
ocr_data: Dict,
|
||||
@@ -1093,8 +1361,15 @@ class PDFGeneratorService:
|
||||
logger.info("No page_dimensions found, using first page size for all pages")
|
||||
|
||||
# Step 3: Get original file dimensions for all pages
|
||||
# For OCR track, we use OCR coordinate system dimensions directly to avoid scaling issues
|
||||
original_page_sizes = {}
|
||||
if source_file_path:
|
||||
use_ocr_dimensions_for_pdf = (self.current_processing_track == 'ocr')
|
||||
|
||||
if use_ocr_dimensions_for_pdf:
|
||||
# OCR Track: Use OCR coordinate system dimensions directly
|
||||
# This ensures no scaling is needed (scale = 1.0)
|
||||
logger.info(f"OCR Track: 使用 OCR 座標系尺寸作為 PDF 頁面尺寸(避免縮放)")
|
||||
elif source_file_path:
|
||||
original_page_sizes = self.get_all_page_sizes(source_file_path)
|
||||
if original_page_sizes:
|
||||
logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
|
||||
@@ -1104,8 +1379,12 @@ class PDFGeneratorService:
|
||||
logger.info(f"無原始文件,將使用 OCR/UnifiedDocument 尺寸")
|
||||
|
||||
# Determine initial canvas size (will be updated per page)
|
||||
# Priority: original file first page > OCR/UnifiedDocument first page
|
||||
if 0 in original_page_sizes:
|
||||
# Priority for OCR track: OCR dimensions (no scaling)
|
||||
# Priority for Direct track: original file first page > OCR/UnifiedDocument first page
|
||||
if use_ocr_dimensions_for_pdf:
|
||||
target_width, target_height = ocr_width, ocr_height
|
||||
logger.info(f"初始 PDF 尺寸(OCR Track, 使用 OCR 座標系): {target_width:.1f} x {target_height:.1f}")
|
||||
elif 0 in original_page_sizes:
|
||||
target_width, target_height = original_page_sizes[0]
|
||||
logger.info(f"初始 PDF 尺寸(來自原始文件首頁): {target_width:.1f} x {target_height:.1f}")
|
||||
else:
|
||||
@@ -1159,14 +1438,49 @@ class PDFGeneratorService:
|
||||
# Create PDF canvas with initial page size (will be updated per page)
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))
|
||||
|
||||
# LAYERED RENDERING: Exclude tables from regions_to_avoid
|
||||
# Text inside tables will be rendered at raw OCR positions (via GapFillingService)
|
||||
# while table borders are drawn separately using cell_boxes
|
||||
# Only avoid overlap with actual images/figures/charts
|
||||
regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
|
||||
table_count = len([img for img in images_metadata if img.get('type') == 'table'])
|
||||
# Smart filtering: only include tables with good cell_boxes quality in regions_to_avoid
|
||||
# Tables with bad cell_boxes will use raw OCR text positioning instead
|
||||
# Exception: Rebuilt tables always use HTML content and filter text
|
||||
regions_to_avoid = []
|
||||
good_quality_tables = []
|
||||
bad_quality_tables = []
|
||||
rebuilt_tables = []
|
||||
|
||||
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")
|
||||
for img in images_metadata:
|
||||
if img.get('type') == 'table':
|
||||
elem_id = img.get('element_id', 'unknown')
|
||||
|
||||
# Check if this table was rebuilt - rebuilt tables have good content
|
||||
was_rebuilt = img.get('was_rebuilt', False)
|
||||
|
||||
if was_rebuilt:
|
||||
# Rebuilt tables have accurate content - filter text, use HTML
|
||||
regions_to_avoid.append(img)
|
||||
rebuilt_tables.append(elem_id)
|
||||
else:
|
||||
# Check cell_boxes quality for non-rebuilt tables
|
||||
cell_boxes = img.get('cell_boxes', [])
|
||||
quality = self._check_cell_boxes_quality(cell_boxes, elem_id)
|
||||
|
||||
if quality == 'good':
|
||||
# Good quality: filter text, render with cell_boxes
|
||||
regions_to_avoid.append(img)
|
||||
good_quality_tables.append(elem_id)
|
||||
else:
|
||||
# Bad quality: don't filter text, just draw border
|
||||
bad_quality_tables.append(elem_id)
|
||||
img['_use_border_only'] = True # Mark for border-only rendering
|
||||
else:
|
||||
# Non-table elements (images, figures, charts) always avoid
|
||||
regions_to_avoid.append(img)
|
||||
|
||||
logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免")
|
||||
if rebuilt_tables:
|
||||
logger.info(f" 重建表格用 HTML: {rebuilt_tables}")
|
||||
if good_quality_tables:
|
||||
logger.info(f" 表格用 cell_boxes: {good_quality_tables}")
|
||||
if bad_quality_tables:
|
||||
logger.info(f" 表格用 raw OCR text (border only): {bad_quality_tables}")
|
||||
|
||||
filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)
|
||||
|
||||
@@ -1178,10 +1492,24 @@ class PDFGeneratorService:
|
||||
pages_data[page_num] = []
|
||||
pages_data[page_num].append(region)
|
||||
|
||||
# Get table elements from layout_data
|
||||
# Get table elements from layout_data and copy _use_border_only flags
|
||||
table_elements = []
|
||||
if layout_data and layout_data.get('elements'):
|
||||
table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
|
||||
# Create a lookup for _use_border_only flags from images_metadata
|
||||
border_only_tables = {img.get('element_id') for img in images_metadata
|
||||
if img.get('type') == 'table' and img.get('_use_border_only')}
|
||||
|
||||
logger.debug(f"[DEBUG] border_only_tables from images_metadata: {border_only_tables}")
|
||||
|
||||
for e in layout_data['elements']:
|
||||
if e.get('type') == 'table':
|
||||
elem_id = e.get('element_id')
|
||||
logger.debug(f"[DEBUG] layout_data table element_id: {elem_id}")
|
||||
# Copy the flag if this table should use border only
|
||||
if elem_id in border_only_tables:
|
||||
e['_use_border_only'] = True
|
||||
logger.info(f"[DEBUG] Set _use_border_only=True for table {elem_id}")
|
||||
table_elements.append(e)
|
||||
|
||||
# Process each page
|
||||
total_pages = ocr_data.get('total_pages', 1)
|
||||
@@ -1195,14 +1523,23 @@ class PDFGeneratorService:
|
||||
logger.info(f">>> 處理第 {page_num}/{total_pages} 頁")
|
||||
|
||||
# Get current page dimensions with priority order:
|
||||
# 1. Original file dimensions (highest priority)
|
||||
# 2. OCR/UnifiedDocument dimensions
|
||||
# 3. Fallback to first page dimensions
|
||||
# For OCR Track: always use OCR dimensions (scale = 1.0)
|
||||
# For Direct Track:
|
||||
# 1. Original file dimensions (highest priority)
|
||||
# 2. OCR/UnifiedDocument dimensions
|
||||
# 3. Fallback to first page dimensions
|
||||
page_idx = page_num - 1
|
||||
dimension_source = "unknown"
|
||||
|
||||
# Priority 1: Original file dimensions
|
||||
if page_idx in original_page_sizes:
|
||||
# For OCR Track: always use OCR dimensions
|
||||
if use_ocr_dimensions_for_pdf and page_idx in page_dimensions:
|
||||
current_page_dims = page_dimensions[page_idx]
|
||||
current_target_w = float(current_page_dims['width'])
|
||||
current_target_h = float(current_page_dims['height'])
|
||||
dimension_source = "ocr_track_direct"
|
||||
|
||||
# Priority 1: Original file dimensions (Direct Track only)
|
||||
elif page_idx in original_page_sizes:
|
||||
current_target_w, current_target_h = original_page_sizes[page_idx]
|
||||
dimension_source = "original_file"
|
||||
|
||||
@@ -1774,12 +2111,26 @@ class PDFGeneratorService:
|
||||
non_empty_lines = [l for l in lines if l.strip()]
|
||||
num_lines = max(len(non_empty_lines), 1)
|
||||
|
||||
# Font size = bbox_height / num_lines * factor
|
||||
# Font size calculation with stabilization
|
||||
# Use 0.8 factor to leave room for line spacing
|
||||
font_size = (bbox_height / num_lines) * 0.8
|
||||
font_size = max(min(font_size, 72), 4) # Clamp between 4pt and 72pt
|
||||
raw_font_size = (bbox_height / num_lines) * 0.8
|
||||
|
||||
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
|
||||
# Stabilize font size for body text (most common case)
|
||||
# Normal body text should be 9-11pt, only deviate for clear outliers
|
||||
element_type = region.get('element_type', 'text')
|
||||
if element_type in ('text', 'paragraph'):
|
||||
# For body text, bias toward 10pt baseline
|
||||
if 7 <= raw_font_size <= 14:
|
||||
# Near-normal range: use weighted average toward 10pt
|
||||
font_size = raw_font_size * 0.7 + 10 * 0.3
|
||||
else:
|
||||
# Clear outlier: use raw but clamp more aggressively
|
||||
font_size = max(min(raw_font_size, 14), 7)
|
||||
else:
|
||||
# For titles/headers/etc, use raw calculation with wider range
|
||||
font_size = max(min(raw_font_size, 72), 4)
|
||||
|
||||
logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, raw={raw_font_size:.1f}, final={font_size:.1f}")
|
||||
|
||||
# Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
|
||||
# CRITICAL: Y-axis flip!
|
||||
@@ -2008,24 +2359,45 @@ class PDFGeneratorService:
|
||||
result_dir: Directory containing result files (for embedded images)
|
||||
"""
|
||||
try:
|
||||
elem_id = table_element.get('element_id', 'unknown')
|
||||
use_border_only = table_element.get('_use_border_only', False)
|
||||
logger.info(f"[DEBUG] draw_table_region: elem_id={elem_id}, _use_border_only={use_border_only}")
|
||||
|
||||
html_content = table_element.get('content', '')
|
||||
if not html_content:
|
||||
# Even without HTML, draw border if requested
|
||||
if use_border_only:
|
||||
self._draw_table_border_only(pdf_canvas, table_element, page_height, scale_w, scale_h)
|
||||
return
|
||||
|
||||
# Try to use cell_boxes for direct rendering first (more accurate)
|
||||
# Apply column correction if enabled
|
||||
cell_boxes = table_element.get('cell_boxes', [])
|
||||
if cell_boxes:
|
||||
logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
|
||||
success = self._draw_table_with_cell_boxes(
|
||||
pdf_canvas, table_element, page_height,
|
||||
scale_w, scale_h, result_dir
|
||||
)
|
||||
if success:
|
||||
return # Successfully rendered with cell_boxes
|
||||
if (settings.table_column_correction_enabled and
|
||||
TABLE_COLUMN_CORRECTOR_AVAILABLE and
|
||||
cell_boxes):
|
||||
try:
|
||||
corrector = TableColumnCorrector(
|
||||
correction_threshold=settings.table_column_correction_threshold,
|
||||
vertical_merge_enabled=settings.vertical_fragment_merge_enabled,
|
||||
vertical_aspect_ratio=settings.vertical_fragment_aspect_ratio
|
||||
)
|
||||
# Get table bbox for vertical fragment detection
|
||||
table_bbox = table_element.get('bbox', [])
|
||||
if isinstance(table_bbox, dict):
|
||||
table_bbox = [table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']]
|
||||
|
||||
logger.info("[TABLE] Falling back to ReportLab Table")
|
||||
corrected_html, stats = corrector.correct(
|
||||
html=html_content,
|
||||
cell_boxes=cell_boxes,
|
||||
table_bbox=table_bbox if isinstance(table_bbox, list) and len(table_bbox) >= 4 else None
|
||||
)
|
||||
if stats.get('column_corrections', 0) > 0:
|
||||
logger.info(f"[TABLE] {elem_id}: Column correction applied - {stats}")
|
||||
html_content = corrected_html
|
||||
except Exception as e:
|
||||
logger.warning(f"[TABLE] {elem_id}: Column correction failed: {e}, using original HTML")
|
||||
|
||||
# Fallback: Parse HTML to extract table structure and use ReportLab Table
|
||||
# Parse HTML first to get table structure for grid validation
|
||||
parser = HTMLTableParser()
|
||||
parser.feed(html_content)
|
||||
|
||||
@@ -2040,6 +2412,83 @@ class PDFGeneratorService:
|
||||
if not rows:
|
||||
return
|
||||
|
||||
# Calculate number of rows and columns from HTML for grid validation
|
||||
num_rows = len(rows)
|
||||
max_cols = 0
|
||||
for row in rows:
|
||||
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
|
||||
max_cols = max(max_cols, row_cols)
|
||||
|
||||
# Check if table was rebuilt - if so, use HTML content directly
|
||||
was_rebuilt = table_element.get('was_rebuilt', False)
|
||||
cell_boxes_rendered = False # Track if we rendered borders with cell_boxes
|
||||
|
||||
if was_rebuilt:
|
||||
logger.info(f"[TABLE] {elem_id}: Table was rebuilt, using HTML content directly")
|
||||
elif use_border_only:
|
||||
# Bad quality cell_boxes: skip cell_boxes rendering, use ReportLab Table with borders
|
||||
logger.info(f"[TABLE] {elem_id}: Bad cell_boxes quality, using ReportLab Table with borders")
|
||||
else:
|
||||
# Check if cell_boxes can produce a valid grid before rendering borders
|
||||
cell_boxes = table_element.get('cell_boxes', [])
|
||||
if cell_boxes:
|
||||
# Get table bbox for grid calculation
|
||||
temp_bbox = table_element.get('bbox', [])
|
||||
if isinstance(temp_bbox, dict):
|
||||
raw_bbox = [temp_bbox['x0'], temp_bbox['y0'], temp_bbox['x1'], temp_bbox['y1']]
|
||||
elif isinstance(temp_bbox, list) and len(temp_bbox) >= 4:
|
||||
if isinstance(temp_bbox[0], (int, float)):
|
||||
raw_bbox = temp_bbox[:4]
|
||||
else:
|
||||
raw_bbox = [temp_bbox[0][0], temp_bbox[0][1], temp_bbox[2][0], temp_bbox[2][1]]
|
||||
else:
|
||||
raw_bbox = None
|
||||
|
||||
# Pre-check: can we compute a valid grid from cell_boxes?
|
||||
if raw_bbox:
|
||||
test_col_widths, test_row_heights = self._compute_table_grid_from_cell_boxes(
|
||||
cell_boxes, raw_bbox, num_rows, max_cols
|
||||
)
|
||||
grid_valid = test_col_widths is not None and test_row_heights is not None
|
||||
|
||||
if grid_valid:
|
||||
logger.info(f"[TABLE] Grid validation passed, rendering borders with cell_boxes")
|
||||
success = self._draw_table_with_cell_boxes(
|
||||
pdf_canvas, table_element, page_height,
|
||||
scale_w, scale_h, result_dir
|
||||
)
|
||||
if success:
|
||||
cell_boxes_rendered = True
|
||||
logger.info("[TABLE] cell_boxes rendered borders, continuing with text-only ReportLab Table")
|
||||
else:
|
||||
logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
|
||||
else:
|
||||
# Grid mismatch: try cellboxes-first rendering if enabled
|
||||
if settings.table_rendering_prefer_cellboxes:
|
||||
logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
|
||||
from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
|
||||
renderer = TableRenderer(TableRenderConfig())
|
||||
success = renderer.render_from_cellboxes_grid(
|
||||
pdf_canvas,
|
||||
cell_boxes,
|
||||
html_content,
|
||||
tuple(raw_bbox),
|
||||
page_height,
|
||||
scale_w,
|
||||
scale_h,
|
||||
row_threshold=settings.table_cellboxes_row_threshold,
|
||||
col_threshold=settings.table_cellboxes_col_threshold
|
||||
)
|
||||
if success:
|
||||
logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
|
||||
return # Table fully rendered, exit early
|
||||
else:
|
||||
logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
|
||||
else:
|
||||
logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
|
||||
else:
|
||||
logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")
|
||||
|
||||
# Get bbox directly from table element
|
||||
table_bbox = table_element.get('bbox')
|
||||
|
||||
@@ -2106,15 +2555,7 @@ class PDFGeneratorService:
|
||||
pdf_y = page_height - ocr_y_bottom
|
||||
|
||||
# Build table data for ReportLab with proper colspan/rowspan handling
|
||||
# First pass: determine the actual grid size by accounting for spans
|
||||
num_rows = len(rows)
|
||||
|
||||
# Calculate actual number of columns by checking first row's total span
|
||||
max_cols = 0
|
||||
for row in rows:
|
||||
row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
|
||||
max_cols = max(max_cols, row_cols)
|
||||
|
||||
# num_rows and max_cols already calculated above for grid validation
|
||||
logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")
|
||||
|
||||
# Create a grid to track occupied cells (for rowspan handling)
|
||||
@@ -2223,16 +2664,25 @@ class PDFGeneratorService:
|
||||
logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")
|
||||
|
||||
# Apply table style
|
||||
style = TableStyle([
|
||||
# If cell_boxes rendered borders, skip GRID style (text-only rendering)
|
||||
style_commands = [
|
||||
('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
|
||||
('GRID', (0, 0), (-1, -1), 0.5, colors.black),
|
||||
('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
|
||||
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
||||
('LEFTPADDING', (0, 0), (-1, -1), 2),
|
||||
('RIGHTPADDING', (0, 0), (-1, -1), 2),
|
||||
('TOPPADDING', (0, 0), (-1, -1), 2),
|
||||
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
|
||||
])
|
||||
]
|
||||
|
||||
# Only add GRID if cell_boxes didn't render borders
|
||||
if not cell_boxes_rendered:
|
||||
style_commands.insert(1, ('GRID', (0, 0), (-1, -1), 0.5, colors.black))
|
||||
logger.info("[TABLE] Adding GRID style (cell_boxes not used)")
|
||||
else:
|
||||
logger.info("[TABLE] Skipping GRID style (cell_boxes rendered borders)")
|
||||
|
||||
style = TableStyle(style_commands)
|
||||
|
||||
# Add header style if first row has headers
|
||||
if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
|
||||
@@ -2435,6 +2885,106 @@ class PDFGeneratorService:
|
||||
logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
|
||||
return normalized_boxes
|
||||
|
||||
def _draw_table_border_only(
    self,
    pdf_canvas: canvas.Canvas,
    table_element: Dict,
    page_height: float,
    scale_w: float = 1.0,
    scale_h: float = 1.0
):
    """
    Draw only the outer border of a table as a 0.5pt black rectangle.

    The table's ``bbox`` may be given as a dict with ``x0``/``y0``/``x1``/``y1``
    keys, a flat ``[x0, y0, x1, y1]`` sequence, or a polygon of at least four
    ``[x, y]`` points (first point taken as top-left, third as bottom-right).
    Nothing is drawn when the bbox is missing, has fewer than four entries,
    or cannot be interpreted.

    Args:
        pdf_canvas: ReportLab canvas object
        table_element: Table element dict (reads 'bbox' and 'element_id')
        page_height: Height of page in PDF coordinates
        scale_w: Scale factor for X coordinates
        scale_h: Scale factor for Y coordinates
    """
    table_bbox = table_element.get('bbox', [])
    if not table_bbox or len(table_bbox) < 4:
        return

    element_id = table_element.get('element_id', 'unknown')

    # Handle different bbox formats
    try:
        if isinstance(table_bbox, dict):
            x0, y0, x1, y1 = (table_bbox[key] for key in ('x0', 'y0', 'x1', 'y1'))
        elif isinstance(table_bbox[0], (int, float)):
            x0, y0, x1, y1 = table_bbox[:4]
        else:
            # Polygon format: [[x, y], ...] with opposite corners at 0 and 2
            x0, y0 = table_bbox[0][0], table_bbox[0][1]
            x1, y1 = table_bbox[2][0], table_bbox[2][1]
    except (KeyError, IndexError, TypeError):
        logger.warning(f"[TABLE] {element_id}: Unrecognized bbox format, border not drawn")
        return

    # Apply scaling
    pdf_x0 = x0 * scale_w
    pdf_y0 = y0 * scale_h
    pdf_x1 = x1 * scale_w
    pdf_y1 = y1 * scale_h

    # Convert to PDF coordinates (flip Y): the rectangle is anchored at its
    # bottom edge, which is the larger source Y value.
    pdf_bottom = page_height - pdf_y1
    width = pdf_x1 - pdf_x0
    height = pdf_y1 - pdf_y0

    # Draw outer border only
    pdf_canvas.setStrokeColor(colors.black)
    pdf_canvas.setLineWidth(0.5)
    pdf_canvas.rect(pdf_x0, pdf_bottom, width, height, stroke=1, fill=0)

    logger.info(f"[TABLE] {element_id}: Drew border only (bad cell_boxes quality)")
|
||||
|
||||
def _check_cell_boxes_quality(self, cell_boxes: List, element_id: str = "") -> str:
    """
    Check the quality of cell_boxes to determine rendering strategy.

    The quality metric is the fraction of box pairs whose rectangles overlap
    (strict inequality, so boxes that merely share an edge do not count).
    Only well-formed boxes - lists/tuples with at least four items, read as
    ``[x0, y0, x1, y1]`` - take part in both the overlap count and the pair
    total, so malformed entries cannot dilute the ratio.

    Args:
        cell_boxes: List of cell bounding boxes
        element_id: Optional element ID for logging

    Returns:
        'good' if the quality check is disabled in settings, or if at most
        20% of the well-formed box pairs overlap; 'bad' if there are fewer
        than two (well-formed) boxes or the overlap ratio exceeds 20%.
    """
    from itertools import combinations

    # If quality check is disabled, always return 'good' to use pure PP-Structure output
    if not settings.table_quality_check_enabled:
        logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
        return 'good'

    if not cell_boxes or len(cell_boxes) < 2:
        logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(cell_boxes) if cell_boxes else 0})")
        return 'bad'  # No cell_boxes or too few

    # Validate each box once, up front, instead of once per pair
    valid_boxes = [
        box for box in cell_boxes
        if isinstance(box, (list, tuple)) and len(box) >= 4
    ]
    if len(valid_boxes) < 2:
        # Same rule as above: a grid cannot be judged from fewer than two usable boxes
        logger.debug(
            f"[TABLE QUALITY] {element_id}: bad - too few valid cells "
            f"({len(valid_boxes)} of {len(cell_boxes)})"
        )
        return 'bad'

    # Count overlapping cell pairs (each unordered pair is visited exactly once)
    overlap_count = sum(
        1
        for box1, box2 in combinations(valid_boxes, 2)
        if box1[0] < box2[2] and box1[2] > box2[0]      # x ranges intersect
        and box1[1] < box2[3] and box1[3] > box2[1]     # y ranges intersect
    )

    # len(valid_boxes) >= 2 here, so total_pairs is at least 1
    total_pairs = len(valid_boxes) * (len(valid_boxes) - 1) // 2
    overlap_ratio = overlap_count / total_pairs

    # Relaxed threshold: 20% overlap instead of 10% to allow more tables through
    # This is because PP-StructureV3's cell detection sometimes has slight overlaps
    if overlap_ratio > 0.20:
        logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
        return 'bad'

    logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(cell_boxes)} cells, overlap {overlap_ratio:.2%}")
    return 'good'
|
||||
|
||||
def _draw_table_with_cell_boxes(
|
||||
self,
|
||||
pdf_canvas: canvas.Canvas,
|
||||
@@ -2465,39 +3015,64 @@ class PDFGeneratorService:
|
||||
"""
|
||||
try:
|
||||
cell_boxes = table_element.get('cell_boxes', [])
|
||||
|
||||
# Always draw outer table border first (fallback for incomplete cell_boxes)
|
||||
table_bbox = table_element.get('bbox', [])
|
||||
if table_bbox and len(table_bbox) >= 4:
|
||||
# Handle different bbox formats (list or dict)
|
||||
if isinstance(table_bbox, dict):
|
||||
tx1 = float(table_bbox.get('x0', 0))
|
||||
ty1 = float(table_bbox.get('y0', 0))
|
||||
tx2 = float(table_bbox.get('x1', 0))
|
||||
ty2 = float(table_bbox.get('y1', 0))
|
||||
else:
|
||||
tx1, ty1, tx2, ty2 = table_bbox[:4]
|
||||
|
||||
# Apply scaling
|
||||
tx1_scaled = tx1 * scale_w
|
||||
ty1_scaled = ty1 * scale_h
|
||||
tx2_scaled = tx2 * scale_w
|
||||
ty2_scaled = ty2 * scale_h
|
||||
# Check cell_boxes quality - skip if they don't form a proper grid
|
||||
if cell_boxes and len(cell_boxes) > 2:
|
||||
# Count overlapping cell pairs
|
||||
overlap_count = 0
|
||||
for i, box1 in enumerate(cell_boxes):
|
||||
for j, box2 in enumerate(cell_boxes):
|
||||
if i >= j:
|
||||
continue
|
||||
x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
|
||||
y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
|
||||
if x_overlap and y_overlap:
|
||||
overlap_count += 1
|
||||
|
||||
table_width = tx2_scaled - tx1_scaled
|
||||
table_height = ty2_scaled - ty1_scaled
|
||||
# If more than 25% of cell pairs overlap, cell_boxes are unreliable
|
||||
# Increased from 10% to 25% to allow more tables to use cell_boxes rendering
|
||||
# which provides better visual fidelity than ReportLab Table fallback
|
||||
total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
|
||||
overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0
|
||||
|
||||
# Transform Y coordinate (PDF uses bottom-left origin)
|
||||
pdf_x = tx1_scaled
|
||||
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
|
||||
|
||||
# Draw outer table border (slightly thicker for visibility)
|
||||
pdf_canvas.setStrokeColor(colors.black)
|
||||
pdf_canvas.setLineWidth(1.0)
|
||||
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
|
||||
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
|
||||
if overlap_ratio > 0.25:
|
||||
logger.warning(
|
||||
f"[TABLE] Skipping cell_boxes rendering: {overlap_count}/{total_pairs} "
|
||||
f"({overlap_ratio:.1%}) cell pairs overlap - using ReportLab Table fallback"
|
||||
)
|
||||
return False # Return False to trigger ReportLab Table fallback
|
||||
|
||||
if not cell_boxes:
|
||||
# Fallback: draw outer border only when no cell_boxes
|
||||
if table_bbox and len(table_bbox) >= 4:
|
||||
# Handle different bbox formats (list or dict)
|
||||
if isinstance(table_bbox, dict):
|
||||
tx1 = float(table_bbox.get('x0', 0))
|
||||
ty1 = float(table_bbox.get('y0', 0))
|
||||
tx2 = float(table_bbox.get('x1', 0))
|
||||
ty2 = float(table_bbox.get('y1', 0))
|
||||
else:
|
||||
tx1, ty1, tx2, ty2 = table_bbox[:4]
|
||||
|
||||
# Apply scaling
|
||||
tx1_scaled = tx1 * scale_w
|
||||
ty1_scaled = ty1 * scale_h
|
||||
tx2_scaled = tx2 * scale_w
|
||||
ty2_scaled = ty2 * scale_h
|
||||
|
||||
table_width = tx2_scaled - tx1_scaled
|
||||
table_height = ty2_scaled - ty1_scaled
|
||||
|
||||
# Transform Y coordinate (PDF uses bottom-left origin)
|
||||
pdf_x = tx1_scaled
|
||||
pdf_y = page_height - ty2_scaled # Bottom of table in PDF coords
|
||||
|
||||
# Draw outer table border (slightly thicker for visibility)
|
||||
pdf_canvas.setStrokeColor(colors.black)
|
||||
pdf_canvas.setLineWidth(1.0)
|
||||
pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
|
||||
logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
|
||||
logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
|
||||
# Still draw embedded images even without cell borders
|
||||
embedded_images = table_element.get('embedded_images', [])
|
||||
@@ -2511,31 +3086,47 @@ class PDFGeneratorService:
|
||||
# Normalize cell boxes to create aligned grid
|
||||
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
|
||||
|
||||
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
|
||||
logger.info(f"[TABLE] Drawing {len(cell_boxes)} cells using grid lines (avoiding duplicates)")
|
||||
|
||||
# Collect unique grid lines to avoid drawing duplicate/overlapping lines
|
||||
h_lines = set() # Horizontal lines: (y, x_start, x_end)
|
||||
v_lines = set() # Vertical lines: (x, y_start, y_end)
|
||||
|
||||
# Draw each cell border
|
||||
for box in cell_boxes:
|
||||
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
|
||||
|
||||
# Apply scaling
|
||||
x1_scaled = x1 * scale_w
|
||||
y1_scaled = y1 * scale_h
|
||||
x2_scaled = x2 * scale_w
|
||||
y2_scaled = y2 * scale_h
|
||||
x1_s = x1 * scale_w
|
||||
y1_s = y1 * scale_h
|
||||
x2_s = x2 * scale_w
|
||||
y2_s = y2 * scale_h
|
||||
|
||||
cell_width = x2_scaled - x1_scaled
|
||||
cell_height = y2_scaled - y1_scaled
|
||||
# Round to 1 decimal place to help with deduplication
|
||||
x1_s, y1_s, x2_s, y2_s = round(x1_s, 1), round(y1_s, 1), round(x2_s, 1), round(y2_s, 1)
|
||||
|
||||
# Transform Y coordinate (PDF uses bottom-left origin)
|
||||
pdf_x = x1_scaled
|
||||
pdf_y = page_height - y2_scaled # Bottom of cell in PDF coords
|
||||
# Add horizontal lines (top and bottom of cell)
|
||||
h_lines.add((y1_s, x1_s, x2_s)) # Top line
|
||||
h_lines.add((y2_s, x1_s, x2_s)) # Bottom line
|
||||
|
||||
# Draw cell border only (no fill, no text)
|
||||
pdf_canvas.setStrokeColor(colors.black)
|
||||
pdf_canvas.setLineWidth(0.5)
|
||||
pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
|
||||
# Add vertical lines (left and right of cell)
|
||||
v_lines.add((x1_s, y1_s, y2_s)) # Left line
|
||||
v_lines.add((x2_s, y1_s, y2_s)) # Right line
|
||||
|
||||
logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
|
||||
# Draw unique horizontal lines
|
||||
pdf_canvas.setStrokeColor(colors.black)
|
||||
pdf_canvas.setLineWidth(0.5)
|
||||
|
||||
for y, x_start, x_end in h_lines:
|
||||
pdf_y = page_height - y # Transform Y coordinate
|
||||
pdf_canvas.line(x_start, pdf_y, x_end, pdf_y)
|
||||
|
||||
# Draw unique vertical lines
|
||||
for x, y_start, y_end in v_lines:
|
||||
pdf_y_start = page_height - y_start
|
||||
pdf_y_end = page_height - y_end
|
||||
pdf_canvas.line(x, pdf_y_start, x, pdf_y_end)
|
||||
|
||||
logger.info(f"[TABLE] Drew {len(h_lines)} horizontal + {len(v_lines)} vertical grid lines")
|
||||
|
||||
# Draw embedded images
|
||||
embedded_images = table_element.get('embedded_images', [])
|
||||
|
||||
Reference in New Issue
Block a user