feat: unify Direct Track PDF rendering and simplify export options
Backend changes:
- Apply background image + invisible text layer to all Direct Track PDFs
- Add CHART to regions_to_avoid for text extraction
- Improve visual fidelity for native PDFs and Office documents

Frontend changes:
- Remove JSON, UnifiedDocument, Markdown download buttons
- Simplify to 2-column layout with only Layout PDF and Reflow PDF
- Remove translation JSON download and Layout PDF option
- Keep only Reflow PDF for translated document downloads
- Clean up unused imports (FileJson, Database, FileOutput)

Archives two OpenSpec proposals:
- unify-direct-track-pdf-rendering
- simplify-frontend-export-options

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -2920,6 +2920,7 @@ class DirectExtractionEngine:
|
||||
1. Are mostly solid black or white
|
||||
2. Are within page boundaries
|
||||
3. Actually overlap with text content (IoU check)
|
||||
4. Are rendered AFTER the text they overlap (z-order check)
|
||||
|
||||
Args:
|
||||
page: PyMuPDF page object
|
||||
@@ -2939,6 +2940,22 @@ class DirectExtractionEngine:
|
||||
if not image_list:
|
||||
return covering_images
|
||||
|
||||
# Get rendering order (z-order) using get_bboxlog()
|
||||
# Items rendered later (higher index) appear on top
|
||||
bboxlog = page.get_bboxlog()
|
||||
|
||||
# Build a map of bbox -> sequence number for images and text
|
||||
# This helps determine if an image is rendered before or after text
|
||||
image_seqnos = {} # bbox tuple -> seqno
|
||||
text_seqnos = {} # bbox tuple -> seqno
|
||||
|
||||
for seqno, (action_type, bbox) in enumerate(bboxlog):
|
||||
bbox_tuple = tuple(fitz.Rect(bbox))
|
||||
if "image" in action_type:
|
||||
image_seqnos[bbox_tuple] = seqno
|
||||
elif "text" in action_type:
|
||||
text_seqnos[bbox_tuple] = seqno
|
||||
|
||||
# Get all text words for coverage check
|
||||
words = page.get_text("words") # (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
||||
|
||||
@@ -3005,8 +3022,23 @@ class DirectExtractionEngine:
|
||||
# Clip image rect to page boundaries
|
||||
clipped_rect = img_rect & page_rect
|
||||
|
||||
# Get image's rendering sequence number
|
||||
img_bbox_tuple = tuple(clipped_rect)
|
||||
img_seqno = image_seqnos.get(img_bbox_tuple, -1)
|
||||
|
||||
# If we can't find exact match, try to find closest match
|
||||
if img_seqno == -1:
|
||||
for bbox_tuple, seqno in image_seqnos.items():
|
||||
if fitz.Rect(bbox_tuple).intersects(clipped_rect):
|
||||
# Use the matching seqno
|
||||
img_seqno = seqno
|
||||
break
|
||||
|
||||
# Check if image actually covers any text (IoU check)
|
||||
# AND is rendered AFTER the text (z-order check)
|
||||
covered_text_count = 0
|
||||
is_background_image = False
|
||||
|
||||
for word_info in words:
|
||||
word_rect = fitz.Rect(word_info[:4])
|
||||
word_area = word_rect.width * word_rect.height
|
||||
@@ -3017,13 +3049,35 @@ class DirectExtractionEngine:
|
||||
if not intersection.is_empty:
|
||||
intersection_area = intersection.width * intersection.height
|
||||
coverage_ratio = intersection_area / word_area
|
||||
|
||||
# Count as covered if >= 50% of word is under the image
|
||||
if coverage_ratio >= 0.5:
|
||||
covered_text_count += 1
|
||||
# Z-order check: Find the text's rendering sequence
|
||||
text_seqno = -1
|
||||
for bbox_tuple, seqno in text_seqnos.items():
|
||||
text_bbox = fitz.Rect(bbox_tuple)
|
||||
if text_bbox.intersects(word_rect):
|
||||
text_seqno = seqno
|
||||
break
|
||||
|
||||
# Only count as covered if image is rendered AFTER text
|
||||
# If image is rendered BEFORE text, it's a background
|
||||
if img_seqno > text_seqno and text_seqno >= 0:
|
||||
covered_text_count += 1
|
||||
elif img_seqno < text_seqno and img_seqno >= 0:
|
||||
# Image is rendered before text = background
|
||||
is_background_image = True
|
||||
|
||||
# Skip this image if it's detected as a background image
|
||||
if is_background_image and covered_text_count == 0:
|
||||
logger.debug(f"Page {page_num}: Skipping background image xref={xref} "
|
||||
f"(rendered before text, seqno={img_seqno})")
|
||||
continue
|
||||
|
||||
# Report if image covers text OR is pure solid black/white
|
||||
# Pure solid fills are likely redaction/placeholder boxes
|
||||
if covered_text_count > 0 or is_pure_solid:
|
||||
# But skip if it's a background image (rendered before text)
|
||||
if covered_text_count > 0 or (is_pure_solid and not is_background_image):
|
||||
covering_images.append({
|
||||
'xref': xref, # Include xref for filtering
|
||||
'bbox': tuple(clipped_rect),
|
||||
@@ -3031,7 +3085,9 @@ class DirectExtractionEngine:
|
||||
'avg_color': (avg_r, avg_g, avg_b),
|
||||
'size': (width, height),
|
||||
'covered_text_count': covered_text_count,
|
||||
'is_pure_solid': is_pure_solid
|
||||
'is_pure_solid': is_pure_solid,
|
||||
'is_background': is_background_image,
|
||||
'render_seqno': img_seqno
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -709,7 +709,8 @@ class PDFGeneratorService:
|
||||
self,
|
||||
unified_doc: 'UnifiedDocument',
|
||||
output_path: Path,
|
||||
source_file_path: Optional[Path] = None
|
||||
source_file_path: Optional[Path] = None,
|
||||
result_dir: Optional[Path] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Generate layout-preserving PDF directly from UnifiedDocument.
|
||||
@@ -721,6 +722,7 @@ class PDFGeneratorService:
|
||||
unified_doc: UnifiedDocument object
|
||||
output_path: Path to save generated PDF
|
||||
source_file_path: Optional path to original source file
|
||||
result_dir: Optional path to result directory (for finding converted PDFs)
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
@@ -751,7 +753,8 @@ class PDFGeneratorService:
|
||||
return self._generate_direct_track_pdf(
|
||||
unified_doc=unified_doc,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path
|
||||
source_file_path=source_file_path,
|
||||
result_dir=result_dir
|
||||
)
|
||||
else:
|
||||
# OCR track: Simplified rendering (backward compatible)
|
||||
@@ -823,7 +826,8 @@ class PDFGeneratorService:
|
||||
self,
|
||||
unified_doc: 'UnifiedDocument',
|
||||
output_path: Path,
|
||||
source_file_path: Optional[Path] = None
|
||||
source_file_path: Optional[Path] = None,
|
||||
result_dir: Optional[Path] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Generate PDF with rich formatting preservation for Direct track.
|
||||
@@ -836,6 +840,7 @@ class PDFGeneratorService:
|
||||
unified_doc: UnifiedDocument from Direct extraction
|
||||
output_path: Path to save generated PDF
|
||||
source_file_path: Optional path to original source file
|
||||
result_dir: Optional path to result directory (for finding converted PDFs)
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
@@ -865,6 +870,55 @@ class PDFGeneratorService:
|
||||
from reportlab.pdfgen import canvas
|
||||
pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
|
||||
|
||||
# For ALL Direct Track documents, render source page as background image
|
||||
# This preserves visual fidelity (vector graphics, charts, complex layouts)
|
||||
# and overlays invisible text layer for searchability/translation
|
||||
use_background_rendering = (
|
||||
self.current_processing_track == ProcessingTrack.DIRECT or
|
||||
self.current_processing_track == ProcessingTrack.HYBRID
|
||||
)
|
||||
source_pdf = None
|
||||
|
||||
if use_background_rendering:
|
||||
# Find the source PDF for background rendering
|
||||
# For Office documents: source_file_path points to .pptx/.docx, need converted PDF
|
||||
# For native PDFs: source_file_path should be the PDF itself
|
||||
actual_source_pdf = None
|
||||
|
||||
# Use provided result_dir, or fall back to output_path.parent
|
||||
search_dir = result_dir if result_dir else output_path.parent
|
||||
if search_dir.exists():
|
||||
# Look for PDF files that match the pattern: {task_id}_{name}.pdf
|
||||
pdf_files = list(search_dir.glob('*.pdf'))
|
||||
# Filter out layout/output PDFs
|
||||
source_pdfs = [
|
||||
f for f in pdf_files
|
||||
if not f.name.endswith('_layout.pdf')
|
||||
and not f.name.endswith('_reflow.pdf')
|
||||
and f.name != output_path.name
|
||||
]
|
||||
if source_pdfs:
|
||||
actual_source_pdf = source_pdfs[0]
|
||||
logger.debug(f"Found converted PDF in result dir: {actual_source_pdf.name}")
|
||||
|
||||
# Fallback: use source_file_path if it's a PDF
|
||||
if not actual_source_pdf and source_file_path and source_file_path.exists():
|
||||
if source_file_path.suffix.lower() == '.pdf':
|
||||
actual_source_pdf = source_file_path
|
||||
|
||||
if actual_source_pdf and actual_source_pdf.exists():
|
||||
try:
|
||||
import fitz
|
||||
source_pdf = fitz.open(str(actual_source_pdf))
|
||||
logger.info(f"Direct Track: will render source pages as background from: {actual_source_pdf.name}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to open source PDF for background rendering: {e}")
|
||||
use_background_rendering = False
|
||||
source_pdf = None
|
||||
else:
|
||||
logger.warning(f"Direct Track: no source PDF found in {search_dir}, skipping background rendering")
|
||||
use_background_rendering = False
|
||||
|
||||
# Process each page
|
||||
for page_idx, page in enumerate(unified_doc.pages):
|
||||
logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}")
|
||||
@@ -880,6 +934,42 @@ class PDFGeneratorService:
|
||||
# Set page size for current page
|
||||
pdf_canvas.setPageSize((current_page_width, current_page_height))
|
||||
|
||||
# For Direct Track: render source page as background image
|
||||
# This preserves all visual content (vector graphics, shapes, charts)
|
||||
rendered_background = False
|
||||
if use_background_rendering and source_pdf and page_idx < len(source_pdf):
|
||||
try:
|
||||
source_page = source_pdf[page_idx]
|
||||
# Render at 2x resolution for quality
|
||||
mat = fitz.Matrix(2.0, 2.0)
|
||||
pix = source_page.get_pixmap(matrix=mat, alpha=False)
|
||||
|
||||
# Save to temporary file
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
|
||||
pix.save(tmp.name)
|
||||
temp_bg_path = tmp.name
|
||||
|
||||
# Draw background image (full page)
|
||||
from reportlab.lib.utils import ImageReader
|
||||
bg_img = ImageReader(temp_bg_path)
|
||||
pdf_canvas.drawImage(
|
||||
bg_img,
|
||||
0, 0,
|
||||
width=current_page_width,
|
||||
height=current_page_height,
|
||||
preserveAspectRatio=False
|
||||
)
|
||||
rendered_background = True
|
||||
logger.info(f" Rendered source page {page_idx + 1} as background image")
|
||||
|
||||
# Clean up temp file
|
||||
import os
|
||||
os.unlink(temp_bg_path)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to render background for page {page_idx + 1}: {e}")
|
||||
rendered_background = False
|
||||
|
||||
# Separate elements by type
|
||||
text_elements = []
|
||||
table_elements = []
|
||||
@@ -918,22 +1008,28 @@ class PDFGeneratorService:
|
||||
continue
|
||||
|
||||
image_elements.append(element)
|
||||
# Only add real images to exclusion regions, NOT charts/diagrams
|
||||
# Charts often have large bounding boxes that include text labels
|
||||
# which should be rendered as selectable text on top
|
||||
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
|
||||
# Check if this is Direct track (text from PDF text layer, not OCR)
|
||||
is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or
|
||||
self.current_processing_track == ProcessingTrack.HYBRID)
|
||||
|
||||
if is_direct:
|
||||
# Direct track: text is from PDF text layer, not OCR'd from images
|
||||
# Don't exclude any images - text should be rendered on top
|
||||
# This is critical for Office documents with background images
|
||||
# Check if this is Direct track (text from PDF text layer, not OCR)
|
||||
is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or
|
||||
self.current_processing_track == ProcessingTrack.HYBRID)
|
||||
|
||||
# For Direct Track with background rendering:
|
||||
# - CHART regions should be excluded from text layer (chart text already in background)
|
||||
# - Other images don't need exclusion (text rendered as invisible overlay)
|
||||
if is_direct:
|
||||
if element.type == ElementType.CHART:
|
||||
# Add chart to exclusion regions - chart-internal text should NOT be
|
||||
# in the invisible text layer (already visible in background image)
|
||||
regions_to_avoid.append(element)
|
||||
logger.debug(f"Direct track: excluding CHART {element.element_id} - text inside chart not needed")
|
||||
else:
|
||||
# Other image types: don't exclude, text will be invisible overlay
|
||||
logger.debug(f"Direct track: not excluding {element.element_id} from text regions")
|
||||
continue
|
||||
continue
|
||||
|
||||
# OCR track: Skip full-page background images from exclusion regions
|
||||
# OCR track: Handle image exclusion for text rendered on images
|
||||
if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]:
|
||||
# Skip full-page background images from exclusion regions
|
||||
# Smaller images that might contain OCR'd text should still be excluded
|
||||
if element.bbox:
|
||||
elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0)
|
||||
@@ -965,23 +1061,20 @@ class PDFGeneratorService:
|
||||
f"{len(table_elements)} tables, {len(image_elements)} images, "
|
||||
f"{len(list_elements)} list items")
|
||||
|
||||
# Use original element order from extraction engine
|
||||
# The extraction engine has already sorted elements by reading order,
|
||||
# handling multi-column layouts correctly (top-to-bottom, left-to-right)
|
||||
all_elements = []
|
||||
# FIX: Render in proper z-order for Office/PPT documents
|
||||
# Images (backgrounds) must be rendered FIRST, then tables, then text on top
|
||||
# This ensures white text on dark backgrounds is visible
|
||||
|
||||
# Preserve original order by iterating through page.elements
|
||||
for elem in page.elements:
|
||||
if elem in image_elements:
|
||||
all_elements.append(('image', elem))
|
||||
elif elem in table_elements:
|
||||
all_elements.append(('table', elem))
|
||||
elif elem in list_elements:
|
||||
all_elements.append(('list', elem))
|
||||
elif elem in text_elements:
|
||||
all_elements.append(('text', elem))
|
||||
# Sort images by area (largest first = background images)
|
||||
def get_element_area(elem):
|
||||
if elem.bbox:
|
||||
return (elem.bbox.x1 - elem.bbox.x0) * (elem.bbox.y1 - elem.bbox.y0)
|
||||
return 0
|
||||
|
||||
logger.debug(f"Drawing {len(all_elements)} elements in extraction order (preserves multi-column reading order)")
|
||||
sorted_images = sorted(image_elements, key=get_element_area, reverse=True)
|
||||
|
||||
logger.debug(f"Rendering order: {len(sorted_images)} images (largest first), "
|
||||
f"{len(table_elements)} tables, {len(text_elements)+len(list_elements)} text elements")
|
||||
logger.debug(f"Exclusion regions: {len(regions_to_avoid)} (tables/images/charts)")
|
||||
|
||||
# Debug: Log exclusion region types
|
||||
@@ -992,29 +1085,61 @@ class PDFGeneratorService:
|
||||
if region_types:
|
||||
logger.debug(f" Exclusion region breakdown: {region_types}")
|
||||
|
||||
# Draw elements in document order
|
||||
for elem_type, elem in all_elements:
|
||||
if elem_type == 'image':
|
||||
# Step 1: Draw images (backgrounds)
|
||||
# Skip if we already rendered the source page as background (Office documents)
|
||||
if rendered_background:
|
||||
logger.debug(f" Skipping {len(sorted_images)} individual images - background already rendered")
|
||||
else:
|
||||
# Larger images (backgrounds) are drawn first, smaller images on top
|
||||
for elem in sorted_images:
|
||||
self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent)
|
||||
elif elem_type == 'table':
|
||||
|
||||
# For Office documents with full-page background rendering:
|
||||
# - Skip tables (already visible in background image)
|
||||
# - Draw text as INVISIBLE layer (for searchability/translation, but no visual overlap)
|
||||
if rendered_background:
|
||||
logger.debug(f" Skipping {len(table_elements)} tables - already in background")
|
||||
logger.debug(f" Drawing {len(text_elements)+len(list_elements)} text elements as invisible layer")
|
||||
|
||||
# Set text rendering mode to invisible (mode 3)
|
||||
# This makes text selectable/searchable but not visible
|
||||
pdf_canvas._code.append('3 Tr') # Text render mode: invisible
|
||||
|
||||
for elem in page.elements:
|
||||
if elem in list_elements or elem in text_elements:
|
||||
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
|
||||
|
||||
# Reset text rendering mode to normal
|
||||
pdf_canvas._code.append('0 Tr') # Text render mode: fill
|
||||
else:
|
||||
# Step 2: Draw tables
|
||||
for elem in table_elements:
|
||||
self._draw_table_element_direct(pdf_canvas, elem, current_page_height)
|
||||
elif elem_type == 'list':
|
||||
# FIX: Check if list item overlaps with table/image
|
||||
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
|
||||
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
|
||||
else:
|
||||
logger.debug(f"Skipping list element {elem.element_id} inside table/image region")
|
||||
elif elem_type == 'text':
|
||||
# FIX: Check if text overlaps with table/image before drawing
|
||||
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
|
||||
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
|
||||
else:
|
||||
logger.debug(f"Skipping text element {elem.element_id} inside table/image region")
|
||||
|
||||
# Step 3: Draw text and list elements (on top of images/tables)
|
||||
# Use original document order for reading flow
|
||||
for elem in page.elements:
|
||||
if elem in list_elements:
|
||||
# Check if list item overlaps with table/image
|
||||
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
|
||||
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
|
||||
else:
|
||||
logger.debug(f"Skipping list element {elem.element_id} inside table/image region")
|
||||
elif elem in text_elements:
|
||||
# Check if text overlaps with table/image before drawing
|
||||
if not self._is_element_inside_regions(elem.bbox, regions_to_avoid):
|
||||
self._draw_text_element_direct(pdf_canvas, elem, current_page_height)
|
||||
else:
|
||||
logger.debug(f"Skipping text element {elem.element_id} inside table/image region")
|
||||
|
||||
# Save PDF
|
||||
pdf_canvas.save()
|
||||
logger.info(f"Direct track PDF saved to {output_path}")
|
||||
|
||||
# Close source PDF if opened
|
||||
if source_pdf:
|
||||
source_pdf.close()
|
||||
|
||||
# Reset track
|
||||
self.current_processing_track = None
|
||||
return True
|
||||
@@ -1023,6 +1148,12 @@ class PDFGeneratorService:
|
||||
logger.error(f"Failed to generate Direct track PDF: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# Clean up source PDF on error
|
||||
if source_pdf:
|
||||
try:
|
||||
source_pdf.close()
|
||||
except:
|
||||
pass
|
||||
self.current_processing_track = None
|
||||
return False
|
||||
|
||||
@@ -3249,7 +3380,8 @@ class PDFGeneratorService:
|
||||
return self.generate_from_unified_document(
|
||||
unified_doc=unified_doc,
|
||||
output_path=output_path,
|
||||
source_file_path=source_file_path
|
||||
source_file_path=source_file_path,
|
||||
result_dir=json_path.parent # Pass result dir for finding converted PDFs
|
||||
)
|
||||
else:
|
||||
logger.error("Failed to convert JSON to UnifiedDocument")
|
||||
@@ -3309,6 +3441,7 @@ class PDFGeneratorService:
|
||||
keywords=metadata_dict.get('keywords'),
|
||||
producer=metadata_dict.get('producer'),
|
||||
creator=metadata_dict.get('creator'),
|
||||
original_filename=metadata_dict.get('original_filename'), # For Office document detection
|
||||
creation_date=datetime.fromisoformat(metadata_dict['creation_date'].replace('Z', '+00:00')) if metadata_dict.get('creation_date') else None,
|
||||
modification_date=datetime.fromisoformat(metadata_dict['modification_date'].replace('Z', '+00:00')) if metadata_dict.get('modification_date') else None,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user