feat: unify Direct Track PDF rendering and simplify export options

Backend changes:
- Apply background image + invisible text layer to all Direct Track PDFs
- Add CHART to regions_to_avoid for text extraction
- Improve visual fidelity for native PDFs and Office documents

Frontend changes:
- Remove JSON, UnifiedDocument, Markdown download buttons
- Simplify to 2-column layout with only Layout PDF and Reflow PDF
- Remove the translation JSON download and the Layout PDF option for translated documents
- Keep only Reflow PDF for translated document downloads
- Clean up unused imports (FileJson, Database, FileOutput)

Archives two OpenSpec proposals:
- unify-direct-track-pdf-rendering
- simplify-frontend-export-options

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-12 07:50:43 +08:00
parent 53bfa88773
commit 24253ac15e
15 changed files with 891 additions and 195 deletions

View File

@@ -2920,6 +2920,7 @@ class DirectExtractionEngine:
1. Are mostly solid black or white
2. Are within page boundaries
3. Actually overlap with text content (IoU check)
4. Are rendered AFTER the text they overlap (z-order check)
Args:
page: PyMuPDF page object
@@ -2939,6 +2940,22 @@ class DirectExtractionEngine:
if not image_list:
return covering_images
# Get rendering order (z-order) using get_bboxlog()
# Items rendered later (higher index) appear on top
bboxlog = page.get_bboxlog()
# Build a map of bbox -> sequence number for images and text
# This helps determine if an image is rendered before or after text
image_seqnos = {} # bbox tuple -> seqno
text_seqnos = {} # bbox tuple -> seqno
for seqno, (action_type, bbox) in enumerate(bboxlog):
bbox_tuple = tuple(fitz.Rect(bbox))
if "image" in action_type:
image_seqnos[bbox_tuple] = seqno
elif "text" in action_type:
text_seqnos[bbox_tuple] = seqno
# Get all text words for coverage check
words = page.get_text("words") # (x0, y0, x1, y1, word, block_no, line_no, word_no)
@@ -3005,8 +3022,23 @@ class DirectExtractionEngine:
# Clip image rect to page boundaries
clipped_rect = img_rect & page_rect
# Get image's rendering sequence number
img_bbox_tuple = tuple(clipped_rect)
img_seqno = image_seqnos.get(img_bbox_tuple, -1)
# If there is no exact bbox match, fall back to the first logged image bbox that intersects this image's clipped rect (not necessarily the closest one)
if img_seqno == -1:
for bbox_tuple, seqno in image_seqnos.items():
if fitz.Rect(bbox_tuple).intersects(clipped_rect):
# Use the matching seqno
img_seqno = seqno
break
# Check if image actually covers any text (IoU check)
# AND is rendered AFTER the text (z-order check)
covered_text_count = 0
is_background_image = False
for word_info in words:
word_rect = fitz.Rect(word_info[:4])
word_area = word_rect.width * word_rect.height
@@ -3017,13 +3049,35 @@ class DirectExtractionEngine:
if not intersection.is_empty:
intersection_area = intersection.width * intersection.height
coverage_ratio = intersection_area / word_area
# Count as covered if >= 50% of word is under the image
if coverage_ratio >= 0.5:
covered_text_count += 1
# Z-order check: Find the text's rendering sequence
text_seqno = -1
for bbox_tuple, seqno in text_seqnos.items():
text_bbox = fitz.Rect(bbox_tuple)
if text_bbox.intersects(word_rect):
text_seqno = seqno
break
# Only count as covered if image is rendered AFTER text
# If image is rendered BEFORE text, it's a background
if img_seqno > text_seqno and text_seqno >= 0:
covered_text_count += 1
elif img_seqno < text_seqno and img_seqno >= 0:
# Image is rendered before text = background
is_background_image = True
# Skip this image only if it was flagged as a background image AND no word was counted as covered by it
if is_background_image and covered_text_count == 0:
logger.debug(f"Page {page_num}: Skipping background image xref={xref} "
f"(rendered before text, seqno={img_seqno})")
continue
# Report if image covers text OR is pure solid black/white
# Pure solid fills are likely redaction/placeholder boxes
if covered_text_count > 0 or is_pure_solid:
# But skip if it's a background image (rendered before text)
if covered_text_count > 0 or (is_pure_solid and not is_background_image):
covering_images.append({
'xref': xref, # Include xref for filtering
'bbox': tuple(clipped_rect),
@@ -3031,7 +3085,9 @@ class DirectExtractionEngine:
'avg_color': (avg_r, avg_g, avg_b),
'size': (width, height),
'covered_text_count': covered_text_count,
'is_pure_solid': is_pure_solid
'is_pure_solid': is_pure_solid,
'is_background': is_background_image,
'render_seqno': img_seqno
})
except Exception as e: