feat: unify Direct Track PDF rendering and simplify export options
Backend changes:
- Apply background image + invisible text layer to all Direct Track PDFs
- Add CHART to regions_to_avoid for text extraction
- Improve visual fidelity for native PDFs and Office documents

Frontend changes:
- Remove JSON, UnifiedDocument, Markdown download buttons
- Simplify to 2-column layout with only Layout PDF and Reflow PDF
- Remove translation JSON download and Layout PDF option
- Keep only Reflow PDF for translated document downloads
- Clean up unused imports (FileJson, Database, FileOutput)

Archives two OpenSpec proposals:
- unify-direct-track-pdf-rendering
- simplify-frontend-export-options

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -2920,6 +2920,7 @@ class DirectExtractionEngine:
|
||||
1. Are mostly solid black or white
|
||||
2. Are within page boundaries
|
||||
3. Actually overlap with text content (IoU check)
|
||||
4. Are rendered AFTER the text they overlap (z-order check)
|
||||
|
||||
Args:
|
||||
page: PyMuPDF page object
|
||||
@@ -2939,6 +2940,22 @@ class DirectExtractionEngine:
|
||||
if not image_list:
|
||||
return covering_images
|
||||
|
||||
# Get rendering order (z-order) using get_bboxlog()
|
||||
# Items rendered later (higher index) appear on top
|
||||
bboxlog = page.get_bboxlog()
|
||||
|
||||
# Build a map of bbox -> sequence number for images and text
|
||||
# This helps determine if an image is rendered before or after text
|
||||
image_seqnos = {} # bbox tuple -> seqno
|
||||
text_seqnos = {} # bbox tuple -> seqno
|
||||
|
||||
for seqno, (action_type, bbox) in enumerate(bboxlog):
|
||||
bbox_tuple = tuple(fitz.Rect(bbox))
|
||||
if "image" in action_type:
|
||||
image_seqnos[bbox_tuple] = seqno
|
||||
elif "text" in action_type:
|
||||
text_seqnos[bbox_tuple] = seqno
|
||||
|
||||
# Get all text words for coverage check
|
||||
words = page.get_text("words") # (x0, y0, x1, y1, word, block_no, line_no, word_no)
|
||||
|
||||
@@ -3005,8 +3022,23 @@ class DirectExtractionEngine:
|
||||
# Clip image rect to page boundaries
|
||||
clipped_rect = img_rect & page_rect
|
||||
|
||||
# Get image's rendering sequence number
|
||||
img_bbox_tuple = tuple(clipped_rect)
|
||||
img_seqno = image_seqnos.get(img_bbox_tuple, -1)
|
||||
|
||||
# If we can't find exact match, try to find closest match
|
||||
if img_seqno == -1:
|
||||
for bbox_tuple, seqno in image_seqnos.items():
|
||||
if fitz.Rect(bbox_tuple).intersects(clipped_rect):
|
||||
# Use the matching seqno
|
||||
img_seqno = seqno
|
||||
break
|
||||
|
||||
# Check if image actually covers any text (IoU check)
|
||||
# AND is rendered AFTER the text (z-order check)
|
||||
covered_text_count = 0
|
||||
is_background_image = False
|
||||
|
||||
for word_info in words:
|
||||
word_rect = fitz.Rect(word_info[:4])
|
||||
word_area = word_rect.width * word_rect.height
|
||||
@@ -3017,13 +3049,35 @@ class DirectExtractionEngine:
|
||||
if not intersection.is_empty:
|
||||
intersection_area = intersection.width * intersection.height
|
||||
coverage_ratio = intersection_area / word_area
|
||||
|
||||
# Count as covered if >= 50% of word is under the image
|
||||
if coverage_ratio >= 0.5:
|
||||
covered_text_count += 1
|
||||
# Z-order check: Find the text's rendering sequence
|
||||
text_seqno = -1
|
||||
for bbox_tuple, seqno in text_seqnos.items():
|
||||
text_bbox = fitz.Rect(bbox_tuple)
|
||||
if text_bbox.intersects(word_rect):
|
||||
text_seqno = seqno
|
||||
break
|
||||
|
||||
# Only count as covered if image is rendered AFTER text
|
||||
# If image is rendered BEFORE text, it's a background
|
||||
if img_seqno > text_seqno and text_seqno >= 0:
|
||||
covered_text_count += 1
|
||||
elif img_seqno < text_seqno and img_seqno >= 0:
|
||||
# Image is rendered before text = background
|
||||
is_background_image = True
|
||||
|
||||
# Skip this image if it's detected as a background image
|
||||
if is_background_image and covered_text_count == 0:
|
||||
logger.debug(f"Page {page_num}: Skipping background image xref={xref} "
|
||||
f"(rendered before text, seqno={img_seqno})")
|
||||
continue
|
||||
|
||||
# Report if image covers text OR is pure solid black/white
|
||||
# Pure solid fills are likely redaction/placeholder boxes
|
||||
if covered_text_count > 0 or is_pure_solid:
|
||||
# But skip if it's a background image (rendered before text)
|
||||
if covered_text_count > 0 or (is_pure_solid and not is_background_image):
|
||||
covering_images.append({
|
||||
'xref': xref, # Include xref for filtering
|
||||
'bbox': tuple(clipped_rect),
|
||||
@@ -3031,7 +3085,9 @@ class DirectExtractionEngine:
|
||||
'avg_color': (avg_r, avg_g, avg_b),
|
||||
'size': (width, height),
|
||||
'covered_text_count': covered_text_count,
|
||||
'is_pure_solid': is_pure_solid
|
||||
'is_pure_solid': is_pure_solid,
|
||||
'is_background': is_background_image,
|
||||
'render_seqno': img_seqno
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user