From 24253ac15e8a6a647ffc07ef0b20318769b41dcc Mon Sep 17 00:00:00 2001 From: egg Date: Fri, 12 Dec 2025 07:50:43 +0800 Subject: [PATCH] feat: unify Direct Track PDF rendering and simplify export options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend changes: - Apply background image + invisible text layer to all Direct Track PDFs - Add CHART to regions_to_avoid for text extraction - Improve visual fidelity for native PDFs and Office documents Frontend changes: - Remove JSON, UnifiedDocument, Markdown download buttons - Simplify to 2-column layout with only Layout PDF and Reflow PDF - Remove translation JSON download and Layout PDF option - Keep only Reflow PDF for translated document downloads - Clean up unused imports (FileJson, Database, FileOutput) Archives two OpenSpec proposals: - unify-direct-track-pdf-rendering - simplify-frontend-export-options 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../app/services/direct_extraction_engine.py | 62 ++++- backend/app/services/pdf_generator_service.py | 229 ++++++++++++++---- frontend/src/pages/TaskDetailPage.tsx | 111 +-------- .../proposal.md | 59 +++++ .../specs/result-export/spec.md | 24 ++ .../tasks.md | 57 +++++ .../design.md | 130 ++++++++++ .../proposal.md | 54 +++++ .../specs/document-processing/spec.md | 43 ++++ .../specs/result-export/spec.md | 36 +++ .../specs/translation/spec.md | 46 ++++ .../tasks.md | 78 ++++++ openspec/specs/document-processing/spec.md | 42 ++++ openspec/specs/result-export/spec.md | 70 +++--- openspec/specs/translation/spec.md | 45 ++++ 15 files changed, 891 insertions(+), 195 deletions(-) create mode 100644 openspec/changes/archive/2025-12-11-simplify-frontend-export-options/proposal.md create mode 100644 openspec/changes/archive/2025-12-11-simplify-frontend-export-options/specs/result-export/spec.md create mode 100644 
openspec/changes/archive/2025-12-11-simplify-frontend-export-options/tasks.md create mode 100644 openspec/changes/archive/2025-12-11-unify-direct-track-pdf-rendering/design.md create mode 100644 openspec/changes/archive/2025-12-11-unify-direct-track-pdf-rendering/proposal.md create mode 100644 openspec/changes/archive/2025-12-11-unify-direct-track-pdf-rendering/specs/document-processing/spec.md create mode 100644 openspec/changes/archive/2025-12-11-unify-direct-track-pdf-rendering/specs/result-export/spec.md create mode 100644 openspec/changes/archive/2025-12-11-unify-direct-track-pdf-rendering/specs/translation/spec.md create mode 100644 openspec/changes/archive/2025-12-11-unify-direct-track-pdf-rendering/tasks.md diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index b613014..2ebe39e 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -2920,6 +2920,7 @@ class DirectExtractionEngine: 1. Are mostly solid black or white 2. Are within page boundaries 3. Actually overlap with text content (IoU check) + 4. 
Are rendered AFTER the text they overlap (z-order check) Args: page: PyMuPDF page object @@ -2939,6 +2940,22 @@ class DirectExtractionEngine: if not image_list: return covering_images + # Get rendering order (z-order) using get_bboxlog() + # Items rendered later (higher index) appear on top + bboxlog = page.get_bboxlog() + + # Build a map of bbox -> sequence number for images and text + # This helps determine if an image is rendered before or after text + image_seqnos = {} # bbox tuple -> seqno + text_seqnos = {} # bbox tuple -> seqno + + for seqno, (action_type, bbox) in enumerate(bboxlog): + bbox_tuple = tuple(fitz.Rect(bbox)) + if "image" in action_type: + image_seqnos[bbox_tuple] = seqno + elif "text" in action_type: + text_seqnos[bbox_tuple] = seqno + # Get all text words for coverage check words = page.get_text("words") # (x0, y0, x1, y1, word, block_no, line_no, word_no) @@ -3005,8 +3022,23 @@ class DirectExtractionEngine: # Clip image rect to page boundaries clipped_rect = img_rect & page_rect + # Get image's rendering sequence number + img_bbox_tuple = tuple(clipped_rect) + img_seqno = image_seqnos.get(img_bbox_tuple, -1) + + # If we can't find exact match, try to find closest match + if img_seqno == -1: + for bbox_tuple, seqno in image_seqnos.items(): + if fitz.Rect(bbox_tuple).intersects(clipped_rect): + # Use the matching seqno + img_seqno = seqno + break + # Check if image actually covers any text (IoU check) + # AND is rendered AFTER the text (z-order check) covered_text_count = 0 + is_background_image = False + for word_info in words: word_rect = fitz.Rect(word_info[:4]) word_area = word_rect.width * word_rect.height @@ -3017,13 +3049,35 @@ class DirectExtractionEngine: if not intersection.is_empty: intersection_area = intersection.width * intersection.height coverage_ratio = intersection_area / word_area + # Count as covered if >= 50% of word is under the image if coverage_ratio >= 0.5: - covered_text_count += 1 + # Z-order check: Find the text's 
rendering sequence + text_seqno = -1 + for bbox_tuple, seqno in text_seqnos.items(): + text_bbox = fitz.Rect(bbox_tuple) + if text_bbox.intersects(word_rect): + text_seqno = seqno + break + + # Only count as covered if image is rendered AFTER text + # If image is rendered BEFORE text, it's a background + if img_seqno > text_seqno and text_seqno >= 0: + covered_text_count += 1 + elif img_seqno < text_seqno and img_seqno >= 0: + # Image is rendered before text = background + is_background_image = True + + # Skip this image if it's detected as a background image + if is_background_image and covered_text_count == 0: + logger.debug(f"Page {page_num}: Skipping background image xref={xref} " + f"(rendered before text, seqno={img_seqno})") + continue # Report if image covers text OR is pure solid black/white # Pure solid fills are likely redaction/placeholder boxes - if covered_text_count > 0 or is_pure_solid: + # But skip if it's a background image (rendered before text) + if covered_text_count > 0 or (is_pure_solid and not is_background_image): covering_images.append({ 'xref': xref, # Include xref for filtering 'bbox': tuple(clipped_rect), @@ -3031,7 +3085,9 @@ class DirectExtractionEngine: 'avg_color': (avg_r, avg_g, avg_b), 'size': (width, height), 'covered_text_count': covered_text_count, - 'is_pure_solid': is_pure_solid + 'is_pure_solid': is_pure_solid, + 'is_background': is_background_image, + 'render_seqno': img_seqno }) except Exception as e: diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index f91e548..04de55e 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -709,7 +709,8 @@ class PDFGeneratorService: self, unified_doc: 'UnifiedDocument', output_path: Path, - source_file_path: Optional[Path] = None + source_file_path: Optional[Path] = None, + result_dir: Optional[Path] = None ) -> bool: """ Generate layout-preserving PDF directly from 
UnifiedDocument. @@ -721,6 +722,7 @@ class PDFGeneratorService: unified_doc: UnifiedDocument object output_path: Path to save generated PDF source_file_path: Optional path to original source file + result_dir: Optional path to result directory (for finding converted PDFs) Returns: True if successful, False otherwise @@ -751,7 +753,8 @@ class PDFGeneratorService: return self._generate_direct_track_pdf( unified_doc=unified_doc, output_path=output_path, - source_file_path=source_file_path + source_file_path=source_file_path, + result_dir=result_dir ) else: # OCR track: Simplified rendering (backward compatible) @@ -823,7 +826,8 @@ class PDFGeneratorService: self, unified_doc: 'UnifiedDocument', output_path: Path, - source_file_path: Optional[Path] = None + source_file_path: Optional[Path] = None, + result_dir: Optional[Path] = None ) -> bool: """ Generate PDF with rich formatting preservation for Direct track. @@ -836,6 +840,7 @@ class PDFGeneratorService: unified_doc: UnifiedDocument from Direct extraction output_path: Path to save generated PDF source_file_path: Optional path to original source file + result_dir: Optional path to result directory (for finding converted PDFs) Returns: True if successful, False otherwise @@ -865,6 +870,55 @@ class PDFGeneratorService: from reportlab.pdfgen import canvas pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height)) + # For ALL Direct Track documents, render source page as background image + # This preserves visual fidelity (vector graphics, charts, complex layouts) + # and overlays invisible text layer for searchability/translation + use_background_rendering = ( + self.current_processing_track == ProcessingTrack.DIRECT or + self.current_processing_track == ProcessingTrack.HYBRID + ) + source_pdf = None + + if use_background_rendering: + # Find the source PDF for background rendering + # For Office documents: source_file_path points to .pptx/.docx, need converted PDF + # For native PDFs: 
source_file_path should be the PDF itself + actual_source_pdf = None + + # Use provided result_dir, or fall back to output_path.parent + search_dir = result_dir if result_dir else output_path.parent + if search_dir.exists(): + # Look for PDF files that match the pattern: {task_id}_{name}.pdf + pdf_files = list(search_dir.glob('*.pdf')) + # Filter out layout/output PDFs + source_pdfs = [ + f for f in pdf_files + if not f.name.endswith('_layout.pdf') + and not f.name.endswith('_reflow.pdf') + and f.name != output_path.name + ] + if source_pdfs: + actual_source_pdf = source_pdfs[0] + logger.debug(f"Found converted PDF in result dir: {actual_source_pdf.name}") + + # Fallback: use source_file_path if it's a PDF + if not actual_source_pdf and source_file_path and source_file_path.exists(): + if source_file_path.suffix.lower() == '.pdf': + actual_source_pdf = source_file_path + + if actual_source_pdf and actual_source_pdf.exists(): + try: + import fitz + source_pdf = fitz.open(str(actual_source_pdf)) + logger.info(f"Direct Track: will render source pages as background from: {actual_source_pdf.name}") + except Exception as e: + logger.warning(f"Failed to open source PDF for background rendering: {e}") + use_background_rendering = False + source_pdf = None + else: + logger.warning(f"Direct Track: no source PDF found in {search_dir}, skipping background rendering") + use_background_rendering = False + # Process each page for page_idx, page in enumerate(unified_doc.pages): logger.info(f">>> Processing page {page_idx + 1}/{len(unified_doc.pages)}") @@ -880,6 +934,42 @@ class PDFGeneratorService: # Set page size for current page pdf_canvas.setPageSize((current_page_width, current_page_height)) + # For Direct Track: render source page as background image + # This preserves all visual content (vector graphics, shapes, charts) + rendered_background = False + if use_background_rendering and source_pdf and page_idx < len(source_pdf): + try: + source_page = source_pdf[page_idx] + # 
Render at 2x resolution for quality + mat = fitz.Matrix(2.0, 2.0) + pix = source_page.get_pixmap(matrix=mat, alpha=False) + + # Save to temporary file + import tempfile + with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp: + pix.save(tmp.name) + temp_bg_path = tmp.name + + # Draw background image (full page) + from reportlab.lib.utils import ImageReader + bg_img = ImageReader(temp_bg_path) + pdf_canvas.drawImage( + bg_img, + 0, 0, + width=current_page_width, + height=current_page_height, + preserveAspectRatio=False + ) + rendered_background = True + logger.info(f" Rendered source page {page_idx + 1} as background image") + + # Clean up temp file + import os + os.unlink(temp_bg_path) + except Exception as e: + logger.warning(f"Failed to render background for page {page_idx + 1}: {e}") + rendered_background = False + # Separate elements by type text_elements = [] table_elements = [] @@ -918,22 +1008,28 @@ class PDFGeneratorService: continue image_elements.append(element) - # Only add real images to exclusion regions, NOT charts/diagrams - # Charts often have large bounding boxes that include text labels - # which should be rendered as selectable text on top - if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]: - # Check if this is Direct track (text from PDF text layer, not OCR) - is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or - self.current_processing_track == ProcessingTrack.HYBRID) - if is_direct: - # Direct track: text is from PDF text layer, not OCR'd from images - # Don't exclude any images - text should be rendered on top - # This is critical for Office documents with background images + # Check if this is Direct track (text from PDF text layer, not OCR) + is_direct = (self.current_processing_track == ProcessingTrack.DIRECT or + self.current_processing_track == ProcessingTrack.HYBRID) + + # For Direct Track with background rendering: + # - CHART regions should be excluded 
from text layer (chart text already in background) + # - Other images don't need exclusion (text rendered as invisible overlay) + if is_direct: + if element.type == ElementType.CHART: + # Add chart to exclusion regions - chart-internal text should NOT be + # in the invisible text layer (already visible in background image) + regions_to_avoid.append(element) + logger.debug(f"Direct track: excluding CHART {element.element_id} - text inside chart not needed") + else: + # Other image types: don't exclude, text will be invisible overlay logger.debug(f"Direct track: not excluding {element.element_id} from text regions") - continue + continue - # OCR track: Skip full-page background images from exclusion regions + # OCR track: Handle image exclusion for text rendered on images + if element.type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.LOGO, ElementType.STAMP]: + # Skip full-page background images from exclusion regions # Smaller images that might contain OCR'd text should still be excluded if element.bbox: elem_area = (element.bbox.x1 - element.bbox.x0) * (element.bbox.y1 - element.bbox.y0) @@ -965,23 +1061,20 @@ class PDFGeneratorService: f"{len(table_elements)} tables, {len(image_elements)} images, " f"{len(list_elements)} list items") - # Use original element order from extraction engine - # The extraction engine has already sorted elements by reading order, - # handling multi-column layouts correctly (top-to-bottom, left-to-right) - all_elements = [] + # FIX: Render in proper z-order for Office/PPT documents + # Images (backgrounds) must be rendered FIRST, then tables, then text on top + # This ensures white text on dark backgrounds is visible - # Preserve original order by iterating through page.elements - for elem in page.elements: - if elem in image_elements: - all_elements.append(('image', elem)) - elif elem in table_elements: - all_elements.append(('table', elem)) - elif elem in list_elements: - all_elements.append(('list', elem)) - elif elem in 
text_elements: - all_elements.append(('text', elem)) + # Sort images by area (largest first = background images) + def get_element_area(elem): + if elem.bbox: + return (elem.bbox.x1 - elem.bbox.x0) * (elem.bbox.y1 - elem.bbox.y0) + return 0 - logger.debug(f"Drawing {len(all_elements)} elements in extraction order (preserves multi-column reading order)") + sorted_images = sorted(image_elements, key=get_element_area, reverse=True) + + logger.debug(f"Rendering order: {len(sorted_images)} images (largest first), " + f"{len(table_elements)} tables, {len(text_elements)+len(list_elements)} text elements") logger.debug(f"Exclusion regions: {len(regions_to_avoid)} (tables/images/charts)") # Debug: Log exclusion region types @@ -992,29 +1085,61 @@ class PDFGeneratorService: if region_types: logger.debug(f" Exclusion region breakdown: {region_types}") - # Draw elements in document order - for elem_type, elem in all_elements: - if elem_type == 'image': + # Step 1: Draw images (backgrounds) + # Skip if we already rendered the source page as background (Office documents) + if rendered_background: + logger.debug(f" Skipping {len(sorted_images)} individual images - background already rendered") + else: + # Larger images (backgrounds) are drawn first, smaller images on top + for elem in sorted_images: self._draw_image_element_direct(pdf_canvas, elem, current_page_height, output_path.parent) - elif elem_type == 'table': + + # For Office documents with full-page background rendering: + # - Skip tables (already visible in background image) + # - Draw text as INVISIBLE layer (for searchability/translation, but no visual overlap) + if rendered_background: + logger.debug(f" Skipping {len(table_elements)} tables - already in background") + logger.debug(f" Drawing {len(text_elements)+len(list_elements)} text elements as invisible layer") + + # Set text rendering mode to invisible (mode 3) + # This makes text selectable/searchable but not visible + pdf_canvas._code.append('3 Tr') # Text 
render mode: invisible + + for elem in page.elements: + if elem in list_elements or elem in text_elements: + self._draw_text_element_direct(pdf_canvas, elem, current_page_height) + + # Reset text rendering mode to normal + pdf_canvas._code.append('0 Tr') # Text render mode: fill + else: + # Step 2: Draw tables + for elem in table_elements: self._draw_table_element_direct(pdf_canvas, elem, current_page_height) - elif elem_type == 'list': - # FIX: Check if list item overlaps with table/image - if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): - self._draw_text_element_direct(pdf_canvas, elem, current_page_height) - else: - logger.debug(f"Skipping list element {elem.element_id} inside table/image region") - elif elem_type == 'text': - # FIX: Check if text overlaps with table/image before drawing - if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): - self._draw_text_element_direct(pdf_canvas, elem, current_page_height) - else: - logger.debug(f"Skipping text element {elem.element_id} inside table/image region") + + # Step 3: Draw text and list elements (on top of images/tables) + # Use original document order for reading flow + for elem in page.elements: + if elem in list_elements: + # Check if list item overlaps with table/image + if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): + self._draw_text_element_direct(pdf_canvas, elem, current_page_height) + else: + logger.debug(f"Skipping list element {elem.element_id} inside table/image region") + elif elem in text_elements: + # Check if text overlaps with table/image before drawing + if not self._is_element_inside_regions(elem.bbox, regions_to_avoid): + self._draw_text_element_direct(pdf_canvas, elem, current_page_height) + else: + logger.debug(f"Skipping text element {elem.element_id} inside table/image region") # Save PDF pdf_canvas.save() logger.info(f"Direct track PDF saved to {output_path}") + # Close source PDF if opened + if source_pdf: + source_pdf.close() + 
# Reset track self.current_processing_track = None return True @@ -1023,6 +1148,12 @@ class PDFGeneratorService: logger.error(f"Failed to generate Direct track PDF: {e}") import traceback traceback.print_exc() + # Clean up source PDF on error + if source_pdf: + try: + source_pdf.close() + except: + pass self.current_processing_track = None return False @@ -3249,7 +3380,8 @@ class PDFGeneratorService: return self.generate_from_unified_document( unified_doc=unified_doc, output_path=output_path, - source_file_path=source_file_path + source_file_path=source_file_path, + result_dir=json_path.parent # Pass result dir for finding converted PDFs ) else: logger.error("Failed to convert JSON to UnifiedDocument") @@ -3309,6 +3441,7 @@ class PDFGeneratorService: keywords=metadata_dict.get('keywords'), producer=metadata_dict.get('producer'), creator=metadata_dict.get('creator'), + original_filename=metadata_dict.get('original_filename'), # For Office document detection creation_date=datetime.fromisoformat(metadata_dict['creation_date'].replace('Z', '+00:00')) if metadata_dict.get('creation_date') else None, modification_date=datetime.fromisoformat(metadata_dict['modification_date'].replace('Z', '+00:00')) if metadata_dict.get('modification_date') else None, ) diff --git a/frontend/src/pages/TaskDetailPage.tsx b/frontend/src/pages/TaskDetailPage.tsx index 639ba91..95b6fe9 100644 --- a/frontend/src/pages/TaskDetailPage.tsx +++ b/frontend/src/pages/TaskDetailPage.tsx @@ -14,7 +14,6 @@ import { AlertCircle, Clock, Layers, - FileJson, Loader2, ArrowLeft, RefreshCw, @@ -22,12 +21,10 @@ import { Table2, Image, BarChart3, - Database, Languages, Globe, CheckCircle, - Trash2, - FileOutput + Trash2 } from 'lucide-react' import type { ProcessingTrack, TranslationStatus, TranslationListItem } from '@/types/apiV2' import { Badge } from '@/components/ui/badge' @@ -224,60 +221,6 @@ export default function TaskDetailPage() { } } - const handleDownloadMarkdown = async () => { - if (!taskId) 
return - try { - await apiClientV2.downloadMarkdown(taskId) - toast({ - title: t('export.exportSuccess'), - description: 'Markdown ๅทฒไธ‹่ผ‰', - variant: 'success', - }) - } catch (error: any) { - toast({ - title: t('export.exportError'), - description: error.response?.data?.detail || t('errors.networkError'), - variant: 'destructive', - }) - } - } - - const handleDownloadJSON = async () => { - if (!taskId) return - try { - await apiClientV2.downloadJSON(taskId) - toast({ - title: t('export.exportSuccess'), - description: 'JSON ๅทฒไธ‹่ผ‰', - variant: 'success', - }) - } catch (error: any) { - toast({ - title: t('export.exportError'), - description: error.response?.data?.detail || t('errors.networkError'), - variant: 'destructive', - }) - } - } - - const handleDownloadUnified = async () => { - if (!taskId) return - try { - await apiClientV2.downloadUnified(taskId) - toast({ - title: t('export.exportSuccess'), - description: 'UnifiedDocument JSON ๅทฒไธ‹่ผ‰', - variant: 'success', - }) - } catch (error: any) { - toast({ - title: t('export.exportError'), - description: error.response?.data?.detail || t('errors.networkError'), - variant: 'destructive', - }) - } - } - const handleStartTranslation = async () => { if (!taskId || isTranslating) return @@ -319,24 +262,6 @@ export default function TaskDetailPage() { } } - const handleDownloadTranslation = async (lang: string) => { - if (!taskId) return - try { - await apiClientV2.downloadTranslation(taskId, lang) - toast({ - title: 'ไธ‹่ผ‰ๆˆๅŠŸ', - description: `็ฟป่ญฏ็ตๆžœ (${lang}) ๅทฒไธ‹่ผ‰`, - variant: 'success', - }) - } catch (error: any) { - toast({ - title: 'ไธ‹่ผ‰ๅคฑๆ•—', - description: error.response?.data?.detail || t('errors.networkError'), - variant: 'destructive', - }) - } - } - const handleDeleteTranslation = async (lang: string) => { if (!taskId) return try { @@ -542,19 +467,7 @@ export default function TaskDetailPage() { -
- - - +
-