From 1f180100409b31c168f1044d06ff07616195a378 Mon Sep 17 00:00:00 2001 From: egg Date: Fri, 12 Dec 2025 11:02:35 +0800 Subject: [PATCH] fix: OCR Track reflow PDF and translation with image text filtering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add OCR Track support for reflow PDF generation using raw_ocr_regions.json - Add OCR Track translation extraction from raw_ocr_regions instead of elements - Add raw_ocr_translations output format for OCR Track documents - Add exclusion zone filtering to remove text overlapping with images - Update API validation to accept both translations and raw_ocr_translations - Add page_number field to TranslatedItem for proper tracking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/app/routers/translate.py | 5 +- backend/app/schemas/translation.py | 1 + backend/app/services/pdf_generator_service.py | 574 +++++++++++++++--- backend/app/services/translation_service.py | 239 ++++++-- .../proposal.md | 51 ++ .../specs/result-export/spec.md | 23 + .../tasks.md | 51 ++ .../proposal.md | 70 +++ .../specs/translation/spec.md | 56 ++ .../tasks.md | 76 +++ openspec/specs/result-export/spec.md | 43 +- 11 files changed, 1040 insertions(+), 149 deletions(-) create mode 100644 openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/proposal.md create mode 100644 openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/specs/result-export/spec.md create mode 100644 openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/tasks.md create mode 100644 openspec/changes/archive/2025-12-12-fix-ocr-track-translation/proposal.md create mode 100644 openspec/changes/archive/2025-12-12-fix-ocr-track-translation/specs/translation/spec.md create mode 100644 openspec/changes/archive/2025-12-12-fix-ocr-track-translation/tasks.md diff --git a/backend/app/routers/translate.py b/backend/app/routers/translate.py index a9df2c6..dbe60d8 100644 --- a/backend/app/routers/translate.py +++ b/backend/app/routers/translate.py @@ -578,7 +578,10 @@ async def download_translated_pdf( with open(translation_file, 'r', encoding='utf-8') as f: translation_data = json.load(f) - if not translation_data.get('translations'): + # Check for translations (Direct Track) or raw_ocr_translations (OCR Track) + has_translations = translation_data.get('translations') + has_raw_ocr_translations = translation_data.get('raw_ocr_translations') + if not has_translations and not has_raw_ocr_translations: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail="Translation file is empty or incomplete" diff --git a/backend/app/schemas/translation.py b/backend/app/schemas/translation.py index 11c2902..2fd32a9 100644 --- a/backend/app/schemas/translation.py +++ b/backend/app/schemas/translation.py @@ -146,6 +146,7 @@ class TranslatedItem: original_content: str translated_content: str element_type: str + page_number: int = 1 cell_position: Optional[Tuple[int, int]] = None diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 04de55e..5ede0af 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -4701,11 +4701,242 @@ class PDFGeneratorService: logger.error(f"Failed to embed image for reflow: {e}") return None + def _collect_exclusion_zones(self, page_data: Dict) -> List[Tuple[float, float, float, float]]: + """ + Collect exclusion zones (image bboxes) from page elements. 
+ + These zones are used to filter out OCR text that overlaps with images, + preventing text inside images from appearing in reflow PDFs. + + Args: + page_data: Page dictionary containing 'elements' + + Returns: + List of (x0, y0, x1, y1) tuples representing image bounding boxes + """ + exclusion_zones = [] + + elements = page_data.get('elements', []) + for elem in elements: + elem_type = elem.get('type', '') + + # Collect image/chart bboxes + if elem_type in ('image', 'Image', 'figure', 'Figure', 'chart', 'Chart'): + bbox = elem.get('bbox', {}) + if isinstance(bbox, dict): + x0 = bbox.get('x0', 0) + y0 = bbox.get('y0', 0) + x1 = bbox.get('x1', 0) + y1 = bbox.get('y1', 0) + if x1 > x0 and y1 > y0: + exclusion_zones.append((x0, y0, x1, y1)) + + # Collect embedded images in tables + if elem_type in ('table', 'Table'): + metadata = elem.get('metadata', {}) + embedded_images = metadata.get('embedded_images', []) + for emb_img in embedded_images: + emb_bbox = emb_img.get('bbox', []) + if isinstance(emb_bbox, list) and len(emb_bbox) >= 4: + x0, y0, x1, y1 = emb_bbox[0], emb_bbox[1], emb_bbox[2], emb_bbox[3] + if x1 > x0 and y1 > y0: + exclusion_zones.append((x0, y0, x1, y1)) + + return exclusion_zones + + def _is_region_overlapping_exclusion( + self, + region_bbox: List, + exclusion_zones: List[Tuple[float, float, float, float]], + ioa_threshold: float = 0.3 + ) -> bool: + """ + Check if a text region overlaps significantly with any exclusion zone. + + Uses IoA (Intersection over Area) to determine overlap. + + Args: + region_bbox: Quadrilateral bbox [[x0,y0], [x1,y1], [x2,y2], [x3,y3]] + exclusion_zones: List of (x0, y0, x1, y1) tuples + ioa_threshold: Overlap threshold (default 0.3 = 30%) + + Returns: + True if region should be excluded + """ + if not exclusion_zones or not region_bbox: + return False + + # Convert quadrilateral to rectangular bbox + if len(region_bbox) >= 4: + xs = [p[0] for p in region_bbox] + ys = [p[1] for p in region_bbox] + tx0, ty0, tx1, ty1 = min(xs), min(ys), max(xs), max(ys) + else: + return False + + text_area = (tx1 - tx0) * (ty1 - ty0) + if text_area <= 0: + return False + + for zx0, zy0, zx1, zy1 in exclusion_zones: + # Calculate intersection + ix0 = max(tx0, zx0) + iy0 = max(ty0, zy0) + ix1 = min(tx1, zx1) + iy1 = min(ty1, zy1) + + if ix1 > ix0 and iy1 > iy0: + intersection_area = (ix1 - ix0) * (iy1 - iy0) + ioa = intersection_area / text_area + + if ioa >= ioa_threshold: + return True + + return False + + def _filter_regions_by_exclusion( + self, + regions: List[Dict], + exclusion_zones: List[Tuple[float, float, float, float]], + ioa_threshold: float = 0.3 + ) -> List[Dict]: + """ + Filter out text regions that overlap with exclusion zones (images). + + Args: + regions: List of raw OCR regions with 'text' and 'bbox' + exclusion_zones: List of (x0, y0, x1, y1) tuples + ioa_threshold: Overlap threshold + + Returns: + Filtered list of regions + """ + if not exclusion_zones: + return regions + + filtered = [] + excluded_count = 0 + + for region in regions: + bbox = region.get('bbox', []) + if self._is_region_overlapping_exclusion(bbox, exclusion_zones, ioa_threshold): + excluded_count += 1 + text = region.get('text', '')[:20] + logger.debug(f"Excluding text '{text}...' 
due to image overlap") + else: + filtered.append(region) + + if excluded_count > 0: + logger.info(f"Filtered {excluded_count} text regions overlapping with images") + + return filtered + + def _render_reflow_elements( + self, + page_data: Dict, + result_dir: Path, + styles: Dict, + story: List + ) -> None: + """ + Render page elements in reflow format (Direct Track logic). + + This method processes elements from the JSON and renders them + as flowing content (text, tables, images). + + Args: + page_data: Page dictionary containing 'elements' + result_dir: Path to result directory for images + styles: Style dictionary for paragraphs + story: List to append rendered elements to + """ + # Get elements in reading order + elements = self._get_elements_in_reading_order(page_data) + + for elem in elements: + elem_type = elem.get('type', elem.get('element_type', 'text')) + content = elem.get('content', elem.get('text', '')) + + # Types that can have dict content (handled specially) + dict_content_types = ('table', 'Table', 'image', 'figure', 'Image', 'Figure', 'chart', 'Chart') + + # Ensure content is a string for text elements + if isinstance(content, dict): + # Tables, images, charts have dict content - handled by their respective methods + if elem_type not in dict_content_types: + # Skip other elements with dict content + continue + elif not isinstance(content, str): + content = str(content) if content else '' + + if elem_type in ('table', 'Table'): + # Handle table + table = self._create_reflow_table(elem, styles) + if table: + story.append(table) + story.append(Spacer(1, 12)) + + # Handle embedded images in table (from metadata) + metadata = elem.get('metadata', {}) + embedded_images = metadata.get('embedded_images', []) + for emb_img in embedded_images: + img_path_str = emb_img.get('saved_path', '') + if img_path_str: + img_path = result_dir / img_path_str + if not img_path.exists(): + img_path = result_dir / Path(img_path_str).name + if img_path.exists(): + try: + img = PlatypusImage(str(img_path)) + # Scale to fit page width if necessary + max_width = 450 + if img.drawWidth > max_width: + ratio = max_width / img.drawWidth + img.drawWidth = max_width + img.drawHeight *= ratio + story.append(img) + story.append(Spacer(1, 8)) + logger.info(f"Embedded table image in reflow: {img_path.name}") + except Exception as e: + logger.warning(f"Failed to embed table image: {e}") + + elif elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'): + # Handle image/chart + img = self._embed_image_reflow(elem, result_dir) + if img: + story.append(img) + story.append(Spacer(1, 8)) + + elif elem_type in ('title', 'Title'): + # Title text + if content: + content = content.replace('&', '&').replace('<', '<').replace('>', '>') + story.append(Paragraph(content, styles['Title'])) + + elif elem_type in ('section_header', 'SectionHeader', 'h1', 'H1'): + # Heading 1 + if content: + content = content.replace('&', '&').replace('<', '<').replace('>', '>') + story.append(Paragraph(content, styles['Heading1'])) + + elif elem_type in ('h2', 'H2', 'Heading2'): + # Heading 2 + if content: + content = content.replace('&', '&').replace('<', '<').replace('>', '>') + story.append(Paragraph(content, styles['Heading2'])) + + else: + # Body text (default) + if content: + content = content.replace('&', '&').replace('<', '<').replace('>', '>') + story.append(Paragraph(content, styles['Body'])) + def generate_reflow_pdf( self, json_path: Path, output_path: Path, - source_file_path: Optional[Path] = None + source_file_path: 
Optional[Path] = None, + use_elements_only: bool = False ) -> bool: """ Generate reflow layout PDF from OCR/Direct JSON data. @@ -4713,10 +4944,15 @@ class PDFGeneratorService: This creates a flowing document with consistent font sizes, proper reading order, and inline tables/images. + For OCR Track: Uses raw_ocr_regions.json for text content (ensures all text is included) + For Direct Track: Uses content.cells for tables (structured data available) + Args: json_path: Path to result JSON file (UnifiedDocument format) output_path: Path to save generated PDF source_file_path: Optional path to original source file (for images) + use_elements_only: If True, always use elements from JSON (for translated PDFs + where translations are applied to elements, not raw_ocr_regions) Returns: True if successful, False otherwise @@ -4727,6 +4963,12 @@ class PDFGeneratorService: with open(json_path, 'r', encoding='utf-8') as f: json_data = json.load(f) + # Detect processing track + metadata = json_data.get('metadata', {}) + processing_track = metadata.get('processing_track', 'direct') + is_ocr_track = processing_track == 'ocr' + logger.info(f"Reflow PDF generation - Processing track: {processing_track}") + # Get styles styles = self._get_reflow_styles() @@ -4741,93 +4983,88 @@ class PDFGeneratorService: else: result_dir = json_path.parent + # Extract task_id from result_dir (directory name is the task_id) + task_id = result_dir.name + # Process each page pages = json_data.get('pages', []) for page_idx, page_data in enumerate(pages): + page_num = page_idx + 1 # 1-indexed if page_idx > 0: # Add page break between pages story.append(Spacer(1, 30)) - # Get elements in reading order - elements = self._get_elements_in_reading_order(page_data) + # === OCR Track: Use raw_ocr_regions.json for text === + # But for translated PDFs (use_elements_only=True), use elements which have translations applied + if is_ocr_track and not use_elements_only and TEXT_REGION_RENDERER_AVAILABLE and load_raw_ocr_regions: + # Load raw OCR regions for this page + raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num) - for elem in elements: - elem_type = elem.get('type', elem.get('element_type', 'text')) - content = elem.get('content', elem.get('text', '')) + if raw_regions: + logger.info(f"OCR Track reflow: Using {len(raw_regions)} raw OCR regions for page {page_num}") - # Types that can have dict content (handled specially) - dict_content_types = ('table', 'Table', 'image', 'figure', 'Image', 'Figure', 'chart', 'Chart') + # Collect exclusion zones (image bboxes) to filter text inside images + exclusion_zones = self._collect_exclusion_zones(page_data) + if exclusion_zones: + logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text filtering") + raw_regions = self._filter_regions_by_exclusion(raw_regions, exclusion_zones) - # Ensure content is a string for text elements - if isinstance(content, dict): - # Tables, images, charts have dict content - handled by their respective methods - if elem_type not in dict_content_types: - # Skip other elements with dict content - continue - elif not isinstance(content, str): - content = str(content) if content else '' + # Sort by Y coordinate (top to bottom reading order) + def get_y_coord(region): + bbox = region.get('bbox', []) + if bbox and len(bbox) >= 4: + # bbox is [[x0,y0], [x1,y1], [x2,y2], [x3,y3]] + # Get average Y of top-left and top-right corners + return (bbox[0][1] + bbox[1][1]) / 2 + return 0 - if elem_type in ('table', 'Table'): - # Handle table - table 
= self._create_reflow_table(elem, styles) - if table: - story.append(table) - story.append(Spacer(1, 12)) + sorted_regions = sorted(raw_regions, key=get_y_coord) - # Handle embedded images in table (from metadata) - metadata = elem.get('metadata', {}) - embedded_images = metadata.get('embedded_images', []) - for emb_img in embedded_images: - img_path_str = emb_img.get('saved_path', '') - if img_path_str: - img_path = result_dir / img_path_str - if not img_path.exists(): - img_path = result_dir / Path(img_path_str).name - if img_path.exists(): - try: - img = PlatypusImage(str(img_path)) - # Scale to fit page width if necessary - max_width = 450 - if img.drawWidth > max_width: - ratio = max_width / img.drawWidth - img.drawWidth = max_width - img.drawHeight *= ratio - story.append(img) - story.append(Spacer(1, 8)) - logger.info(f"Embedded table image in reflow: {img_path.name}") - except Exception as e: - logger.warning(f"Failed to embed table image: {e}") - - elif elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'): - # Handle image/chart - img = self._embed_image_reflow(elem, result_dir) - if img: - story.append(img) - story.append(Spacer(1, 8)) - - elif elem_type in ('title', 'Title'): - # Title text - if content: - content = content.replace('&', '&').replace('<', '<').replace('>', '>') - story.append(Paragraph(content, styles['Title'])) - - elif elem_type in ('section_header', 'SectionHeader', 'h1', 'H1'): - # Heading 1 - if content: - content = content.replace('&', '&').replace('<', '<').replace('>', '>') - story.append(Paragraph(content, styles['Heading1'])) - - elif elem_type in ('h2', 'H2', 'Heading2'): - # Heading 2 - if content: - content = content.replace('&', '&').replace('<', '<').replace('>', '>') - story.append(Paragraph(content, styles['Heading2'])) + # Render text blocks as paragraphs + for region in sorted_regions: + text = region.get('text', '') + if text: + text = text.replace('&', '&').replace('<', '<').replace('>', '>') + story.append(Paragraph(text, styles['Body'])) + # Also render images/charts from elements + elements = self._get_elements_in_reading_order(page_data) + for elem in elements: + elem_type = elem.get('type', elem.get('element_type', '')) + if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'): + img = self._embed_image_reflow(elem, result_dir) + if img: + story.append(img) + story.append(Spacer(1, 8)) + # Handle embedded images in tables + elif elem_type in ('table', 'Table'): + elem_metadata = elem.get('metadata', {}) + embedded_images = elem_metadata.get('embedded_images', []) + for emb_img in embedded_images: + img_path_str = emb_img.get('saved_path', '') + if img_path_str: + img_path = result_dir / img_path_str + if not img_path.exists(): + img_path = result_dir / Path(img_path_str).name + if img_path.exists(): + try: + img = PlatypusImage(str(img_path)) + max_width = 450 + if img.drawWidth > max_width: + ratio = max_width / img.drawWidth + img.drawWidth = max_width + img.drawHeight *= ratio + story.append(img) + story.append(Spacer(1, 8)) + except Exception as e: + logger.warning(f"Failed to embed table image: {e}") else: - # Body text (default) - if content: - content = content.replace('&', '&').replace('<', '<').replace('>', '>') - story.append(Paragraph(content, styles['Body'])) + # Fallback to elements if raw OCR regions not found + logger.warning(f"OCR Track: No raw OCR regions found for page {page_num}, falling back to elements") + self._render_reflow_elements(page_data, result_dir, styles, story) + else: + # 
=== Direct Track: Use structured content === + self._render_reflow_elements(page_data, result_dir, styles, story) if not story: logger.warning("No content to generate reflow PDF") @@ -4869,6 +5106,9 @@ class PDFGeneratorService: merges them to replace original content with translations, and generates a PDF with the translated content at original positions. + For OCR Track: Uses raw_ocr_translations to translate raw OCR regions + For Direct Track: Uses translations dict to translate elements + Args: result_json_path: Path to original result JSON file (UnifiedDocument format) translation_json_path: Path to translation JSON file @@ -4894,7 +5134,25 @@ class PDFGeneratorService: with open(translation_json_path, 'r', encoding='utf-8') as f: translation_json = json.load(f) - # Extract translations dict from translation JSON + # Check if this is OCR Track with raw_ocr_translations + raw_ocr_translations = translation_json.get('raw_ocr_translations', []) + processing_track = translation_json.get('processing_track', '') + target_lang = translation_json.get('target_lang', 'unknown') + + if raw_ocr_translations and processing_track == 'ocr': + # OCR Track: Generate PDF using translated raw OCR regions + logger.info( + f"Generating translated PDF (OCR Track): {len(raw_ocr_translations)} " + f"raw OCR translations, target_lang={target_lang}" + ) + return self._generate_translated_pdf_ocr_track( + result_json=result_json, + raw_ocr_translations=raw_ocr_translations, + output_path=output_path, + result_dir=result_json_path.parent + ) + + # Direct Track: Use element-based translations translations = translation_json.get('translations', {}) if not translations: logger.warning("No translations found in translation JSON") @@ -4908,9 +5166,8 @@ class PDFGeneratorService: # Apply translations to result JSON translated_doc = apply_translations(result_json, translations) - target_lang = translation_json.get('target_lang', 'unknown') logger.info( - f"Generating translated PDF: {len(translations)} translations applied, " + f"Generating translated PDF (Direct Track): {len(translations)} translations applied, " f"target_lang={target_lang}" ) @@ -4927,10 +5184,12 @@ class PDFGeneratorService: try: # Use reflow PDF generation for better translated content display # Pass result_json_path.parent as image directory (not the temp file's parent) + # use_elements_only=True ensures we use translated elements, not raw_ocr_regions success = self.generate_reflow_pdf( json_path=tmp_path, output_path=output_path, - source_file_path=result_json_path.parent # Contains extracted images + source_file_path=result_json_path.parent, # Contains extracted images + use_elements_only=True # Use elements with translations applied ) return success finally: @@ -4950,6 +5209,165 @@ class PDFGeneratorService: traceback.print_exc() return False + def _generate_translated_pdf_ocr_track( + self, + result_json: Dict, + raw_ocr_translations: List[Dict], + output_path: Path, + result_dir: Path + ) -> bool: + """ + Generate translated reflow PDF for OCR Track documents. + + Uses raw_ocr_translations to render translated text in reading order. 
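+
+        For example, a single entry (page and index identify the source region;
+        the sample pair below is taken from the change proposal):
+            {"page": 1, "index": 0,
+             "original": "华天科技(宝鸡)有限公司",
+             "translated": "Huatian Technology (Baoji) Co., Ltd."}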
+ + Args: + result_json: Original result JSON data + raw_ocr_translations: List of {page, index, original, translated} + output_path: Path to save generated PDF + result_dir: Path to result directory for images + + Returns: + True if successful, False otherwise + """ + try: + # Get styles + styles = self._get_reflow_styles() + + # Build document content + story = [] + + # Build translation lookup: {(page, index): translated_text} + translation_lookup: Dict[Tuple[int, int], str] = {} + for trans in raw_ocr_translations: + page = trans.get('page', 1) + idx = trans.get('index', 0) + translated = trans.get('translated', '') + if translated: + translation_lookup[(page, idx)] = translated + + logger.info(f"Built translation lookup with {len(translation_lookup)} entries") + + # Process each page + pages = result_json.get('pages', []) + task_id = result_dir.name + + for page_idx, page_data in enumerate(pages): + page_num = page_idx + 1 # 1-indexed + if page_idx > 0: + # Add page break between pages + story.append(Spacer(1, 30)) + + # Load raw OCR regions for this page + if TEXT_REGION_RENDERER_AVAILABLE and load_raw_ocr_regions: + raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num) + + if raw_regions: + logger.info( + f"OCR Track translated PDF: Processing {len(raw_regions)} regions " + f"for page {page_num}" + ) + + # Collect exclusion zones (image bboxes) to filter text inside images + exclusion_zones = self._collect_exclusion_zones(page_data) + + # Sort by Y coordinate (top to bottom reading order) + # Keep original indices for translation lookup + def get_y_coord(region_tuple): + region = region_tuple[1] + bbox = region.get('bbox', []) + if bbox and len(bbox) >= 4: + return (bbox[0][1] + bbox[1][1]) / 2 + return 0 + + indexed_regions = list(enumerate(raw_regions)) + sorted_regions = sorted(indexed_regions, key=get_y_coord) + + # Render translated text blocks as paragraphs (skip those overlapping images) + for original_idx, region in sorted_regions: + # Skip regions overlapping with images + bbox = region.get('bbox', []) + if exclusion_zones and self._is_region_overlapping_exclusion(bbox, exclusion_zones): + continue + # Look up translation + translated_text = translation_lookup.get( + (page_num, original_idx), + region.get('text', '') # Fallback to original + ) + + if translated_text: + # Escape HTML special chars + translated_text = (translated_text + .replace('&', '&') + .replace('<', '<') + .replace('>', '>')) + story.append(Paragraph(translated_text, styles['Body'])) + + # Also render images/charts from elements + elements = self._get_elements_in_reading_order(page_data) + for elem in elements: + elem_type = elem.get('type', elem.get('element_type', '')) + if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'): + img = self._embed_image_reflow(elem, result_dir) + if img: + story.append(img) + story.append(Spacer(1, 8)) + # Handle embedded images in tables + elif elem_type in ('table', 'Table'): + elem_metadata = elem.get('metadata', {}) + embedded_images = elem_metadata.get('embedded_images', []) + for emb_img in embedded_images: + img_path_str = emb_img.get('saved_path', '') + if img_path_str: + img_path = result_dir / img_path_str + if not img_path.exists(): + img_path = result_dir / Path(img_path_str).name + if img_path.exists(): + try: + img = PlatypusImage(str(img_path)) + max_width = 450 + if img.drawWidth > max_width: + ratio = max_width / img.drawWidth + img.drawWidth = max_width + img.drawHeight *= ratio + story.append(img) + 
story.append(Spacer(1, 8)) + except Exception as e: + logger.warning(f"Failed to embed table image: {e}") + else: + logger.warning( + f"No raw OCR regions found for page {page_num}, skipping" + ) + + if not story: + logger.warning("No content to generate translated OCR Track PDF") + return False + + # Create PDF document + doc = SimpleDocTemplate( + str(output_path), + pagesize=A4, + leftMargin=50, + rightMargin=50, + topMargin=50, + bottomMargin=50 + ) + + # Build PDF + doc.build(story) + + logger.info( + f"Generated translated OCR Track PDF: {output_path} " + f"({output_path.stat().st_size} bytes)" + ) + return True + + except Exception as e: + logger.error(f"Failed to generate translated OCR Track PDF: {e}") + import traceback + traceback.print_exc() + return False + def generate_translated_layout_pdf( self, result_json_path: Path, diff --git a/backend/app/services/translation_service.py b/backend/app/services/translation_service.py index 8d72d18..bb777e3 100644 --- a/backend/app/services/translation_service.py +++ b/backend/app/services/translation_service.py @@ -233,19 +233,118 @@ class TranslationService: self._total_tokens = 0 self._total_latency = 0.0 + def _load_raw_ocr_regions( + self, + result_dir: Path, + task_id: str, + page_num: int + ) -> List[Dict]: + """ + Load raw OCR regions for a specific page. + + Args: + result_dir: Path to result directory + task_id: Task ID + page_num: Page number (1-indexed) + + Returns: + List of raw OCR region dictionaries with 'text' and 'bbox' + """ + import glob + + # Pattern: {task_id}_*_page_{page_num}_raw_ocr_regions.json + pattern = str(result_dir / f"{task_id}_*_page_{page_num}_raw_ocr_regions.json") + matches = glob.glob(pattern) + + if not matches: + logger.warning(f"No raw OCR regions file found for page {page_num}") + return [] + + try: + with open(matches[0], 'r', encoding='utf-8') as f: + regions = json.load(f) + logger.info(f"Loaded {len(regions)} raw OCR regions from {matches[0]}") + return regions + except Exception as e: + logger.error(f"Failed to load raw OCR regions: {e}") + return [] + + def extract_translatable_elements_ocr_track( + self, + result_json: Dict, + result_dir: Path, + task_id: str + ) -> Tuple[List[TranslatableItem], int]: + """ + Extract translatable elements from raw OCR regions for OCR Track documents. + + Args: + result_json: UnifiedDocument JSON data + result_dir: Path to result directory + task_id: Task ID + + Returns: + Tuple of (list of TranslatableItem, total region count) + """ + items = [] + total_regions = 0 + + for page in result_json.get('pages', []): + page_number = page.get('page_number', 1) + + # Load raw OCR regions for this page + raw_regions = self._load_raw_ocr_regions(result_dir, task_id, page_number) + + for idx, region in enumerate(raw_regions): + total_regions += 1 + text = region.get('text', '').strip() + + if text: + # Use index as element_id for raw OCR regions + items.append(TranslatableItem( + element_id=f"raw_ocr_{page_number}_{idx}", + content=text, + element_type='raw_ocr_region', + page_number=page_number, + cell_position=(idx, 0) # Store original index in cell_position + )) + + logger.info( + f"Extracted {len(items)} translatable items from {total_regions} raw OCR regions (OCR Track)" + ) + return items, total_regions + def extract_translatable_elements( self, - result_json: Dict + result_json: Dict, + result_dir: Optional[Path] = None, + task_id: Optional[str] = None ) -> Tuple[List[TranslatableItem], int]: """ Extract all translatable elements from a result JSON. 
+ For OCR Track documents, extracts from raw_ocr_regions.json files. + For Direct Track documents, extracts from elements in result JSON. + Args: result_json: UnifiedDocument JSON data + result_dir: Path to result directory (required for OCR Track) + task_id: Task ID (required for OCR Track) Returns: Tuple of (list of TranslatableItem, total element count) """ + # Check processing track + metadata = result_json.get('metadata', {}) + processing_track = metadata.get('processing_track', 'direct') + + # For OCR Track, use raw OCR regions + if processing_track == 'ocr' and result_dir and task_id: + return self.extract_translatable_elements_ocr_track( + result_json, result_dir, task_id + ) + + # For Direct Track, use element-based extraction items = [] total_elements = 0 @@ -290,7 +389,7 @@ class TranslationService: )) logger.info( - f"Extracted {len(items)} translatable items from {total_elements} elements" + f"Extracted {len(items)} translatable items from {total_elements} elements (Direct Track)" ) return items, total_elements @@ -378,6 +477,7 @@ class TranslationService: original_content=item.content, translated_content=translated_content, element_type=item.element_type, + page_number=item.page_number, cell_position=item.cell_position )) @@ -392,6 +492,7 @@ class TranslationService: original_content=item.content, translated_content=item.content, # Keep original element_type=item.element_type, + page_number=item.page_number, cell_position=item.cell_position ) for item in batch.items @@ -429,6 +530,7 @@ class TranslationService: original_content=item.content, translated_content=response.translated_text, element_type=item.element_type, + page_number=item.page_number, cell_position=item.cell_position ) @@ -440,6 +542,7 @@ class TranslationService: original_content=item.content, translated_content=item.content, # Keep original element_type=item.element_type, + page_number=item.page_number, cell_position=item.cell_position ) @@ -451,7 +554,8 @@ class TranslationService: target_lang: str, total_elements: int, processing_time: float, - batch_count: int + batch_count: int, + processing_track: str = 'direct' ) -> Dict: """ Build the translation result JSON structure. 
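The OCR Track items above carry the raw-region index in `cell_position[0]` and the page in `page_number`; `build_translation_result` later flattens that into `raw_ocr_translations` entries, which the PDF generator keys by `(page, index)`. The following is a minimal standalone sketch of that round trip, using a simplified stand-in dataclass rather than the project's actual `TranslatableItem`/`TranslatedItem` from `backend/app/schemas/translation.py`:

```python
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple


@dataclass
class Item:
    # Simplified stand-in for TranslatableItem/TranslatedItem
    # (the real dataclasses live in backend/app/schemas/translation.py).
    element_id: str
    content: str
    translated: str
    page_number: int = 1
    cell_position: Optional[Tuple[int, int]] = None


def to_raw_ocr_translations(items: List[Item]) -> List[Dict]:
    # Mirrors the OCR Track branch of build_translation_result:
    # page comes from page_number, index from cell_position[0].
    return [
        {
            "page": it.page_number,
            "index": it.cell_position[0] if it.cell_position else 0,
            "original": it.content,
            "translated": it.translated,
        }
        for it in items
    ]


def build_lookup(entries: List[Dict]) -> Dict[Tuple[int, int], str]:
    # Same (page, index) key _generate_translated_pdf_ocr_track uses
    # to match a translated string back to a raw OCR region.
    return {(e["page"], e["index"]): e["translated"] for e in entries if e.get("translated")}


# A region at enumerate() index 3 on page 2 round-trips to the key (2, 3).
item = Item(element_id="raw_ocr_2_3", content="原文", translated="Source text",
            page_number=2, cell_position=(3, 0))
assert build_lookup(to_raw_ocr_translations([item]))[(2, 3)] == "Source text"
```

Because the extraction loop enumerates regions before skipping empty text, the stored index always matches the position in the page's `raw_ocr_regions.json`, which is what keeps this lookup consistent with the renderer.
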
@@ -464,52 +568,98 @@ class TranslationService: total_elements: Total elements in document processing_time: Processing time in seconds batch_count: Number of batches used + processing_track: 'ocr' or 'direct' - determines output format Returns: Translation result dictionary """ - # Build translations dict - translations: Dict[str, Any] = {} total_chars = 0 + is_ocr_track = processing_track == 'ocr' - for item in translated_items: - total_chars += len(item.translated_content) + if is_ocr_track: + # OCR Track: Build raw_ocr_translations list + raw_ocr_translations: List[Dict] = [] - if item.element_type == 'table_cell': - # Group table cells by element_id - if item.element_id not in translations: - translations[item.element_id] = {'cells': []} + for item in translated_items: + total_chars += len(item.translated_content) - translations[item.element_id]['cells'].append({ - 'row': item.cell_position[0] if item.cell_position else 0, - 'col': item.cell_position[1] if item.cell_position else 0, - 'content': item.translated_content - }) - else: - translations[item.element_id] = item.translated_content + if item.element_type == 'raw_ocr_region': + # Extract page and index from element_id: "raw_ocr_{page}_{idx}" + page_num = item.page_number + original_idx = item.cell_position[0] if item.cell_position else 0 - # Build statistics - translated_element_ids = set(item.element_id for item in translated_items) - skipped = total_elements - len(translated_element_ids) + raw_ocr_translations.append({ + 'page': page_num, + 'index': original_idx, + 'original': item.original_content, + 'translated': item.translated_content + }) - result = { - 'schema_version': '1.0.0', - 'source_document': source_document, - 'source_lang': source_lang, - 'target_lang': target_lang, - 'provider': 'dify', - 'translated_at': datetime.utcnow().isoformat() + 'Z', - 'statistics': { - 'total_elements': total_elements, - 'translated_elements': len(translated_element_ids), - 'skipped_elements': skipped, - 'total_characters': total_chars, - 'processing_time_seconds': round(processing_time, 2), - 'total_tokens': self._total_tokens, - 'batch_count': batch_count - }, - 'translations': translations - } + # Build statistics + skipped = total_elements - len(raw_ocr_translations) + + result = { + 'schema_version': '1.0.0', + 'source_document': source_document, + 'source_lang': source_lang, + 'target_lang': target_lang, + 'provider': 'dify', + 'translated_at': datetime.utcnow().isoformat() + 'Z', + 'processing_track': 'ocr', + 'statistics': { + 'total_elements': total_elements, + 'translated_elements': len(raw_ocr_translations), + 'skipped_elements': skipped, + 'total_characters': total_chars, + 'processing_time_seconds': round(processing_time, 2), + 'total_tokens': self._total_tokens, + 'batch_count': batch_count + }, + 'translations': {}, # Empty for OCR Track + 'raw_ocr_translations': raw_ocr_translations + } + else: + # Direct Track: Build translations dict (existing logic) + translations: Dict[str, Any] = {} + + for item in translated_items: + total_chars += len(item.translated_content) + + if item.element_type == 'table_cell': + # Group table cells by element_id + if item.element_id not in translations: + translations[item.element_id] = {'cells': []} + + translations[item.element_id]['cells'].append({ + 'row': item.cell_position[0] if item.cell_position else 0, + 'col': item.cell_position[1] if item.cell_position else 0, + 'content': item.translated_content + }) + else: + translations[item.element_id] = item.translated_content + + # Build 
statistics + translated_element_ids = set(item.element_id for item in translated_items) + skipped = total_elements - len(translated_element_ids) + + result = { + 'schema_version': '1.0.0', + 'source_document': source_document, + 'source_lang': source_lang, + 'target_lang': target_lang, + 'provider': 'dify', + 'translated_at': datetime.utcnow().isoformat() + 'Z', + 'statistics': { + 'total_elements': total_elements, + 'translated_elements': len(translated_element_ids), + 'skipped_elements': skipped, + 'total_characters': total_chars, + 'processing_time_seconds': round(processing_time, 2), + 'total_tokens': self._total_tokens, + 'batch_count': batch_count + }, + 'translations': translations + } return result @@ -548,9 +698,13 @@ class TranslationService: result_json = json.load(f) source_document = result_json.get('metadata', {}).get('filename', 'unknown') + processing_track = result_json.get('metadata', {}).get('processing_track', 'direct') + result_dir = result_json_path.parent - # Extract translatable elements - items, total_elements = self.extract_translatable_elements(result_json) + # Extract translatable elements (passes result_dir and task_id for OCR Track) + items, total_elements = self.extract_translatable_elements( + result_json, result_dir, task_id + ) if not items: logger.warning("No translatable elements found") @@ -597,7 +751,8 @@ class TranslationService: target_lang=target_lang, total_elements=total_elements, processing_time=processing_time, - batch_count=len(batches) + batch_count=len(batches), + processing_track=processing_track ) # Save result diff --git a/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/proposal.md b/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/proposal.md new file mode 100644 index 0000000..6f949f9 --- /dev/null +++ b/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/proposal.md @@ -0,0 +1,51 @@ +# Change: Fix OCR Track Reflow PDF + +## Why + +The OCR Track reflow PDF generation is missing most content because: + +1. PP-StructureV3 extracts tables as elements but stores `content: ""` (empty string) instead of structured `content.cells` data +2. The `generate_reflow_pdf` method expects `content.cells` for tables, so tables are skipped +3. Table text exists in `raw_ocr_regions.json` (59 text blocks) but is not used by reflow PDF generation +4. This causes significant content loss - only 6 text elements vs 59 raw OCR regions + +The Layout PDF works correctly because it uses `raw_ocr_regions.json` via Simple Text Positioning mode, bypassing the need for structured table data. + +## What Changes + +### Reflow PDF Generation for OCR Track + +Modify `generate_reflow_pdf` to use `raw_ocr_regions.json` as the primary text source for OCR Track documents: + +1. **Detect processing track** from JSON metadata +2. **For OCR Track**: Load `raw_ocr_regions.json` and render all text blocks in reading order +3. **For Direct Track**: Continue using `content.cells` for tables (already works) +4. 
**Images/Charts**: Continue using `content.saved_path` from elements (works for both tracks) + +### Data Flow + +**OCR Track Reflow PDF (NEW):** +``` +raw_ocr_regions.json (59 text blocks) + + scan_result.json (images/charts only) + → Sort by Y coordinate (reading order) + → Render text paragraphs + images +``` + +**Direct Track Reflow PDF (UNCHANGED):** +``` +*_result.json (elements with content.cells) + → Render tables, text, images in order +``` + +## Impact + +- **Affected file**: `backend/app/services/pdf_generator_service.py` +- **User experience**: OCR Track reflow PDF will contain all text content (matching Layout PDF) +- **Translation**: Reflow translated PDF will also work correctly for OCR Track + +## Migration + +- No data migration required +- Existing `raw_ocr_regions.json` files contain all necessary data +- No API changes diff --git a/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/specs/result-export/spec.md b/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/specs/result-export/spec.md new file mode 100644 index 0000000..5d50832 --- /dev/null +++ b/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/specs/result-export/spec.md @@ -0,0 +1,23 @@ +## MODIFIED Requirements + +### Requirement: Enhanced PDF Export with Layout Preservation + +The PDF export SHALL accurately preserve document layout from both OCR and direct extraction tracks with correct coordinate transformation and multi-page support. For Direct Track, a background image rendering approach SHALL be used for visual fidelity. + +#### Scenario: OCR Track reflow PDF uses raw OCR regions +- **WHEN** generating reflow PDF for an OCR Track document +- **THEN** the system SHALL load text content from `raw_ocr_regions.json` files +- **AND** text blocks SHALL be sorted by Y coordinate for reading order +- **AND** all text content SHALL match the Layout PDF output +- **AND** images and charts SHALL be embedded from element `saved_path` + +#### Scenario: Direct Track reflow PDF uses structured content +- **WHEN** generating reflow PDF for a Direct Track document +- **THEN** the system SHALL use `content.cells` for table rendering +- **AND** text elements SHALL use `content` string directly +- **AND** images and charts SHALL be embedded from element `saved_path` + +#### Scenario: Reflow PDF content consistency +- **WHEN** comparing Layout PDF and Reflow PDF for the same document +- **THEN** both PDFs SHALL contain the same text content +- **AND** only the presentation format SHALL differ (positioned vs flowing) diff --git a/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/tasks.md b/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/tasks.md new file mode 100644 index 0000000..de52831 --- /dev/null +++ b/openspec/changes/archive/2025-12-12-fix-ocr-track-reflow-pdf/tasks.md @@ -0,0 +1,51 @@ +# Tasks: Fix OCR Track Reflow PDF + +## 1. 
Modify generate_reflow_pdf Method + +- [x] 1.1 Add processing track detection + - File: `backend/app/services/pdf_generator_service.py` + - Location: `generate_reflow_pdf` method (line ~4704) + - Read `metadata.processing_track` from JSON data + - Branch logic based on track type + +- [x] 1.2 Add helper function to load raw OCR regions + - File: `backend/app/services/pdf_generator_service.py` + - Using existing: `load_raw_ocr_regions` from `text_region_renderer.py` + - Pattern: `{task_id}_*_page_{page_num}_raw_ocr_regions.json` + - Return: List of text regions with bbox and content + +- [x] 1.3 Implement OCR Track reflow rendering + - File: `backend/app/services/pdf_generator_service.py` + - For OCR Track: Load raw OCR regions per page + - Sort text blocks by Y coordinate (top to bottom reading order) + - Render text blocks as paragraphs + - Still render images/charts from elements + +- [x] 1.4 Keep Direct Track logic unchanged + - File: `backend/app/services/pdf_generator_service.py` + - Direct Track continues using `content.cells` for tables + - Extracted to `_render_reflow_elements` helper method + - No changes to existing Direct Track flow + +## 2. Handle Multi-page Documents + +- [x] 2.1 Support per-page raw OCR files + - Pattern: `{task_id}_*_page_{page_num}_raw_ocr_regions.json` + - Iterate through pages and load corresponding raw OCR file + - Handle missing files gracefully (fall back to elements) + +## 3. Testing + +- [x] 3.1 Test OCR Track reflow PDF + - Test with: `a9259180-fc49-4890-8184-2e6d5f4edad3` (scan document) + - Verify: All 59 text blocks appear in reflow PDF + - Verify: Images are embedded correctly + +- [x] 3.2 Test Direct Track reflow PDF + - Test with: `1b32428d-0609-4cfd-bc52-56be6956ac2e` (editable PDF) + - Verify: Tables render with cells + - Verify: No regression from changes + +- [x] 3.3 Test translated reflow PDF + - Test: Complete translation then download reflow PDF + - Verify: Translated text appears correctly diff --git a/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/proposal.md b/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/proposal.md new file mode 100644 index 0000000..f4fae0a --- /dev/null +++ b/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/proposal.md @@ -0,0 +1,70 @@ +# Change: Fix OCR Track Translation + +## Why + +OCR Track translation is missing most content because: + +1. Translation service (`extract_translatable_elements`) only processes elements from `scan_result.json` +2. OCR Track tables have `content: ""` (empty string) - no `content.cells` data +3. All table text exists in `raw_ocr_regions.json` (59 text blocks) but translation service ignores it +4. Result: Only 6 text elements translated vs 59 raw OCR regions available + +**Current Data Flow (OCR Track):** +``` +scan_result.json (10 elements, 6 text, 2 empty tables) + → Translation extracts 6 text items + → 53 text blocks in tables are NOT translated +``` + +**Expected Data Flow (OCR Track):** +``` +raw_ocr_regions.json (59 text blocks) + → Translation extracts ALL 59 text items + → Complete translation coverage +``` + +## What Changes + +### 1. Translation Service Enhancement + +Modify `translate_document` in `translation_service.py` to: + +1. **Detect processing track** from result JSON metadata +2. **For OCR Track**: Load and translate `raw_ocr_regions.json` instead of elements +3. **For Direct Track**: Continue using elements with `content.cells` (already works) + +### 2. 
Translation Result Format for OCR Track + +Add new field `raw_ocr_translations` to translation JSON for OCR Track: + +```json +{ + "translations": { ... }, // element-based (for Direct Track) + "raw_ocr_translations": [ // NEW: for OCR Track + { + "index": 0, + "original": "华天科技(宝鸡)有限公司", + "translated": "Huatian Technology (Baoji) Co., Ltd." + }, + ... + ] +} +``` + +### 3. Translated PDF Generation + +Modify `generate_translated_pdf` to use `raw_ocr_translations` when available for OCR Track documents. + +## Impact + +- **Affected files**: + - `backend/app/services/translation_service.py` - extraction and translation logic + - `backend/app/services/pdf_generator_service.py` - translated PDF rendering +- **User experience**: OCR Track translations will include ALL text content +- **API**: Translation JSON format extended (backward compatible) + +## Migration + +- No data migration required +- Existing translations continue to work (Direct Track unaffected) +- Re-translation needed for OCR Track documents to get full coverage diff --git a/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/specs/translation/spec.md b/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/specs/translation/spec.md new file mode 100644 index 0000000..08e6192 --- /dev/null +++ b/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/specs/translation/spec.md @@ -0,0 +1,56 @@ +# translation Specification Delta + +## MODIFIED Requirements + +### Requirement: Translation Content Extraction + +The translation service SHALL extract content based on processing track type. + +#### Scenario: OCR Track translation extraction +- **GIVEN** a document processed with OCR Track +- **AND** the result JSON has `metadata.processing_track = "ocr"` +- **WHEN** translation service extracts translatable content +- **THEN** it SHALL load `raw_ocr_regions.json` for each page +- **AND** it SHALL extract all text blocks from raw OCR regions +- **AND** it SHALL NOT rely on `content.cells` from table elements + +#### Scenario: Direct Track translation extraction (unchanged) +- **GIVEN** a document processed with Direct Track +- **AND** the result JSON has `metadata.processing_track = "direct"` or no track specified +- **WHEN** translation service extracts translatable content +- **THEN** it SHALL extract from `pages[].elements[]` in result JSON +- **AND** it SHALL extract table cell content from `content.cells` + +### Requirement: Translation Result Format + +The translation result JSON SHALL support both element-based and raw OCR translations. + +#### Scenario: OCR Track translation result format +- **GIVEN** an OCR Track document has been translated +- **WHEN** translation result is saved +- **THEN** the JSON SHALL include `raw_ocr_translations` array +- **AND** each item SHALL have `index`, `original`, and `translated` fields +- **AND** the `translations` object MAY be empty or contain header text translations + +#### Scenario: Direct Track translation result format (unchanged) +- **GIVEN** a Direct Track document has been translated +- **WHEN** translation result is saved +- **THEN** the JSON SHALL use `translations` object mapping element_id to translated text +- **AND** `raw_ocr_translations` field SHALL NOT be present + +### Requirement: Translated PDF Generation + +The translated PDF generation SHALL use appropriate translation source based on processing track. 
+ +#### Scenario: OCR Track translated PDF generation +- **GIVEN** an OCR Track document with translations +- **AND** the translation JSON contains `raw_ocr_translations` +- **WHEN** generating translated reflow PDF +- **THEN** it SHALL apply translations from `raw_ocr_translations` by index +- **AND** it SHALL render all translated text blocks in reading order + +#### Scenario: Direct Track translated PDF generation (unchanged) +- **GIVEN** a Direct Track document with translations +- **WHEN** generating translated reflow PDF +- **THEN** it SHALL apply translations from `translations` object by element_id +- **AND** existing behavior SHALL be unchanged diff --git a/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/tasks.md b/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/tasks.md new file mode 100644 index 0000000..98207c0 --- /dev/null +++ b/openspec/changes/archive/2025-12-12-fix-ocr-track-translation/tasks.md @@ -0,0 +1,76 @@ +# Tasks: Fix OCR Track Translation + +## 1. Modify Translation Service + +- [x] 1.1 Add processing track detection + - File: `backend/app/services/translation_service.py` + - Location: `translate_document` method + - Read `metadata.processing_track` from result JSON + - Pass track type to extraction method + +- [x] 1.2 Create helper to load raw OCR regions + - File: `backend/app/services/translation_service.py` + - Function: `_load_raw_ocr_regions(result_dir, task_id, page_num)` + - Pattern: `{task_id}_*_page_{page_num}_raw_ocr_regions.json` + - Return: List of text regions with index and content + +- [x] 1.3 Modify extract_translatable_elements for OCR Track + - File: `backend/app/services/translation_service.py` + - Added: `extract_translatable_elements_ocr_track` method + - Added parameters: `result_dir: Path`, `task_id: str` + - For OCR Track: Extract from raw_ocr_regions.json + - For Direct Track: Keep existing element-based extraction + +- [x] 1.4 Update translation result format + - File: `backend/app/services/translation_service.py` + - Location: `build_translation_result` method + - Added `processing_track` parameter + - For OCR Track: Output `raw_ocr_translations` field + - Structure: `[{"page": 1, "index": 0, "original": "...", "translated": "..."}]` + +## 2. Modify PDF Generation + +- [x] 2.1 Update generate_translated_pdf for OCR Track + - File: `backend/app/services/pdf_generator_service.py` + - Detect `processing_track` and `raw_ocr_translations` from translation JSON + - For OCR Track: Call `_generate_translated_pdf_ocr_track` + - For Direct Track: Continue using `apply_translations` (element-based) + +- [x] 2.2 Create helper to apply raw OCR translations + - File: `backend/app/services/pdf_generator_service.py` + - Function: `_generate_translated_pdf_ocr_track` + - Build translation lookup: `{(page, index): translated_text}` + - Load raw OCR regions, sort by Y coordinate + - Render translated text with original fallback + +## 3. 
Additional Fixes + +- [x] 3.1 Add page_number to TranslatedItem + - File: `backend/app/schemas/translation.py` + - Added `page_number: int = 1` to TranslatedItem dataclass + - Updated `translate_batch` and `translate_item` to pass page_number + +- [x] 3.2 Update API endpoint validation + - File: `backend/app/routers/translate.py` + - Check for both `translations` (Direct Track) and `raw_ocr_translations` (OCR Track) + +- [x] 3.3 Filter text overlapping with images + - File: `backend/app/services/pdf_generator_service.py` + - Added `_collect_exclusion_zones`, `_is_region_overlapping_exclusion`, `_filter_regions_by_exclusion` + - Applied filtering in `generate_reflow_pdf` and `_generate_translated_pdf_ocr_track` + +## 4. Testing + +- [x] 4.1 Test OCR Track translation + - Test with: `f8265449-6cb7-425d-a213-5d2e1af73955` + - Verify: All 59 text blocks are sent for translation + - Verify: Translation JSON contains `raw_ocr_translations` + +- [x] 4.2 Test OCR Track translated PDF + - Generate translated reflow PDF + - Verify: All translated text blocks appear correctly + - Verify: Text inside images (like EWsenel) is filtered out + +- [x] 4.3 Test Direct Track unchanged + - Verify: Translation still uses element-based approach + - Verify: No regression in Direct Track flow diff --git a/openspec/specs/result-export/spec.md b/openspec/specs/result-export/spec.md index fbdb700..5712e0d 100644 --- a/openspec/specs/result-export/spec.md +++ b/openspec/specs/result-export/spec.md @@ -58,36 +58,23 @@ Export settings (format, thresholds, templates) SHALL apply consistently to V2 t The PDF export SHALL accurately preserve document layout from both OCR and direct extraction tracks with correct coordinate transformation and multi-page support. For Direct Track, a background image rendering approach SHALL be used for visual fidelity. 
-#### Scenario: Export PDF from direct extraction track -- **WHEN** exporting PDF from a direct-extraction processed document -- **THEN** the system SHALL render source PDF pages as full-page background images at 2x resolution -- **AND** overlay invisible text elements using PDF Text Rendering Mode 3 -- **AND** text SHALL remain selectable and searchable despite being invisible -- **AND** visual output SHALL match source document exactly +#### Scenario: OCR Track reflow PDF uses raw OCR regions +- **WHEN** generating reflow PDF for an OCR Track document +- **THEN** the system SHALL load text content from `raw_ocr_regions.json` files +- **AND** text blocks SHALL be sorted by Y coordinate for reading order +- **AND** all text content SHALL match the Layout PDF output +- **AND** images and charts SHALL be embedded from element `saved_path` -#### Scenario: Export PDF from OCR track with full structure -- **WHEN** exporting PDF from OCR-processed document -- **THEN** the PDF SHALL use all 23 PP-StructureV3 element types -- **AND** render tables with proper cell boundaries -- **AND** maintain reading order from parsing_res_list +#### Scenario: Direct Track reflow PDF uses structured content +- **WHEN** generating reflow PDF for a Direct Track document +- **THEN** the system SHALL use `content.cells` for table rendering +- **AND** text elements SHALL use `content` string directly +- **AND** images and charts SHALL be embedded from element `saved_path` -#### Scenario: Handle coordinate transformations correctly -- **WHEN** generating PDF from UnifiedDocument -- **THEN** system SHALL use explicit page dimensions from OCR results (not inferred from bounding boxes) -- **AND** correctly transform Y-axis coordinates from top-left (OCR) to bottom-left (PDF/ReportLab) origin -- **AND** prevent vertical flipping or position misalignment errors - -#### Scenario: Direct Track PDF file size increase -- **WHEN** generating Layout PDF for Direct Track documents -- **THEN** the system SHALL accept increased file size due to embedded page images -- **AND** approximately 1-2 MB per page at 2x resolution is expected -- **AND** this trade-off is accepted for improved visual fidelity - -#### Scenario: Chart elements excluded from text layer -- **WHEN** generating Layout PDF containing charts -- **THEN** the system SHALL NOT include chart-internal text in the invisible text layer -- **AND** chart visuals SHALL be preserved in the background image -- **AND** chart text SHALL NOT be available for text selection or translation +#### Scenario: Reflow PDF content consistency +- **WHEN** comparing Layout PDF and Reflow PDF for the same document +- **THEN** both PDFs SHALL contain the same text content +- **AND** only the presentation format SHALL differ (positioned vs flowing) ### Requirement: Structure Data Export The system SHALL provide export formats that preserve document structure for downstream processing.
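
As an informal check of the "Reflow PDF content consistency" scenario above, the text layers of a generated Layout/Reflow pair can be compared at the word level. This is an illustrative sketch only: it assumes pypdf is installed, the file names are placeholders for a real task's outputs, and a small difference is expected because the change deliberately filters image-overlapping text out of the reflow output.

```python
# Word-level spot check between a Layout PDF and a Reflow PDF.
# Assumes pypdf is installed; "layout.pdf" and "reflow.pdf" are placeholder paths.
import re
from pypdf import PdfReader


def extracted_words(pdf_path: str) -> set:
    # Concatenate the text layer of every page and split on whitespace.
    reader = PdfReader(pdf_path)
    text = " ".join(page.extract_text() or "" for page in reader.pages)
    return set(re.findall(r"\S+", text))


layout_words = extracted_words("layout.pdf")
reflow_words = extracted_words("reflow.pdf")

# Words present only in the layout output hint at regions the reflow path dropped;
# text that sat inside images is filtered on purpose and will show up here.
missing = layout_words - reflow_words
print(f"{len(missing)} words appear only in the layout PDF")
```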