fix: OCR Track reflow PDF generation and translation with image text filtering

- Add OCR Track support for reflow PDF generation using raw_ocr_regions.json
- Add OCR Track translation extraction from raw_ocr_regions instead of elements
- Add raw_ocr_translations output format for OCR Track documents
- Add exclusion zone filtering to remove text overlapping with images
- Update API validation to accept both translations and raw_ocr_translations
- Add page_number field to TranslatedItem for proper tracking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-12 11:02:35 +08:00
parent 24253ac15e
commit 1f18010040
11 changed files with 1040 additions and 149 deletions

View File

@@ -233,19 +233,118 @@ class TranslationService:
self._total_tokens = 0
self._total_latency = 0.0
def _load_raw_ocr_regions(
self,
result_dir: Path,
task_id: str,
page_num: int
) -> List[Dict]:
"""
Load raw OCR regions for a specific page.
Args:
result_dir: Path to result directory
task_id: Task ID
page_num: Page number (1-indexed)
Returns:
List of raw OCR region dictionaries with 'text' and 'bbox'
"""
import glob
# Pattern: {task_id}_*_page_{page_num}_raw_ocr_regions.json
pattern = str(result_dir / f"{task_id}_*_page_{page_num}_raw_ocr_regions.json")
matches = glob.glob(pattern)
if not matches:
logger.warning(f"No raw OCR regions file found for page {page_num}")
return []
try:
with open(matches[0], 'r', encoding='utf-8') as f:
regions = json.load(f)
logger.info(f"Loaded {len(regions)} raw OCR regions from {matches[0]}")
return regions
except Exception as e:
logger.error(f"Failed to load raw OCR regions: {e}")
return []
def extract_translatable_elements_ocr_track(
    self,
    result_json: Dict,
    result_dir: Path,
    task_id: str
) -> Tuple[List[TranslatableItem], int]:
    """
    Extract translatable elements from raw OCR regions for OCR Track documents.

    Args:
        result_json: UnifiedDocument JSON data
        result_dir: Path to result directory
        task_id: Task ID

    Returns:
        Tuple of (list of TranslatableItem, total region count)
    """
    collected: List[TranslatableItem] = []
    region_count = 0

    for page in result_json.get('pages', []):
        page_no = page.get('page_number', 1)

        # Per-page regions come from the sidecar raw_ocr_regions.json files.
        regions = self._load_raw_ocr_regions(result_dir, task_id, page_no)
        region_count += len(regions)

        for pos, region in enumerate(regions):
            stripped = region.get('text', '').strip()
            if not stripped:
                # Blank regions still count toward the total but are
                # not worth sending to the translator.
                continue

            # Synthesize an element_id from page + index; the original
            # region index is preserved in cell_position for write-back.
            collected.append(TranslatableItem(
                element_id=f"raw_ocr_{page_no}_{pos}",
                content=stripped,
                element_type='raw_ocr_region',
                page_number=page_no,
                cell_position=(pos, 0)
            ))

    logger.info(
        f"Extracted {len(collected)} translatable items from {region_count} raw OCR regions (OCR Track)"
    )

    return collected, region_count
def extract_translatable_elements(
self,
result_json: Dict
result_json: Dict,
result_dir: Optional[Path] = None,
task_id: Optional[str] = None
) -> Tuple[List[TranslatableItem], int]:
"""
Extract all translatable elements from a result JSON.
For OCR Track documents, extracts from raw_ocr_regions.json files.
For Direct Track documents, extracts from elements in result JSON.
Args:
result_json: UnifiedDocument JSON data
result_dir: Path to result directory (required for OCR Track)
task_id: Task ID (required for OCR Track)
Returns:
Tuple of (list of TranslatableItem, total element count)
"""
# Check processing track
metadata = result_json.get('metadata', {})
processing_track = metadata.get('processing_track', 'direct')
# For OCR Track, use raw OCR regions
if processing_track == 'ocr' and result_dir and task_id:
return self.extract_translatable_elements_ocr_track(
result_json, result_dir, task_id
)
# For Direct Track, use element-based extraction
items = []
total_elements = 0
@@ -290,7 +389,7 @@ class TranslationService:
))
logger.info(
f"Extracted {len(items)} translatable items from {total_elements} elements"
f"Extracted {len(items)} translatable items from {total_elements} elements (Direct Track)"
)
return items, total_elements
@@ -378,6 +477,7 @@ class TranslationService:
original_content=item.content,
translated_content=translated_content,
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
))
@@ -392,6 +492,7 @@ class TranslationService:
original_content=item.content,
translated_content=item.content, # Keep original
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
)
for item in batch.items
@@ -429,6 +530,7 @@ class TranslationService:
original_content=item.content,
translated_content=response.translated_text,
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
)
@@ -440,6 +542,7 @@ class TranslationService:
original_content=item.content,
translated_content=item.content, # Keep original
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
)
@@ -451,7 +554,8 @@ class TranslationService:
target_lang: str,
total_elements: int,
processing_time: float,
batch_count: int
batch_count: int,
processing_track: str = 'direct'
) -> Dict:
"""
Build the translation result JSON structure.
@@ -464,52 +568,98 @@ class TranslationService:
total_elements: Total elements in document
processing_time: Processing time in seconds
batch_count: Number of batches used
processing_track: 'ocr' or 'direct' - determines output format
Returns:
Translation result dictionary
"""
# Build translations dict
translations: Dict[str, Any] = {}
total_chars = 0
is_ocr_track = processing_track == 'ocr'
for item in translated_items:
total_chars += len(item.translated_content)
if is_ocr_track:
# OCR Track: Build raw_ocr_translations list
raw_ocr_translations: List[Dict] = []
if item.element_type == 'table_cell':
# Group table cells by element_id
if item.element_id not in translations:
translations[item.element_id] = {'cells': []}
for item in translated_items:
total_chars += len(item.translated_content)
translations[item.element_id]['cells'].append({
'row': item.cell_position[0] if item.cell_position else 0,
'col': item.cell_position[1] if item.cell_position else 0,
'content': item.translated_content
})
else:
translations[item.element_id] = item.translated_content
if item.element_type == 'raw_ocr_region':
# Extract page and index from element_id: "raw_ocr_{page}_{idx}"
page_num = item.page_number
original_idx = item.cell_position[0] if item.cell_position else 0
# Build statistics
translated_element_ids = set(item.element_id for item in translated_items)
skipped = total_elements - len(translated_element_ids)
raw_ocr_translations.append({
'page': page_num,
'index': original_idx,
'original': item.original_content,
'translated': item.translated_content
})
result = {
'schema_version': '1.0.0',
'source_document': source_document,
'source_lang': source_lang,
'target_lang': target_lang,
'provider': 'dify',
'translated_at': datetime.utcnow().isoformat() + 'Z',
'statistics': {
'total_elements': total_elements,
'translated_elements': len(translated_element_ids),
'skipped_elements': skipped,
'total_characters': total_chars,
'processing_time_seconds': round(processing_time, 2),
'total_tokens': self._total_tokens,
'batch_count': batch_count
},
'translations': translations
}
# Build statistics
skipped = total_elements - len(raw_ocr_translations)
result = {
'schema_version': '1.0.0',
'source_document': source_document,
'source_lang': source_lang,
'target_lang': target_lang,
'provider': 'dify',
'translated_at': datetime.utcnow().isoformat() + 'Z',
'processing_track': 'ocr',
'statistics': {
'total_elements': total_elements,
'translated_elements': len(raw_ocr_translations),
'skipped_elements': skipped,
'total_characters': total_chars,
'processing_time_seconds': round(processing_time, 2),
'total_tokens': self._total_tokens,
'batch_count': batch_count
},
'translations': {}, # Empty for OCR Track
'raw_ocr_translations': raw_ocr_translations
}
else:
# Direct Track: Build translations dict (existing logic)
translations: Dict[str, Any] = {}
for item in translated_items:
total_chars += len(item.translated_content)
if item.element_type == 'table_cell':
# Group table cells by element_id
if item.element_id not in translations:
translations[item.element_id] = {'cells': []}
translations[item.element_id]['cells'].append({
'row': item.cell_position[0] if item.cell_position else 0,
'col': item.cell_position[1] if item.cell_position else 0,
'content': item.translated_content
})
else:
translations[item.element_id] = item.translated_content
# Build statistics
translated_element_ids = set(item.element_id for item in translated_items)
skipped = total_elements - len(translated_element_ids)
result = {
'schema_version': '1.0.0',
'source_document': source_document,
'source_lang': source_lang,
'target_lang': target_lang,
'provider': 'dify',
'translated_at': datetime.utcnow().isoformat() + 'Z',
'statistics': {
'total_elements': total_elements,
'translated_elements': len(translated_element_ids),
'skipped_elements': skipped,
'total_characters': total_chars,
'processing_time_seconds': round(processing_time, 2),
'total_tokens': self._total_tokens,
'batch_count': batch_count
},
'translations': translations
}
return result
@@ -548,9 +698,13 @@ class TranslationService:
result_json = json.load(f)
source_document = result_json.get('metadata', {}).get('filename', 'unknown')
processing_track = result_json.get('metadata', {}).get('processing_track', 'direct')
result_dir = result_json_path.parent
# Extract translatable elements
items, total_elements = self.extract_translatable_elements(result_json)
# Extract translatable elements (passes result_dir and task_id for OCR Track)
items, total_elements = self.extract_translatable_elements(
result_json, result_dir, task_id
)
if not items:
logger.warning("No translatable elements found")
@@ -597,7 +751,8 @@ class TranslationService:
target_lang=target_lang,
total_elements=total_elements,
processing_time=processing_time,
batch_count=len(batches)
batch_count=len(batches),
processing_track=processing_track
)
# Save result