fix: OCR Track reflow PDF generation and translation with image text filtering

- Add OCR Track support for reflow PDF generation using raw_ocr_regions.json
- Add OCR Track translation extraction from raw_ocr_regions instead of elements
- Add raw_ocr_translations output format for OCR Track documents
- Add exclusion zone filtering to remove text overlapping with images
- Update API validation to accept both translations and raw_ocr_translations
- Add page_number field to TranslatedItem for proper tracking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-12 11:02:35 +08:00
parent 24253ac15e
commit 1f18010040
11 changed files with 1040 additions and 149 deletions

View File

@@ -233,19 +233,118 @@ class TranslationService:
self._total_tokens = 0
self._total_latency = 0.0
def _load_raw_ocr_regions(
self,
result_dir: Path,
task_id: str,
page_num: int
) -> List[Dict]:
"""
Load raw OCR regions for a specific page.
Args:
result_dir: Path to result directory
task_id: Task ID
page_num: Page number (1-indexed)
Returns:
List of raw OCR region dictionaries with 'text' and 'bbox'
"""
import glob
# Pattern: {task_id}_*_page_{page_num}_raw_ocr_regions.json
pattern = str(result_dir / f"{task_id}_*_page_{page_num}_raw_ocr_regions.json")
matches = glob.glob(pattern)
if not matches:
logger.warning(f"No raw OCR regions file found for page {page_num}")
return []
try:
with open(matches[0], 'r', encoding='utf-8') as f:
regions = json.load(f)
logger.info(f"Loaded {len(regions)} raw OCR regions from {matches[0]}")
return regions
except Exception as e:
logger.error(f"Failed to load raw OCR regions: {e}")
return []
def extract_translatable_elements_ocr_track(
    self,
    result_json: Dict,
    result_dir: Path,
    task_id: str
) -> Tuple[List[TranslatableItem], int]:
    """
    Extract translatable elements from raw OCR regions for OCR Track documents.

    Args:
        result_json: UnifiedDocument JSON data
        result_dir: Path to result directory
        task_id: Task ID

    Returns:
        Tuple of (list of TranslatableItem, total region count)
    """
    collected: List[TranslatableItem] = []
    region_count = 0

    for page in result_json.get('pages', []):
        page_no = page.get('page_number', 1)

        # Per-page regions come from the sidecar raw_ocr_regions.json files.
        regions = self._load_raw_ocr_regions(result_dir, task_id, page_no)
        region_count += len(regions)

        for pos, region in enumerate(regions):
            stripped = region.get('text', '').strip()
            if not stripped:
                # Blank regions still count toward the total but are
                # not worth sending to the translator.
                continue

            # Synthesize an element_id from page + index; the original
            # region index is preserved in cell_position for write-back.
            collected.append(TranslatableItem(
                element_id=f"raw_ocr_{page_no}_{pos}",
                content=stripped,
                element_type='raw_ocr_region',
                page_number=page_no,
                cell_position=(pos, 0)
            ))

    logger.info(
        f"Extracted {len(collected)} translatable items from {region_count} raw OCR regions (OCR Track)"
    )

    return collected, region_count
def extract_translatable_elements(
self,
result_json: Dict
result_json: Dict,
result_dir: Optional[Path] = None,
task_id: Optional[str] = None
) -> Tuple[List[TranslatableItem], int]:
"""
Extract all translatable elements from a result JSON.
For OCR Track documents, extracts from raw_ocr_regions.json files.
For Direct Track documents, extracts from elements in result JSON.
Args:
result_json: UnifiedDocument JSON data
result_dir: Path to result directory (required for OCR Track)
task_id: Task ID (required for OCR Track)
Returns:
Tuple of (list of TranslatableItem, total element count)
"""
# Check processing track
metadata = result_json.get('metadata', {})
processing_track = metadata.get('processing_track', 'direct')
# For OCR Track, use raw OCR regions
if processing_track == 'ocr' and result_dir and task_id:
return self.extract_translatable_elements_ocr_track(
result_json, result_dir, task_id
)
# For Direct Track, use element-based extraction
items = []
total_elements = 0
@@ -290,7 +389,7 @@ class TranslationService:
))
logger.info(
f"Extracted {len(items)} translatable items from {total_elements} elements"
f"Extracted {len(items)} translatable items from {total_elements} elements (Direct Track)"
)
return items, total_elements
@@ -378,6 +477,7 @@ class TranslationService:
original_content=item.content,
translated_content=translated_content,
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
))
@@ -392,6 +492,7 @@ class TranslationService:
original_content=item.content,
translated_content=item.content, # Keep original
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
)
for item in batch.items
@@ -429,6 +530,7 @@ class TranslationService:
original_content=item.content,
translated_content=response.translated_text,
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
)
@@ -440,6 +542,7 @@ class TranslationService:
original_content=item.content,
translated_content=item.content, # Keep original
element_type=item.element_type,
page_number=item.page_number,
cell_position=item.cell_position
)
@@ -451,7 +554,8 @@ class TranslationService:
target_lang: str,
total_elements: int,
processing_time: float,
batch_count: int
batch_count: int,
processing_track: str = 'direct'
) -> Dict:
"""
Build the translation result JSON structure.
@@ -464,52 +568,98 @@ class TranslationService:
total_elements: Total elements in document
processing_time: Processing time in seconds
batch_count: Number of batches used
processing_track: 'ocr' or 'direct' - determines output format
Returns:
Translation result dictionary
"""
# Build translations dict
translations: Dict[str, Any] = {}
total_chars = 0
is_ocr_track = processing_track == 'ocr'
for item in translated_items:
total_chars += len(item.translated_content)
if is_ocr_track:
# OCR Track: Build raw_ocr_translations list
raw_ocr_translations: List[Dict] = []
if item.element_type == 'table_cell':
# Group table cells by element_id
if item.element_id not in translations:
translations[item.element_id] = {'cells': []}
for item in translated_items:
total_chars += len(item.translated_content)
translations[item.element_id]['cells'].append({
'row': item.cell_position[0] if item.cell_position else 0,
'col': item.cell_position[1] if item.cell_position else 0,
'content': item.translated_content
})
else:
translations[item.element_id] = item.translated_content
if item.element_type == 'raw_ocr_region':
# Extract page and index from element_id: "raw_ocr_{page}_{idx}"
page_num = item.page_number
original_idx = item.cell_position[0] if item.cell_position else 0
# Build statistics
translated_element_ids = set(item.element_id for item in translated_items)
skipped = total_elements - len(translated_element_ids)
raw_ocr_translations.append({
'page': page_num,
'index': original_idx,
'original': item.original_content,
'translated': item.translated_content
})
result = {
'schema_version': '1.0.0',
'source_document': source_document,
'source_lang': source_lang,
'target_lang': target_lang,
'provider': 'dify',
'translated_at': datetime.utcnow().isoformat() + 'Z',
'statistics': {
'total_elements': total_elements,
'translated_elements': len(translated_element_ids),
'skipped_elements': skipped,
'total_characters': total_chars,
'processing_time_seconds': round(processing_time, 2),
'total_tokens': self._total_tokens,
'batch_count': batch_count
},
'translations': translations
}
# Build statistics
skipped = total_elements - len(raw_ocr_translations)
result = {
'schema_version': '1.0.0',
'source_document': source_document,
'source_lang': source_lang,
'target_lang': target_lang,
'provider': 'dify',
'translated_at': datetime.utcnow().isoformat() + 'Z',
'processing_track': 'ocr',
'statistics': {
'total_elements': total_elements,
'translated_elements': len(raw_ocr_translations),
'skipped_elements': skipped,
'total_characters': total_chars,
'processing_time_seconds': round(processing_time, 2),
'total_tokens': self._total_tokens,
'batch_count': batch_count
},
'translations': {}, # Empty for OCR Track
'raw_ocr_translations': raw_ocr_translations
}
else:
# Direct Track: Build translations dict (existing logic)
translations: Dict[str, Any] = {}
for item in translated_items:
total_chars += len(item.translated_content)
if item.element_type == 'table_cell':
# Group table cells by element_id
if item.element_id not in translations:
translations[item.element_id] = {'cells': []}
translations[item.element_id]['cells'].append({
'row': item.cell_position[0] if item.cell_position else 0,
'col': item.cell_position[1] if item.cell_position else 0,
'content': item.translated_content
})
else:
translations[item.element_id] = item.translated_content
# Build statistics
translated_element_ids = set(item.element_id for item in translated_items)
skipped = total_elements - len(translated_element_ids)
result = {
'schema_version': '1.0.0',
'source_document': source_document,
'source_lang': source_lang,
'target_lang': target_lang,
'provider': 'dify',
'translated_at': datetime.utcnow().isoformat() + 'Z',
'statistics': {
'total_elements': total_elements,
'translated_elements': len(translated_element_ids),
'skipped_elements': skipped,
'total_characters': total_chars,
'processing_time_seconds': round(processing_time, 2),
'total_tokens': self._total_tokens,
'batch_count': batch_count
},
'translations': translations
}
return result
@@ -548,9 +698,13 @@ class TranslationService:
result_json = json.load(f)
source_document = result_json.get('metadata', {}).get('filename', 'unknown')
processing_track = result_json.get('metadata', {}).get('processing_track', 'direct')
result_dir = result_json_path.parent
# Extract translatable elements
items, total_elements = self.extract_translatable_elements(result_json)
# Extract translatable elements (passes result_dir and task_id for OCR Track)
items, total_elements = self.extract_translatable_elements(
result_json, result_dir, task_id
)
if not items:
logger.warning("No translatable elements found")
@@ -597,7 +751,8 @@ class TranslationService:
target_lang=target_lang,
total_elements=total_elements,
processing_time=processing_time,
batch_count=len(batches)
batch_count=len(batches),
processing_track=processing_track
)
# Save result