fix: OCR Track reflow PDF and translation with image text filtering
- Add OCR Track support for reflow PDF generation using raw_ocr_regions.json
- Add OCR Track translation extraction from raw_ocr_regions instead of elements
- Add raw_ocr_translations output format for OCR Track documents
- Add exclusion zone filtering to remove text overlapping with images
- Update API validation to accept both translations and raw_ocr_translations
- Add page_number field to TranslatedItem for proper tracking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
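For context, a minimal sketch of the data shapes this change assumes, inferred from the diff below ('text' and 'bbox' come straight from the `_load_raw_ocr_regions` docstring; the sample values are illustrative only):

```python
# Each {task_id}_*_page_{N}_raw_ocr_regions.json file is a list of region
# dicts carrying at least 'text' and 'bbox' (sample values are invented):
raw_ocr_regions = [
    {"text": "Quarterly Report", "bbox": [72, 90, 310, 118]},
    {"text": "Revenue grew 12%", "bbox": [72, 140, 420, 165]},
]

# For OCR Track documents the translation result carries a parallel
# raw_ocr_translations list (keyed by page and region index) instead of
# the element-keyed translations dict used by the Direct Track:
raw_ocr_translations = [
    {"page": 1, "index": 0,
     "original": "Quarterly Report", "translated": "Rapport trimestriel"},
]
```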
@@ -233,19 +233,118 @@ class TranslationService:
         self._total_tokens = 0
         self._total_latency = 0.0
 
+    def _load_raw_ocr_regions(
+        self,
+        result_dir: Path,
+        task_id: str,
+        page_num: int
+    ) -> List[Dict]:
+        """
+        Load raw OCR regions for a specific page.
+
+        Args:
+            result_dir: Path to result directory
+            task_id: Task ID
+            page_num: Page number (1-indexed)
+
+        Returns:
+            List of raw OCR region dictionaries with 'text' and 'bbox'
+        """
+        import glob
+
+        # Pattern: {task_id}_*_page_{page_num}_raw_ocr_regions.json
+        pattern = str(result_dir / f"{task_id}_*_page_{page_num}_raw_ocr_regions.json")
+        matches = glob.glob(pattern)
+
+        if not matches:
+            logger.warning(f"No raw OCR regions file found for page {page_num}")
+            return []
+
+        try:
+            with open(matches[0], 'r', encoding='utf-8') as f:
+                regions = json.load(f)
+            logger.info(f"Loaded {len(regions)} raw OCR regions from {matches[0]}")
+            return regions
+        except Exception as e:
+            logger.error(f"Failed to load raw OCR regions: {e}")
+            return []
+
+    def extract_translatable_elements_ocr_track(
+        self,
+        result_json: Dict,
+        result_dir: Path,
+        task_id: str
+    ) -> Tuple[List[TranslatableItem], int]:
+        """
+        Extract translatable elements from raw OCR regions for OCR Track documents.
+
+        Args:
+            result_json: UnifiedDocument JSON data
+            result_dir: Path to result directory
+            task_id: Task ID
+
+        Returns:
+            Tuple of (list of TranslatableItem, total region count)
+        """
+        items = []
+        total_regions = 0
+
+        for page in result_json.get('pages', []):
+            page_number = page.get('page_number', 1)
+
+            # Load raw OCR regions for this page
+            raw_regions = self._load_raw_ocr_regions(result_dir, task_id, page_number)
+
+            for idx, region in enumerate(raw_regions):
+                total_regions += 1
+                text = region.get('text', '').strip()
+
+                if text:
+                    # Use index as element_id for raw OCR regions
+                    items.append(TranslatableItem(
+                        element_id=f"raw_ocr_{page_number}_{idx}",
+                        content=text,
+                        element_type='raw_ocr_region',
+                        page_number=page_number,
+                        cell_position=(idx, 0)  # Store original index in cell_position
+                    ))
+
+        logger.info(
+            f"Extracted {len(items)} translatable items from {total_regions} raw OCR regions (OCR Track)"
+        )
+        return items, total_regions
+
     def extract_translatable_elements(
         self,
-        result_json: Dict
+        result_json: Dict,
+        result_dir: Optional[Path] = None,
+        task_id: Optional[str] = None
     ) -> Tuple[List[TranslatableItem], int]:
         """
         Extract all translatable elements from a result JSON.
 
+        For OCR Track documents, extracts from raw_ocr_regions.json files.
+        For Direct Track documents, extracts from elements in result JSON.
+
         Args:
             result_json: UnifiedDocument JSON data
+            result_dir: Path to result directory (required for OCR Track)
+            task_id: Task ID (required for OCR Track)
 
         Returns:
             Tuple of (list of TranslatableItem, total element count)
         """
+        # Check processing track
+        metadata = result_json.get('metadata', {})
+        processing_track = metadata.get('processing_track', 'direct')
+
+        # For OCR Track, use raw OCR regions
+        if processing_track == 'ocr' and result_dir and task_id:
+            return self.extract_translatable_elements_ocr_track(
+                result_json, result_dir, task_id
+            )
+
+        # For Direct Track, use element-based extraction
         items = []
         total_elements = 0
 
@@ -290,7 +389,7 @@ class TranslationService:
             ))
 
         logger.info(
-            f"Extracted {len(items)} translatable items from {total_elements} elements"
+            f"Extracted {len(items)} translatable items from {total_elements} elements (Direct Track)"
         )
         return items, total_elements
 
@@ -378,6 +477,7 @@ class TranslationService:
                 original_content=item.content,
                 translated_content=translated_content,
                 element_type=item.element_type,
+                page_number=item.page_number,
                 cell_position=item.cell_position
             ))
 
@@ -392,6 +492,7 @@ class TranslationService:
                     original_content=item.content,
                     translated_content=item.content,  # Keep original
                     element_type=item.element_type,
+                    page_number=item.page_number,
                     cell_position=item.cell_position
                 )
                 for item in batch.items
@@ -429,6 +530,7 @@ class TranslationService:
                 original_content=item.content,
                 translated_content=response.translated_text,
                 element_type=item.element_type,
+                page_number=item.page_number,
                 cell_position=item.cell_position
             )
 
@@ -440,6 +542,7 @@ class TranslationService:
                 original_content=item.content,
                 translated_content=item.content,  # Keep original
                 element_type=item.element_type,
+                page_number=item.page_number,
                 cell_position=item.cell_position
             )
 
@@ -451,7 +554,8 @@ class TranslationService:
         target_lang: str,
         total_elements: int,
         processing_time: float,
-        batch_count: int
+        batch_count: int,
+        processing_track: str = 'direct'
     ) -> Dict:
         """
         Build the translation result JSON structure.
@@ -464,52 +568,98 @@ class TranslationService:
             total_elements: Total elements in document
             processing_time: Processing time in seconds
             batch_count: Number of batches used
+            processing_track: 'ocr' or 'direct' - determines output format
 
         Returns:
             Translation result dictionary
         """
-        # Build translations dict
-        translations: Dict[str, Any] = {}
         total_chars = 0
+        is_ocr_track = processing_track == 'ocr'
 
-        for item in translated_items:
-            total_chars += len(item.translated_content)
+        if is_ocr_track:
+            # OCR Track: Build raw_ocr_translations list
+            raw_ocr_translations: List[Dict] = []
 
-            if item.element_type == 'table_cell':
-                # Group table cells by element_id
-                if item.element_id not in translations:
-                    translations[item.element_id] = {'cells': []}
+            for item in translated_items:
+                total_chars += len(item.translated_content)
 
-                translations[item.element_id]['cells'].append({
-                    'row': item.cell_position[0] if item.cell_position else 0,
-                    'col': item.cell_position[1] if item.cell_position else 0,
-                    'content': item.translated_content
-                })
-            else:
-                translations[item.element_id] = item.translated_content
+                if item.element_type == 'raw_ocr_region':
+                    # Extract page and index from element_id: "raw_ocr_{page}_{idx}"
+                    page_num = item.page_number
+                    original_idx = item.cell_position[0] if item.cell_position else 0
 
-        # Build statistics
-        translated_element_ids = set(item.element_id for item in translated_items)
-        skipped = total_elements - len(translated_element_ids)
+                    raw_ocr_translations.append({
+                        'page': page_num,
+                        'index': original_idx,
+                        'original': item.original_content,
+                        'translated': item.translated_content
+                    })
 
-        result = {
-            'schema_version': '1.0.0',
-            'source_document': source_document,
-            'source_lang': source_lang,
-            'target_lang': target_lang,
-            'provider': 'dify',
-            'translated_at': datetime.utcnow().isoformat() + 'Z',
-            'statistics': {
-                'total_elements': total_elements,
-                'translated_elements': len(translated_element_ids),
-                'skipped_elements': skipped,
-                'total_characters': total_chars,
-                'processing_time_seconds': round(processing_time, 2),
-                'total_tokens': self._total_tokens,
-                'batch_count': batch_count
-            },
-            'translations': translations
-        }
+            # Build statistics
+            skipped = total_elements - len(raw_ocr_translations)
+
+            result = {
+                'schema_version': '1.0.0',
+                'source_document': source_document,
+                'source_lang': source_lang,
+                'target_lang': target_lang,
+                'provider': 'dify',
+                'translated_at': datetime.utcnow().isoformat() + 'Z',
+                'processing_track': 'ocr',
+                'statistics': {
+                    'total_elements': total_elements,
+                    'translated_elements': len(raw_ocr_translations),
+                    'skipped_elements': skipped,
+                    'total_characters': total_chars,
+                    'processing_time_seconds': round(processing_time, 2),
+                    'total_tokens': self._total_tokens,
+                    'batch_count': batch_count
+                },
+                'translations': {},  # Empty for OCR Track
+                'raw_ocr_translations': raw_ocr_translations
+            }
+        else:
+            # Direct Track: Build translations dict (existing logic)
+            translations: Dict[str, Any] = {}
+
+            for item in translated_items:
+                total_chars += len(item.translated_content)
+
+                if item.element_type == 'table_cell':
+                    # Group table cells by element_id
+                    if item.element_id not in translations:
+                        translations[item.element_id] = {'cells': []}
+
+                    translations[item.element_id]['cells'].append({
+                        'row': item.cell_position[0] if item.cell_position else 0,
+                        'col': item.cell_position[1] if item.cell_position else 0,
+                        'content': item.translated_content
+                    })
+                else:
+                    translations[item.element_id] = item.translated_content
+
+            # Build statistics
+            translated_element_ids = set(item.element_id for item in translated_items)
+            skipped = total_elements - len(translated_element_ids)
+
+            result = {
+                'schema_version': '1.0.0',
+                'source_document': source_document,
+                'source_lang': source_lang,
+                'target_lang': target_lang,
+                'provider': 'dify',
+                'translated_at': datetime.utcnow().isoformat() + 'Z',
+                'statistics': {
+                    'total_elements': total_elements,
+                    'translated_elements': len(translated_element_ids),
+                    'skipped_elements': skipped,
+                    'total_characters': total_chars,
+                    'processing_time_seconds': round(processing_time, 2),
+                    'total_tokens': self._total_tokens,
+                    'batch_count': batch_count
+                },
+                'translations': translations
+            }
 
         return result
 
@@ -548,9 +698,13 @@ class TranslationService:
             result_json = json.load(f)
 
         source_document = result_json.get('metadata', {}).get('filename', 'unknown')
+        processing_track = result_json.get('metadata', {}).get('processing_track', 'direct')
+        result_dir = result_json_path.parent
 
-        # Extract translatable elements
-        items, total_elements = self.extract_translatable_elements(result_json)
+        # Extract translatable elements (passes result_dir and task_id for OCR Track)
+        items, total_elements = self.extract_translatable_elements(
+            result_json, result_dir, task_id
+        )
 
         if not items:
             logger.warning("No translatable elements found")
@@ -597,7 +751,8 @@ class TranslationService:
             target_lang=target_lang,
             total_elements=total_elements,
             processing_time=processing_time,
-            batch_count=len(batches)
+            batch_count=len(batches),
+            processing_track=processing_track
         )
 
         # Save result
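The exclusion zone filtering named in the commit message lands outside the hunks shown here. As a rough sketch of the technique, assuming region and image bboxes are (x0, y0, x1, y1) rectangles and an area-overlap threshold (the function names and the 0.5 default are hypothetical, not taken from this commit):

```python
from typing import Dict, List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1)

def _overlap_ratio(region: Box, zone: Box) -> float:
    """Fraction of the region's area covered by the exclusion zone."""
    x0, y0 = max(region[0], zone[0]), max(region[1], zone[1])
    x1, y1 = min(region[2], zone[2]), min(region[3], zone[3])
    if x1 <= x0 or y1 <= y0:
        return 0.0  # no intersection
    inter = (x1 - x0) * (y1 - y0)
    area = (region[2] - region[0]) * (region[3] - region[1])
    return inter / area if area > 0 else 0.0

def filter_image_text(regions: List[Dict], image_boxes: List[Box],
                      threshold: float = 0.5) -> List[Dict]:
    """Drop OCR regions whose bbox mostly falls inside an image bbox."""
    return [
        r for r in regions
        if all(_overlap_ratio(tuple(r['bbox']), zone) < threshold
               for zone in image_boxes)
    ]
```

An area-ratio test like this keeps text that merely touches an image border while dropping regions that sit mostly inside one; the threshold actually used by the commit may differ.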