fix: OCR Track reflow PDF and translation with image text filtering
- Add OCR Track support for reflow PDF generation using raw_ocr_regions.json
- Add OCR Track translation extraction from raw_ocr_regions instead of elements
- Add raw_ocr_translations output format for OCR Track documents
- Add exclusion zone filtering to remove text overlapping with images
- Update API validation to accept both translations and raw_ocr_translations
- Add page_number field to TranslatedItem for proper tracking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -578,7 +578,10 @@ async def download_translated_pdf(
     with open(translation_file, 'r', encoding='utf-8') as f:
         translation_data = json.load(f)

-    if not translation_data.get('translations'):
+    # Check for translations (Direct Track) or raw_ocr_translations (OCR Track)
+    has_translations = translation_data.get('translations')
+    has_raw_ocr_translations = translation_data.get('raw_ocr_translations')
+    if not has_translations and not has_raw_ocr_translations:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Translation file is empty or incomplete"
@@ -146,6 +146,7 @@ class TranslatedItem:
     original_content: str
     translated_content: str
     element_type: str
+    page_number: int = 1
     cell_position: Optional[Tuple[int, int]] = None

@@ -4701,53 +4701,155 @@ class PDFGeneratorService:
            logger.error(f"Failed to embed image for reflow: {e}")
            return None

-    def generate_reflow_pdf(
-        self,
-        json_path: Path,
-        output_path: Path,
-        source_file_path: Optional[Path] = None
-    ) -> bool:
    def _collect_exclusion_zones(self, page_data: Dict) -> List[Tuple[float, float, float, float]]:
        """
-        Generate reflow layout PDF from OCR/Direct JSON data.
        Collect exclusion zones (image bboxes) from page elements.

-        This creates a flowing document with consistent font sizes,
-        proper reading order, and inline tables/images.
        These zones are used to filter out OCR text that overlaps with images,
        preventing text inside images from appearing in reflow PDFs.

        Args:
-            json_path: Path to result JSON file (UnifiedDocument format)
-            output_path: Path to save generated PDF
-            source_file_path: Optional path to original source file (for images)
            page_data: Page dictionary containing 'elements'

        Returns:
-            True if successful, False otherwise
            List of (x0, y0, x1, y1) tuples representing image bounding boxes
        """
-        try:
-            # Load JSON data
-            logger.info(f"Generating reflow PDF from: {json_path}")
-            with open(json_path, 'r', encoding='utf-8') as f:
-                json_data = json.load(f)
        exclusion_zones = []

-            # Get styles
-            styles = self._get_reflow_styles()
        elements = page_data.get('elements', [])
        for elem in elements:
            elem_type = elem.get('type', '')

-            # Build document content
-            story = []
-            # Use source_file_path if provided (for translated PDFs where JSON is in temp dir)
-            # Otherwise use json_path.parent (for regular reflow PDFs)
-            if source_file_path and source_file_path.is_dir():
-                result_dir = source_file_path
-            elif source_file_path and source_file_path.is_file():
-                result_dir = source_file_path.parent
            # Collect image/chart bboxes
            if elem_type in ('image', 'Image', 'figure', 'Figure', 'chart', 'Chart'):
                bbox = elem.get('bbox', {})
                if isinstance(bbox, dict):
                    x0 = bbox.get('x0', 0)
                    y0 = bbox.get('y0', 0)
                    x1 = bbox.get('x1', 0)
                    y1 = bbox.get('y1', 0)
                    if x1 > x0 and y1 > y0:
                        exclusion_zones.append((x0, y0, x1, y1))

            # Collect embedded images in tables
            if elem_type in ('table', 'Table'):
                metadata = elem.get('metadata', {})
                embedded_images = metadata.get('embedded_images', [])
                for emb_img in embedded_images:
                    emb_bbox = emb_img.get('bbox', [])
                    if isinstance(emb_bbox, list) and len(emb_bbox) >= 4:
                        x0, y0, x1, y1 = emb_bbox[0], emb_bbox[1], emb_bbox[2], emb_bbox[3]
                        if x1 > x0 and y1 > y0:
                            exclusion_zones.append((x0, y0, x1, y1))

        return exclusion_zones

    def _is_region_overlapping_exclusion(
        self,
        region_bbox: List,
        exclusion_zones: List[Tuple[float, float, float, float]],
        ioa_threshold: float = 0.3
    ) -> bool:
        """
        Check if a text region overlaps significantly with any exclusion zone.

        Uses IoA (Intersection over Area) to determine overlap.

        Args:
            region_bbox: Quadrilateral bbox [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
            exclusion_zones: List of (x0, y0, x1, y1) tuples
            ioa_threshold: Overlap threshold (default 0.3 = 30%)

        Returns:
            True if region should be excluded
        """
        if not exclusion_zones or not region_bbox:
            return False

        # Convert quadrilateral to rectangular bbox
        if len(region_bbox) >= 4:
            xs = [p[0] for p in region_bbox]
            ys = [p[1] for p in region_bbox]
            tx0, ty0, tx1, ty1 = min(xs), min(ys), max(xs), max(ys)
        else:
-            result_dir = json_path.parent
            return False

-            # Process each page
-            pages = json_data.get('pages', [])
-            for page_idx, page_data in enumerate(pages):
-                if page_idx > 0:
-                    # Add page break between pages
-                    story.append(Spacer(1, 30))
        text_area = (tx1 - tx0) * (ty1 - ty0)
        if text_area <= 0:
            return False

        for zx0, zy0, zx1, zy1 in exclusion_zones:
            # Calculate intersection
            ix0 = max(tx0, zx0)
            iy0 = max(ty0, zy0)
            ix1 = min(tx1, zx1)
            iy1 = min(ty1, zy1)

            if ix1 > ix0 and iy1 > iy0:
                intersection_area = (ix1 - ix0) * (iy1 - iy0)
                ioa = intersection_area / text_area

                if ioa >= ioa_threshold:
                    return True

        return False

    def _filter_regions_by_exclusion(
        self,
        regions: List[Dict],
        exclusion_zones: List[Tuple[float, float, float, float]],
        ioa_threshold: float = 0.3
    ) -> List[Dict]:
        """
        Filter out text regions that overlap with exclusion zones (images).

        Args:
            regions: List of raw OCR regions with 'text' and 'bbox'
            exclusion_zones: List of (x0, y0, x1, y1) tuples
            ioa_threshold: Overlap threshold

        Returns:
            Filtered list of regions
        """
        if not exclusion_zones:
            return regions

        filtered = []
        excluded_count = 0

        for region in regions:
            bbox = region.get('bbox', [])
            if self._is_region_overlapping_exclusion(bbox, exclusion_zones, ioa_threshold):
                excluded_count += 1
                text = region.get('text', '')[:20]
                logger.debug(f"Excluding text '{text}...' due to image overlap")
            else:
                filtered.append(region)

        if excluded_count > 0:
            logger.info(f"Filtered {excluded_count} text regions overlapping with images")

        return filtered

    def _render_reflow_elements(
        self,
        page_data: Dict,
        result_dir: Path,
        styles: Dict,
        story: List
    ) -> None:
        """
        Render page elements in reflow format (Direct Track logic).

        This method processes elements from the JSON and renders them
        as flowing content (text, tables, images).

        Args:
            page_data: Page dictionary containing 'elements'
            result_dir: Path to result directory for images
            styles: Style dictionary for paragraphs
            story: List to append rendered elements to
        """
        # Get elements in reading order
        elements = self._get_elements_in_reading_order(page_data)

@@ -4829,6 +4931,141 @@ class PDFGeneratorService:
            content = content.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            story.append(Paragraph(content, styles['Body']))

    def generate_reflow_pdf(
        self,
        json_path: Path,
        output_path: Path,
        source_file_path: Optional[Path] = None,
        use_elements_only: bool = False
    ) -> bool:
        """
        Generate reflow layout PDF from OCR/Direct JSON data.

        This creates a flowing document with consistent font sizes,
        proper reading order, and inline tables/images.

        For OCR Track: Uses raw_ocr_regions.json for text content (ensures all text is included)
        For Direct Track: Uses content.cells for tables (structured data available)

        Args:
            json_path: Path to result JSON file (UnifiedDocument format)
            output_path: Path to save generated PDF
            source_file_path: Optional path to original source file (for images)
            use_elements_only: If True, always use elements from JSON (for translated PDFs
                where translations are applied to elements, not raw_ocr_regions)

        Returns:
            True if successful, False otherwise
        """
        try:
            # Load JSON data
            logger.info(f"Generating reflow PDF from: {json_path}")
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # Detect processing track
            metadata = json_data.get('metadata', {})
            processing_track = metadata.get('processing_track', 'direct')
            is_ocr_track = processing_track == 'ocr'
            logger.info(f"Reflow PDF generation - Processing track: {processing_track}")

            # Get styles
            styles = self._get_reflow_styles()

            # Build document content
            story = []
            # Use source_file_path if provided (for translated PDFs where JSON is in temp dir)
            # Otherwise use json_path.parent (for regular reflow PDFs)
            if source_file_path and source_file_path.is_dir():
                result_dir = source_file_path
            elif source_file_path and source_file_path.is_file():
                result_dir = source_file_path.parent
            else:
                result_dir = json_path.parent

            # Extract task_id from result_dir (directory name is the task_id)
            task_id = result_dir.name

            # Process each page
            pages = json_data.get('pages', [])
            for page_idx, page_data in enumerate(pages):
                page_num = page_idx + 1  # 1-indexed
                if page_idx > 0:
                    # Add page break between pages
                    story.append(Spacer(1, 30))

                # === OCR Track: Use raw_ocr_regions.json for text ===
                # But for translated PDFs (use_elements_only=True), use elements which have translations applied
                if is_ocr_track and not use_elements_only and TEXT_REGION_RENDERER_AVAILABLE and load_raw_ocr_regions:
                    # Load raw OCR regions for this page
                    raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)

                    if raw_regions:
                        logger.info(f"OCR Track reflow: Using {len(raw_regions)} raw OCR regions for page {page_num}")

                        # Collect exclusion zones (image bboxes) to filter text inside images
                        exclusion_zones = self._collect_exclusion_zones(page_data)
                        if exclusion_zones:
                            logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text filtering")
                            raw_regions = self._filter_regions_by_exclusion(raw_regions, exclusion_zones)

                        # Sort by Y coordinate (top to bottom reading order)
                        def get_y_coord(region):
                            bbox = region.get('bbox', [])
                            if bbox and len(bbox) >= 4:
                                # bbox is [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
                                # Get average Y of top-left and top-right corners
                                return (bbox[0][1] + bbox[1][1]) / 2
                            return 0

                        sorted_regions = sorted(raw_regions, key=get_y_coord)

                        # Render text blocks as paragraphs
                        for region in sorted_regions:
                            text = region.get('text', '')
                            if text:
                                text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                                story.append(Paragraph(text, styles['Body']))

                        # Also render images/charts from elements
                        elements = self._get_elements_in_reading_order(page_data)
                        for elem in elements:
                            elem_type = elem.get('type', elem.get('element_type', ''))
                            if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
                                img = self._embed_image_reflow(elem, result_dir)
                                if img:
                                    story.append(img)
                                    story.append(Spacer(1, 8))
                            # Handle embedded images in tables
                            elif elem_type in ('table', 'Table'):
                                elem_metadata = elem.get('metadata', {})
                                embedded_images = elem_metadata.get('embedded_images', [])
                                for emb_img in embedded_images:
                                    img_path_str = emb_img.get('saved_path', '')
                                    if img_path_str:
                                        img_path = result_dir / img_path_str
                                        if not img_path.exists():
                                            img_path = result_dir / Path(img_path_str).name
                                        if img_path.exists():
                                            try:
                                                img = PlatypusImage(str(img_path))
                                                max_width = 450
                                                if img.drawWidth > max_width:
                                                    ratio = max_width / img.drawWidth
                                                    img.drawWidth = max_width
                                                    img.drawHeight *= ratio
                                                story.append(img)
                                                story.append(Spacer(1, 8))
                                            except Exception as e:
                                                logger.warning(f"Failed to embed table image: {e}")
                    else:
                        # Fallback to elements if raw OCR regions not found
                        logger.warning(f"OCR Track: No raw OCR regions found for page {page_num}, falling back to elements")
                        self._render_reflow_elements(page_data, result_dir, styles, story)
                else:
                    # === Direct Track: Use structured content ===
                    self._render_reflow_elements(page_data, result_dir, styles, story)

            if not story:
                logger.warning("No content to generate reflow PDF")
                return False
@@ -4869,6 +5106,9 @@ class PDFGeneratorService:
        merges them to replace original content with translations, and
        generates a PDF with the translated content at original positions.

        For OCR Track: Uses raw_ocr_translations to translate raw OCR regions
        For Direct Track: Uses translations dict to translate elements

        Args:
            result_json_path: Path to original result JSON file (UnifiedDocument format)
            translation_json_path: Path to translation JSON file
@@ -4894,7 +5134,25 @@ class PDFGeneratorService:
            with open(translation_json_path, 'r', encoding='utf-8') as f:
                translation_json = json.load(f)

-            # Extract translations dict from translation JSON
            # Check if this is OCR Track with raw_ocr_translations
            raw_ocr_translations = translation_json.get('raw_ocr_translations', [])
            processing_track = translation_json.get('processing_track', '')
            target_lang = translation_json.get('target_lang', 'unknown')

            if raw_ocr_translations and processing_track == 'ocr':
                # OCR Track: Generate PDF using translated raw OCR regions
                logger.info(
                    f"Generating translated PDF (OCR Track): {len(raw_ocr_translations)} "
                    f"raw OCR translations, target_lang={target_lang}"
                )
                return self._generate_translated_pdf_ocr_track(
                    result_json=result_json,
                    raw_ocr_translations=raw_ocr_translations,
                    output_path=output_path,
                    result_dir=result_json_path.parent
                )

            # Direct Track: Use element-based translations
            translations = translation_json.get('translations', {})
            if not translations:
                logger.warning("No translations found in translation JSON")
@@ -4908,9 +5166,8 @@ class PDFGeneratorService:
            # Apply translations to result JSON
            translated_doc = apply_translations(result_json, translations)

-            target_lang = translation_json.get('target_lang', 'unknown')
            logger.info(
-                f"Generating translated PDF: {len(translations)} translations applied, "
                f"Generating translated PDF (Direct Track): {len(translations)} translations applied, "
                f"target_lang={target_lang}"
            )

@@ -4927,10 +5184,12 @@ class PDFGeneratorService:
            try:
                # Use reflow PDF generation for better translated content display
                # Pass result_json_path.parent as image directory (not the temp file's parent)
                # use_elements_only=True ensures we use translated elements, not raw_ocr_regions
                success = self.generate_reflow_pdf(
                    json_path=tmp_path,
                    output_path=output_path,
-                    source_file_path=result_json_path.parent  # Contains extracted images
                    source_file_path=result_json_path.parent,  # Contains extracted images
                    use_elements_only=True  # Use elements with translations applied
                )
                return success
            finally:
@@ -4950,6 +5209,165 @@ class PDFGeneratorService:
            traceback.print_exc()
            return False

    def _generate_translated_pdf_ocr_track(
        self,
        result_json: Dict,
        raw_ocr_translations: List[Dict],
        output_path: Path,
        result_dir: Path
    ) -> bool:
        """
        Generate translated reflow PDF for OCR Track documents.

        Uses raw_ocr_translations to render translated text in reading order.

        Args:
            result_json: Original result JSON data
            raw_ocr_translations: List of {page, index, original, translated}
            output_path: Path to save generated PDF
            result_dir: Path to result directory for images

        Returns:
            True if successful, False otherwise
        """
        try:
            # Get styles
            styles = self._get_reflow_styles()

            # Build document content
            story = []

            # Build translation lookup: {(page, index): translated_text}
            translation_lookup: Dict[Tuple[int, int], str] = {}
            for trans in raw_ocr_translations:
                page = trans.get('page', 1)
                idx = trans.get('index', 0)
                translated = trans.get('translated', '')
                if translated:
                    translation_lookup[(page, idx)] = translated

            logger.info(f"Built translation lookup with {len(translation_lookup)} entries")

            # Process each page
            pages = result_json.get('pages', [])
            task_id = result_dir.name

            for page_idx, page_data in enumerate(pages):
                page_num = page_idx + 1  # 1-indexed
                if page_idx > 0:
                    # Add page break between pages
                    story.append(Spacer(1, 30))

                # Load raw OCR regions for this page
                if TEXT_REGION_RENDERER_AVAILABLE and load_raw_ocr_regions:
                    raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)

                    if raw_regions:
                        logger.info(
                            f"OCR Track translated PDF: Processing {len(raw_regions)} regions "
                            f"for page {page_num}"
                        )

                        # Collect exclusion zones (image bboxes) to filter text inside images
                        exclusion_zones = self._collect_exclusion_zones(page_data)

                        # Sort by Y coordinate (top to bottom reading order)
                        # Keep original indices for translation lookup
                        def get_y_coord(region_tuple):
                            region = region_tuple[1]
                            bbox = region.get('bbox', [])
                            if bbox and len(bbox) >= 4:
                                return (bbox[0][1] + bbox[1][1]) / 2
                            return 0

                        indexed_regions = list(enumerate(raw_regions))
                        sorted_regions = sorted(indexed_regions, key=get_y_coord)

                        # Render translated text blocks as paragraphs (skip those overlapping images)
                        for original_idx, region in sorted_regions:
                            # Skip regions overlapping with images
                            bbox = region.get('bbox', [])
                            if exclusion_zones and self._is_region_overlapping_exclusion(bbox, exclusion_zones):
                                continue
                            # Look up translation
                            translated_text = translation_lookup.get(
                                (page_num, original_idx),
                                region.get('text', '')  # Fallback to original
                            )

                            if translated_text:
                                # Escape HTML special chars
                                translated_text = (translated_text
                                                   .replace('&', '&amp;')
                                                   .replace('<', '&lt;')
                                                   .replace('>', '&gt;'))
                                story.append(Paragraph(translated_text, styles['Body']))

                        # Also render images/charts from elements
                        elements = self._get_elements_in_reading_order(page_data)
                        for elem in elements:
                            elem_type = elem.get('type', elem.get('element_type', ''))
                            if elem_type in ('image', 'figure', 'Image', 'Figure', 'chart', 'Chart'):
                                img = self._embed_image_reflow(elem, result_dir)
                                if img:
                                    story.append(img)
                                    story.append(Spacer(1, 8))
                            # Handle embedded images in tables
                            elif elem_type in ('table', 'Table'):
                                elem_metadata = elem.get('metadata', {})
                                embedded_images = elem_metadata.get('embedded_images', [])
                                for emb_img in embedded_images:
                                    img_path_str = emb_img.get('saved_path', '')
                                    if img_path_str:
                                        img_path = result_dir / img_path_str
                                        if not img_path.exists():
                                            img_path = result_dir / Path(img_path_str).name
                                        if img_path.exists():
                                            try:
                                                img = PlatypusImage(str(img_path))
                                                max_width = 450
                                                if img.drawWidth > max_width:
                                                    ratio = max_width / img.drawWidth
                                                    img.drawWidth = max_width
                                                    img.drawHeight *= ratio
                                                story.append(img)
                                                story.append(Spacer(1, 8))
                                            except Exception as e:
                                                logger.warning(f"Failed to embed table image: {e}")
                    else:
                        logger.warning(
                            f"No raw OCR regions found for page {page_num}, skipping"
                        )

            if not story:
                logger.warning("No content to generate translated OCR Track PDF")
                return False

            # Create PDF document
            doc = SimpleDocTemplate(
                str(output_path),
                pagesize=A4,
                leftMargin=50,
                rightMargin=50,
                topMargin=50,
                bottomMargin=50
            )

            # Build PDF
            doc.build(story)

            logger.info(
                f"Generated translated OCR Track PDF: {output_path} "
                f"({output_path.stat().st_size} bytes)"
            )
            return True

        except Exception as e:
            logger.error(f"Failed to generate translated OCR Track PDF: {e}")
            import traceback
            traceback.print_exc()
            return False

    def generate_translated_layout_pdf(
        self,
        result_json_path: Path,
@@ -233,19 +233,118 @@ class TranslationService:
        self._total_tokens = 0
        self._total_latency = 0.0

    def _load_raw_ocr_regions(
        self,
        result_dir: Path,
        task_id: str,
        page_num: int
    ) -> List[Dict]:
        """
        Load raw OCR regions for a specific page.

        Args:
            result_dir: Path to result directory
            task_id: Task ID
            page_num: Page number (1-indexed)

        Returns:
            List of raw OCR region dictionaries with 'text' and 'bbox'
        """
        import glob

        # Pattern: {task_id}_*_page_{page_num}_raw_ocr_regions.json
        pattern = str(result_dir / f"{task_id}_*_page_{page_num}_raw_ocr_regions.json")
        matches = glob.glob(pattern)

        if not matches:
            logger.warning(f"No raw OCR regions file found for page {page_num}")
            return []

        try:
            with open(matches[0], 'r', encoding='utf-8') as f:
                regions = json.load(f)
            logger.info(f"Loaded {len(regions)} raw OCR regions from {matches[0]}")
            return regions
        except Exception as e:
            logger.error(f"Failed to load raw OCR regions: {e}")
            return []

    def extract_translatable_elements_ocr_track(
        self,
        result_json: Dict,
        result_dir: Path,
        task_id: str
    ) -> Tuple[List[TranslatableItem], int]:
        """
        Extract translatable elements from raw OCR regions for OCR Track documents.

        Args:
            result_json: UnifiedDocument JSON data
            result_dir: Path to result directory
            task_id: Task ID

        Returns:
            Tuple of (list of TranslatableItem, total region count)
        """
        items = []
        total_regions = 0

        for page in result_json.get('pages', []):
            page_number = page.get('page_number', 1)

            # Load raw OCR regions for this page
            raw_regions = self._load_raw_ocr_regions(result_dir, task_id, page_number)

            for idx, region in enumerate(raw_regions):
                total_regions += 1
                text = region.get('text', '').strip()

                if text:
                    # Use index as element_id for raw OCR regions
                    items.append(TranslatableItem(
                        element_id=f"raw_ocr_{page_number}_{idx}",
                        content=text,
                        element_type='raw_ocr_region',
                        page_number=page_number,
                        cell_position=(idx, 0)  # Store original index in cell_position
                    ))

        logger.info(
            f"Extracted {len(items)} translatable items from {total_regions} raw OCR regions (OCR Track)"
        )
        return items, total_regions

    def extract_translatable_elements(
        self,
-        result_json: Dict
        result_json: Dict,
        result_dir: Optional[Path] = None,
        task_id: Optional[str] = None
    ) -> Tuple[List[TranslatableItem], int]:
        """
        Extract all translatable elements from a result JSON.

        For OCR Track documents, extracts from raw_ocr_regions.json files.
        For Direct Track documents, extracts from elements in result JSON.

        Args:
            result_json: UnifiedDocument JSON data
            result_dir: Path to result directory (required for OCR Track)
            task_id: Task ID (required for OCR Track)

        Returns:
            Tuple of (list of TranslatableItem, total element count)
        """
        # Check processing track
        metadata = result_json.get('metadata', {})
        processing_track = metadata.get('processing_track', 'direct')

        # For OCR Track, use raw OCR regions
        if processing_track == 'ocr' and result_dir and task_id:
            return self.extract_translatable_elements_ocr_track(
                result_json, result_dir, task_id
            )

        # For Direct Track, use element-based extraction
        items = []
        total_elements = 0

@@ -290,7 +389,7 @@ class TranslationService:
            ))

        logger.info(
-            f"Extracted {len(items)} translatable items from {total_elements} elements"
            f"Extracted {len(items)} translatable items from {total_elements} elements (Direct Track)"
        )
        return items, total_elements

@@ -378,6 +477,7 @@ class TranslationService:
                original_content=item.content,
                translated_content=translated_content,
                element_type=item.element_type,
                page_number=item.page_number,
                cell_position=item.cell_position
            ))

@@ -392,6 +492,7 @@ class TranslationService:
                original_content=item.content,
                translated_content=item.content,  # Keep original
                element_type=item.element_type,
                page_number=item.page_number,
                cell_position=item.cell_position
            )
            for item in batch.items
@@ -429,6 +530,7 @@ class TranslationService:
                original_content=item.content,
                translated_content=response.translated_text,
                element_type=item.element_type,
                page_number=item.page_number,
                cell_position=item.cell_position
            )

@@ -440,6 +542,7 @@ class TranslationService:
                original_content=item.content,
                translated_content=item.content,  # Keep original
                element_type=item.element_type,
                page_number=item.page_number,
                cell_position=item.cell_position
            )

@@ -451,7 +554,8 @@ class TranslationService:
        target_lang: str,
        total_elements: int,
        processing_time: float,
-        batch_count: int
        batch_count: int,
        processing_track: str = 'direct'
    ) -> Dict:
        """
        Build the translation result JSON structure.
@@ -464,13 +568,59 @@ class TranslationService:
            total_elements: Total elements in document
            processing_time: Processing time in seconds
            batch_count: Number of batches used
            processing_track: 'ocr' or 'direct' - determines output format

        Returns:
            Translation result dictionary
        """
-        # Build translations dict
-        translations: Dict[str, Any] = {}
        total_chars = 0
        is_ocr_track = processing_track == 'ocr'

        if is_ocr_track:
            # OCR Track: Build raw_ocr_translations list
            raw_ocr_translations: List[Dict] = []

            for item in translated_items:
                total_chars += len(item.translated_content)

                if item.element_type == 'raw_ocr_region':
                    # Extract page and index from element_id: "raw_ocr_{page}_{idx}"
                    page_num = item.page_number
                    original_idx = item.cell_position[0] if item.cell_position else 0

                    raw_ocr_translations.append({
                        'page': page_num,
                        'index': original_idx,
                        'original': item.original_content,
                        'translated': item.translated_content
                    })

            # Build statistics
            skipped = total_elements - len(raw_ocr_translations)

            result = {
                'schema_version': '1.0.0',
                'source_document': source_document,
                'source_lang': source_lang,
                'target_lang': target_lang,
                'provider': 'dify',
                'translated_at': datetime.utcnow().isoformat() + 'Z',
                'processing_track': 'ocr',
                'statistics': {
                    'total_elements': total_elements,
                    'translated_elements': len(raw_ocr_translations),
                    'skipped_elements': skipped,
                    'total_characters': total_chars,
                    'processing_time_seconds': round(processing_time, 2),
                    'total_tokens': self._total_tokens,
                    'batch_count': batch_count
                },
                'translations': {},  # Empty for OCR Track
                'raw_ocr_translations': raw_ocr_translations
            }
        else:
            # Direct Track: Build translations dict (existing logic)
            translations: Dict[str, Any] = {}

            for item in translated_items:
                total_chars += len(item.translated_content)
@@ -548,9 +698,13 @@ class TranslationService:
            result_json = json.load(f)

        source_document = result_json.get('metadata', {}).get('filename', 'unknown')
        processing_track = result_json.get('metadata', {}).get('processing_track', 'direct')
        result_dir = result_json_path.parent

-        # Extract translatable elements
        # Extract translatable elements (passes result_dir and task_id for OCR Track)
-        items, total_elements = self.extract_translatable_elements(result_json)
        items, total_elements = self.extract_translatable_elements(
            result_json, result_dir, task_id
        )

        if not items:
            logger.warning("No translatable elements found")
@@ -597,7 +751,8 @@ class TranslationService:
            target_lang=target_lang,
            total_elements=total_elements,
            processing_time=processing_time,
-            batch_count=len(batches)
            batch_count=len(batches),
            processing_track=processing_track
        )

        # Save result
@@ -0,0 +1,51 @@
# Change: Fix OCR Track Reflow PDF

## Why

The OCR Track reflow PDF generation is missing most content because:

1. PP-StructureV3 extracts tables as elements but stores `content: ""` (empty string) instead of structured `content.cells` data
2. The `generate_reflow_pdf` method expects `content.cells` for tables, so tables are skipped
3. Table text exists in `raw_ocr_regions.json` (59 text blocks) but is not used by reflow PDF generation
4. This causes significant content loss: only 6 text elements are rendered vs 59 raw OCR regions available

The Layout PDF works correctly because it uses `raw_ocr_regions.json` via Simple Text Positioning mode, bypassing the need for structured table data.

## What Changes

### Reflow PDF Generation for OCR Track

Modify `generate_reflow_pdf` to use `raw_ocr_regions.json` as the primary text source for OCR Track documents:

1. **Detect processing track** from JSON metadata
2. **For OCR Track**: Load `raw_ocr_regions.json` and render all text blocks in reading order
3. **For Direct Track**: Continue using `content.cells` for tables (already works)
4. **Images/Charts**: Continue using `content.saved_path` from elements (works for both tracks)

### Data Flow

**OCR Track Reflow PDF (NEW):**
```
raw_ocr_regions.json (59 text blocks)
  + scan_result.json (images/charts only)
  → Sort by Y coordinate (reading order)
  → Render text paragraphs + images
```

**Direct Track Reflow PDF (UNCHANGED):**
```
*_result.json (elements with content.cells)
  → Render tables, text, images in order
```
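For illustration, a minimal sketch of the dispatch that drives these two flows, reduced from the `generate_reflow_pdf` changes in this commit. It only selects the text source per page; the real method also filters exclusion zones, sorts by Y coordinate, and renders ReportLab paragraphs and images. The `load_raw_ocr_regions` callable stands in for the existing helper from `text_region_renderer.py`.

```python
from pathlib import Path
from typing import Callable, Dict, List


def collect_reflow_text(json_data: Dict, result_dir: Path,
                        load_raw_ocr_regions: Callable[[str, str, int], List[Dict]]) -> List[str]:
    """Sketch: pick the text source for each page based on metadata.processing_track."""
    metadata = json_data.get('metadata', {})
    is_ocr_track = metadata.get('processing_track', 'direct') == 'ocr'
    task_id = result_dir.name  # directory name is the task_id

    texts: List[str] = []
    for page_idx, page_data in enumerate(json_data.get('pages', [])):
        page_num = page_idx + 1
        if is_ocr_track:
            # OCR Track: raw_ocr_regions.json is the primary text source
            regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)
            texts.extend(r.get('text', '') for r in regions if r.get('text'))
        else:
            # Direct Track: elements (including content.cells for tables) remain the source
            for elem in page_data.get('elements', []):
                content = elem.get('content')
                if isinstance(content, str) and content:
                    texts.append(content)
    return texts
```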

## Impact

- **Affected file**: `backend/app/services/pdf_generator_service.py`
- **User experience**: OCR Track reflow PDF will contain all text content (matching Layout PDF)
- **Translation**: Reflow translated PDF will also work correctly for OCR Track

## Migration

- No data migration required
- Existing `raw_ocr_regions.json` files contain all necessary data
- No API changes
@@ -0,0 +1,23 @@
## MODIFIED Requirements

### Requirement: Enhanced PDF Export with Layout Preservation

The PDF export SHALL accurately preserve document layout from both OCR and direct extraction tracks with correct coordinate transformation and multi-page support. For Direct Track, a background image rendering approach SHALL be used for visual fidelity.

#### Scenario: OCR Track reflow PDF uses raw OCR regions
- **WHEN** generating reflow PDF for an OCR Track document
- **THEN** the system SHALL load text content from `raw_ocr_regions.json` files
- **AND** text blocks SHALL be sorted by Y coordinate for reading order
- **AND** all text content SHALL match the Layout PDF output
- **AND** images and charts SHALL be embedded from element `saved_path`

#### Scenario: Direct Track reflow PDF uses structured content
- **WHEN** generating reflow PDF for a Direct Track document
- **THEN** the system SHALL use `content.cells` for table rendering
- **AND** text elements SHALL use `content` string directly
- **AND** images and charts SHALL be embedded from element `saved_path`

#### Scenario: Reflow PDF content consistency
- **WHEN** comparing Layout PDF and Reflow PDF for the same document
- **THEN** both PDFs SHALL contain the same text content
- **AND** only the presentation format SHALL differ (positioned vs flowing)
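For reference, the reading-order sort required by the first scenario can be a single key function over the quadrilateral bboxes stored in `raw_ocr_regions.json`. The sketch below mirrors the `get_y_coord` helper added in this commit; it is illustrative, not the production code.

```python
from typing import Dict, List


def sort_regions_by_reading_order(regions: List[Dict]) -> List[Dict]:
    """Sort raw OCR regions top-to-bottom using the average Y of the two top corners."""

    def get_y_coord(region: Dict) -> float:
        bbox = region.get('bbox', [])
        if bbox and len(bbox) >= 4:
            # bbox is [[x0, y0], [x1, y1], [x2, y2], [x3, y3]]
            return (bbox[0][1] + bbox[1][1]) / 2
        return 0.0

    return sorted(regions, key=get_y_coord)
```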
@@ -0,0 +1,51 @@
# Tasks: Fix OCR Track Reflow PDF

## 1. Modify generate_reflow_pdf Method

- [x] 1.1 Add processing track detection
  - File: `backend/app/services/pdf_generator_service.py`
  - Location: `generate_reflow_pdf` method (line ~4704)
  - Read `metadata.processing_track` from JSON data
  - Branch logic based on track type

- [x] 1.2 Add helper function to load raw OCR regions
  - File: `backend/app/services/pdf_generator_service.py`
  - Using existing: `load_raw_ocr_regions` from `text_region_renderer.py`
  - Pattern: `{task_id}_*_page_{page_num}_raw_ocr_regions.json`
  - Return: List of text regions with bbox and content

- [x] 1.3 Implement OCR Track reflow rendering
  - File: `backend/app/services/pdf_generator_service.py`
  - For OCR Track: Load raw OCR regions per page
  - Sort text blocks by Y coordinate (top to bottom reading order)
  - Render text blocks as paragraphs
  - Still render images/charts from elements

- [x] 1.4 Keep Direct Track logic unchanged
  - File: `backend/app/services/pdf_generator_service.py`
  - Direct Track continues using `content.cells` for tables
  - Extracted to `_render_reflow_elements` helper method
  - No changes to existing Direct Track flow

## 2. Handle Multi-page Documents

- [x] 2.1 Support per-page raw OCR files
  - Pattern: `{task_id}_*_page_{page_num}_raw_ocr_regions.json`
  - Iterate through pages and load the corresponding raw OCR file
  - Handle missing files gracefully (fall back to elements, see the loader sketch below)
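A sketch of the per-page lookup described above, assuming the file-naming pattern holds. It mirrors the `_load_raw_ocr_regions` helper added to the translation service in this commit; the function name here is illustrative only.

```python
import glob
import json
from pathlib import Path
from typing import Dict, List


def load_page_regions(result_dir: Path, task_id: str, page_num: int) -> List[Dict]:
    """Find and load the raw OCR regions file for one page; return [] if missing."""
    pattern = str(result_dir / f"{task_id}_*_page_{page_num}_raw_ocr_regions.json")
    matches = glob.glob(pattern)
    if not matches:
        return []  # caller falls back to element-based rendering
    with open(matches[0], 'r', encoding='utf-8') as f:
        return json.load(f)
```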

## 3. Testing

- [x] 3.1 Test OCR Track reflow PDF
  - Test with: `a9259180-fc49-4890-8184-2e6d5f4edad3` (scan document)
  - Verify: All 59 text blocks appear in reflow PDF
  - Verify: Images are embedded correctly

- [x] 3.2 Test Direct Track reflow PDF
  - Test with: `1b32428d-0609-4cfd-bc52-56be6956ac2e` (editable PDF)
  - Verify: Tables render with cells
  - Verify: No regression from changes

- [x] 3.3 Test translated reflow PDF
  - Test: Complete translation then download reflow PDF
  - Verify: Translated text appears correctly
@@ -0,0 +1,70 @@
# Change: Fix OCR Track Translation

## Why

OCR Track translation is missing most content because:

1. The translation service (`extract_translatable_elements`) only processes elements from `scan_result.json`
2. OCR Track tables have `content: ""` (empty string) and no `content.cells` data
3. All table text exists in `raw_ocr_regions.json` (59 text blocks) but the translation service ignores it
4. Result: only 6 text elements are translated vs 59 raw OCR regions available

**Current Data Flow (OCR Track):**
```
scan_result.json (10 elements, 6 text, 2 empty tables)
  → Translation extracts 6 text items
  → 53 text blocks in tables are NOT translated
```

**Expected Data Flow (OCR Track):**
```
raw_ocr_regions.json (59 text blocks)
  → Translation extracts ALL 59 text items
  → Complete translation coverage
```

## What Changes

### 1. Translation Service Enhancement

Modify `translate_document` in `translation_service.py` to:

1. **Detect processing track** from result JSON metadata
2. **For OCR Track**: Load and translate `raw_ocr_regions.json` instead of elements
3. **For Direct Track**: Continue using elements with `content.cells` (already works)

### 2. Translation Result Format for OCR Track

Add a new field `raw_ocr_translations` to the translation JSON for OCR Track:

```json
{
  "translations": { ... },          // element-based (for Direct Track)
  "raw_ocr_translations": [         // NEW: for OCR Track
    {
      "page": 1,
      "index": 0,
      "original": "华天科技(宝鸡)有限公司",
      "translated": "Huatian Technology (Baoji) Co., Ltd."
    },
    ...
  ]
}
```
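A minimal sketch of how a consumer applies this format: build a lookup keyed by `(page, index)` and fall back to the original OCR text when a region has no translation. This is the same approach `_generate_translated_pdf_ocr_track` takes in this commit; the function and parameter names below are illustrative.

```python
from typing import Dict, List, Tuple


def apply_raw_ocr_translations(
    raw_regions_by_page: Dict[int, List[Dict]],
    raw_ocr_translations: List[Dict],
) -> Dict[int, List[str]]:
    """Return translated text per page, falling back to the original OCR text."""
    lookup: Dict[Tuple[int, int], str] = {
        (t.get('page', 1), t.get('index', 0)): t.get('translated', '')
        for t in raw_ocr_translations
        if t.get('translated')
    }

    rendered: Dict[int, List[str]] = {}
    for page_num, regions in raw_regions_by_page.items():
        rendered[page_num] = [
            lookup.get((page_num, idx), region.get('text', ''))  # fallback to original
            for idx, region in enumerate(regions)
        ]
    return rendered
```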

### 3. Translated PDF Generation

Modify `generate_translated_pdf` to use `raw_ocr_translations` when available for OCR Track documents.

## Impact

- **Affected files**:
  - `backend/app/services/translation_service.py` - extraction and translation logic
  - `backend/app/services/pdf_generator_service.py` - translated PDF rendering
- **User experience**: OCR Track translations will include ALL text content
- **API**: Translation JSON format extended (backward compatible)

## Migration

- No data migration required
- Existing translations continue to work (Direct Track unaffected)
- Re-translation is needed for OCR Track documents to get full coverage
@@ -0,0 +1,56 @@
# translation Specification Delta

## MODIFIED Requirements

### Requirement: Translation Content Extraction

The translation service SHALL extract content based on processing track type.

#### Scenario: OCR Track translation extraction
- **GIVEN** a document processed with OCR Track
- **AND** the result JSON has `metadata.processing_track = "ocr"`
- **WHEN** translation service extracts translatable content
- **THEN** it SHALL load `raw_ocr_regions.json` for each page
- **AND** it SHALL extract all text blocks from raw OCR regions
- **AND** it SHALL NOT rely on `content.cells` from table elements

#### Scenario: Direct Track translation extraction (unchanged)
- **GIVEN** a document processed with Direct Track
- **AND** the result JSON has `metadata.processing_track = "direct"` or no track specified
- **WHEN** translation service extracts translatable content
- **THEN** it SHALL extract from `pages[].elements[]` in result JSON
- **AND** it SHALL extract table cell content from `content.cells`
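A condensed sketch of the OCR Track branch of this extraction. The real code builds `TranslatableItem` objects via `extract_translatable_elements_ocr_track`; plain dicts are used here only to keep the snippet self-contained.

```python
from typing import Dict, List, Tuple


def extract_ocr_track_items(pages_regions: Dict[int, List[Dict]]) -> Tuple[List[Dict], int]:
    """Build translatable items from per-page raw OCR regions (element-based path unchanged)."""
    items: List[Dict] = []
    total = 0
    for page_number, regions in pages_regions.items():
        for idx, region in enumerate(regions):
            total += 1
            text = region.get('text', '').strip()
            if text:
                items.append({
                    'element_id': f"raw_ocr_{page_number}_{idx}",
                    'content': text,
                    'element_type': 'raw_ocr_region',
                    'page_number': page_number,
                    'index': idx,  # kept so translations can be matched back by (page, index)
                })
    return items, total
```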

### Requirement: Translation Result Format

The translation result JSON SHALL support both element-based and raw OCR translations.

#### Scenario: OCR Track translation result format
- **GIVEN** an OCR Track document has been translated
- **WHEN** translation result is saved
- **THEN** the JSON SHALL include `raw_ocr_translations` array
- **AND** each item SHALL have `index`, `original`, and `translated` fields
- **AND** the `translations` object MAY be empty or contain header text translations

#### Scenario: Direct Track translation result format (unchanged)
- **GIVEN** a Direct Track document has been translated
- **WHEN** translation result is saved
- **THEN** the JSON SHALL use `translations` object mapping element_id to translated text
- **AND** `raw_ocr_translations` field SHALL NOT be present

### Requirement: Translated PDF Generation

The translated PDF generation SHALL use the appropriate translation source based on processing track.

#### Scenario: OCR Track translated PDF generation
- **GIVEN** an OCR Track document with translations
- **AND** the translation JSON contains `raw_ocr_translations`
- **WHEN** generating translated reflow PDF
- **THEN** it SHALL apply translations from `raw_ocr_translations` by index
- **AND** it SHALL render all translated text blocks in reading order

#### Scenario: Direct Track translated PDF generation (unchanged)
- **GIVEN** a Direct Track document with translations
- **WHEN** generating translated reflow PDF
- **THEN** it SHALL apply translations from `translations` object by element_id
- **AND** existing behavior SHALL be unchanged
@@ -0,0 +1,76 @@
# Tasks: Fix OCR Track Translation

## 1. Modify Translation Service

- [x] 1.1 Add processing track detection
  - File: `backend/app/services/translation_service.py`
  - Location: `translate_document` method
  - Read `metadata.processing_track` from result JSON
  - Pass track type to extraction method

- [x] 1.2 Create helper to load raw OCR regions
  - File: `backend/app/services/translation_service.py`
  - Function: `_load_raw_ocr_regions(result_dir, task_id, page_num)`
  - Pattern: `{task_id}_*_page_{page_num}_raw_ocr_regions.json`
  - Return: List of text regions with index and content

- [x] 1.3 Modify extract_translatable_elements for OCR Track
  - File: `backend/app/services/translation_service.py`
  - Added: `extract_translatable_elements_ocr_track` method
  - Added parameters: `result_dir: Path`, `task_id: str`
  - For OCR Track: Extract from raw_ocr_regions.json
  - For Direct Track: Keep existing element-based extraction

- [x] 1.4 Update translation result format
  - File: `backend/app/services/translation_service.py`
  - Location: `build_translation_result` method
  - Added `processing_track` parameter
  - For OCR Track: Output `raw_ocr_translations` field
  - Structure: `[{"page": 1, "index": 0, "original": "...", "translated": "..."}]`

## 2. Modify PDF Generation

- [x] 2.1 Update generate_translated_pdf for OCR Track
  - File: `backend/app/services/pdf_generator_service.py`
  - Detect `processing_track` and `raw_ocr_translations` from translation JSON
  - For OCR Track: Call `_generate_translated_pdf_ocr_track`
  - For Direct Track: Continue using `apply_translations` (element-based)

- [x] 2.2 Create helper to apply raw OCR translations
  - File: `backend/app/services/pdf_generator_service.py`
  - Function: `_generate_translated_pdf_ocr_track`
  - Build translation lookup: `{(page, index): translated_text}`
  - Load raw OCR regions, sort by Y coordinate
  - Render translated text with original fallback

## 3. Additional Fixes

- [x] 3.1 Add page_number to TranslatedItem
  - File: `backend/app/schemas/translation.py`
  - Added `page_number: int = 1` to the TranslatedItem dataclass
  - Updated `translate_batch` and `translate_item` to pass page_number

- [x] 3.2 Update API endpoint validation
  - File: `backend/app/routers/translate.py`
  - Check for both `translations` (Direct Track) and `raw_ocr_translations` (OCR Track)

- [x] 3.3 Filter text overlapping with images
  - File: `backend/app/services/pdf_generator_service.py`
  - Added `_collect_exclusion_zones`, `_is_region_overlapping_exclusion`, `_filter_regions_by_exclusion`
  - Applied filtering in `generate_reflow_pdf` and `_generate_translated_pdf_ocr_track` (see the overlap-check sketch below)
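The overlap test behind task 3.3 is a plain Intersection-over-Area check against the collected image boxes. A standalone sketch of the same math used by `_is_region_overlapping_exclusion` (names here are illustrative):

```python
from typing import List, Tuple

Zone = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def overlaps_exclusion(region_bbox: List[List[float]], zones: List[Zone],
                       ioa_threshold: float = 0.3) -> bool:
    """True if the quadrilateral text region covers an image zone beyond the IoA threshold."""
    if not zones or not region_bbox or len(region_bbox) < 4:
        return False

    xs = [p[0] for p in region_bbox]
    ys = [p[1] for p in region_bbox]
    tx0, ty0, tx1, ty1 = min(xs), min(ys), max(xs), max(ys)
    text_area = (tx1 - tx0) * (ty1 - ty0)
    if text_area <= 0:
        return False

    for zx0, zy0, zx1, zy1 in zones:
        ix0, iy0 = max(tx0, zx0), max(ty0, zy0)
        ix1, iy1 = min(tx1, zx1), min(ty1, zy1)
        if ix1 > ix0 and iy1 > iy0:
            if (ix1 - ix0) * (iy1 - iy0) / text_area >= ioa_threshold:
                return True
    return False
```

For example, a 10 by 5 text box whose left 8 units sit inside an image zone has IoA 40/50 = 0.8, which exceeds the default 0.3 threshold, so the region is dropped from the reflow output.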

## 4. Testing

- [x] 4.1 Test OCR Track translation
  - Test with: `f8265449-6cb7-425d-a213-5d2e1af73955`
  - Verify: All 59 text blocks are sent for translation
  - Verify: Translation JSON contains `raw_ocr_translations`

- [x] 4.2 Test OCR Track translated PDF
  - Generate translated reflow PDF
  - Verify: All translated text blocks appear correctly
  - Verify: Text inside images (like EWsenel) is filtered out

- [x] 4.3 Test Direct Track unchanged
  - Verify: Translation still uses element-based approach
  - Verify: No regression in Direct Track flow
@@ -58,36 +58,23 @@ Export settings (format, thresholds, templates) SHALL apply consistently to V2 t
The PDF export SHALL accurately preserve document layout from both OCR and direct extraction tracks with correct coordinate transformation and multi-page support. For Direct Track, a background image rendering approach SHALL be used for visual fidelity.

-#### Scenario: Export PDF from direct extraction track
-- **WHEN** exporting PDF from a direct-extraction processed document
-- **THEN** the system SHALL render source PDF pages as full-page background images at 2x resolution
-- **AND** overlay invisible text elements using PDF Text Rendering Mode 3
-- **AND** text SHALL remain selectable and searchable despite being invisible
-- **AND** visual output SHALL match source document exactly
#### Scenario: OCR Track reflow PDF uses raw OCR regions
- **WHEN** generating reflow PDF for an OCR Track document
- **THEN** the system SHALL load text content from `raw_ocr_regions.json` files
- **AND** text blocks SHALL be sorted by Y coordinate for reading order
- **AND** all text content SHALL match the Layout PDF output
- **AND** images and charts SHALL be embedded from element `saved_path`

-#### Scenario: Export PDF from OCR track with full structure
-- **WHEN** exporting PDF from OCR-processed document
-- **THEN** the PDF SHALL use all 23 PP-StructureV3 element types
-- **AND** render tables with proper cell boundaries
-- **AND** maintain reading order from parsing_res_list
#### Scenario: Direct Track reflow PDF uses structured content
- **WHEN** generating reflow PDF for a Direct Track document
- **THEN** the system SHALL use `content.cells` for table rendering
- **AND** text elements SHALL use `content` string directly
- **AND** images and charts SHALL be embedded from element `saved_path`

-#### Scenario: Handle coordinate transformations correctly
-- **WHEN** generating PDF from UnifiedDocument
-- **THEN** system SHALL use explicit page dimensions from OCR results (not inferred from bounding boxes)
-- **AND** correctly transform Y-axis coordinates from top-left (OCR) to bottom-left (PDF/ReportLab) origin
-- **AND** prevent vertical flipping or position misalignment errors
-
-#### Scenario: Direct Track PDF file size increase
-- **WHEN** generating Layout PDF for Direct Track documents
-- **THEN** the system SHALL accept increased file size due to embedded page images
-- **AND** approximately 1-2 MB per page at 2x resolution is expected
-- **AND** this trade-off is accepted for improved visual fidelity
-
-#### Scenario: Chart elements excluded from text layer
-- **WHEN** generating Layout PDF containing charts
-- **THEN** the system SHALL NOT include chart-internal text in the invisible text layer
-- **AND** chart visuals SHALL be preserved in the background image
-- **AND** chart text SHALL NOT be available for text selection or translation
#### Scenario: Reflow PDF content consistency
- **WHEN** comparing Layout PDF and Reflow PDF for the same document
- **THEN** both PDFs SHALL contain the same text content
- **AND** only the presentation format SHALL differ (positioned vs flowing)

### Requirement: Structure Data Export
The system SHALL provide export formats that preserve document structure for downstream processing.