#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Debug the complete translation flow to find where translations are lost."""
import sys
import os
from pathlib import Path

# Fix encoding for Windows console
if sys.stdout.encoding != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'utf-8':
    sys.stderr.reconfigure(encoding='utf-8')

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))

from app import create_app
from app.services.document_processor import DocumentProcessor
from app.services.dify_client import DifyClient


def debug_translation_flow():
    """Debug the complete translation flow."""
    app = create_app()

    with app.app_context():
        # Use the actual job file
        job_file_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\8cada04e-da42-4416-af46-f01cca5a452f\original_-OR026_8cada04e.docx"

        if not Path(job_file_path).exists():
            print(f"ERROR: Job file does not exist: {job_file_path}")
            return

        print("=== DEBUGGING TRANSLATION FLOW ===")
        print(f"File: {job_file_path}")

        # Step 1: Extract segments and filter down to the translatable ones
        print("\n1. EXTRACTING SEGMENTS...")
        processor = DocumentProcessor()
        segments = processor.extract_docx_segments(job_file_path)

        translatable_segments = [
            seg for seg in segments
            if processor.should_translate_text(seg.text, 'auto')
        ]

        print(f"Total segments: {len(segments)}")
        print(f"Translatable segments: {len(translatable_segments)}")
        print("First 3 translatable segments:")
        for i, seg in enumerate(translatable_segments[:3]):
            print(f"  {i + 1}. {repr(seg.text[:50])}")

        # Step 2: Test Dify translation on the first few segments
        print("\n2. TESTING DIFY TRANSLATIONS...")
        dify_client = DifyClient()
        translation_map = {}
        target_languages = ['en', 'vi']

        for target_lang in target_languages:
            print(f"\nTesting translation to {target_lang}:")
            for seg in translatable_segments[:3]:  # Test first 3
                try:
                    print(f"  Translating: {repr(seg.text)}")
                    result = dify_client.translate_text(
                        text=seg.text,
                        source_language='zh-cn',
                        target_language=target_lang,
                        user_id=1,
                        job_id=1
                    )
                    translated_text = result.get('translated_text', '')
                    translation_map[(target_lang, seg.text)] = translated_text
                    print(f"  Result: {repr(translated_text)}")
                    # A translation "succeeded" if it is non-empty and differs from the source
                    print(f"  Success: {bool(translated_text.strip() and translated_text != seg.text)}")
                except Exception as e:
                    print(f"  ERROR: {e}")
                    translation_map[(target_lang, seg.text)] = f"[ERROR] {seg.text}"

        # Step 3: Test translation insertion
        print("\n3. TESTING TRANSLATION INSERTION...")
        print(f"Translation map entries: {len(translation_map)}")
        for (lang, source), value in list(translation_map.items())[:6]:
            print(f"  {lang} | {repr(source[:30])} -> {repr(value[:30])}")

        # Debug: Check which segments will be matched
        print("\n3.1. SEGMENT MATCHING DEBUG...")
        target_langs_for_test = ['en']
        matched_count = 0
        for i, seg in enumerate(segments[:10]):  # Check first 10 segments
            has_translation = any((tgt, seg.text) in translation_map for tgt in target_langs_for_test)
            status = "MATCH" if has_translation else "NO MATCH"
            print(f"  Segment {i + 1}: {status} | {repr(seg.text[:40])}")
            if has_translation:
                matched_count += 1
                for tgt in target_langs_for_test:
                    if (tgt, seg.text) in translation_map:
                        translation = translation_map[(tgt, seg.text)]
                        print(f"    -> {tgt}: {repr(translation[:40])}")

        print(f"Segments that will match: {matched_count}/10 (in first 10)")
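
        # NOTE (inferred from how translation_map is built in step 2): the map is
        # keyed on the exact (target_lang, seg.text) tuple, so a segment only
        # counts as a MATCH when its text is byte-identical to what was sent to
        # Dify. Any whitespace or normalization difference between extraction and
        # insertion will surface here as NO MATCH even though a translation exists.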

        # Step 4: Check translation cache for real job data
        print("\n4. CHECKING TRANSLATION CACHE...")
        from app.models.cache import TranslationCache

        # Check if there are any cached translations for the segments
        cache_hits = 0
        cache_misses = 0
        for seg in translatable_segments[:5]:  # Check first 5
            for target_lang in ['en', 'vi']:
                cached = TranslationCache.get_translation(
                    text=seg.text,
                    source_language='zh-cn',
                    target_language=target_lang
                )
                if cached:
                    print(f"  CACHE HIT: {target_lang} | {repr(seg.text[:30])} -> {repr(cached[:30])}")
                    cache_hits += 1
                else:
                    cache_misses += 1

        print(f"Cache hits: {cache_hits}, Cache misses: {cache_misses}")

        # Step 5: Create a test output file and verify it contains translations
        output_path = str(Path(job_file_path).parent / "flow_debug_translated.docx")

        try:
            ok_count, skip_count = processor.insert_docx_translations(
                file_path=job_file_path,
                segments=segments,
                translation_map=translation_map,
                target_languages=['en'],  # Test with one language first
                output_path=output_path
            )
            print(f"Translation insertion: {ok_count} OK, {skip_count} skipped")

            if Path(output_path).exists():
                print(f"✅ Output file created: {Path(output_path).stat().st_size} bytes")

                # Verify the output contains translations
                test_segments = processor.extract_docx_segments(output_path)
                print(f"Output file segments: {len(test_segments)}")

                # Look for English words as rough evidence that translations were inserted
                translation_evidence = []
                for seg in test_segments:
                    if any(word in seg.text.lower() for word in ['purpose', 'equipment', 'maintenance', 'check']):
                        translation_evidence.append(seg.text[:50])

                print(f"Translation evidence found: {len(translation_evidence)} segments")
                for evidence in translation_evidence[:3]:
                    print(f"  - {repr(evidence)}")
            else:
                print("❌ Output file was not created")

        except Exception as e:
            print(f"ERROR during insertion: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    debug_translation_flow()
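
# --- Hedged sketch, not part of the original debug flow ---
# If step 3.1 keeps reporting NO MATCH for segments that were clearly translated,
# one hypothesis worth testing is that exact-text keys are too strict. The helper
# below (name and approach are my own, not from DocumentProcessor) rebuilds the
# map with whitespace-collapsed source text so lookups tolerate spacing
# differences introduced between extraction and insertion. Lookups would then
# need the same " ".join(seg.text.split()) normalization on the segment side.
def build_whitespace_tolerant_map(translation_map):
    """Return a copy of translation_map keyed on (lang, space-collapsed text)."""
    tolerant = {}
    for (lang, source), translated in translation_map.items():
        tolerant[(lang, " ".join(source.split()))] = translated
    return tolerant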