debug_translation_flow.py (new file, 176 lines)
@@ -0,0 +1,176 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug the complete translation flow to find where translations are lost
"""

import sys
import os

# Fix encoding for Windows console
if sys.stdout.encoding != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'utf-8':
    sys.stderr.reconfigure(encoding='utf-8')

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
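# NOTE: when the script is run directly, Python also puts this file's own
# directory on sys.path, so the `from app import ...` imports below resolve
# against the sibling `app` package; the insert above additionally exposes
# modules inside app/ as top-level names (an assumption about the layout).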

from app import create_app
from app.services.document_processor import DocumentProcessor
from app.services.dify_client import DifyClient
from pathlib import Path


def debug_translation_flow():
    """Debug the complete translation flow"""

    app = create_app()

    with app.app_context():
        # Use the actual job file
        job_file_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\8cada04e-da42-4416-af46-f01cca5a452f\original_-OR026_8cada04e.docx"

        if not Path(job_file_path).exists():
            print(f"ERROR: Job file does not exist: {job_file_path}")
            return

        print("=== DEBUGGING TRANSLATION FLOW ===")
        print(f"File: {job_file_path}")

        # Step 1: Extract segments
        print("\n1. EXTRACTING SEGMENTS...")
        processor = DocumentProcessor()
        segments = processor.extract_docx_segments(job_file_path)

        translatable_segments = []
        for seg in segments:
            if processor.should_translate_text(seg.text, 'auto'):
                translatable_segments.append(seg)

        print(f"Total segments: {len(segments)}")
        print(f"Translatable segments: {len(translatable_segments)}")
        print("First 3 translatable segments:")
        for i, seg in enumerate(translatable_segments[:3]):
            print(f"  {i+1}. {repr(seg.text[:50])}")

        # Step 2: Test Dify translation on first few segments
        print("\n2. TESTING DIFY TRANSLATIONS...")
        dify_client = DifyClient()
        translation_map = {}
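        # translation_map is keyed by (target_language, source_text) tuples;
        # the matching check in step 3.1 below tests membership with exactly
        # these keys, so any difference in the source text (even whitespace)
        # shows up there as "NO MATCH".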
        target_languages = ['en', 'vi']

        for target_lang in target_languages:
            print(f"\nTesting translation to {target_lang}:")

            for seg in translatable_segments[:3]:  # Test first 3
                try:
                    print(f"  Translating: {repr(seg.text)}")

                    result = dify_client.translate_text(
                        text=seg.text,
                        source_language='zh-cn',
                        target_language=target_lang,
                        user_id=1,
                        job_id=1
                    )

                    translated_text = result.get('translated_text', '')
                    translation_map[(target_lang, seg.text)] = translated_text

                    print(f"  Result: {repr(translated_text)}")
                    # bool(...) so this prints True/False instead of the stripped text
                    print(f"  Success: {bool(translated_text.strip()) and translated_text != seg.text}")

                except Exception as e:
                    print(f"  ERROR: {e}")
                    translation_map[(target_lang, seg.text)] = f"[ERROR] {seg.text}"

        # Step 3: Test translation insertion
        print("\n3. TESTING TRANSLATION INSERTION...")
        print(f"Translation map entries: {len(translation_map)}")

        for key, value in list(translation_map.items())[:6]:
            lang, source = key
            print(f"  {lang} | {repr(source[:30])} -> {repr(value[:30])}")

        # Debug: Check which segments will be matched
        print("\n3.1. SEGMENT MATCHING DEBUG...")
        target_langs_for_test = ['en']
        matched_count = 0

        for i, seg in enumerate(segments[:10]):  # Check first 10 segments
            has_translation = any((tgt, seg.text) in translation_map for tgt in target_langs_for_test)
            status = "MATCH" if has_translation else "NO MATCH"
            print(f"  Segment {i+1}: {status} | {repr(seg.text[:40])}")
            if has_translation:
                matched_count += 1
                for tgt in target_langs_for_test:
                    if (tgt, seg.text) in translation_map:
                        translation = translation_map[(tgt, seg.text)]
                        print(f"    -> {tgt}: {repr(translation[:40])}")

        print(f"Segments that will match: {matched_count}/10 (in first 10)")

        # Step 4: Check translation cache for real job data
        print("\n4. CHECKING TRANSLATION CACHE...")
        from app.models.cache import TranslationCache

        # Check if there are any cached translations for the segments
        cache_hits = 0
        cache_misses = 0

        for seg in translatable_segments[:5]:  # Check first 5
            for target_lang in ['en', 'vi']:
                cached = TranslationCache.get_translation(
                    text=seg.text,
                    source_language='zh-cn',
                    target_language=target_lang
                )
                if cached:
                    print(f"  CACHE HIT: {target_lang} | {repr(seg.text[:30])} -> {repr(cached[:30])}")
                    cache_hits += 1
                else:
                    cache_misses += 1

        print(f"Cache hits: {cache_hits}, Cache misses: {cache_misses}")

        # Create test output file
        output_path = str(Path(job_file_path).parent / "flow_debug_translated.docx")

        try:
            ok_count, skip_count = processor.insert_docx_translations(
                file_path=job_file_path,
                segments=segments,
                translation_map=translation_map,
                target_languages=['en'],  # Test with one language first
                output_path=output_path
            )

            print(f"Translation insertion: {ok_count} OK, {skip_count} skipped")

            if Path(output_path).exists():
                print(f"✅ Output file created: {Path(output_path).stat().st_size} bytes")

                # Verify the output contains translations
                test_segments = processor.extract_docx_segments(output_path)
                print(f"Output file segments: {len(test_segments)}")

                # Look for evidence of translations
                translation_evidence = []
                for seg in test_segments:
                    # Check if segment text appears to be a translation
                    if any(word in seg.text.lower() for word in ['purpose', 'equipment', 'maintenance', 'check']):
                        translation_evidence.append(seg.text[:50])

                print(f"Translation evidence found: {len(translation_evidence)} segments")
                for evidence in translation_evidence[:3]:
                    print(f"  - {repr(evidence)}")

            else:
                print("❌ Output file was not created")

        except Exception as e:
            print(f"ERROR during insertion: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    debug_translation_flow()