debug_translation_flow.py (new file, 176 lines)
@@ -0,0 +1,176 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug the complete translation flow to find where translations are lost
"""

import sys
import os

# Fix encoding for Windows console
if sys.stdout.encoding != 'utf-8':
    sys.stdout.reconfigure(encoding='utf-8')
if sys.stderr.encoding != 'utf-8':
    sys.stderr.reconfigure(encoding='utf-8')

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
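# NOTE: when the script is run directly, Python also puts this file's own
# directory on sys.path, so the `from app import ...` imports below resolve
# against the sibling `app` package; the insert above additionally exposes
# modules inside app/ as top-level names (an assumption about the layout).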

from app import create_app
from app.services.document_processor import DocumentProcessor
from app.services.dify_client import DifyClient
from pathlib import Path


def debug_translation_flow():
    """Debug the complete translation flow"""

    app = create_app()

    with app.app_context():
        # Use the actual job file
        job_file_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\8cada04e-da42-4416-af46-f01cca5a452f\original_-OR026_8cada04e.docx"

        if not Path(job_file_path).exists():
            print(f"ERROR: Job file does not exist: {job_file_path}")
            return

        print("=== DEBUGGING TRANSLATION FLOW ===")
        print(f"File: {job_file_path}")

        # Step 1: Extract segments
        print("\n1. EXTRACTING SEGMENTS...")
        processor = DocumentProcessor()
        segments = processor.extract_docx_segments(job_file_path)

        translatable_segments = []
        for seg in segments:
            if processor.should_translate_text(seg.text, 'auto'):
                translatable_segments.append(seg)

        print(f"Total segments: {len(segments)}")
        print(f"Translatable segments: {len(translatable_segments)}")
        print("First 3 translatable segments:")
        for i, seg in enumerate(translatable_segments[:3]):
            print(f"  {i+1}. {repr(seg.text[:50])}")

        # Step 2: Test Dify translation on first few segments
        print("\n2. TESTING DIFY TRANSLATIONS...")
        dify_client = DifyClient()
        translation_map = {}
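        # translation_map is keyed by (target_language, source_text) tuples;
        # the matching check in step 3.1 below tests membership with exactly
        # these keys, so any difference in the source text (even whitespace)
        # shows up there as "NO MATCH".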
        target_languages = ['en', 'vi']

        for target_lang in target_languages:
            print(f"\nTesting translation to {target_lang}:")

            for seg in translatable_segments[:3]:  # Test first 3
                try:
                    print(f"  Translating: {repr(seg.text)}")

                    result = dify_client.translate_text(
                        text=seg.text,
                        source_language='zh-cn',
                        target_language=target_lang,
                        user_id=1,
                        job_id=1
                    )

                    translated_text = result.get('translated_text', '')
                    translation_map[(target_lang, seg.text)] = translated_text

                    print(f"  Result: {repr(translated_text)}")
                    # bool(...) so this prints True/False instead of the stripped text
                    print(f"  Success: {bool(translated_text.strip()) and translated_text != seg.text}")

                except Exception as e:
                    print(f"  ERROR: {e}")
                    translation_map[(target_lang, seg.text)] = f"[ERROR] {seg.text}"

        # Step 3: Test translation insertion
        print("\n3. TESTING TRANSLATION INSERTION...")
        print(f"Translation map entries: {len(translation_map)}")

        for key, value in list(translation_map.items())[:6]:
            lang, source = key
            print(f"  {lang} | {repr(source[:30])} -> {repr(value[:30])}")

        # Debug: Check which segments will be matched
        print("\n3.1. SEGMENT MATCHING DEBUG...")
        target_langs_for_test = ['en']
        matched_count = 0

        for i, seg in enumerate(segments[:10]):  # Check first 10 segments
            has_translation = any((tgt, seg.text) in translation_map for tgt in target_langs_for_test)
            status = "MATCH" if has_translation else "NO MATCH"
            print(f"  Segment {i+1}: {status} | {repr(seg.text[:40])}")
            if has_translation:
                matched_count += 1
                for tgt in target_langs_for_test:
                    if (tgt, seg.text) in translation_map:
                        translation = translation_map[(tgt, seg.text)]
                        print(f"    -> {tgt}: {repr(translation[:40])}")

        print(f"Segments that will match: {matched_count}/10 (in first 10)")

        # Step 4: Check translation cache for real job data
        print("\n4. CHECKING TRANSLATION CACHE...")
        from app.models.cache import TranslationCache

        # Check if there are any cached translations for the segments
        cache_hits = 0
        cache_misses = 0

        for seg in translatable_segments[:5]:  # Check first 5
            for target_lang in ['en', 'vi']:
                cached = TranslationCache.get_translation(
                    text=seg.text,
                    source_language='zh-cn',
                    target_language=target_lang
                )
                if cached:
                    print(f"  CACHE HIT: {target_lang} | {repr(seg.text[:30])} -> {repr(cached[:30])}")
                    cache_hits += 1
                else:
                    cache_misses += 1

        print(f"Cache hits: {cache_hits}, Cache misses: {cache_misses}")

        # Create test output file
        output_path = str(Path(job_file_path).parent / "flow_debug_translated.docx")

        try:
            ok_count, skip_count = processor.insert_docx_translations(
                file_path=job_file_path,
                segments=segments,
                translation_map=translation_map,
                target_languages=['en'],  # Test with one language first
                output_path=output_path
            )

            print(f"Translation insertion: {ok_count} OK, {skip_count} skipped")

            if Path(output_path).exists():
                print(f"✅ Output file created: {Path(output_path).stat().st_size} bytes")

                # Verify the output contains translations
                test_segments = processor.extract_docx_segments(output_path)
                print(f"Output file segments: {len(test_segments)}")

                # Look for evidence of translations
                translation_evidence = []
                for seg in test_segments:
                    # Check if segment text appears to be a translation
                    if any(word in seg.text.lower() for word in ['purpose', 'equipment', 'maintenance', 'check']):
                        translation_evidence.append(seg.text[:50])

                print(f"Translation evidence found: {len(translation_evidence)} segments")
                for evidence in translation_evidence[:3]:
                    print(f"  - {repr(evidence)}")

            else:
                print("❌ Output file was not created")

        except Exception as e:
            print(f"ERROR during insertion: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    debug_translation_flow()