4th_fix time error
This commit is contained in:
161
debug_paragraph_structure.py
Normal file
161
debug_paragraph_structure.py
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
調試段落結構問題
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
# Fix encoding for Windows console: switch both standard streams to UTF-8 so
# the non-ASCII text this script prints can be encoded.
for _stream in (sys.stdout, sys.stderr):
    # A stream can be None or replaced by an object that has no
    # reconfigure()/encoding attribute; skip it instead of failing at import.
    _encoding = (getattr(_stream, 'encoding', None) or '').lower()
    if _encoding != 'utf-8' and hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

# Put the "app" directory that sits next to this script at the front of the
# module search path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
||||
|
||||
from app import create_app, db
|
||||
from app.services.document_processor import DocumentProcessor, _append_after
|
||||
from sqlalchemy import text as sql_text
|
||||
|
||||
def debug_paragraph_structure(original_path=None):
    """Debug how a test translation is inserted after DOCX paragraphs.

    Copies the source document into a temp directory, extracts its segments
    with ``DocumentProcessor.extract_docx_segments`` and, for each of the
    first three segments whose kind is ``"para"``, prints structural details
    and calls ``_append_after`` to insert a test translation. The modified
    copy is then saved, reloaded, and the first ten paragraphs are previewed.

    Args:
        original_path: Path of the DOCX file to analyse. When omitted, the
            hard-coded sample upload path is used.

    Returns:
        None. All findings are printed to stdout.
    """
    import traceback

    if original_path is None:
        # Default sample document used when the caller supplies no path.
        original_path = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"

    app = create_app()

    with app.app_context():
        print("=== 調試段落結構問題 ===")

        # Fail with a readable message instead of an uncaught traceback from
        # shutil.copy2 when the source document is missing.
        if not os.path.isfile(original_path):
            print(f"❌ 調試失敗: 找不到原始文件 {original_path}")
            return

        # Work on a copy so the original upload is never modified.
        test_dir = Path(tempfile.gettempdir()) / "debug_paragraph"
        test_dir.mkdir(exist_ok=True)
        test_path = test_dir / "debug_paragraph.docx"

        shutil.copy2(original_path, test_path)
        print(f"✅ 創建測試副本: {test_path}")

        processor = DocumentProcessor()
        segments = processor.extract_docx_segments(str(test_path))

        # Only the first three segments are inspected.
        debug_segments = segments[:3]

        try:
            from docx import Document

            doc = Document(str(test_path))

            print("\n📊 文檔分析:")
            print(f"總段落數: {len(doc.paragraphs)}")

            print("\n🔍 前3個段落詳細分析:")

            for i, seg in enumerate(debug_segments):
                if seg.kind == "para":
                    _probe_paragraph_insertion(doc, seg, i + 1)

            # Save, then reload from disk to verify what was actually written.
            output_path = test_dir / "debug_paragraph_modified.docx"
            doc.save(str(output_path))
            print(f"\n✅ 修改後文檔已保存: {output_path}")

            doc2 = Document(str(output_path))
            print(f"保存後重讀段落數: {len(doc2.paragraphs)}")

            print("\n📄 前10段內容:")
            for i, para in enumerate(doc2.paragraphs[:10]):
                if para.text.strip():
                    lang_info = _describe_paragraph_text(para.text)
                    print(f" 段落 {i+1}: {lang_info} - {para.text.strip()[:60]}...")

        except Exception as e:
            print(f"❌ 調試失敗: {e}")
            # This is a debugging aid: show where the failure came from.
            traceback.print_exc()


def _find_paragraph_index(paragraphs, target):
    """Return the index of *target* in *paragraphs* by XML element, or -1."""
    for idx, candidate in enumerate(paragraphs):
        if candidate._element == target._element:
            return idx
    return -1


def _describe_paragraph_text(text):
    """Return the preview label for a paragraph's text."""
    if "TEST TRANSLATION" in text:
        return "🆕 測試翻譯"
    if "SIMPLE TEST" in text:
        return "🆕 簡單測試"
    # Any character in the CJK Unified Ideographs range marks it as Chinese.
    if any('\u4e00' <= c <= '\u9fff' for c in text):
        return "🇨🇳 中文"
    return "❓ 其他"


def _probe_paragraph_insertion(doc, seg, number):
    """Print details of one paragraph segment and test inserting after it.

    *number* is the 1-based segment number used in the printed output.
    """
    p = seg.ref

    print(f"\n段落 {number}:")
    print(f" 文本: {seg.text[:50]}...")
    print(f" 段落類型: {type(p)}")
    print(f" 段落父元素類型: {type(p._parent)}")
    print(f" 段落XML標籤: {p._p.tag if hasattr(p._p, 'tag') else 'N/A'}")

    try:
        all_paras = list(doc.paragraphs)
        current_index = _find_paragraph_index(all_paras, p)
        print(f" 在文檔中的位置: {current_index} (總共{len(all_paras)}段)")

        # -1 means the paragraph is not among doc.paragraphs; "index + 1"
        # would then silently point at the first paragraph of the document.
        located = current_index >= 0

        print(" 測試插入翻譯...")
        test_translation = f"TEST TRANSLATION {number}: This is a test."

        try:
            before_count = len(doc.paragraphs)

            # Remember what followed the paragraph before the insertion.
            next_para_before = None
            if located and current_index + 1 < len(all_paras):
                next_para_before = all_paras[current_index + 1].text[:30]

            new_para = _append_after(p, test_translation, italic=True, font_size_pt=12)

            after_count = len(doc.paragraphs)

            print(f" 插入前段落數: {before_count}")
            print(f" 插入後段落數: {after_count}")
            # Signed format so a decrease prints as "-1" rather than "+-1".
            print(f" 段落數變化: {after_count - before_count:+d}")

            if new_para:
                print(f" 新段落文本: {new_para.text}")
                print(f" 新段落類型: {type(new_para)}")

            # Check the insertion position: the paragraph that follows the
            # original should now be different.
            updated_paras = list(doc.paragraphs)
            if not located:
                print(" ⚠️ 段落不在 doc.paragraphs 中,略過插入位置驗證")
            elif current_index + 1 < len(updated_paras):
                next_para_after = updated_paras[current_index + 1].text[:30]
                print(f" 插入前下一段: {next_para_before}")
                print(f" 插入後下一段: {next_para_after}")

                if next_para_after != next_para_before:
                    print(" ✅ 插入成功:下一段內容已改變")
                else:
                    print(" ❌ 插入失敗:下一段內容未變")

        except Exception as e:
            print(f" ❌ _append_after失敗: {e}")

            # Fall back to a plain append to see whether the document itself
            # still accepts new paragraphs.
            try:
                simple_para = doc.add_paragraph(f"SIMPLE TEST {number}")
                print(" 替代測試: doc.add_paragraph成功")
                print(f" 新段落文本: {simple_para.text}")
            except Exception as e2:
                print(f" 替代測試也失敗: {e2}")

    except Exception as outer_e:
        print(f" ❌ 段落分析失敗: {outer_e}")
|
||||
|
||||
# Run the paragraph-structure debug routine only when this file is executed
# directly as a script.
if __name__ == "__main__":
    debug_paragraph_structure()
|
Reference in New Issue
Block a user