161 lines
6.9 KiB
Python
161 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
Debug paragraph-structure problems (調試段落結構問題).
"""
|
|
|
|
import sys
|
|
import os
|
|
import tempfile
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
# Force UTF-8 on the standard streams: this script prints CJK text and emoji,
# which a Windows console using a legacy code page cannot encode.
for _stream in (sys.stdout, sys.stderr):
    # A stream may be None (no console attached) or a replacement object
    # without ``reconfigure``; skip those instead of crashing at import time.
    _reconfigure = getattr(_stream, "reconfigure", None)
    _encoding = (getattr(_stream, "encoding", None) or "").lower().replace("_", "-")
    if _reconfigure is not None and _encoding not in ("utf-8", "utf8"):
        _reconfigure(encoding="utf-8")

# Put the sibling ``app`` directory first on the module search path.
# NOTE(review): the imports that follow use the ``app.`` package prefix, which
# is resolved from the script's own directory rather than from this entry --
# confirm whether this insert is still required.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app'))
|
|
|
|
from app import create_app, db
|
|
from app.services.document_processor import DocumentProcessor, _append_after
|
|
from sqlalchemy import text as sql_text
|
|
|
|
# Sample document this script was written against; used when the caller does
# not pass a path of their own.
_DEFAULT_ORIGINAL_PATH = r"C:\Users\EGG\WORK\data\user_scrip\TOOL\Document_translator_V2\uploads\9c6548ac-2f59-45f4-aade-0a9b3895bbfd\original_-OR026_9c6548ac.docx"


def _find_paragraph_index(paragraphs, target):
    """Return the position of *target* in *paragraphs* (matched by XML element), or -1."""
    for idx, candidate in enumerate(paragraphs):
        if candidate._element == target._element:
            return idx
    return -1


def _classify_paragraph(text):
    """Return the label shown next to a paragraph in the final listing."""
    if "TEST TRANSLATION" in text:
        return "🆕 測試翻譯"
    if "SIMPLE TEST" in text:
        return "🆕 簡單測試"
    # CJK Unified Ideographs range.
    if any('\u4e00' <= c <= '\u9fff' for c in text):
        return "🇨🇳 中文"
    return "❓ 其他"


def _probe_insertion(doc, p, ordinal):
    """Locate paragraph *p* in *doc*, insert a test translation after it, and report.

    Prints the paragraph's index in ``doc.paragraphs``, the paragraph counts
    before and after calling ``_append_after``, and whether the paragraph
    following *p* changed. If ``_append_after`` raises, falls back to
    ``doc.add_paragraph`` as a sanity check that the document is writable.
    """
    all_paras = list(doc.paragraphs)
    current_index = _find_paragraph_index(all_paras, p)
    found = current_index >= 0
    print(f" 在文檔中的位置: {current_index} (總共{len(all_paras)}段)")

    print(" 測試插入翻譯...")
    test_translation = f"TEST TRANSLATION {ordinal}: This is a test."

    try:
        before_count = len(all_paras)

        # Remember what followed the paragraph before inserting. Only
        # meaningful when the paragraph was actually found: with index -1,
        # "index + 1" would silently point at paragraph 0.
        next_para_before = None
        if found and current_index + 1 < len(all_paras):
            next_para_before = all_paras[current_index + 1].text[:30]

        new_para = _append_after(p, test_translation, italic=True, font_size_pt=12)

        after_count = len(doc.paragraphs)

        print(f" 插入前段落數: {before_count}")
        print(f" 插入後段落數: {after_count}")
        # ``:+d`` keeps "+1" / "+0" and renders a decrease as "-1", not "+-1".
        print(f" 段落數變化: {after_count - before_count:+d}")

        if new_para:
            print(f" 新段落文本: {new_para.text}")
            print(f" 新段落類型: {type(new_para)}")

            if not found:
                # Without a valid index there is no position to compare.
                print(" ⚠️ 段落不在已載入的文檔中,無法驗證插入位置")
                return

            # Verify the insertion position via the paragraph that now follows.
            updated_paras = list(doc.paragraphs)
            if current_index + 1 < len(updated_paras):
                next_para_after = updated_paras[current_index + 1].text[:30]
                print(f" 插入前下一段: {next_para_before}")
                print(f" 插入後下一段: {next_para_after}")

                if next_para_after != next_para_before:
                    print(" ✅ 插入成功:下一段內容已改變")
                else:
                    print(" ❌ 插入失敗:下一段內容未變")

    except Exception as e:
        print(f" ❌ _append_after失敗: {e}")

        # Fallback: check whether a plain append to the document works.
        try:
            simple_para = doc.add_paragraph(f"SIMPLE TEST {ordinal}")
            print(" 替代測試: doc.add_paragraph成功")
            print(f" 新段落文本: {simple_para.text}")
        except Exception as e2:
            print(f" 替代測試也失敗: {e2}")


def debug_paragraph_structure(original_path=None):
    """Debug paragraph-structure problems when inserting translated paragraphs.

    Copies the source ``.docx`` into ``<tempdir>/debug_paragraph``, extracts its
    segments with ``DocumentProcessor.extract_docx_segments``, and for each of
    the first three segments whose ``kind`` is ``"para"`` prints diagnostics and
    tries to insert a test translation with ``_append_after``. The document is
    then saved as ``debug_paragraph_modified.docx``, re-read, and its first ten
    non-empty paragraphs are listed.

    Args:
        original_path: Path of the ``.docx`` to analyse. Defaults to the sample
            document this script was originally written against.

    Returns:
        None. All results are printed; failures are reported, not raised.
    """
    app = create_app()

    with app.app_context():
        print("=== 調試段落結構問題 ===")

        source_path = Path(_DEFAULT_ORIGINAL_PATH if original_path is None else original_path)
        if not source_path.is_file():
            # Report a missing input clearly instead of crashing in copy2.
            print(f"❌ 找不到原始文件: {source_path}")
            return

        # Work on a copy so the uploaded original is never modified.
        test_dir = Path(tempfile.gettempdir()) / "debug_paragraph"
        test_dir.mkdir(exist_ok=True)
        test_path = test_dir / "debug_paragraph.docx"

        shutil.copy2(source_path, test_path)
        print(f"✅ 創建測試副本: {test_path}")

        processor = DocumentProcessor()
        segments = processor.extract_docx_segments(str(test_path))

        # Only the first three segments are inspected.
        debug_segments = segments[:3]

        try:
            # Imported here so a missing python-docx is reported by the handler below.
            from docx import Document

            # NOTE(review): ``seg.ref`` comes from the processor's own parse of
            # the file. If that is a different Document instance from ``doc``,
            # the element lookup in _probe_insertion can never match and the
            # inserted paragraphs will not appear in the saved file -- confirm.
            doc = Document(str(test_path))

            print("\n📊 文檔分析:")
            print(f"總段落數: {len(doc.paragraphs)}")

            print("\n🔍 前3個段落詳細分析:")

            for i, seg in enumerate(debug_segments):
                if seg.kind != "para":
                    continue
                p = seg.ref

                print(f"\n段落 {i+1}:")
                print(f" 文本: {seg.text[:50]}...")
                print(f" 段落類型: {type(p)}")
                print(f" 段落父元素類型: {type(p._parent)}")
                print(f" 段落XML標籤: {p._p.tag if hasattr(p._p, 'tag') else 'N/A'}")

                try:
                    _probe_insertion(doc, p, i + 1)
                except Exception as outer_e:
                    print(f" ❌ 段落分析失敗: {outer_e}")

            # Save, then re-read from disk to confirm what was actually persisted.
            output_path = test_dir / "debug_paragraph_modified.docx"
            doc.save(str(output_path))
            print(f"\n✅ 修改後文檔已保存: {output_path}")

            doc2 = Document(str(output_path))
            print(f"保存後重讀段落數: {len(doc2.paragraphs)}")

            print("\n📄 前10段內容:")
            for i, para in enumerate(doc2.paragraphs[:10]):
                if para.text.strip():
                    lang_info = _classify_paragraph(para.text)
                    print(f" 段落 {i+1}: {lang_info} - {para.text.strip()[:60]}...")

        except Exception as e:
            print(f"❌ 調試失敗: {e}")
|
|
|
|
# Run the diagnostic only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    debug_paragraph_structure()