chore: remove test scripts and clean up codebase

This commit is contained in:
egg
2025-11-18 08:16:50 +08:00
parent 92e326b3a3
commit 41ddee5c46
4 changed files with 0 additions and 504 deletions

View File

@@ -1,144 +0,0 @@
#!/usr/bin/env python
"""
測試 calculate_page_dimensions 是否正確檢查所有可能的區域
包括: text_regions, image_regions, tables, layout, layout_data.elements
"""
import json
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
import logging
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def test_all_region_types():
    """Generate a PDF from mock OCR data that contains every region type.

    Scenario:
    - layout: [] (empty list)
    - text_regions: two text regions (max X = 1850)
    - image_regions: two image regions (max X = 2204, the widest box)
    - tables: one table region (max Y = 3500, the tallest box)

    The mock JSON and a one-page A4 source PDF are written to
    ``test_output_all_regions/`` (relative to the current working directory)
    and handed to ``pdf_generator_service.generate_layout_pdf``. The function
    does not assert anything about the resulting scale factors; it only
    prints the expected values for manual comparison.

    Returns:
        bool: True if ``generate_layout_pdf`` returned a truthy value,
        False otherwise.
    """
    test_dir = Path("test_output_all_regions")
    test_dir.mkdir(exist_ok=True)

    print("\n" + "="*70)
    print("測試場景:檢查所有區域類型 (text, image, table)")
    print("="*70)

    # Mock OCR JSON that contains every region type.
    mock_ocr_data = {
        "status": "success",
        "file_name": "complete_document.pdf",
        "language": "ch",
        "layout": [],  # empty list
        "text_regions": [
            {
                "text": "標題文字",
                "bbox": [[461, 270], [819, 270], [819, 408], [461, 408]],
                "confidence": 0.95
            },
            {
                "text": "內容文字",
                "bbox": [[1521, 936], [1850, 936], [1850, 1020], [1521, 1020]],
                "confidence": 0.93
            }
        ],
        "image_regions": [
            {
                "type": "figure",
                "bbox": [[1434, 1500], [2204, 1500], [2204, 2100], [1434, 2100]],  # image in the lower-right corner
                "image_path": "imgs/figure_1.jpg"
            },
            {
                "type": "chart",
                "bbox": [[200, 2200], [800, 2200], [800, 2800], [200, 2800]],
                "image_path": "imgs/chart_1.jpg"
            }
        ],
        "tables": [
            {
                "type": "table",
                "bbox": [[300, 3000], [1900, 3000], [1900, 3500], [300, 3500]],  # table at the bottom
                "html": "<table>...</table>"
            }
        ],
        "total_text_regions": 2,
        "average_confidence": 0.94,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "標題文字\n內容文字",
        "processing_time": 4.5,
        "timestamp": "2025-11-17T00:00:00"
    }

    # Save mock JSON
    json_path = test_dir / "all_regions_test.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mock_ocr_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 創建測試 JSON: {json_path}")
    print(" - layout: [] (空列表)")
    print(" - text_regions: 2 個區域 (max X=1850)")
    print(" - image_regions: 2 個區域 (max X=2204) *** 關鍵!")
    print(" - tables: 1 個區域 (max Y=3500) *** 關鍵!")
    print(" - 預期 OCR 尺寸: ~2204 x ~3500 (取自所有區域的最大值)")

    # Create the A4 source PDF. reportlab is imported lazily so the module
    # can be imported without it.
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A4

    source_pdf = test_dir / "source_a4.pdf"
    c = canvas.Canvas(str(source_pdf), pagesize=A4)
    c.drawString(100, 800, "Original A4 Document")
    c.save()

    print(f"✓ 創建 A4 源文件: {source_pdf}")
    print(" - A4 尺寸: 595 x 842 點")

    # Test PDF generation
    pdf_path = test_dir / "output_all_regions.pdf"
    print("\n開始生成 PDF...")
    print("-" * 70)

    success = pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_pdf
    )

    print("-" * 70)

    if not success:
        print("\n✗ PDF 生成失敗")
        return False

    print(f"\n✓ PDF 生成成功: {pdf_path}")
    print("\n預期結果:")
    print(" - OCR 尺寸(從所有區域推斷): ~2204 x ~3500")
    print(" - 目標 PDF 尺寸: 595 x 842")
    print(f" - 預期縮放因子: X={595/2204:.3f}, Y={842/3500:.3f}")
    print("\n關鍵驗證:")
    # The separator between "text_regions" and "max_x" was missing, which
    # made the message read as a single identifier.
    print(" - 如果只檢查 text_regions,max_x 只有 1850 (錯誤!)")
    print(" - 必須檢查 image_regions 才能得到正確的 max_x=2204")
    print(" - 必須檢查 tables 才能得到正確的 max_y=3500")
    return True
if __name__ == "__main__":
    import sys

    # The module-level imports have already executed by this point, so this
    # path entry only affects imports performed later (such as the
    # function-level imports inside the test).
    sys.path.insert(0, str(Path(__file__).parent))

    success = test_all_region_types()

    # Print the verdict between two rules and mirror it in the exit status.
    rule = "=" * 70
    print("\n" + rule)
    print("✓ 測試通過!所有區域類型都被正確檢查" if success else "✗ 測試失敗")
    print(rule)
    sys.exit(0 if success else 1)

View File

@@ -1,130 +0,0 @@
#!/usr/bin/env python
"""
Test script for PDF generation with proper bbox-based dimension calculation
Simulates the real scenario where OCR processes on high-res images (e.g., 2189x3500)
but we want to generate PDFs at original size (e.g., A4: 595x842)
"""
import json
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
import logging
# Set up logging to see dimension calculations
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def test_high_res_ocr_to_a4_pdf():
    """Generate an A4 PDF from mock OCR data expressed in high-res coordinates.

    Scenario:
    - PaddleOCR processes the PDF at high resolution (2189x3500)
    - OCR bbox coordinates are in this high-res space
    - We want to generate an A4 PDF (595x842)
    - Scale factors should be ~0.27 and ~0.24

    The mock JSON and a one-page A4 source PDF are written to
    ``test_output_bbox/`` (relative to the current working directory) and
    handed to ``pdf_generator_service.generate_layout_pdf``. The function
    does not assert the scale factors itself; it prints the expected values
    so they can be compared with the log output by hand.

    Returns:
        bool: True if ``generate_layout_pdf`` returned a truthy value,
        False otherwise.
    """
    # Create test directory
    test_dir = Path("test_output_bbox")
    test_dir.mkdir(exist_ok=True)

    print("\n" + "="*70)
    print("測試場景:高解析度 OCR → A4 PDF 縮放")
    print("="*70)

    # Create mock OCR data with high-res bbox coordinates,
    # simulating text at various positions in the 2189x3500 coordinate space.
    mock_ocr_data = {
        "status": "success",
        "file_name": "test_document.pdf",
        "language": "ch",
        "text_regions": [
            {
                "text": "標題文字在頂部",
                "bbox": [[230, 195], [1189, 182], [1189, 350], [230, 363]],  # Top of page
                "confidence": 0.95
            },
            {
                "text": "中間的文字內容",
                "bbox": [[1521, 1750], [2185, 1750], [2185, 1820], [1521, 1820]],  # Middle
                "confidence": 0.92
            },
            {
                "text": "底部的文字",
                "bbox": [[400, 3200], [1200, 3200], [1200, 3280], [400, 3280]],  # Bottom
                "confidence": 0.93
            }
        ],
        "total_text_regions": 3,
        "average_confidence": 0.933,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "# Test Document\n\n標題文字在頂部\n中間的文字內容\n底部的文字",
        "processing_time": 2.5,
        "timestamp": "2025-11-17T00:00:00"
    }

    # Save mock JSON
    json_path = test_dir / "high_res_ocr_result.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mock_ocr_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 創建測試 JSON: {json_path}")
    print(" - OCR 座標範圍: X=[230..2185], Y=[182..3280]")
    print(" - 預期 OCR 尺寸: ~2185 x ~3280")

    # Create a mock A4 source PDF (595x842 points) that supplies the target
    # dimensions. The source is written with reportlab only; the previously
    # imported PIL.Image was never used and has been dropped.
    from reportlab.lib.pagesizes import A4
    from reportlab.pdfgen import canvas

    source_pdf = test_dir / "source_a4.pdf"
    c = canvas.Canvas(str(source_pdf), pagesize=A4)
    c.drawString(100, 800, "Original A4 Document")
    c.save()

    print(f"✓ 創建 A4 源文件: {source_pdf}")
    print(" - A4 尺寸: 595 x 842 點")

    # Test PDF generation
    pdf_path = test_dir / "scaled_output.pdf"
    print("\n開始生成 PDF...")
    print("-" * 70)

    success = pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_pdf
    )

    print("-" * 70)

    if not success:
        print("\n✗ PDF 生成失敗")
        return False

    print(f"\n✓ PDF 生成成功: {pdf_path}")
    print("\n預期結果:")
    print(" - OCR 尺寸: ~2185 x ~3280")
    print(" - 目標 PDF 尺寸: 595 x 842")
    print(f" - 預期縮放因子: X={595/2185:.3f}, Y={842/3280:.3f}")
    print("\n實際結果應該與預期一致(見上方日誌)")
    return True
if __name__ == "__main__":
    import sys

    # The module-level imports have already executed by this point, so this
    # path entry only affects imports performed later (such as the
    # function-level imports inside the test).
    sys.path.insert(0, str(Path(__file__).parent))

    success = test_high_res_ocr_to_a4_pdf()

    # Print the verdict between two rules and mirror it in the exit status.
    rule = "=" * 70
    print("\n" + rule)
    print("✓ 測試通過!縮放邏輯正確" if success else "✗ 測試失敗")
    print(rule)
    sys.exit(0 if success else 1)

View File

@@ -1,130 +0,0 @@
#!/usr/bin/env python
"""
測試 calculate_page_dimensions 是否正確處理 layout=[] 但 text_regions 有數據的情況
這模擬了用戶報告的 ELER-8-100HFV Data Sheet 的場景
"""
import json
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
import logging
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
def test_empty_layout_with_text_regions():
    """Generate a PDF from mock OCR data whose ``layout`` list is empty.

    Scenario:
    - layout: [] (empty list)
    - text_regions: contains high-resolution bbox data
    - the correct OCR dimensions should be inferred from text_regions

    The mock JSON and a one-page A4 source PDF are written to
    ``test_output_empty_layout/`` (relative to the current working
    directory) and handed to ``pdf_generator_service.generate_layout_pdf``.
    The function does not assert the scale factors itself; it prints the
    expected values for manual comparison.

    Returns:
        bool: True if ``generate_layout_pdf`` returned a truthy value,
        False otherwise.
    """
    test_dir = Path("test_output_empty_layout")
    test_dir.mkdir(exist_ok=True)

    print("\n" + "="*70)
    # The colon after "測試場景" was missing; the sibling scripts print
    # "測試場景:" followed by the scenario description.
    print("測試場景:layout=[] 但 text_regions 包含數據")
    print("="*70)

    # Mimic the JSON structure from the user's report.
    mock_ocr_data = {
        "status": "success",
        "file_name": "ELER-8-100HFV_Data_Sheet.pdf",
        "language": "ch",
        "layout": [],  # *** key point: this is empty ***
        "text_regions": [
            {
                "text": "義典科技",
                "bbox": [[461, 270], [819, 252], [822, 408], [464, 426]],  # high-resolution coordinates
                "confidence": 0.95
            },
            {
                "text": "ELER-8-100HFV",
                "bbox": [[1150, 580], [1850, 580], [1850, 680], [1150, 680]],
                "confidence": 0.93
            },
            {
                "text": "表格中的文字",
                "bbox": [[1259, 936], [1317, 936], [1317, 960], [1259, 960]],  # X=1259 exceeds the A4 width
                "confidence": 0.92
            },
            {
                "text": "底部文字",
                "bbox": [[400, 2800], [1200, 2800], [1200, 2880], [400, 2880]],  # Y=2880
                "confidence": 0.91
            }
        ],
        "total_text_regions": 4,
        "average_confidence": 0.928,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "義典科技\nELER-8-100HFV\n表格中的文字\n底部文字",
        "processing_time": 3.2,
        "timestamp": "2025-11-17T00:00:00"
    }

    # Save mock JSON
    json_path = test_dir / "empty_layout_test.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mock_ocr_data, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 創建測試 JSON: {json_path}")
    print(" - layout: [] (空列表)")
    print(" - text_regions: 4 個區域")
    print(" - OCR 座標範圍: X=[400..1850], Y=[252..2880]")
    print(" - 預期 OCR 尺寸: ~1850 x ~2880")

    # Create the A4 source PDF. reportlab is imported lazily so the module
    # can be imported without it.
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A4

    source_pdf = test_dir / "source_a4.pdf"
    c = canvas.Canvas(str(source_pdf), pagesize=A4)
    c.drawString(100, 800, "Original A4 Document")
    c.save()

    print(f"✓ 創建 A4 源文件: {source_pdf}")
    print(" - A4 尺寸: 595 x 842 點")

    # Test PDF generation
    pdf_path = test_dir / "output.pdf"
    print("\n開始生成 PDF...")
    print("-" * 70)

    success = pdf_generator_service.generate_layout_pdf(
        json_path=json_path,
        output_path=pdf_path,
        source_file_path=source_pdf
    )

    print("-" * 70)

    if not success:
        print("\n✗ PDF 生成失敗")
        return False

    print(f"\n✓ PDF 生成成功: {pdf_path}")
    print("\n預期結果:")
    print(" - OCR 尺寸(從 text_regions 推斷): ~1850 x ~2880")
    print(" - 目標 PDF 尺寸: 595 x 842")
    print(f" - 預期縮放因子: X={595/1850:.3f}, Y={842/2880:.3f}")
    print("\n如果實際縮放因子是 1.0,說明 Bug 仍存在!")
    return True
if __name__ == "__main__":
    import sys

    # The module-level imports have already executed by this point, so this
    # path entry only affects imports performed later (such as the
    # function-level imports inside the test).
    sys.path.insert(0, str(Path(__file__).parent))

    success = test_empty_layout_with_text_regions()

    # Print the verdict between two rules and mirror it in the exit status.
    rule = "=" * 70
    print("\n" + rule)
    print("✓ 測試完成" if success else "✗ 測試失敗")
    print(rule)
    sys.exit(0 if success else 1)

View File

@@ -1,100 +0,0 @@
#!/usr/bin/env python
"""
Test script for PDF generation with proper scaling
"""
import json
from pathlib import Path
from app.services.pdf_generator_service import pdf_generator_service
def test_pdf_generation():
    """Run PDF generation on mock OCR data that declares its own dimensions.

    Writes a mock OCR result (500x700 ``ocr_dimensions``) and a blank
    1000x1400 JPEG source image into ``test_output/`` (relative to the
    current working directory), then calls
    ``pdf_generator_service.generate_layout_pdf`` on them. The expected
    scale factors are printed for manual inspection; nothing is asserted.

    Returns:
        The value returned by ``generate_layout_pdf`` (used as a truthy
        success flag by the caller).
    """
    import logging

    # Dimensions used throughout the scenario.
    ocr_w, ocr_h = 500, 700          # size the mock OCR claims to have run at
    target_w, target_h = 1000, 1400  # size of the dummy source image

    # Output directory for every artifact of this run.
    out_dir = Path("test_output")
    out_dir.mkdir(exist_ok=True)

    # Mock OCR JSON payload, including explicit OCR dimensions.
    payload = {
        "status": "success",
        "file_name": "test_image.jpg",
        "language": "ch",
        "ocr_dimensions": {
            "width": ocr_w,
            "height": ocr_h,
        },
        "text_regions": [
            {
                "text": "測試文字 Test Text",
                "bbox": [[50, 100], [250, 100], [250, 150], [50, 150]],
                "confidence": 0.95,
            },
            {
                "text": "第二行文字 Second line",
                "bbox": [[50, 200], [300, 200], [300, 250], [50, 250]],
                "confidence": 0.92,
            },
        ],
        "total_text_regions": 2,
        "average_confidence": 0.935,
        "layout_data": None,
        "images_metadata": [],
        "markdown_content": "# Test Document\n\n測試文字 Test Text\n\n第二行文字 Second line",
        "processing_time": 1.5,
        "timestamp": "2025-11-17T00:00:00",
    }

    # Persist the payload as UTF-8 JSON.
    ocr_json = out_dir / "test_ocr_result.json"
    ocr_json.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"Created test JSON at: {ocr_json}")

    result_pdf = out_dir / "test_output.pdf"

    # Blank white image whose size acts as the target PDF size.
    from PIL import Image

    src_img = out_dir / "test_source.jpg"
    Image.new('RGB', (target_w, target_h), color='white').save(src_img)
    print(f"Created test source image: {src_img} ({target_w}x{target_h})")

    print("\nGenerating PDF with scaling...")

    # Enable INFO logging so the scale factors show up in the log output.
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    success = pdf_generator_service.generate_layout_pdf(
        json_path=ocr_json,
        output_path=result_pdf,
        source_file_path=src_img,
    )

    if not success:
        print("✗ PDF generation failed")
        return success

    print(f"✓ PDF generated successfully: {result_pdf}")
    print(f" Expected scale factors: X={target_w/ocr_w:.2f}, Y={target_h/ocr_h:.2f}")
    print(" Text should now be properly scaled and positioned!")
    return success
if __name__ == "__main__":
    import sys

    # The module-level imports have already executed by this point, so this
    # path entry only affects imports performed later (such as the
    # function-level imports inside the test).
    sys.path.insert(0, str(Path(__file__).parent))

    print("Testing PDF generation with proper scaling...")
    print("=" * 60)

    success = test_pdf_generation()

    print("\n" + "=" * 60)
    if success:
        print("✓ Test completed successfully!")
        print("Check test_output/test_output.pdf to verify scaling")
    else:
        print("✗ Test failed")

    # Report the outcome through the process exit status so a failed run is
    # detectable by shells and CI; previously the script always exited 0.
    sys.exit(0 if success else 1)