# Files
# OCR/backend/tests/test_export_service.py
# beabigegg da700721fa first
# 2025-11-12 22:53:17 +08:00
#
# 638 lines
# 21 KiB
# Python

"""
Tool_OCR - Export Service Unit Tests
Tests for app/services/export_service.py
"""
import pytest
import json
import zipfile
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
from datetime import datetime
import pandas as pd
from app.services.export_service import ExportService, ExportError
from app.models.ocr import FileStatus
@pytest.fixture
def export_service():
    """Provide a newly constructed ExportService to the requesting test."""
    service = ExportService()
    return service
@pytest.fixture
def mock_ocr_result(temp_dir):
    """Build a Mock standing in for an OCRResult that points at a real markdown file.

    The markdown file is written under ``temp_dir`` (a fixture not defined in
    the visible part of this module), and the mock's ``markdown_path`` holds
    its path as a string.
    """
    # Markdown file on disk referenced by the mocked result.
    markdown_file = temp_dir / "test_result.md"
    markdown_file.write_text(
        "# Test Document\n\nThis is test content.", encoding="utf-8"
    )

    # Mocked record of the uploaded source file.
    source_file = Mock(
        id=1,
        original_filename="test.png",
        file_format="png",
        file_size=1024,
        processing_time=2.5,
    )

    # Mocked OCR result; keyword arguments become attributes on the Mock.
    return Mock(
        id=1,
        markdown_path=str(markdown_file),
        json_path=None,
        detected_language="zh",
        total_text_regions=10,
        average_confidence=0.95,
        layout_data={"elements": [{"type": "text"}]},
        images_metadata=[],
        file=source_file,
    )
@pytest.fixture
def mock_db():
    """Provide a bare Mock used in place of a database session."""
    session = Mock()
    return session
@pytest.mark.unit
class TestExportServiceInit:
    """Checks on a freshly constructed ExportService."""

    def test_init(self, export_service):
        """The service object exists and exposes a non-None pdf_generator."""
        service = export_service
        assert service is not None
        assert service.pdf_generator is not None
@pytest.mark.unit
class TestApplyFilters:
    """Exercise ExportService.apply_filters with single and combined filters."""

    @staticmethod
    def _make_result(filename, confidence, **attrs):
        """Build a mocked OCR result for filter tests.

        Only the attributes given here are pinned to concrete values; any
        other attribute remains an auto-created Mock attribute, just as on a
        bare ``Mock()``.
        """
        file_record = Mock()
        file_record.original_filename = filename
        return Mock(average_confidence=confidence, file=file_record, **attrs)

    def test_apply_filters_confidence_threshold(self, export_service):
        """Results below the confidence threshold are dropped."""
        high = self._make_result("test1.png", 0.95)
        low = self._make_result("test2.png", 0.75)
        middle = self._make_result("test3.png", 0.85)

        kept = export_service.apply_filters(
            [high, low, middle], {"confidence_threshold": 0.80}
        )

        assert len(kept) == 2
        assert high in kept
        assert middle in kept
        assert low not in kept

    def test_apply_filters_filename_pattern(self, export_service):
        """Only results whose filename matches the pattern are kept."""
        invoice = self._make_result("invoice_2024.png", 0.95)
        receipt = self._make_result("receipt.png", 0.95)

        kept = export_service.apply_filters(
            [invoice, receipt], {"filename_pattern": "invoice"}
        )

        assert len(kept) == 1
        assert invoice in kept

    def test_apply_filters_language(self, export_service):
        """Only results in the requested language are kept."""
        chinese = self._make_result("chinese.png", 0.95, detected_language="zh")
        english = self._make_result("english.png", 0.95, detected_language="en")

        kept = export_service.apply_filters([chinese, english], {"language": "zh"})

        assert len(kept) == 1
        assert chinese in kept

    def test_apply_filters_combined(self, export_service):
        """With several filters at once, only the result passing all survives."""
        passes_all = self._make_result(
            "invoice_chinese.png", 0.95, detected_language="zh"
        )
        low_confidence = self._make_result(
            "invoice_low.png", 0.75, detected_language="zh"
        )
        wrong_language = self._make_result(
            "invoice_english.png", 0.95, detected_language="en"
        )
        criteria = {
            "confidence_threshold": 0.80,
            "language": "zh",
            "filename_pattern": "invoice",
        }

        kept = export_service.apply_filters(
            [passes_all, low_confidence, wrong_language], criteria
        )

        assert len(kept) == 1
        assert passes_all in kept

    def test_apply_filters_no_filters(self, export_service):
        """An empty filter dict leaves the number of results unchanged."""
        candidates = [Mock() for _ in range(3)]

        kept = export_service.apply_filters(candidates, {})

        assert len(kept) == len(candidates)
@pytest.mark.unit
class TestExportToTXT:
    """Exercise ExportService.export_to_txt and its formatting options."""

    def test_export_to_txt_basic(self, export_service, mock_ocr_result, temp_dir):
        """A default export writes the markdown text to the target file."""
        target = temp_dir / "output.txt"

        written = export_service.export_to_txt([mock_ocr_result], target)

        assert written.exists()
        text = written.read_text(encoding="utf-8")
        assert "Test Document" in text
        assert "test content" in text

    def test_export_to_txt_with_line_numbers(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """With add_line_numbers enabled, the output contains the "|" separator."""
        written = export_service.export_to_txt(
            [mock_ocr_result],
            temp_dir / "output.txt",
            formatting={"add_line_numbers": True},
        )

        text = written.read_text(encoding="utf-8")
        # "|" is the separator expected alongside line numbers.
        assert "|" in text

    def test_export_to_txt_with_metadata(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """With include_metadata enabled, header labels and the filename appear."""
        written = export_service.export_to_txt(
            [mock_ocr_result],
            temp_dir / "output.txt",
            formatting={"include_metadata": True},
        )

        text = written.read_text(encoding="utf-8")
        for expected in ("文件:", "test.png", "信心度:"):
            assert expected in text

    def test_export_to_txt_with_grouping(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """With group_by_filename enabled, an 80-dash separator line appears."""
        written = export_service.export_to_txt(
            [mock_ocr_result, mock_ocr_result],
            temp_dir / "output.txt",
            formatting={"group_by_filename": True},
        )

        separator = "-" * 80
        assert separator in written.read_text(encoding="utf-8")

    def test_export_to_txt_missing_markdown(self, export_service, temp_dir):
        """A result whose markdown file is absent must not make the export fail."""
        orphan = Mock(id=1, markdown_path="/nonexistent/path.md")
        orphan.file = Mock()
        orphan.file.original_filename = "test.png"
        target = temp_dir / "output.txt"

        # The export is expected to complete anyway and still produce the file.
        written = export_service.export_to_txt([orphan], target)

        assert written.exists()

    def test_export_to_txt_creates_parent_directories(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """Exporting into a not-yet-existing subdirectory creates it."""
        nested_target = temp_dir / "subdir" / "output.txt"

        written = export_service.export_to_txt([mock_ocr_result], nested_target)

        assert written.exists()
        assert written.parent.exists()
@pytest.mark.unit
class TestExportToJSON:
    """Exercise ExportService.export_to_json."""

    @staticmethod
    def _load(path):
        """Read *path* as UTF-8 text and parse it as JSON."""
        return json.loads(path.read_text(encoding="utf-8"))

    def test_export_to_json_basic(self, export_service, mock_ocr_result, temp_dir):
        """A single result yields one entry plus export-level metadata."""
        target = temp_dir / "output.json"

        written = export_service.export_to_json([mock_ocr_result], target)

        assert written.exists()
        payload = self._load(written)
        assert "export_time" in payload
        assert payload["total_files"] == 1
        assert len(payload["results"]) == 1
        entry = payload["results"][0]
        assert entry["filename"] == "test.png"
        assert entry["average_confidence"] == 0.95

    def test_export_to_json_with_layout(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """include_layout=True puts layout_data into each result entry."""
        written = export_service.export_to_json(
            [mock_ocr_result],
            temp_dir / "output.json",
            include_layout=True,
        )

        payload = self._load(written)
        assert "layout_data" in payload["results"][0]

    def test_export_to_json_without_layout(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """include_layout=False leaves layout_data out of the result entry."""
        written = export_service.export_to_json(
            [mock_ocr_result],
            temp_dir / "output.json",
            include_layout=False,
        )

        payload = self._load(written)
        assert "layout_data" not in payload["results"][0]

    def test_export_to_json_multiple_results(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """Two input results are counted and listed as two entries."""
        written = export_service.export_to_json(
            [mock_ocr_result, mock_ocr_result],
            temp_dir / "output.json",
        )

        payload = self._load(written)
        assert payload["total_files"] == 2
        assert len(payload["results"]) == 2
@pytest.mark.unit
class TestExportToExcel:
    """Exercise ExportService.export_to_excel; output is read back with pandas."""

    def test_export_to_excel_basic(self, export_service, mock_ocr_result, temp_dir):
        """One result becomes one row carrying the original filename."""
        target = temp_dir / "output.xlsx"

        written = export_service.export_to_excel([mock_ocr_result], target)

        assert written.exists()
        frame = pd.read_excel(written)
        assert len(frame) == 1
        assert "文件名" in frame.columns
        assert frame.iloc[0]["文件名"] == "test.png"

    def test_export_to_excel_with_confidence(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """include_confidence=True adds the average-confidence column."""
        written = export_service.export_to_excel(
            [mock_ocr_result],
            temp_dir / "output.xlsx",
            include_confidence=True,
        )

        frame = pd.read_excel(written)
        assert "平均信心度" in frame.columns

    def test_export_to_excel_without_processing_time(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """include_processing_time=False omits the processing-time column."""
        written = export_service.export_to_excel(
            [mock_ocr_result],
            temp_dir / "output.xlsx",
            include_processing_time=False,
        )

        frame = pd.read_excel(written)
        assert "處理時間(秒)" not in frame.columns

    def test_export_to_excel_long_content_truncation(self, export_service, temp_dir):
        """A 2000-character markdown body is shortened and marked with "..."."""
        # Markdown source far longer than the expected cell limit.
        long_markdown = temp_dir / "long.md"
        long_markdown.write_text("x" * 2000, encoding="utf-8")

        source_file = Mock(
            original_filename="long.png",
            file_format="png",
            file_size=1024,
            processing_time=1.0,
        )
        lengthy = Mock(
            id=1,
            markdown_path=str(long_markdown),
            detected_language="zh",
            total_text_regions=10,
            average_confidence=0.95,
            file=source_file,
        )

        written = export_service.export_to_excel(
            [lengthy], temp_dir / "output.xlsx"
        )

        frame = pd.read_excel(written)
        cell_text = frame.iloc[0]["提取內容"]
        assert "..." in cell_text
        # Upper bound of 1004: presumably a 1000-character cut plus the "..."
        # marker with a little slack; confirm against export_to_excel.
        assert len(cell_text) <= 1004
@pytest.mark.unit
class TestExportToMarkdown:
    """Exercise ExportService.export_to_markdown in combined and separate modes."""

    def test_export_to_markdown_combined(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """combine=True produces a single file holding filename and content."""
        written = export_service.export_to_markdown(
            [mock_ocr_result],
            temp_dir / "combined.md",
            combine=True,
        )

        assert written.exists()
        assert written.is_file()
        text = written.read_text(encoding="utf-8")
        assert "test.png" in text
        assert "Test Document" in text

    def test_export_to_markdown_separate(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """combine=False produces a directory with one .md file per result."""
        written = export_service.export_to_markdown(
            [mock_ocr_result],
            temp_dir / "markdown_files",
            combine=False,
        )

        assert written.exists()
        assert written.is_dir()
        markdown_files = list(written.glob("*.md"))
        assert len(markdown_files) == 1

    def test_export_to_markdown_multiple_files(
        self, export_service, mock_ocr_result, temp_dir
    ):
        """Combining two results yields at least one "---" separator."""
        written = export_service.export_to_markdown(
            [mock_ocr_result, mock_ocr_result],
            temp_dir / "combined.md",
            combine=True,
        )

        text = written.read_text(encoding="utf-8")
        separator_count = text.count("---")
        assert separator_count >= 1
@pytest.mark.unit
class TestExportToPDF:
    """Test ExportService.export_to_pdf against a mocked PDF generator.

    Every test patches ``ExportService.__init__`` to a no-op so the service
    can be instantiated without running its real constructor; a Mock specced
    on ``PDFGenerator`` is then attached as ``pdf_generator`` by hand.
    """

    @staticmethod
    def _service_with_mock_generator():
        """Return an ExportService whose pdf_generator is a PDFGenerator-specced Mock.

        Call this only while ``ExportService.__init__`` is patched (the
        decorators on the tests below do that). ``PDFGenerator`` is imported
        locally, as the tests did before this helper existed.
        """
        from app.services.pdf_generator import PDFGenerator

        service = ExportService()
        service.pdf_generator = Mock(spec=PDFGenerator)
        return service

    @patch.object(ExportService, '__init__', lambda self: None)
    def test_export_to_pdf_success(self, mock_ocr_result, temp_dir):
        """The generator is called exactly once with the "default" CSS template."""
        output_path = temp_dir / "output.pdf"
        service = self._service_with_mock_generator()
        service.pdf_generator.generate_pdf = Mock(return_value=output_path)

        service.export_to_pdf(mock_ocr_result, output_path)

        service.pdf_generator.generate_pdf.assert_called_once()
        # call_args[1] is the keyword-argument dict of the recorded call.
        call_kwargs = service.pdf_generator.generate_pdf.call_args[1]
        assert call_kwargs["css_template"] == "default"

    @patch.object(ExportService, '__init__', lambda self: None)
    def test_export_to_pdf_with_custom_template(self, mock_ocr_result, temp_dir):
        """An explicit css_template is forwarded to the generator unchanged."""
        output_path = temp_dir / "output.pdf"
        service = self._service_with_mock_generator()
        service.pdf_generator.generate_pdf = Mock(return_value=output_path)

        service.export_to_pdf(mock_ocr_result, output_path, css_template="academic")

        # Assert the call first: on an uncalled mock, call_args is None and
        # indexing it would fail with an unhelpful TypeError.
        service.pdf_generator.generate_pdf.assert_called_once()
        call_kwargs = service.pdf_generator.generate_pdf.call_args[1]
        assert call_kwargs["css_template"] == "academic"

    @patch.object(ExportService, '__init__', lambda self: None)
    def test_export_to_pdf_missing_markdown(self, temp_dir):
        """A result without a markdown path raises ExportError mentioning "not found"."""
        result = Mock()
        result.id = 1
        result.markdown_path = None
        result.file = Mock()
        service = self._service_with_mock_generator()
        output_path = temp_dir / "output.pdf"

        with pytest.raises(ExportError) as exc_info:
            service.export_to_pdf(result, output_path)

        # Checked after the context manager exits; a statement placed after
        # the raising call inside the block would never execute.
        assert "not found" in str(exc_info.value).lower()
@pytest.mark.unit
class TestGetExportFormats:
    """Exercise ExportService.get_export_formats."""

    def test_get_export_formats(self, export_service):
        """The mapping lists every expected format with a non-empty description."""
        formats = export_service.get_export_formats()

        assert isinstance(formats, dict)
        for format_key in ("txt", "json", "excel", "markdown", "pdf", "zip"):
            assert format_key in formats

        # Each description must be a non-empty string; the language of the
        # text itself is not verified here.
        for description in formats.values():
            assert isinstance(description, str)
            assert description != ""
@pytest.mark.unit
class TestApplyExportRule:
    """Exercise ExportService.apply_export_rule with a mocked database session."""

    def test_apply_export_rule_success(self, export_service, mock_db):
        """A stored rule with a confidence threshold filters the results."""
        # Rule returned by the mocked query chain.
        stored_rule = Mock(
            id=1,
            config_json={
                "filters": {
                    "confidence_threshold": 0.80,
                },
            },
        )
        first_lookup = mock_db.query.return_value.filter.return_value.first
        first_lookup.return_value = stored_rule

        # One result above and one below the rule's threshold.
        confident = Mock(average_confidence=0.95, file=Mock())
        confident.file.original_filename = "test1.png"
        doubtful = Mock(average_confidence=0.70, file=Mock())
        doubtful.file.original_filename = "test2.png"

        kept = export_service.apply_export_rule(
            mock_db, [confident, doubtful], rule_id=1
        )

        assert len(kept) == 1
        assert confident in kept

    def test_apply_export_rule_not_found(self, export_service, mock_db):
        """A rule id the query cannot resolve raises ExportError ("not found")."""
        first_lookup = mock_db.query.return_value.filter.return_value.first
        first_lookup.return_value = None

        with pytest.raises(ExportError) as exc_info:
            export_service.apply_export_rule(mock_db, [], rule_id=999)

        # Inspect the captured exception once the raises-block has exited.
        message = str(exc_info.value).lower()
        assert "not found" in message
@pytest.mark.unit
class TestEdgeCases:
    """Edge cases: empty inputs, non-ASCII content, and None-valued attributes."""

    def test_export_to_txt_empty_results(self, export_service, temp_dir):
        """Exporting no results to TXT still creates an (empty) file."""
        written = export_service.export_to_txt([], temp_dir / "output.txt")

        assert written.exists()
        assert written.read_text(encoding="utf-8") == ""

    def test_export_to_json_empty_results(self, export_service, temp_dir):
        """Exporting no results to JSON reports zero files and an empty list."""
        written = export_service.export_to_json([], temp_dir / "output.json")

        payload = json.loads(written.read_text(encoding="utf-8"))
        assert payload["total_files"] == 0
        assert len(payload["results"]) == 0

    def test_export_with_unicode_content(self, export_service, temp_dir):
        """Chinese content and filenames survive both TXT and JSON export."""
        chinese_markdown = temp_dir / "chinese.md"
        chinese_markdown.write_text("# 測試文檔\n\n這是中文內容。", encoding="utf-8")

        source_file = Mock(
            id=1,
            original_filename="中文測試.png",
            file_format="png",
            file_size=1024,
            processing_time=1.0,
        )
        # layout_data / images_metadata are None rather than auto-created
        # Mocks so the JSON export has serializable values.
        unicode_result = Mock(
            id=1,
            markdown_path=str(chinese_markdown),
            json_path=None,
            detected_language="zh",
            total_text_regions=10,
            average_confidence=0.95,
            layout_data=None,
            images_metadata=None,
            file=source_file,
        )

        # TXT export keeps the Chinese heading text.
        txt_target = temp_dir / "output.txt"
        export_service.export_to_txt([unicode_result], txt_target)
        assert "測試文檔" in txt_target.read_text(encoding="utf-8")

        # JSON export keeps the Chinese filename.
        json_target = temp_dir / "output.json"
        export_service.export_to_json([unicode_result], json_target)
        payload = json.loads(json_target.read_text(encoding="utf-8"))
        assert payload["results"][0]["filename"] == "中文測試.png"

    def test_apply_filters_with_none_values(self, export_service):
        """A result whose confidence is None is removed by a threshold filter."""
        blank = Mock(average_confidence=None, detected_language=None, file=Mock())
        blank.file.original_filename = "test.png"

        kept = export_service.apply_filters([blank], {"confidence_threshold": 0.80})

        # Nothing should survive: the only candidate has no confidence value.
        assert len(kept) == 0