chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -6,7 +6,7 @@ Supports both PaddleOCR (for scanned documents) and direct extraction (for edita
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
from datetime import datetime
import uuid
import gc # For garbage collection
@@ -446,6 +446,47 @@ class OCRService:
except Exception as e:
logger.warning(f"Failed to clear GPU cache: {e}")
def _apply_ocr_config(self, ocr_config: 'OCRConfig'):
    """
    Push an OCR configuration (preset or custom) into the runtime settings.

    The values are copied onto the shared ``settings`` object and the config
    itself is remembered on the instance as ``_runtime_ocr_config``. If a
    structure engine is currently loaded it is unloaded afterwards so that
    it gets re-initialized with the new settings.

    Args:
        ocr_config: OCRConfig object with processing settings
    """
    logger.info(f"Applying OCR config: {ocr_config.model_dump()}")

    # Remember the config for use in PP-Structure initialization
    self._runtime_ocr_config = ocr_config

    # The table parsing mode may be an enum member or an already-plain
    # value; store the underlying ``.value`` when there is one.
    parsing_mode = ocr_config.table_parsing_mode
    settings.table_parsing_mode = getattr(parsing_mode, 'value', parsing_mode)

    # Switches copied verbatim under the same attribute name:
    # preprocessing options first, then the recognition modules.
    mirrored_flags = (
        'use_doc_orientation_classify',
        'use_doc_unwarping',
        'use_textline_orientation',
        'enable_chart_recognition',
        'enable_formula_recognition',
        'enable_seal_recognition',
        'enable_region_detection',
    )
    for flag_name in mirrored_flags:
        setattr(settings, flag_name, getattr(ocr_config, flag_name))

    # Layout thresholds are optional: None leaves the current setting as is
    detection_threshold = ocr_config.layout_threshold
    if detection_threshold is not None:
        settings.layout_detection_threshold = detection_threshold

    nms_threshold = ocr_config.layout_nms_threshold
    if nms_threshold is not None:
        settings.layout_nms_threshold = nms_threshold

    # Drop any loaded structure engine to force re-initialization
    # with the settings written above
    engine_loaded = self.structure_engine is not None
    if engine_loaded:
        logger.info("Invalidating PP-StructureV3 engine to apply new OCR config")
        self._unload_structure_engine()

    logger.info(f"OCR config applied: table_parsing_mode={settings.table_parsing_mode}")
def get_ocr_engine(self, lang: str = 'ch') -> PaddleOCR:
"""
Get or create OCR engine for specified language with GPU support
@@ -615,6 +656,39 @@ class OCRService:
formula_model = settings.formula_recognition_model_name
chart_model = settings.chart_recognition_model_name
# Apply table_parsing_mode settings
# This is the KEY configuration to prevent "cell explosion" on datasheet-type documents
table_parsing_mode = settings.table_parsing_mode
logger.info(f"Table parsing mode: {table_parsing_mode}")
if table_parsing_mode == "disabled":
# Option A: turn table recognition off completely
use_table = False
wired_table_model = None
wireless_table_model = None
wired_cell_det_model = None
wireless_cell_det_model = None
logger.info("Table parsing DISABLED - no cell segmentation")
elif table_parsing_mode == "classification_only":
# Option C: run table classification only, without cell segmentation
use_table = False # Don't parse table structure
wired_table_model = None
wireless_table_model = None
wired_cell_det_model = None
wireless_cell_det_model = None
# Keep table_cls_model to identify table regions
logger.info("Table parsing CLASSIFICATION_ONLY - regions identified but no cell parsing")
elif table_parsing_mode == "conservative":
# Option B: conservative mode - only disable wireless tables (aggressive)
# Note: do not modify layout_threshold; it affects detection of all elements, not just tables
wireless_table_model = None
wireless_cell_det_model = None
logger.info(f"Table parsing CONSERVATIVE - wireless disabled (layout_threshold unchanged)")
# else: "full" mode - use all default settings (aggressive)
# Apply table detection config overrides for individual table types
if table_detection_config:
if not table_detection_config.enable_wired_table:
@@ -1343,6 +1417,7 @@ class OCRService:
if detect_layout:
# Pass current_page to analyze_layout for correct page numbering
# Also pass text_regions for table content rebuilding
layout_data, images_metadata = self.analyze_layout(
image_path,
output_dir=output_dir,
@@ -1350,7 +1425,8 @@ class OCRService:
layout_model=layout_model,
preprocessing_mode=preprocessing_mode,
preprocessing_config=preprocessing_config,
table_detection_config=table_detection_config
table_detection_config=table_detection_config,
raw_ocr_regions=text_regions # For table content rebuilding
)
# Generate Markdown
@@ -1379,6 +1455,12 @@ class OCRService:
# If layout data is enhanced, add enhanced results for converter
if layout_data and layout_data.get('enhanced'):
# Debug: check if table elements have rebuild_stats
for elem in layout_data.get('elements', []):
if elem.get('type') == 'table':
has_rebuild = 'rebuild_stats' in elem
logger.info(f"[OCR_SERVICE] Table {elem.get('element_id')}: has rebuild_stats={has_rebuild}, keys={list(elem.keys())[:10]}")
result['enhanced_results'] = [{
'elements': layout_data.get('elements', []),
'reading_order': layout_data.get('reading_order', []),
@@ -1509,7 +1591,8 @@ class OCRService:
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None,
table_detection_config: Optional[TableDetectionConfig] = None
table_detection_config: Optional[TableDetectionConfig] = None,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> Tuple[Optional[Dict], List[Dict]]:
"""
Analyze document layout using PP-StructureV3 with enhanced element extraction
@@ -1522,6 +1605,7 @@ class OCRService:
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
table_detection_config: Table detection config (wired/wireless/region options)
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
Returns:
Tuple of (layout_data, images_metadata)
@@ -1607,7 +1691,8 @@ class OCRService:
preprocessed_image=preprocessed_image,
scaling_info=scaling_info,
save_visualization=True, # Save layout detection visualization images
use_cv_table_detection=use_cv_table_detection
use_cv_table_detection=use_cv_table_detection,
raw_ocr_regions=raw_ocr_regions # For table content rebuilding
)
if result.get('has_parsing_res_list'):
@@ -2225,7 +2310,8 @@ class OCRService:
layout_model: Optional[str] = None,
preprocessing_mode: Optional[PreprocessingModeEnum] = None,
preprocessing_config: Optional[PreprocessingConfig] = None,
table_detection_config: Optional[TableDetectionConfig] = None
table_detection_config: Optional[TableDetectionConfig] = None,
ocr_config: Optional['OCRConfig'] = None
) -> Union[UnifiedDocument, Dict]:
"""
Main processing method with dual-track support.
@@ -2242,11 +2328,16 @@ class OCRService:
preprocessing_mode: Layout preprocessing mode ('auto', 'manual', 'disabled')
preprocessing_config: Manual preprocessing config (used when mode='manual')
table_detection_config: Table detection config (wired/wireless/region options)
ocr_config: OCR processing config from preset or custom settings
Returns:
UnifiedDocument if dual-track is enabled and use_dual_track=True,
Dict with legacy format otherwise
"""
# Apply OCR config to settings if provided
if ocr_config:
self._apply_ocr_config(ocr_config)
# Use dual-track processing if:
# 1. use_dual_track is True (auto-detection), OR
# 2. force_track is specified (explicit track selection)