feat: enable document orientation detection for scanned PDFs
- Enable PP-StructureV3's use_doc_orientation_classify feature
- Detect rotation angle from doc_preprocessor_res.angle
- Swap page dimensions (width <-> height) for 90°/270° rotations
- Output PDF now correctly displays landscape-scanned content

Also includes:
- Archive completed openspec proposals
- Add simplify-frontend-ocr-config proposal (pending)
- Code cleanup and frontend simplification

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -40,10 +40,6 @@ from app.schemas.task import (
|
||||
PreprocessingPreviewRequest,
|
||||
PreprocessingPreviewResponse,
|
||||
ImageQualityMetrics,
|
||||
TableDetectionConfig,
|
||||
OCRPresetEnum,
|
||||
OCRConfig,
|
||||
OCR_PRESET_CONFIGS,
|
||||
)
|
||||
from app.services.task_service import task_service
|
||||
from app.services.file_access_service import file_access_service
|
||||
@@ -79,10 +75,7 @@ def process_task_ocr(
|
||||
language: str = 'ch',
|
||||
layout_model: Optional[str] = "chinese",
|
||||
preprocessing_mode: Optional[str] = "auto",
|
||||
preprocessing_config: Optional[dict] = None,
|
||||
table_detection_config: Optional[dict] = None,
|
||||
ocr_preset: Optional[str] = None,
|
||||
ocr_config: Optional[dict] = None
|
||||
preprocessing_config: Optional[dict] = None
|
||||
):
|
||||
"""
|
||||
Background task to process OCR for a task with dual-track support.
|
||||
@@ -101,9 +94,6 @@ def process_task_ocr(
|
||||
layout_model: Layout detection model ('chinese', 'default', 'cdla')
|
||||
preprocessing_mode: Preprocessing mode ('auto', 'manual', 'disabled')
|
||||
preprocessing_config: Manual preprocessing config dict (contrast, sharpen, binarize)
|
||||
table_detection_config: Table detection config dict (enable_wired_table, enable_wireless_table, enable_region_detection)
|
||||
ocr_preset: OCR processing preset (text_heavy, datasheet, table_heavy, form, mixed, custom)
|
||||
ocr_config: Custom OCR config dict (overrides preset values)
|
||||
"""
|
||||
from app.core.database import SessionLocal
|
||||
from app.models.task import Task
|
||||
@@ -116,7 +106,6 @@ def process_task_ocr(
|
||||
logger.info(f"Starting OCR processing for task {task_id}, file: {filename}")
|
||||
logger.info(f"Processing options: dual_track={use_dual_track}, force_track={force_track}, lang={language}")
|
||||
logger.info(f"Preprocessing options: mode={preprocessing_mode}, config={preprocessing_config}")
|
||||
logger.info(f"Table detection options: {table_detection_config}")
|
||||
|
||||
# Convert preprocessing parameters to proper types
|
||||
preprocess_mode_enum = None
|
||||
@@ -133,35 +122,6 @@ def process_task_ocr(
|
||||
binarize=preprocessing_config.get("binarize", False)
|
||||
)
|
||||
|
||||
# Convert table detection config to object
|
||||
table_det_config_obj = None
|
||||
if table_detection_config:
|
||||
table_det_config_obj = TableDetectionConfig(
|
||||
enable_wired_table=table_detection_config.get("enable_wired_table", True),
|
||||
enable_wireless_table=table_detection_config.get("enable_wireless_table", True),
|
||||
enable_region_detection=table_detection_config.get("enable_region_detection", True)
|
||||
)
|
||||
|
||||
# Convert OCR preset and config to proper objects
|
||||
from app.schemas.task import OCRPresetEnum, OCRConfig, OCR_PRESET_CONFIGS, TableParsingModeEnum
|
||||
ocr_config_obj = None
|
||||
if ocr_preset:
|
||||
preset_enum = OCRPresetEnum(ocr_preset)
|
||||
# Get preset config as base
|
||||
if preset_enum in OCR_PRESET_CONFIGS:
|
||||
ocr_config_obj = OCR_PRESET_CONFIGS[preset_enum].model_copy()
|
||||
else:
|
||||
# CUSTOM preset - use provided config or defaults
|
||||
ocr_config_obj = OCRConfig()
|
||||
|
||||
# Override with custom config values if provided
|
||||
if ocr_config:
|
||||
for key, value in ocr_config.items():
|
||||
if hasattr(ocr_config_obj, key) and value is not None:
|
||||
setattr(ocr_config_obj, key, value)
|
||||
|
||||
logger.info(f"OCR config resolved: preset={ocr_preset}, config={ocr_config_obj.model_dump() if ocr_config_obj else None}")
|
||||
|
||||
# Get task directly by database ID (bypass user isolation for background task)
|
||||
task = db.query(Task).filter(Task.id == task_db_id).first()
|
||||
if not task:
|
||||
@@ -210,9 +170,7 @@ def process_task_ocr(
|
||||
force_track=force_track,
|
||||
layout_model=layout_model,
|
||||
preprocessing_mode=preprocess_mode_enum,
|
||||
preprocessing_config=preprocess_config_obj,
|
||||
table_detection_config=table_det_config_obj,
|
||||
ocr_config=ocr_config_obj
|
||||
preprocessing_config=preprocess_config_obj
|
||||
)
|
||||
else:
|
||||
# Fall back to traditional processing (no force_track support)
|
||||
@@ -223,9 +181,7 @@ def process_task_ocr(
|
||||
output_dir=result_dir,
|
||||
layout_model=layout_model,
|
||||
preprocessing_mode=preprocess_mode_enum,
|
||||
preprocessing_config=preprocess_config_obj,
|
||||
table_detection_config=table_det_config_obj,
|
||||
ocr_config=ocr_config_obj
|
||||
preprocessing_config=preprocess_config_obj
|
||||
)
|
||||
|
||||
# Calculate processing time
|
||||
@@ -818,7 +774,7 @@ async def start_task(
|
||||
- **force_track**: Force specific processing track ('ocr' or 'direct')
|
||||
- **language**: OCR language code (default: 'ch')
|
||||
- **layout_model**: Layout detection model ('chinese', 'default', 'cdla')
|
||||
- **table_detection**: Table detection config (enable_wired_table, enable_wireless_table, enable_region_detection)
|
||||
- **preprocessing_mode**: Preprocessing mode ('auto', 'manual', 'disabled')
|
||||
"""
|
||||
try:
|
||||
# Parse processing options with defaults
|
||||
@@ -846,23 +802,6 @@ async def start_task(
|
||||
}
|
||||
logger.info(f"Preprocessing: mode={preprocessing_mode}, config={preprocessing_config}")
|
||||
|
||||
# Extract table detection options
|
||||
table_detection_config = None
|
||||
if options.table_detection:
|
||||
table_detection_config = {
|
||||
"enable_wired_table": options.table_detection.enable_wired_table,
|
||||
"enable_wireless_table": options.table_detection.enable_wireless_table,
|
||||
"enable_region_detection": options.table_detection.enable_region_detection
|
||||
}
|
||||
logger.info(f"Table detection: {table_detection_config}")
|
||||
|
||||
# Extract OCR preset and config
|
||||
ocr_preset = options.ocr_preset.value if options.ocr_preset else "datasheet"
|
||||
ocr_config_dict = None
|
||||
if options.ocr_config:
|
||||
ocr_config_dict = options.ocr_config.model_dump()
|
||||
logger.info(f"OCR preset: {ocr_preset}, config: {ocr_config_dict}")
|
||||
|
||||
# Get task details
|
||||
task = task_service.get_task_by_id(
|
||||
db=db,
|
||||
@@ -911,14 +850,11 @@ async def start_task(
|
||||
language=language,
|
||||
layout_model=layout_model,
|
||||
preprocessing_mode=preprocessing_mode,
|
||||
preprocessing_config=preprocessing_config,
|
||||
table_detection_config=table_detection_config,
|
||||
ocr_preset=ocr_preset,
|
||||
ocr_config=ocr_config_dict
|
||||
preprocessing_config=preprocessing_config
|
||||
)
|
||||
|
||||
logger.info(f"Started OCR processing task {task_id} for user {current_user.email}")
|
||||
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}, table_detection={table_detection_config}, ocr_preset={ocr_preset}")
|
||||
logger.info(f"Options: dual_track={use_dual_track}, force_track={force_track}, lang={language}, layout_model={layout_model}, preprocessing={preprocessing_mode}")
|
||||
return task
|
||||
|
||||
except HTTPException:
|
||||
|
||||
Reference in New Issue
Block a user