Files
OCR/openspec/changes/archive/2025-12-10-add-ocr-processing-presets/design.md
egg 940a406dce chore: backup before code cleanup
Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 11:55:39 +08:00

9.1 KiB

Design: OCR Processing Presets

Architecture Overview

┌─────────────────────────────────────────────────────────────────┐
│                        Frontend                                  │
├─────────────────────────────────────────────────────────────────┤
│  ┌──────────────────┐    ┌──────────────────────────────────┐   │
│  │ Preset Selector  │───▶│  Advanced Parameter Panel        │   │
│  │ (Simple Mode)    │    │  (Expert Mode)                   │   │
│  └──────────────────┘    └──────────────────────────────────┘   │
│           │                           │                          │
│           └───────────┬───────────────┘                          │
│                       ▼                                          │
│              ┌─────────────────┐                                 │
│              │ OCR Config JSON │                                 │
│              └─────────────────┘                                 │
└─────────────────────────────────────────────────────────────────┘
                        │
                        ▼ POST /api/v2/tasks
┌─────────────────────────────────────────────────────────────────┐
│                        Backend                                   │
├─────────────────────────────────────────────────────────────────┤
│  ┌──────────────────┐    ┌──────────────────────────────────┐   │
│  │ Preset Resolver  │───▶│  OCR Config Validator            │   │
│  └──────────────────┘    └──────────────────────────────────┘   │
│           │                           │                          │
│           └───────────┬───────────────┘                          │
│                       ▼                                          │
│              ┌─────────────────┐                                 │
│              │ OCRService      │                                 │
│              │ (with config)   │                                 │
│              └─────────────────┘                                 │
│                       │                                          │
│                       ▼                                          │
│              ┌─────────────────┐                                 │
│              │ PPStructureV3   │                                 │
│              │ (configured)    │                                 │
│              └─────────────────┘                                 │
└─────────────────────────────────────────────────────────────────┘

Data Models

OCRPreset Enum

class OCRPreset(str, Enum):
    """Named OCR processing presets.

    Members are string-valued (``str`` mix-in); each value is the lowercase
    identifier of the preset, e.g. the ``ocr_preset`` form field in the task
    creation example uses ``"datasheet"``.
    """
    TEXT_HEAVY = "text_heavy"       # Reports, articles, manuals
    DATASHEET = "datasheet"         # Technical datasheets, TDS
    TABLE_HEAVY = "table_heavy"     # Financial reports, spreadsheets
    FORM = "form"                   # Applications, surveys
    MIXED = "mixed"                 # General documents
    CUSTOM = "custom"               # User-defined settings

OCRConfig Model

class OCRConfig(BaseModel):
    """OCR pipeline configuration, grouped into table processing, layout
    detection, preprocessing and recognition-module settings.

    Every field has a default, so ``OCRConfig()`` is a valid configuration.
    All threshold fields are constrained to the closed range [0.0, 1.0].
    """
    # Table Processing
    table_parsing_mode: Literal["full", "conservative", "classification_only", "disabled"] = "conservative"
    table_layout_threshold: float = Field(default=0.65, ge=0.0, le=1.0)
    enable_wired_table: bool = True
    enable_wireless_table: bool = False  # Disabled by default (aggressive)

    # Layout Detection
    layout_detection_model: Optional[str] = "PP-DocLayout_plus-L"
    # NOTE(review): None presumably means "use the pipeline's own default" for
    # the two thresholds below — confirm against the OCRService implementation.
    layout_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    layout_nms_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    layout_merge_mode: Optional[Literal["large", "small", "union"]] = "union"

    # Preprocessing
    use_doc_orientation_classify: bool = True
    use_doc_unwarping: bool = False  # Causes distortion
    use_textline_orientation: bool = True

    # Recognition Modules
    enable_chart_recognition: bool = True
    enable_formula_recognition: bool = True
    enable_seal_recognition: bool = False
    enable_region_detection: bool = True

Preset Definitions

# Maps each built-in preset to its OCRConfig. Fields not listed for a preset
# keep the OCRConfig defaults.
# NOTE(review): OCRPreset.CUSTOM has no entry here, so PRESET_CONFIGS[OCRPreset.CUSTOM]
# raises KeyError — callers must handle CUSTOM separately; confirm this is intended.
PRESET_CONFIGS: Dict[OCRPreset, OCRConfig] = {
    # Table parsing and chart/formula recognition are all switched off.
    OCRPreset.TEXT_HEAVY: OCRConfig(
        table_parsing_mode="disabled",
        table_layout_threshold=0.7,
        enable_wired_table=False,
        enable_wireless_table=False,
        enable_chart_recognition=False,
        enable_formula_recognition=False,
    ),
    OCRPreset.DATASHEET: OCRConfig(
        table_parsing_mode="conservative",
        table_layout_threshold=0.65,
        enable_wired_table=True,
        enable_wireless_table=False,  # Key: disable aggressive wireless
    ),
    # The only preset that enables wireless tables; it also uses the lowest
    # table threshold (0.5) of all presets.
    OCRPreset.TABLE_HEAVY: OCRConfig(
        table_parsing_mode="full",
        table_layout_threshold=0.5,
        enable_wired_table=True,
        enable_wireless_table=True,
    ),
    OCRPreset.FORM: OCRConfig(
        table_parsing_mode="conservative",
        table_layout_threshold=0.6,
        enable_wired_table=True,
        enable_wireless_table=False,
    ),
    # Wired/wireless table flags are left at the OCRConfig defaults.
    OCRPreset.MIXED: OCRConfig(
        table_parsing_mode="classification_only",
        table_layout_threshold=0.55,
    ),
}

API Design

Task Creation with OCR Config

POST /api/v2/tasks
Content-Type: multipart/form-data

file: <binary>
processing_track: "ocr"
ocr_preset: "datasheet"  # Optional: use preset
ocr_config: {            # Optional: override specific params
  "table_layout_threshold": 0.7
}

Get Available Presets

GET /api/v2/ocr/presets

Response:
{
  "presets": [
    {
      "name": "datasheet",
      "display_name": "Technical Datasheet",
      "description": "Optimized for product specifications and technical documents",
      "icon": "description",
      "config": { ... }
    },
    ...
  ]
}

Frontend Components

PresetSelector Component

/** Props for the PresetSelector component (the "Simple Mode" preset picker). */
interface PresetSelectorProps {
  /** The preset value the selector is bound to (presumably the current selection — confirm). */
  value: OCRPreset;
  /** Called with the preset passed back by the selector. */
  onChange: (preset: OCRPreset) => void;
  /** Flag for the advanced settings state (presumably whether the advanced panel is shown — confirm). */
  showAdvanced: boolean;
  /** Called with no arguments to toggle the advanced settings state. */
  onToggleAdvanced: () => void;
}

// Visual preset cards with icons:
// 📄 Text Heavy - Reports & Articles
// 📊 Datasheet - Technical Documents
// 📈 Table Heavy - Financial Reports
// 📝 Form - Applications & Surveys
// 📑 Mixed - General Documents
// ⚙️ Custom - Expert Settings

AdvancedConfigPanel Component

/** Props for the AdvancedConfigPanel component (the "Expert Mode" parameter editor). */
interface AdvancedConfigPanelProps {
  /** The full OCR configuration passed to the panel. */
  config: OCRConfig;
  /** Called with a partial config, i.e. any subset of OCRConfig fields. */
  onChange: (config: Partial<OCRConfig>) => void;
  preset: OCRPreset;  // To show which values differ from preset
}

// Sections:
// - Table Processing (collapsed by default)
// - Layout Detection (collapsed by default)
// - Preprocessing (collapsed by default)
// - Recognition Modules (collapsed by default)

Key Design Decisions

1. Preset as Default, Custom as Exception

Users should start with presets. Only expose advanced panel when:

  • User explicitly clicks "Advanced Settings"
  • User selects "Custom" preset
  • User has previously saved custom settings

2. Conservative Defaults

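The base OCRConfig defaults are conservative, and presets deviate from them only where the document type calls for it (for example, TABLE_HEAVY deliberately enables wireless tables and lowers the table threshold to 0.5):

  • enable_wireless_table: false (most aggressive, causes cell explosion)
  • table_layout_threshold: 0.65 by default; presets range from 0.5 to 0.7 (higher values reduce false table detection)
  • use_doc_unwarping: false (causes distortion)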

3. Config Inheritance

A custom config inherits from the selected preset; only the explicitly specified fields override the preset's values:

# Start from the selected preset. OCRPreset.CUSTOM has no PRESET_CONFIGS entry,
# so fall back to the base OCRConfig defaults instead of raising KeyError.
base_config = PRESET_CONFIGS.get(preset, OCRConfig())
# OCRConfig is a Pydantic model and has no dict-style .update(). Rebuild the
# model from the merged fields so the overrides are validated (threshold
# bounds, Literal choices) rather than applied blindly. dict(model) yields the
# model's (field_name, value) pairs.
final_config = OCRConfig(**{**dict(base_config), **(custom_overrides or {})})

4. No Patch Behaviors

All post-processing patches are disabled by default:

  • cell_validation_enabled: false
  • gap_filling_enabled: false
  • table_content_rebuilder_enabled: false

Focus on getting PP-Structure output right with proper configuration.