Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
228 lines
9.1 KiB
Markdown
228 lines
9.1 KiB
Markdown
# Design: OCR Processing Presets
|
|
|
|
## Architecture Overview
|
|
|
|
```
┌─────────────────────────────────────────────────────────────────┐
│                            Frontend                             │
├─────────────────────────────────────────────────────────────────┤
│  ┌──────────────────┐    ┌──────────────────────────────────┐   │
│  │ Preset Selector  │───▶│ Advanced Parameter Panel         │   │
│  │ (Simple Mode)    │    │ (Expert Mode)                    │   │
│  └──────────────────┘    └──────────────────────────────────┘   │
│           │                           │                         │
│           └───────────┬───────────────┘                         │
│                       ▼                                         │
│              ┌─────────────────┐                                │
│              │ OCR Config JSON │                                │
│              └─────────────────┘                                │
└─────────────────────────────────────────────────────────────────┘
                        │
                        ▼ POST /api/v2/tasks
┌─────────────────────────────────────────────────────────────────┐
│                             Backend                             │
├─────────────────────────────────────────────────────────────────┤
│  ┌──────────────────┐    ┌──────────────────────────────────┐   │
│  │ Preset Resolver  │───▶│ OCR Config Validator             │   │
│  └──────────────────┘    └──────────────────────────────────┘   │
│           │                           │                         │
│           └───────────┬───────────────┘                         │
│                       ▼                                         │
│              ┌─────────────────┐                                │
│              │ OCRService      │                                │
│              │ (with config)   │                                │
│              └─────────────────┘                                │
│                       │                                         │
│                       ▼                                         │
│              ┌─────────────────┐                                │
│              │ PPStructureV3   │                                │
│              │ (configured)    │                                │
│              └─────────────────┘                                │
└─────────────────────────────────────────────────────────────────┘
```
|
|
|
|
## Data Models
|
|
|
|
### OCRPreset Enum
|
|
|
|
```python
|
|
class OCRPreset(str, Enum):
|
|
TEXT_HEAVY = "text_heavy" # Reports, articles, manuals
|
|
DATASHEET = "datasheet" # Technical datasheets, TDS
|
|
TABLE_HEAVY = "table_heavy" # Financial reports, spreadsheets
|
|
FORM = "form" # Applications, surveys
|
|
MIXED = "mixed" # General documents
|
|
CUSTOM = "custom" # User-defined settings
|
|
```
|
|
|
|
### OCRConfig Model
|
|
|
|
```python
|
|
class OCRConfig(BaseModel):
|
|
# Table Processing
|
|
table_parsing_mode: Literal["full", "conservative", "classification_only", "disabled"] = "conservative"
|
|
table_layout_threshold: float = Field(default=0.65, ge=0.0, le=1.0)
|
|
enable_wired_table: bool = True
|
|
enable_wireless_table: bool = False # Disabled by default (aggressive)
|
|
|
|
# Layout Detection
|
|
layout_detection_model: Optional[str] = "PP-DocLayout_plus-L"
|
|
layout_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
|
layout_nms_threshold: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
|
layout_merge_mode: Optional[Literal["large", "small", "union"]] = "union"
|
|
|
|
# Preprocessing
|
|
use_doc_orientation_classify: bool = True
|
|
use_doc_unwarping: bool = False # Causes distortion
|
|
use_textline_orientation: bool = True
|
|
|
|
# Recognition Modules
|
|
enable_chart_recognition: bool = True
|
|
enable_formula_recognition: bool = True
|
|
enable_seal_recognition: bool = False
|
|
enable_region_detection: bool = True
|
|
```
|
|
|
|
### Preset Definitions
|
|
|
|
```python
|
|
PRESET_CONFIGS: Dict[OCRPreset, OCRConfig] = {
|
|
OCRPreset.TEXT_HEAVY: OCRConfig(
|
|
table_parsing_mode="disabled",
|
|
table_layout_threshold=0.7,
|
|
enable_wired_table=False,
|
|
enable_wireless_table=False,
|
|
enable_chart_recognition=False,
|
|
enable_formula_recognition=False,
|
|
),
|
|
OCRPreset.DATASHEET: OCRConfig(
|
|
table_parsing_mode="conservative",
|
|
table_layout_threshold=0.65,
|
|
enable_wired_table=True,
|
|
enable_wireless_table=False, # Key: disable aggressive wireless
|
|
),
|
|
OCRPreset.TABLE_HEAVY: OCRConfig(
|
|
table_parsing_mode="full",
|
|
table_layout_threshold=0.5,
|
|
enable_wired_table=True,
|
|
enable_wireless_table=True,
|
|
),
|
|
OCRPreset.FORM: OCRConfig(
|
|
table_parsing_mode="conservative",
|
|
table_layout_threshold=0.6,
|
|
enable_wired_table=True,
|
|
enable_wireless_table=False,
|
|
),
|
|
OCRPreset.MIXED: OCRConfig(
|
|
table_parsing_mode="classification_only",
|
|
table_layout_threshold=0.55,
|
|
),
|
|
}
|
|
```
|
|
|
|
## API Design
|
|
|
|
### Task Creation with OCR Config
|
|
|
|
```http
|
|
POST /api/v2/tasks
|
|
Content-Type: multipart/form-data
|
|
|
|
file: <binary>
|
|
processing_track: "ocr"
|
|
ocr_preset: "datasheet" # Optional: use preset
|
|
ocr_config: { # Optional: override specific params
|
|
"table_layout_threshold": 0.7
|
|
}
|
|
```
|
|
|
|
### Get Available Presets
|
|
|
|
```http
|
|
GET /api/v2/ocr/presets
|
|
|
|
Response:
|
|
{
|
|
"presets": [
|
|
{
|
|
"name": "datasheet",
|
|
"display_name": "Technical Datasheet",
|
|
"description": "Optimized for product specifications and technical documents",
|
|
"icon": "description",
|
|
"config": { ... }
|
|
},
|
|
...
|
|
]
|
|
}
|
|
```
|
|
|
|
## Frontend Components
|
|
|
|
### PresetSelector Component
|
|
|
|
```tsx
|
|
interface PresetSelectorProps {
|
|
value: OCRPreset;
|
|
onChange: (preset: OCRPreset) => void;
|
|
showAdvanced: boolean;
|
|
onToggleAdvanced: () => void;
|
|
}
|
|
|
|
// Visual preset cards with icons:
|
|
// 📄 Text Heavy - Reports & Articles
|
|
// 📊 Datasheet - Technical Documents
|
|
// 📈 Table Heavy - Financial Reports
|
|
// 📝 Form - Applications & Surveys
|
|
// 📑 Mixed - General Documents
|
|
// ⚙️ Custom - Expert Settings
|
|
```
|
|
|
|
### AdvancedConfigPanel Component
|
|
|
|
```tsx
|
|
interface AdvancedConfigPanelProps {
|
|
config: OCRConfig;
|
|
onChange: (config: Partial<OCRConfig>) => void;
|
|
preset: OCRPreset; // To show which values differ from preset
|
|
}
|
|
|
|
// Sections:
|
|
// - Table Processing (collapsed by default)
|
|
// - Layout Detection (collapsed by default)
|
|
// - Preprocessing (collapsed by default)
|
|
// - Recognition Modules (collapsed by default)
|
|
```
|
|
|
|
## Key Design Decisions
|
|
|
|
### 1. Preset as Default, Custom as Exception
|
|
|
|
Users should start with presets. Only expose the advanced panel when at least one of the following is true:
|
|
- User explicitly clicks "Advanced Settings"
|
|
- User selects "Custom" preset
|
|
- User has previously saved custom settings
|
|
|
|
### 2. Conservative Defaults
|
|
|
|
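Presets default to conservative settings; only `table_heavy` (wireless tables enabled, threshold 0.5) and `mixed` (threshold 0.55) deliberately relax them: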
|
|
- `enable_wireless_table: false` (most aggressive, causes cell explosion)
|
|
- `table_layout_threshold: 0.6+` (reduce false table detection)
|
|
- `use_doc_unwarping: false` (causes distortion)
|
|
|
|
### 3. Config Inheritance
|
|
|
|
A custom config inherits from the selected preset; only the fields explicitly specified by the user override the preset's values:
|
|
```python
|
|
# OCRConfig is a Pydantic model (no dict-style update()); apply overrides via copy(update=...)
|
|
final_config = PRESET_CONFIGS[preset].copy(update=custom_overrides)
|
|
```
|
|
|
|
### 4. No Patch Behaviors
|
|
|
|
All post-processing patches are disabled by default:
|
|
- `cell_validation_enabled: false`
|
|
- `gap_filling_enabled: false`
|
|
- `table_content_rebuilder_enabled: false`
|
|
|
|
The goal is to get correct PP-Structure output through proper configuration rather than by patching the results afterwards.
|