feat: simplify layout model selection and archive proposals

Changes:
- Replace PP-Structure 7-parameter tuning UI (6 sliders + 1 select) with a simple 3-option layout model selector
- Add layout model mapping: chinese (PP-DocLayout-S), default (PubLayNet), cdla
- Add LayoutModelSelector component and zh-TW translations
- Fix "default" model behavior with sentinel value for PubLayNet
- Add gap filling service for OCR track coverage improvement
- Add PP-Structure debug utilities
- Archive completed/incomplete proposals:
  - add-ocr-track-gap-filling (complete)
  - fix-ocr-track-table-rendering (incomplete)
  - simplify-ppstructure-model-selection (22/25 tasks)
- Add new layout model tests, archive old PP-Structure param tests
- Update OpenSpec ocr-processing spec with layout model requirements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-27 13:27:00 +08:00
parent c65df754cf
commit 59206a6ab8
35 changed files with 3621 additions and 658 deletions

View File

@@ -0,0 +1,110 @@
import { cn } from '@/lib/utils'
import { Check, FileText, Globe, BookOpen } from 'lucide-react'
import { useTranslation } from 'react-i18next'
import type { LayoutModel } from '@/types/apiV2'
/** Props for the LayoutModelSelector component (fully controlled). */
interface LayoutModelSelectorProps {
/** Currently selected layout model; the option equal to this value is highlighted. */
value: LayoutModel
/** Invoked with the model identifier of the option the user clicked. */
onChange: (model: LayoutModel) => void
/** When true, every option button is disabled and dimmed. Defaults to false. */
disabled?: boolean
/** Additional class names merged onto the root container. */
className?: string
}
/**
 * Icon shown in each option card, keyed by layout model.
 * Typed as a Record over LayoutModel so that adding a new model without an
 * icon is a compile-time error.
 */
const MODEL_ICONS: Record<LayoutModel, React.ReactNode> = {
chinese: <FileText className="w-5 h-5" />,
default: <Globe className="w-5 h-5" />,
cdla: <BookOpen className="w-5 h-5" />,
}
/**
 * Card-style single-choice picker for the layout detection model.
 *
 * Renders one button per layout model (icon, translated label, translated
 * description) and reports clicks through `onChange`. The component is fully
 * controlled: the highlighted option is whichever one equals `value`.
 *
 * Accessibility: the selected state is exposed through `aria-pressed` (it was
 * previously conveyed by colour and a check icon only), the option list is a
 * labelled group, and purely decorative icons are hidden from assistive tech.
 *
 * @param value     Currently selected model.
 * @param onChange  Called with the clicked model identifier.
 * @param disabled  Disables every option button (default `false`).
 * @param className Extra classes merged onto the root container.
 */
export default function LayoutModelSelector({
  value,
  onChange,
  disabled = false,
  className,
}: LayoutModelSelectorProps) {
  const { t } = useTranslation()
  const title = t('processing.layoutModel.title')

  // Display order of the options.
  const models: LayoutModel[] = ['chinese', 'default', 'cdla']

  // Translation keys are derived from the model identifier:
  // `processing.layoutModel.<model>` and `processing.layoutModel.<model>Desc`.
  const getModelInfo = (model: LayoutModel) => ({
    label: t(`processing.layoutModel.${model}`),
    description: t(`processing.layoutModel.${model}Desc`),
  })

  return (
    <div className={cn('border rounded-lg p-4 bg-white', className)}>
      {/* Header */}
      <div className="flex items-center gap-2 mb-4">
        <FileText className="w-5 h-5 text-gray-600" aria-hidden="true" />
        <h3 className="text-lg font-semibold text-gray-900">{title}</h3>
      </div>

      {/* Model Options */}
      <div className="space-y-3" role="group" aria-label={title}>
        {models.map((model) => {
          const info = getModelInfo(model)
          const isSelected = value === model

          return (
            <button
              key={model}
              type="button"
              disabled={disabled}
              aria-pressed={isSelected}
              onClick={() => onChange(model)}
              className={cn(
                'w-full flex items-start gap-4 p-4 rounded-lg border-2 transition-all text-left',
                isSelected ? 'border-blue-500 bg-blue-50' : 'border-gray-200',
                // Hover feedback only for options that can actually be picked;
                // a disabled card must not react to the pointer.
                !isSelected && !disabled && 'hover:border-gray-300 hover:bg-gray-50',
                disabled && 'opacity-50 cursor-not-allowed'
              )}
            >
              {/* Icon (decorative) */}
              <div
                aria-hidden="true"
                className={cn(
                  'p-2 rounded-lg flex-shrink-0',
                  isSelected ? 'bg-blue-100 text-blue-600' : 'bg-gray-100 text-gray-500'
                )}
              >
                {MODEL_ICONS[model]}
              </div>

              {/* Content */}
              <div className="flex-1 min-w-0">
                <div className="flex items-center gap-2">
                  <span
                    className={cn(
                      'font-medium',
                      isSelected ? 'text-blue-700' : 'text-gray-900'
                    )}
                  >
                    {info.label}
                  </span>
                  {/* Only the Chinese-document model carries the "recommended" badge. */}
                  {model === 'chinese' && (
                    <span className="text-xs bg-green-100 text-green-700 px-2 py-0.5 rounded-full">
                      {t('processing.layoutModel.recommended')}
                    </span>
                  )}
                </div>
                <p className="text-sm text-gray-500 mt-1">{info.description}</p>
              </div>

              {/* Check mark (decorative; state is announced via aria-pressed) */}
              {isSelected && (
                <div className="flex-shrink-0" aria-hidden="true">
                  <Check className="w-5 h-5 text-blue-600" />
                </div>
              )}
            </button>
          )
        })}
      </div>

      {/* Info Note */}
      <div className="mt-4 p-3 bg-blue-50 border border-blue-200 rounded-md">
        <p className="text-sm text-blue-800">
          {t('processing.layoutModel.note')}
        </p>
      </div>
    </div>
  )
}

View File

@@ -1,408 +0,0 @@
import { useState, useEffect } from 'react'
import { Settings, RotateCcw, HelpCircle, Save, Upload, Download, Check, AlertCircle } from 'lucide-react'
import { cn } from '@/lib/utils'
import type { PPStructureV3Params } from '@/types/apiV2'
// localStorage key, presumably intended for user-saved presets.
// NOTE(review): not referenced anywhere else in the visible code — confirm whether it is still needed.
const STORAGE_KEY = 'pp_structure_params_presets'
// localStorage key under which the most recently used parameter set is persisted as JSON.
const LAST_USED_KEY = 'pp_structure_params_last_used'
/** Props for the PPStructureParams component (fully controlled). */
interface PPStructureParamsProps {
/** Current parameter overrides; an empty object means nothing has been customized. */
value: PPStructureV3Params
/** Invoked with the complete new parameter object whenever a control, preset, import or reset changes it. */
onChange: (params: PPStructureV3Params) => void
/** When true, the preset selector, action buttons and parameter inputs are disabled. Defaults to false. */
disabled?: boolean
/** Additional class names merged onto the root container. */
className?: string
}
/** Describes a numeric parameter that is rendered as a range slider. */
interface ParamConfig {
/** Field of PPStructureV3Params this control edits; also used as the input's DOM id. */
key: keyof PPStructureV3Params
/** Caption shown next to the control. */
label: string
/** Help text shown in the tooltip. */
description: string
/** Lower bound of the slider. */
min: number
/** Upper bound of the slider. */
max: number
/** Slider increment. */
step: number
/** Value displayed while the parameter has not been set in `value`. */
default: number
/** Discriminant: selects the slider rendering branch. */
type: 'slider'
}
/** Describes an enumerated parameter that is rendered as a drop-down. */
interface SelectParamConfig {
/** Field of PPStructureV3Params this control edits; also used as the select's DOM id. */
key: keyof PPStructureV3Params
/** Caption shown next to the control. */
label: string
/** Help text shown in the tooltip. */
description: string
/** Choices offered in the drop-down (`value` is stored, `label` is displayed). */
options: Array<{ value: string; label: string }>
/** Option value displayed while the parameter has not been set in `value`. */
default: string
/** Discriminant: selects the drop-down rendering branch. */
type: 'select'
}
/**
 * Built-in parameter presets offered in the preset drop-down.
 *
 * - `default`: empty object, i.e. no overrides are sent.
 * - `high-quality`: lower thresholds.
 * - `fast`: higher thresholds.
 *
 * The table is type-annotated rather than built with `as` assertions so that a
 * misspelled parameter name or an invalid value is rejected by the compiler
 * instead of being silently accepted.
 */
const PRESETS: Record<'default' | 'high-quality' | 'fast', PPStructureV3Params> = {
  default: {},
  'high-quality': {
    layout_detection_threshold: 0.1,
    layout_nms_threshold: 0.15,
    text_det_thresh: 0.1,
    text_det_box_thresh: 0.2,
    layout_merge_bboxes_mode: 'small',
  },
  fast: {
    layout_detection_threshold: 0.3,
    layout_nms_threshold: 0.3,
    text_det_thresh: 0.3,
    text_det_box_thresh: 0.4,
    layout_merge_bboxes_mode: 'large',
  },
}
// Slider range shared by every 0..1 threshold parameter.
const THRESHOLD_RANGE = { min: 0, max: 1, step: 0.05 }
// Slider range shared by both unclip-ratio parameters.
const UNCLIP_RATIO_RANGE = { min: 0.5, max: 3.0, step: 0.1 }

/**
 * Declarative description of every tunable parameter, in display order.
 * Each entry drives one control: `type: 'slider'` renders a range input,
 * `type: 'select'` renders a drop-down.
 */
const PARAM_CONFIGS: Array<ParamConfig | SelectParamConfig> = [
  {
    type: 'slider',
    key: 'layout_detection_threshold',
    label: 'Layout Detection Threshold',
    description: 'Lower = detect more blocks (including weak signals), Higher = only high-confidence blocks',
    ...THRESHOLD_RANGE,
    default: 0.2,
  },
  {
    type: 'slider',
    key: 'layout_nms_threshold',
    label: 'Layout NMS Threshold',
    description: 'Lower = aggressive overlap removal, Higher = allow more overlapping boxes',
    ...THRESHOLD_RANGE,
    default: 0.2,
  },
  {
    type: 'select',
    key: 'layout_merge_bboxes_mode',
    label: 'Layout Merge Mode',
    description: 'Bounding box merging strategy',
    options: [
      { value: 'small', label: 'Small (Conservative)' },
      { value: 'union', label: 'Union (Balanced)' },
      { value: 'large', label: 'Large (Aggressive)' },
    ],
    default: 'small',
  },
  {
    type: 'slider',
    key: 'layout_unclip_ratio',
    label: 'Layout Unclip Ratio',
    description: 'Larger = looser bounding boxes, Smaller = tighter bounding boxes',
    ...UNCLIP_RATIO_RANGE,
    default: 1.2,
  },
  {
    type: 'slider',
    key: 'text_det_thresh',
    label: 'Text Detection Threshold',
    description: 'Lower = detect more small/low-contrast text, Higher = cleaner but may miss text',
    ...THRESHOLD_RANGE,
    default: 0.2,
  },
  {
    type: 'slider',
    key: 'text_det_box_thresh',
    label: 'Text Box Threshold',
    description: 'Lower = more text boxes retained, Higher = fewer false positives',
    ...THRESHOLD_RANGE,
    default: 0.3,
  },
  {
    type: 'slider',
    key: 'text_det_unclip_ratio',
    label: 'Text Unclip Ratio',
    description: 'Larger = looser text boxes, Smaller = tighter text boxes',
    ...UNCLIP_RATIO_RANGE,
    default: 1.2,
  },
]
/**
 * Validates untrusted JSON (an imported file or a localStorage entry) against
 * PARAM_CONFIGS and returns only the recognised parameters.
 *
 * Unknown keys are dropped. A known key with a value of the wrong type, a
 * slider value outside its [min, max] range, or a select value that is not
 * one of the configured options is rejected.
 *
 * @throws Error when `raw` is not a plain object or a known key holds an invalid value.
 */
function sanitizeParams(raw: unknown): PPStructureV3Params {
  if (typeof raw !== 'object' || raw === null || Array.isArray(raw)) {
    throw new Error('Parameters must be a JSON object')
  }
  const source = raw as Record<string, unknown>
  const clean: Record<string, number | string> = {}

  for (const config of PARAM_CONFIGS) {
    const candidate = source[config.key]
    if (candidate === undefined) continue

    if (config.type === 'slider') {
      const inRange =
        typeof candidate === 'number' &&
        Number.isFinite(candidate) &&
        candidate >= config.min &&
        candidate <= config.max
      if (!inRange) {
        throw new Error(`Invalid value for ${config.key}`)
      }
      clean[config.key] = candidate as number
    } else {
      if (typeof candidate !== 'string' || !config.options.some((o) => o.value === candidate)) {
        throw new Error(`Invalid value for ${config.key}`)
      }
      clean[config.key] = candidate
    }
  }

  // Every entry has been checked against its config above, so the assertion is sound.
  return clean as PPStructureV3Params
}

/** Tailwind classes shared by the Reset / Export / Import action buttons. */
function actionButtonClass(isInactive: boolean): string {
  return cn(
    'flex items-center gap-1 px-3 py-1.5 text-sm rounded-md transition-colors',
    isInactive
      ? 'bg-gray-200 text-gray-400 cursor-not-allowed'
      : 'bg-white border border-gray-300 text-gray-700 hover:bg-gray-50'
  )
}

/**
 * Collapsible editor for PP-StructureV3 fine-tuning parameters.
 *
 * Fully controlled through `value` / `onChange`. It also:
 * - restores the last used parameter set from localStorage on mount (only
 *   when `value` is empty) and persists every non-empty `value`;
 * - offers built-in presets, reset, and JSON export / import.
 *
 * Imported and restored JSON is validated before it reaches `onChange`.
 * Applying an empty parameter set (reset, "default" preset, or an imported
 * empty object) also clears the persisted entry, so a reset survives a remount.
 *
 * @param value     Current parameter overrides (empty object = nothing customized).
 * @param onChange  Called with the complete new parameter object.
 * @param disabled  Disables all inputs and actions (default `false`).
 * @param className Extra classes merged onto the root container.
 */
export default function PPStructureParams({
  value,
  onChange,
  disabled = false,
  className,
}: PPStructureParamsProps) {
  const [showTooltip, setShowTooltip] = useState<string | null>(null)
  const [isExpanded, setIsExpanded] = useState(false)
  const [selectedPreset, setSelectedPreset] = useState<string>('custom')
  const [showSaveSuccess, setShowSaveSuccess] = useState(false)
  const [importError, setImportError] = useState<string | null>(null)

  // Load last used parameters on mount.
  useEffect(() => {
    try {
      const lastUsed = localStorage.getItem(LAST_USED_KEY)
      if (lastUsed && Object.keys(value).length === 0) {
        onChange(sanitizeParams(JSON.parse(lastUsed)))
      }
    } catch (error) {
      console.error('Failed to load last used parameters:', error)
    }
    // Intentionally mount-only: later `value` / `onChange` changes must not re-trigger a restore.
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [])

  // Save to localStorage when parameters change.
  useEffect(() => {
    if (Object.keys(value).length > 0) {
      try {
        localStorage.setItem(LAST_USED_KEY, JSON.stringify(value))
      } catch (error) {
        console.error('Failed to save parameters:', error)
      }
    }
  }, [value])

  // Auto-hide the "Saved" badge; the cleanup cancels the timer on unmount or
  // when the badge is hidden early, so no state update fires on a dead component.
  useEffect(() => {
    if (!showSaveSuccess) return undefined
    const timer = setTimeout(() => setShowSaveSuccess(false), 3000)
    return () => clearTimeout(timer)
  }, [showSaveSuccess])

  // Removes the persisted parameter set so an empty selection is not overridden on the next mount.
  const forgetLastUsed = () => {
    try {
      localStorage.removeItem(LAST_USED_KEY)
    } catch (error) {
      console.error('Failed to clear saved parameters:', error)
    }
  }

  // Single entry point for replacing the whole parameter object.
  const applyParams = (params: PPStructureV3Params) => {
    onChange(params)
    if (Object.keys(params).length === 0) {
      forgetLastUsed()
    }
  }

  const handleReset = () => {
    applyParams({})
    setSelectedPreset('default')
    setShowSaveSuccess(false)
    setImportError(null)
  }

  const handlePresetChange = (presetKey: string) => {
    setSelectedPreset(presetKey)
    if (presetKey === 'custom') return
    const preset = PRESETS[presetKey as keyof typeof PRESETS]
    if (preset) {
      applyParams(preset)
      setShowSaveSuccess(false)
      setImportError(null)
    }
  }

  const handleChange = (key: keyof PPStructureV3Params, newValue: number | string) => {
    const newParams = {
      ...value,
      [key]: newValue,
    }
    onChange(newParams)
    setSelectedPreset('custom')
    setImportError(null)
  }

  const handleExport = () => {
    const dataStr = JSON.stringify(value, null, 2)
    const dataUri = 'data:application/json;charset=utf-8,' + encodeURIComponent(dataStr)
    const exportFileDefaultName = 'pp_structure_params.json'
    const linkElement = document.createElement('a')
    linkElement.setAttribute('href', dataUri)
    linkElement.setAttribute('download', exportFileDefaultName)
    linkElement.click()
  }

  const handleImport = () => {
    const input = document.createElement('input')
    input.type = 'file'
    input.accept = 'application/json'
    input.onchange = (e) => {
      const file = (e.target as HTMLInputElement).files?.[0]
      if (!file) return

      const reader = new FileReader()
      reader.onload = () => {
        try {
          const text = typeof reader.result === 'string' ? reader.result : ''
          applyParams(sanitizeParams(JSON.parse(text)))
          setSelectedPreset('custom')
          setImportError(null)
          setShowSaveSuccess(true)
        } catch (error) {
          console.error('Failed to import parameters:', error)
          setShowSaveSuccess(false)
          setImportError('Import failed: invalid parameter file')
        }
      }
      reader.onerror = () => {
        console.error('Failed to read parameter file:', reader.error)
        setShowSaveSuccess(false)
        setImportError('Import failed: could not read file')
      }
      reader.readAsText(file)
    }
    input.click()
  }

  const hasCustomValues = Object.keys(value).length > 0

  return (
    <div className={cn('border rounded-lg p-4 bg-white', className)}>
      {/* Header */}
      <div className="flex items-center justify-between mb-4">
        <div className="flex items-center gap-2">
          <Settings className="w-5 h-5 text-gray-600" />
          <h3 className="text-lg font-semibold text-gray-900">PP-StructureV3 Parameters</h3>
          {hasCustomValues && (
            <span className="text-xs bg-blue-100 text-blue-700 px-2 py-1 rounded">Custom</span>
          )}
          {showSaveSuccess && (
            <span className="flex items-center gap-1 text-xs bg-green-100 text-green-700 px-2 py-1 rounded animate-in fade-in">
              <Check className="w-3 h-3" />
              Saved
            </span>
          )}
          {importError && (
            <span
              role="alert"
              className="flex items-center gap-1 text-xs bg-red-100 text-red-700 px-2 py-1 rounded"
            >
              <AlertCircle className="w-3 h-3" />
              {importError}
            </span>
          )}
        </div>
        <div className="flex items-center gap-2">
          <button
            type="button"
            aria-expanded={isExpanded}
            onClick={() => setIsExpanded(!isExpanded)}
            className="text-sm text-blue-600 hover:text-blue-700 px-3 py-1.5 rounded-md hover:bg-blue-50"
          >
            {isExpanded ? 'Hide' : 'Show'} Parameters
          </button>
        </div>
      </div>

      {/* Preset Selector & Actions */}
      {isExpanded && (
        <div className="mb-4 p-3 bg-gray-50 rounded-md space-y-3">
          <div className="flex items-center gap-3">
            <label className="text-sm font-medium text-gray-700">Preset:</label>
            <select
              value={selectedPreset}
              onChange={(e) => handlePresetChange(e.target.value)}
              disabled={disabled}
              className="flex-1 px-3 py-1.5 text-sm border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100"
            >
              <option value="default">Default (Backend Settings)</option>
              <option value="high-quality">High Quality (Lower Thresholds)</option>
              <option value="fast">Fast (Higher Thresholds)</option>
              <option value="custom">Custom</option>
            </select>
          </div>
          <div className="flex items-center gap-2">
            <button
              type="button"
              onClick={handleReset}
              disabled={disabled || !hasCustomValues}
              className={actionButtonClass(disabled || !hasCustomValues)}
            >
              <RotateCcw className="w-4 h-4" />
              Reset
            </button>
            <button
              type="button"
              onClick={handleExport}
              disabled={disabled || !hasCustomValues}
              className={actionButtonClass(disabled || !hasCustomValues)}
            >
              <Download className="w-4 h-4" />
              Export
            </button>
            <button
              type="button"
              onClick={handleImport}
              disabled={disabled}
              className={actionButtonClass(disabled)}
            >
              <Upload className="w-4 h-4" />
              Import
            </button>
          </div>
        </div>
      )}

      {/* Expanded Parameters */}
      {isExpanded && (
        <div className="space-y-6 pt-4 border-t">
          {PARAM_CONFIGS.map((config) => (
            <div key={config.key} className="space-y-2">
              <div className="flex items-center justify-between">
                <div className="flex items-center gap-2">
                  <label htmlFor={config.key} className="text-sm font-medium text-gray-700">
                    {config.label}
                  </label>
                  {/* Tooltip opens on hover and on keyboard focus. */}
                  <button
                    type="button"
                    aria-label={`Help: ${config.label}`}
                    onMouseEnter={() => setShowTooltip(config.key)}
                    onMouseLeave={() => setShowTooltip(null)}
                    onFocus={() => setShowTooltip(config.key)}
                    onBlur={() => setShowTooltip(null)}
                    className="text-gray-400 hover:text-gray-600 relative"
                  >
                    <HelpCircle className="w-4 h-4" />
                    {showTooltip === config.key && (
                      <div
                        role="tooltip"
                        className="absolute left-6 top-0 w-64 p-2 bg-gray-900 text-white text-xs rounded shadow-lg z-10"
                      >
                        {config.description}
                      </div>
                    )}
                  </button>
                </div>
                {config.type === 'slider' && (
                  <div className="flex items-center gap-2">
                    <span className="text-sm font-semibold text-blue-600">
                      {value[config.key] ?? config.default}
                    </span>
                    {value[config.key] !== undefined && value[config.key] !== config.default && (
                      <span className="text-xs text-gray-500">
                        (default: {config.default})
                      </span>
                    )}
                  </div>
                )}
              </div>
              {config.type === 'slider' ? (
                <input
                  type="range"
                  id={config.key}
                  min={config.min}
                  max={config.max}
                  step={config.step}
                  value={value[config.key] ?? config.default}
                  onChange={(e) => handleChange(config.key, parseFloat(e.target.value))}
                  disabled={disabled}
                  className="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer disabled:cursor-not-allowed disabled:opacity-50"
                />
              ) : (
                <select
                  id={config.key}
                  value={(value[config.key] as string) ?? config.default}
                  onChange={(e) => handleChange(config.key, e.target.value)}
                  disabled={disabled}
                  className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed"
                >
                  {config.options.map((option) => (
                    <option key={option.value} value={option.value}>
                      {option.label}
                    </option>
                  ))}
                </select>
              )}
            </div>
          ))}

          {/* Info Note */}
          <div className="mt-4 p-3 bg-blue-50 border border-blue-200 rounded-md">
            <p className="text-sm text-blue-800">
              <strong>Note:</strong> These parameters only apply when using the OCR track. Adjusting them
              can help improve accuracy for specific document types.
            </p>
          </div>
        </div>
      )}

      {/* Collapsed Summary */}
      {!isExpanded && hasCustomValues && (
        <div className="text-sm text-gray-600">
          {Object.keys(value).length} parameter(s) customized
        </div>
      )}
    </div>
  )
}

View File

@@ -52,6 +52,17 @@
"language": "識別語言",
"threshold": "信心度閾值",
"layoutDetection": "版面偵測"
},
"layoutModel": {
"title": "版面偵測模型",
"chinese": "中文文件模型",
"chineseDesc": "PP-DocLayout-S - 適用於中文表單、合約、發票(推薦)",
"default": "標準模型",
"defaultDesc": "PubLayNet 模型 - 適用於英文學術論文、報告",
"cdla": "CDLA 模型",
"cdlaDesc": "專用中文版面分析模型 - 適用於複雜中文版面",
"recommended": "推薦",
"note": "版面模型會影響文件結構(表格、文字區塊、圖片)的偵測效果。請根據您的文件類型選擇適合的模型。"
}
},
"results": {

View File

@@ -9,10 +9,10 @@ import { Badge } from '@/components/ui/badge'
import { useToast } from '@/components/ui/toast'
import { apiClientV2 } from '@/services/apiV2'
import { Play, CheckCircle, FileText, AlertCircle, Clock, Activity, Loader2 } from 'lucide-react'
import PPStructureParams from '@/components/PPStructureParams'
import LayoutModelSelector from '@/components/LayoutModelSelector'
import TaskNotFound from '@/components/TaskNotFound'
import { useTaskValidation } from '@/hooks/useTaskValidation'
import type { PPStructureV3Params, ProcessingOptions } from '@/types/apiV2'
import type { LayoutModel, ProcessingOptions } from '@/types/apiV2'
export default function ProcessingPage() {
const { t } = useTranslation()
@@ -31,8 +31,8 @@ export default function ProcessingPage() {
},
})
// PP-StructureV3 parameters state
const [ppStructureParams, setPpStructureParams] = useState<PPStructureV3Params>({})
// Layout model state (default to 'chinese' for best Chinese document support)
const [layoutModel, setLayoutModel] = useState<LayoutModel>('chinese')
// Start OCR processing
const processOCRMutation = useMutation({
@@ -40,11 +40,7 @@ export default function ProcessingPage() {
const options: ProcessingOptions = {
use_dual_track: true,
language: 'ch',
}
// Only include pp_structure_params if user has customized them
if (Object.keys(ppStructureParams).length > 0) {
options.pp_structure_params = ppStructureParams
layout_model: layoutModel,
}
return apiClientV2.startTask(taskId!, options)
@@ -346,11 +342,11 @@ export default function ProcessingPage() {
</Card>
)}
{/* PP-StructureV3 Parameters (only show when task is pending) */}
{/* Layout Model Selection (only show when task is pending) */}
{isPending && (
<PPStructureParams
value={ppStructureParams}
onChange={setPpStructureParams}
<LayoutModelSelector
value={layoutModel}
onChange={setLayoutModel}
disabled={processOCRMutation.isPending}
/>
)}

View File

@@ -73,15 +73,14 @@ export interface DocumentAnalysisResponse {
page_count: number | null
}
/**
 * Optional fine-tuning overrides for PP-StructureV3 (OCR track only).
 * Every field is optional, so an empty object is a valid value.
 * NOTE(review): presumably the backend applies its own defaults for omitted
 * fields (the UI labels the empty preset "Default (Backend Settings)") — confirm.
 */
export interface PPStructureV3Params {
layout_detection_threshold?: number // 0-1: Lower=more blocks, Higher=high confidence only
layout_nms_threshold?: number // 0-1: Lower=aggressive overlap removal, Higher=allow more overlap
layout_merge_bboxes_mode?: 'union' | 'large' | 'small' // small=conservative, large=aggressive, union=middle
layout_unclip_ratio?: number // >0: Larger=looser boxes, Smaller=tighter boxes
text_det_thresh?: number // 0-1: Lower=detect more small/low-contrast text, Higher=cleaner
text_det_box_thresh?: number // 0-1: Lower=more text boxes, Higher=fewer false positives
text_det_unclip_ratio?: number // >0: Larger=looser text boxes, Smaller=tighter boxes
}
/**
 * Layout detection model selection for the OCR track.
 * Different models are optimized for different document types:
 * - chinese: PP-DocLayout-S - best for Chinese forms, contracts, invoices
 * - default: PubLayNet-based - best for English academic papers
 * - cdla: specialized for Chinese document layout analysis
 *
 * The literal is sent as `layout_model` in ProcessingOptions. The selector UI
 * also builds its translation keys from it (`processing.layoutModel.<model>`
 * and `processing.layoutModel.<model>Desc`), so adding or renaming a member
 * requires matching entries in the locale files.
 */
export type LayoutModel = 'chinese' | 'default' | 'cdla'
export interface ProcessingOptions {
use_dual_track?: boolean
@@ -89,7 +88,7 @@ export interface ProcessingOptions {
language?: string
include_layout?: boolean
include_images?: boolean
pp_structure_params?: PPStructureV3Params // Fine-tuning parameters for PP-StructureV3 (OCR track only)
layout_model?: LayoutModel // Layout detection model selection (OCR track only)
}
export interface TaskCreate {