fix: OCR track table data format and image cropping

Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issues: OCR track images are restored but still have quality issues
that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 18:48:15 +08:00
parent a227311b2d
commit 6e050eb540
8 changed files with 585 additions and 30 deletions

View File

@@ -350,7 +350,19 @@ class OCRToUnifiedConverter:
element_type = elem_data.get('type', ElementType.TEXT) element_type = elem_data.get('type', ElementType.TEXT)
if isinstance(element_type, str): if isinstance(element_type, str):
# Convert string to ElementType if needed # Convert string to ElementType if needed
element_type = ElementType[element_type] if element_type in ElementType.__members__ else ElementType.TEXT # ElementType is a str-based enum, so we can construct from value (lowercase)
try:
element_type = ElementType(element_type)
except ValueError:
# If value doesn't match, try member name (uppercase)
element_type = ElementType[element_type.upper()] if element_type.upper() in ElementType.__members__ else ElementType.TEXT
# Content-based reclassification: detect HTML tables in text content
content_str = elem_data.get('content', '')
if isinstance(content_str, str) and '<table' in content_str.lower():
if element_type == ElementType.TEXT:
logger.info(f"Element {elem_data.get('element_id')}: Reclassifying TEXT to TABLE (HTML table in content)")
element_type = ElementType.TABLE
# Prepare content based on element type # Prepare content based on element type
if element_type == ElementType.TABLE: if element_type == ElementType.TABLE:
@@ -538,7 +550,12 @@ class OCRToUnifiedConverter:
return None return None
def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]: def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
"""Extract table data from element.""" """
Extract table data from element using BeautifulSoup for robust HTML parsing.
This method produces TableData objects with fully populated cells arrays,
matching the format produced by DirectExtractionEngine for consistency.
"""
try: try:
html = elem_data.get('html', '') html = elem_data.get('html', '')
extracted_text = elem_data.get('extracted_text', '') extracted_text = elem_data.get('extracted_text', '')
@@ -550,31 +567,101 @@ class OCRToUnifiedConverter:
html = content html = content
logger.debug("Using content field as HTML table source") logger.debug("Using content field as HTML table source")
# Try to parse HTML to get rows and columns # Return None if no HTML table content
rows = 0 if not html or '<table' not in html.lower():
if extracted_text:
# Return minimal TableData with just caption if we have text
return TableData(rows=0, cols=0, cells=[], caption=extracted_text)
return None
# Parse HTML table using BeautifulSoup
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if not table:
logger.warning("No <table> element found in HTML")
return self._fallback_table_data(html, extracted_text)
cells = []
headers = []
rows = table.find_all('tr')
# Track actual column positions accounting for rowspan/colspan
# This is a simplified approach - complex spanning may need enhancement
for row_idx, row in enumerate(rows):
row_cells = row.find_all(['td', 'th'])
col_idx = 0
for cell in row_cells:
cell_content = cell.get_text(strip=True)
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
cells.append(TableCell(
row=row_idx,
col=col_idx,
row_span=rowspan,
col_span=colspan,
content=cell_content
))
# Collect headers from <th> elements or first row
if cell.name == 'th' or row_idx == 0:
headers.append(cell_content)
# Advance column index by colspan
col_idx += colspan
# Calculate actual dimensions
num_rows = len(rows)
num_cols = max(
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
for row in rows
) if rows else 0
logger.debug(
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
)
return TableData(
rows=num_rows,
cols=num_cols,
cells=cells,
headers=headers if headers else None,
caption=extracted_text if extracted_text else None
)
except ImportError:
logger.warning("BeautifulSoup not available, using fallback parsing")
return self._fallback_table_data(html, extracted_text)
except Exception as e:
logger.warning(f"Failed to extract table data: {e}")
return None
def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]:
"""
Fallback table parsing when BeautifulSoup is not available.
Returns basic TableData with row/col counts only (no cells).
"""
try:
rows = html.count('<tr')
cols = 0 cols = 0
cells = [] if rows > 0:
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
if html:
# Simple HTML parsing (could be enhanced with BeautifulSoup)
rows = html.count('<tr')
if rows > 0:
# Estimate columns from first row
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
# Return None if no valid table data found
if rows == 0 and cols == 0 and not extracted_text: if rows == 0 and cols == 0 and not extracted_text:
return None return None
# Note: TableData uses 'cols' not 'columns'
# HTML content can be stored as caption or in element metadata
return TableData( return TableData(
rows=rows, rows=rows,
cols=cols, cols=cols,
cells=cells, cells=[], # Empty cells in fallback mode
caption=extracted_text if extracted_text else None caption=extracted_text if extracted_text else None
) )
except: except:
@@ -653,9 +740,9 @@ class OCRToUnifiedConverter:
min_distance = float('inf') min_distance = float('inf')
for target in targets: for target in targets:
# Caption should be below the target # Caption should be below the target (y1 is bottom in BoundingBox)
if target.bbox.y2 <= caption.bbox.y1: if target.bbox.y1 <= caption.bbox.y0:
distance = caption.bbox.y1 - target.bbox.y2 distance = caption.bbox.y0 - target.bbox.y1
if distance < min_distance: if distance < min_distance:
min_distance = distance min_distance = distance
best_target = target best_target = target
@@ -684,8 +771,8 @@ class OCRToUnifiedConverter:
else: else:
prev_item = list_items[i-1] prev_item = list_items[i-1]
# Check if items are consecutive (similar x position, reasonable y gap) # Check if items are consecutive (similar x position, reasonable y gap)
x_aligned = abs(item.bbox.x1 - prev_item.bbox.x1) < 20 x_aligned = abs(item.bbox.x0 - prev_item.bbox.x0) < 20
y_consecutive = (item.bbox.y1 - prev_item.bbox.y2) < 30 y_consecutive = (item.bbox.y0 - prev_item.bbox.y1) < 30
if x_aligned and y_consecutive: if x_aligned and y_consecutive:
current_group.append(item) current_group.append(item)
@@ -714,11 +801,11 @@ class OCRToUnifiedConverter:
if i + 1 < len(headers): if i + 1 < len(headers):
next_header_y = headers[i + 1].bbox.y1 next_header_y = headers[i + 1].bbox.y1
# Find all elements between headers # Find all elements between headers (y0=top, y1=bottom)
content_elements = [ content_elements = [
e for e in elements e for e in elements
if (e.bbox.y1 > header.bbox.y2 and if (e.bbox.y0 > header.bbox.y1 and
e.bbox.y1 < next_header_y and e.bbox.y0 < next_header_y and
e.type not in [ElementType.HEADER, ElementType.TITLE]) e.type not in [ElementType.HEADER, ElementType.TITLE])
] ]

View File

@@ -167,7 +167,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found # Process parsing_res_list if found
if parsing_res_list: if parsing_res_list:
elements = self._process_parsing_res_list( elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir parsing_res_list, current_page, output_dir, image_path
) )
all_elements.extend(elements) all_elements.extend(elements)
@@ -229,7 +229,8 @@ class PPStructureEnhanced:
self, self,
parsing_res_list: List[Dict], parsing_res_list: List[Dict],
current_page: int, current_page: int,
output_dir: Optional[Path] output_dir: Optional[Path],
source_image_path: Optional[Path] = None
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
""" """
Process parsing_res_list to extract all elements. Process parsing_res_list to extract all elements.
@@ -238,6 +239,7 @@ class PPStructureEnhanced:
parsing_res_list: List of parsed elements from PP-StructureV3 parsing_res_list: List of parsed elements from PP-StructureV3
current_page: Current page number current_page: Current page number
output_dir: Optional output directory output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
Returns: Returns:
List of processed elements with normalized structure List of processed elements with normalized structure
@@ -327,6 +329,17 @@ class PPStructureEnhanced:
element['img_path'] = item['img_path'] # Keep original for reference element['img_path'] = item['img_path'] # Keep original for reference
else: else:
logger.warning(f"Failed to save image for element {element['element_id']}") logger.warning(f"Failed to save image for element {element['element_id']}")
# Crop image from source if no img_path but source image is available
elif source_image_path and output_dir and bbox != [0, 0, 0, 0]:
cropped_path = self._crop_and_save_image(
source_image_path, bbox, output_dir, element['element_id']
)
if cropped_path:
element['saved_path'] = cropped_path
element['img_path'] = cropped_path
logger.info(f"Cropped and saved image region for {element['element_id']}")
else:
logger.warning(f"Failed to crop image for element {element['element_id']}")
# Add any additional metadata # Add any additional metadata
if 'metadata' in item: if 'metadata' in item:
@@ -535,4 +548,62 @@ class PPStructureEnhanced:
img_obj.save(str(img_path)) img_obj.save(str(img_path))
logger.info(f"Saved image to {img_path}") logger.info(f"Saved image to {img_path}")
except Exception as e: except Exception as e:
logger.warning(f"Failed to save PIL image: {e}") logger.warning(f"Failed to save PIL image: {e}")
def _crop_and_save_image(
self,
source_image_path: Path,
bbox: List[float],
output_dir: Path,
element_id: str
) -> Optional[str]:
"""
Crop image region from source image and save to output directory.
Args:
source_image_path: Path to the source image
bbox: Bounding box [x1, y1, x2, y2]
output_dir: Output directory for saving cropped image
element_id: Element ID for naming
Returns:
Relative filename (not full path) to saved image, consistent with
Direct Track which stores "filename.png" that gets joined with
result_dir by pdf_generator_service.
"""
try:
from PIL import Image
# Open source image
with Image.open(source_image_path) as img:
# Ensure bbox values are integers
x1, y1, x2, y2 = [int(v) for v in bbox[:4]]
# Validate bbox
img_width, img_height = img.size
x1 = max(0, min(x1, img_width))
x2 = max(0, min(x2, img_width))
y1 = max(0, min(y1, img_height))
y2 = max(0, min(y2, img_height))
if x2 <= x1 or y2 <= y1:
logger.warning(f"Invalid bbox for cropping: {bbox}")
return None
# Crop the region
cropped = img.crop((x1, y1, x2, y2))
# Save directly to output directory (no subdirectory)
# Consistent with Direct Track which saves to output_dir directly
image_filename = f"{element_id}.png"
img_path = output_dir / image_filename
cropped.save(str(img_path), "PNG")
# Return just the filename (relative to result_dir)
# PDF generator will join with result_dir to get full path
logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)")
return image_filename
except Exception as e:
logger.error(f"Failed to crop and save image for {element_id}: {e}")
return None

View File

@@ -0,0 +1,84 @@
# Tool_OCR 架構說明與 UML
本文件概覽 Tool_OCR 的主要組件、資料流與雙軌處理OCR / Direct並附上 UML 關係圖以協助判斷改動的影響範圍。
## 系統分層與重點元件
- **API 層FastAPI**`app/main.py` 啟動應用、掛載路由(`routers/auth.py`, `routers/tasks.py`, `routers/admin.py`),並在 lifespan 初始化記憶體管理、服務池與併發控制。
- **任務/檔案管理**`task_service.py``file_access_service.py` 掌管任務 CRUD、路徑與權限`Task` / `TaskFile` 模型紀錄結果檔路徑。
- **核心處理服務**`OCRService``services/ocr_service.py`)負責雙軌路由與 OCR整合偵測、直抽、OCR、統一格式轉換、匯出與 PDF 生成。
- **雙軌偵測/直抽**`DocumentTypeDetector` 判斷走 Direct 或 OCR`DirectExtractionEngine` 使用 PyMuPDF 直接抽取文字/表格/圖片(必要時觸發混合模式補抽圖片)。
- **OCR 解析**PaddleOCR + `PPStructureEnhanced` 抽取 23 類元素;`OCRToUnifiedConverter` 轉成 `UnifiedDocument` 統一格式。
- **匯出/呈現**`UnifiedDocumentExporter` 產出 JSON/Markdown`pdf_generator_service.py` 產生版面保持 PDF前端透過 `/api/v2/tasks/{id}/download/*` 取得。
- **資源控管**`memory_manager.py`MemoryGuard、prediction semaphore、模型生命週期`service_pool.py``OCRService` 池)避免多重載模與 GPU 爆滿。
## 處理流程(任務層級)
1. **上傳**`POST /api/v2/upload` 建立 Task 並寫檔到 `uploads/`(含 SHA256、檔案資訊
2. **啟動**`POST /api/v2/tasks/{id}/start``ProcessingOptions`,可含 `pp_structure_params`)→ 背景 `process_task_ocr` 取得服務池中的 `OCRService`
3. **軌道決策**`DocumentTypeDetector.detect` 分析 MIME、PDF 文字覆蓋率或 Office 轉 PDF 後的抽樣結果:
- **Direct**`DirectExtractionEngine.extract` 產出 `UnifiedDocument`;若偵測缺圖則啟用混合模式呼叫 OCR 抽圖或渲染 inline 圖。
- **OCR**`process_file_traditional` → PaddleOCR + PP-Structure → `OCRToUnifiedConverter.convert` 產生 `UnifiedDocument`
-`ProcessingTrack` 記錄 `ocr` / `direct` / `hybrid`,處理時間與統計寫入 metadata。
4. **輸出保存**`UnifiedDocumentExporter``_result.json`(含 metadata、statistics`_output.md``pdf_generator_service` 產出 `_layout.pdf`;路徑回寫 DB。
5. **下載/檢視**:前端透過 `/download/json|markdown|pdf|unified` 取檔;`/metadata` 讀 JSON metadata 回傳統計與 `processing_track`
## 前端流程摘要
- `UploadPage`:呼叫 `apiClientV2.uploadFile`,首個 `task_id` 存於 `uploadStore.batchId`
- `ProcessingPage`:對 `batchId` 呼叫 `startTask`(預設 `use_dual_track=true`,支援自訂 `pp_structure_params`),輪詢狀態。
- `ResultsPage` / `TaskDetailPage`:使用 `getTask``getProcessingMetadata` 顯示 `processing_track`、統計並提供 JSON/Markdown/PDF/Unified 下載。
- `TaskHistoryPage`:列出任務、支援重新啟動、重試、下載。
## 共同模組與影響點
- **UnifiedDocument**`models/unified_document.py`)為 Direct/OCR 共用輸出格式;所有匯出/PDF/前端 track 顯示依賴其欄位與 metadata。
- **服務池/記憶體守護**Direct 與 OCR 共用同一 `OCRService` 實例池與 MemoryGuard新增資源或改動需確保遵循 acquire/release、清理與 semaphore 規則。
- **偵測閾值變更**`DocumentTypeDetector` 參數調整會影響 Direct 與 OCR 分流比例,間接改變 GPU 載荷與結果格式。
- **匯出/PDF**:任何 UnifiedDocument 結構變動會影響 JSON/Markdown/PDF 產出與前端下載/預覽;需同步維護轉換與匯出器。
## UML 關係圖Mermaid
```mermaid
classDiagram
class TasksRouter {
+upload_file()
+start_task()
+download_json/markdown/pdf/unified()
+get_metadata()
}
class TaskService {+create_task(); +update_task_status(); +get_task_by_id()}
class FileAccessService
class OCRService {
+process()
+process_with_dual_track()
+process_file_traditional()
+save_results()
}
class DocumentTypeDetector {+detect()}
class DirectExtractionEngine {+extract(); +check_document_for_missing_images()}
class OCRToUnifiedConverter {+convert()}
class UnifiedDocument
class UnifiedDocumentExporter {+export_to_json(); +export_to_markdown()}
class PDFGeneratorService {+generate_layout_pdf(); +generate_from_unified_document()}
class ServicePool {+acquire(); +release()}
class MemoryManager <<singleton>>
class OfficeConverter {+convert_to_pdf()}
class PPStructureEnhanced {+analyze_with_full_structure()}
TasksRouter --> TaskService
TasksRouter --> FileAccessService
TasksRouter --> OCRService : background process via process_task_ocr
OCRService --> DocumentTypeDetector : track recommendation
OCRService --> DirectExtractionEngine : direct track
OCRService --> OCRToUnifiedConverter : OCR track result -> UnifiedDocument
OCRService --> OfficeConverter : Office -> PDF
OCRService --> PPStructureEnhanced : layout analysis (PP-StructureV3)
OCRService --> UnifiedDocumentExporter : persist results
OCRService --> PDFGeneratorService : layout-preserving PDF
OCRService --> ServicePool : acquired instance
ServicePool --> MemoryManager : model lifecycle / GPU guard
UnifiedDocumentExporter --> UnifiedDocument
PDFGeneratorService --> UnifiedDocument
```
## 影響判斷指引
- **改 Direct/偵測邏輯**:會改變 `processing_track` 與結果格式;前端顯示與下載 JSON/Markdown/PDF 仍依賴 UnifiedDocument需驗證匯出與 PDF 生成。
- **改 OCR/PP-Structure 參數**:僅影響 OCR trackDirect track 不受 `pp_structure_params` 影響(符合 spec需維持 `processing_track` 填寫。
- **改 UnifiedDocument 結構/統計**:需同步 `UnifiedDocumentExporter``pdf_generator_service`、前端 `getProcessingMetadata`/下載端點。
- **改資源控管**:服務池或 MemoryGuard 調整會同時影響 Direct/OCR 執行時序與穩定性,須確保 acquire/release 與 semaphore 不被破壞。

View File

@@ -0,0 +1,173 @@
# Design: Fix OCR Track Table Data Format
## Context
The OCR processing pipeline has three modes:
1. **Direct Track**: Extracts structured data directly from native PDFs using `direct_extraction_engine.py`
2. **OCR Track**: Uses PP-StructureV3 for layout analysis and OCR, then converts results via `ocr_to_unified_converter.py`
3. **Hybrid Mode**: Uses Direct Track as primary, supplements with OCR Track for missing images only
Both tracks produce `UnifiedDocument` containing `DocumentElement` objects. For tables, the `content` field should contain a `TableData` object with populated `cells` array. However, OCR Track currently produces `TableData` with empty `cells`, causing PDF generation failures.
## Track Isolation Analysis (Safety Guarantee)
This section documents why the proposed changes will NOT affect Direct Track or Hybrid Mode.
### Code Flow Analysis
```
┌─────────────────────────────────────────────────────────────────────────┐
│ ocr_service.py │
├─────────────────────────────────────────────────────────────────────────┤
│ │
│ Direct Track ──► DirectExtractionEngine ──► UnifiedDocument │
│ (direct_extraction_engine.py) (tables: TableData ✓) │
│ [NOT MODIFIED] │
│ │
│ OCR Track ────► PP-StructureV3 ──► OCRToUnifiedConverter ──► UnifiedDoc│
│ (ocr_to_unified_converter.py) │
│ [MODIFIED: _extract_table_data] │
│ │
│ Hybrid Mode ──► Direct Track (primary) + OCR Track (images only) │
│ │ │ │
│ │ └──► _merge_ocr_images_into_ │
│ │ direct() merges ONLY: │
│ │ - ElementType.FIGURE │
│ │ - ElementType.IMAGE │
│ │ - ElementType.LOGO │
│ │ [Tables NOT merged] │
│ └──► Tables come from Direct Track (unchanged) │
└─────────────────────────────────────────────────────────────────────────┘
```
### Evidence from ocr_service.py
**Line 1610** (Hybrid mode merge logic):
```python
image_types = {ElementType.FIGURE, ElementType.IMAGE, ElementType.LOGO}
```
**Lines 1634-1635** (Only image types are merged):
```python
for element in ocr_page.elements:
if element.type in image_types: # Tables excluded
```
### Impact Matrix
| Mode | Table Source | Uses OCRToUnifiedConverter? | Affected by Change? |
|------|--------------|----------------------------|---------------------|
| Direct Track | `DirectExtractionEngine` | No | **No** |
| OCR Track | `OCRToUnifiedConverter` | Yes | **Yes (Fixed)** |
| Hybrid Mode | `DirectExtractionEngine` (tables) | Only for images | **No** |
### Conclusion
The fix is **isolated to OCR Track only**:
- Direct Track: Uses separate engine (`DirectExtractionEngine`), completely unaffected
- Hybrid Mode: Tables come from Direct Track; OCR Track is only used for image extraction
- OCR Track: Will benefit from the fix with proper `TableData` output
## Goals / Non-Goals
### Goals
- OCR Track table output format matches Direct Track format exactly
- PDF Generator receives consistent `TableData` objects from both tracks
- Robust HTML table parsing that handles real-world OCR output
### Non-Goals
- Modifying Direct Track behavior (it's the reference implementation)
- Changing the `TableData` or `TableCell` data models
- Modifying PDF Generator to handle HTML strings as a workaround
## Decisions
### Decision 1: Use BeautifulSoup for HTML Parsing
**Rationale**: The current regex/string-counting approach is fragile and cannot extract cell content. BeautifulSoup provides:
- Robust handling of malformed HTML (common in OCR output)
- Easy extraction of cell content, attributes (rowspan, colspan)
- Well-tested library already used in many Python projects
**Alternatives considered**:
- Manual regex parsing: Too fragile for complex tables
- lxml: More complex API, overkill for this use case
- html.parser (stdlib): Less tolerant of malformed HTML
### Decision 2: Maintain Backward Compatibility
**Rationale**: If BeautifulSoup parsing fails, fall back to current behavior (return `TableData` with basic row/col counts). This ensures existing functionality isn't broken.
### Decision 3: Single Point of Change
**Rationale**: Only modify `ocr_to_unified_converter.py`. This:
- Minimizes regression risk
- Keeps Direct Track untouched as reference
- Requires no changes to downstream PDF Generator
## Implementation Approach
```python
def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
"""Extract table data from element using BeautifulSoup."""
try:
html = elem_data.get('html', '') or elem_data.get('content', '')
if not html or '<table' not in html.lower():
return None
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if not table:
return None
cells = []
headers = []
rows = table.find_all('tr')
for row_idx, row in enumerate(rows):
row_cells = row.find_all(['td', 'th'])
for col_idx, cell in enumerate(row_cells):
cell_content = cell.get_text(strip=True)
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
cells.append(TableCell(
row=row_idx,
col=col_idx,
row_span=rowspan,
col_span=colspan,
content=cell_content
))
# Collect headers from first row or <th> elements
if row_idx == 0 or cell.name == 'th':
headers.append(cell_content)
return TableData(
rows=len(rows),
cols=max(len(row.find_all(['td', 'th'])) for row in rows) if rows else 0,
cells=cells,
headers=headers if headers else None
)
except Exception as e:
logger.warning(f"Failed to parse HTML table: {e}")
return None # Fallback handled by caller
```
## Risks / Trade-offs
| Risk | Mitigation |
|------|------------|
| BeautifulSoup not installed | Add to requirements.txt; it's already a common dependency |
| Malformed HTML causes parsing errors | Use try/except with fallback to current behavior |
| Performance impact from HTML parsing | Minimal; tables are small; BeautifulSoup is fast |
| Complex rowspan/colspan calculations | Start with simple col tracking; enhance if needed |
## Dependencies
- `beautifulsoup4`: Already commonly available, add to requirements.txt if not present
## Open Questions
- Q: Should we preserve the original HTML in metadata for debugging?
- A: Optional enhancement; not required for initial fix

View File

@@ -0,0 +1,45 @@
# Change: Fix OCR Track Table Data Format to Match Direct Track
## Why
OCR Track produces HTML strings for table content instead of structured `TableData` objects, causing PDF generation to render raw HTML code as plain text. Direct Track correctly produces `TableData` objects with populated `cells` array, resulting in proper table rendering. This inconsistency creates poor user experience when using OCR Track for documents containing tables.
## What Changes
- **Enhance `_extract_table_data` method** in `ocr_to_unified_converter.py` to properly parse HTML tables into structured `TableData` objects with populated `TableCell` arrays
- **Add BeautifulSoup-based HTML table parsing** to robustly extract cell content, row/column spans from OCR-generated HTML tables
- **Ensure format consistency** between OCR Track and Direct Track table output, allowing PDF Generator to handle a single standardized format
## Impact
- Affected specs: `ocr-processing`
- Affected code:
- `backend/app/services/ocr_to_unified_converter.py` (primary changes)
- `backend/app/services/pdf_generator_service.py` (no changes needed - already handles `TableData`)
- `backend/app/services/direct_extraction_engine.py` (no changes - serves as reference implementation)
## Evidence
### Direct Track (Reference - Correct Behavior)
`direct_extraction_engine.py:846-850`:
```python
table_data = TableData(
rows=len(data),
cols=max(len(row) for row in data) if data else 0,
cells=cells, # Properly populated with TableCell objects
headers=data[0] if data else None
)
```
### OCR Track (Current - Problematic)
`ocr_to_unified_converter.py:574-579`:
```python
return TableData(
rows=rows, # Only counts from html.count('<tr')
cols=cols, # Only counts from <td>/<th> in first row
cells=cells, # Always empty list []
caption=extracted_text
)
```
The `cells` array is always empty because the current HTML parsing only counts tags but doesn't extract actual cell content.

View File

@@ -0,0 +1,51 @@
## ADDED Requirements
### Requirement: OCR Track Table Data Structure Consistency
The OCR Track SHALL produce `TableData` objects with fully populated `cells` arrays that match the format produced by Direct Track, ensuring consistent table rendering across both processing tracks.
#### Scenario: OCR Track produces structured TableData for HTML tables
- **GIVEN** a document with tables is processed via OCR Track
- **WHEN** PP-StructureV3 returns HTML table content in the `html` or `content` field
- **THEN** the `ocr_to_unified_converter` SHALL parse the HTML and produce a `TableData` object
- **AND** the `TableData.cells` array SHALL contain `TableCell` objects for each cell
- **AND** each `TableCell` SHALL have correct `row`, `col`, and `content` values
- **AND** the output format SHALL match Direct Track's `TableData` structure
#### Scenario: OCR Track handles tables with merged cells
- **GIVEN** an HTML table with `rowspan` or `colspan` attributes
- **WHEN** the table is converted to `TableData`
- **THEN** each `TableCell` SHALL have correct `row_span` and `col_span` values
- **AND** the cell content SHALL be correctly extracted
#### Scenario: OCR Track handles header rows
- **GIVEN** an HTML table with `<th>` elements or a header row
- **WHEN** the table is converted to `TableData`
- **THEN** the `TableData.headers` field SHALL contain the header cell contents
- **AND** header cells SHALL also be included in the `cells` array
#### Scenario: OCR Track gracefully handles malformed HTML tables
- **GIVEN** an HTML table with malformed markup (missing closing tags, invalid nesting)
- **WHEN** parsing is attempted
- **THEN** the system SHALL attempt best-effort parsing using a tolerant HTML parser
- **AND** if parsing fails completely, SHALL fall back to returning basic `TableData` with row/col counts
- **AND** SHALL log a warning for debugging purposes
#### Scenario: PDF Generator renders OCR Track tables correctly
- **GIVEN** a `UnifiedDocument` from OCR Track containing table elements
- **WHEN** the PDF Generator processes the document
- **THEN** tables SHALL be rendered as formatted tables (not as raw HTML text)
- **AND** the rendering SHALL be identical to Direct Track table rendering
#### Scenario: Direct Track table processing remains unchanged
- **GIVEN** a native PDF with embedded tables
- **WHEN** the document is processed via Direct Track
- **THEN** the `DirectExtractionEngine` SHALL continue to produce `TableData` objects as before
- **AND** the `ocr_to_unified_converter.py` changes SHALL NOT affect Direct Track processing
- **AND** table rendering in PDF output SHALL be identical to pre-fix behavior
#### Scenario: Hybrid Mode table source isolation
- **GIVEN** a document processed via Hybrid Mode (Direct Track primary + OCR Track for images)
- **WHEN** the system merges OCR Track results into Direct Track results
- **THEN** only image elements (FIGURE, IMAGE, LOGO) SHALL be merged from OCR Track
- **AND** table elements SHALL exclusively come from Direct Track
- **AND** no OCR Track table data SHALL contaminate the final output

View File

@@ -0,0 +1,43 @@
# Tasks: Fix OCR Track Table Data Format
## 1. Implementation
- [x] 1.1 Add BeautifulSoup import and dependency check in `ocr_to_unified_converter.py`
- [x] 1.2 Rewrite `_extract_table_data` method to parse HTML using BeautifulSoup
- [x] 1.3 Extract cell content, row index, column index for each `<td>` and `<th>` element
- [x] 1.4 Handle `rowspan` and `colspan` attributes for merged cells
- [x] 1.5 Create `TableCell` objects with proper content and positioning
- [x] 1.6 Populate `TableData.cells` array with extracted `TableCell` objects
- [x] 1.7 Preserve header detection (`<th>` elements) and store in `TableData.headers`
## 2. Edge Case Handling
- [x] 2.1 Handle malformed HTML tables gracefully (missing closing tags, nested tables)
- [x] 2.2 Handle empty cells (create TableCell with empty string content)
- [x] 2.3 Handle tables without `<tr>` structure (fallback to current behavior)
- [x] 2.4 Log warnings for unparseable tables instead of failing silently
## 3. Testing
- [x] 3.1 Create unit tests for `_extract_table_data` with various HTML table formats
- [x] 3.2 Test simple tables (basic rows/columns)
- [x] 3.3 Test tables with merged cells (rowspan/colspan)
- [x] 3.4 Test tables with header rows (`<th>` elements)
- [x] 3.5 Test malformed HTML tables (handled via BeautifulSoup's tolerance)
- [ ] 3.6 Integration test: OCR Track PDF generation with tables
## 4. Verification (Track Isolation)
- [x] 4.1 Compare OCR Track table output format with Direct Track output format
- [ ] 4.2 Verify PDF Generator renders OCR Track tables correctly
- [x] 4.3 **Direct Track regression test**: `direct_extraction_engine.py` NOT modified (confirmed via git status)
- [x] 4.4 **Hybrid Mode regression test**: `ocr_service.py` NOT modified, image merge logic unchanged
- [x] 4.5 **OCR Track fix verification**: Unit tests confirm:
- `TableData.cells` array is populated (6 cells in 3x2 table)
- `TableCell` objects have correct row/col/content values
- Headers extracted correctly
- [x] 4.6 Verify `DirectExtractionEngine` code is NOT modified (isolation check - confirmed)
## 5. Dependencies
- [x] 5.1 Add `beautifulsoup4>=4.12.0` to `requirements.txt`

View File

@@ -69,3 +69,4 @@ pylint>=3.0.0
# ===== Utilities ===== # ===== Utilities =====
python-magic>=0.4.27 # File type detection python-magic>=0.4.27 # File type detection
beautifulsoup4>=4.12.0 # HTML table parsing for OCR track