element found in HTML")
+ return self._fallback_table_data(html, extracted_text)
+
+ cells = []
+ headers = []
+ rows = table.find_all('tr')
+
+ # Track actual column positions accounting for rowspan/colspan
+ # This is a simplified approach - complex spanning may need enhancement
+ for row_idx, row in enumerate(rows):
+ row_cells = row.find_all(['td', 'th'])
+ col_idx = 0
+
+ for cell in row_cells:
+ cell_content = cell.get_text(strip=True)
+ rowspan = int(cell.get('rowspan', 1))
+ colspan = int(cell.get('colspan', 1))
+
+ cells.append(TableCell(
+ row=row_idx,
+ col=col_idx,
+ row_span=rowspan,
+ col_span=colspan,
+ content=cell_content
+ ))
+
+                    # Collect headers from <th> elements or first row
+ if cell.name == 'th' or row_idx == 0:
+ headers.append(cell_content)
+
+ # Advance column index by colspan
+ col_idx += colspan
+
+ # Calculate actual dimensions
+ num_rows = len(rows)
+ num_cols = max(
+ sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
+ for row in rows
+ ) if rows else 0
+
+ logger.debug(
+ f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
+ )
+
+ return TableData(
+ rows=num_rows,
+ cols=num_cols,
+ cells=cells,
+ headers=headers if headers else None,
+ caption=extracted_text if extracted_text else None
+ )
+
+ except ImportError:
+ logger.warning("BeautifulSoup not available, using fallback parsing")
+ return self._fallback_table_data(html, extracted_text)
+
+ except Exception as e:
+ logger.warning(f"Failed to extract table data: {e}")
+ return None
+
+ def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]:
+ """
+ Fallback table parsing when BeautifulSoup is not available.
+ Returns basic TableData with row/col counts only (no cells).
+ """
+ try:
+ rows = html.count(' | 0:
+ first_row_end = html.find('
')
+ if first_row_end > 0:
+ first_row = html[:first_row_end]
+ cols = first_row.count(' 0:
- # Estimate columns from first row
- first_row_end = html.find('')
- if first_row_end > 0:
- first_row = html[:first_row_end]
- cols = first_row.count(' | header.bbox.y2 and
- e.bbox.y1 < next_header_y and
+ if (e.bbox.y0 > header.bbox.y1 and
+ e.bbox.y0 < next_header_y and
e.type not in [ElementType.HEADER, ElementType.TITLE])
]
diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py
index 98382fc..7833e7a 100644
--- a/backend/app/services/pp_structure_enhanced.py
+++ b/backend/app/services/pp_structure_enhanced.py
@@ -167,7 +167,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
- parsing_res_list, current_page, output_dir
+ parsing_res_list, current_page, output_dir, image_path
)
all_elements.extend(elements)
@@ -229,7 +229,8 @@ class PPStructureEnhanced:
self,
parsing_res_list: List[Dict],
current_page: int,
- output_dir: Optional[Path]
+ output_dir: Optional[Path],
+ source_image_path: Optional[Path] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -238,6 +239,7 @@ class PPStructureEnhanced:
parsing_res_list: List of parsed elements from PP-StructureV3
current_page: Current page number
output_dir: Optional output directory
+ source_image_path: Path to source image for cropping image regions
Returns:
List of processed elements with normalized structure
@@ -327,6 +329,17 @@ class PPStructureEnhanced:
element['img_path'] = item['img_path'] # Keep original for reference
else:
logger.warning(f"Failed to save image for element {element['element_id']}")
+ # Crop image from source if no img_path but source image is available
+ elif source_image_path and output_dir and bbox != [0, 0, 0, 0]:
+ cropped_path = self._crop_and_save_image(
+ source_image_path, bbox, output_dir, element['element_id']
+ )
+ if cropped_path:
+ element['saved_path'] = cropped_path
+ element['img_path'] = cropped_path
+ logger.info(f"Cropped and saved image region for {element['element_id']}")
+ else:
+ logger.warning(f"Failed to crop image for element {element['element_id']}")
# Add any additional metadata
if 'metadata' in item:
@@ -535,4 +548,62 @@ class PPStructureEnhanced:
img_obj.save(str(img_path))
logger.info(f"Saved image to {img_path}")
except Exception as e:
- logger.warning(f"Failed to save PIL image: {e}")
\ No newline at end of file
+ logger.warning(f"Failed to save PIL image: {e}")
+
+ def _crop_and_save_image(
+ self,
+ source_image_path: Path,
+ bbox: List[float],
+ output_dir: Path,
+ element_id: str
+ ) -> Optional[str]:
+ """
+ Crop image region from source image and save to output directory.
+
+ Args:
+ source_image_path: Path to the source image
+ bbox: Bounding box [x1, y1, x2, y2]
+ output_dir: Output directory for saving cropped image
+ element_id: Element ID for naming
+
+ Returns:
+ Relative filename (not full path) to saved image, consistent with
+ Direct Track which stores "filename.png" that gets joined with
+ result_dir by pdf_generator_service.
+ """
+ try:
+ from PIL import Image
+
+ # Open source image
+ with Image.open(source_image_path) as img:
+ # Ensure bbox values are integers
+ x1, y1, x2, y2 = [int(v) for v in bbox[:4]]
+
+ # Validate bbox
+ img_width, img_height = img.size
+ x1 = max(0, min(x1, img_width))
+ x2 = max(0, min(x2, img_width))
+ y1 = max(0, min(y1, img_height))
+ y2 = max(0, min(y2, img_height))
+
+ if x2 <= x1 or y2 <= y1:
+ logger.warning(f"Invalid bbox for cropping: {bbox}")
+ return None
+
+ # Crop the region
+ cropped = img.crop((x1, y1, x2, y2))
+
+ # Save directly to output directory (no subdirectory)
+ # Consistent with Direct Track which saves to output_dir directly
+ image_filename = f"{element_id}.png"
+ img_path = output_dir / image_filename
+ cropped.save(str(img_path), "PNG")
+
+ # Return just the filename (relative to result_dir)
+ # PDF generator will join with result_dir to get full path
+ logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)")
+ return image_filename
+
+ except Exception as e:
+ logger.error(f"Failed to crop and save image for {element_id}: {e}")
+ return None
\ No newline at end of file
diff --git a/docs/architecture-overview.md b/docs/architecture-overview.md
new file mode 100644
index 0000000..2f33784
--- /dev/null
+++ b/docs/architecture-overview.md
@@ -0,0 +1,84 @@
+# Tool_OCR 架構說明與 UML
+
+本文件概覽 Tool_OCR 的主要組件、資料流與雙軌處理(OCR / Direct),並附上 UML 關係圖以協助判斷改動的影響範圍。
+
+## 系統分層與重點元件
+- **API 層(FastAPI)**:`app/main.py` 啟動應用、掛載路由(`routers/auth.py`, `routers/tasks.py`, `routers/admin.py`),並在 lifespan 初始化記憶體管理、服務池與併發控制。
+- **任務/檔案管理**:`task_service.py` 與 `file_access_service.py` 掌管任務 CRUD、路徑與權限;`Task` / `TaskFile` 模型紀錄結果檔路徑。
+- **核心處理服務**:`OCRService`(`services/ocr_service.py`)負責雙軌路由與 OCR;整合偵測、直抽、OCR、統一格式轉換、匯出與 PDF 生成。
+- **雙軌偵測/直抽**:`DocumentTypeDetector` 判斷走 Direct 或 OCR;`DirectExtractionEngine` 使用 PyMuPDF 直接抽取文字/表格/圖片(必要時觸發混合模式補抽圖片)。
+- **OCR 解析**:PaddleOCR + `PPStructureEnhanced` 抽取 23 類元素;`OCRToUnifiedConverter` 轉成 `UnifiedDocument` 統一格式。
+- **匯出/呈現**:`UnifiedDocumentExporter` 產出 JSON/Markdown;`pdf_generator_service.py` 產生版面保持 PDF;前端透過 `/api/v2/tasks/{id}/download/*` 取得。
+- **資源控管**:`memory_manager.py`(MemoryGuard、prediction semaphore、模型生命週期),`service_pool.py`(`OCRService` 池)避免多重載模與 GPU 爆滿。
+
+## 處理流程(任務層級)
+1. **上傳**:`POST /api/v2/upload` 建立 Task 並寫檔到 `uploads/`(含 SHA256、檔案資訊)。
+2. **啟動**:`POST /api/v2/tasks/{id}/start`(`ProcessingOptions`,可含 `pp_structure_params`)→ 背景 `process_task_ocr` 取得服務池中的 `OCRService`。
+3. **軌道決策**:`DocumentTypeDetector.detect` 分析 MIME、PDF 文字覆蓋率或 Office 轉 PDF 後的抽樣結果:
+ - **Direct**:`DirectExtractionEngine.extract` 產出 `UnifiedDocument`;若偵測缺圖則啟用混合模式呼叫 OCR 抽圖或渲染 inline 圖。
+ - **OCR**:`process_file_traditional` → PaddleOCR + PP-Structure → `OCRToUnifiedConverter.convert` 產生 `UnifiedDocument`。
+ - 以 `ProcessingTrack` 記錄 `ocr` / `direct` / `hybrid`,處理時間與統計寫入 metadata。
+4. **輸出保存**:`UnifiedDocumentExporter` 寫 `_result.json`(含 metadata、statistics)與 `_output.md`;`pdf_generator_service` 產出 `_layout.pdf`;路徑回寫 DB。
+5. **下載/檢視**:前端透過 `/download/json|markdown|pdf|unified` 取檔;`/metadata` 讀 JSON metadata 回傳統計與 `processing_track`。
+
+## 前端流程摘要
+- `UploadPage`:呼叫 `apiClientV2.uploadFile`,首個 `task_id` 存於 `uploadStore.batchId`。
+- `ProcessingPage`:對 `batchId` 呼叫 `startTask`(預設 `use_dual_track=true`,支援自訂 `pp_structure_params`),輪詢狀態。
+- `ResultsPage` / `TaskDetailPage`:使用 `getTask` 與 `getProcessingMetadata` 顯示 `processing_track`、統計並提供 JSON/Markdown/PDF/Unified 下載。
+- `TaskHistoryPage`:列出任務、支援重新啟動、重試、下載。
+
+## 共同模組與影響點
+- **UnifiedDocument**(`models/unified_document.py`)為 Direct/OCR 共用輸出格式;所有匯出/PDF/前端 track 顯示依賴其欄位與 metadata。
+- **服務池/記憶體守護**:Direct 與 OCR 共用同一 `OCRService` 實例池與 MemoryGuard;新增資源或改動需確保遵循 acquire/release、清理與 semaphore 規則。
+- **偵測閾值變更**:`DocumentTypeDetector` 參數調整會影響 Direct 與 OCR 分流比例,間接改變 GPU 載荷與結果格式。
+- **匯出/PDF**:任何 UnifiedDocument 結構變動會影響 JSON/Markdown/PDF 產出與前端下載/預覽;需同步維護轉換與匯出器。
+
+## UML 關係圖(Mermaid)
+```mermaid
+classDiagram
+ class TasksRouter {
+ +upload_file()
+ +start_task()
+ +download_json/markdown/pdf/unified()
+ +get_metadata()
+ }
+ class TaskService {+create_task(); +update_task_status(); +get_task_by_id()}
+ class FileAccessService
+ class OCRService {
+ +process()
+ +process_with_dual_track()
+ +process_file_traditional()
+ +save_results()
+ }
+ class DocumentTypeDetector {+detect()}
+ class DirectExtractionEngine {+extract(); +check_document_for_missing_images()}
+ class OCRToUnifiedConverter {+convert()}
+ class UnifiedDocument
+ class UnifiedDocumentExporter {+export_to_json(); +export_to_markdown()}
+ class PDFGeneratorService {+generate_layout_pdf(); +generate_from_unified_document()}
+ class ServicePool {+acquire(); +release()}
+ class MemoryManager <>
+ class OfficeConverter {+convert_to_pdf()}
+ class PPStructureEnhanced {+analyze_with_full_structure()}
+
+ TasksRouter --> TaskService
+ TasksRouter --> FileAccessService
+ TasksRouter --> OCRService : background process via process_task_ocr
+ OCRService --> DocumentTypeDetector : track recommendation
+ OCRService --> DirectExtractionEngine : direct track
+ OCRService --> OCRToUnifiedConverter : OCR track result -> UnifiedDocument
+ OCRService --> OfficeConverter : Office -> PDF
+ OCRService --> PPStructureEnhanced : layout analysis (PP-StructureV3)
+ OCRService --> UnifiedDocumentExporter : persist results
+ OCRService --> PDFGeneratorService : layout-preserving PDF
+ OCRService --> ServicePool : acquired instance
+ ServicePool --> MemoryManager : model lifecycle / GPU guard
+ UnifiedDocumentExporter --> UnifiedDocument
+ PDFGeneratorService --> UnifiedDocument
+```
+
+## 影響判斷指引
+- **改 Direct/偵測邏輯**:會改變 `processing_track` 與結果格式;前端顯示與下載 JSON/Markdown/PDF 仍依賴 UnifiedDocument,需驗證匯出與 PDF 生成。
+- **改 OCR/PP-Structure 參數**:僅影響 OCR track;Direct track 不受 `pp_structure_params` 影響(符合 spec),需維持 `processing_track` 填寫。
+- **改 UnifiedDocument 結構/統計**:需同步 `UnifiedDocumentExporter`、`pdf_generator_service`、前端 `getProcessingMetadata`/下載端點。
+- **改資源控管**:服務池或 MemoryGuard 調整會同時影響 Direct/OCR 執行時序與穩定性,須確保 acquire/release 與 semaphore 不被破壞。
diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/design.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/design.md
new file mode 100644
index 0000000..6ff7380
--- /dev/null
+++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/design.md
@@ -0,0 +1,173 @@
+# Design: Fix OCR Track Table Data Format
+
+## Context
+
+The OCR processing pipeline has three modes:
+1. **Direct Track**: Extracts structured data directly from native PDFs using `direct_extraction_engine.py`
+2. **OCR Track**: Uses PP-StructureV3 for layout analysis and OCR, then converts results via `ocr_to_unified_converter.py`
+3. **Hybrid Mode**: Uses Direct Track as primary, supplements with OCR Track for missing images only
+
+Both tracks produce `UnifiedDocument` containing `DocumentElement` objects. For tables, the `content` field should contain a `TableData` object with populated `cells` array. However, OCR Track currently produces `TableData` with empty `cells`, causing PDF generation failures.
+
+## Track Isolation Analysis (Safety Guarantee)
+
+This section documents why the proposed changes will NOT affect Direct Track or Hybrid Mode.
+
+### Code Flow Analysis
+
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│ ocr_service.py │
+├─────────────────────────────────────────────────────────────────────────┤
+│ │
+│ Direct Track ──► DirectExtractionEngine ──► UnifiedDocument │
+│ (direct_extraction_engine.py) (tables: TableData ✓) │
+│ [NOT MODIFIED] │
+│ │
+│ OCR Track ────► PP-StructureV3 ──► OCRToUnifiedConverter ──► UnifiedDoc│
+│ (ocr_to_unified_converter.py) │
+│ [MODIFIED: _extract_table_data] │
+│ │
+│ Hybrid Mode ──► Direct Track (primary) + OCR Track (images only) │
+│ │ │ │
+│ │ └──► _merge_ocr_images_into_ │
+│ │ direct() merges ONLY: │
+│ │ - ElementType.FIGURE │
+│ │ - ElementType.IMAGE │
+│ │ - ElementType.LOGO │
+│ │ [Tables NOT merged] │
+│ └──► Tables come from Direct Track (unchanged) │
+└─────────────────────────────────────────────────────────────────────────┘
+```
+
+### Evidence from ocr_service.py
+
+**Line 1610** (Hybrid mode merge logic):
+```python
+image_types = {ElementType.FIGURE, ElementType.IMAGE, ElementType.LOGO}
+```
+
+**Lines 1634-1635** (Only image types are merged):
+```python
+for element in ocr_page.elements:
+ if element.type in image_types: # Tables excluded
+```
+
+### Impact Matrix
+
+| Mode | Table Source | Uses OCRToUnifiedConverter? | Affected by Change? |
+|------|--------------|----------------------------|---------------------|
+| Direct Track | `DirectExtractionEngine` | No | **No** |
+| OCR Track | `OCRToUnifiedConverter` | Yes | **Yes (Fixed)** |
+| Hybrid Mode | `DirectExtractionEngine` (tables) | Only for images | **No** |
+
+### Conclusion
+
+The fix is **isolated to OCR Track only**:
+- Direct Track: Uses separate engine (`DirectExtractionEngine`), completely unaffected
+- Hybrid Mode: Tables come from Direct Track; OCR Track is only used for image extraction
+- OCR Track: Will benefit from the fix with proper `TableData` output
+
+## Goals / Non-Goals
+
+### Goals
+- OCR Track table output format matches Direct Track format exactly
+- PDF Generator receives consistent `TableData` objects from both tracks
+- Robust HTML table parsing that handles real-world OCR output
+
+### Non-Goals
+- Modifying Direct Track behavior (it's the reference implementation)
+- Changing the `TableData` or `TableCell` data models
+- Modifying PDF Generator to handle HTML strings as a workaround
+
+## Decisions
+
+### Decision 1: Use BeautifulSoup for HTML Parsing
+
+**Rationale**: The current regex/string-counting approach is fragile and cannot extract cell content. BeautifulSoup provides:
+- Robust handling of malformed HTML (common in OCR output)
+- Easy extraction of cell content, attributes (rowspan, colspan)
+- Well-tested library already used in many Python projects
+
+**Alternatives considered**:
+- Manual regex parsing: Too fragile for complex tables
+- lxml: More complex API, overkill for this use case
+- html.parser (stdlib): Less tolerant of malformed HTML
+
+### Decision 2: Maintain Backward Compatibility
+
+**Rationale**: If BeautifulSoup parsing fails, fall back to current behavior (return `TableData` with basic row/col counts). This ensures existing functionality isn't broken.
+
+### Decision 3: Single Point of Change
+
+**Rationale**: Only modify `ocr_to_unified_converter.py`. This:
+- Minimizes regression risk
+- Keeps Direct Track untouched as reference
+- Requires no changes to downstream PDF Generator
+
+## Implementation Approach
+
+```python
+def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
+ """Extract table data from element using BeautifulSoup."""
+ try:
+ html = elem_data.get('html', '') or elem_data.get('content', '')
+        if not html or '<table' not in html:
+            return None
+
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html, 'html.parser')
+        table = soup.find('table')
+        if not table:
+            return None
+
+        cells = []
+        headers = []
+        rows = table.find_all('tr')
+
+        for row_idx, row in enumerate(rows):
+            for col_idx, cell in enumerate(row.find_all(['td', 'th'])):
+                cell_content = cell.get_text(strip=True)
+                cells.append(TableCell(
+                    row=row_idx,
+                    col=col_idx,
+                    row_span=int(cell.get('rowspan', 1)),
+                    col_span=int(cell.get('colspan', 1)),
+                    content=cell_content
+                ))
+                # Collect headers from <th> elements
+                if row_idx == 0 or cell.name == 'th':
+                    headers.append(cell_content)
+
+ return TableData(
+ rows=len(rows),
+ cols=max(len(row.find_all(['td', 'th'])) for row in rows) if rows else 0,
+ cells=cells,
+ headers=headers if headers else None
+ )
+ except Exception as e:
+ logger.warning(f"Failed to parse HTML table: {e}")
+ return None # Fallback handled by caller
+```
+
+## Risks / Trade-offs
+
+| Risk | Mitigation |
+|------|------------|
+| BeautifulSoup not installed | Add to requirements.txt; it's already a common dependency |
+| Malformed HTML causes parsing errors | Use try/except with fallback to current behavior |
+| Performance impact from HTML parsing | Minimal; tables are small; BeautifulSoup is fast |
+| Complex rowspan/colspan calculations | Start with simple col tracking; enhance if needed |
+
+## Dependencies
+
+- `beautifulsoup4`: Already commonly available, add to requirements.txt if not present
+
+## Open Questions
+
+- Q: Should we preserve the original HTML in metadata for debugging?
+ - A: Optional enhancement; not required for initial fix
diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/proposal.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/proposal.md
new file mode 100644
index 0000000..6e89cfa
--- /dev/null
+++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/proposal.md
@@ -0,0 +1,45 @@
+# Change: Fix OCR Track Table Data Format to Match Direct Track
+
+## Why
+
+OCR Track produces HTML strings for table content instead of structured `TableData` objects, causing PDF generation to render raw HTML code as plain text. Direct Track correctly produces `TableData` objects with populated `cells` array, resulting in proper table rendering. This inconsistency creates poor user experience when using OCR Track for documents containing tables.
+
+## What Changes
+
+- **Enhance `_extract_table_data` method** in `ocr_to_unified_converter.py` to properly parse HTML tables into structured `TableData` objects with populated `TableCell` arrays
+- **Add BeautifulSoup-based HTML table parsing** to robustly extract cell content, row/column spans from OCR-generated HTML tables
+- **Ensure format consistency** between OCR Track and Direct Track table output, allowing PDF Generator to handle a single standardized format
+
+## Impact
+
+- Affected specs: `ocr-processing`
+- Affected code:
+ - `backend/app/services/ocr_to_unified_converter.py` (primary changes)
+ - `backend/app/services/pdf_generator_service.py` (no changes needed - already handles `TableData`)
+ - `backend/app/services/direct_extraction_engine.py` (no changes - serves as reference implementation)
+
+## Evidence
+
+### Direct Track (Reference - Correct Behavior)
+`direct_extraction_engine.py:846-850`:
+```python
+table_data = TableData(
+ rows=len(data),
+ cols=max(len(row) for row in data) if data else 0,
+ cells=cells, # Properly populated with TableCell objects
+ headers=data[0] if data else None
+)
+```
+
+### OCR Track (Current - Problematic)
+`ocr_to_unified_converter.py:574-579`:
+```python
+return TableData(
+    rows=rows,    # Only counts from html.count('<tr')
+    cols=cols,    # Counted from <td>/<th> in first row
+ cells=cells, # Always empty list []
+ caption=extracted_text
+)
+```
+
+The `cells` array is always empty because the current HTML parsing only counts `<tr>`/`<td>` tags but doesn't extract actual cell content.
diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/specs/ocr-processing/spec.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/specs/ocr-processing/spec.md
new file mode 100644
index 0000000..ebc701c
--- /dev/null
+++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/specs/ocr-processing/spec.md
@@ -0,0 +1,51 @@
+## ADDED Requirements
+
+### Requirement: OCR Track Table Data Structure Consistency
+The OCR Track SHALL produce `TableData` objects with fully populated `cells` arrays that match the format produced by Direct Track, ensuring consistent table rendering across both processing tracks.
+
+#### Scenario: OCR Track produces structured TableData for HTML tables
+- **GIVEN** a document with tables is processed via OCR Track
+- **WHEN** PP-StructureV3 returns HTML table content in the `html` or `content` field
+- **THEN** the `ocr_to_unified_converter` SHALL parse the HTML and produce a `TableData` object
+- **AND** the `TableData.cells` array SHALL contain `TableCell` objects for each cell
+- **AND** each `TableCell` SHALL have correct `row`, `col`, and `content` values
+- **AND** the output format SHALL match Direct Track's `TableData` structure
+
+#### Scenario: OCR Track handles tables with merged cells
+- **GIVEN** an HTML table with `rowspan` or `colspan` attributes
+- **WHEN** the table is converted to `TableData`
+- **THEN** each `TableCell` SHALL have correct `row_span` and `col_span` values
+- **AND** the cell content SHALL be correctly extracted
+
+#### Scenario: OCR Track handles header rows
+- **GIVEN** an HTML table with `<th>` elements or a header row
+- **WHEN** the table is converted to `TableData`
+- **THEN** the `TableData.headers` field SHALL contain the header cell contents
+- **AND** header cells SHALL also be included in the `cells` array
+
+#### Scenario: OCR Track gracefully handles malformed HTML tables
+- **GIVEN** an HTML table with malformed markup (missing closing tags, invalid nesting)
+- **WHEN** parsing is attempted
+- **THEN** the system SHALL attempt best-effort parsing using a tolerant HTML parser
+- **AND** if parsing fails completely, SHALL fall back to returning basic `TableData` with row/col counts
+- **AND** SHALL log a warning for debugging purposes
+
+#### Scenario: PDF Generator renders OCR Track tables correctly
+- **GIVEN** a `UnifiedDocument` from OCR Track containing table elements
+- **WHEN** the PDF Generator processes the document
+- **THEN** tables SHALL be rendered as formatted tables (not as raw HTML text)
+- **AND** the rendering SHALL be identical to Direct Track table rendering
+
+#### Scenario: Direct Track table processing remains unchanged
+- **GIVEN** a native PDF with embedded tables
+- **WHEN** the document is processed via Direct Track
+- **THEN** the `DirectExtractionEngine` SHALL continue to produce `TableData` objects as before
+- **AND** the `ocr_to_unified_converter.py` changes SHALL NOT affect Direct Track processing
+- **AND** table rendering in PDF output SHALL be identical to pre-fix behavior
+
+#### Scenario: Hybrid Mode table source isolation
+- **GIVEN** a document processed via Hybrid Mode (Direct Track primary + OCR Track for images)
+- **WHEN** the system merges OCR Track results into Direct Track results
+- **THEN** only image elements (FIGURE, IMAGE, LOGO) SHALL be merged from OCR Track
+- **AND** table elements SHALL exclusively come from Direct Track
+- **AND** no OCR Track table data SHALL contaminate the final output
diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/tasks.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/tasks.md
new file mode 100644
index 0000000..df303f8
--- /dev/null
+++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/tasks.md
@@ -0,0 +1,43 @@
+# Tasks: Fix OCR Track Table Data Format
+
+## 1. Implementation
+
+- [x] 1.1 Add BeautifulSoup import and dependency check in `ocr_to_unified_converter.py`
+- [x] 1.2 Rewrite `_extract_table_data` method to parse HTML using BeautifulSoup
+- [x] 1.3 Extract cell content, row index, column index for each `<td>` and `<th>` element
+- [x] 1.4 Handle `rowspan` and `colspan` attributes for merged cells
+- [x] 1.5 Create `TableCell` objects with proper content and positioning
+- [x] 1.6 Populate `TableData.cells` array with extracted `TableCell` objects
+- [x] 1.7 Preserve header detection (`<th>` elements) and store in `TableData.headers`
+
+## 2. Edge Case Handling
+
+- [x] 2.1 Handle malformed HTML tables gracefully (missing closing tags, nested tables)
+- [x] 2.2 Handle empty cells (create TableCell with empty string content)
+- [x] 2.3 Handle tables without `<table>` structure (fallback to current behavior)
+- [x] 2.4 Log warnings for unparseable tables instead of failing silently
+
+## 3. Testing
+
+- [x] 3.1 Create unit tests for `_extract_table_data` with various HTML table formats
+- [x] 3.2 Test simple tables (basic rows/columns)
+- [x] 3.3 Test tables with merged cells (rowspan/colspan)
+- [x] 3.4 Test tables with header rows (`<th>` elements)
+- [x] 3.5 Test malformed HTML tables (handled via BeautifulSoup's tolerance)
+- [ ] 3.6 Integration test: OCR Track PDF generation with tables
+
+## 4. Verification (Track Isolation)
+
+- [x] 4.1 Compare OCR Track table output format with Direct Track output format
+- [ ] 4.2 Verify PDF Generator renders OCR Track tables correctly
+- [x] 4.3 **Direct Track regression test**: `direct_extraction_engine.py` NOT modified (confirmed via git status)
+- [x] 4.4 **Hybrid Mode regression test**: `ocr_service.py` NOT modified, image merge logic unchanged
+- [x] 4.5 **OCR Track fix verification**: Unit tests confirm:
+ - `TableData.cells` array is populated (6 cells in 3x2 table)
+ - `TableCell` objects have correct row/col/content values
+ - Headers extracted correctly
+- [x] 4.6 Verify `DirectExtractionEngine` code is NOT modified (isolation check - confirmed)
+
+## 5. Dependencies
+
+- [x] 5.1 Add `beautifulsoup4>=4.12.0` to `requirements.txt`
diff --git a/requirements.txt b/requirements.txt
index c018589..25eec3c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -69,3 +69,4 @@ pylint>=3.0.0
# ===== Utilities =====
python-magic>=0.4.27 # File type detection
+beautifulsoup4>=4.12.0 # HTML table parsing for OCR track
| |