diff --git a/backend/app/services/ocr_to_unified_converter.py b/backend/app/services/ocr_to_unified_converter.py index 721eed4..8971bea 100644 --- a/backend/app/services/ocr_to_unified_converter.py +++ b/backend/app/services/ocr_to_unified_converter.py @@ -350,7 +350,19 @@ class OCRToUnifiedConverter: element_type = elem_data.get('type', ElementType.TEXT) if isinstance(element_type, str): # Convert string to ElementType if needed - element_type = ElementType[element_type] if element_type in ElementType.__members__ else ElementType.TEXT + # ElementType is a str-based enum, so we can construct from value (lowercase) + try: + element_type = ElementType(element_type) + except ValueError: + # If value doesn't match, try member name (uppercase) + element_type = ElementType[element_type.upper()] if element_type.upper() in ElementType.__members__ else ElementType.TEXT + + # Content-based reclassification: detect HTML tables in text content + content_str = elem_data.get('content', '') + if isinstance(content_str, str) and ' Optional[TableData]: - """Extract table data from element.""" + """ + Extract table data from element using BeautifulSoup for robust HTML parsing. + + This method produces TableData objects with fully populated cells arrays, + matching the format produced by DirectExtractionEngine for consistency. 
+ """ try: html = elem_data.get('html', '') extracted_text = elem_data.get('extracted_text', '') @@ -550,31 +567,101 @@ class OCRToUnifiedConverter: html = content logger.debug("Using content field as HTML table source") - # Try to parse HTML to get rows and columns - rows = 0 + # Return None if no HTML table content + if not html or ' element found in HTML") + return self._fallback_table_data(html, extracted_text) + + cells = [] + headers = [] + rows = table.find_all('tr') + + # Track actual column positions accounting for rowspan/colspan + # This is a simplified approach - complex spanning may need enhancement + for row_idx, row in enumerate(rows): + row_cells = row.find_all(['td', 'th']) + col_idx = 0 + + for cell in row_cells: + cell_content = cell.get_text(strip=True) + rowspan = int(cell.get('rowspan', 1)) + colspan = int(cell.get('colspan', 1)) + + cells.append(TableCell( + row=row_idx, + col=col_idx, + row_span=rowspan, + col_span=colspan, + content=cell_content + )) + + # Collect headers from elements or first row + if cell.name == 'th' or row_idx == 0: + headers.append(cell_content) + + # Advance column index by colspan + col_idx += colspan + + # Calculate actual dimensions + num_rows = len(rows) + num_cols = max( + sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th'])) + for row in rows + ) if rows else 0 + + logger.debug( + f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells" + ) + + return TableData( + rows=num_rows, + cols=num_cols, + cells=cells, + headers=headers if headers else None, + caption=extracted_text if extracted_text else None + ) + + except ImportError: + logger.warning("BeautifulSoup not available, using fallback parsing") + return self._fallback_table_data(html, extracted_text) + + except Exception as e: + logger.warning(f"Failed to extract table data: {e}") + return None + + def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]: + """ + Fallback table parsing 
when BeautifulSoup is not available. + Returns basic TableData with row/col counts only (no cells). + """ + try: + rows = html.count(' 0: + first_row_end = html.find('') + if first_row_end > 0: + first_row = html[:first_row_end] + cols = first_row.count(' 0: - # Estimate columns from first row - first_row_end = html.find('') - if first_row_end > 0: - first_row = html[:first_row_end] - cols = first_row.count(' header.bbox.y2 and - e.bbox.y1 < next_header_y and + if (e.bbox.y0 > header.bbox.y1 and + e.bbox.y0 < next_header_y and e.type not in [ElementType.HEADER, ElementType.TITLE]) ] diff --git a/backend/app/services/pp_structure_enhanced.py b/backend/app/services/pp_structure_enhanced.py index 98382fc..7833e7a 100644 --- a/backend/app/services/pp_structure_enhanced.py +++ b/backend/app/services/pp_structure_enhanced.py @@ -167,7 +167,7 @@ class PPStructureEnhanced: # Process parsing_res_list if found if parsing_res_list: elements = self._process_parsing_res_list( - parsing_res_list, current_page, output_dir + parsing_res_list, current_page, output_dir, image_path ) all_elements.extend(elements) @@ -229,7 +229,8 @@ class PPStructureEnhanced: self, parsing_res_list: List[Dict], current_page: int, - output_dir: Optional[Path] + output_dir: Optional[Path], + source_image_path: Optional[Path] = None ) -> List[Dict[str, Any]]: """ Process parsing_res_list to extract all elements. 
@@ -238,6 +239,7 @@ class PPStructureEnhanced: parsing_res_list: List of parsed elements from PP-StructureV3 current_page: Current page number output_dir: Optional output directory + source_image_path: Path to source image for cropping image regions Returns: List of processed elements with normalized structure @@ -327,6 +329,17 @@ class PPStructureEnhanced: element['img_path'] = item['img_path'] # Keep original for reference else: logger.warning(f"Failed to save image for element {element['element_id']}") + # Crop image from source if no img_path but source image is available + elif source_image_path and output_dir and bbox != [0, 0, 0, 0]: + cropped_path = self._crop_and_save_image( + source_image_path, bbox, output_dir, element['element_id'] + ) + if cropped_path: + element['saved_path'] = cropped_path + element['img_path'] = cropped_path + logger.info(f"Cropped and saved image region for {element['element_id']}") + else: + logger.warning(f"Failed to crop image for element {element['element_id']}") # Add any additional metadata if 'metadata' in item: @@ -535,4 +548,62 @@ class PPStructureEnhanced: img_obj.save(str(img_path)) logger.info(f"Saved image to {img_path}") except Exception as e: - logger.warning(f"Failed to save PIL image: {e}") \ No newline at end of file + logger.warning(f"Failed to save PIL image: {e}") + + def _crop_and_save_image( + self, + source_image_path: Path, + bbox: List[float], + output_dir: Path, + element_id: str + ) -> Optional[str]: + """ + Crop image region from source image and save to output directory. + + Args: + source_image_path: Path to the source image + bbox: Bounding box [x1, y1, x2, y2] + output_dir: Output directory for saving cropped image + element_id: Element ID for naming + + Returns: + Relative filename (not full path) to saved image, consistent with + Direct Track which stores "filename.png" that gets joined with + result_dir by pdf_generator_service. 
+ """ + try: + from PIL import Image + + # Open source image + with Image.open(source_image_path) as img: + # Ensure bbox values are integers + x1, y1, x2, y2 = [int(v) for v in bbox[:4]] + + # Validate bbox + img_width, img_height = img.size + x1 = max(0, min(x1, img_width)) + x2 = max(0, min(x2, img_width)) + y1 = max(0, min(y1, img_height)) + y2 = max(0, min(y2, img_height)) + + if x2 <= x1 or y2 <= y1: + logger.warning(f"Invalid bbox for cropping: {bbox}") + return None + + # Crop the region + cropped = img.crop((x1, y1, x2, y2)) + + # Save directly to output directory (no subdirectory) + # Consistent with Direct Track which saves to output_dir directly + image_filename = f"{element_id}.png" + img_path = output_dir / image_filename + cropped.save(str(img_path), "PNG") + + # Return just the filename (relative to result_dir) + # PDF generator will join with result_dir to get full path + logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)") + return image_filename + + except Exception as e: + logger.error(f"Failed to crop and save image for {element_id}: {e}") + return None \ No newline at end of file diff --git a/docs/architecture-overview.md b/docs/architecture-overview.md new file mode 100644 index 0000000..2f33784 --- /dev/null +++ b/docs/architecture-overview.md @@ -0,0 +1,84 @@ +# Tool_OCR 架構說明與 UML + +本文件概覽 Tool_OCR 的主要組件、資料流與雙軌處理(OCR / Direct),並附上 UML 關係圖以協助判斷改動的影響範圍。 + +## 系統分層與重點元件 +- **API 層(FastAPI)**:`app/main.py` 啟動應用、掛載路由(`routers/auth.py`, `routers/tasks.py`, `routers/admin.py`),並在 lifespan 初始化記憶體管理、服務池與併發控制。 +- **任務/檔案管理**:`task_service.py` 與 `file_access_service.py` 掌管任務 CRUD、路徑與權限;`Task` / `TaskFile` 模型紀錄結果檔路徑。 +- **核心處理服務**:`OCRService`(`services/ocr_service.py`)負責雙軌路由與 OCR;整合偵測、直抽、OCR、統一格式轉換、匯出與 PDF 生成。 +- **雙軌偵測/直抽**:`DocumentTypeDetector` 判斷走 Direct 或 OCR;`DirectExtractionEngine` 使用 PyMuPDF 直接抽取文字/表格/圖片(必要時觸發混合模式補抽圖片)。 +- **OCR 解析**:PaddleOCR + `PPStructureEnhanced` 抽取 23 類元素;`OCRToUnifiedConverter` 轉成 `UnifiedDocument` 
統一格式。 +- **匯出/呈現**:`UnifiedDocumentExporter` 產出 JSON/Markdown;`pdf_generator_service.py` 產生版面保持 PDF;前端透過 `/api/v2/tasks/{id}/download/*` 取得。 +- **資源控管**:`memory_manager.py`(MemoryGuard、prediction semaphore、模型生命週期),`service_pool.py`(`OCRService` 池)避免多重載模與 GPU 爆滿。 + +## 處理流程(任務層級) +1. **上傳**:`POST /api/v2/upload` 建立 Task 並寫檔到 `uploads/`(含 SHA256、檔案資訊)。 +2. **啟動**:`POST /api/v2/tasks/{id}/start`(`ProcessingOptions`,可含 `pp_structure_params`)→ 背景 `process_task_ocr` 取得服務池中的 `OCRService`。 +3. **軌道決策**:`DocumentTypeDetector.detect` 分析 MIME、PDF 文字覆蓋率或 Office 轉 PDF 後的抽樣結果: + - **Direct**:`DirectExtractionEngine.extract` 產出 `UnifiedDocument`;若偵測缺圖則啟用混合模式呼叫 OCR 抽圖或渲染 inline 圖。 + - **OCR**:`process_file_traditional` → PaddleOCR + PP-Structure → `OCRToUnifiedConverter.convert` 產生 `UnifiedDocument`。 + - 以 `ProcessingTrack` 記錄 `ocr` / `direct` / `hybrid`,處理時間與統計寫入 metadata。 +4. **輸出保存**:`UnifiedDocumentExporter` 寫 `_result.json`(含 metadata、statistics)與 `_output.md`;`pdf_generator_service` 產出 `_layout.pdf`;路徑回寫 DB。 +5. 
**下載/檢視**:前端透過 `/download/json|markdown|pdf|unified` 取檔;`/metadata` 讀 JSON metadata 回傳統計與 `processing_track`。 + +## 前端流程摘要 +- `UploadPage`:呼叫 `apiClientV2.uploadFile`,首個 `task_id` 存於 `uploadStore.batchId`。 +- `ProcessingPage`:對 `batchId` 呼叫 `startTask`(預設 `use_dual_track=true`,支援自訂 `pp_structure_params`),輪詢狀態。 +- `ResultsPage` / `TaskDetailPage`:使用 `getTask` 與 `getProcessingMetadata` 顯示 `processing_track`、統計並提供 JSON/Markdown/PDF/Unified 下載。 +- `TaskHistoryPage`:列出任務、支援重新啟動、重試、下載。 + +## 共同模組與影響點 +- **UnifiedDocument**(`models/unified_document.py`)為 Direct/OCR 共用輸出格式;所有匯出/PDF/前端 track 顯示依賴其欄位與 metadata。 +- **服務池/記憶體守護**:Direct 與 OCR 共用同一 `OCRService` 實例池與 MemoryGuard;新增資源或改動需確保遵循 acquire/release、清理與 semaphore 規則。 +- **偵測閾值變更**:`DocumentTypeDetector` 參數調整會影響 Direct 與 OCR 分流比例,間接改變 GPU 載荷與結果格式。 +- **匯出/PDF**:任何 UnifiedDocument 結構變動會影響 JSON/Markdown/PDF 產出與前端下載/預覽;需同步維護轉換與匯出器。 + +## UML 關係圖(Mermaid) +```mermaid +classDiagram + class TasksRouter { + +upload_file() + +start_task() + +download_json/markdown/pdf/unified() + +get_metadata() + } + class TaskService {+create_task(); +update_task_status(); +get_task_by_id()} + class FileAccessService + class OCRService { + +process() + +process_with_dual_track() + +process_file_traditional() + +save_results() + } + class DocumentTypeDetector {+detect()} + class DirectExtractionEngine {+extract(); +check_document_for_missing_images()} + class OCRToUnifiedConverter {+convert()} + class UnifiedDocument + class UnifiedDocumentExporter {+export_to_json(); +export_to_markdown()} + class PDFGeneratorService {+generate_layout_pdf(); +generate_from_unified_document()} + class ServicePool {+acquire(); +release()} + class MemoryManager <> + class OfficeConverter {+convert_to_pdf()} + class PPStructureEnhanced {+analyze_with_full_structure()} + + TasksRouter --> TaskService + TasksRouter --> FileAccessService + TasksRouter --> OCRService : background process via process_task_ocr + OCRService --> DocumentTypeDetector : track recommendation 
+ OCRService --> DirectExtractionEngine : direct track + OCRService --> OCRToUnifiedConverter : OCR track result -> UnifiedDocument + OCRService --> OfficeConverter : Office -> PDF + OCRService --> PPStructureEnhanced : layout analysis (PP-StructureV3) + OCRService --> UnifiedDocumentExporter : persist results + OCRService --> PDFGeneratorService : layout-preserving PDF + OCRService --> ServicePool : acquired instance + ServicePool --> MemoryManager : model lifecycle / GPU guard + UnifiedDocumentExporter --> UnifiedDocument + PDFGeneratorService --> UnifiedDocument +``` + +## 影響判斷指引 +- **改 Direct/偵測邏輯**:會改變 `processing_track` 與結果格式;前端顯示與下載 JSON/Markdown/PDF 仍依賴 UnifiedDocument,需驗證匯出與 PDF 生成。 +- **改 OCR/PP-Structure 參數**:僅影響 OCR track;Direct track 不受 `pp_structure_params` 影響(符合 spec),需維持 `processing_track` 填寫。 +- **改 UnifiedDocument 結構/統計**:需同步 `UnifiedDocumentExporter`、`pdf_generator_service`、前端 `getProcessingMetadata`/下載端點。 +- **改資源控管**:服務池或 MemoryGuard 調整會同時影響 Direct/OCR 執行時序與穩定性,須確保 acquire/release 與 semaphore 不被破壞。 diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/design.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/design.md new file mode 100644 index 0000000..6ff7380 --- /dev/null +++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/design.md @@ -0,0 +1,173 @@ +# Design: Fix OCR Track Table Data Format + +## Context + +The OCR processing pipeline has three modes: +1. **Direct Track**: Extracts structured data directly from native PDFs using `direct_extraction_engine.py` +2. **OCR Track**: Uses PP-StructureV3 for layout analysis and OCR, then converts results via `ocr_to_unified_converter.py` +3. **Hybrid Mode**: Uses Direct Track as primary, supplements with OCR Track for missing images only + +Both tracks produce `UnifiedDocument` containing `DocumentElement` objects. For tables, the `content` field should contain a `TableData` object with populated `cells` array. 
However, OCR Track currently produces `TableData` with empty `cells`, causing PDF generation failures. + +## Track Isolation Analysis (Safety Guarantee) + +This section documents why the proposed changes will NOT affect Direct Track or Hybrid Mode. + +### Code Flow Analysis + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ ocr_service.py │ +├─────────────────────────────────────────────────────────────────────────┤ +│ │ +│ Direct Track ──► DirectExtractionEngine ──► UnifiedDocument │ +│ (direct_extraction_engine.py) (tables: TableData ✓) │ +│ [NOT MODIFIED] │ +│ │ +│ OCR Track ────► PP-StructureV3 ──► OCRToUnifiedConverter ──► UnifiedDoc│ +│ (ocr_to_unified_converter.py) │ +│ [MODIFIED: _extract_table_data] │ +│ │ +│ Hybrid Mode ──► Direct Track (primary) + OCR Track (images only) │ +│ │ │ │ +│ │ └──► _merge_ocr_images_into_ │ +│ │ direct() merges ONLY: │ +│ │ - ElementType.FIGURE │ +│ │ - ElementType.IMAGE │ +│ │ - ElementType.LOGO │ +│ │ [Tables NOT merged] │ +│ └──► Tables come from Direct Track (unchanged) │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### Evidence from ocr_service.py + +**Line 1610** (Hybrid mode merge logic): +```python +image_types = {ElementType.FIGURE, ElementType.IMAGE, ElementType.LOGO} +``` + +**Lines 1634-1635** (Only image types are merged): +```python +for element in ocr_page.elements: + if element.type in image_types: # Tables excluded +``` + +### Impact Matrix + +| Mode | Table Source | Uses OCRToUnifiedConverter? | Affected by Change? 
| +|------|--------------|----------------------------|---------------------| +| Direct Track | `DirectExtractionEngine` | No | **No** | +| OCR Track | `OCRToUnifiedConverter` | Yes | **Yes (Fixed)** | +| Hybrid Mode | `DirectExtractionEngine` (tables) | Only for images | **No** | + +### Conclusion + +The fix is **isolated to OCR Track only**: +- Direct Track: Uses separate engine (`DirectExtractionEngine`), completely unaffected +- Hybrid Mode: Tables come from Direct Track; OCR Track is only used for image extraction +- OCR Track: Will benefit from the fix with proper `TableData` output + +## Goals / Non-Goals + +### Goals +- OCR Track table output format matches Direct Track format exactly +- PDF Generator receives consistent `TableData` objects from both tracks +- Robust HTML table parsing that handles real-world OCR output + +### Non-Goals +- Modifying Direct Track behavior (it's the reference implementation) +- Changing the `TableData` or `TableCell` data models +- Modifying PDF Generator to handle HTML strings as a workaround + +## Decisions + +### Decision 1: Use BeautifulSoup for HTML Parsing + +**Rationale**: The current regex/string-counting approach is fragile and cannot extract cell content. BeautifulSoup provides: +- Robust handling of malformed HTML (common in OCR output) +- Easy extraction of cell content, attributes (rowspan, colspan) +- Well-tested library already used in many Python projects + +**Alternatives considered**: +- Manual regex parsing: Too fragile for complex tables +- lxml: More complex API, overkill for this use case +- html.parser (stdlib): Less tolerant of malformed HTML + +### Decision 2: Maintain Backward Compatibility + +**Rationale**: If BeautifulSoup parsing fails, fall back to current behavior (return `TableData` with basic row/col counts). This ensures existing functionality isn't broken. + +### Decision 3: Single Point of Change + +**Rationale**: Only modify `ocr_to_unified_converter.py`. 
This: +- Minimizes regression risk +- Keeps Direct Track untouched as reference +- Requires no changes to downstream PDF Generator + +## Implementation Approach + +```python +def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]: + """Extract table data from element using BeautifulSoup.""" + try: + html = elem_data.get('html', '') or elem_data.get('content', '') + if not html or ' elements + if row_idx == 0 or cell.name == 'th': + headers.append(cell_content) + + return TableData( + rows=len(rows), + cols=max(len(row.find_all(['td', 'th'])) for row in rows) if rows else 0, + cells=cells, + headers=headers if headers else None + ) + except Exception as e: + logger.warning(f"Failed to parse HTML table: {e}") + return None # Fallback handled by caller +``` + +## Risks / Trade-offs + +| Risk | Mitigation | +|------|------------| +| BeautifulSoup not installed | Add to requirements.txt; it's already a common dependency | +| Malformed HTML causes parsing errors | Use try/except with fallback to current behavior | +| Performance impact from HTML parsing | Minimal; tables are small; BeautifulSoup is fast | +| Complex rowspan/colspan calculations | Start with simple col tracking; enhance if needed | + +## Dependencies + +- `beautifulsoup4`: Already commonly available, add to requirements.txt if not present + +## Open Questions + +- Q: Should we preserve the original HTML in metadata for debugging? 
+ - A: Optional enhancement; not required for initial fix diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/proposal.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/proposal.md new file mode 100644 index 0000000..6e89cfa --- /dev/null +++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/proposal.md @@ -0,0 +1,45 @@ +# Change: Fix OCR Track Table Data Format to Match Direct Track + +## Why + +OCR Track produces HTML strings for table content instead of structured `TableData` objects, causing PDF generation to render raw HTML code as plain text. Direct Track correctly produces `TableData` objects with populated `cells` array, resulting in proper table rendering. This inconsistency creates poor user experience when using OCR Track for documents containing tables. + +## What Changes + +- **Enhance `_extract_table_data` method** in `ocr_to_unified_converter.py` to properly parse HTML tables into structured `TableData` objects with populated `TableCell` arrays +- **Add BeautifulSoup-based HTML table parsing** to robustly extract cell content, row/column spans from OCR-generated HTML tables +- **Ensure format consistency** between OCR Track and Direct Track table output, allowing PDF Generator to handle a single standardized format + +## Impact + +- Affected specs: `ocr-processing` +- Affected code: + - `backend/app/services/ocr_to_unified_converter.py` (primary changes) + - `backend/app/services/pdf_generator_service.py` (no changes needed - already handles `TableData`) + - `backend/app/services/direct_extraction_engine.py` (no changes - serves as reference implementation) + +## Evidence + +### Direct Track (Reference - Correct Behavior) +`direct_extraction_engine.py:846-850`: +```python +table_data = TableData( + rows=len(data), + cols=max(len(row) for row in data) if data else 0, + cells=cells, # Properly populated with TableCell objects + headers=data[0] if data else None +) +``` + +### OCR 
Track (Current - Problematic) +`ocr_to_unified_converter.py:574-579`: +```python +return TableData( + rows=rows, # Only counts from html.count('<tr') + cols=cols, # Only counts <td>/<th> in first row + cells=cells, # Always empty list [] + caption=extracted_text +) +``` + +The `cells` array is always empty because the current HTML parsing only counts tags but doesn't extract actual cell content. diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/specs/ocr-processing/spec.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/specs/ocr-processing/spec.md new file mode 100644 index 0000000..ebc701c --- /dev/null +++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/specs/ocr-processing/spec.md @@ -0,0 +1,51 @@ +## ADDED Requirements + +### Requirement: OCR Track Table Data Structure Consistency +The OCR Track SHALL produce `TableData` objects with fully populated `cells` arrays that match the format produced by Direct Track, ensuring consistent table rendering across both processing tracks. 
+ +#### Scenario: OCR Track produces structured TableData for HTML tables +- **GIVEN** a document with tables is processed via OCR Track +- **WHEN** PP-StructureV3 returns HTML table content in the `html` or `content` field +- **THEN** the `ocr_to_unified_converter` SHALL parse the HTML and produce a `TableData` object +- **AND** the `TableData.cells` array SHALL contain `TableCell` objects for each cell +- **AND** each `TableCell` SHALL have correct `row`, `col`, and `content` values +- **AND** the output format SHALL match Direct Track's `TableData` structure + +#### Scenario: OCR Track handles tables with merged cells +- **GIVEN** an HTML table with `rowspan` or `colspan` attributes +- **WHEN** the table is converted to `TableData` +- **THEN** each `TableCell` SHALL have correct `row_span` and `col_span` values +- **AND** the cell content SHALL be correctly extracted + +#### Scenario: OCR Track handles header rows +- **GIVEN** an HTML table with `<th>` elements or a header row +- **WHEN** the table is converted to `TableData` +- **THEN** the `TableData.headers` field SHALL contain the header cell contents +- **AND** header cells SHALL also be included in the `cells` array + +#### Scenario: OCR Track gracefully handles malformed HTML tables +- **GIVEN** an HTML table with malformed markup (missing closing tags, invalid nesting) +- **WHEN** parsing is attempted +- **THEN** the system SHALL attempt best-effort parsing using a tolerant HTML parser +- **AND** if parsing fails completely, SHALL fall back to returning basic `TableData` with row/col counts +- **AND** SHALL log a warning for debugging purposes + +#### Scenario: PDF Generator renders OCR Track tables correctly +- **GIVEN** a `UnifiedDocument` from OCR Track containing table elements +- **WHEN** the PDF Generator processes the document +- **THEN** tables SHALL be rendered as formatted tables (not as raw HTML text) +- **AND** the rendering SHALL be identical to Direct Track table rendering + +#### Scenario: 
Direct Track table processing remains unchanged +- **GIVEN** a native PDF with embedded tables +- **WHEN** the document is processed via Direct Track +- **THEN** the `DirectExtractionEngine` SHALL continue to produce `TableData` objects as before +- **AND** the `ocr_to_unified_converter.py` changes SHALL NOT affect Direct Track processing +- **AND** table rendering in PDF output SHALL be identical to pre-fix behavior + +#### Scenario: Hybrid Mode table source isolation +- **GIVEN** a document processed via Hybrid Mode (Direct Track primary + OCR Track for images) +- **WHEN** the system merges OCR Track results into Direct Track results +- **THEN** only image elements (FIGURE, IMAGE, LOGO) SHALL be merged from OCR Track +- **AND** table elements SHALL exclusively come from Direct Track +- **AND** no OCR Track table data SHALL contaminate the final output diff --git a/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/tasks.md b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/tasks.md new file mode 100644 index 0000000..df303f8 --- /dev/null +++ b/openspec/changes/archive/2025-11-26-fix-ocr-track-table-data-format/tasks.md @@ -0,0 +1,43 @@ +# Tasks: Fix OCR Track Table Data Format + +## 1. Implementation + +- [x] 1.1 Add BeautifulSoup import and dependency check in `ocr_to_unified_converter.py` +- [x] 1.2 Rewrite `_extract_table_data` method to parse HTML using BeautifulSoup +- [x] 1.3 Extract cell content, row index, column index for each `<td>` and `<th>` element +- [x] 1.4 Handle `rowspan` and `colspan` attributes for merged cells +- [x] 1.5 Create `TableCell` objects with proper content and positioning +- [x] 1.6 Populate `TableData.cells` array with extracted `TableCell` objects +- [x] 1.7 Preserve header detection (`<th>` elements) and store in `TableData.headers` + +## 2. 
Edge Case Handling + +- [x] 2.1 Handle malformed HTML tables gracefully (missing closing tags, nested tables) +- [x] 2.2 Handle empty cells (create TableCell with empty string content) +- [x] 2.3 Handle tables without `<table>` structure (fallback to current behavior) +- [x] 2.4 Log warnings for unparseable tables instead of failing silently + +## 3. Testing + +- [x] 3.1 Create unit tests for `_extract_table_data` with various HTML table formats +- [x] 3.2 Test simple tables (basic rows/columns) +- [x] 3.3 Test tables with merged cells (rowspan/colspan) +- [x] 3.4 Test tables with header rows (`<th>` elements) +- [x] 3.5 Test malformed HTML tables (handled via BeautifulSoup's tolerance) +- [ ] 3.6 Integration test: OCR Track PDF generation with tables + +## 4. Verification (Track Isolation) + +- [x] 4.1 Compare OCR Track table output format with Direct Track output format +- [ ] 4.2 Verify PDF Generator renders OCR Track tables correctly +- [x] 4.3 **Direct Track regression test**: `direct_extraction_engine.py` NOT modified (confirmed via git status) +- [x] 4.4 **Hybrid Mode regression test**: `ocr_service.py` NOT modified, image merge logic unchanged +- [x] 4.5 **OCR Track fix verification**: Unit tests confirm: + - `TableData.cells` array is populated (6 cells in 3x2 table) + - `TableCell` objects have correct row/col/content values + - Headers extracted correctly +- [x] 4.6 Verify `DirectExtractionEngine` code is NOT modified (isolation check - confirmed) + +## 5. Dependencies + +- [x] 5.1 Add `beautifulsoup4>=4.12.0` to `requirements.txt` diff --git a/requirements.txt b/requirements.txt index c018589..25eec3c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -69,3 +69,4 @@ pylint>=3.0.0 # ===== Utilities ===== python-magic>=0.4.27 # File type detection +beautifulsoup4>=4.12.0 # HTML table parsing for OCR track