feat: refactor dual-track architecture (Phase 1-5)

## Backend Changes

- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)
- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes

- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking
- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage
- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing

- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-07 07:18:27 +08:00
parent 8265be1741
commit eff9b0bcd5
19 changed files with 3637 additions and 173 deletions
--- a/backend/app/services/direct_extraction_engine.py
+++ b/backend/app/services/direct_extraction_engine.py
@@ -1048,19 +1048,24 @@ class DirectExtractionEngine:
                        bbox=cell_bbox
                    ))

-            # Try to detect visual column boundaries from page drawings
+            # Try to detect visual column and row boundaries from page drawings
            # This is more accurate than PyMuPDF's column detection for complex tables
            visual_boundaries = self._detect_visual_column_boundaries(
                fitz_page, bbox_data, column_widths
            )
+            # Use table.cells (flat list of bboxes) for more accurate row detection
+            raw_table_cells = getattr(table, 'cells', None)
+            row_boundaries = self._detect_visual_row_boundaries(
+                fitz_page, bbox_data, raw_table_cells
+            )

            if visual_boundaries:
-                # Remap cells to visual columns
-                cells, column_widths, num_cols = self._remap_cells_to_visual_columns(
-                    cells, column_widths, num_rows, num_cols, visual_boundaries
+                # Remap cells to visual columns and rows
+                cells, column_widths, num_cols, num_rows = self._remap_cells_to_visual_columns(
+                    cells, column_widths, num_rows, num_cols, visual_boundaries, row_boundaries
                )
            else:
-                # Fallback to narrow column merging
+                # Fallback to narrow column merging (doesn't modify rows)
                cells, column_widths, num_cols = self._merge_narrow_columns(
                    cells, column_widths, num_rows, num_cols,
                    min_column_width=10.0
@@ -1290,7 +1295,13 @@ class DirectExtractionEngine:

        For tables with complex merged cells, PyMuPDF's column detection often
        creates too many columns. This method analyzes the visual rectangles
-        (cell backgrounds) to find the true column boundaries.
+        (cell backgrounds) to find the MAIN column boundaries by frequency analysis.
+
+        Strategy:
+        1. Collect all cell rectangles from drawings
+        2. Count how frequently each x boundary appears (rounded to 5pt)
+        3. Keep only boundaries that appear frequently (>= threshold)
+        4. These are the main column boundaries that span most rows

        Args:
            page: PyMuPDF page object
@@ -1301,67 +1312,215 @@ class DirectExtractionEngine:
            List of column boundary x-coordinates, or None if detection fails
        """
        try:
-            table_rect = fitz.Rect(table_bbox)
+            from collections import Counter

            # Collect cell rectangles from page drawings
            cell_rects = []
            drawings = page.get_drawings()
            for d in drawings:
-                rect = fitz.Rect(d.get('rect', (0, 0, 0, 0)))
-                # Filter: must intersect table, must be large enough to be a cell
-                if (table_rect.intersects(rect) and
-                    rect.width > 30 and rect.height > 15):
-                    cell_rects.append(rect)
+                if d.get('items'):
+                    for item in d['items']:
+                        if item[0] == 're':  # Rectangle
+                            rect = item[1]
+                            # Filter: within table bounds, large enough to be a cell
+                            if (rect.x0 >= table_bbox[0] - 5 and
+                                rect.x1 <= table_bbox[2] + 5 and
+                                rect.y0 >= table_bbox[1] - 5 and
+                                rect.y1 <= table_bbox[3] + 5):
+                                width = rect.x1 - rect.x0
+                                height = rect.y1 - rect.y0
+                                if width > 30 and height > 15:
+                                    cell_rects.append(rect)

            if len(cell_rects) < 4:
                # Not enough cell rectangles detected
+                logger.debug(f"Only {len(cell_rects)} cell rectangles found, skipping visual detection")
                return None

-            # Collect unique x boundaries
-            all_x = set()
+            logger.debug(f"Found {len(cell_rects)} cell rectangles for visual column detection")
+
+            # Count frequency of each boundary (rounded to 5pt)
+            boundary_counts = Counter()
            for r in cell_rects:
-                all_x.add(round(r.x0, 0))
-                all_x.add(round(r.x1, 0))
+                boundary_counts[round(r.x0 / 5) * 5] += 1
+                boundary_counts[round(r.x1 / 5) * 5] += 1

-            # Merge close boundaries (within 15pt threshold)
-            def merge_close(values, threshold=15):
-                if not values:
-                    return []
-                values = sorted(values)
-                result = [values[0]]
-                for v in values[1:]:
-                    if v - result[-1] > threshold:
-                        result.append(v)
-                return result
+            # Keep only boundaries that appear frequently
+            # Use 8% threshold to catch internal column boundaries (like nested sub-columns)
+            min_frequency = max(3, len(cell_rects) * 0.08)
+            frequent_boundaries = sorted([
+                x for x, count in boundary_counts.items()
+                if count >= min_frequency
+            ])

-            boundaries = merge_close(list(all_x), threshold=15)
+            # Always include table edges
+            table_left = round(table_bbox[0] / 5) * 5
+            table_right = round(table_bbox[2] / 5) * 5
+            if not frequent_boundaries or frequent_boundaries[0] > table_left + 10:
+                frequent_boundaries.insert(0, table_left)
+            if not frequent_boundaries or frequent_boundaries[-1] < table_right - 10:
+                frequent_boundaries.append(table_right)

-            if len(boundaries) < 3:
+            logger.debug(f"Frequent boundaries (min_freq={min_frequency:.0f}): {frequent_boundaries}")
+
+            if len(frequent_boundaries) < 3:
                # Need at least 3 boundaries for 2 columns
                return None

-            # Calculate column widths from visual boundaries
-            visual_widths = [boundaries[i+1] - boundaries[i]
-                           for i in range(len(boundaries)-1)]
+            # Merge close boundaries (within 10pt) - take the one with higher frequency
+            def merge_close_by_frequency(boundaries, counts, threshold=10):
+                if not boundaries:
+                    return []
+                result = [boundaries[0]]
+                for b in boundaries[1:]:
+                    if b - result[-1] <= threshold:
+                        # Keep the one with higher frequency
+                        if counts[b] > counts[result[-1]]:
+                            result[-1] = b
+                    else:
+                        result.append(b)
+                return result

-            # Filter out narrow "separator" columns (< 20pt)
-            # and keep only content columns
-            content_boundaries = [boundaries[0]]
-            for i, width in enumerate(visual_widths):
-                if width >= 20:  # Content column
-                    content_boundaries.append(boundaries[i+1])
-                # Skip narrow separator columns
+            merged_boundaries = merge_close_by_frequency(
+                frequent_boundaries, boundary_counts, threshold=10
+            )

-            if len(content_boundaries) < 3:
+            if len(merged_boundaries) < 3:
                return None

-            logger.info(f"Visual column detection: {len(content_boundaries)-1} columns from drawings")
-            logger.debug(f"Visual boundaries: {content_boundaries}")
+            # Calculate column widths
+            widths = [merged_boundaries[i+1] - merged_boundaries[i]
+                     for i in range(len(merged_boundaries)-1)]

-            return content_boundaries
+            logger.info(f"Visual column detection: {len(widths)} columns")
+            logger.info(f"  Boundaries: {merged_boundaries}")
+            logger.info(f"  Widths: {[round(w) for w in widths]}")
+
+            return merged_boundaries

        except Exception as e:
            logger.warning(f"Visual column detection failed: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+            return None
+
+    def _detect_visual_row_boundaries(
+        self,
+        page: fitz.Page,
+        table_bbox: Tuple[float, float, float, float],
+        table_cells: Optional[List] = None
+    ) -> Optional[List[float]]:
+        """
+        Detect actual row boundaries from table cell bboxes.
+
+        Uses cell bboxes from PyMuPDF table detection for more accurate
+        row boundary detection than page drawings.
+
+        Args:
+            page: PyMuPDF page object
+            table_bbox: Table bounding box (x0, y0, x1, y1)
+            table_cells: List of cell bboxes from table.cells (preferred)
+
+        Returns:
+            List of row boundary y-coordinates, or None if detection fails
+        """
+        try:
+            from collections import Counter
+
+            boundary_counts = Counter()
+            cell_count = 0
+
+            if table_cells:
+                # Use table cells directly (more accurate for row detection)
+                for cell_bbox in table_cells:
+                    if cell_bbox:
+                        y0 = round(cell_bbox[1] / 5) * 5
+                        y1 = round(cell_bbox[3] / 5) * 5
+                        boundary_counts[y0] += 1
+                        boundary_counts[y1] += 1
+                        cell_count += 1
+            else:
+                # Fallback to page drawings
+                drawings = page.get_drawings()
+                for d in drawings:
+                    if d.get('items'):
+                        for item in d['items']:
+                            if item[0] == 're':
+                                rect = item[1]
+                                if (rect.x0 >= table_bbox[0] - 5 and
+                                    rect.x1 <= table_bbox[2] + 5 and
+                                    rect.y0 >= table_bbox[1] - 5 and
+                                    rect.y1 <= table_bbox[3] + 5):
+                                    width = rect.x1 - rect.x0
+                                    height = rect.y1 - rect.y0
+                                    if width > 30 and height > 15:
+                                        y0 = round(rect.y0 / 5) * 5
+                                        y1 = round(rect.y1 / 5) * 5
+                                        boundary_counts[y0] += 1
+                                        boundary_counts[y1] += 1
+                                        cell_count += 1
+
+            if cell_count < 4:
+                logger.debug(f"Only {cell_count} cells found, skipping visual row detection")
+                return None
+
+            # Keep only boundaries that appear frequently
+            # Use 8% threshold similar to column detection
+            min_frequency = max(3, cell_count * 0.08)
+            frequent_boundaries = sorted([
+                y for y, count in boundary_counts.items()
+                if count >= min_frequency
+            ])
+
+            # Always include table edges
+            table_top = round(table_bbox[1] / 5) * 5
+            table_bottom = round(table_bbox[3] / 5) * 5
+            if not frequent_boundaries or frequent_boundaries[0] > table_top + 10:
+                frequent_boundaries.insert(0, table_top)
+            if not frequent_boundaries or frequent_boundaries[-1] < table_bottom - 10:
+                frequent_boundaries.append(table_bottom)
+
+            logger.debug(f"Frequent Y boundaries (min_freq={min_frequency:.0f}): {frequent_boundaries}")
+
+            if len(frequent_boundaries) < 3:
+                # Need at least 3 boundaries for 2 rows
+                return None
+
+            # Merge close boundaries (within 10pt) - take the one with higher frequency
+            def merge_close_by_frequency(boundaries, counts, threshold=10):
+                if not boundaries:
+                    return []
+                result = [boundaries[0]]
+                for b in boundaries[1:]:
+                    if b - result[-1] <= threshold:
+                        # Keep the one with higher frequency
+                        if counts[b] > counts[result[-1]]:
+                            result[-1] = b
+                    else:
+                        result.append(b)
+                return result
+
+            merged_boundaries = merge_close_by_frequency(
+                frequent_boundaries, boundary_counts, threshold=10
+            )
+
+            if len(merged_boundaries) < 3:
+                return None
+
+            # Calculate row heights
+            heights = [merged_boundaries[i+1] - merged_boundaries[i]
+                      for i in range(len(merged_boundaries)-1)]
+
+            logger.info(f"Visual row detection: {len(heights)} rows")
+            logger.info(f"  Y Boundaries: {merged_boundaries}")
+            logger.info(f"  Heights: {[round(h) for h in heights]}")
+
+            return merged_boundaries
+
+        except Exception as e:
+            logger.warning(f"Visual row detection failed: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
            return None

    def _remap_cells_to_visual_columns(
@@ -1370,8 +1529,9 @@ class DirectExtractionEngine:
        column_widths: List[float],
        num_rows: int,
        num_cols: int,
-        visual_boundaries: List[float]
-    ) -> Tuple[List[TableCell], List[float], int]:
+        visual_boundaries: List[float],
+        row_boundaries: Optional[List[float]] = None
+    ) -> Tuple[List[TableCell], List[float], int, int]:
        """
        Remap cells from PyMuPDF columns to visual columns based on cell bbox.

@@ -1381,35 +1541,64 @@ class DirectExtractionEngine:
            num_rows: Number of rows
            num_cols: Original number of columns
            visual_boundaries: Column boundaries from visual detection
+            row_boundaries: Row boundaries from visual detection (optional)

        Returns:
-            Tuple of (remapped_cells, new_widths, new_num_cols)
+            Tuple of (remapped_cells, new_widths, new_num_cols, new_num_rows)
        """
        try:
            new_num_cols = len(visual_boundaries) - 1
            new_widths = [visual_boundaries[i+1] - visual_boundaries[i]
                         for i in range(new_num_cols)]

-            logger.info(f"Remapping {len(cells)} cells from {num_cols} to {new_num_cols} visual columns")
+            new_num_rows = len(row_boundaries) - 1 if row_boundaries else num_rows

-            # Map each cell to visual column based on its bbox center
-            cell_map = {}  # (row, new_col) -> list of cells
+            logger.info(f"Remapping {len(cells)} cells from {num_cols} to {new_num_cols} visual columns")
+            if row_boundaries:
+                logger.info(f"Using {new_num_rows} visual rows for row_span calculation")
+
+            # Map each cell to visual column and row based on its bbox
+            # This ensures spanning cells are placed at their correct position
+            cell_map = {}  # (visual_row, start_col) -> list of cells

            for cell in cells:
                if not cell.bbox:
                    continue

-                # Find which visual column this cell belongs to
-                cell_center_x = (cell.bbox.x0 + cell.bbox.x1) / 2
-                new_col = 0
-                for i in range(new_num_cols):
-                    if visual_boundaries[i] <= cell_center_x < visual_boundaries[i+1]:
-                        new_col = i
-                        break
-                    elif cell_center_x >= visual_boundaries[-1]:
-                        new_col = new_num_cols - 1
+                # Find start column based on left edge of cell
+                cell_x0 = cell.bbox.x0
+                start_col = 0

-                key = (cell.row, new_col)
+                # First check if cell_x0 is very close to any boundary (within 5pt)
+                # If so, it belongs to the column that starts at that boundary
+                snapped = False
+                for i in range(1, len(visual_boundaries)):  # Skip first (left edge)
+                    if abs(cell_x0 - visual_boundaries[i]) <= 5:
+                        start_col = min(i, new_num_cols - 1)
+                        snapped = True
+                        break
+
+                # If not snapped to boundary, use standard containment check
+                if not snapped:
+                    for i in range(new_num_cols):
+                        if visual_boundaries[i] <= cell_x0 < visual_boundaries[i+1]:
+                            start_col = i
+                            break
+                        elif cell_x0 >= visual_boundaries[-1]:
+                            start_col = new_num_cols - 1
+
+                # Find visual row based on top edge of cell
+                visual_row = cell.row  # Default to original row
+                if row_boundaries:
+                    cell_y0 = cell.bbox.y0
+                    for i in range(new_num_rows):
+                        if row_boundaries[i] <= cell_y0 + 5 < row_boundaries[i+1]:
+                            visual_row = i
+                            break
+                        elif cell_y0 >= row_boundaries[-1] - 5:
+                            visual_row = new_num_rows - 1
+
+                key = (visual_row, start_col)
                if key not in cell_map:
                    cell_map[key] = []
                cell_map[key].append(cell)
@@ -1418,8 +1607,8 @@ class DirectExtractionEngine:
            remapped_cells = []
            processed = set()

-            for (row, new_col), cell_list in sorted(cell_map.items()):
-                if (row, new_col) in processed:
+            for (visual_row, start_col), cell_list in sorted(cell_map.items()):
+                if (visual_row, start_col) in processed:
                    continue

                # Sort by original column
@@ -1433,23 +1622,35 @@ class DirectExtractionEngine:

                merged_content = '\n'.join(contents) if contents else ''

-                # Use the first cell for span info
-                base_cell = cell_list[0]
+                # Use the cell with tallest bbox for row span calculation
+                # (handles case where multiple cells merge into one)
+                tallest_cell = max(cell_list, key=lambda c: (c.bbox.y1 - c.bbox.y0) if c.bbox else 0)
+                widest_cell = max(cell_list, key=lambda c: (c.bbox.x1 - c.bbox.x0) if c.bbox else 0)

-                # Calculate col_span based on visual boundaries
-                if base_cell.bbox:
-                    cell_x1 = base_cell.bbox.x1
-                    # Find end column
-                    end_col = new_col
-                    for i in range(new_col, new_num_cols):
-                        if visual_boundaries[i+1] <= cell_x1 + 5:  # 5pt tolerance
+                # Calculate col_span based on right edge of widest cell
+                col_span = 1
+                if widest_cell.bbox:
+                    cell_x1 = widest_cell.bbox.x1
+                    end_col = start_col
+                    for i in range(start_col, new_num_cols):
+                        if cell_x1 > visual_boundaries[i] + 5:  # 5pt tolerance
                            end_col = i
-                    col_span = max(1, end_col - new_col + 1)
-                else:
-                    col_span = 1
+                    col_span = max(1, end_col - start_col + 1)
+
+                # Calculate row_span based on visual row boundaries
+                row_span = 1
+                if row_boundaries and tallest_cell.bbox:
+                    cell_y1 = tallest_cell.bbox.y1
+
+                    # Find end row based on bottom edge of tallest cell
+                    end_row = visual_row
+                    for i in range(visual_row, new_num_rows):
+                        if cell_y1 > row_boundaries[i] + 5:  # 5pt tolerance
+                            end_row = i
+                    row_span = max(1, end_row - visual_row + 1)

                # Merge bbox from all cells
-                merged_bbox = base_cell.bbox
+                merged_bbox = tallest_cell.bbox
                for c in cell_list:
                    if c.bbox and merged_bbox:
                        merged_bbox = BoundingBox(
@@ -1462,23 +1663,39 @@ class DirectExtractionEngine:
                        merged_bbox = c.bbox

                remapped_cells.append(TableCell(
-                    row=row,
-                    col=new_col,
-                    row_span=base_cell.row_span,
+                    row=visual_row,
+                    col=start_col,
+                    row_span=row_span,
                    col_span=col_span,
                    content=merged_content,
                    bbox=merged_bbox
                ))
-                processed.add((row, new_col))
+                processed.add((visual_row, start_col))

-            logger.info(f"Remapped to {len(remapped_cells)} cells in {new_num_cols} columns")
+            # Filter out cells that are covered by spans from other cells
+            # Build a set of positions covered by spans
+            covered_positions = set()
+            for cell in remapped_cells:
+                if cell.col_span > 1 or cell.row_span > 1:
+                    for r in range(cell.row, cell.row + cell.row_span):
+                        for c in range(cell.col, cell.col + cell.col_span):
+                            if (r, c) != (cell.row, cell.col):  # Don't cover the origin
+                                covered_positions.add((r, c))

-            return remapped_cells, new_widths, new_num_cols
+            # Remove covered cells
+            final_cells = [
+                cell for cell in remapped_cells
+                if (cell.row, cell.col) not in covered_positions
+            ]
+
+            logger.info(f"Remapped to {len(final_cells)} cells in {new_num_cols} columns x {new_num_rows} rows (filtered {len(remapped_cells) - len(final_cells)} covered cells)")
+
+            return final_cells, new_widths, new_num_cols, new_num_rows

        except Exception as e:
            logger.error(f"Cell remapping failed: {e}")
            # Fallback to original
-            return cells, column_widths, num_cols
+            return cells, column_widths, num_cols, num_rows

    def _detect_tables_by_position(self, page: fitz.Page, page_num: int, counter: int) -> List[DocumentElement]:
        """Detect tables by analyzing text positioning"""
@@ -2138,12 +2355,23 @@ class DirectExtractionEngine:
                logger.warning(f"Custom clustering failed ({e}), using fallback method")
                drawing_clusters = self._cluster_drawings_fallback(page, non_table_drawings)

+            # Get page dimensions for filtering
+            page_rect = page.rect
+            page_area = page_rect.width * page_rect.height
+
            for cluster_idx, bbox in enumerate(drawing_clusters):
                # Ignore small regions (likely noise or separator lines)
                if bbox.width < 50 or bbox.height < 50:
                    logger.debug(f"Skipping small cluster {cluster_idx}: {bbox.width:.1f}x{bbox.height:.1f}")
                    continue

+                # Ignore very large regions that cover most of the page
+                # These are usually background elements, page borders, or misdetected regions
+                cluster_area = bbox.width * bbox.height
+                if cluster_area > page_area * 0.7:  # More than 70% of page
+                    logger.debug(f"Skipping large cluster {cluster_idx}: covers {cluster_area/page_area*100:.0f}% of page")
+                    continue
+
                # Render the region to a raster image
                # matrix=fitz.Matrix(2, 2) increases resolution to ~200 DPI
                try: