feat: refactor dual-track architecture (Phase 1-5)

## Backend Changes
- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)

- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes
- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking

- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage

- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing
- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-07 07:18:27 +08:00
parent 8265be1741
commit eff9b0bcd5
19 changed files with 3637 additions and 173 deletions

View File

@@ -1048,19 +1048,24 @@ class DirectExtractionEngine:
bbox=cell_bbox
))
# Try to detect visual column boundaries from page drawings
# Try to detect visual column and row boundaries from page drawings
# This is more accurate than PyMuPDF's column detection for complex tables
visual_boundaries = self._detect_visual_column_boundaries(
fitz_page, bbox_data, column_widths
)
# Use table.cells (flat list of bboxes) for more accurate row detection
raw_table_cells = getattr(table, 'cells', None)
row_boundaries = self._detect_visual_row_boundaries(
fitz_page, bbox_data, raw_table_cells
)
if visual_boundaries:
# Remap cells to visual columns
cells, column_widths, num_cols = self._remap_cells_to_visual_columns(
cells, column_widths, num_rows, num_cols, visual_boundaries
# Remap cells to visual columns and rows
cells, column_widths, num_cols, num_rows = self._remap_cells_to_visual_columns(
cells, column_widths, num_rows, num_cols, visual_boundaries, row_boundaries
)
else:
# Fallback to narrow column merging
# Fallback to narrow column merging (doesn't modify rows)
cells, column_widths, num_cols = self._merge_narrow_columns(
cells, column_widths, num_rows, num_cols,
min_column_width=10.0
@@ -1290,7 +1295,13 @@ class DirectExtractionEngine:
For tables with complex merged cells, PyMuPDF's column detection often
creates too many columns. This method analyzes the visual rectangles
(cell backgrounds) to find the true column boundaries.
(cell backgrounds) to find the MAIN column boundaries by frequency analysis.
Strategy:
1. Collect all cell rectangles from drawings
2. Count how frequently each x boundary appears (rounded to 5pt)
3. Keep only boundaries that appear frequently (>= threshold)
4. These are the main column boundaries that span most rows
Args:
page: PyMuPDF page object
@@ -1301,67 +1312,215 @@ class DirectExtractionEngine:
List of column boundary x-coordinates, or None if detection fails
"""
try:
table_rect = fitz.Rect(table_bbox)
from collections import Counter
# Collect cell rectangles from page drawings
cell_rects = []
drawings = page.get_drawings()
for d in drawings:
rect = fitz.Rect(d.get('rect', (0, 0, 0, 0)))
# Filter: must intersect table, must be large enough to be a cell
if (table_rect.intersects(rect) and
rect.width > 30 and rect.height > 15):
cell_rects.append(rect)
if d.get('items'):
for item in d['items']:
if item[0] == 're': # Rectangle
rect = item[1]
# Filter: within table bounds, large enough to be a cell
if (rect.x0 >= table_bbox[0] - 5 and
rect.x1 <= table_bbox[2] + 5 and
rect.y0 >= table_bbox[1] - 5 and
rect.y1 <= table_bbox[3] + 5):
width = rect.x1 - rect.x0
height = rect.y1 - rect.y0
if width > 30 and height > 15:
cell_rects.append(rect)
if len(cell_rects) < 4:
# Not enough cell rectangles detected
logger.debug(f"Only {len(cell_rects)} cell rectangles found, skipping visual detection")
return None
# Collect unique x boundaries
all_x = set()
logger.debug(f"Found {len(cell_rects)} cell rectangles for visual column detection")
# Count frequency of each boundary (rounded to 5pt)
boundary_counts = Counter()
for r in cell_rects:
all_x.add(round(r.x0, 0))
all_x.add(round(r.x1, 0))
boundary_counts[round(r.x0 / 5) * 5] += 1
boundary_counts[round(r.x1 / 5) * 5] += 1
# Merge close boundaries (within 15pt threshold)
def merge_close(values, threshold=15):
if not values:
return []
values = sorted(values)
result = [values[0]]
for v in values[1:]:
if v - result[-1] > threshold:
result.append(v)
return result
# Keep only boundaries that appear frequently
# Use 8% threshold to catch internal column boundaries (like nested sub-columns)
min_frequency = max(3, len(cell_rects) * 0.08)
frequent_boundaries = sorted([
x for x, count in boundary_counts.items()
if count >= min_frequency
])
boundaries = merge_close(list(all_x), threshold=15)
# Always include table edges
table_left = round(table_bbox[0] / 5) * 5
table_right = round(table_bbox[2] / 5) * 5
if not frequent_boundaries or frequent_boundaries[0] > table_left + 10:
frequent_boundaries.insert(0, table_left)
if not frequent_boundaries or frequent_boundaries[-1] < table_right - 10:
frequent_boundaries.append(table_right)
if len(boundaries) < 3:
logger.debug(f"Frequent boundaries (min_freq={min_frequency:.0f}): {frequent_boundaries}")
if len(frequent_boundaries) < 3:
# Need at least 3 boundaries for 2 columns
return None
# Calculate column widths from visual boundaries
visual_widths = [boundaries[i+1] - boundaries[i]
for i in range(len(boundaries)-1)]
# Merge close boundaries (within 10pt) - take the one with higher frequency
def merge_close_by_frequency(boundaries, counts, threshold=10):
if not boundaries:
return []
result = [boundaries[0]]
for b in boundaries[1:]:
if b - result[-1] <= threshold:
# Keep the one with higher frequency
if counts[b] > counts[result[-1]]:
result[-1] = b
else:
result.append(b)
return result
# Filter out narrow "separator" columns (< 20pt)
# and keep only content columns
content_boundaries = [boundaries[0]]
for i, width in enumerate(visual_widths):
if width >= 20: # Content column
content_boundaries.append(boundaries[i+1])
# Skip narrow separator columns
merged_boundaries = merge_close_by_frequency(
frequent_boundaries, boundary_counts, threshold=10
)
if len(content_boundaries) < 3:
if len(merged_boundaries) < 3:
return None
logger.info(f"Visual column detection: {len(content_boundaries)-1} columns from drawings")
logger.debug(f"Visual boundaries: {content_boundaries}")
# Calculate column widths
widths = [merged_boundaries[i+1] - merged_boundaries[i]
for i in range(len(merged_boundaries)-1)]
return content_boundaries
logger.info(f"Visual column detection: {len(widths)} columns")
logger.info(f" Boundaries: {merged_boundaries}")
logger.info(f" Widths: {[round(w) for w in widths]}")
return merged_boundaries
except Exception as e:
logger.warning(f"Visual column detection failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return None
def _detect_visual_row_boundaries(
    self,
    page: fitz.Page,
    table_bbox: Tuple[float, float, float, float],
    table_cells: Optional[List] = None
) -> Optional[List[float]]:
    """
    Estimate the y-coordinates that separate the rows of a table.

    Top and bottom edges of the sampled boxes are snapped to a 5pt grid
    and counted; only edges that recur often enough are kept, the table's
    own top/bottom are added when no kept edge lies near them, and edges
    that sit within 10pt of each other are collapsed to the more frequent
    one.

    Args:
        page: PyMuPDF page object; only consulted (via get_drawings())
            when table_cells is empty or None
        table_bbox: Table bounding box (x0, y0, x1, y1)
        table_cells: Sequence of cell bboxes indexable as
            (x0, y0, x1, y1); falsy entries are skipped

    Returns:
        Sorted list of row boundary y-coordinates (at least 3 values,
        i.e. at least 2 rows), or None when fewer than 4 boxes were
        sampled, too few boundaries survive, or any exception occurs
    """
    try:
        from collections import Counter

        def snap(value):
            # Quantise a coordinate to the nearest multiple of 5pt
            return round(value / 5) * 5

        edge_votes = Counter()
        sample_total = 0

        if table_cells:
            # Preferred source: bboxes supplied by the caller
            for bbox in table_cells:
                if not bbox:
                    continue
                top_edge = snap(bbox[1])
                bottom_edge = snap(bbox[3])
                edge_votes[top_edge] += 1
                edge_votes[bottom_edge] += 1
                sample_total += 1
        else:
            # Fallback source: rectangle items among the page drawings
            for drawing in page.get_drawings():
                for item in drawing.get('items') or ():
                    if item[0] != 're':
                        continue
                    rect = item[1]
                    # Allow 5pt of slack around the table bbox
                    within_table = (
                        rect.x0 >= table_bbox[0] - 5 and
                        rect.x1 <= table_bbox[2] + 5 and
                        rect.y0 >= table_bbox[1] - 5 and
                        rect.y1 <= table_bbox[3] + 5
                    )
                    if not within_table:
                        continue
                    rect_w = rect.x1 - rect.x0
                    rect_h = rect.y1 - rect.y0
                    # Ignore rectangles too small to be a cell
                    if rect_w > 30 and rect_h > 15:
                        top_edge = snap(rect.y0)
                        bottom_edge = snap(rect.y1)
                        edge_votes[top_edge] += 1
                        edge_votes[bottom_edge] += 1
                        sample_total += 1

        if sample_total < 4:
            logger.debug(f"Only {sample_total} cells found, skipping visual row detection")
            return None

        # An edge must be seen at least 3 times, or in 8% of the samples,
        # whichever is larger
        min_frequency = max(3, sample_total * 0.08)
        candidates = sorted(
            y for y, votes in edge_votes.items() if votes >= min_frequency
        )

        # Guarantee the outer table edges are represented
        top_of_table = snap(table_bbox[1])
        bottom_of_table = snap(table_bbox[3])
        if not candidates or candidates[0] > top_of_table + 10:
            candidates.insert(0, top_of_table)
        if not candidates or candidates[-1] < bottom_of_table - 10:
            candidates.append(bottom_of_table)

        logger.debug(f"Frequent Y boundaries (min_freq={min_frequency:.0f}): {candidates}")

        # Two rows need three boundaries
        if len(candidates) < 3:
            return None

        # Collapse edges that are within 10pt of the previously kept edge,
        # keeping whichever of the two was counted more often (first wins ties)
        merge_threshold = 10
        row_edges = [candidates[0]]
        for y in candidates[1:]:
            kept = row_edges[-1]
            if y - kept <= merge_threshold:
                if edge_votes[y] > edge_votes[kept]:
                    row_edges[-1] = y
            else:
                row_edges.append(y)

        if len(row_edges) < 3:
            return None

        row_heights = [below - above for above, below in zip(row_edges, row_edges[1:])]

        logger.info(f"Visual row detection: {len(row_heights)} rows")
        logger.info(f" Y Boundaries: {row_edges}")
        logger.info(f" Heights: {[round(h) for h in row_heights]}")

        return row_edges

    except Exception as e:
        # Best-effort detection: report and let the caller fall back
        logger.warning(f"Visual row detection failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return None
def _remap_cells_to_visual_columns(
@@ -1370,8 +1529,9 @@ class DirectExtractionEngine:
column_widths: List[float],
num_rows: int,
num_cols: int,
visual_boundaries: List[float]
) -> Tuple[List[TableCell], List[float], int]:
visual_boundaries: List[float],
row_boundaries: Optional[List[float]] = None
) -> Tuple[List[TableCell], List[float], int, int]:
"""
Remap cells from PyMuPDF columns to visual columns based on cell bbox.
@@ -1381,35 +1541,64 @@ class DirectExtractionEngine:
num_rows: Number of rows
num_cols: Original number of columns
visual_boundaries: Column boundaries from visual detection
row_boundaries: Row boundaries from visual detection (optional)
Returns:
Tuple of (remapped_cells, new_widths, new_num_cols)
Tuple of (remapped_cells, new_widths, new_num_cols, new_num_rows)
"""
try:
new_num_cols = len(visual_boundaries) - 1
new_widths = [visual_boundaries[i+1] - visual_boundaries[i]
for i in range(new_num_cols)]
logger.info(f"Remapping {len(cells)} cells from {num_cols} to {new_num_cols} visual columns")
new_num_rows = len(row_boundaries) - 1 if row_boundaries else num_rows
# Map each cell to visual column based on its bbox center
cell_map = {} # (row, new_col) -> list of cells
logger.info(f"Remapping {len(cells)} cells from {num_cols} to {new_num_cols} visual columns")
if row_boundaries:
logger.info(f"Using {new_num_rows} visual rows for row_span calculation")
# Map each cell to visual column and row based on its bbox
# This ensures spanning cells are placed at their correct position
cell_map = {} # (visual_row, start_col) -> list of cells
for cell in cells:
if not cell.bbox:
continue
# Find which visual column this cell belongs to
cell_center_x = (cell.bbox.x0 + cell.bbox.x1) / 2
new_col = 0
for i in range(new_num_cols):
if visual_boundaries[i] <= cell_center_x < visual_boundaries[i+1]:
new_col = i
break
elif cell_center_x >= visual_boundaries[-1]:
new_col = new_num_cols - 1
# Find start column based on left edge of cell
cell_x0 = cell.bbox.x0
start_col = 0
key = (cell.row, new_col)
# First check if cell_x0 is very close to any boundary (within 5pt)
# If so, it belongs to the column that starts at that boundary
snapped = False
for i in range(1, len(visual_boundaries)): # Skip first (left edge)
if abs(cell_x0 - visual_boundaries[i]) <= 5:
start_col = min(i, new_num_cols - 1)
snapped = True
break
# If not snapped to boundary, use standard containment check
if not snapped:
for i in range(new_num_cols):
if visual_boundaries[i] <= cell_x0 < visual_boundaries[i+1]:
start_col = i
break
elif cell_x0 >= visual_boundaries[-1]:
start_col = new_num_cols - 1
# Find visual row based on top edge of cell
visual_row = cell.row # Default to original row
if row_boundaries:
cell_y0 = cell.bbox.y0
for i in range(new_num_rows):
if row_boundaries[i] <= cell_y0 + 5 < row_boundaries[i+1]:
visual_row = i
break
elif cell_y0 >= row_boundaries[-1] - 5:
visual_row = new_num_rows - 1
key = (visual_row, start_col)
if key not in cell_map:
cell_map[key] = []
cell_map[key].append(cell)
@@ -1418,8 +1607,8 @@ class DirectExtractionEngine:
remapped_cells = []
processed = set()
for (row, new_col), cell_list in sorted(cell_map.items()):
if (row, new_col) in processed:
for (visual_row, start_col), cell_list in sorted(cell_map.items()):
if (visual_row, start_col) in processed:
continue
# Sort by original column
@@ -1433,23 +1622,35 @@ class DirectExtractionEngine:
merged_content = '\n'.join(contents) if contents else ''
# Use the first cell for span info
base_cell = cell_list[0]
# Use the cell with tallest bbox for row span calculation
# (handles case where multiple cells merge into one)
tallest_cell = max(cell_list, key=lambda c: (c.bbox.y1 - c.bbox.y0) if c.bbox else 0)
widest_cell = max(cell_list, key=lambda c: (c.bbox.x1 - c.bbox.x0) if c.bbox else 0)
# Calculate col_span based on visual boundaries
if base_cell.bbox:
cell_x1 = base_cell.bbox.x1
# Find end column
end_col = new_col
for i in range(new_col, new_num_cols):
if visual_boundaries[i+1] <= cell_x1 + 5: # 5pt tolerance
# Calculate col_span based on right edge of widest cell
col_span = 1
if widest_cell.bbox:
cell_x1 = widest_cell.bbox.x1
end_col = start_col
for i in range(start_col, new_num_cols):
if cell_x1 > visual_boundaries[i] + 5: # 5pt tolerance
end_col = i
col_span = max(1, end_col - new_col + 1)
else:
col_span = 1
col_span = max(1, end_col - start_col + 1)
# Calculate row_span based on visual row boundaries
row_span = 1
if row_boundaries and tallest_cell.bbox:
cell_y1 = tallest_cell.bbox.y1
# Find end row based on bottom edge of tallest cell
end_row = visual_row
for i in range(visual_row, new_num_rows):
if cell_y1 > row_boundaries[i] + 5: # 5pt tolerance
end_row = i
row_span = max(1, end_row - visual_row + 1)
# Merge bbox from all cells
merged_bbox = base_cell.bbox
merged_bbox = tallest_cell.bbox
for c in cell_list:
if c.bbox and merged_bbox:
merged_bbox = BoundingBox(
@@ -1462,23 +1663,39 @@ class DirectExtractionEngine:
merged_bbox = c.bbox
remapped_cells.append(TableCell(
row=row,
col=new_col,
row_span=base_cell.row_span,
row=visual_row,
col=start_col,
row_span=row_span,
col_span=col_span,
content=merged_content,
bbox=merged_bbox
))
processed.add((row, new_col))
processed.add((visual_row, start_col))
logger.info(f"Remapped to {len(remapped_cells)} cells in {new_num_cols} columns")
# Filter out cells that are covered by spans from other cells
# Build a set of positions covered by spans
covered_positions = set()
for cell in remapped_cells:
if cell.col_span > 1 or cell.row_span > 1:
for r in range(cell.row, cell.row + cell.row_span):
for c in range(cell.col, cell.col + cell.col_span):
if (r, c) != (cell.row, cell.col): # Don't cover the origin
covered_positions.add((r, c))
return remapped_cells, new_widths, new_num_cols
# Remove covered cells
final_cells = [
cell for cell in remapped_cells
if (cell.row, cell.col) not in covered_positions
]
logger.info(f"Remapped to {len(final_cells)} cells in {new_num_cols} columns x {new_num_rows} rows (filtered {len(remapped_cells) - len(final_cells)} covered cells)")
return final_cells, new_widths, new_num_cols, new_num_rows
except Exception as e:
logger.error(f"Cell remapping failed: {e}")
# Fallback to original
return cells, column_widths, num_cols
return cells, column_widths, num_cols, num_rows
def _detect_tables_by_position(self, page: fitz.Page, page_num: int, counter: int) -> List[DocumentElement]:
"""Detect tables by analyzing text positioning"""
@@ -2138,12 +2355,23 @@ class DirectExtractionEngine:
logger.warning(f"Custom clustering failed ({e}), using fallback method")
drawing_clusters = self._cluster_drawings_fallback(page, non_table_drawings)
# Get page dimensions for filtering
page_rect = page.rect
page_area = page_rect.width * page_rect.height
for cluster_idx, bbox in enumerate(drawing_clusters):
# Ignore small regions (likely noise or separator lines)
if bbox.width < 50 or bbox.height < 50:
logger.debug(f"Skipping small cluster {cluster_idx}: {bbox.width:.1f}x{bbox.height:.1f}")
continue
# Ignore very large regions that cover most of the page
# These are usually background elements, page borders, or misdetected regions
cluster_area = bbox.width * bbox.height
if cluster_area > page_area * 0.7: # More than 70% of page
logger.debug(f"Skipping large cluster {cluster_idx}: covers {cluster_area/page_area*100:.0f}% of page")
continue
# Render the region to a raster image
# matrix=fitz.Matrix(2, 2) increases resolution to ~200 DPI
try: