feat: refactor dual-track architecture (Phase 1-5)
## Backend Changes

- **Service Layer Refactoring**:
  - Add ProcessingOrchestrator for unified document processing
  - Add PDFTableRenderer for table rendering extraction
  - Add PDFFontManager for font management with CJK support
  - Add MemoryPolicyEngine (73% code reduction from MemoryGuard)
- **Bug Fixes**:
  - Fix Direct Track table row span calculation
  - Fix OCR Track image path handling
  - Add cell_boxes coordinate validation
  - Filter out small decorative images
  - Add covering image detection

## Frontend Changes

- **State Management**:
  - Add TaskStore for centralized task state management
  - Add localStorage persistence for recent tasks
  - Add processing state tracking
- **Type Consolidation**:
  - Merge shared types from api.ts to apiV2.ts
  - Update imports in authStore, uploadStore, ResultsTable, SettingsPage
- **Page Integration**:
  - Integrate TaskStore in ProcessingPage and TaskDetailPage
  - Update useTaskValidation hook with cache sync

## Testing

- Direct Track: edit.pdf (3 pages, 1.281s), edit3.pdf (2 pages, 0.203s)
- Cell boxes validation: 43 valid, 0 invalid
- Table merging: 12 merged cells verified

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1048,19 +1048,24 @@ class DirectExtractionEngine:
|
||||
bbox=cell_bbox
|
||||
))
|
||||
|
||||
# Try to detect visual column boundaries from page drawings
|
||||
# Try to detect visual column and row boundaries from page drawings
|
||||
# This is more accurate than PyMuPDF's column detection for complex tables
|
||||
visual_boundaries = self._detect_visual_column_boundaries(
|
||||
fitz_page, bbox_data, column_widths
|
||||
)
|
||||
# Use table.cells (flat list of bboxes) for more accurate row detection
|
||||
raw_table_cells = getattr(table, 'cells', None)
|
||||
row_boundaries = self._detect_visual_row_boundaries(
|
||||
fitz_page, bbox_data, raw_table_cells
|
||||
)
|
||||
|
||||
if visual_boundaries:
|
||||
# Remap cells to visual columns
|
||||
cells, column_widths, num_cols = self._remap_cells_to_visual_columns(
|
||||
cells, column_widths, num_rows, num_cols, visual_boundaries
|
||||
# Remap cells to visual columns and rows
|
||||
cells, column_widths, num_cols, num_rows = self._remap_cells_to_visual_columns(
|
||||
cells, column_widths, num_rows, num_cols, visual_boundaries, row_boundaries
|
||||
)
|
||||
else:
|
||||
# Fallback to narrow column merging
|
||||
# Fallback to narrow column merging (doesn't modify rows)
|
||||
cells, column_widths, num_cols = self._merge_narrow_columns(
|
||||
cells, column_widths, num_rows, num_cols,
|
||||
min_column_width=10.0
|
||||
@@ -1290,7 +1295,13 @@ class DirectExtractionEngine:
|
||||
|
||||
For tables with complex merged cells, PyMuPDF's column detection often
|
||||
creates too many columns. This method analyzes the visual rectangles
|
||||
(cell backgrounds) to find the true column boundaries.
|
||||
(cell backgrounds) to find the MAIN column boundaries by frequency analysis.
|
||||
|
||||
Strategy:
|
||||
1. Collect all cell rectangles from drawings
|
||||
2. Count how frequently each x boundary appears (rounded to 5pt)
|
||||
3. Keep only boundaries that appear frequently (>= threshold)
|
||||
4. These are the main column boundaries that span most rows
|
||||
|
||||
Args:
|
||||
page: PyMuPDF page object
|
||||
@@ -1301,67 +1312,215 @@ class DirectExtractionEngine:
|
||||
List of column boundary x-coordinates, or None if detection fails
|
||||
"""
|
||||
try:
|
||||
table_rect = fitz.Rect(table_bbox)
|
||||
from collections import Counter
|
||||
|
||||
# Collect cell rectangles from page drawings
|
||||
cell_rects = []
|
||||
drawings = page.get_drawings()
|
||||
for d in drawings:
|
||||
rect = fitz.Rect(d.get('rect', (0, 0, 0, 0)))
|
||||
# Filter: must intersect table, must be large enough to be a cell
|
||||
if (table_rect.intersects(rect) and
|
||||
rect.width > 30 and rect.height > 15):
|
||||
cell_rects.append(rect)
|
||||
if d.get('items'):
|
||||
for item in d['items']:
|
||||
if item[0] == 're': # Rectangle
|
||||
rect = item[1]
|
||||
# Filter: within table bounds, large enough to be a cell
|
||||
if (rect.x0 >= table_bbox[0] - 5 and
|
||||
rect.x1 <= table_bbox[2] + 5 and
|
||||
rect.y0 >= table_bbox[1] - 5 and
|
||||
rect.y1 <= table_bbox[3] + 5):
|
||||
width = rect.x1 - rect.x0
|
||||
height = rect.y1 - rect.y0
|
||||
if width > 30 and height > 15:
|
||||
cell_rects.append(rect)
|
||||
|
||||
if len(cell_rects) < 4:
|
||||
# Not enough cell rectangles detected
|
||||
logger.debug(f"Only {len(cell_rects)} cell rectangles found, skipping visual detection")
|
||||
return None
|
||||
|
||||
# Collect unique x boundaries
|
||||
all_x = set()
|
||||
logger.debug(f"Found {len(cell_rects)} cell rectangles for visual column detection")
|
||||
|
||||
# Count frequency of each boundary (rounded to 5pt)
|
||||
boundary_counts = Counter()
|
||||
for r in cell_rects:
|
||||
all_x.add(round(r.x0, 0))
|
||||
all_x.add(round(r.x1, 0))
|
||||
boundary_counts[round(r.x0 / 5) * 5] += 1
|
||||
boundary_counts[round(r.x1 / 5) * 5] += 1
|
||||
|
||||
# Merge close boundaries (within 15pt threshold)
|
||||
def merge_close(values, threshold=15):
|
||||
if not values:
|
||||
return []
|
||||
values = sorted(values)
|
||||
result = [values[0]]
|
||||
for v in values[1:]:
|
||||
if v - result[-1] > threshold:
|
||||
result.append(v)
|
||||
return result
|
||||
# Keep only boundaries that appear frequently
|
||||
# Use 8% threshold to catch internal column boundaries (like nested sub-columns)
|
||||
min_frequency = max(3, len(cell_rects) * 0.08)
|
||||
frequent_boundaries = sorted([
|
||||
x for x, count in boundary_counts.items()
|
||||
if count >= min_frequency
|
||||
])
|
||||
|
||||
boundaries = merge_close(list(all_x), threshold=15)
|
||||
# Always include table edges
|
||||
table_left = round(table_bbox[0] / 5) * 5
|
||||
table_right = round(table_bbox[2] / 5) * 5
|
||||
if not frequent_boundaries or frequent_boundaries[0] > table_left + 10:
|
||||
frequent_boundaries.insert(0, table_left)
|
||||
if not frequent_boundaries or frequent_boundaries[-1] < table_right - 10:
|
||||
frequent_boundaries.append(table_right)
|
||||
|
||||
if len(boundaries) < 3:
|
||||
logger.debug(f"Frequent boundaries (min_freq={min_frequency:.0f}): {frequent_boundaries}")
|
||||
|
||||
if len(frequent_boundaries) < 3:
|
||||
# Need at least 3 boundaries for 2 columns
|
||||
return None
|
||||
|
||||
# Calculate column widths from visual boundaries
|
||||
visual_widths = [boundaries[i+1] - boundaries[i]
|
||||
for i in range(len(boundaries)-1)]
|
||||
# Merge close boundaries (within 10pt) - take the one with higher frequency
|
||||
def merge_close_by_frequency(boundaries, counts, threshold=10):
|
||||
if not boundaries:
|
||||
return []
|
||||
result = [boundaries[0]]
|
||||
for b in boundaries[1:]:
|
||||
if b - result[-1] <= threshold:
|
||||
# Keep the one with higher frequency
|
||||
if counts[b] > counts[result[-1]]:
|
||||
result[-1] = b
|
||||
else:
|
||||
result.append(b)
|
||||
return result
|
||||
|
||||
# Filter out narrow "separator" columns (< 20pt)
|
||||
# and keep only content columns
|
||||
content_boundaries = [boundaries[0]]
|
||||
for i, width in enumerate(visual_widths):
|
||||
if width >= 20: # Content column
|
||||
content_boundaries.append(boundaries[i+1])
|
||||
# Skip narrow separator columns
|
||||
merged_boundaries = merge_close_by_frequency(
|
||||
frequent_boundaries, boundary_counts, threshold=10
|
||||
)
|
||||
|
||||
if len(content_boundaries) < 3:
|
||||
if len(merged_boundaries) < 3:
|
||||
return None
|
||||
|
||||
logger.info(f"Visual column detection: {len(content_boundaries)-1} columns from drawings")
|
||||
logger.debug(f"Visual boundaries: {content_boundaries}")
|
||||
# Calculate column widths
|
||||
widths = [merged_boundaries[i+1] - merged_boundaries[i]
|
||||
for i in range(len(merged_boundaries)-1)]
|
||||
|
||||
return content_boundaries
|
||||
logger.info(f"Visual column detection: {len(widths)} columns")
|
||||
logger.info(f" Boundaries: {merged_boundaries}")
|
||||
logger.info(f" Widths: {[round(w) for w in widths]}")
|
||||
|
||||
return merged_boundaries
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Visual column detection failed: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
return None
|
||||
|
||||
def _detect_visual_row_boundaries(
    self,
    page: fitz.Page,
    table_bbox: Tuple[float, float, float, float],
    table_cells: Optional[List] = None
) -> Optional[List[float]]:
    """
    Derive the main horizontal (row) boundaries of a table.

    Top and bottom edges of every sampled cell are snapped onto a 5pt grid
    and tallied. Only edges that occur often enough are kept, the table's
    own top/bottom are added when no kept edge lies close to them, and
    edges within 10pt of each other are collapsed into the more frequent one.

    Args:
        page: PyMuPDF page object; only consulted (via get_drawings) when
            no table_cells are supplied
        table_bbox: Table bounding box (x0, y0, x1, y1)
        table_cells: Cell bboxes, indexable as (x0, y0, x1, y1); falsy
            entries are ignored. When this is empty or None, rectangle
            items from the page drawings are sampled instead.

    Returns:
        Sorted list of row boundary y-coordinates (at least 3 entries,
        i.e. 2 rows), or None when fewer than 4 cells were sampled, too
        few boundaries survive, or any exception occurs
    """
    try:
        from collections import Counter

        def snap(coord):
            # Quantise a coordinate onto the 5pt grid used for tallying
            return round(coord / 5) * 5

        edge_votes = Counter()
        cell_count = 0

        if table_cells:
            # Preferred source: cell bboxes handed in by the caller
            for bbox in table_cells:
                if not bbox:
                    continue
                top_edge = snap(bbox[1])
                bottom_edge = snap(bbox[3])
                edge_votes[top_edge] += 1
                edge_votes[bottom_edge] += 1
                cell_count += 1
        else:
            # Fallback source: rectangle items found in the page drawings
            for drawing in page.get_drawings():
                for item in drawing.get('items') or ():
                    if item[0] != 're':
                        continue
                    rect = item[1]
                    # Rectangle must lie inside the table, with 5pt slack
                    inside_table = (
                        rect.x0 >= table_bbox[0] - 5 and
                        rect.x1 <= table_bbox[2] + 5 and
                        rect.y0 >= table_bbox[1] - 5 and
                        rect.y1 <= table_bbox[3] + 5
                    )
                    if not inside_table:
                        continue
                    rect_width = rect.x1 - rect.x0
                    rect_height = rect.y1 - rect.y0
                    # Skip rectangles too small to be a cell
                    if rect_width > 30 and rect_height > 15:
                        edge_votes[snap(rect.y0)] += 1
                        edge_votes[snap(rect.y1)] += 1
                        cell_count += 1

        if cell_count < 4:
            logger.debug(f"Only {cell_count} cells found, skipping visual row detection")
            return None

        # An edge must be hit at least 3 times, or by 8% of the sampled
        # cells, whichever is larger, to count as a boundary
        min_frequency = max(3, cell_count * 0.08)
        frequent_boundaries = sorted(
            edge for edge, votes in edge_votes.items()
            if votes >= min_frequency
        )

        # Make sure the table's own top and bottom are represented
        table_top = snap(table_bbox[1])
        table_bottom = snap(table_bbox[3])
        if not frequent_boundaries or frequent_boundaries[0] > table_top + 10:
            frequent_boundaries.insert(0, table_top)
        # The list cannot be empty here: either it already had entries or
        # the top edge was just inserted
        if frequent_boundaries[-1] < table_bottom - 10:
            frequent_boundaries.append(table_bottom)

        logger.debug(f"Frequent Y boundaries (min_freq={min_frequency:.0f}): {frequent_boundaries}")

        # Two rows need three boundaries
        if len(frequent_boundaries) < 3:
            return None

        # Collapse boundaries lying within 10pt of the previously kept one,
        # keeping whichever of the two was hit more often
        merged_boundaries = [frequent_boundaries[0]]
        for candidate in frequent_boundaries[1:]:
            kept = merged_boundaries[-1]
            if candidate - kept <= 10:
                if edge_votes[candidate] > edge_votes[kept]:
                    merged_boundaries[-1] = candidate
            else:
                merged_boundaries.append(candidate)

        if len(merged_boundaries) < 3:
            return None

        # Row heights are only needed for the log output below
        heights = [
            lower - upper
            for upper, lower in zip(merged_boundaries, merged_boundaries[1:])
        ]

        logger.info(f"Visual row detection: {len(heights)} rows")
        logger.info(f" Y Boundaries: {merged_boundaries}")
        logger.info(f" Heights: {[round(h) for h in heights]}")

        return merged_boundaries

    except Exception as e:
        # Best-effort detection: report the failure and let the caller
        # proceed without visual row boundaries
        logger.warning(f"Visual row detection failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return None
|
||||
|
||||
def _remap_cells_to_visual_columns(
|
||||
@@ -1370,8 +1529,9 @@ class DirectExtractionEngine:
|
||||
column_widths: List[float],
|
||||
num_rows: int,
|
||||
num_cols: int,
|
||||
visual_boundaries: List[float]
|
||||
) -> Tuple[List[TableCell], List[float], int]:
|
||||
visual_boundaries: List[float],
|
||||
row_boundaries: Optional[List[float]] = None
|
||||
) -> Tuple[List[TableCell], List[float], int, int]:
|
||||
"""
|
||||
Remap cells from PyMuPDF columns to visual columns based on cell bbox.
|
||||
|
||||
@@ -1381,35 +1541,64 @@ class DirectExtractionEngine:
|
||||
num_rows: Number of rows
|
||||
num_cols: Original number of columns
|
||||
visual_boundaries: Column boundaries from visual detection
|
||||
row_boundaries: Row boundaries from visual detection (optional)
|
||||
|
||||
Returns:
|
||||
Tuple of (remapped_cells, new_widths, new_num_cols)
|
||||
Tuple of (remapped_cells, new_widths, new_num_cols, new_num_rows)
|
||||
"""
|
||||
try:
|
||||
new_num_cols = len(visual_boundaries) - 1
|
||||
new_widths = [visual_boundaries[i+1] - visual_boundaries[i]
|
||||
for i in range(new_num_cols)]
|
||||
|
||||
logger.info(f"Remapping {len(cells)} cells from {num_cols} to {new_num_cols} visual columns")
|
||||
new_num_rows = len(row_boundaries) - 1 if row_boundaries else num_rows
|
||||
|
||||
# Map each cell to visual column based on its bbox center
|
||||
cell_map = {} # (row, new_col) -> list of cells
|
||||
logger.info(f"Remapping {len(cells)} cells from {num_cols} to {new_num_cols} visual columns")
|
||||
if row_boundaries:
|
||||
logger.info(f"Using {new_num_rows} visual rows for row_span calculation")
|
||||
|
||||
# Map each cell to visual column and row based on its bbox
|
||||
# This ensures spanning cells are placed at their correct position
|
||||
cell_map = {} # (visual_row, start_col) -> list of cells
|
||||
|
||||
for cell in cells:
|
||||
if not cell.bbox:
|
||||
continue
|
||||
|
||||
# Find which visual column this cell belongs to
|
||||
cell_center_x = (cell.bbox.x0 + cell.bbox.x1) / 2
|
||||
new_col = 0
|
||||
for i in range(new_num_cols):
|
||||
if visual_boundaries[i] <= cell_center_x < visual_boundaries[i+1]:
|
||||
new_col = i
|
||||
break
|
||||
elif cell_center_x >= visual_boundaries[-1]:
|
||||
new_col = new_num_cols - 1
|
||||
# Find start column based on left edge of cell
|
||||
cell_x0 = cell.bbox.x0
|
||||
start_col = 0
|
||||
|
||||
key = (cell.row, new_col)
|
||||
# First check if cell_x0 is very close to any boundary (within 5pt)
|
||||
# If so, it belongs to the column that starts at that boundary
|
||||
snapped = False
|
||||
for i in range(1, len(visual_boundaries)): # Skip first (left edge)
|
||||
if abs(cell_x0 - visual_boundaries[i]) <= 5:
|
||||
start_col = min(i, new_num_cols - 1)
|
||||
snapped = True
|
||||
break
|
||||
|
||||
# If not snapped to boundary, use standard containment check
|
||||
if not snapped:
|
||||
for i in range(new_num_cols):
|
||||
if visual_boundaries[i] <= cell_x0 < visual_boundaries[i+1]:
|
||||
start_col = i
|
||||
break
|
||||
elif cell_x0 >= visual_boundaries[-1]:
|
||||
start_col = new_num_cols - 1
|
||||
|
||||
# Find visual row based on top edge of cell
|
||||
visual_row = cell.row # Default to original row
|
||||
if row_boundaries:
|
||||
cell_y0 = cell.bbox.y0
|
||||
for i in range(new_num_rows):
|
||||
if row_boundaries[i] <= cell_y0 + 5 < row_boundaries[i+1]:
|
||||
visual_row = i
|
||||
break
|
||||
elif cell_y0 >= row_boundaries[-1] - 5:
|
||||
visual_row = new_num_rows - 1
|
||||
|
||||
key = (visual_row, start_col)
|
||||
if key not in cell_map:
|
||||
cell_map[key] = []
|
||||
cell_map[key].append(cell)
|
||||
@@ -1418,8 +1607,8 @@ class DirectExtractionEngine:
|
||||
remapped_cells = []
|
||||
processed = set()
|
||||
|
||||
for (row, new_col), cell_list in sorted(cell_map.items()):
|
||||
if (row, new_col) in processed:
|
||||
for (visual_row, start_col), cell_list in sorted(cell_map.items()):
|
||||
if (visual_row, start_col) in processed:
|
||||
continue
|
||||
|
||||
# Sort by original column
|
||||
@@ -1433,23 +1622,35 @@ class DirectExtractionEngine:
|
||||
|
||||
merged_content = '\n'.join(contents) if contents else ''
|
||||
|
||||
# Use the first cell for span info
|
||||
base_cell = cell_list[0]
|
||||
# Use the cell with tallest bbox for row span calculation
|
||||
# (handles case where multiple cells merge into one)
|
||||
tallest_cell = max(cell_list, key=lambda c: (c.bbox.y1 - c.bbox.y0) if c.bbox else 0)
|
||||
widest_cell = max(cell_list, key=lambda c: (c.bbox.x1 - c.bbox.x0) if c.bbox else 0)
|
||||
|
||||
# Calculate col_span based on visual boundaries
|
||||
if base_cell.bbox:
|
||||
cell_x1 = base_cell.bbox.x1
|
||||
# Find end column
|
||||
end_col = new_col
|
||||
for i in range(new_col, new_num_cols):
|
||||
if visual_boundaries[i+1] <= cell_x1 + 5: # 5pt tolerance
|
||||
# Calculate col_span based on right edge of widest cell
|
||||
col_span = 1
|
||||
if widest_cell.bbox:
|
||||
cell_x1 = widest_cell.bbox.x1
|
||||
end_col = start_col
|
||||
for i in range(start_col, new_num_cols):
|
||||
if cell_x1 > visual_boundaries[i] + 5: # 5pt tolerance
|
||||
end_col = i
|
||||
col_span = max(1, end_col - new_col + 1)
|
||||
else:
|
||||
col_span = 1
|
||||
col_span = max(1, end_col - start_col + 1)
|
||||
|
||||
# Calculate row_span based on visual row boundaries
|
||||
row_span = 1
|
||||
if row_boundaries and tallest_cell.bbox:
|
||||
cell_y1 = tallest_cell.bbox.y1
|
||||
|
||||
# Find end row based on bottom edge of tallest cell
|
||||
end_row = visual_row
|
||||
for i in range(visual_row, new_num_rows):
|
||||
if cell_y1 > row_boundaries[i] + 5: # 5pt tolerance
|
||||
end_row = i
|
||||
row_span = max(1, end_row - visual_row + 1)
|
||||
|
||||
# Merge bbox from all cells
|
||||
merged_bbox = base_cell.bbox
|
||||
merged_bbox = tallest_cell.bbox
|
||||
for c in cell_list:
|
||||
if c.bbox and merged_bbox:
|
||||
merged_bbox = BoundingBox(
|
||||
@@ -1462,23 +1663,39 @@ class DirectExtractionEngine:
|
||||
merged_bbox = c.bbox
|
||||
|
||||
remapped_cells.append(TableCell(
|
||||
row=row,
|
||||
col=new_col,
|
||||
row_span=base_cell.row_span,
|
||||
row=visual_row,
|
||||
col=start_col,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
content=merged_content,
|
||||
bbox=merged_bbox
|
||||
))
|
||||
processed.add((row, new_col))
|
||||
processed.add((visual_row, start_col))
|
||||
|
||||
logger.info(f"Remapped to {len(remapped_cells)} cells in {new_num_cols} columns")
|
||||
# Filter out cells that are covered by spans from other cells
|
||||
# Build a set of positions covered by spans
|
||||
covered_positions = set()
|
||||
for cell in remapped_cells:
|
||||
if cell.col_span > 1 or cell.row_span > 1:
|
||||
for r in range(cell.row, cell.row + cell.row_span):
|
||||
for c in range(cell.col, cell.col + cell.col_span):
|
||||
if (r, c) != (cell.row, cell.col): # Don't cover the origin
|
||||
covered_positions.add((r, c))
|
||||
|
||||
return remapped_cells, new_widths, new_num_cols
|
||||
# Remove covered cells
|
||||
final_cells = [
|
||||
cell for cell in remapped_cells
|
||||
if (cell.row, cell.col) not in covered_positions
|
||||
]
|
||||
|
||||
logger.info(f"Remapped to {len(final_cells)} cells in {new_num_cols} columns x {new_num_rows} rows (filtered {len(remapped_cells) - len(final_cells)} covered cells)")
|
||||
|
||||
return final_cells, new_widths, new_num_cols, new_num_rows
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Cell remapping failed: {e}")
|
||||
# Fallback to original
|
||||
return cells, column_widths, num_cols
|
||||
return cells, column_widths, num_cols, num_rows
|
||||
|
||||
def _detect_tables_by_position(self, page: fitz.Page, page_num: int, counter: int) -> List[DocumentElement]:
|
||||
"""Detect tables by analyzing text positioning"""
|
||||
@@ -2138,12 +2355,23 @@ class DirectExtractionEngine:
|
||||
logger.warning(f"Custom clustering failed ({e}), using fallback method")
|
||||
drawing_clusters = self._cluster_drawings_fallback(page, non_table_drawings)
|
||||
|
||||
# Get page dimensions for filtering
|
||||
page_rect = page.rect
|
||||
page_area = page_rect.width * page_rect.height
|
||||
|
||||
for cluster_idx, bbox in enumerate(drawing_clusters):
|
||||
# Ignore small regions (likely noise or separator lines)
|
||||
if bbox.width < 50 or bbox.height < 50:
|
||||
logger.debug(f"Skipping small cluster {cluster_idx}: {bbox.width:.1f}x{bbox.height:.1f}")
|
||||
continue
|
||||
|
||||
# Ignore very large regions that cover most of the page
|
||||
# These are usually background elements, page borders, or misdetected regions
|
||||
cluster_area = bbox.width * bbox.height
|
||||
if cluster_area > page_area * 0.7: # More than 70% of page
|
||||
logger.debug(f"Skipping large cluster {cluster_idx}: covers {cluster_area/page_area*100:.0f}% of page")
|
||||
continue
|
||||
|
||||
# Render the region to a raster image
|
||||
# matrix=fitz.Matrix(2, 2) increases resolution to ~200 DPI
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user