egg
2025-12-04 18:00:37 +08:00
parent 9437387ef1
commit 8265be1741
22 changed files with 2672 additions and 196 deletions

File diff suppressed because it is too large


@@ -178,6 +178,114 @@ def trim_empty_columns(table_dict: Dict[str, Any]) -> Dict[str, Any]:
return result
def validate_cell_boxes(
cell_boxes: List[List[float]],
table_bbox: List[float],
page_width: float,
page_height: float,
tolerance: float = 5.0
) -> Dict[str, Any]:
"""
Validate cell_boxes coordinates against page boundaries and table bbox.
PP-StructureV3 sometimes returns cell_boxes with coordinates that exceed
page boundaries. This function validates and reports issues.
Args:
cell_boxes: List of cell bounding boxes [[x0, y0, x1, y1], ...]
table_bbox: Table bounding box [x0, y0, x1, y1]
page_width: Page width in pixels
page_height: Page height in pixels
tolerance: Allowed tolerance for boundary checks (pixels)
Returns:
Dict with:
- valid: bool - whether all cell_boxes are valid
- invalid_count: int - number of invalid cell_boxes
- clamped_boxes: List - cell_boxes clamped to valid boundaries
- issues: List[str] - descriptions of issues found
- needs_fallback: bool - True when more than 50% of cell_boxes are invalid
"""
if not cell_boxes:
return {'valid': True, 'invalid_count': 0, 'clamped_boxes': [], 'issues': []}
issues = []
invalid_count = 0
clamped_boxes = []
# Page boundaries with tolerance
min_x = -tolerance
min_y = -tolerance
max_x = page_width + tolerance
max_y = page_height + tolerance
for idx, box in enumerate(cell_boxes):
if not box or len(box) < 4:
issues.append(f"Cell {idx}: Invalid box format")
invalid_count += 1
clamped_boxes.append([0, 0, 0, 0])
continue
x0, y0, x1, y1 = box[:4]
is_valid = True
cell_issues = []
# Check if coordinates exceed page boundaries
if x0 < min_x:
cell_issues.append(f"x0={x0:.1f} < 0")
is_valid = False
if y0 < min_y:
cell_issues.append(f"y0={y0:.1f} < 0")
is_valid = False
if x1 > max_x:
cell_issues.append(f"x1={x1:.1f} > page_width={page_width:.1f}")
is_valid = False
if y1 > max_y:
cell_issues.append(f"y1={y1:.1f} > page_height={page_height:.1f}")
is_valid = False
# Check for inverted coordinates
if x0 > x1:
cell_issues.append(f"x0={x0:.1f} > x1={x1:.1f}")
is_valid = False
if y0 > y1:
cell_issues.append(f"y0={y0:.1f} > y1={y1:.1f}")
is_valid = False
if not is_valid:
invalid_count += 1
issues.append(f"Cell {idx}: {', '.join(cell_issues)}")
# Clamp to valid boundaries
clamped_box = [
max(0, min(x0, page_width)),
max(0, min(y0, page_height)),
max(0, min(x1, page_width)),
max(0, min(y1, page_height))
]
# Ensure proper ordering after clamping
if clamped_box[0] > clamped_box[2]:
clamped_box[0], clamped_box[2] = clamped_box[2], clamped_box[0]
if clamped_box[1] > clamped_box[3]:
clamped_box[1], clamped_box[3] = clamped_box[3], clamped_box[1]
clamped_boxes.append(clamped_box)
if invalid_count > 0:
logger.warning(
f"Cell boxes validation: {invalid_count}/{len(cell_boxes)} invalid. "
f"Page: {page_width:.0f}x{page_height:.0f}, Table bbox: {table_bbox}"
)
return {
'valid': invalid_count == 0,
'invalid_count': invalid_count,
'clamped_boxes': clamped_boxes,
'issues': issues,
'needs_fallback': invalid_count > len(cell_boxes) * 0.5 # >50% invalid = needs fallback
}
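A minimal usage sketch of the validator (the numbers are hypothetical, assuming an 800x600 pixel page where the second box extends past the right edge):

# Hypothetical input: the second box ends at x1=860 on an 800 px wide page.
cell_boxes = [[10, 10, 200, 50], [790, 10, 860, 50]]
report = validate_cell_boxes(
    cell_boxes=cell_boxes,
    table_bbox=[10, 10, 860, 50],
    page_width=800,
    page_height=600,
)
# report['valid'] is False and report['invalid_count'] == 1;
# the offending box is clamped to [790, 10, 800, 50].
# Only 1 of 2 boxes is invalid (not more than 50%), so
# report['needs_fallback'] is False.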
class OCRToUnifiedConverter:
"""
Converter for transforming PP-StructureV3 OCR results to UnifiedDocument format.
@@ -337,19 +445,22 @@ class OCRToUnifiedConverter:
for page_idx, page_result in enumerate(enhanced_results):
elements = []
# Get page dimensions first (needed for element conversion)
page_width = page_result.get('width', 0)
page_height = page_result.get('height', 0)
pp_dimensions = Dimensions(width=page_width, height=page_height)
# Process elements from parsing_res_list
if 'elements' in page_result:
for elem_data in page_result['elements']:
element = self._convert_pp3_element(elem_data, page_idx)
element = self._convert_pp3_element(
elem_data, page_idx,
page_width=page_width,
page_height=page_height
)
if element:
elements.append(element)
# Get page dimensions
pp_dimensions = Dimensions(
width=page_result.get('width', 0),
height=page_result.get('height', 0)
)
# Apply gap filling if enabled and raw regions available
if self.gap_filling_service and raw_text_regions:
# Filter raw regions for current page
@@ -556,9 +667,19 @@ class OCRToUnifiedConverter:
def _convert_pp3_element(
self,
elem_data: Dict[str, Any],
page_idx: int
page_idx: int,
page_width: float = 0,
page_height: float = 0
) -> Optional[DocumentElement]:
"""Convert PP-StructureV3 element to DocumentElement."""
"""
Convert PP-StructureV3 element to DocumentElement.
Args:
elem_data: Element data from PP-StructureV3
page_idx: Page index (0-based)
page_width: Page width for coordinate validation
page_height: Page height for coordinate validation
"""
try:
# Extract bbox
bbox_data = elem_data.get('bbox', [0, 0, 0, 0])
@@ -597,18 +718,67 @@ class OCRToUnifiedConverter:
# Preserve cell_boxes and embedded_images in metadata for PDF generation
# These are extracted by PP-StructureV3 and provide accurate cell positioning
if 'cell_boxes' in elem_data:
elem_data.setdefault('metadata', {})['cell_boxes'] = elem_data['cell_boxes']
elem_data['metadata']['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
cell_boxes = elem_data['cell_boxes']
elem_data.setdefault('metadata', {})['cell_boxes_source'] = elem_data.get('cell_boxes_source', 'table_res_list')
# Validate cell_boxes coordinates if page dimensions are available
if page_width > 0 and page_height > 0:
validation = validate_cell_boxes(
cell_boxes=cell_boxes,
table_bbox=bbox_data,
page_width=page_width,
page_height=page_height
)
if not validation['valid']:
elem_data['metadata']['cell_boxes_validation'] = {
'valid': False,
'invalid_count': validation['invalid_count'],
'total_count': len(cell_boxes),
'needs_fallback': validation['needs_fallback']
}
# Use clamped boxes instead of invalid ones
elem_data['metadata']['cell_boxes'] = validation['clamped_boxes']
elem_data['metadata']['cell_boxes_original'] = cell_boxes
if validation['needs_fallback']:
logger.warning(
f"Table {elem_data.get('element_id')}: "
f"{validation['invalid_count']}/{len(cell_boxes)} cell_boxes invalid, "
f"fallback recommended"
)
else:
elem_data['metadata']['cell_boxes'] = cell_boxes
elem_data['metadata']['cell_boxes_validation'] = {'valid': True}
else:
# No page dimensions available, store as-is
elem_data['metadata']['cell_boxes'] = cell_boxes
if 'embedded_images' in elem_data:
elem_data.setdefault('metadata', {})['embedded_images'] = elem_data['embedded_images']
elif element_type in [ElementType.IMAGE, ElementType.FIGURE]:
# For images, use metadata dict as content
elif element_type in [
ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART,
ElementType.DIAGRAM, ElementType.LOGO, ElementType.STAMP
]:
# For all visual elements, use metadata dict as content
# Priority: saved_path > img_path (PP-StructureV3 uses saved_path)
image_path = (
elem_data.get('saved_path') or
elem_data.get('img_path') or
''
)
content = {
'path': elem_data.get('img_path', ''),
'saved_path': image_path, # Preserve original path key
'path': image_path, # For backward compatibility
'width': elem_data.get('width', 0),
'height': elem_data.get('height', 0),
'format': elem_data.get('format', 'unknown')
}
if not image_path:
logger.warning(
f"Visual element {element_type.value} missing image path: "
f"saved_path={elem_data.get('saved_path')}, img_path={elem_data.get('img_path')}"
)
else:
content = elem_data.get('content', '')
@@ -1139,10 +1309,18 @@ class OCRToUnifiedConverter:
for page_idx, page_data in enumerate(pages_data):
elements = []
# Get page dimensions first
page_width = page_data.get('width', 0)
page_height = page_data.get('height', 0)
# Process each element in the page
if 'elements' in page_data:
for elem_data in page_data['elements']:
element = self._convert_pp3_element(elem_data, page_idx)
element = self._convert_pp3_element(
elem_data, page_idx,
page_width=page_width,
page_height=page_height
)
if element:
elements.append(element)
@@ -1150,8 +1328,8 @@ class OCRToUnifiedConverter:
page = Page(
page_number=page_idx + 1,
dimensions=Dimensions(
width=page_data.get('width', 0),
height=page_data.get('height', 0)
width=page_width,
height=page_height
),
elements=elements,
metadata={'reading_order': self._calculate_reading_order(elements)}


@@ -3371,18 +3371,21 @@ class PDFGeneratorService:
"rows": 6,
"cols": 2,
"cells": [
{"row": 0, "col": 0, "content": "..."},
{"row": 0, "col": 0, "content": "...", "row_span": 1, "col_span": 2},
{"row": 0, "col": 1, "content": "..."},
...
]
}
Returns format compatible with HTMLTableParser output:
Returns format compatible with HTMLTableParser output (with colspan/rowspan/col):
[
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 0
{"cells": [{"text": "..."}, {"text": "..."}]}, # row 1
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
{"cells": [{"text": "...", "colspan": 1, "rowspan": 1, "col": 0}, ...]},
...
]
Note: This returns actual cells per row with their absolute column positions.
The table renderer uses 'col' to place cells correctly in the grid.
"""
try:
num_rows = content.get('rows', 0)
@@ -3392,21 +3395,39 @@ class PDFGeneratorService:
if not cells or num_rows == 0 or num_cols == 0:
return []
# Initialize rows structure
rows_data = []
for _ in range(num_rows):
rows_data.append({'cells': [{'text': ''} for _ in range(num_cols)]})
# Fill in cell content
# Group cells by row
cells_by_row = {}
for cell in cells:
row_idx = cell.get('row', 0)
col_idx = cell.get('col', 0)
cell_content = cell.get('content', '')
if row_idx not in cells_by_row:
cells_by_row[row_idx] = []
cells_by_row[row_idx].append(cell)
if 0 <= row_idx < num_rows and 0 <= col_idx < num_cols:
rows_data[row_idx]['cells'][col_idx]['text'] = str(cell_content) if cell_content else ''
# Sort cells within each row by column
for row_idx in cells_by_row:
cells_by_row[row_idx].sort(key=lambda c: c.get('col', 0))
logger.debug(f"Built {num_rows} rows from cells dict")
# Build rows structure with colspan/rowspan info and absolute col position
rows_data = []
for row_idx in range(num_rows):
row_cells = []
if row_idx in cells_by_row:
for cell in cells_by_row[row_idx]:
cell_content = cell.get('content', '')
row_span = cell.get('row_span', 1) or 1
col_span = cell.get('col_span', 1) or 1
col_idx = cell.get('col', 0)
row_cells.append({
'text': str(cell_content) if cell_content else '',
'rowspan': row_span,
'colspan': col_span,
'col': col_idx # Absolute column position
})
rows_data.append({'cells': row_cells})
logger.debug(f"Built {num_rows} rows from cells dict with span info")
return rows_data
except Exception as e:
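As a hedged illustration of the mapping this hunk implements, a hypothetical cells dict and the row structure the new logic should produce:

# Hypothetical input: a content dict with one header cell spanning two columns.
content = {
    'rows': 2,
    'cols': 2,
    'cells': [
        {'row': 0, 'col': 0, 'content': 'Header', 'row_span': 1, 'col_span': 2},
        {'row': 1, 'col': 0, 'content': 'A'},
        {'row': 1, 'col': 1, 'content': 'B'},
    ],
}
# Expected rows_data: cells grouped by row, sorted by 'col', spans defaulting to 1.
expected = [
    {'cells': [{'text': 'Header', 'rowspan': 1, 'colspan': 2, 'col': 0}]},
    {'cells': [{'text': 'A', 'rowspan': 1, 'colspan': 1, 'col': 0},
               {'text': 'B', 'rowspan': 1, 'colspan': 1, 'col': 1}]},
]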
@@ -3471,19 +3492,115 @@ class PDFGeneratorService:
table_width = bbox.x1 - bbox.x0
table_height = bbox.y1 - bbox.y0
# Build table data for ReportLab
table_content = []
for row in rows:
row_data = [cell['text'].strip() for cell in row['cells']]
table_content.append(row_data)
# Create table
from reportlab.platypus import Table, TableStyle
from reportlab.lib import colors
# Determine number of rows and columns for cell_boxes calculation
# Determine grid size from rows structure
# Note: rows may have 'col' attribute for absolute positioning (from Direct extraction)
# or may be sequential (from HTML parsing)
num_rows = len(rows)
max_cols = max(len(row['cells']) for row in rows) if rows else 0
# Check if cells have absolute column positions
has_absolute_cols = any(
'col' in cell
for row in rows
for cell in row['cells']
)
# Calculate actual number of columns
max_cols = 0
if has_absolute_cols:
# Use absolute col positions + colspan to find max column
for row in rows:
for cell in row['cells']:
col = cell.get('col', 0)
colspan = cell.get('colspan', 1)
max_cols = max(max_cols, col + colspan)
else:
# Sequential cells: sum up colspans
for row in rows:
col_pos = 0
for cell in row['cells']:
colspan = cell.get('colspan', 1)
col_pos += colspan
max_cols = max(max_cols, col_pos)
# Build table data for ReportLab with proper grid structure
# ReportLab needs a full grid with placeholders for spanned cells
# and SPAN commands to merge them
table_content = []
span_commands = []
covered = set() # Track cells covered by spans
# First pass: mark covered cells and collect SPAN commands
for row_idx, row in enumerate(rows):
if has_absolute_cols:
# Use absolute column positions
for cell in row['cells']:
col_pos = cell.get('col', 0)
colspan = cell.get('colspan', 1)
rowspan = cell.get('rowspan', 1)
# Mark cells covered by this span
if colspan > 1 or rowspan > 1:
for r in range(row_idx, row_idx + rowspan):
for c in range(col_pos, col_pos + colspan):
if (r, c) != (row_idx, col_pos):
covered.add((r, c))
# Add SPAN command for ReportLab
span_commands.append((
'SPAN',
(col_pos, row_idx),
(col_pos + colspan - 1, row_idx + rowspan - 1)
))
else:
# Sequential positioning
col_pos = 0
for cell in row['cells']:
while (row_idx, col_pos) in covered:
col_pos += 1
colspan = cell.get('colspan', 1)
rowspan = cell.get('rowspan', 1)
if colspan > 1 or rowspan > 1:
for r in range(row_idx, row_idx + rowspan):
for c in range(col_pos, col_pos + colspan):
if (r, c) != (row_idx, col_pos):
covered.add((r, c))
span_commands.append((
'SPAN',
(col_pos, row_idx),
(col_pos + colspan - 1, row_idx + rowspan - 1)
))
col_pos += colspan
# Second pass: build content grid
for row_idx in range(num_rows):
row_data = [''] * max_cols
if row_idx < len(rows):
if has_absolute_cols:
# Place cells at their absolute positions
for cell in rows[row_idx]['cells']:
col_pos = cell.get('col', 0)
if col_pos < max_cols:
row_data[col_pos] = cell['text'].strip()
else:
# Sequential placement
col_pos = 0
for cell in rows[row_idx]['cells']:
while col_pos < max_cols and (row_idx, col_pos) in covered:
col_pos += 1
if col_pos < max_cols:
row_data[col_pos] = cell['text'].strip()
colspan = cell.get('colspan', 1)
col_pos += colspan
table_content.append(row_data)
logger.debug(f"Built table grid: {num_rows} rows × {max_cols} cols, {len(span_commands)} span commands (absolute_cols={has_absolute_cols})")
# Use original column widths from extraction if available
# Otherwise try to compute from cell_boxes (from PP-StructureV3)
@@ -3517,7 +3634,7 @@ class PDFGeneratorService:
# Apply style with minimal padding to reduce table extension
# Use Chinese font to support special characters (℃, μm, ≦, ×, Ω, etc.)
font_for_table = self.font_name if self.font_registered else 'Helvetica'
style = TableStyle([
style_commands = [
('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
('FONTNAME', (0, 0), (-1, -1), font_for_table),
('FONTSIZE', (0, 0), (-1, -1), 8),
@@ -3529,7 +3646,13 @@ class PDFGeneratorService:
('BOTTOMPADDING', (0, 0), (-1, -1), 0),
('LEFTPADDING', (0, 0), (-1, -1), 1),
('RIGHTPADDING', (0, 0), (-1, -1), 1),
])
]
# Add span commands for merged cells
style_commands.extend(span_commands)
if span_commands:
logger.info(f"Applied {len(span_commands)} SPAN commands for merged cells")
style = TableStyle(style_commands)
t.setStyle(style)
# Use canvas scaling as fallback to fit table within bbox
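For reference, a self-contained ReportLab sketch of the grid-plus-SPAN technique the code above builds (the data and output filename are illustrative, not taken from this diff):

from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle

# A 2x2 grid where the first row is a single merged cell; the covered
# position (col 1, row 0) holds an empty placeholder string.
data = [
    ['Header', ''],
    ['A', 'B'],
]
style = TableStyle([
    ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
    ('SPAN', (0, 0), (1, 0)),  # merge (col 0, row 0) through (col 1, row 0)
])
SimpleDocTemplate('span_demo.pdf', pagesize=A4).build([Table(data, style=style)])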
@@ -4350,33 +4473,100 @@ class PDFGeneratorService:
# Replace newlines with <br/>
safe_content = safe_content.replace('\n', '<br/>')
# Calculate font size from bbox height, but keep minimum 10pt
font_size = max(box_height * 0.7, 10)
font_size = min(font_size, 24) # Cap at 24pt
# Get original font size from style info
style_info = elem.get('style', {})
original_font_size = style_info.get('font_size', 12.0)
# Create style for this element
elem_style = ParagraphStyle(
f'elem_{id(elem)}',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.2,
# Detect vertical text (Y-axis labels, etc.)
# Vertical text has aspect_ratio (height/width) > 2 and multiple characters
is_vertical_text = (
box_height > box_width * 2 and
len(content.strip()) > 1
)
# Create paragraph
para = Paragraph(safe_content, elem_style)
if is_vertical_text:
# For vertical text, use original font size and rotate
font_size = min(original_font_size, box_width * 0.9)
font_size = max(font_size, 6) # Minimum 6pt
# Calculate available width and height
available_width = box_width
available_height = box_height * 2 # Allow overflow
# Save canvas state for rotation
pdf_canvas.saveState()
# Wrap the paragraph
para_width, para_height = para.wrap(available_width, available_height)
# Convert to PDF coordinates
pdf_y_center = current_page_height - (y0 + y1) / 2
x_center = (x0 + x1) / 2
# Convert to PDF coordinates (y from bottom)
pdf_y = current_page_height - y0 - para_height
# Translate to center, rotate, translate back
pdf_canvas.translate(x_center, pdf_y_center)
pdf_canvas.rotate(90)
# Draw the paragraph
para.drawOn(pdf_canvas, x0, pdf_y)
# Set font and draw text centered
pdf_canvas.setFont(
self.font_name if self.font_registered else 'Helvetica',
font_size
)
# Draw text at origin (since we translated to center)
text_width = pdf_canvas.stringWidth(
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'),
self.font_name if self.font_registered else 'Helvetica',
font_size
)
pdf_canvas.drawString(-text_width / 2, -font_size / 3,
safe_content.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>'))
pdf_canvas.restoreState()
else:
# For horizontal text, dynamically fit text within bbox
# Start with original font size and reduce until text fits
MIN_FONT_SIZE = 6
MAX_FONT_SIZE = 14
if original_font_size > 0:
start_font_size = min(original_font_size, MAX_FONT_SIZE)
else:
start_font_size = min(box_height * 0.7, MAX_FONT_SIZE)
font_size = max(start_font_size, MIN_FONT_SIZE)
# Try progressively smaller font sizes until text fits
para = None
para_height = box_height + 1 # Start with height > box to enter loop
while font_size >= MIN_FONT_SIZE and para_height > box_height:
elem_style = ParagraphStyle(
f'elem_{id(elem)}_{font_size}',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.15, # Tighter leading
)
para = Paragraph(safe_content, elem_style)
para_width, para_height = para.wrap(box_width, box_height * 3)
if para_height <= box_height:
break # Text fits!
font_size -= 0.5 # Reduce font size and try again
# Ensure minimum font size
if font_size < MIN_FONT_SIZE:
font_size = MIN_FONT_SIZE
elem_style = ParagraphStyle(
f'elem_{id(elem)}_min',
parent=base_style,
fontSize=font_size,
leading=font_size * 1.15,
)
para = Paragraph(safe_content, elem_style)
para_width, para_height = para.wrap(box_width, box_height * 3)
# Convert to PDF coordinates (y from bottom)
# Clip to bbox height to prevent overflow
actual_height = min(para_height, box_height)
pdf_y = current_page_height - y0 - actual_height
# Draw the paragraph
para.drawOn(pdf_canvas, x0, pdf_y)
# Save PDF
pdf_canvas.save()
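A standalone sketch of the canvas rotation used for vertical labels above, under hypothetical coordinates and the built-in Helvetica font (the real code also unescapes HTML entities before measuring the string):

from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

c = canvas.Canvas('vertical_text_demo.pdf', pagesize=A4)
text = 'Y-axis label'
font_name, font_size = 'Helvetica', 8
x_center, y_center = 100, 400  # center of the tall, narrow bbox in PDF coordinates

c.saveState()
c.translate(x_center, y_center)   # move the origin to the bbox center
c.rotate(90)                      # rotate the coordinate system 90 degrees
c.setFont(font_name, font_size)
text_width = c.stringWidth(text, font_name, font_size)
c.drawString(-text_width / 2, -font_size / 3, text)  # roughly center on the origin
c.restoreState()
c.save()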
@@ -4451,13 +4641,47 @@ class PDFGeneratorService:
pdf_y_bottom = page_height - ty1
pdf_canvas.rect(tx0, pdf_y_bottom, table_width, table_height, stroke=1, fill=0)
# Step 2: Draw cell borders using cell_boxes
# Step 2: Get or calculate cell boxes
cell_boxes = metadata.get('cell_boxes', [])
if cell_boxes:
# Normalize cell boxes for grid alignment
if hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
# If no cell_boxes, calculate from column_widths and row_heights
if not cell_boxes:
column_widths = metadata.get('column_widths', [])
row_heights = metadata.get('row_heights', [])
if column_widths and row_heights:
# Calculate cell positions from widths and heights
cell_boxes = []
rows = content.get('rows', len(row_heights)) if isinstance(content, dict) else len(row_heights)
cols = content.get('cols', len(column_widths)) if isinstance(content, dict) else len(column_widths)
# Calculate cumulative positions
x_positions = [tx0]
for w in column_widths[:cols]:
x_positions.append(x_positions[-1] + w)
y_positions = [ty0]
for h in row_heights[:rows]:
y_positions.append(y_positions[-1] + h)
# Create cell boxes for each cell (row-major order)
for row_idx in range(rows):
for col_idx in range(cols):
if col_idx < len(x_positions) - 1 and row_idx < len(y_positions) - 1:
cx0 = x_positions[col_idx]
cy0 = y_positions[row_idx]
cx1 = x_positions[col_idx + 1]
cy1 = y_positions[row_idx + 1]
cell_boxes.append([cx0, cy0, cx1, cy1])
logger.debug(f"Calculated {len(cell_boxes)} cell boxes from {cols} cols x {rows} rows")
# Normalize cell boxes for grid alignment
if cell_boxes and hasattr(self, '_normalize_cell_boxes_to_grid'):
cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)
# Draw cell borders
if cell_boxes:
pdf_canvas.setLineWidth(0.5)
for box in cell_boxes:
if len(box) >= 4:
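A small worked illustration of the cumulative-position fallback above (the widths, heights, and origin are hypothetical):

# Hypothetical 2x2 table anchored at (tx0, ty0) = (10, 20).
tx0, ty0 = 10, 20
column_widths = [50, 100]
row_heights = [30, 30]

x_positions = [tx0]                # becomes [10, 60, 160]
for w in column_widths:
    x_positions.append(x_positions[-1] + w)
y_positions = [ty0]                # becomes [20, 50, 80]
for h in row_heights:
    y_positions.append(y_positions[-1] + h)

# Row-major cell boxes, as in the fallback:
# [[10, 20, 60, 50], [60, 20, 160, 50], [10, 50, 60, 80], [60, 50, 160, 80]]
cell_boxes = [
    [x_positions[c], y_positions[r], x_positions[c + 1], y_positions[r + 1]]
    for r in range(len(row_heights))
    for c in range(len(column_widths))
]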


@@ -558,8 +558,8 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# Special handling for images/figures/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.STAMP, ElementType.LOGO]:
# Special handling for images/figures/charts/diagrams/stamps/logos (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
if 'img_path' in item and output_dir:
saved_path = self._save_image(item['img_path'], output_dir, element['element_id'])


@@ -0,0 +1,43 @@
"""Debug PyMuPDF table.cells structure"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
doc = fitz.open(str(pdf_path))
page = doc[0]
tables = page.find_tables()
for idx, table in enumerate(tables.tables):
data = table.extract()
num_rows = len(data)
num_cols = max(len(row) for row in data) if data else 0
print(f"Table {idx}:")
print(f" table.extract() dimensions: {num_rows} rows x {num_cols} cols")
print(f" Expected positions: {num_rows * num_cols}")
cell_rects = getattr(table, 'cells', None)
if cell_rects:
print(f" table.cells length: {len(cell_rects)}")
none_count = sum(1 for c in cell_rects if c is None)
actual_count = sum(1 for c in cell_rects if c is not None)
print(f" None cells: {none_count}")
print(f" Actual cells: {actual_count}")
# Check if cell_rects matches grid size
if len(cell_rects) != num_rows * num_cols:
print(f" WARNING: cell_rects length ({len(cell_rects)}) != grid size ({num_rows * num_cols})")
# Show first few cells
print(f" First 5 cells: {cell_rects[:5]}")
else:
print(f" table.cells: NOT AVAILABLE")
# Check row_count and col_count
print(f" table.row_count: {getattr(table, 'row_count', 'N/A')}")
print(f" table.col_count: {getattr(table, 'col_count', 'N/A')}")
doc.close()


@@ -0,0 +1,48 @@
"""Debug PyMuPDF table structure - find merge info"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
doc = fitz.open(str(pdf_path))
page = doc[0]
tables = page.find_tables()
for idx, table in enumerate(tables.tables):
print(f"\nTable {idx}:")
# Check all available attributes
print(f" Available attributes: {[a for a in dir(table) if not a.startswith('_')]}")
# Try to get header info
if hasattr(table, 'header'):
print(f" header: {table.header}")
# Check for cells info
cell_rects = table.cells
print(f" cells count: {len(cell_rects)}")
# Get the extracted data
data = table.extract()
print(f" extract() shape: {len(data)} x {max(len(r) for r in data)}")
# Check if there's a way to map cells to grid positions
# Look at the pandas output which might have merge info
try:
df = table.to_pandas()
print(f" pandas shape: {df.shape}")
except Exception as e:
print(f" pandas error: {e}")
# Check the TableRow objects if available
if hasattr(table, 'rows'):
rows = table.rows
print(f" rows: {len(rows)}")
for ri, row in enumerate(rows[:3]): # first 3 rows
print(f" row {ri}: {len(row.cells)} cells")
for ci, cell in enumerate(row.cells[:5]): # first 5 cells
print(f" cell {ci}: bbox={cell}")
doc.close()


@@ -0,0 +1,111 @@
"""
Generate test PDF to verify Phase 1 fixes
"""
import sys
import os
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.pdf_generator_service import PDFGeneratorService
from app.services.unified_document_exporter import UnifiedDocumentExporter
def generate_test_pdf(input_pdf: str, output_dir: Path):
"""Generate test PDF using Direct Track extraction"""
input_path = Path(input_pdf)
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Processing: {input_path.name}")
print(f"Output dir: {output_dir}")
# Step 1: Extract with Direct Track
engine = DirectExtractionEngine(
enable_table_detection=True,
enable_image_extraction=True,
min_image_area=200.0, # Filter tiny images
enable_whiteout_detection=True,
enable_content_sanitization=True
)
unified_doc = engine.extract(input_path, output_dir=output_dir)
# Print extraction stats
print(f"\n=== Extraction Results ===")
print(f"Document ID: {unified_doc.document_id}")
print(f"Pages: {len(unified_doc.pages)}")
table_count = 0
image_count = 0
merged_cells = 0
total_cells = 0
for page in unified_doc.pages:
for elem in page.elements:
if elem.type.value == 'table':
table_count += 1
if elem.content and hasattr(elem.content, 'cells'):
total_cells += len(elem.content.cells)
for cell in elem.content.cells:
if cell.row_span > 1 or cell.col_span > 1:
merged_cells += 1
elif elem.type.value == 'image':
image_count += 1
print(f"Tables: {table_count}")
print(f" - Total cells: {total_cells}")
print(f" - Merged cells: {merged_cells}")
print(f"Images: {image_count}")
# Step 2: Export to JSON
exporter = UnifiedDocumentExporter()
json_path = output_dir / f"{input_path.stem}_result.json"
exporter.export_to_json(unified_doc, json_path)
print(f"\nJSON saved: {json_path}")
# Step 3: Generate layout PDF
pdf_generator = PDFGeneratorService()
pdf_path = output_dir / f"{input_path.stem}_layout.pdf"
try:
pdf_generator.generate_from_unified_document(
unified_doc=unified_doc,
output_path=pdf_path,
source_file_path=input_path
)
print(f"PDF saved: {pdf_path}")
return pdf_path
except Exception as e:
print(f"PDF generation error: {e}")
import traceback
traceback.print_exc()
return None
if __name__ == "__main__":
# Test with edit3.pdf (has complex tables with merging)
demo_docs = Path(__file__).parent.parent.parent / "demo_docs"
output_base = Path(__file__).parent.parent / "storage" / "test_phase1"
# Process edit3.pdf
edit3_pdf = demo_docs / "edit3.pdf"
if edit3_pdf.exists():
output_dir = output_base / "edit3"
result = generate_test_pdf(str(edit3_pdf), output_dir)
if result:
print(f"\n✓ Test PDF generated: {result}")
# Also process edit.pdf for comparison
edit_pdf = demo_docs / "edit.pdf"
if edit_pdf.exists():
output_dir = output_base / "edit"
result = generate_test_pdf(str(edit_pdf), output_dir)
if result:
print(f"\n✓ Test PDF generated: {result}")
print(f"\n=== Output Location ===")
print(f"{output_base}")


@@ -0,0 +1,285 @@
"""
Phase 1 Bug Fixes Verification Tests
Tests for:
1.1 Direct Track table cell merging
1.2 OCR Track image path preservation
1.3 Cell boxes coordinate validation
1.4 Tiny decoration image filtering
1.5 Covering image removal
"""
import sys
import os
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent.parent))
import fitz
from app.services.direct_extraction_engine import DirectExtractionEngine
from app.services.ocr_to_unified_converter import validate_cell_boxes
from app.models.unified_document import TableCell
def test_1_1_table_cell_merging():
"""Test 1.1.5: Verify edit3.pdf returns correct merged cells"""
print("\n" + "="*60)
print("TEST 1.1: Direct Track Table Cell Merging")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return False
doc = fitz.open(str(pdf_path))
total_cells = 0
merged_cells = 0
for page_num, page in enumerate(doc):
tables = page.find_tables()
for table_idx, table in enumerate(tables.tables):
data = table.extract()
cell_rects = getattr(table, 'cells', None)
if cell_rects:
num_rows = len(data)
num_cols = max(len(row) for row in data) if data else 0
# Count actual cells (non-None)
actual_cells = sum(1 for c in cell_rects if c is not None)
none_cells = sum(1 for c in cell_rects if c is None)
print(f" Page {page_num}, Table {table_idx}:")
print(f" Grid size: {num_rows} x {num_cols} = {num_rows * num_cols} positions")
print(f" Actual cells: {actual_cells}")
print(f" Merged positions (None): {none_cells}")
total_cells += actual_cells
if none_cells > 0:
merged_cells += 1
doc.close()
print(f"\n Total actual cells across all tables: {total_cells}")
print(f" Tables with merging: {merged_cells}")
# According to PLAN.md, edit3.pdf should have 83 cells (not 204)
# The presence of None values indicates merging is detected
if total_cells > 0 and total_cells < 204:
print(" RESULT: PASS - Cell merging detected correctly")
return True
elif total_cells == 204:
print(" RESULT: FAIL - All cells treated as 1x1 (no merging detected)")
return False
else:
print(f" RESULT: INCONCLUSIVE - {total_cells} cells found")
return None
def test_1_3_cell_boxes_validation():
"""Test 1.3: Verify cell_boxes coordinate validation"""
print("\n" + "="*60)
print("TEST 1.3: Cell Boxes Coordinate Validation")
print("="*60)
# Test case 1: Valid coordinates
valid_boxes = [
[10, 10, 100, 50],
[100, 10, 200, 50],
[10, 50, 200, 100]
]
result = validate_cell_boxes(valid_boxes, [0, 0, 300, 200], 300, 200)
print(f" Valid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
assert result['valid'], "Valid boxes should pass validation"
# Test case 2: Out of bounds coordinates
invalid_boxes = [
[-10, 10, 100, 50], # x0 < 0
[10, 10, 400, 50], # x1 > page_width
[10, 10, 100, 300] # y1 > page_height
]
result = validate_cell_boxes(invalid_boxes, [0, 0, 300, 200], 300, 200)
print(f" Invalid boxes: valid={result['valid']}, invalid_count={result['invalid_count']}")
assert not result['valid'], "Invalid boxes should fail validation"
assert result['invalid_count'] == 3, "Should detect 3 invalid boxes"
# Test case 3: Clamping
assert len(result['clamped_boxes']) == 3, "Should return clamped boxes"
clamped = result['clamped_boxes'][0]
assert clamped[0] >= 0, "Clamped x0 should be >= 0"
print(" RESULT: PASS - Coordinate validation works correctly")
return True
def test_1_4_tiny_image_filtering():
"""Test 1.4: Verify tiny decoration image filtering"""
print("\n" + "="*60)
print("TEST 1.4: Tiny Decoration Image Filtering")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return None
doc = fitz.open(str(pdf_path))
tiny_count = 0
normal_count = 0
min_area = 200 # Same threshold as in DirectExtractionEngine
for page_num, page in enumerate(doc):
images = page.get_images()
for img in images:
xref = img[0]
rects = page.get_image_rects(xref)
if rects:
rect = rects[0]
area = (rect.x1 - rect.x0) * (rect.y1 - rect.y0)
if area < min_area:
tiny_count += 1
print(f" Page {page_num}: Tiny image xref={xref}, area={area:.1f} px²")
else:
normal_count += 1
doc.close()
print(f"\n Tiny images (< {min_area} px²): {tiny_count}")
print(f" Normal images: {normal_count}")
if tiny_count > 0:
print(" RESULT: PASS - Tiny images detected, will be filtered")
return True
else:
print(" RESULT: INFO - No tiny images found in test file")
return None
def test_1_5_covering_image_detection():
"""Test 1.5: Verify covering image detection"""
print("\n" + "="*60)
print("TEST 1.5: Covering Image Detection")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return None
engine = DirectExtractionEngine(
enable_whiteout_detection=True,
whiteout_iou_threshold=0.8
)
doc = fitz.open(str(pdf_path))
total_covering = 0
for page_num, page in enumerate(doc):
result = engine._preprocess_page(page, page_num, doc)
covering_images = result.get('covering_images', [])
if covering_images:
print(f" Page {page_num}: {len(covering_images)} covering images detected")
for img in covering_images[:3]: # Show first 3
print(f" - xref={img.get('xref')}, type={img.get('color_type')}, "
f"bbox={[round(x, 1) for x in img.get('bbox', [])]}")
total_covering += len(covering_images)
doc.close()
print(f"\n Total covering images detected: {total_covering}")
if total_covering > 0:
print(" RESULT: PASS - Covering images detected, will be filtered")
return True
else:
print(" RESULT: INFO - No covering images found in test file")
return None
def test_direct_extraction_full():
"""Full integration test for Direct Track extraction"""
print("\n" + "="*60)
print("INTEGRATION TEST: Direct Track Full Extraction")
print("="*60)
pdf_path = Path(__file__).parent.parent.parent / "demo_docs" / "edit3.pdf"
if not pdf_path.exists():
print(f"SKIP: {pdf_path} not found")
return None
engine = DirectExtractionEngine(
enable_table_detection=True,
enable_image_extraction=True,
min_image_area=200.0,
enable_whiteout_detection=True
)
try:
result = engine.extract(pdf_path) # Pass Path object, not string
# Count elements
table_count = 0
image_count = 0
merged_table_count = 0
for page in result.pages:
for elem in page.elements:
if elem.type.value == 'table':
table_count += 1
if elem.content and hasattr(elem.content, 'cells'):
# Check for merged cells
for cell in elem.content.cells:
if cell.row_span > 1 or cell.col_span > 1:
merged_table_count += 1
break
elif elem.type.value == 'image':
image_count += 1
print(f" Document ID: {result.document_id}")
print(f" Pages: {len(result.pages)}")
print(f" Tables: {table_count} (with merging: {merged_table_count})")
print(f" Images: {image_count}")
print(" RESULT: PASS - Extraction completed successfully")
return True
except Exception as e:
print(f" RESULT: FAIL - {e}")
import traceback
traceback.print_exc()
return False
if __name__ == "__main__":
print("="*60)
print("Phase 1 Bug Fixes Verification Tests")
print("="*60)
results = {}
# Run tests
results['1.1_table_merging'] = test_1_1_table_cell_merging()
results['1.3_coord_validation'] = test_1_3_cell_boxes_validation()
results['1.4_tiny_filtering'] = test_1_4_tiny_image_filtering()
results['1.5_covering_detection'] = test_1_5_covering_image_detection()
results['integration'] = test_direct_extraction_full()
# Summary
print("\n" + "="*60)
print("TEST SUMMARY")
print("="*60)
for test_name, result in results.items():
status = "PASS" if result is True else "FAIL" if result is False else "SKIP/INFO"
print(f" {test_name}: {status}")
passed = sum(1 for r in results.values() if r is True)
failed = sum(1 for r in results.values() if r is False)
skipped = sum(1 for r in results.values() if r is None)
print(f"\n Total: {passed} passed, {failed} failed, {skipped} skipped/info")