chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions
--- a/backend/app/services/pdf_generator_service.py
+++ b/backend/app/services/pdf_generator_service.py
@@ -26,6 +26,23 @@ from html.parser import HTMLParser

 from app.core.config import settings

+# Import table column corrector for column alignment fix
+try:
+    from app.services.table_column_corrector import TableColumnCorrector
+    TABLE_COLUMN_CORRECTOR_AVAILABLE = True
+except ImportError:
+    TABLE_COLUMN_CORRECTOR_AVAILABLE = False
+    TableColumnCorrector = None
+
+# Import text region renderer for simple text positioning
+try:
+    from app.services.text_region_renderer import TextRegionRenderer, load_raw_ocr_regions
+    TEXT_REGION_RENDERER_AVAILABLE = True
+except ImportError:
+    TEXT_REGION_RENDERER_AVAILABLE = False
+    TextRegionRenderer = None
+    load_raw_ocr_regions = None
+
 # Import UnifiedDocument for dual-track support
 try:
    from app.models.unified_document import (
@@ -596,7 +613,8 @@ class PDFGeneratorService:
                        'content': html_content,
                        'bbox': [element.bbox.x0, element.bbox.y0,
                                element.bbox.x1, element.bbox.y1],
-                        'page': page_num - 1  # layout uses 0-based
+                        'page': page_num - 1,  # layout uses 0-based
+                        'element_id': element.element_id  # For _use_border_only matching
                    }

                    # Preserve cell_boxes and embedded_images from metadata
@@ -607,18 +625,29 @@ class PDFGeneratorService:
                            table_element['cell_boxes_source'] = element.metadata.get('cell_boxes_source', 'metadata')
                        if 'embedded_images' in element.metadata:
                            table_element['embedded_images'] = element.metadata['embedded_images']
+                        # Pass through rebuild flag - rebuilt tables should use HTML content
+                        if element.metadata.get('was_rebuilt'):
+                            table_element['was_rebuilt'] = True
+                            logger.debug(f"Table {element.element_id}: marked as rebuilt")

                    layout_elements.append(table_element)

                    # Add bbox to images_metadata for text overlap filtering
                    # (no actual image file, just bbox for filtering)
-                    images_metadata.append({
+                    img_metadata = {
                        'image_path': None,  # No fake table image
                        'bbox': bbox_polygon,
                        'page': page_num - 1,  # 0-based for images_metadata
                        'type': 'table',
                        'element_id': element.element_id
-                    })
+                    }
+                    # Also copy cell_boxes for quality checking
+                    if element.metadata and 'cell_boxes' in element.metadata:
+                        img_metadata['cell_boxes'] = element.metadata['cell_boxes']
+                    # Mark if table was rebuilt
+                    if element.metadata and element.metadata.get('was_rebuilt'):
+                        img_metadata['was_rebuilt'] = True
+                    images_metadata.append(img_metadata)

                # Handle image/visual elements (including stamps/seals)
                elif element.is_visual or element.type in [
@@ -1022,15 +1051,25 @@ class PDFGeneratorService:
            # Set current track
            self.current_processing_track = 'ocr'

-            # Convert UnifiedDocument to OCR data format (legacy)
-            ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
+            # Check if simple text positioning mode is enabled
+            if (settings.simple_text_positioning_enabled and
+                TEXT_REGION_RENDERER_AVAILABLE):
+                logger.info("Using simple text positioning mode")
+                result = self._generate_simple_text_pdf(
+                    unified_doc=unified_doc,
+                    output_path=output_path,
+                    source_file_path=source_file_path
+                )
+            else:
+                # Convert UnifiedDocument to OCR data format (legacy)
+                ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)

-            # Use existing generation pipeline
-            result = self._generate_pdf_from_data(
-                ocr_data=ocr_data,
-                output_path=output_path,
-                source_file_path=source_file_path
-            )
+                # Use existing generation pipeline
+                result = self._generate_pdf_from_data(
+                    ocr_data=ocr_data,
+                    output_path=output_path,
+                    source_file_path=source_file_path
+                )

            # Reset track
            self.current_processing_track = None
@@ -1043,6 +1082,235 @@ class PDFGeneratorService:
            self.current_processing_track = None
            return False

+    def _generate_simple_text_pdf(
+        self,
+        unified_doc: 'UnifiedDocument',
+        output_path: Path,
+        source_file_path: Optional[Path] = None
+    ) -> bool:
+        """
+        Generate a PDF by placing raw OCR text regions directly on the page.
+
+        Each page is drawn in two layers:
+
+        1. Image-type elements (figure, image, chart, seal, formula) are drawn
+           from their saved image files. Every such element's bbox is recorded
+           as an exclusion zone, even when its image file cannot be found.
+        2. Raw OCR regions are loaded with ``load_raw_ocr_regions`` and handed
+           to ``TextRegionRenderer.render_all_regions`` together with the
+           exclusion zones, using scale factors of 1.0.
+
+        If no task id can be derived from ``output_path`` the legacy pipeline
+        (``convert_unified_document_to_ocr_data`` + ``_generate_pdf_from_data``)
+        is used instead.
+
+        Args:
+            unified_doc: UnifiedDocument from OCR processing
+            output_path: Path to save generated PDF
+            source_file_path: Optional path to original source file (only
+                forwarded to the legacy fallback)
+
+        Returns:
+            True if the PDF was written; False if the document has no pages or
+            any exception was raised. When the legacy fallback is taken, its
+            return value is passed through.
+        """
+        def _field(obj, name, default):
+            # Elements, bboxes and content may be objects or plain dicts.
+            return getattr(obj, name) if hasattr(obj, name) else obj.get(name, default)
+
+        try:
+            logger.info("=== Simple Text Positioning PDF Generation ===")
+
+            # Initialize text region renderer
+            text_renderer = TextRegionRenderer(
+                font_name=self.font_name,
+                debug=settings.simple_text_positioning_debug
+            )
+
+            # Image files and raw OCR regions are looked up next to the output
+            result_dir = output_path.parent
+
+            # Output path is typically: result_dir/task_id_edited.pdf
+            # Strip only the trailing suffix: str.replace() would also remove
+            # any '_edited' occurring inside the task id itself.
+            edited_suffix = '_edited'
+            stem = output_path.stem
+            if stem.endswith(edited_suffix):
+                task_id = stem[:-len(edited_suffix)]
+            else:
+                # result_dir is typically the task_id directory
+                task_id = result_dir.name or None
+
+            if not task_id:
+                logger.warning("Could not determine task_id, falling back to legacy method")
+                ocr_data = self.convert_unified_document_to_ocr_data(unified_doc)
+                return self._generate_pdf_from_data(
+                    ocr_data=ocr_data,
+                    output_path=output_path,
+                    source_file_path=source_file_path
+                )
+
+            logger.info(f"Task ID: {task_id}, Result dir: {result_dir}")
+
+            pages = unified_doc.pages
+            if not pages:
+                logger.error("No pages in document")
+                return False
+            total_pages = len(pages)
+
+            # Get page dimensions from first page (for canvas initialization)
+            first_dims = getattr(pages[0], 'dimensions', None)
+            if first_dims:
+                page_width = float(first_dims.width)
+                page_height = float(first_dims.height)
+            else:
+                # Fallback to default size
+                page_width = 612.0  # Letter width
+                page_height = 792.0  # Letter height
+                logger.warning(f"No page dimensions found, using default {page_width}x{page_height}")
+
+            logger.info(f"Initial page size: {page_width:.1f} x {page_height:.1f}")
+
+            # Create PDF canvas
+            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(page_width, page_height))
+
+            # Types that should be rendered as images
+            image_element_types = {'figure', 'image', 'chart', 'seal', 'formula'}
+
+            # Process each page
+            for page_num, current_page in enumerate(pages, start=1):
+                logger.info(f">>> Processing page {page_num}/{total_pages}")
+
+                # Per-page dimensions, falling back to the first page's size
+                page_dims = getattr(current_page, 'dimensions', None)
+                if page_dims:
+                    current_width = float(page_dims.width)
+                    current_height = float(page_dims.height)
+                else:
+                    current_width = page_width
+                    current_height = page_height
+
+                if page_num > 1:
+                    pdf_canvas.showPage()
+
+                # Set page size
+                pdf_canvas.setPageSize((current_width, current_height))
+
+                # === Layer 1: Render images, charts, figures, seals, formulas ===
+                # Also collect exclusion zones for text avoidance
+                exclusion_zones = []  # List of (x0, y0, x1, y1) tuples
+                image_elements_rendered = 0
+
+                # A page whose elements attribute is missing or None has nothing to draw
+                page_elements = getattr(current_page, 'elements', None) or []
+
+                for elem in page_elements:
+                    elem_type = _field(elem, 'type', '')
+                    # Handle enum type
+                    if hasattr(elem_type, 'value'):
+                        elem_type = elem_type.value
+
+                    if elem_type not in image_element_types:
+                        continue
+
+                    # Get image path from element content
+                    content = _field(elem, 'content', {})
+                    if isinstance(content, dict):
+                        saved_path = content.get('saved_path') or content.get('path')
+                    else:
+                        saved_path = None
+
+                    # Get bbox for exclusion zone (even if image file not found)
+                    bbox = _field(elem, 'bbox', {})
+                    if hasattr(bbox, 'x0'):
+                        x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
+                    elif isinstance(bbox, dict):
+                        x0 = bbox.get('x0', 0)
+                        y0 = bbox.get('y0', 0)
+                        x1 = bbox.get('x1', x0 + bbox.get('width', 0))
+                        y1 = bbox.get('y1', y0 + bbox.get('height', 0))
+                    else:
+                        # No usable bbox: cannot place the image or exclude text
+                        continue
+
+                    # Use original image coordinates (not PDF flipped)
+                    exclusion_zones.append((x0, y0, x1, y1))
+
+                    if not saved_path:
+                        continue
+
+                    # Look for the file as given, under imgs/, then by bare filename
+                    candidates = (
+                        result_dir / saved_path,
+                        result_dir / 'imgs' / saved_path,
+                        result_dir / Path(saved_path).name,
+                    )
+                    image_path = next((p for p in candidates if p.exists()), None)
+                    if image_path is None:
+                        logger.warning(f"Image file not found: {saved_path}")
+                        continue
+
+                    try:
+                        # Convert coordinates (flip Y for PDF)
+                        pdf_x = x0
+                        pdf_y = current_height - y1  # Bottom of image in PDF coords
+                        img_width = x1 - x0
+                        img_height = y1 - y0
+
+                        # Draw image
+                        pdf_canvas.drawImage(
+                            str(image_path),
+                            pdf_x, pdf_y,
+                            width=img_width,
+                            height=img_height,
+                            preserveAspectRatio=True,
+                            mask='auto'
+                        )
+                        image_elements_rendered += 1
+                        logger.debug(f"Rendered {elem_type}: {saved_path} at ({pdf_x:.1f}, {pdf_y:.1f})")
+                    except Exception as e:
+                        # Best effort: one unreadable image must not abort the page
+                        logger.warning(f"Failed to render {elem_type} {saved_path}: {e}")
+
+                if image_elements_rendered > 0:
+                    logger.info(f"Rendered {image_elements_rendered} image elements (figures/charts/seals/formulas)")
+
+                if exclusion_zones:
+                    logger.info(f"Collected {len(exclusion_zones)} exclusion zones for text avoidance")
+
+                # === Layer 2: Render text from raw OCR regions ===
+                raw_regions = load_raw_ocr_regions(str(result_dir), task_id, page_num)
+
+                if not raw_regions:
+                    logger.warning(f"No raw OCR regions found for page {page_num}")
+                else:
+                    logger.info(f"Loaded {len(raw_regions)} raw OCR regions for page {page_num}")
+
+                    # Collect texts inside exclusion zones for position-aware deduplication
+                    # This prevents duplicate axis labels from being rendered near charts
+                    zone_texts = None
+                    if exclusion_zones:
+                        zone_texts = text_renderer.collect_zone_texts(
+                            raw_regions, exclusion_zones, threshold=0.5, include_axis_labels=True
+                        )
+                        if zone_texts:
+                            logger.info(f"Collected {len(zone_texts)} zone texts for deduplication: {list(zone_texts)[:10]}...")
+
+                    # Render all text regions, avoiding exclusion zones (images/charts)
+                    # Scale factors are 1.0 since OCR dimensions match page dimensions
+                    rendered = text_renderer.render_all_regions(
+                        pdf_canvas=pdf_canvas,
+                        regions=raw_regions,
+                        page_height=current_height,
+                        scale_x=1.0,
+                        scale_y=1.0,
+                        exclusion_zones=exclusion_zones,
+                        zone_texts=zone_texts
+                    )
+
+                    logger.info(f"Rendered {rendered} text regions")
+
+                logger.info(f"<<< Page {page_num} complete")
+
+            # Save PDF
+            pdf_canvas.save()
+
+            file_size = output_path.stat().st_size
+            logger.info(f"Generated PDF: {output_path.name} ({file_size} bytes)")
+            return True
+
+        except Exception as e:
+            # exc_info routes the traceback through logging instead of stderr
+            logger.error(f"Failed to generate simple text PDF: {e}", exc_info=True)
+            return False
+
    def _generate_pdf_from_data(
        self,
        ocr_data: Dict,
@@ -1093,8 +1361,15 @@ class PDFGeneratorService:
                logger.info("No page_dimensions found, using first page size for all pages")

            # Step 3: Get original file dimensions for all pages
+            # For OCR track, we use OCR coordinate system dimensions directly to avoid scaling issues
            original_page_sizes = {}
-            if source_file_path:
+            use_ocr_dimensions_for_pdf = (self.current_processing_track == 'ocr')
+
+            if use_ocr_dimensions_for_pdf:
+                # OCR Track: Use OCR coordinate system dimensions directly
+                # This ensures no scaling is needed (scale = 1.0)
+                logger.info(f"OCR Track: 使用 OCR 座標系尺寸作為 PDF 頁面尺寸（避免縮放）")
+            elif source_file_path:
                original_page_sizes = self.get_all_page_sizes(source_file_path)
                if original_page_sizes:
                    logger.info(f"從原始文件獲取到 {len(original_page_sizes)} 頁尺寸")
@@ -1104,8 +1379,12 @@ class PDFGeneratorService:
                logger.info(f"無原始文件，將使用 OCR/UnifiedDocument 尺寸")

            # Determine initial canvas size (will be updated per page)
-            # Priority: original file first page > OCR/UnifiedDocument first page
-            if 0 in original_page_sizes:
+            # Priority for OCR track: OCR dimensions (no scaling)
+            # Priority for Direct track: original file first page > OCR/UnifiedDocument first page
+            if use_ocr_dimensions_for_pdf:
+                target_width, target_height = ocr_width, ocr_height
+                logger.info(f"初始 PDF 尺寸（OCR Track, 使用 OCR 座標系）: {target_width:.1f} x {target_height:.1f}")
+            elif 0 in original_page_sizes:
                target_width, target_height = original_page_sizes[0]
                logger.info(f"初始 PDF 尺寸（來自原始文件首頁）: {target_width:.1f} x {target_height:.1f}")
            else:
@@ -1159,14 +1438,49 @@ class PDFGeneratorService:
            # Create PDF canvas with initial page size (will be updated per page)
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(target_width, target_height))

-            # LAYERED RENDERING: Exclude tables from regions_to_avoid
-            # Text inside tables will be rendered at raw OCR positions (via GapFillingService)
-            # while table borders are drawn separately using cell_boxes
-            # Only avoid overlap with actual images/figures/charts
-            regions_to_avoid = [img for img in images_metadata if img.get('type') != 'table']
-            table_count = len([img for img in images_metadata if img.get('type') == 'table'])
+            # Smart filtering: only include tables with good cell_boxes quality in regions_to_avoid
+            # Tables with bad cell_boxes will use raw OCR text positioning instead
+            # Exception: Rebuilt tables always use HTML content and filter text
+            regions_to_avoid = []
+            good_quality_tables = []
+            bad_quality_tables = []
+            rebuilt_tables = []

-            logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免 (不含表格), {table_count} 個表格使用分層渲染")
+            for img in images_metadata:
+                if img.get('type') == 'table':
+                    elem_id = img.get('element_id', 'unknown')
+
+                    # Check if this table was rebuilt - rebuilt tables have good content
+                    was_rebuilt = img.get('was_rebuilt', False)
+
+                    if was_rebuilt:
+                        # Rebuilt tables have accurate content - filter text, use HTML
+                        regions_to_avoid.append(img)
+                        rebuilt_tables.append(elem_id)
+                    else:
+                        # Check cell_boxes quality for non-rebuilt tables
+                        cell_boxes = img.get('cell_boxes', [])
+                        quality = self._check_cell_boxes_quality(cell_boxes, elem_id)
+
+                        if quality == 'good':
+                            # Good quality: filter text, render with cell_boxes
+                            regions_to_avoid.append(img)
+                            good_quality_tables.append(elem_id)
+                        else:
+                            # Bad quality: don't filter text, just draw border
+                            bad_quality_tables.append(elem_id)
+                            img['_use_border_only'] = True  # Mark for border-only rendering
+                else:
+                    # Non-table elements (images, figures, charts) always avoid
+                    regions_to_avoid.append(img)
+
+            logger.info(f"過濾文字區域: {len(regions_to_avoid)} 個區域需要避免")
+            if rebuilt_tables:
+                logger.info(f"  重建表格用 HTML: {rebuilt_tables}")
+            if good_quality_tables:
+                logger.info(f"  表格用 cell_boxes: {good_quality_tables}")
+            if bad_quality_tables:
+                logger.info(f"  表格用 raw OCR text (border only): {bad_quality_tables}")

            filtered_text_regions = self._filter_text_in_regions(text_regions, regions_to_avoid)

@@ -1178,10 +1492,24 @@ class PDFGeneratorService:
                    pages_data[page_num] = []
                pages_data[page_num].append(region)

-            # Get table elements from layout_data
+            # Get table elements from layout_data and copy _use_border_only flags
            table_elements = []
            if layout_data and layout_data.get('elements'):
-                table_elements = [e for e in layout_data['elements'] if e.get('type') == 'table']
+                # Create a lookup for _use_border_only flags from images_metadata
+                border_only_tables = {img.get('element_id') for img in images_metadata
+                                      if img.get('type') == 'table' and img.get('_use_border_only')}
+
+                logger.debug(f"[DEBUG] border_only_tables from images_metadata: {border_only_tables}")
+
+                for e in layout_data['elements']:
+                    if e.get('type') == 'table':
+                        elem_id = e.get('element_id')
+                        logger.debug(f"[DEBUG] layout_data table element_id: {elem_id}")
+                        # Copy the flag if this table should use border only
+                        if elem_id in border_only_tables:
+                            e['_use_border_only'] = True
+                            logger.info(f"[DEBUG] Set _use_border_only=True for table {elem_id}")
+                        table_elements.append(e)

            # Process each page
            total_pages = ocr_data.get('total_pages', 1)
@@ -1195,14 +1523,23 @@ class PDFGeneratorService:
                logger.info(f">>> 處理第 {page_num}/{total_pages} 頁")

                # Get current page dimensions with priority order:
-                # 1. Original file dimensions (highest priority)
-                # 2. OCR/UnifiedDocument dimensions
-                # 3. Fallback to first page dimensions
+                # For OCR Track: always use OCR dimensions (scale = 1.0)
+                # For Direct Track:
+                #   1. Original file dimensions (highest priority)
+                #   2. OCR/UnifiedDocument dimensions
+                #   3. Fallback to first page dimensions
                page_idx = page_num - 1
                dimension_source = "unknown"

-                # Priority 1: Original file dimensions
-                if page_idx in original_page_sizes:
+                # For OCR Track: always use OCR dimensions
+                if use_ocr_dimensions_for_pdf and page_idx in page_dimensions:
+                    current_page_dims = page_dimensions[page_idx]
+                    current_target_w = float(current_page_dims['width'])
+                    current_target_h = float(current_page_dims['height'])
+                    dimension_source = "ocr_track_direct"
+
+                # Priority 1: Original file dimensions (Direct Track only)
+                elif page_idx in original_page_sizes:
                    current_target_w, current_target_h = original_page_sizes[page_idx]
                    dimension_source = "original_file"

@@ -1774,12 +2111,26 @@ class PDFGeneratorService:
            non_empty_lines = [l for l in lines if l.strip()]
            num_lines = max(len(non_empty_lines), 1)

-            # Font size = bbox_height / num_lines * factor
+            # Font size calculation with stabilization
            # Use 0.8 factor to leave room for line spacing
-            font_size = (bbox_height / num_lines) * 0.8
-            font_size = max(min(font_size, 72), 4)  # Clamp between 4pt and 72pt
+            raw_font_size = (bbox_height / num_lines) * 0.8

-            logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, calculated font_size={font_size:.1f}")
+            # Stabilize font size for body text (most common case)
+            # Normal body text should be 9-11pt, only deviate for clear outliers
+            element_type = region.get('element_type', 'text')
+            if element_type in ('text', 'paragraph'):
+                # For body text, bias toward 10pt baseline
+                if 7 <= raw_font_size <= 14:
+                    # Near-normal range: use weighted average toward 10pt
+                    font_size = raw_font_size * 0.7 + 10 * 0.3
+                else:
+                    # Clear outlier: use raw but clamp more aggressively
+                    font_size = max(min(raw_font_size, 14), 7)
+            else:
+                # For titles/headers/etc, use raw calculation with wider range
+                font_size = max(min(raw_font_size, 72), 4)
+
+            logger.debug(f"Text has {num_lines} non-empty lines, bbox_height={bbox_height:.1f}, raw={raw_font_size:.1f}, final={font_size:.1f}")

            # Transform coordinates: OCR (top-left origin) → PDF (bottom-left origin)
            # CRITICAL: Y-axis flip!
@@ -2008,24 +2359,45 @@ class PDFGeneratorService:
            result_dir: Directory containing result files (for embedded images)
        """
        try:
+            elem_id = table_element.get('element_id', 'unknown')
+            use_border_only = table_element.get('_use_border_only', False)
+            logger.info(f"[DEBUG] draw_table_region: elem_id={elem_id}, _use_border_only={use_border_only}")
+
            html_content = table_element.get('content', '')
            if not html_content:
+                # Even without HTML, draw border if requested
+                if use_border_only:
+                    self._draw_table_border_only(pdf_canvas, table_element, page_height, scale_w, scale_h)
                return

-            # Try to use cell_boxes for direct rendering first (more accurate)
+            # Apply column correction if enabled
            cell_boxes = table_element.get('cell_boxes', [])
-            if cell_boxes:
-                logger.info(f"[TABLE] Using cell_boxes direct rendering ({len(cell_boxes)} cells)")
-                success = self._draw_table_with_cell_boxes(
-                    pdf_canvas, table_element, page_height,
-                    scale_w, scale_h, result_dir
-                )
-                if success:
-                    return  # Successfully rendered with cell_boxes
+            if (settings.table_column_correction_enabled and
+                TABLE_COLUMN_CORRECTOR_AVAILABLE and
+                cell_boxes):
+                try:
+                    corrector = TableColumnCorrector(
+                        correction_threshold=settings.table_column_correction_threshold,
+                        vertical_merge_enabled=settings.vertical_fragment_merge_enabled,
+                        vertical_aspect_ratio=settings.vertical_fragment_aspect_ratio
+                    )
+                    # Get table bbox for vertical fragment detection
+                    table_bbox = table_element.get('bbox', [])
+                    if isinstance(table_bbox, dict):
+                        table_bbox = [table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']]

-                logger.info("[TABLE] Falling back to ReportLab Table")
+                    corrected_html, stats = corrector.correct(
+                        html=html_content,
+                        cell_boxes=cell_boxes,
+                        table_bbox=table_bbox if isinstance(table_bbox, list) and len(table_bbox) >= 4 else None
+                    )
+                    if stats.get('column_corrections', 0) > 0:
+                        logger.info(f"[TABLE] {elem_id}: Column correction applied - {stats}")
+                        html_content = corrected_html
+                except Exception as e:
+                    logger.warning(f"[TABLE] {elem_id}: Column correction failed: {e}, using original HTML")

-            # Fallback: Parse HTML to extract table structure and use ReportLab Table
+            # Parse HTML first to get table structure for grid validation
            parser = HTMLTableParser()
            parser.feed(html_content)

@@ -2040,6 +2412,83 @@ class PDFGeneratorService:
            if not rows:
                return

+            # Calculate number of rows and columns from HTML for grid validation
+            num_rows = len(rows)
+            max_cols = 0
+            for row in rows:
+                row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
+                max_cols = max(max_cols, row_cols)
+
+            # Check if table was rebuilt - if so, use HTML content directly
+            was_rebuilt = table_element.get('was_rebuilt', False)
+            cell_boxes_rendered = False  # Track if we rendered borders with cell_boxes
+
+            if was_rebuilt:
+                logger.info(f"[TABLE] {elem_id}: Table was rebuilt, using HTML content directly")
+            elif use_border_only:
+                # Bad quality cell_boxes: skip cell_boxes rendering, use ReportLab Table with borders
+                logger.info(f"[TABLE] {elem_id}: Bad cell_boxes quality, using ReportLab Table with borders")
+            else:
+                # Check if cell_boxes can produce a valid grid before rendering borders
+                cell_boxes = table_element.get('cell_boxes', [])
+                if cell_boxes:
+                    # Get table bbox for grid calculation
+                    temp_bbox = table_element.get('bbox', [])
+                    if isinstance(temp_bbox, dict):
+                        raw_bbox = [temp_bbox['x0'], temp_bbox['y0'], temp_bbox['x1'], temp_bbox['y1']]
+                    elif isinstance(temp_bbox, list) and len(temp_bbox) >= 4:
+                        if isinstance(temp_bbox[0], (int, float)):
+                            raw_bbox = temp_bbox[:4]
+                        else:
+                            raw_bbox = [temp_bbox[0][0], temp_bbox[0][1], temp_bbox[2][0], temp_bbox[2][1]]
+                    else:
+                        raw_bbox = None
+
+                    # Pre-check: can we compute a valid grid from cell_boxes?
+                    if raw_bbox:
+                        test_col_widths, test_row_heights = self._compute_table_grid_from_cell_boxes(
+                            cell_boxes, raw_bbox, num_rows, max_cols
+                        )
+                        grid_valid = test_col_widths is not None and test_row_heights is not None
+
+                        if grid_valid:
+                            logger.info(f"[TABLE] Grid validation passed, rendering borders with cell_boxes")
+                            success = self._draw_table_with_cell_boxes(
+                                pdf_canvas, table_element, page_height,
+                                scale_w, scale_h, result_dir
+                            )
+                            if success:
+                                cell_boxes_rendered = True
+                                logger.info("[TABLE] cell_boxes rendered borders, continuing with text-only ReportLab Table")
+                            else:
+                                logger.info("[TABLE] cell_boxes rendering failed, using ReportLab Table with borders")
+                        else:
+                            # Grid mismatch: try cellboxes-first rendering if enabled
+                            if settings.table_rendering_prefer_cellboxes:
+                                logger.info(f"[TABLE] Grid mismatch, trying cellboxes-first rendering")
+                                from app.services.pdf_table_renderer import TableRenderer, TableRenderConfig
+                                renderer = TableRenderer(TableRenderConfig())
+                                success = renderer.render_from_cellboxes_grid(
+                                    pdf_canvas,
+                                    cell_boxes,
+                                    html_content,
+                                    tuple(raw_bbox),
+                                    page_height,
+                                    scale_w,
+                                    scale_h,
+                                    row_threshold=settings.table_cellboxes_row_threshold,
+                                    col_threshold=settings.table_cellboxes_col_threshold
+                                )
+                                if success:
+                                    logger.info("[TABLE] cellboxes-first rendering succeeded, skipping HTML-based rendering")
+                                    return  # Table fully rendered, exit early
+                                else:
+                                    logger.info("[TABLE] cellboxes-first rendering failed, falling back to HTML-based")
+                            else:
+                                logger.info(f"[TABLE] Grid validation failed (mismatch), using ReportLab Table with borders")
+                    else:
+                        logger.info("[TABLE] No valid bbox for grid validation, using ReportLab Table with borders")
+
            # Get bbox directly from table element
            table_bbox = table_element.get('bbox')

@@ -2106,15 +2555,7 @@ class PDFGeneratorService:
            pdf_y = page_height - ocr_y_bottom

            # Build table data for ReportLab with proper colspan/rowspan handling
-            # First pass: determine the actual grid size by accounting for spans
-            num_rows = len(rows)
-
-            # Calculate actual number of columns by checking first row's total span
-            max_cols = 0
-            for row in rows:
-                row_cols = sum(cell.get('colspan', 1) for cell in row['cells'])
-                max_cols = max(max_cols, row_cols)
-
+            # num_rows and max_cols already calculated above for grid validation
            logger.info(f"[表格] {num_rows}行x{max_cols}列 → PDF位置: ({pdf_x:.1f}, {pdf_y:.1f}), 寬x高: {table_width:.0f}x{table_height:.0f}")

            # Create a grid to track occupied cells (for rowspan handling)
@@ -2223,16 +2664,25 @@ class PDFGeneratorService:
            logger.info(f"[TABLE] Created with {len(col_widths)} cols, {len(row_heights)} rows")

            # Apply table style
-            style = TableStyle([
+            # If cell_boxes rendered borders, skip GRID style (text-only rendering)
+            style_commands = [
                ('FONT', (0, 0), (-1, -1), self.font_name if self.font_registered else 'Helvetica', font_size),
-                ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('LEFTPADDING', (0, 0), (-1, -1), 2),
                ('RIGHTPADDING', (0, 0), (-1, -1), 2),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
-            ])
+            ]
+
+            # Only add GRID if cell_boxes didn't render borders
+            if not cell_boxes_rendered:
+                style_commands.insert(1, ('GRID', (0, 0), (-1, -1), 0.5, colors.black))
+                logger.info("[TABLE] Adding GRID style (cell_boxes not used)")
+            else:
+                logger.info("[TABLE] Skipping GRID style (cell_boxes rendered borders)")
+
+            style = TableStyle(style_commands)

            # Add header style if first row has headers
            if rows and rows[0]['cells'] and rows[0]['cells'][0].get('is_header'):
@@ -2435,6 +2885,106 @@ class PDFGeneratorService:
        logger.debug(f"[TABLE] Normalized {len(cell_boxes)} cell boxes to grid")
        return normalized_boxes

+    def _draw_table_border_only(
+        self,
+        pdf_canvas: canvas.Canvas,
+        table_element: Dict,
+        page_height: float,
+        scale_w: float = 1.0,
+        scale_h: float = 1.0
+    ):
+        """
+        Draw only the outer border of a table (for tables with bad cell_boxes quality).
+
+        Text inside the table will be rendered using raw OCR positions.
+        Nothing is drawn when the bbox is missing or in an unsupported format.
+
+        Args:
+            pdf_canvas: ReportLab canvas object
+            table_element: Table element dict; 'bbox' may be an x0/y0/x1/y1 dict,
+                a flat [x0, y0, x1, y1] sequence, or a 4-point polygon
+            page_height: Height of page in PDF coordinates
+            scale_w: Scale factor for X coordinates
+            scale_h: Scale factor for Y coordinates
+        """
+        table_bbox = table_element.get('bbox', [])
+        if not table_bbox or len(table_bbox) < 4:
+            return
+
+        element_id = table_element.get('element_id', 'unknown')
+
+        # Handle different bbox formats
+        if isinstance(table_bbox, dict):
+            x0, y0, x1, y1 = table_bbox['x0'], table_bbox['y0'], table_bbox['x1'], table_bbox['y1']
+        elif isinstance(table_bbox[0], (int, float)):
+            x0, y0, x1, y1 = table_bbox[:4]
+        elif all(isinstance(pt, (list, tuple)) and len(pt) >= 2 for pt in table_bbox[:4]):
+            # Polygon format: opposite corners are points 0 and 2
+            (x0, y0), (x1, y1) = table_bbox[0][:2], table_bbox[2][:2]
+        else:
+            logger.debug(f"[TABLE] {element_id}: Unsupported bbox format, border not drawn")
+            return
+
+        # Apply scaling
+        pdf_x0, pdf_x1 = x0 * scale_w, x1 * scale_w
+        pdf_y0, pdf_y1 = y0 * scale_h, y1 * scale_h
+
+        # PDF origin is bottom-left: anchor the rect at the flipped bottom edge (y1)
+        pdf_bottom = page_height - pdf_y1
+        # Draw outer border only
+        pdf_canvas.setStrokeColor(colors.black)
+        pdf_canvas.setLineWidth(0.5)
+        pdf_canvas.rect(pdf_x0, pdf_bottom, pdf_x1 - pdf_x0, pdf_y1 - pdf_y0, stroke=1, fill=0)
+
+        logger.info(f"[TABLE] {element_id}: Drew border only (bad cell_boxes quality)")
+
+    def _check_cell_boxes_quality(self, cell_boxes: List, element_id: str = "") -> str:
+        """
+        Classify cell_boxes as 'good' or 'bad' to pick a rendering strategy.
+
+        Args:
+            cell_boxes: List of [x0, y0, x1, y1] boxes; malformed entries are ignored
+            element_id: Optional element ID for logging
+
+        Returns:
+            'good' if the check is disabled or at most 20% of well-formed box
+            pairs overlap; 'bad' otherwise (including fewer than two usable boxes)
+        """
+        # If quality check is disabled, always return 'good' to use pure PP-Structure output
+        if not settings.table_quality_check_enabled:
+            logger.debug(f"[TABLE QUALITY] {element_id}: good - quality check disabled (pure PP-Structure mode)")
+            return 'good'
+
+        # Validate once up front so malformed entries cannot dilute the overlap ratio
+        valid_boxes = [
+            box for box in (cell_boxes or [])
+            if isinstance(box, (list, tuple)) and len(box) >= 4
+        ]
+        if len(valid_boxes) < 2:
+            logger.debug(f"[TABLE QUALITY] {element_id}: bad - too few cells ({len(valid_boxes)})")
+            return 'bad'  # No usable cell_boxes or too few
+
+        # Count overlapping pairs; boxes that merely share an edge do not overlap
+        overlap_count = 0
+        for i, box1 in enumerate(valid_boxes):
+            for box2 in valid_boxes[i + 1:]:
+                x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
+                y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
+                if x_overlap and y_overlap:
+                    overlap_count += 1
+
+        total_pairs = len(valid_boxes) * (len(valid_boxes) - 1) // 2
+        overlap_ratio = overlap_count / total_pairs
+
+        # Relaxed threshold: 20% overlap instead of 10% to allow more tables through
+        # This is because PP-StructureV3's cell detection sometimes has slight overlaps
+        if overlap_ratio > 0.20:
+            logger.info(f"[TABLE QUALITY] {element_id}: bad - overlap ratio {overlap_ratio:.2%} > 20%")
+            return 'bad'
+
+        logger.debug(f"[TABLE QUALITY] {element_id}: good - {len(valid_boxes)} cells, overlap {overlap_ratio:.2%}")
+        return 'good'
+
    def _draw_table_with_cell_boxes(
        self,
        pdf_canvas: canvas.Canvas,
@@ -2465,39 +3015,64 @@ class PDFGeneratorService:
        """
        try:
            cell_boxes = table_element.get('cell_boxes', [])
-
-            # Always draw outer table border first (fallback for incomplete cell_boxes)
            table_bbox = table_element.get('bbox', [])
-            if table_bbox and len(table_bbox) >= 4:
-                # Handle different bbox formats (list or dict)
-                if isinstance(table_bbox, dict):
-                    tx1 = float(table_bbox.get('x0', 0))
-                    ty1 = float(table_bbox.get('y0', 0))
-                    tx2 = float(table_bbox.get('x1', 0))
-                    ty2 = float(table_bbox.get('y1', 0))
-                else:
-                    tx1, ty1, tx2, ty2 = table_bbox[:4]

-                # Apply scaling
-                tx1_scaled = tx1 * scale_w
-                ty1_scaled = ty1 * scale_h
-                tx2_scaled = tx2 * scale_w
-                ty2_scaled = ty2 * scale_h
+            # Check cell_boxes quality - skip if they don't form a proper grid
+            if cell_boxes and len(cell_boxes) > 2:
+                # Count overlapping cell pairs
+                overlap_count = 0
+                for i, box1 in enumerate(cell_boxes):
+                    for j, box2 in enumerate(cell_boxes):
+                        if i >= j:
+                            continue
+                        x_overlap = box1[0] < box2[2] and box1[2] > box2[0]
+                        y_overlap = box1[1] < box2[3] and box1[3] > box2[1]
+                        if x_overlap and y_overlap:
+                            overlap_count += 1

-                table_width = tx2_scaled - tx1_scaled
-                table_height = ty2_scaled - ty1_scaled
+                # If more than 25% of cell pairs overlap, cell_boxes are unreliable
+                # Increased from 10% to 25% to allow more tables to use cell_boxes rendering
+                # which provides better visual fidelity than ReportLab Table fallback
+                total_pairs = len(cell_boxes) * (len(cell_boxes) - 1) // 2
+                overlap_ratio = overlap_count / total_pairs if total_pairs > 0 else 0

-                # Transform Y coordinate (PDF uses bottom-left origin)
-                pdf_x = tx1_scaled
-                pdf_y = page_height - ty2_scaled  # Bottom of table in PDF coords
-
-                # Draw outer table border (slightly thicker for visibility)
-                pdf_canvas.setStrokeColor(colors.black)
-                pdf_canvas.setLineWidth(1.0)
-                pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
-                logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
+                if overlap_ratio > 0.25:
+                    logger.warning(
+                        f"[TABLE] Skipping cell_boxes rendering: {overlap_count}/{total_pairs} "
+                        f"({overlap_ratio:.1%}) cell pairs overlap - using ReportLab Table fallback"
+                    )
+                    return False  # Return False to trigger ReportLab Table fallback

            if not cell_boxes:
+                # Fallback: draw outer border only when no cell_boxes
+                if table_bbox and len(table_bbox) >= 4:
+                    # Handle different bbox formats (list or dict)
+                    if isinstance(table_bbox, dict):
+                        tx1 = float(table_bbox.get('x0', 0))
+                        ty1 = float(table_bbox.get('y0', 0))
+                        tx2 = float(table_bbox.get('x1', 0))
+                        ty2 = float(table_bbox.get('y1', 0))
+                    else:
+                        tx1, ty1, tx2, ty2 = table_bbox[:4]
+
+                    # Apply scaling
+                    tx1_scaled = tx1 * scale_w
+                    ty1_scaled = ty1 * scale_h
+                    tx2_scaled = tx2 * scale_w
+                    ty2_scaled = ty2 * scale_h
+
+                    table_width = tx2_scaled - tx1_scaled
+                    table_height = ty2_scaled - ty1_scaled
+
+                    # Transform Y coordinate (PDF uses bottom-left origin)
+                    pdf_x = tx1_scaled
+                    pdf_y = page_height - ty2_scaled  # Bottom of table in PDF coords
+
+                    # Draw outer table border (slightly thicker for visibility)
+                    pdf_canvas.setStrokeColor(colors.black)
+                    pdf_canvas.setLineWidth(1.0)
+                    pdf_canvas.rect(pdf_x, pdf_y, table_width, table_height, stroke=1, fill=0)
+                    logger.info(f"[TABLE] Drew outer table border at [{int(tx1)},{int(ty1)},{int(tx2)},{int(ty2)}]")
                logger.warning("[TABLE] No cell_boxes available, only outer border drawn")
                # Still draw embedded images even without cell borders
                embedded_images = table_element.get('embedded_images', [])
@@ -2511,31 +3086,47 @@ class PDFGeneratorService:
            # Normalize cell boxes to create aligned grid
            cell_boxes = self._normalize_cell_boxes_to_grid(cell_boxes)

-            logger.info(f"[TABLE] Drawing {len(cell_boxes)} cell borders (layered mode, grid-aligned)")
+            logger.info(f"[TABLE] Drawing {len(cell_boxes)} cells using grid lines (avoiding duplicates)")
+
+            # Collect unique grid lines to avoid drawing duplicate/overlapping lines
+            h_lines = set()  # Horizontal lines: (y, x_start, x_end)
+            v_lines = set()  # Vertical lines: (x, y_start, y_end)

-            # Draw each cell border
            for box in cell_boxes:
                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]

                # Apply scaling
-                x1_scaled = x1 * scale_w
-                y1_scaled = y1 * scale_h
-                x2_scaled = x2 * scale_w
-                y2_scaled = y2 * scale_h
+                x1_s = x1 * scale_w
+                y1_s = y1 * scale_h
+                x2_s = x2 * scale_w
+                y2_s = y2 * scale_h

-                cell_width = x2_scaled - x1_scaled
-                cell_height = y2_scaled - y1_scaled
+                # Round to 1 decimal place to help with deduplication
+                x1_s, y1_s, x2_s, y2_s = round(x1_s, 1), round(y1_s, 1), round(x2_s, 1), round(y2_s, 1)

-                # Transform Y coordinate (PDF uses bottom-left origin)
-                pdf_x = x1_scaled
-                pdf_y = page_height - y2_scaled  # Bottom of cell in PDF coords
+                # Add horizontal lines (top and bottom of cell)
+                h_lines.add((y1_s, x1_s, x2_s))  # Top line
+                h_lines.add((y2_s, x1_s, x2_s))  # Bottom line

-                # Draw cell border only (no fill, no text)
-                pdf_canvas.setStrokeColor(colors.black)
-                pdf_canvas.setLineWidth(0.5)
-                pdf_canvas.rect(pdf_x, pdf_y, cell_width, cell_height, stroke=1, fill=0)
+                # Add vertical lines (left and right of cell)
+                v_lines.add((x1_s, y1_s, y2_s))  # Left line
+                v_lines.add((x2_s, y1_s, y2_s))  # Right line

-            logger.info(f"[TABLE] Drew {len(cell_boxes)} cell borders")
+            # Draw unique horizontal lines
+            pdf_canvas.setStrokeColor(colors.black)
+            pdf_canvas.setLineWidth(0.5)
+
+            for y, x_start, x_end in h_lines:
+                pdf_y = page_height - y  # Transform Y coordinate
+                pdf_canvas.line(x_start, pdf_y, x_end, pdf_y)
+
+            # Draw unique vertical lines
+            for x, y_start, y_end in v_lines:
+                pdf_y_start = page_height - y_start
+                pdf_y_end = page_height - y_end
+                pdf_canvas.line(x, pdf_y_start, x, pdf_y_end)
+
+            logger.info(f"[TABLE] Drew {len(h_lines)} horizontal + {len(v_lines)} vertical grid lines")

            # Draw embedded images
            embedded_images = table_element.get('embedded_images', [])