diff --git a/backend/app/services/direct_extraction_engine.py b/backend/app/services/direct_extraction_engine.py index 7e09b41..0ed9405 100644 --- a/backend/app/services/direct_extraction_engine.py +++ b/backend/app/services/direct_extraction_engine.py @@ -190,6 +190,10 @@ class DirectExtractionEngine: elements = [] element_counter = 0 + # Get page-level metadata (for final Page metadata) + drawings = page.get_drawings() + links = page.get_links() + # Get page dimensions rect = page.rect dimensions = Dimensions( @@ -198,18 +202,8 @@ class DirectExtractionEngine: dpi=72 # PDF standard DPI ) - # Extract text blocks with formatting (sort=True for reading order) - text_dict = page.get_text("dict", sort=True) - for block_idx, block in enumerate(text_dict.get("blocks", [])): - if block.get("type") == 0: # Text block - element = self._process_text_block( - block, page_num, element_counter - ) - if element: - elements.append(element) - element_counter += 1 - - # Extract tables (if enabled) + # Extract tables first (if enabled) to get table regions + table_bboxes = [] if self.enable_table_detection: try: # Try native table detection (PyMuPDF 1.23.0+) @@ -218,16 +212,32 @@ class DirectExtractionEngine: element = self._process_native_table( table, page_num, element_counter ) - if element: + if element and element.bbox: elements.append(element) + table_bboxes.append(element.bbox) element_counter += 1 except AttributeError: # Fallback to positional table detection logger.debug("Native table detection not available, using positional detection") table_elements = self._detect_tables_by_position(page, page_num, element_counter) + for elem in table_elements: + if elem.bbox: + table_bboxes.append(elem.bbox) elements.extend(table_elements) element_counter += len(table_elements) + # Extract text blocks with formatting (sort=True for reading order) + # Filter out lines that overlap with table regions + text_dict = page.get_text("dict", sort=True) + for block_idx, block in enumerate(text_dict.get("blocks", [])): + if block.get("type") == 0: # Text block + element = self._process_text_block( + block, page_num, element_counter, table_bboxes + ) + if element: + elements.append(element) + element_counter += 1 + # Extract images (if enabled) if self.enable_image_extraction: image_elements = self._extract_images( @@ -236,6 +246,14 @@ class DirectExtractionEngine: elements.extend(image_elements) element_counter += len(image_elements) + # Extract vector graphics (charts, diagrams) from drawing commands + if self.enable_image_extraction: + vector_elements = self._extract_vector_graphics( + page, page_num, document_id, element_counter, output_dir + ) + elements.extend(vector_elements) + element_counter += len(vector_elements) + # Extract hyperlinks links = page.get_links() for link_idx, link in enumerate(links): @@ -258,16 +276,15 @@ class DirectExtractionEngine: elements.append(element) element_counter += 1 - # Extract vector graphics (as metadata) - drawings = page.get_drawings() - if drawings: - logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands") - # PyMuPDF's sort=True already provides good reading order for multi-column layouts # (top-to-bottom, left-to-right within each row). We don't need to re-sort. # NOTE: If sort=True is not used in get_text(), uncomment the line below: # elements = self._sort_elements_for_reading_order(elements, dimensions) + # Deduplicate: Remove CHART elements that overlap with TABLE elements + # (Tables have structured data, so they take priority over vector graphics) + elements = self._deduplicate_table_chart_overlap(elements) + # Post-process elements for header/footer detection and structure elements = self._detect_headers_footers(elements, dimensions) elements = self._build_section_hierarchy(elements) @@ -519,24 +536,58 @@ class DirectExtractionEngine: return elements - def _process_text_block(self, block: Dict, page_num: int, counter: int) -> Optional[DocumentElement]: - """Process a text block into a DocumentElement""" - # Calculate block bounding box - bbox_data = block.get("bbox", [0, 0, 0, 0]) - bbox = BoundingBox( - x0=bbox_data[0], - y0=bbox_data[1], - x1=bbox_data[2], - y1=bbox_data[3] - ) + def _process_text_block(self, block: Dict, page_num: int, counter: int, + table_bboxes: List[BoundingBox] = None) -> Optional[DocumentElement]: + """ + Process a text block into a DocumentElement. + + Args: + block: Text block from PyMuPDF + page_num: Page number + counter: Element counter + table_bboxes: List of table bounding boxes to filter overlapping lines + + Returns: + DocumentElement or None if all lines overlap with tables + """ + if table_bboxes is None: + table_bboxes = [] # Extract text content and span information + # Filter out lines that significantly overlap with table regions text_parts = [] styles = [] span_children = [] # Store span-level children for inline styling span_counter = 0 + valid_line_bboxes = [] # Track bboxes of valid lines for overall bbox calculation for line in block.get("lines", []): + line_bbox_data = line.get("bbox", [0, 0, 0, 0]) + + # Check if this line overlaps with any table region + line_overlaps_table = False + for table_bbox in table_bboxes: + overlap_x0 = max(line_bbox_data[0], table_bbox.x0) + overlap_y0 = max(line_bbox_data[1], table_bbox.y0) + overlap_x1 = min(line_bbox_data[2], table_bbox.x1) + overlap_y1 = min(line_bbox_data[3], table_bbox.y1) + + if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1: + # Calculate overlap ratio + line_height = line_bbox_data[3] - line_bbox_data[1] + overlap_height = overlap_y1 - overlap_y0 + if line_height > 0: + overlap_ratio = overlap_height / line_height + if overlap_ratio >= 0.5: # Line significantly overlaps with table + line_overlaps_table = True + break + + if line_overlaps_table: + continue # Skip this line + + # Process valid line + valid_line_bboxes.append(line_bbox_data) + for span in line.get("spans", []): text = span.get("text", "") if text: @@ -553,7 +604,7 @@ class DirectExtractionEngine: styles.append(style) # Create span child element for inline styling - span_bbox_data = span.get("bbox", bbox_data) + span_bbox_data = span.get("bbox", [0, 0, 0, 0]) span_bbox = BoundingBox( x0=span_bbox_data[0], y0=span_bbox_data[1], @@ -574,10 +625,22 @@ class DirectExtractionEngine: span_counter += 1 if not text_parts: - return None + return None # All lines overlapped with tables full_text = "".join(text_parts) + # Calculate bbox from valid lines only + if valid_line_bboxes: + min_x0 = min(b[0] for b in valid_line_bboxes) + min_y0 = min(b[1] for b in valid_line_bboxes) + max_x1 = max(b[2] for b in valid_line_bboxes) + max_y1 = max(b[3] for b in valid_line_bboxes) + bbox = BoundingBox(x0=min_x0, y0=min_y0, x1=max_x1, y1=max_y1) + else: + # Fallback to original bbox if no valid lines found + bbox_data = block.get("bbox", [0, 0, 0, 0]) + bbox = BoundingBox(x0=bbox_data[0], y0=bbox_data[1], x1=bbox_data[2], y1=bbox_data[3]) + # Determine element type based on content and style element_type = self._infer_element_type(full_text, styles) @@ -642,6 +705,30 @@ class DirectExtractionEngine: y1=bbox_data[3] ) + # Extract column widths from table cells + column_widths = [] + if hasattr(table, 'cells') and table.cells: + # Group cells by column + cols_x = {} + for cell in table.cells: + col_idx = None + # Determine column index by x0 position + for idx, x0 in enumerate(sorted(set(c[0] for c in table.cells))): + if abs(cell[0] - x0) < 1.0: # Within 1pt tolerance + col_idx = idx + break + + if col_idx is not None: + if col_idx not in cols_x: + cols_x[col_idx] = {'x0': cell[0], 'x1': cell[2]} + else: + cols_x[col_idx]['x1'] = max(cols_x[col_idx]['x1'], cell[2]) + + # Calculate width for each column + for col_idx in sorted(cols_x.keys()): + width = cols_x[col_idx]['x1'] - cols_x[col_idx]['x0'] + column_widths.append(width) + # Create table cells cells = [] for row_idx, row in enumerate(data): @@ -661,12 +748,16 @@ class DirectExtractionEngine: headers=data[0] if data else None # Assume first row is header ) + # Store column widths in metadata + metadata = {"column_widths": column_widths} if column_widths else None + return DocumentElement( element_id=f"table_{page_num}_{counter}", type=ElementType.TABLE, content=table_data, bbox=bbox, - confidence=1.0 + confidence=1.0, + metadata=metadata ) except Exception as e: @@ -908,4 +999,298 @@ class DirectExtractionEngine: except Exception as e: logger.error(f"Error extracting image {img_idx}: {e}") - return elements \ No newline at end of file + return elements + + def _extract_vector_graphics(self, + page: fitz.Page, + page_num: int, + document_id: str, + counter: int, + output_dir: Optional[Path]) -> List[DocumentElement]: + """ + Extract vector graphics (charts, diagrams) from page. + + This method identifies regions that are composed of vector drawing commands + (paths, lines, rectangles) rather than embedded raster images. These are + typically charts created in Excel, vector diagrams, or other graphics. + + Args: + page: PyMuPDF page object + page_num: Page number (1-indexed) + document_id: Unique document identifier + counter: Starting counter for element IDs + output_dir: Directory to save rendered graphics + + Returns: + List of DocumentElement objects representing vector graphics + """ + elements = [] + + try: + # Get all drawing commands + drawings = page.get_drawings() + if not drawings: + return elements + + logger.debug(f"Page {page_num} contains {len(drawings)} vector drawing commands") + + # Cluster drawings into groups (charts, diagrams, etc.) + try: + # PyMuPDF's cluster_drawings() groups nearby drawings automatically + drawing_clusters = page.cluster_drawings() + logger.debug(f"Clustered into {len(drawing_clusters)} groups") + except (AttributeError, TypeError) as e: + # cluster_drawings not available or has different signature + # Fallback: try to identify charts by analyzing drawing density + logger.warning(f"cluster_drawings() failed ({e}), using fallback method") + drawing_clusters = self._cluster_drawings_fallback(page, drawings) + + for cluster_idx, bbox in enumerate(drawing_clusters): + # Ignore small regions (likely noise or separator lines) + if bbox.width < 50 or bbox.height < 50: + logger.debug(f"Skipping small cluster {cluster_idx}: {bbox.width:.1f}x{bbox.height:.1f}") + continue + + # Render the region to a raster image + # matrix=fitz.Matrix(2, 2) increases resolution to ~200 DPI + try: + pix = page.get_pixmap(clip=bbox, matrix=fitz.Matrix(2, 2)) + + # Save image if output directory provided + if output_dir: + output_dir.mkdir(parents=True, exist_ok=True) + filename = f"{document_id}_p{page_num}_chart{cluster_idx}.png" + filepath = output_dir / filename + pix.save(str(filepath)) + + # Create DocumentElement + image_data = { + "saved_path": str(filepath), + "width": pix.width, + "height": pix.height, + "colorspace": pix.colorspace.name if pix.colorspace else "unknown", + "source": "vector_graphics" + } + + element = DocumentElement( + element_id=f"chart_{page_num}_{counter + cluster_idx}", + type=ElementType.CHART, # Use CHART type for vector graphics + content=image_data, + bbox=BoundingBox( + x0=bbox.x0, + y0=bbox.y0, + x1=bbox.x1, + y1=bbox.y1 + ), + confidence=0.85, # Slightly lower confidence than raster images + metadata={ + "cluster_index": cluster_idx, + "drawing_count": len(drawings) + } + ) + elements.append(element) + logger.debug(f"Extracted chart {cluster_idx}: {bbox.width:.1f}x{bbox.height:.1f} -> {filepath}") + + pix = None # Free memory + + except Exception as e: + logger.error(f"Error rendering vector graphic cluster {cluster_idx}: {e}") + continue + + except Exception as e: + logger.error(f"Error extracting vector graphics: {e}") + + return elements + + def _cluster_drawings_fallback(self, page: fitz.Page, drawings: list) -> list: + """ + Fallback method to cluster drawings when cluster_drawings() is not available. + + This uses a simple spatial clustering approach based on bounding boxes. + """ + if not drawings: + return [] + + # Collect all drawing bounding boxes + bboxes = [] + for drawing in drawings: + rect = drawing.get('rect') + if rect: + bboxes.append(fitz.Rect(rect)) + + if not bboxes: + return [] + + # Simple clustering: merge overlapping or nearby rectangles + clusters = [] + tolerance = 20 + + for bbox in bboxes: + # Try to merge with existing cluster + merged = False + for i, cluster in enumerate(clusters): + # Check if bbox is close to this cluster + expanded_cluster = cluster + (-tolerance, -tolerance, tolerance, tolerance) + if expanded_cluster.intersects(bbox): + # Merge bbox into cluster + clusters[i] = cluster | bbox # Union of rectangles + merged = True + break + + if not merged: + # Create new cluster + clusters.append(bbox) + + # Filter out very small clusters + filtered_clusters = [c for c in clusters if c.width >= 50 and c.height >= 50] + + logger.debug(f"Fallback clustering: {len(bboxes)} drawings -> {len(clusters)} clusters -> {len(filtered_clusters)} filtered") + + return filtered_clusters + + def _deduplicate_table_chart_overlap(self, elements: List[DocumentElement]) -> List[DocumentElement]: + """ + Intelligently resolve TABLE-CHART overlaps based on table structure completeness. + + When a region is detected as both TABLE and CHART: + - Calculate cell completeness = actual_cells / (rows × cols) + - If completeness ≥50% → Real table with complete structure → Keep TABLE + - If completeness <50% → False positive (chart detected as table) → Keep CHART + + Args: + elements: List of extracted elements + + Returns: + Filtered list with low-quality overlaps removed + """ + # Collect all tables and charts + tables = [elem for elem in elements if elem.type == ElementType.TABLE] + charts = [elem for elem in elements if elem.type == ElementType.CHART] + + if not tables or not charts: + return elements # No potential conflicts + + # Analyze TABLE structure completeness + table_completeness = {} + for table in tables: + if hasattr(table.content, 'rows') and hasattr(table.content, 'cols') and hasattr(table.content, 'cells'): + expected_cells = table.content.rows * table.content.cols + actual_cells = len(table.content.cells) + + if expected_cells > 0: + completeness = actual_cells / expected_cells + table_completeness[table.element_id] = completeness + else: + table_completeness[table.element_id] = 0.0 + else: + table_completeness[table.element_id] = 0.0 + + # Check overlaps and decide what to keep + filtered_elements = [] + removed_charts = 0 + removed_tables = 0 + + # Process TABLEs + for table in tables: + if not table.bbox: + filtered_elements.append(table) + continue + + # Check if this TABLE overlaps with any CHART + overlaps_chart = False + for chart in charts: + if not chart.bbox: + continue + + # Calculate overlap + overlap_x0 = max(table.bbox.x0, chart.bbox.x0) + overlap_y0 = max(table.bbox.y0, chart.bbox.y0) + overlap_x1 = min(table.bbox.x1, chart.bbox.x1) + overlap_y1 = min(table.bbox.y1, chart.bbox.y1) + + if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1: + overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0) + table_area = (table.bbox.x1 - table.bbox.x0) * (table.bbox.y1 - table.bbox.y0) + + if table_area > 0: + overlap_ratio = overlap_area / table_area + + if overlap_ratio >= 0.8: + overlaps_chart = True + completeness = table_completeness.get(table.element_id, 0.0) + + logger.debug( + f"TABLE-CHART overlap: {table.element_id} vs {chart.element_id}: " + f"{overlap_ratio*100:.1f}% overlap, TABLE cell completeness: {completeness*100:.1f}%" + ) + + # Decision: Keep TABLE only if structure is complete + if completeness < 0.5: # <50% cell completeness + logger.info( + f"Removing incomplete TABLE {table.element_id} " + f"({completeness*100:.1f}% completeness, overlaps with CHART {chart.element_id})" + ) + removed_tables += 1 + break + else: + logger.info( + f"Keeping TABLE {table.element_id} with {completeness*100:.1f}% completeness " + f"(will remove overlapping CHART {chart.element_id})" + ) + + if not overlaps_chart or table_completeness.get(table.element_id, 0.0) >= 0.5: + filtered_elements.append(table) + + # Process CHARTs + for chart in charts: + if not chart.bbox: + filtered_elements.append(chart) + continue + + # Check if this CHART should be removed due to overlap with high-quality TABLE + should_remove = False + for table in tables: + if not table.bbox: + continue + + # Calculate overlap + overlap_x0 = max(chart.bbox.x0, table.bbox.x0) + overlap_y0 = max(chart.bbox.y0, table.bbox.y0) + overlap_x1 = min(chart.bbox.x1, table.bbox.x1) + overlap_y1 = min(chart.bbox.y1, table.bbox.y1) + + if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1: + overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0) + chart_area = (chart.bbox.x1 - chart.bbox.x0) * (chart.bbox.y1 - chart.bbox.y0) + + if chart_area > 0: + overlap_ratio = overlap_area / chart_area + + if overlap_ratio >= 0.8: + completeness = table_completeness.get(table.element_id, 0.0) + + # Remove CHART only if TABLE structure is complete + if completeness >= 0.5: + should_remove = True + logger.info( + f"Removing CHART {chart.element_id} " + f"({overlap_ratio*100:.1f}% overlap with TABLE {table.element_id} having {completeness*100:.1f}% completeness)" + ) + removed_charts += 1 + break + + if not should_remove: + filtered_elements.append(chart) + + # Process all other elements + for elem in elements: + if elem.type not in [ElementType.TABLE, ElementType.CHART]: + filtered_elements.append(elem) + + if removed_charts > 0 or removed_tables > 0: + logger.info( + f"Deduplication complete: removed {removed_tables} incomplete TABLE(s), " + f"{removed_charts} overlapping CHART(s)" + ) + + return filtered_elements \ No newline at end of file diff --git a/backend/app/services/pdf_generator_service.py b/backend/app/services/pdf_generator_service.py index 4556e50..bcffbb1 100644 --- a/backend/app/services/pdf_generator_service.py +++ b/backend/app/services/pdf_generator_service.py @@ -744,7 +744,15 @@ class PDFGeneratorService: all_elements.append(('text', elem)) logger.debug(f"Drawing {len(all_elements)} elements in extraction order (preserves multi-column reading order)") - logger.debug(f"Exclusion regions: {len(regions_to_avoid)} tables/images") + logger.debug(f"Exclusion regions: {len(regions_to_avoid)} (tables/images/charts)") + + # Debug: Log exclusion region types + region_types = {} + for region in regions_to_avoid: + region_type = region.type.name + region_types[region_type] = region_types.get(region_type, 0) + 1 + if region_types: + logger.debug(f" Exclusion region breakdown: {region_types}") # Draw elements in document order for elem_type, elem in all_elements: @@ -2133,7 +2141,8 @@ class PDFGeneratorService: # Transform coordinates pdf_x = bbox.x0 - pdf_y = page_height - bbox.y1 # Bottom of table + # Use exact bbox position (no buffer) - scaling will ensure table fits + pdf_y = page_height - bbox.y1 # Bottom of table (ReportLab Y coordinate) table_width = bbox.x1 - bbox.x0 table_height = bbox.y1 - bbox.y0 @@ -2148,20 +2157,53 @@ class PDFGeneratorService: from reportlab.platypus import Table, TableStyle from reportlab.lib import colors - t = Table(table_content, colWidths=[table_width / len(table_content[0])] * len(table_content[0])) + # Use original column widths from extraction if available + # Otherwise let ReportLab auto-calculate + col_widths = None + if element.metadata and 'column_widths' in element.metadata: + col_widths = element.metadata['column_widths'] + logger.debug(f"Using extracted column widths: {col_widths}") - # Apply style + # Create table without rowHeights (will use canvas scaling instead) + t = Table(table_content, colWidths=col_widths) + + # Apply style with minimal padding to reduce table extension style = TableStyle([ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey), ('FONTSIZE', (0, 0), (-1, -1), 8), ('ALIGN', (0, 0), (-1, -1), 'LEFT'), ('VALIGN', (0, 0), (-1, -1), 'TOP'), + # Set minimal padding to prevent table from extending beyond bbox + # User reported padding=1 was still insufficient + ('TOPPADDING', (0, 0), (-1, -1), 0), + ('BOTTOMPADDING', (0, 0), (-1, -1), 0), + ('LEFTPADDING', (0, 0), (-1, -1), 1), + ('RIGHTPADDING', (0, 0), (-1, -1), 1), ]) t.setStyle(style) - # Draw table - t.wrapOn(pdf_canvas, table_width, table_height) - t.drawOn(pdf_canvas, pdf_x, pdf_y) + # CRITICAL: Use canvas scaling to fit table within bbox + # This is more reliable than rowHeights which doesn't always work + + # Step 1: Wrap to get actual rendered size + actual_width, actual_height = t.wrapOn(pdf_canvas, table_width * 10, table_height * 10) + logger.info(f"Table natural size: {actual_width:.1f} × {actual_height:.1f}pt, bbox: {table_width:.1f} × {table_height:.1f}pt") + + # Step 2: Calculate scale factor to fit within bbox + scale_x = table_width / actual_width if actual_width > table_width else 1.0 + scale_y = table_height / actual_height if actual_height > table_height else 1.0 + scale = min(scale_x, scale_y, 1.0) # Never scale up, only down + + logger.info(f"Scale factor: {scale:.3f} (x={scale_x:.3f}, y={scale_y:.3f})") + + # Step 3: Draw with scaling using canvas transform + pdf_canvas.saveState() + pdf_canvas.translate(pdf_x, pdf_y) + pdf_canvas.scale(scale, scale) + t.drawOn(pdf_canvas, 0, 0) + pdf_canvas.restoreState() + + logger.info(f"Drew table at ({pdf_x:.1f}, {pdf_y:.1f}) with scale {scale:.3f}, final size: {actual_width * scale:.1f} × {actual_height * scale:.1f}pt") logger.debug(f"Drew table element: {len(rows)} rows") diff --git a/backend/check_tables.py b/backend/check_tables.py deleted file mode 100644 index a1ea620..0000000 --- a/backend/check_tables.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 -"""Check existing tables""" - -from sqlalchemy import create_engine, text -from app.core.config import settings - -engine = create_engine(settings.database_url) - -with engine.connect() as conn: - # Get all tables - result = conn.execute(text("SHOW TABLES")) - tables = [row[0] for row in result.fetchall()] - - print("Existing tables:") - for table in sorted(tables): - print(f" - {table}") - - # Check which V2 tables exist - v2_tables = ['tool_ocr_users', 'tool_ocr_sessions', 'tool_ocr_tasks', - 'tool_ocr_task_files', 'tool_ocr_audit_logs'] - print("\nV2 Tables status:") - for table in v2_tables: - exists = table in tables - print(f" {'✓' if exists else '✗'} {table}") - - # Check which old tables exist - old_tables = ['paddle_ocr_users', 'paddle_ocr_batches', 'paddle_ocr_files', - 'paddle_ocr_results', 'paddle_ocr_export_rules', 'paddle_ocr_translation_configs'] - print("\nOld Tables status:") - for table in old_tables: - exists = table in tables - print(f" {'✓' if exists else '✗'} {table}") diff --git a/backend/verify_chart_recognition.py b/backend/verify_chart_recognition.py deleted file mode 100755 index 4e21fd2..0000000 --- a/backend/verify_chart_recognition.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -""" -Verify if chart recognition can be enabled in the current PaddlePaddle version -Run this in the conda environment: conda activate tool_ocr && python verify_chart_recognition.py -""" - -import sys - -def check_paddle_api(): - """Check if fused_rms_norm_ext API is available""" - try: - import paddle - print(f"✅ PaddlePaddle version: {paddle.__version__}") - - # Check if the API exists - import paddle.incubate.nn.functional as F - - has_base = hasattr(F, 'fused_rms_norm') - has_ext = hasattr(F, 'fused_rms_norm_ext') - - print(f"\n📊 API Availability:") - print(f" - fused_rms_norm: {'✅ Available' if has_base else '❌ Not found'}") - print(f" - fused_rms_norm_ext: {'✅ Available' if has_ext else '❌ Not found'}") - - if has_ext: - print(f"\n🎉 Chart recognition CAN be enabled!") - print(f"\n📝 Action required:") - print(f" 1. Edit backend/app/services/ocr_service.py") - print(f" 2. Change line 217: use_chart_recognition=False → True") - print(f" 3. Restart the backend service") - print(f"\n⚠️ Note: This will enable deep chart analysis (may increase processing time)") - return True - else: - print(f"\n❌ Chart recognition CANNOT be enabled yet") - print(f"\n📝 Current PaddlePaddle version ({paddle.__version__}) does not support fused_rms_norm_ext") - print(f"\n💡 Options:") - print(f" 1. Upgrade PaddlePaddle: pip install --upgrade paddlepaddle>=3.2.0") - print(f" 2. Check for newer versions: pip search paddlepaddle") - print(f" 3. Wait for official PaddlePaddle update") - return False - - except ImportError as e: - print(f"❌ PaddlePaddle not installed: {e}") - print(f"\n💡 Install PaddlePaddle:") - print(f" pip install paddlepaddle>=3.2.0") - return False - except Exception as e: - print(f"❌ Error: {e}") - return False - -if __name__ == "__main__": - print("=" * 70) - print("Chart Recognition Availability Checker") - print("=" * 70) - print() - - can_enable = check_paddle_api() - - print() - print("=" * 70) - sys.exit(0 if can_enable else 1) diff --git a/demo_docs/basic/chinese_simple.png b/demo_docs/basic/chinese_simple.png deleted file mode 100644 index 287284d..0000000 Binary files a/demo_docs/basic/chinese_simple.png and /dev/null differ diff --git a/demo_docs/basic/chinese_traditional.png b/demo_docs/basic/chinese_traditional.png deleted file mode 100644 index d604cf1..0000000 Binary files a/demo_docs/basic/chinese_traditional.png and /dev/null differ diff --git a/demo_docs/basic/english.png b/demo_docs/basic/english.png deleted file mode 100644 index f273a66..0000000 Binary files a/demo_docs/basic/english.png and /dev/null differ diff --git a/demo_docs/layout/document.png b/demo_docs/layout/document.png deleted file mode 100644 index e934939..0000000 Binary files a/demo_docs/layout/document.png and /dev/null differ diff --git a/demo_docs/mixed/4. (附件二)具體事蹟簡報格式(最佳創新獎).pdf b/demo_docs/mixed/4. (附件二)具體事蹟簡報格式(最佳創新獎).pdf deleted file mode 100644 index 7ff1d4c..0000000 Binary files a/demo_docs/mixed/4. (附件二)具體事蹟簡報格式(最佳創新獎).pdf and /dev/null differ diff --git a/demo_docs/mixed/Workflow使用分析.pdf b/demo_docs/mixed/Workflow使用分析.pdf deleted file mode 100644 index ab6f6e7..0000000 Binary files a/demo_docs/mixed/Workflow使用分析.pdf and /dev/null differ diff --git a/demo_docs/tables/simple_table.png b/demo_docs/tables/simple_table.png deleted file mode 100644 index b619e8a..0000000 Binary files a/demo_docs/tables/simple_table.png and /dev/null differ diff --git a/demo_docs/tables/截圖 2025-11-12 上午10.33.12.png b/demo_docs/tables/截圖 2025-11-12 上午10.33.12.png deleted file mode 100644 index 2e7166a..0000000 Binary files a/demo_docs/tables/截圖 2025-11-12 上午10.33.12.png and /dev/null differ diff --git a/demo_docs/tables/截圖 2025-11-12 上午10.34.33.png b/demo_docs/tables/截圖 2025-11-12 上午10.34.33.png deleted file mode 100644 index 8e4d7f3..0000000 Binary files a/demo_docs/tables/截圖 2025-11-12 上午10.34.33.png and /dev/null differ