chore: backup before code cleanup

Backup commit before executing remove-unused-code proposal.
This includes all pending changes and new features.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
egg
2025-12-11 11:55:39 +08:00
parent eff9b0bcd5
commit 940a406dce
58 changed files with 8226 additions and 175 deletions

View File

@@ -28,9 +28,11 @@ from PIL import Image
import numpy as np
import cv2
from app.models.unified_document import ElementType
from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
from app.core.config import settings
from app.services.memory_manager import prediction_context
from app.services.cv_table_detector import CVTableDetector
from app.services.table_content_rebuilder import TableContentRebuilder
logger = logging.getLogger(__name__)
@@ -91,7 +93,8 @@ class PPStructureEnhanced:
preprocessed_image: Optional[Image.Image] = None,
scaling_info: Optional['ScalingInfo'] = None,
save_visualization: bool = False,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
"""
Analyze document with full PP-StructureV3 capabilities.
@@ -110,6 +113,8 @@ class PPStructureEnhanced:
(layout_det_res, layout_order_res, overall_ocr_res, etc.)
use_cv_table_detection: If True, use CV-based line detection for wired tables
instead of ML-based cell detection (RT-DETR-L)
raw_ocr_regions: Optional list of raw OCR text regions for table content
rebuilding. Used when PP-StructureV3's table HTML is incorrect.
Returns:
Dictionary with complete structure information including:
@@ -222,6 +227,7 @@ class PPStructureEnhanced:
# Extract table_res_list which contains cell_box_list
layout_det_res = None
overall_ocr_res = None
if result_dict:
if 'table_res_list' in result_dict:
table_res_list = result_dict['table_res_list']
@@ -235,13 +241,20 @@ class PPStructureEnhanced:
layout_det_res = result_dict['layout_det_res']
logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
# Extract overall_ocr_res for gap filling (avoid separate Raw OCR inference)
if 'overall_ocr_res' in result_dict:
overall_ocr_res = result_dict['overall_ocr_res']
ocr_count = len(overall_ocr_res.get('rec_texts', []))
logger.info(f"Found overall_ocr_res with {ocr_count} text regions")
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir, image_path, scaling_info,
table_res_list=table_res_list, # Pass table_res_list for cell_box_list
layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table
use_cv_table_detection=use_cv_table_detection # Use CV for wired tables
use_cv_table_detection=use_cv_table_detection, # Use CV for wired tables
raw_ocr_regions=raw_ocr_regions # Pass raw OCR for table content rebuilding
)
all_elements.extend(elements)
@@ -289,6 +302,15 @@ class PPStructureEnhanced:
if visualization_dir:
result['visualization_dir'] = str(visualization_dir)
# Add overall_ocr_res for gap filling (converted to standard format)
# This allows gap_filling_service to use PP-StructureV3's internal OCR
# instead of running a separate Raw OCR inference
if overall_ocr_res:
result['overall_ocr_res'] = self._convert_overall_ocr_to_regions(
overall_ocr_res, scaling_info
)
logger.info(f"Converted {len(result['overall_ocr_res'])} OCR regions from overall_ocr_res")
return result
except Exception as e:
@@ -327,7 +349,8 @@ class PPStructureEnhanced:
scaling_info: Optional['ScalingInfo'] = None,
table_res_list: Optional[List[Dict]] = None,
layout_det_res: Optional[Dict] = None,
use_cv_table_detection: bool = False
use_cv_table_detection: bool = False,
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -341,6 +364,7 @@ class PPStructureEnhanced:
table_res_list: Optional list of table results containing cell_box_list
layout_det_res: Optional layout detection result for Image-in-Table processing
use_cv_table_detection: If True, use CV line detection for wired tables
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
Returns:
List of processed elements with normalized structure
@@ -415,6 +439,11 @@ class PPStructureEnhanced:
mapped_type = ElementType.TABLE
html_table_content = content # Store for later use
# Strip LaTeX math formatting from text content (PP-Structure formula detection)
if content and mapped_type in [ElementType.TEXT, ElementType.TITLE, ElementType.HEADER]:
if '$' in content and '\\' in content:
content = self._strip_latex_math(content)
# Create element
element = {
'element_id': f"pp3_{current_page}_{idx}",
@@ -468,18 +497,84 @@ class PPStructureEnhanced:
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
break
# If no HTML match, use first available table_res with cell_box_list
# If no HTML match, find best matching table_res by bbox overlap
if not cell_boxes_extracted:
best_match = None
best_overlap = 0.0
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
# Remove used table_res to avoid reuse
table_res_list.remove(tbl_res)
break
if 'cell_box_list' not in tbl_res or not tbl_res['cell_box_list']:
continue
# Get table_res bbox from its cell_box_list
cell_boxes_temp = tbl_res['cell_box_list']
if not cell_boxes_temp:
continue
# Calculate bounding box of all cells
tbl_x1 = min(cb[0] for cb in cell_boxes_temp)
tbl_y1 = min(cb[1] for cb in cell_boxes_temp)
tbl_x2 = max(cb[2] for cb in cell_boxes_temp)
tbl_y2 = max(cb[3] for cb in cell_boxes_temp)
# Calculate IoU (Intersection over Union) with element bbox
# bbox is [x1, y1, x2, y2]
elem_x1, elem_y1, elem_x2, elem_y2 = bbox[0], bbox[1], bbox[2], bbox[3]
# Intersection
inter_x1 = max(tbl_x1, elem_x1)
inter_y1 = max(tbl_y1, elem_y1)
inter_x2 = min(tbl_x2, elem_x2)
inter_y2 = min(tbl_y2, elem_y2)
if inter_x1 < inter_x2 and inter_y1 < inter_y2:
inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
elem_area = (elem_x2 - elem_x1) * (elem_y2 - elem_y1)
tbl_area = (tbl_x2 - tbl_x1) * (tbl_y2 - tbl_y1)
# Use overlap ratio with element bbox (how much of element is covered)
overlap_ratio = inter_area / elem_area if elem_area > 0 else 0
if overlap_ratio > best_overlap:
best_overlap = overlap_ratio
best_match = tbl_res
# Use best match if overlap is significant (>10%)
if best_match and best_overlap > 0.1:
cell_boxes = best_match['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (bbox match, overlap={best_overlap:.2f})")
# Extract pred_html if not already set
if not html_content and 'pred_html' in best_match:
html_content = best_match['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (bbox match, {len(html_content)} chars)")
# Remove used table_res to avoid reuse
table_res_list.remove(best_match)
elif table_res_list:
# Fallback to first available if no bbox match found
for tbl_res in table_res_list:
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
cell_boxes = tbl_res['cell_box_list']
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
element['cell_boxes_source'] = 'table_res_list'
cell_boxes_extracted = True
logger.warning(f"[TABLE] Using first available table_res (no bbox match, {len(cell_boxes)} cells)")
# Extract pred_html if not already set
if not html_content and 'pred_html' in tbl_res:
html_content = tbl_res['pred_html']
element['html'] = html_content
element['extracted_text'] = self._extract_text_from_html(html_content)
logger.info(f"[TABLE] Extracted HTML from table_res_list (fallback, {len(html_content)} chars)")
table_res_list.remove(tbl_res)
break
if not cell_boxes_extracted and 'boxes' in res_data:
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
@@ -558,6 +653,42 @@ class PPStructureEnhanced:
element['embedded_images'] = embedded_images
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
# 4. Table content rebuilding from raw OCR regions
# When cell_boxes have boundary issues, rebuild table content from raw OCR
# Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
rebuilder = TableContentRebuilder()
should_rebuild, rebuild_reason = rebuilder.should_rebuild(
element['cell_boxes'],
bbox,
element.get('html', '')
)
if should_rebuild:
logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
cell_boxes=element['cell_boxes'],
table_bbox=bbox,
raw_ocr_regions=raw_ocr_regions,
original_html=element.get('html', '')
)
if rebuilt_table:
# Update element with rebuilt content
element['html'] = rebuilt_table['html']
element['rebuilt_table'] = rebuilt_table
element['rebuild_stats'] = rebuild_stats
element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
logger.info(
f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
f"with {len(rebuilt_table['cells'])} cells"
)
else:
logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
element['rebuild_stats'] = rebuild_stats
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
# Save image if path provided
@@ -587,6 +718,21 @@ class PPStructureEnhanced:
elements.append(element)
logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")
# Apply cell validation to filter over-detected tables
if settings.cell_validation_enabled:
cell_validator = CellValidationEngine(CellValidationConfig(
max_cell_density=settings.cell_validation_max_density,
min_avg_cell_area=settings.cell_validation_min_cell_area,
min_cell_height=settings.cell_validation_min_cell_height,
enabled=True
))
elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
if validation_stats['reclassified_tables'] > 0:
logger.info(
f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
f"tables reclassified as TEXT due to over-detection"
)
return elements
def _embed_images_in_table(
@@ -911,18 +1057,145 @@ class PPStructureEnhanced:
type_counts[elem_type] = type_counts.get(elem_type, 0) + 1
return type_counts
def _convert_overall_ocr_to_regions(
self,
overall_ocr_res: Dict[str, Any],
scaling_info: Optional['ScalingInfo'] = None
) -> List[Dict[str, Any]]:
"""
Convert PP-StructureV3's overall_ocr_res to standard OCR region format.
This allows gap_filling_service to use PP-StructureV3's internal OCR results
instead of running a separate Raw OCR inference, saving approximately 50%
of total inference time.
The overall_ocr_res structure:
- dt_polys: List of polygon coordinates [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
- rec_texts: List of recognized text strings
- rec_scores: List of confidence scores
Args:
overall_ocr_res: Dictionary containing OCR results from PP-StructureV3
scaling_info: Optional scaling info for coordinate restoration
Returns:
List of OCR region dictionaries in standard format:
[{'text': str, 'bbox': [[x1,y1],...], 'confidence': float}, ...]
"""
regions = []
dt_polys = overall_ocr_res.get('dt_polys', [])
rec_texts = overall_ocr_res.get('rec_texts', [])
rec_scores = overall_ocr_res.get('rec_scores', [])
# Ensure all lists have the same length
num_regions = min(len(dt_polys), len(rec_texts))
if len(rec_scores) < num_regions:
# Pad with default confidence if scores are missing
rec_scores = list(rec_scores) + [0.9] * (num_regions - len(rec_scores))
for i in range(num_regions):
text = rec_texts[i]
if not text or not text.strip():
continue
poly = dt_polys[i]
confidence = rec_scores[i] if i < len(rec_scores) else 0.9
# Apply scaling restoration if needed
if scaling_info and hasattr(scaling_info, 'scale_factor') and scaling_info.scale_factor != 1.0:
scale = scaling_info.scale_factor
poly = [[pt[0] / scale, pt[1] / scale] for pt in poly]
regions.append({
'text': text,
'bbox': poly, # Keep polygon format for compatibility
'confidence': confidence
})
return regions
def _extract_text_from_html(self, html: str) -> str:
    """
    Extract plain text from HTML content.

    Prefers BeautifulSoup when available; otherwise falls back to a
    simple regex-based tag stripper. Any LaTeX math notation left over
    from PP-Structure formula detection is converted to plain text at
    the end via _strip_latex_math.

    Args:
        html: HTML fragment (typically a table) to flatten to text.

    Returns:
        Whitespace-normalized plain text with LaTeX math stripped.
    """
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
    except Exception:
        # Fallback: just remove HTML tags.
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; ImportError and parse errors still fall here.
        import re
        text = re.sub(r'<[^>]+>', ' ', html)
        text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Strip LaTeX math formatting if present
    return self._strip_latex_math(text)
def _strip_latex_math(self, text: str) -> str:
"""
Convert LaTeX math notation to plain text.
PP-StructureV3 outputs formulas in LaTeX format like:
$N\\cdot m\\times8.851=|b\\cdot|$
This converts them to readable plain text.
"""
import re
if not text or '$' not in text:
return text
# Remove $...$ delimiters but keep content
text = re.sub(r'\$([^$]+)\$', r'\1', text)
# Convert common LaTeX math commands to plain text
replacements = [
(r'\\cdot', '·'), # Multiplication dot
(r'\\times', '×'), # Multiplication sign
(r'\\div', '÷'), # Division sign
(r'\\pm', '±'), # Plus-minus
(r'\\leq', ''), # Less than or equal
(r'\\geq', ''), # Greater than or equal
(r'\\neq', ''), # Not equal
(r'\\approx', ''), # Approximately equal
(r'\\circ', '°'), # Degree symbol
(r'\\degree', '°'), # Degree symbol
(r'\\alpha', 'α'),
(r'\\beta', 'β'),
(r'\\gamma', 'γ'),
(r'\\delta', 'δ'),
(r'\\mu', 'μ'),
(r'\\Omega', 'Ω'),
(r'\\infty', ''),
(r'\^\\{2\\}', '²'), # Superscript 2
(r'\^\\{3\\}', '³'), # Superscript 3
(r'\^2', '²'),
(r'\^3', '³'),
(r'_\\{([^}]+)\\}', r'_\1'), # Subscript
(r'\\mathrm\{([^}]+)\}', r'\1'), # Roman text
(r'\\mathsf\{([^}]+)\}', r'\1'), # Sans-serif text
(r'\\mathbf\{([^}]+)\}', r'\1'), # Bold text
(r'\\text\{([^}]+)\}', r'\1'), # Text mode
(r'\\left', ''),
(r'\\right', ''),
(r'\\[|]', '|'), # Pipe symbols
(r'\\ ', ' '), # Escaped space
(r'\\,', ' '), # Thin space
(r'\\;', ' '), # Medium space
(r'\\quad', ' '), # Quad space
(r'\\qquad', ' '), # Double quad space
]
for pattern, replacement in replacements:
text = re.sub(pattern, replacement, text)
# Clean up any remaining backslashes followed by letters (unknown commands)
text = re.sub(r'\\[a-zA-Z]+', '', text)
# Clean up multiple spaces
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _extract_bbox_from_filename(self, filename: str) -> List[int]:
"""Extract bbox from filename if it contains coordinate information."""