chore: backup before code cleanup
Backup commit before executing remove-unused-code proposal. This includes all pending changes and new features. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -28,9 +28,11 @@ from PIL import Image
|
||||
import numpy as np
|
||||
import cv2
|
||||
from app.models.unified_document import ElementType
|
||||
from app.services.cell_validation_engine import CellValidationEngine, CellValidationConfig
|
||||
from app.core.config import settings
|
||||
from app.services.memory_manager import prediction_context
|
||||
from app.services.cv_table_detector import CVTableDetector
|
||||
from app.services.table_content_rebuilder import TableContentRebuilder
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -91,7 +93,8 @@ class PPStructureEnhanced:
|
||||
preprocessed_image: Optional[Image.Image] = None,
|
||||
scaling_info: Optional['ScalingInfo'] = None,
|
||||
save_visualization: bool = False,
|
||||
use_cv_table_detection: bool = False
|
||||
use_cv_table_detection: bool = False,
|
||||
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Analyze document with full PP-StructureV3 capabilities.
|
||||
@@ -110,6 +113,8 @@ class PPStructureEnhanced:
|
||||
(layout_det_res, layout_order_res, overall_ocr_res, etc.)
|
||||
use_cv_table_detection: If True, use CV-based line detection for wired tables
|
||||
instead of ML-based cell detection (RT-DETR-L)
|
||||
raw_ocr_regions: Optional list of raw OCR text regions for table content
|
||||
rebuilding. Used when PP-StructureV3's table HTML is incorrect.
|
||||
|
||||
Returns:
|
||||
Dictionary with complete structure information including:
|
||||
@@ -222,6 +227,7 @@ class PPStructureEnhanced:
|
||||
|
||||
# Extract table_res_list which contains cell_box_list
|
||||
layout_det_res = None
|
||||
overall_ocr_res = None
|
||||
if result_dict:
|
||||
if 'table_res_list' in result_dict:
|
||||
table_res_list = result_dict['table_res_list']
|
||||
@@ -235,13 +241,20 @@ class PPStructureEnhanced:
|
||||
layout_det_res = result_dict['layout_det_res']
|
||||
logger.info(f"Found layout_det_res with {len(layout_det_res.get('boxes', []))} boxes")
|
||||
|
||||
# Extract overall_ocr_res for gap filling (avoid separate Raw OCR inference)
|
||||
if 'overall_ocr_res' in result_dict:
|
||||
overall_ocr_res = result_dict['overall_ocr_res']
|
||||
ocr_count = len(overall_ocr_res.get('rec_texts', []))
|
||||
logger.info(f"Found overall_ocr_res with {ocr_count} text regions")
|
||||
|
||||
# Process parsing_res_list if found
|
||||
if parsing_res_list:
|
||||
elements = self._process_parsing_res_list(
|
||||
parsing_res_list, current_page, output_dir, image_path, scaling_info,
|
||||
table_res_list=table_res_list, # Pass table_res_list for cell_box_list
|
||||
layout_det_res=layout_det_res, # Pass layout_det_res for Image-in-Table
|
||||
use_cv_table_detection=use_cv_table_detection # Use CV for wired tables
|
||||
use_cv_table_detection=use_cv_table_detection, # Use CV for wired tables
|
||||
raw_ocr_regions=raw_ocr_regions # Pass raw OCR for table content rebuilding
|
||||
)
|
||||
all_elements.extend(elements)
|
||||
|
||||
@@ -289,6 +302,15 @@ class PPStructureEnhanced:
|
||||
if visualization_dir:
|
||||
result['visualization_dir'] = str(visualization_dir)
|
||||
|
||||
# Add overall_ocr_res for gap filling (converted to standard format)
|
||||
# This allows gap_filling_service to use PP-StructureV3's internal OCR
|
||||
# instead of running a separate Raw OCR inference
|
||||
if overall_ocr_res:
|
||||
result['overall_ocr_res'] = self._convert_overall_ocr_to_regions(
|
||||
overall_ocr_res, scaling_info
|
||||
)
|
||||
logger.info(f"Converted {len(result['overall_ocr_res'])} OCR regions from overall_ocr_res")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
@@ -327,7 +349,8 @@ class PPStructureEnhanced:
|
||||
scaling_info: Optional['ScalingInfo'] = None,
|
||||
table_res_list: Optional[List[Dict]] = None,
|
||||
layout_det_res: Optional[Dict] = None,
|
||||
use_cv_table_detection: bool = False
|
||||
use_cv_table_detection: bool = False,
|
||||
raw_ocr_regions: Optional[List[Dict[str, Any]]] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process parsing_res_list to extract all elements.
|
||||
@@ -341,6 +364,7 @@ class PPStructureEnhanced:
|
||||
table_res_list: Optional list of table results containing cell_box_list
|
||||
layout_det_res: Optional layout detection result for Image-in-Table processing
|
||||
use_cv_table_detection: If True, use CV line detection for wired tables
|
||||
raw_ocr_regions: Optional list of raw OCR text regions for table content rebuilding
|
||||
|
||||
Returns:
|
||||
List of processed elements with normalized structure
|
||||
@@ -415,6 +439,11 @@ class PPStructureEnhanced:
|
||||
mapped_type = ElementType.TABLE
|
||||
html_table_content = content # Store for later use
|
||||
|
||||
# Strip LaTeX math formatting from text content (PP-Structure formula detection)
|
||||
if content and mapped_type in [ElementType.TEXT, ElementType.TITLE, ElementType.HEADER]:
|
||||
if '$' in content and '\\' in content:
|
||||
content = self._strip_latex_math(content)
|
||||
|
||||
# Create element
|
||||
element = {
|
||||
'element_id': f"pp3_{current_page}_{idx}",
|
||||
@@ -468,18 +497,84 @@ class PPStructureEnhanced:
|
||||
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (HTML match)")
|
||||
break
|
||||
|
||||
# If no HTML match, use first available table_res with cell_box_list
|
||||
# If no HTML match, find best matching table_res by bbox overlap
|
||||
if not cell_boxes_extracted:
|
||||
best_match = None
|
||||
best_overlap = 0.0
|
||||
|
||||
for tbl_res in table_res_list:
|
||||
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
|
||||
cell_boxes = tbl_res['cell_box_list']
|
||||
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
|
||||
element['cell_boxes_source'] = 'table_res_list'
|
||||
cell_boxes_extracted = True
|
||||
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (first available)")
|
||||
# Remove used table_res to avoid reuse
|
||||
table_res_list.remove(tbl_res)
|
||||
break
|
||||
if 'cell_box_list' not in tbl_res or not tbl_res['cell_box_list']:
|
||||
continue
|
||||
|
||||
# Get table_res bbox from its cell_box_list
|
||||
cell_boxes_temp = tbl_res['cell_box_list']
|
||||
if not cell_boxes_temp:
|
||||
continue
|
||||
|
||||
# Calculate bounding box of all cells
|
||||
tbl_x1 = min(cb[0] for cb in cell_boxes_temp)
|
||||
tbl_y1 = min(cb[1] for cb in cell_boxes_temp)
|
||||
tbl_x2 = max(cb[2] for cb in cell_boxes_temp)
|
||||
tbl_y2 = max(cb[3] for cb in cell_boxes_temp)
|
||||
|
||||
# Calculate IoU (Intersection over Union) with element bbox
|
||||
# bbox is [x1, y1, x2, y2]
|
||||
elem_x1, elem_y1, elem_x2, elem_y2 = bbox[0], bbox[1], bbox[2], bbox[3]
|
||||
|
||||
# Intersection
|
||||
inter_x1 = max(tbl_x1, elem_x1)
|
||||
inter_y1 = max(tbl_y1, elem_y1)
|
||||
inter_x2 = min(tbl_x2, elem_x2)
|
||||
inter_y2 = min(tbl_y2, elem_y2)
|
||||
|
||||
if inter_x1 < inter_x2 and inter_y1 < inter_y2:
|
||||
inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
|
||||
elem_area = (elem_x2 - elem_x1) * (elem_y2 - elem_y1)
|
||||
tbl_area = (tbl_x2 - tbl_x1) * (tbl_y2 - tbl_y1)
|
||||
|
||||
# Use overlap ratio with element bbox (how much of element is covered)
|
||||
overlap_ratio = inter_area / elem_area if elem_area > 0 else 0
|
||||
|
||||
if overlap_ratio > best_overlap:
|
||||
best_overlap = overlap_ratio
|
||||
best_match = tbl_res
|
||||
|
||||
# Use best match if overlap is significant (>10%)
|
||||
if best_match and best_overlap > 0.1:
|
||||
cell_boxes = best_match['cell_box_list']
|
||||
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
|
||||
element['cell_boxes_source'] = 'table_res_list'
|
||||
cell_boxes_extracted = True
|
||||
logger.info(f"[TABLE] Found {len(cell_boxes)} cell boxes from table_res_list (bbox match, overlap={best_overlap:.2f})")
|
||||
|
||||
# Extract pred_html if not already set
|
||||
if not html_content and 'pred_html' in best_match:
|
||||
html_content = best_match['pred_html']
|
||||
element['html'] = html_content
|
||||
element['extracted_text'] = self._extract_text_from_html(html_content)
|
||||
logger.info(f"[TABLE] Extracted HTML from table_res_list (bbox match, {len(html_content)} chars)")
|
||||
|
||||
# Remove used table_res to avoid reuse
|
||||
table_res_list.remove(best_match)
|
||||
elif table_res_list:
|
||||
# Fallback to first available if no bbox match found
|
||||
for tbl_res in table_res_list:
|
||||
if 'cell_box_list' in tbl_res and tbl_res['cell_box_list']:
|
||||
cell_boxes = tbl_res['cell_box_list']
|
||||
element['cell_boxes'] = [[float(c) for c in box] for box in cell_boxes]
|
||||
element['cell_boxes_source'] = 'table_res_list'
|
||||
cell_boxes_extracted = True
|
||||
logger.warning(f"[TABLE] Using first available table_res (no bbox match, {len(cell_boxes)} cells)")
|
||||
|
||||
# Extract pred_html if not already set
|
||||
if not html_content and 'pred_html' in tbl_res:
|
||||
html_content = tbl_res['pred_html']
|
||||
element['html'] = html_content
|
||||
element['extracted_text'] = self._extract_text_from_html(html_content)
|
||||
logger.info(f"[TABLE] Extracted HTML from table_res_list (fallback, {len(html_content)} chars)")
|
||||
|
||||
table_res_list.remove(tbl_res)
|
||||
break
|
||||
|
||||
if not cell_boxes_extracted and 'boxes' in res_data:
|
||||
# PPStructureV3 returned cell boxes in res (unlikely in PaddleX 3.x)
|
||||
@@ -558,6 +653,42 @@ class PPStructureEnhanced:
|
||||
element['embedded_images'] = embedded_images
|
||||
logger.info(f"[TABLE] Embedded {len(embedded_images)} images into table")
|
||||
|
||||
# 4. Table content rebuilding from raw OCR regions
|
||||
# When cell_boxes have boundary issues, rebuild table content from raw OCR
|
||||
# Only if table_content_rebuilder is enabled (disabled by default as it's a patch behavior)
|
||||
logger.info(f"[TABLE] raw_ocr_regions available: {raw_ocr_regions is not None and len(raw_ocr_regions) if raw_ocr_regions else 0}")
|
||||
logger.info(f"[TABLE] cell_boxes available: {len(element.get('cell_boxes', []))}")
|
||||
if settings.table_content_rebuilder_enabled and raw_ocr_regions and element.get('cell_boxes'):
|
||||
rebuilder = TableContentRebuilder()
|
||||
should_rebuild, rebuild_reason = rebuilder.should_rebuild(
|
||||
element['cell_boxes'],
|
||||
bbox,
|
||||
element.get('html', '')
|
||||
)
|
||||
|
||||
if should_rebuild:
|
||||
logger.info(f"[TABLE] Triggering table rebuild: {rebuild_reason}")
|
||||
rebuilt_table, rebuild_stats = rebuilder.rebuild_table(
|
||||
cell_boxes=element['cell_boxes'],
|
||||
table_bbox=bbox,
|
||||
raw_ocr_regions=raw_ocr_regions,
|
||||
original_html=element.get('html', '')
|
||||
)
|
||||
|
||||
if rebuilt_table:
|
||||
# Update element with rebuilt content
|
||||
element['html'] = rebuilt_table['html']
|
||||
element['rebuilt_table'] = rebuilt_table
|
||||
element['rebuild_stats'] = rebuild_stats
|
||||
element['extracted_text'] = self._extract_text_from_html(rebuilt_table['html'])
|
||||
logger.info(
|
||||
f"[TABLE] Rebuilt table: {rebuilt_table['rows']}x{rebuilt_table['cols']} "
|
||||
f"with {len(rebuilt_table['cells'])} cells"
|
||||
)
|
||||
else:
|
||||
logger.warning(f"[TABLE] Rebuild failed: {rebuild_stats.get('reason', 'unknown')}")
|
||||
element['rebuild_stats'] = rebuild_stats
|
||||
|
||||
# Special handling for images/figures/charts/stamps (visual elements that need cropping)
|
||||
elif mapped_type in [ElementType.IMAGE, ElementType.FIGURE, ElementType.CHART, ElementType.DIAGRAM, ElementType.STAMP, ElementType.LOGO]:
|
||||
# Save image if path provided
|
||||
@@ -587,6 +718,21 @@ class PPStructureEnhanced:
|
||||
elements.append(element)
|
||||
logger.debug(f"Processed element {idx}: type={mapped_type}, bbox={bbox}")
|
||||
|
||||
# Apply cell validation to filter over-detected tables
|
||||
if settings.cell_validation_enabled:
|
||||
cell_validator = CellValidationEngine(CellValidationConfig(
|
||||
max_cell_density=settings.cell_validation_max_density,
|
||||
min_avg_cell_area=settings.cell_validation_min_cell_area,
|
||||
min_cell_height=settings.cell_validation_min_cell_height,
|
||||
enabled=True
|
||||
))
|
||||
elements, validation_stats = cell_validator.validate_and_filter_elements(elements)
|
||||
if validation_stats['reclassified_tables'] > 0:
|
||||
logger.info(
|
||||
f"Cell validation: {validation_stats['reclassified_tables']}/{validation_stats['total_tables']} "
|
||||
f"tables reclassified as TEXT due to over-detection"
|
||||
)
|
||||
|
||||
return elements
|
||||
|
||||
def _embed_images_in_table(
|
||||
@@ -911,18 +1057,145 @@ class PPStructureEnhanced:
|
||||
type_counts[elem_type] = type_counts.get(elem_type, 0) + 1
|
||||
return type_counts
|
||||
|
||||
def _convert_overall_ocr_to_regions(
|
||||
self,
|
||||
overall_ocr_res: Dict[str, Any],
|
||||
scaling_info: Optional['ScalingInfo'] = None
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Convert PP-StructureV3's overall_ocr_res to standard OCR region format.
|
||||
|
||||
This allows gap_filling_service to use PP-StructureV3's internal OCR results
|
||||
instead of running a separate Raw OCR inference, saving approximately 50%
|
||||
of total inference time.
|
||||
|
||||
The overall_ocr_res structure:
|
||||
- dt_polys: List of polygon coordinates [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
||||
- rec_texts: List of recognized text strings
|
||||
- rec_scores: List of confidence scores
|
||||
|
||||
Args:
|
||||
overall_ocr_res: Dictionary containing OCR results from PP-StructureV3
|
||||
scaling_info: Optional scaling info for coordinate restoration
|
||||
|
||||
Returns:
|
||||
List of OCR region dictionaries in standard format:
|
||||
[{'text': str, 'bbox': [[x1,y1],...], 'confidence': float}, ...]
|
||||
"""
|
||||
regions = []
|
||||
|
||||
dt_polys = overall_ocr_res.get('dt_polys', [])
|
||||
rec_texts = overall_ocr_res.get('rec_texts', [])
|
||||
rec_scores = overall_ocr_res.get('rec_scores', [])
|
||||
|
||||
# Ensure all lists have the same length
|
||||
num_regions = min(len(dt_polys), len(rec_texts))
|
||||
if len(rec_scores) < num_regions:
|
||||
# Pad with default confidence if scores are missing
|
||||
rec_scores = list(rec_scores) + [0.9] * (num_regions - len(rec_scores))
|
||||
|
||||
for i in range(num_regions):
|
||||
text = rec_texts[i]
|
||||
if not text or not text.strip():
|
||||
continue
|
||||
|
||||
poly = dt_polys[i]
|
||||
confidence = rec_scores[i] if i < len(rec_scores) else 0.9
|
||||
|
||||
# Apply scaling restoration if needed
|
||||
if scaling_info and hasattr(scaling_info, 'scale_factor') and scaling_info.scale_factor != 1.0:
|
||||
scale = scaling_info.scale_factor
|
||||
poly = [[pt[0] / scale, pt[1] / scale] for pt in poly]
|
||||
|
||||
regions.append({
|
||||
'text': text,
|
||||
'bbox': poly, # Keep polygon format for compatibility
|
||||
'confidence': confidence
|
||||
})
|
||||
|
||||
return regions
|
||||
|
||||
def _extract_text_from_html(self, html: str) -> str:
|
||||
"""Extract plain text from HTML content."""
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
return soup.get_text(separator=' ', strip=True)
|
||||
text = soup.get_text(separator=' ', strip=True)
|
||||
except:
|
||||
# Fallback: just remove HTML tags
|
||||
import re
|
||||
text = re.sub(r'<[^>]+>', ' ', html)
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text.strip()
|
||||
text = text.strip()
|
||||
|
||||
# Strip LaTeX math formatting if present
|
||||
return self._strip_latex_math(text)
|
||||
|
||||
def _strip_latex_math(self, text: str) -> str:
|
||||
"""
|
||||
Convert LaTeX math notation to plain text.
|
||||
|
||||
PP-StructureV3 outputs formulas in LaTeX format like:
|
||||
$N\\cdot m\\times8.851=|b\\cdot|$
|
||||
|
||||
This converts them to readable plain text.
|
||||
"""
|
||||
import re
|
||||
|
||||
if not text or '$' not in text:
|
||||
return text
|
||||
|
||||
# Remove $...$ delimiters but keep content
|
||||
text = re.sub(r'\$([^$]+)\$', r'\1', text)
|
||||
|
||||
# Convert common LaTeX math commands to plain text
|
||||
replacements = [
|
||||
(r'\\cdot', '·'), # Multiplication dot
|
||||
(r'\\times', '×'), # Multiplication sign
|
||||
(r'\\div', '÷'), # Division sign
|
||||
(r'\\pm', '±'), # Plus-minus
|
||||
(r'\\leq', '≤'), # Less than or equal
|
||||
(r'\\geq', '≥'), # Greater than or equal
|
||||
(r'\\neq', '≠'), # Not equal
|
||||
(r'\\approx', '≈'), # Approximately equal
|
||||
(r'\\circ', '°'), # Degree symbol
|
||||
(r'\\degree', '°'), # Degree symbol
|
||||
(r'\\alpha', 'α'),
|
||||
(r'\\beta', 'β'),
|
||||
(r'\\gamma', 'γ'),
|
||||
(r'\\delta', 'δ'),
|
||||
(r'\\mu', 'μ'),
|
||||
(r'\\Omega', 'Ω'),
|
||||
(r'\\infty', '∞'),
|
||||
(r'\^\\{2\\}', '²'), # Superscript 2
|
||||
(r'\^\\{3\\}', '³'), # Superscript 3
|
||||
(r'\^2', '²'),
|
||||
(r'\^3', '³'),
|
||||
(r'_\\{([^}]+)\\}', r'_\1'), # Subscript
|
||||
(r'\\mathrm\{([^}]+)\}', r'\1'), # Roman text
|
||||
(r'\\mathsf\{([^}]+)\}', r'\1'), # Sans-serif text
|
||||
(r'\\mathbf\{([^}]+)\}', r'\1'), # Bold text
|
||||
(r'\\text\{([^}]+)\}', r'\1'), # Text mode
|
||||
(r'\\left', ''),
|
||||
(r'\\right', ''),
|
||||
(r'\\[|]', '|'), # Pipe symbols
|
||||
(r'\\ ', ' '), # Escaped space
|
||||
(r'\\,', ' '), # Thin space
|
||||
(r'\\;', ' '), # Medium space
|
||||
(r'\\quad', ' '), # Quad space
|
||||
(r'\\qquad', ' '), # Double quad space
|
||||
]
|
||||
|
||||
for pattern, replacement in replacements:
|
||||
text = re.sub(pattern, replacement, text)
|
||||
|
||||
# Clean up any remaining backslashes followed by letters (unknown commands)
|
||||
text = re.sub(r'\\[a-zA-Z]+', '', text)
|
||||
|
||||
# Clean up multiple spaces
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
|
||||
return text.strip()
|
||||
|
||||
def _extract_bbox_from_filename(self, filename: str) -> List[int]:
|
||||
"""Extract bbox from filename if it contains coordinate information."""
|
||||
|
||||
Reference in New Issue
Block a user