fix: OCR track table data format and image cropping

Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issue: OCR track images are restored, but they still have quality issues
that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 18:48:15 +08:00
parent a227311b2d
commit 6e050eb540
8 changed files with 585 additions and 30 deletions

View File

@@ -350,7 +350,19 @@ class OCRToUnifiedConverter:
element_type = elem_data.get('type', ElementType.TEXT)
if isinstance(element_type, str):
# Convert string to ElementType if needed
element_type = ElementType[element_type] if element_type in ElementType.__members__ else ElementType.TEXT
# ElementType is a str-based enum, so we can construct from value (lowercase)
try:
element_type = ElementType(element_type)
except ValueError:
# If value doesn't match, try member name (uppercase)
element_type = ElementType[element_type.upper()] if element_type.upper() in ElementType.__members__ else ElementType.TEXT
# Content-based reclassification: detect HTML tables in text content
content_str = elem_data.get('content', '')
if isinstance(content_str, str) and '<table' in content_str.lower():
if element_type == ElementType.TEXT:
logger.info(f"Element {elem_data.get('element_id')}: Reclassifying TEXT to TABLE (HTML table in content)")
element_type = ElementType.TABLE
# Prepare content based on element type
if element_type == ElementType.TABLE:
@@ -538,7 +550,12 @@ class OCRToUnifiedConverter:
return None
def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
"""Extract table data from element."""
"""
Extract table data from element using BeautifulSoup for robust HTML parsing.
This method produces TableData objects with fully populated cells arrays,
matching the format produced by DirectExtractionEngine for consistency.
"""
try:
html = elem_data.get('html', '')
extracted_text = elem_data.get('extracted_text', '')
@@ -550,31 +567,101 @@ class OCRToUnifiedConverter:
html = content
logger.debug("Using content field as HTML table source")
# Try to parse HTML to get rows and columns
rows = 0
# Return None if no HTML table content
if not html or '<table' not in html.lower():
if extracted_text:
# Return minimal TableData with just caption if we have text
return TableData(rows=0, cols=0, cells=[], caption=extracted_text)
return None
# Parse HTML table using BeautifulSoup
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if not table:
logger.warning("No <table> element found in HTML")
return self._fallback_table_data(html, extracted_text)
cells = []
headers = []
rows = table.find_all('tr')
# Track actual column positions accounting for rowspan/colspan
# This is a simplified approach - complex spanning may need enhancement
for row_idx, row in enumerate(rows):
row_cells = row.find_all(['td', 'th'])
col_idx = 0
for cell in row_cells:
cell_content = cell.get_text(strip=True)
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
cells.append(TableCell(
row=row_idx,
col=col_idx,
row_span=rowspan,
col_span=colspan,
content=cell_content
))
# Collect headers from <th> elements or first row
if cell.name == 'th' or row_idx == 0:
headers.append(cell_content)
# Advance column index by colspan
col_idx += colspan
# Calculate actual dimensions
num_rows = len(rows)
num_cols = max(
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
for row in rows
) if rows else 0
logger.debug(
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
)
return TableData(
rows=num_rows,
cols=num_cols,
cells=cells,
headers=headers if headers else None,
caption=extracted_text if extracted_text else None
)
except ImportError:
logger.warning("BeautifulSoup not available, using fallback parsing")
return self._fallback_table_data(html, extracted_text)
except Exception as e:
logger.warning(f"Failed to extract table data: {e}")
return None
def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]:
"""
Fallback table parsing when BeautifulSoup is not available.
Returns basic TableData with row/col counts only (no cells).
"""
try:
rows = html.count('<tr')
cols = 0
cells = []
if rows > 0:
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
if html:
# Simple HTML parsing (could be enhanced with BeautifulSoup)
rows = html.count('<tr')
if rows > 0:
# Estimate columns from first row
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
# Return None if no valid table data found
if rows == 0 and cols == 0 and not extracted_text:
return None
# Note: TableData uses 'cols' not 'columns'
# HTML content can be stored as caption or in element metadata
return TableData(
rows=rows,
cols=cols,
cells=cells,
cells=[], # Empty cells in fallback mode
caption=extracted_text if extracted_text else None
)
except:
@@ -653,9 +740,9 @@ class OCRToUnifiedConverter:
min_distance = float('inf')
for target in targets:
# Caption should be below the target
if target.bbox.y2 <= caption.bbox.y1:
distance = caption.bbox.y1 - target.bbox.y2
# Caption should be below the target (y1 is bottom in BoundingBox)
if target.bbox.y1 <= caption.bbox.y0:
distance = caption.bbox.y0 - target.bbox.y1
if distance < min_distance:
min_distance = distance
best_target = target
@@ -684,8 +771,8 @@ class OCRToUnifiedConverter:
else:
prev_item = list_items[i-1]
# Check if items are consecutive (similar x position, reasonable y gap)
x_aligned = abs(item.bbox.x1 - prev_item.bbox.x1) < 20
y_consecutive = (item.bbox.y1 - prev_item.bbox.y2) < 30
x_aligned = abs(item.bbox.x0 - prev_item.bbox.x0) < 20
y_consecutive = (item.bbox.y0 - prev_item.bbox.y1) < 30
if x_aligned and y_consecutive:
current_group.append(item)
@@ -714,11 +801,11 @@ class OCRToUnifiedConverter:
if i + 1 < len(headers):
next_header_y = headers[i + 1].bbox.y1
# Find all elements between headers
# Find all elements between headers (y0=top, y1=bottom)
content_elements = [
e for e in elements
if (e.bbox.y1 > header.bbox.y2 and
e.bbox.y1 < next_header_y and
if (e.bbox.y0 > header.bbox.y1 and
e.bbox.y0 < next_header_y and
e.type not in [ElementType.HEADER, ElementType.TITLE])
]

View File

@@ -167,7 +167,7 @@ class PPStructureEnhanced:
# Process parsing_res_list if found
if parsing_res_list:
elements = self._process_parsing_res_list(
parsing_res_list, current_page, output_dir
parsing_res_list, current_page, output_dir, image_path
)
all_elements.extend(elements)
@@ -229,7 +229,8 @@ class PPStructureEnhanced:
self,
parsing_res_list: List[Dict],
current_page: int,
output_dir: Optional[Path]
output_dir: Optional[Path],
source_image_path: Optional[Path] = None
) -> List[Dict[str, Any]]:
"""
Process parsing_res_list to extract all elements.
@@ -238,6 +239,7 @@ class PPStructureEnhanced:
parsing_res_list: List of parsed elements from PP-StructureV3
current_page: Current page number
output_dir: Optional output directory
source_image_path: Path to source image for cropping image regions
Returns:
List of processed elements with normalized structure
@@ -327,6 +329,17 @@ class PPStructureEnhanced:
element['img_path'] = item['img_path'] # Keep original for reference
else:
logger.warning(f"Failed to save image for element {element['element_id']}")
# Crop image from source if no img_path but source image is available
elif source_image_path and output_dir and bbox != [0, 0, 0, 0]:
cropped_path = self._crop_and_save_image(
source_image_path, bbox, output_dir, element['element_id']
)
if cropped_path:
element['saved_path'] = cropped_path
element['img_path'] = cropped_path
logger.info(f"Cropped and saved image region for {element['element_id']}")
else:
logger.warning(f"Failed to crop image for element {element['element_id']}")
# Add any additional metadata
if 'metadata' in item:
@@ -535,4 +548,62 @@ class PPStructureEnhanced:
img_obj.save(str(img_path))
logger.info(f"Saved image to {img_path}")
except Exception as e:
logger.warning(f"Failed to save PIL image: {e}")
logger.warning(f"Failed to save PIL image: {e}")
def _crop_and_save_image(
    self,
    source_image_path: Path,
    bbox: List[float],
    output_dir: Path,
    element_id: str
) -> Optional[str]:
    """
    Crop a bounding-box region out of the source page image and persist it.

    Args:
        source_image_path: Path to the full source image.
        bbox: Region coordinates as [x1, y1, x2, y2].
        output_dir: Directory the cropped PNG is written into.
        element_id: Used as the output file's base name.

    Returns:
        The bare filename of the saved crop (relative, so the PDF generator
        can join it with result_dir later — same convention as the Direct
        Track), or None when the bbox is degenerate or any step fails.
    """
    try:
        from PIL import Image

        with Image.open(source_image_path) as page_img:
            width, height = page_img.size

            # Coerce coordinates to int and clamp them to the image bounds.
            coords = [int(v) for v in bbox[:4]]
            x1, x2 = (max(0, min(c, width)) for c in (coords[0], coords[2]))
            y1, y2 = (max(0, min(c, height)) for c in (coords[1], coords[3]))

            # A zero- or negative-area region cannot be cropped.
            if x2 <= x1 or y2 <= y1:
                logger.warning(f"Invalid bbox for cropping: {bbox}")
                return None

            region = page_img.crop((x1, y1, x2, y2))

            # Save flat into output_dir (no subdirectory), mirroring how the
            # Direct Track writes its images.
            filename = f"{element_id}.png"
            img_path = output_dir / filename
            region.save(str(img_path), "PNG")

            logger.info(f"Cropped image saved: {img_path} ({x2-x1}x{y2-y1} pixels)")
            return filename
    except Exception as e:
        logger.error(f"Failed to crop and save image for {element_id}: {e}")
        return None