fix: OCR track table data format and image cropping
Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issues: OCR track images are restored but still have quality issues
that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -350,7 +350,19 @@ class OCRToUnifiedConverter:
|
||||
element_type = elem_data.get('type', ElementType.TEXT)
|
||||
if isinstance(element_type, str):
|
||||
# Convert string to ElementType if needed
|
||||
element_type = ElementType[element_type] if element_type in ElementType.__members__ else ElementType.TEXT
|
||||
# ElementType is a str-based enum, so we can construct from value (lowercase)
|
||||
try:
|
||||
element_type = ElementType(element_type)
|
||||
except ValueError:
|
||||
# If value doesn't match, try member name (uppercase)
|
||||
element_type = ElementType[element_type.upper()] if element_type.upper() in ElementType.__members__ else ElementType.TEXT
|
||||
|
||||
# Content-based reclassification: detect HTML tables in text content
|
||||
content_str = elem_data.get('content', '')
|
||||
if isinstance(content_str, str) and '<table' in content_str.lower():
|
||||
if element_type == ElementType.TEXT:
|
||||
logger.info(f"Element {elem_data.get('element_id')}: Reclassifying TEXT to TABLE (HTML table in content)")
|
||||
element_type = ElementType.TABLE
|
||||
|
||||
# Prepare content based on element type
|
||||
if element_type == ElementType.TABLE:
|
||||
@@ -538,7 +550,12 @@ class OCRToUnifiedConverter:
|
||||
return None
|
||||
|
||||
def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
|
||||
"""Extract table data from element."""
|
||||
"""
|
||||
Extract table data from element using BeautifulSoup for robust HTML parsing.
|
||||
|
||||
This method produces TableData objects with fully populated cells arrays,
|
||||
matching the format produced by DirectExtractionEngine for consistency.
|
||||
"""
|
||||
try:
|
||||
html = elem_data.get('html', '')
|
||||
extracted_text = elem_data.get('extracted_text', '')
|
||||
@@ -550,31 +567,101 @@ class OCRToUnifiedConverter:
|
||||
html = content
|
||||
logger.debug("Using content field as HTML table source")
|
||||
|
||||
# Try to parse HTML to get rows and columns
|
||||
rows = 0
|
||||
# Return None if no HTML table content
|
||||
if not html or '<table' not in html.lower():
|
||||
if extracted_text:
|
||||
# Return minimal TableData with just caption if we have text
|
||||
return TableData(rows=0, cols=0, cells=[], caption=extracted_text)
|
||||
return None
|
||||
|
||||
# Parse HTML table using BeautifulSoup
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
table = soup.find('table')
|
||||
|
||||
if not table:
|
||||
logger.warning("No <table> element found in HTML")
|
||||
return self._fallback_table_data(html, extracted_text)
|
||||
|
||||
cells = []
|
||||
headers = []
|
||||
rows = table.find_all('tr')
|
||||
|
||||
# Track actual column positions accounting for rowspan/colspan
|
||||
# This is a simplified approach - complex spanning may need enhancement
|
||||
for row_idx, row in enumerate(rows):
|
||||
row_cells = row.find_all(['td', 'th'])
|
||||
col_idx = 0
|
||||
|
||||
for cell in row_cells:
|
||||
cell_content = cell.get_text(strip=True)
|
||||
rowspan = int(cell.get('rowspan', 1))
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
|
||||
cells.append(TableCell(
|
||||
row=row_idx,
|
||||
col=col_idx,
|
||||
row_span=rowspan,
|
||||
col_span=colspan,
|
||||
content=cell_content
|
||||
))
|
||||
|
||||
# Collect headers from <th> elements or first row
|
||||
if cell.name == 'th' or row_idx == 0:
|
||||
headers.append(cell_content)
|
||||
|
||||
# Advance column index by colspan
|
||||
col_idx += colspan
|
||||
|
||||
# Calculate actual dimensions
|
||||
num_rows = len(rows)
|
||||
num_cols = max(
|
||||
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
|
||||
for row in rows
|
||||
) if rows else 0
|
||||
|
||||
logger.debug(
|
||||
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
|
||||
)
|
||||
|
||||
return TableData(
|
||||
rows=num_rows,
|
||||
cols=num_cols,
|
||||
cells=cells,
|
||||
headers=headers if headers else None,
|
||||
caption=extracted_text if extracted_text else None
|
||||
)
|
||||
|
||||
except ImportError:
|
||||
logger.warning("BeautifulSoup not available, using fallback parsing")
|
||||
return self._fallback_table_data(html, extracted_text)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract table data: {e}")
|
||||
return None
|
||||
|
||||
def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]:
|
||||
"""
|
||||
Fallback table parsing when BeautifulSoup is not available.
|
||||
Returns basic TableData with row/col counts only (no cells).
|
||||
"""
|
||||
try:
|
||||
rows = html.count('<tr')
|
||||
cols = 0
|
||||
cells = []
|
||||
if rows > 0:
|
||||
first_row_end = html.find('</tr>')
|
||||
if first_row_end > 0:
|
||||
first_row = html[:first_row_end]
|
||||
cols = first_row.count('<td') + first_row.count('<th')
|
||||
|
||||
if html:
|
||||
# Simple HTML parsing (could be enhanced with BeautifulSoup)
|
||||
rows = html.count('<tr')
|
||||
if rows > 0:
|
||||
# Estimate columns from first row
|
||||
first_row_end = html.find('</tr>')
|
||||
if first_row_end > 0:
|
||||
first_row = html[:first_row_end]
|
||||
cols = first_row.count('<td') + first_row.count('<th')
|
||||
|
||||
# Return None if no valid table data found
|
||||
if rows == 0 and cols == 0 and not extracted_text:
|
||||
return None
|
||||
|
||||
# Note: TableData uses 'cols' not 'columns'
|
||||
# HTML content can be stored as caption or in element metadata
|
||||
return TableData(
|
||||
rows=rows,
|
||||
cols=cols,
|
||||
cells=cells,
|
||||
cells=[], # Empty cells in fallback mode
|
||||
caption=extracted_text if extracted_text else None
|
||||
)
|
||||
except:
|
||||
@@ -653,9 +740,9 @@ class OCRToUnifiedConverter:
|
||||
min_distance = float('inf')
|
||||
|
||||
for target in targets:
|
||||
# Caption should be below the target
|
||||
if target.bbox.y2 <= caption.bbox.y1:
|
||||
distance = caption.bbox.y1 - target.bbox.y2
|
||||
# Caption should be below the target (y1 is bottom in BoundingBox)
|
||||
if target.bbox.y1 <= caption.bbox.y0:
|
||||
distance = caption.bbox.y0 - target.bbox.y1
|
||||
if distance < min_distance:
|
||||
min_distance = distance
|
||||
best_target = target
|
||||
@@ -684,8 +771,8 @@ class OCRToUnifiedConverter:
|
||||
else:
|
||||
prev_item = list_items[i-1]
|
||||
# Check if items are consecutive (similar x position, reasonable y gap)
|
||||
x_aligned = abs(item.bbox.x1 - prev_item.bbox.x1) < 20
|
||||
y_consecutive = (item.bbox.y1 - prev_item.bbox.y2) < 30
|
||||
x_aligned = abs(item.bbox.x0 - prev_item.bbox.x0) < 20
|
||||
y_consecutive = (item.bbox.y0 - prev_item.bbox.y1) < 30
|
||||
|
||||
if x_aligned and y_consecutive:
|
||||
current_group.append(item)
|
||||
@@ -714,11 +801,11 @@ class OCRToUnifiedConverter:
|
||||
if i + 1 < len(headers):
|
||||
next_header_y = headers[i + 1].bbox.y1
|
||||
|
||||
# Find all elements between headers
|
||||
# Find all elements between headers (y0=top, y1=bottom)
|
||||
content_elements = [
|
||||
e for e in elements
|
||||
if (e.bbox.y1 > header.bbox.y2 and
|
||||
e.bbox.y1 < next_header_y and
|
||||
if (e.bbox.y0 > header.bbox.y1 and
|
||||
e.bbox.y0 < next_header_y and
|
||||
e.type not in [ElementType.HEADER, ElementType.TITLE])
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user