fix: OCR track table data format and image cropping

Table data format fixes (ocr_to_unified_converter.py):
- Fix ElementType string conversion using value-based lookup
- Add content-based HTML table detection (reclassify TEXT to TABLE)
- Use BeautifulSoup for robust HTML table parsing
- Generate TableData with fully populated cells arrays

Image cropping for OCR track (pp_structure_enhanced.py):
- Add _crop_and_save_image method for extracting image regions
- Pass source_image_path to _process_parsing_res_list
- Return relative filename (not full path) for saved_path
- Consistent with Direct Track image saving pattern

Also includes:
- Add beautifulsoup4 to requirements.txt
- Add architecture overview documentation
- Archive fix-ocr-track-table-data-format proposal (22/24 tasks)

Known issues: OCR track images are restored but still have quality issues
that will be addressed in a follow-up proposal.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
egg
2025-11-26 18:48:15 +08:00
parent a227311b2d
commit 6e050eb540
8 changed files with 585 additions and 30 deletions

View File

@@ -350,7 +350,19 @@ class OCRToUnifiedConverter:
element_type = elem_data.get('type', ElementType.TEXT)
if isinstance(element_type, str):
# Convert string to ElementType if needed
element_type = ElementType[element_type] if element_type in ElementType.__members__ else ElementType.TEXT
# ElementType is a str-based enum, so we can construct from value (lowercase)
try:
element_type = ElementType(element_type)
except ValueError:
# If value doesn't match, try member name (uppercase)
element_type = ElementType[element_type.upper()] if element_type.upper() in ElementType.__members__ else ElementType.TEXT
# Content-based reclassification: detect HTML tables in text content
content_str = elem_data.get('content', '')
if isinstance(content_str, str) and '<table' in content_str.lower():
if element_type == ElementType.TEXT:
logger.info(f"Element {elem_data.get('element_id')}: Reclassifying TEXT to TABLE (HTML table in content)")
element_type = ElementType.TABLE
# Prepare content based on element type
if element_type == ElementType.TABLE:
@@ -538,7 +550,12 @@ class OCRToUnifiedConverter:
return None
def _extract_table_data(self, elem_data: Dict) -> Optional[TableData]:
"""Extract table data from element."""
"""
Extract table data from element using BeautifulSoup for robust HTML parsing.
This method produces TableData objects with fully populated cells arrays,
matching the format produced by DirectExtractionEngine for consistency.
"""
try:
html = elem_data.get('html', '')
extracted_text = elem_data.get('extracted_text', '')
@@ -550,31 +567,101 @@ class OCRToUnifiedConverter:
html = content
logger.debug("Using content field as HTML table source")
# Try to parse HTML to get rows and columns
rows = 0
# Return None if no HTML table content
if not html or '<table' not in html.lower():
if extracted_text:
# Return minimal TableData with just caption if we have text
return TableData(rows=0, cols=0, cells=[], caption=extracted_text)
return None
# Parse HTML table using BeautifulSoup
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if not table:
logger.warning("No <table> element found in HTML")
return self._fallback_table_data(html, extracted_text)
cells = []
headers = []
rows = table.find_all('tr')
# Track actual column positions accounting for rowspan/colspan
# This is a simplified approach - complex spanning may need enhancement
for row_idx, row in enumerate(rows):
row_cells = row.find_all(['td', 'th'])
col_idx = 0
for cell in row_cells:
cell_content = cell.get_text(strip=True)
rowspan = int(cell.get('rowspan', 1))
colspan = int(cell.get('colspan', 1))
cells.append(TableCell(
row=row_idx,
col=col_idx,
row_span=rowspan,
col_span=colspan,
content=cell_content
))
# Collect headers from <th> elements or first row
if cell.name == 'th' or row_idx == 0:
headers.append(cell_content)
# Advance column index by colspan
col_idx += colspan
# Calculate actual dimensions
num_rows = len(rows)
num_cols = max(
sum(int(cell.get('colspan', 1)) for cell in row.find_all(['td', 'th']))
for row in rows
) if rows else 0
logger.debug(
f"Parsed HTML table: {num_rows} rows, {num_cols} cols, {len(cells)} cells"
)
return TableData(
rows=num_rows,
cols=num_cols,
cells=cells,
headers=headers if headers else None,
caption=extracted_text if extracted_text else None
)
except ImportError:
logger.warning("BeautifulSoup not available, using fallback parsing")
return self._fallback_table_data(html, extracted_text)
except Exception as e:
logger.warning(f"Failed to extract table data: {e}")
return None
def _fallback_table_data(self, html: str, extracted_text: str = '') -> Optional[TableData]:
"""
Fallback table parsing when BeautifulSoup is not available.
Returns basic TableData with row/col counts only (no cells).
"""
try:
rows = html.count('<tr')
cols = 0
cells = []
if rows > 0:
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
if html:
# Simple HTML parsing (could be enhanced with BeautifulSoup)
rows = html.count('<tr')
if rows > 0:
# Estimate columns from first row
first_row_end = html.find('</tr>')
if first_row_end > 0:
first_row = html[:first_row_end]
cols = first_row.count('<td') + first_row.count('<th')
# Return None if no valid table data found
if rows == 0 and cols == 0 and not extracted_text:
return None
# Note: TableData uses 'cols' not 'columns'
# HTML content can be stored as caption or in element metadata
return TableData(
rows=rows,
cols=cols,
cells=cells,
cells=[], # Empty cells in fallback mode
caption=extracted_text if extracted_text else None
)
except:
@@ -653,9 +740,9 @@ class OCRToUnifiedConverter:
min_distance = float('inf')
for target in targets:
# Caption should be below the target
if target.bbox.y2 <= caption.bbox.y1:
distance = caption.bbox.y1 - target.bbox.y2
# Caption should be below the target (y1 is bottom in BoundingBox)
if target.bbox.y1 <= caption.bbox.y0:
distance = caption.bbox.y0 - target.bbox.y1
if distance < min_distance:
min_distance = distance
best_target = target
@@ -684,8 +771,8 @@ class OCRToUnifiedConverter:
else:
prev_item = list_items[i-1]
# Check if items are consecutive (similar x position, reasonable y gap)
x_aligned = abs(item.bbox.x1 - prev_item.bbox.x1) < 20
y_consecutive = (item.bbox.y1 - prev_item.bbox.y2) < 30
x_aligned = abs(item.bbox.x0 - prev_item.bbox.x0) < 20
y_consecutive = (item.bbox.y0 - prev_item.bbox.y1) < 30
if x_aligned and y_consecutive:
current_group.append(item)
@@ -714,11 +801,11 @@ class OCRToUnifiedConverter:
if i + 1 < len(headers):
next_header_y = headers[i + 1].bbox.y1
# Find all elements between headers
# Find all elements between headers (y0=top, y1=bottom)
content_elements = [
e for e in elements
if (e.bbox.y1 > header.bbox.y2 and
e.bbox.y1 < next_header_y and
if (e.bbox.y0 > header.bbox.y1 and
e.bbox.y0 < next_header_y and
e.type not in [ElementType.HEADER, ElementType.TITLE])
]