# OCR/requirements.txt

# Tool_OCR - Backend Dependencies
# Python 3.10+

# ===== Core Framework =====
fastapi==0.115.0
uvicorn[standard]==0.32.0
pydantic==2.9.2
pydantic-settings==2.6.1
email-validator>=2.0.0  # For pydantic EmailStr validation

# ===== OCR Engine =====
paddleocr>=3.0.0
# paddlepaddle>=3.0.0  # Installed separately in setup script (GPU/CPU version)
paddlex[ocr]>=3.0.0  # Required for PP-StructureV3 layout analysis

# ===== Image Processing =====
pillow>=10.0.0
pdf2image>=1.17.0
opencv-python>=4.8.0

# ===== PDF Generation =====
weasyprint>=60.0
markdown>=3.5.0
reportlab>=4.0.0  # Layout-preserving PDF generation with precise coordinate control
PyPDF2>=3.0.0  # Extract dimensions from source PDF files
# NOTE(review): PyPDF2 is deprecated upstream in favor of `pypdf`; migrating requires
# changing the imports in the code that uses it - confirm before switching.
# Note: pandoc is a system binary, not a pip package, so it must be installed separately
# with the OS package manager (e.g. `brew install pandoc` on macOS).

# ===== Direct PDF Extraction (Dual-track Processing) =====
PyMuPDF>=1.23.0  # Primary library for editable PDF text/structure extraction
pdfplumber>=0.10.0  # Fallback for table extraction and validation
# python-magic-bin bundles the libmagic binary and is published as pre-built wheels only
# (no source distribution), so an unconditional requirement makes `pip install -r` fail on
# platforms without a matching wheel (e.g. Linux). Restrict it to Windows; other platforms
# use python-magic (listed under Utilities) together with the system libmagic library.
python-magic-bin>=0.4.14; sys_platform == "win32"  # Windows-compatible file type detection

# ===== Data Export =====
pandas>=2.1.0
openpyxl>=3.1.0  # Excel support

# ===== Database =====
sqlalchemy>=2.0.0
pymysql>=1.1.0
alembic>=1.13.0

# ===== Authentication =====
python-jose[cryptography]>=3.3.0
passlib[bcrypt]>=1.7.4
bcrypt==4.2.1  # Pin to 4.2.1 for passlib compatibility
python-multipart>=0.0.6

# ===== Configuration =====
python-dotenv>=1.0.0
pyyaml>=6.0

# ===== HTTP Client =====
httpx>=0.25.0
requests>=2.31.0

# ===== Background Tasks (Optional) =====
# redis>=5.0.0  # Uncomment if using Redis for task queue
# celery>=5.3.0  # Uncomment if using Celery

# ===== Translation (Reserved) =====
# argostranslate>=1.9.0  # Uncomment when implementing translation

# ===== Development Tools =====
# Test runner, coverage, formatter and linter; presumably not imported by the running service.
# NOTE(review): consider moving these to a separate dev requirements file so that
# production installs do not pull in test/lint tooling - confirm how deployments install.
pytest>=7.4.0
pytest-asyncio>=0.21.0
pytest-cov>=4.1.0
black>=23.9.0
pylint>=3.0.0

# ===== Utilities =====
# python-magic and python-magic-bin (listed under Direct PDF Extraction) both install the
# top-level `magic` package, so installing both lets one overwrite the other's files.
# Install python-magic only where python-magic-bin is not the intended provider, i.e. on
# non-Windows platforms. It needs the system libmagic library there
# (e.g. `apt install libmagic1` on Debian/Ubuntu, `brew install libmagic` on macOS).
python-magic>=0.4.27; sys_platform != "win32"  # File type detection
beautifulsoup4>=4.12.0  # HTML table parsing for OCR track