Introduces core backend and frontend infrastructure for a PDF translation interface. Adds API endpoints for translation, PDF testing, and AI provider testing; implements PDF text extraction, cost tracking, and pricing logic in the lib directory; adds reusable UI components; and provides comprehensive documentation (SDD, environment setup, Claude instructions). Updates Tailwind and global styles, and includes a sample test PDF and configuration files.
296 lines
7.1 KiB
TypeScript
296 lines
7.1 KiB
TypeScript
import sharp from 'sharp'
|
|
import fs from 'fs'
|
|
import path from 'path'
|
|
import os from 'os'
|
|
|
|
/**
 * Options accepted by the PDF-to-image conversion helpers in this module.
 * Every field is optional; the converters apply their own defaults.
 */
interface PDFToImageOptions {
  /** Rendering density handed to pdf2pic; the converters default it to 300. */
  density?: number
  /**
   * NOTE(review): no converter visible in this file reads this flag —
   * confirm whether any caller still depends on it.
   */
  saveToFile?: boolean
  /** Output image format; the converters default it to 'png'. */
  format?: 'png' | 'jpeg'
  /** Encoder quality forwarded to sharp; the pdf2pic converter defaults it to 100. */
  quality?: number
}
|
|
|
|
/**
 * One rendered PDF page as produced by the converters in this module.
 */
interface ConvertedPage {
  /** 1-based position of the page in the converter's output. */
  pageNumber: number
  /** Encoded image bytes for the page. */
  buffer: Buffer
  /** Image width as reported by sharp metadata (0 when sharp reports none). */
  width: number
  /** Image height as reported by sharp metadata (0 when sharp reports none). */
  height: number
}
|
|
|
|
/**
 * Renders a PDF into one image per page.
 *
 * Backends are tried in order — pdf2pic first, then pdf-poppler — and the
 * first one that yields at least one page wins. A backend that throws or
 * yields no pages is logged and skipped.
 *
 * @param pdfBuffer Raw bytes of the PDF document.
 * @param options Conversion options, forwarded unchanged to each backend.
 * @returns The pages produced by the first backend that succeeds.
 * @throws Error when no backend produces any page.
 */
export async function convertPDFToImages(
  pdfBuffer: Buffer,
  options: PDFToImageOptions = {}
): Promise<ConvertedPage[]> {
  // Try pdf2pic first, then fall back to pdf-poppler
  const backends = [
    { name: 'pdf2pic', run: convertWithPdf2pic },
    { name: 'pdf-poppler', run: convertWithPdfPoppler }
  ]

  for (const { name, run } of backends) {
    try {
      console.log(`Attempting PDF conversion with ${name}...`)
      const pages = await run(pdfBuffer, options)
      if (pages.length > 0) {
        return pages
      }
      // An empty result is treated like a failure so the next backend gets a turn.
      console.warn(`${name} conversion produced no pages`)
    } catch (error) {
      console.warn(`${name} conversion failed:`, error)
    }
  }

  // If both methods fail, provide helpful error message
  throw new Error('PDF 轉圖片失敗:需要安裝 GraphicsMagick、ImageMagick 或 Poppler 工具。請安裝其中一個依賴項目。')
}
|
|
|
|
/**
 * Converts a PDF to images with pdf2pic, re-encoding every page through sharp.
 *
 * The PDF is written into a freshly created private temp directory (pdf2pic
 * is driven through `fromPath`), and that directory is removed again when the
 * function exits, whether it succeeded or not.
 *
 * Pages are requested one by one starting at 1 until pdf2pic returns no
 * buffer or a request fails. Output size is fixed at 2480x3508 regardless of
 * `density`.
 *
 * @param pdfBuffer Raw bytes of the PDF document.
 * @param options `density` (default 300), `format` (default 'png') and
 *   `quality` (default 100) are honoured; other fields are ignored here.
 * @returns The converted pages in page order; empty when the first page
 *   yields no image data.
 * @throws Whatever the first page's conversion throws (for example when the
 *   backend binaries are missing), so the caller can report the real cause.
 */
async function convertWithPdf2pic(
  pdfBuffer: Buffer,
  options: PDFToImageOptions = {}
): Promise<ConvertedPage[]> {
  const { density = 300, format = 'png', quality = 100 } = options

  const { fromPath } = await import('pdf2pic')

  // mkdtemp yields a unique directory per call, so concurrent conversions
  // can never overwrite each other's input file.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf2pic-'))
  const tempPdfPath = path.join(workDir, 'input.pdf')

  try {
    // Write PDF buffer to temporary file
    fs.writeFileSync(tempPdfPath, pdfBuffer)

    // Configure pdf2pic
    const convert = fromPath(tempPdfPath, {
      density,
      saveToFile: false,
      savePath: workDir,
      format,
      width: 2480, // A4 at 300 DPI
      height: 3508
    })

    const convertedPages: ConvertedPage[] = []

    // Convert pages until pdf2pic runs out of them
    for (let pageNumber = 1; ; pageNumber += 1) {
      try {
        const pageResult = await convert(pageNumber, { responseType: 'buffer' })
        const rendered = pageResult?.buffer
        // A zero-length Buffer is truthy, so check the length explicitly.
        if (!rendered || rendered.length === 0) {
          break // No more pages
        }

        // Re-encode with sharp in the requested format.
        // NOTE(review): sharp documents that passing `quality` to png()
        // switches on palette (quantised) output — confirm this is intended.
        const pipeline = sharp(rendered)
        const encoder = format === 'jpeg' ? pipeline.jpeg({ quality }) : pipeline.png({ quality })
        const processedBuffer = await encoder.toBuffer()

        // Get image dimensions
        const metadata = await sharp(processedBuffer).metadata()

        convertedPages.push({
          pageNumber,
          buffer: processedBuffer,
          width: metadata.width ?? 0,
          height: metadata.height ?? 0
        })
      } catch (error) {
        // A failure on the very first page is not "end of document": surface it
        // so the caller logs the real reason before trying its fallback.
        if (pageNumber === 1) {
          throw error
        }
        // Past page 1 a failure is taken to mean the document has ended.
        console.log(`Finished converting ${pageNumber - 1} pages`)
        break
      }
    }

    return convertedPages
  } finally {
    // Clean up the temporary directory and the PDF inside it
    try {
      fs.rmSync(workDir, { recursive: true, force: true })
    } catch (cleanupError) {
      console.warn('Failed to clean up temporary PDF file:', cleanupError)
    }
  }
}
|
|
|
|
/** Loads pdf-poppler lazily, mapping a failed import to the module's usual error. */
async function loadPdfPoppler() {
  try {
    return await import('pdf-poppler')
  } catch {
    throw new Error('pdf-poppler 無法使用')
  }
}

/** Lists files named `<prefix>-<page>.<ext>` in `outDir`, ordered by page number. */
function listPopplerOutputFiles(outDir: string, prefix: string): string[] {
  const found: { file: string; page: number }[] = []
  for (const name of fs.readdirSync(outDir)) {
    if (!name.startsWith(`${prefix}-`)) {
      continue
    }
    const match = /-(\d+)\.[^.]+$/.exec(name)
    if (match) {
      found.push({ file: path.join(outDir, name), page: Number(match[1]) })
    }
  }
  // Numeric sort: lexical order would put page 10 before page 2.
  found.sort((a, b) => a.page - b.page)
  return found.map((entry) => entry.file)
}

/**
 * Converts a PDF to images with pdf-poppler (fallback backend).
 *
 * Input and output live in a freshly created private temp directory that is
 * removed when the function exits. If `convert()` resolves to an array it is
 * used as the list of image paths; otherwise the output directory is scanned
 * for the rendered files.
 * NOTE(review): as far as I recall, pdf-poppler's README shows `convert()`
 * resolving without a file list — confirm against the installed version.
 *
 * @param pdfBuffer Raw bytes of the PDF document.
 * @param options Only `format` (default 'png') is used by this backend.
 * @returns One entry per image that could be read, numbered from 1 in order.
 * @throws Error('pdf-poppler 無法使用') when the module cannot be imported;
 *   errors from the conversion itself propagate unchanged.
 */
async function convertWithPdfPoppler(
  pdfBuffer: Buffer,
  options: PDFToImageOptions = {}
): Promise<ConvertedPage[]> {
  const { format = 'png' } = options

  const poppler = await loadPdfPoppler()

  // mkdtemp yields a unique directory per call, so concurrent conversions
  // can never pick up each other's input or output files.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-poppler-'))
  const tempPdfPath = path.join(workDir, 'input.pdf')
  const outPrefix = 'converted'

  try {
    // Write PDF buffer to temporary file
    fs.writeFileSync(tempPdfPath, pdfBuffer)

    const popplerOptions = {
      format,
      out_dir: workDir,
      out_prefix: outPrefix,
      page: null, // Convert all pages
      png_file: format === 'png',
      jpeg_file: format === 'jpeg'
    }

    const result: unknown = await poppler.convert(tempPdfPath, popplerOptions)
    const imagePaths = Array.isArray(result)
      ? result.map((entry) => String(entry))
      : listPopplerOutputFiles(workDir, outPrefix)

    const convertedPages: ConvertedPage[] = []
    for (const [index, filePath] of imagePaths.entries()) {
      try {
        const imageBuffer = fs.readFileSync(filePath)
        const metadata = await sharp(imageBuffer).metadata()

        convertedPages.push({
          pageNumber: index + 1,
          buffer: imageBuffer,
          width: metadata.width ?? 0,
          height: metadata.height ?? 0
        })
      } catch (fileError) {
        // Best effort: an unreadable page is skipped, not fatal.
        console.warn(`Failed to process converted file ${filePath}:`, fileError)
      }
    }

    return convertedPages
  } finally {
    // Removing the directory also removes the input PDF and every rendered image.
    try {
      fs.rmSync(workDir, { recursive: true, force: true })
    } catch (cleanupError) {
      console.warn('Failed to clean up temporary PDF file:', cleanupError)
    }
  }
}
|
|
|
|
/**
 * Prepares an image for OCR using sharp.
 *
 * Pipeline: convert to greyscale, normalize (auto-level) the contrast,
 * sharpen, then encode as PNG. No resizing is performed.
 *
 * This is best effort: if sharp fails for any reason the error is logged and
 * the original buffer is returned untouched.
 *
 * @param imageBuffer Encoded source image.
 * @returns The optimized PNG bytes, or `imageBuffer` itself on failure.
 */
export async function optimizeImageForOCR(imageBuffer: Buffer): Promise<Buffer> {
  try {
    // `return await` keeps a rejected pipeline inside this try/catch.
    return await sharp(imageBuffer)
      .greyscale()
      .normalize() // Auto-level
      .sharpen({ sigma: 1, m1: 0.5, m2: 2, x1: 2, y2: 10 })
      .png({ quality: 100 })
      .toBuffer()
  } catch (error) {
    console.error('Image optimization error:', error)
    // Return original buffer if optimization fails
    return imageBuffer
  }
}
|
|
|
|
/**
 * Estimates how long processing a document will take.
 *
 * Uses a flat 5 seconds per page (rough estimate: 3-8 seconds per page
 * depending on complexity), capped at 2 minutes.
 *
 * @param pageCount Number of pages to process.
 * @returns Estimated duration in seconds, between 0 and 120. Negative or NaN
 *   page counts yield 0.
 */
export function estimateProcessingTime(pageCount: number): number {
  const secondsPerPage = 5
  const maxSeconds = 120 // Cap at 2 minutes

  // A negative or NaN count would otherwise leak out as a negative/NaN duration.
  if (Number.isNaN(pageCount) || pageCount <= 0) {
    return 0
  }

  return Math.min(pageCount * secondsPerPage, maxSeconds)
}
|
|
|
|
/**
 * Builds a minimal single-page PDF (catalog, page tree, one blank 612x792
 * page) whose cross-reference table and `startxref` are computed from the
 * actual object positions.
 */
function buildMinimalTestPdf(): Buffer {
  const objectBodies = [
    '<<\n/Type /Catalog\n/Pages 2 0 R\n>>',
    '<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>',
    '<<\n/Type /Page\n/Parent 2 0 R\n/MediaBox [0 0 612 792]\n>>'
  ]

  // The document is pure ASCII, so string length equals byte offset.
  let document = '%PDF-1.4\n'
  const offsets: number[] = []
  objectBodies.forEach((body, index) => {
    offsets.push(document.length)
    document += `${index + 1} 0 obj\n${body}\nendobj\n`
  })

  const entryCount = objectBodies.length + 1
  const xrefOffset = document.length
  // Each xref entry is exactly 20 bytes: 10-digit offset, 5-digit generation,
  // a type flag, and a two-byte end of line (space + LF).
  const xrefEntries = offsets
    .map((offset) => `${String(offset).padStart(10, '0')} 00000 n \n`)
    .join('')

  document +=
    `xref\n0 ${entryCount}\n0000000000 65535 f \n${xrefEntries}` +
    `trailer\n<<\n/Size ${entryCount}\n/Root 1 0 R\n>>\n` +
    `startxref\n${xrefOffset}\n%%EOF`

  return Buffer.from(document, 'latin1')
}

/**
 * Checks whether this system can convert PDFs to images by running a tiny
 * generated PDF through `convertPDFToImages`.
 *
 * @returns `true` when the conversion completes, `false` when it throws
 *   (the failure is logged as a warning). Never rejects.
 */
export async function checkPDFConversionSupport(): Promise<boolean> {
  try {
    await convertPDFToImages(buildMinimalTestPdf())
    return true
  } catch (error) {
    console.warn('PDF conversion support check failed:', error)
    return false
  }
}