Add PDF translation API, utilities, docs, and config
Introduces core backend and frontend infrastructure for a PDF translation interface. Adds API endpoints for translation, PDF testing, and AI provider testing; implements PDF text extraction, cost tracking, and pricing logic in the lib directory; adds reusable UI components; and provides comprehensive documentation (SDD, environment setup, Claude instructions). Updates Tailwind and global styles, and includes a sample test PDF and configuration files.
This commit is contained in:
296
lib/pdf-to-image.ts
Normal file
296
lib/pdf-to-image.ts
Normal file
@@ -0,0 +1,296 @@
|
||||
import sharp from 'sharp'
|
||||
import fs from 'fs'
|
||||
import path from 'path'
|
||||
import os from 'os'
|
||||
|
||||
/**
 * Caller-tunable settings for rendering PDF pages to raster images.
 * Every field is optional; the converters in this file supply their own
 * defaults when a field is omitted.
 */
interface PDFToImageOptions {
  /** Rendering density handed to pdf2pic; the converters default it to 300. */
  density?: number
  /** Part of the options shape, but no function in this file reads it. */
  saveToFile?: boolean
  /** Image encoding for each rendered page; the converters default it to 'png'. */
  format?: 'png' | 'jpeg'
  /** Encoder quality used when pages are re-encoded with sharp. */
  quality?: number
}
|
||||
|
||||
/** One rendered PDF page, as produced by the converters in this file. */
interface ConvertedPage {
  /** 1-based position of the page in the conversion output. */
  pageNumber: number
  /** Encoded image bytes for the page. */
  buffer: Buffer
  /** Pixel width taken from sharp metadata; 0 when sharp reports none. */
  width: number
  /** Pixel height taken from sharp metadata; 0 when sharp reports none. */
  height: number
}
|
||||
|
||||
/**
 * Renders the pages of a PDF to images, trying pdf2pic first and falling
 * back to pdf-poppler.
 *
 * A backend counts as successful only when it yields at least one page; a
 * backend that throws or returns an empty list is skipped (the failure is
 * logged as a warning) and the next one is tried.
 *
 * @param pdfBuffer Raw bytes of the PDF document.
 * @param options   Rendering options, forwarded unchanged to each backend.
 * @returns The rendered pages from the first backend that produced any.
 * @throws Error when neither backend produces a page.
 */
export async function convertPDFToImages(
  pdfBuffer: Buffer,
  options: PDFToImageOptions = {}
): Promise<ConvertedPage[]> {
  // The options are forwarded as-is; each backend applies its own defaults,
  // so nothing needs to be destructured here.
  try {
    console.log('Attempting PDF conversion with pdf2pic...')
    const pages = await convertWithPdf2pic(pdfBuffer, options)
    if (pages.length > 0) {
      return pages
    }
  } catch (error) {
    console.warn('pdf2pic conversion failed:', error)
  }

  try {
    console.log('Attempting PDF conversion with pdf-poppler...')
    const pages = await convertWithPdfPoppler(pdfBuffer, options)
    if (pages.length > 0) {
      return pages
    }
  } catch (error) {
    console.warn('pdf-poppler conversion failed:', error)
  }

  // Both backends failed or produced nothing: point the operator at the
  // native tools these libraries shell out to.
  throw new Error('PDF 轉圖片失敗:需要安裝 GraphicsMagick、ImageMagick 或 Poppler 工具。請安裝其中一個依賴項目。')
}
|
||||
|
||||
/**
 * Renders a PDF page by page with pdf2pic and re-encodes each page with sharp.
 *
 * Pages are requested starting at 1 until pdf2pic returns no buffer or a
 * request fails. A failure before any page was produced is rethrown so the
 * caller can report the real cause; a failure after at least one page is
 * treated as "ran past the last page".
 *
 * @param pdfBuffer Raw bytes of the PDF document.
 * @param options   density (default 300), format (default 'png') and quality.
 *                  For JPEG, quality defaults to 100. For PNG, quality is only
 *                  passed to sharp when the caller set it explicitly, because
 *                  sharp switches PNG output to a quantised palette whenever a
 *                  quality value is supplied.
 * @returns The rendered pages in document order.
 * @throws Whatever pdf2pic/sharp threw when the very first page fails.
 */
async function convertWithPdf2pic(
  pdfBuffer: Buffer,
  options: PDFToImageOptions = {}
): Promise<ConvertedPage[]> {
  const {
    density = 300,
    format = 'png',
    quality = 100
  } = options

  const { fromPath } = await import('pdf2pic')

  // pdf2pic reads from disk, so stage the buffer inside a directory owned by
  // this call only. mkdtemp yields a unique name, whereas a Date.now()-based
  // file name collides when two conversions start in the same millisecond.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf2pic-'))
  const tempPdfPath = path.join(workDir, 'input.pdf')

  try {
    fs.writeFileSync(tempPdfPath, pdfBuffer)

    // NOTE(review): width/height are fixed at A4 @ 300 DPI regardless of the
    // requested density or the page's real size - confirm this is intended.
    const convert = fromPath(tempPdfPath, {
      density: density,
      saveToFile: false,
      savePath: workDir,
      format: format,
      width: 2480, // A4 at 300 DPI
      height: 3508
    })

    const convertedPages: ConvertedPage[] = []
    let pageNumber = 1

    while (true) {
      try {
        const pageResult = await convert(pageNumber, { responseType: 'buffer' })
        if (!pageResult || !pageResult.buffer) {
          break // No more pages
        }
        const rawImage = pageResult.buffer

        // Re-encode with sharp in the requested format.
        let processedBuffer: Buffer
        if (format === 'jpeg') {
          processedBuffer = await sharp(rawImage)
            .jpeg({ quality: quality })
            .toBuffer()
        } else if (options.quality === undefined) {
          // No explicit quality: keep the PNG lossless instead of letting the
          // implicit default of 100 turn on palette quantisation.
          processedBuffer = await sharp(rawImage)
            .png()
            .toBuffer()
        } else {
          processedBuffer = await sharp(rawImage)
            .png({ quality: quality })
            .toBuffer()
        }

        const metadata = await sharp(processedBuffer).metadata()

        convertedPages.push({
          pageNumber,
          buffer: processedBuffer,
          width: metadata.width || 0,
          height: metadata.height || 0
        })

        pageNumber++
      } catch (error) {
        if (convertedPages.length === 0) {
          // Nothing was rendered at all, so this is a genuine failure (for
          // example a missing native tool), not the end of the document.
          throw error
        }
        // No more pages or conversion error
        console.log(`Finished converting ${pageNumber - 1} pages`)
        break
      }
    }

    return convertedPages
  } finally {
    // Remove the staging directory and everything in it; best effort only.
    try {
      fs.rmSync(workDir, { recursive: true, force: true })
    } catch (cleanupError) {
      console.warn('Failed to clean up temporary PDF file:', cleanupError)
    }
  }
}
|
||||
|
||||
/**
 * Renders a PDF with pdf-poppler and loads the resulting image files.
 *
 * The PDF is staged in a private temporary directory, which is also used as
 * pdf-poppler's output directory. If `convert` resolves with an array it is
 * used as the list of output paths; otherwise the output directory is
 * scanned for the files pdf-poppler wrote. Only `format` is read from
 * `options`; `density` is not forwarded because none of the options passed
 * to pdf-poppler here carries it.
 *
 * @param pdfBuffer Raw bytes of the PDF document.
 * @param options   Rendering options; `format` defaults to 'png'.
 * @returns The rendered pages, numbered from 1 in output order. Files that
 *          cannot be read are skipped with a warning.
 * @throws Error('pdf-poppler 無法使用') when the module cannot be loaded;
 *         conversion errors propagate unchanged.
 */
async function convertWithPdfPoppler(
  pdfBuffer: Buffer,
  options: PDFToImageOptions = {}
): Promise<ConvertedPage[]> {
  const { format = 'png' } = options

  // Only a failed import means "pdf-poppler is unavailable"; errors raised
  // later by the conversion itself must keep their own message.
  let poppler
  try {
    poppler = await import('pdf-poppler')
  } catch (importError) {
    throw new Error('pdf-poppler 無法使用')
  }

  // A per-call directory avoids name clashes between concurrent conversions
  // and lets cleanup remove the input and any leftover output in one step.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pdf-poppler-'))
  const tempPdfPath = path.join(workDir, 'input.pdf')
  const outPrefix = 'page'

  try {
    fs.writeFileSync(tempPdfPath, pdfBuffer)

    const popplerOptions = {
      format: format,
      out_dir: workDir,
      out_prefix: outPrefix,
      page: null, // Convert all pages
      png_file: format === 'png',
      jpeg_file: format === 'jpeg'
    }

    const conversionResult: unknown = await poppler.convert(tempPdfPath, popplerOptions)

    // Accept an explicit list of paths if one is returned; otherwise pick up
    // whatever was written into the output directory.
    const imagePaths = Array.isArray(conversionResult)
      ? conversionResult.filter((entry): entry is string => typeof entry === 'string')
      : listPopplerOutput(workDir, outPrefix)

    const convertedPages: ConvertedPage[] = []

    for (let i = 0; i < imagePaths.length; i++) {
      const filePath = imagePaths[i]
      if (filePath === undefined) {
        continue
      }
      try {
        const imageBuffer = fs.readFileSync(filePath)
        const metadata = await sharp(imageBuffer).metadata()

        convertedPages.push({
          pageNumber: i + 1,
          buffer: imageBuffer,
          width: metadata.width || 0,
          height: metadata.height || 0
        })

        // Clean up converted file
        fs.unlinkSync(filePath)
      } catch (fileError) {
        console.warn(`Failed to process converted file ${filePath}:`, fileError)
      }
    }

    return convertedPages
  } finally {
    // Remove the staging directory and everything in it; best effort only.
    try {
      fs.rmSync(workDir, { recursive: true, force: true })
    } catch (cleanupError) {
      console.warn('Failed to clean up temporary PDF file:', cleanupError)
    }
  }
}

/** Lists `<prefix>-<number>.<ext>` files in `dir`, ordered by their numeric suffix. */
function listPopplerOutput(dir: string, prefix: string): string[] {
  const numbered: Array<{ file: string; page: number }> = []

  for (const name of fs.readdirSync(dir)) {
    if (!name.startsWith(`${prefix}-`)) {
      continue
    }
    const suffix = /-(\d+)\.[^.]+$/.exec(name)
    if (!suffix) {
      continue
    }
    // Sort numerically so "page-10" follows "page-9" even without zero padding.
    numbered.push({ file: path.join(dir, name), page: Number(suffix[1]) })
  }

  return numbered
    .sort((a, b) => a.page - b.page)
    .map(entry => entry.file)
}
|
||||
|
||||
/**
 * Prepares an image for OCR by running it through a sharp pipeline:
 * greyscale, normalize (auto-level), sharpen, then PNG encoding.
 *
 * This is best effort: if any step throws, the error is logged and the
 * input buffer is returned untouched.
 *
 * @param imageBuffer Encoded source image.
 * @returns The processed PNG bytes, or `imageBuffer` itself on failure.
 */
export async function optimizeImageForOCR(imageBuffer: Buffer): Promise<Buffer> {
  const sharpenSettings = { sigma: 1, m1: 0.5, m2: 2, x1: 2, y2: 10 }

  try {
    const pipeline = sharp(imageBuffer)
      .greyscale()
      .normalize() // Auto-level
      .sharpen(sharpenSettings)
      .png({ quality: 100 })

    // Await inside the try so a rejected encode is handled by the catch below.
    return await pipeline.toBuffer()
  } catch (error) {
    console.error('Image optimization error:', error)
    // Fall back to the caller's original bytes.
    return imageBuffer
  }
}
|
||||
|
||||
/**
 * Gives a rough processing-time estimate for a document.
 *
 * Uses a flat 5 seconds per page (the rough range is 3-8 seconds depending
 * on complexity) and caps the result at 120 seconds.
 *
 * @param pageCount Number of pages to process.
 * @returns Estimated seconds in the range 0..120. Zero, negative and NaN
 *          page counts yield 0 rather than a negative or NaN duration.
 */
export function estimateProcessingTime(pageCount: number): number {
  const secondsPerPage = 5
  const maxSeconds = 120 // Cap at 2 minutes

  // `!(x > 0)` is also true for NaN, which a plain `x <= 0` would let through.
  if (!(pageCount > 0)) {
    return 0
  }

  return Math.min(pageCount * secondsPerPage, maxSeconds)
}
|
||||
|
||||
/**
 * Probes whether PDF-to-image conversion works in this environment by
 * passing a small hand-written one-page PDF to convertPDFToImages.
 *
 * @returns true when the conversion resolves; false when it throws, in which
 *          case the error is logged as a warning.
 */
export async function checkPDFConversionSupport(): Promise<boolean> {
  // Minimal document: catalog -> pages -> one page with a 612x792 media box.
  const probeSource = `%PDF-1.4
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
>>
endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000074 00000 n
0000000120 00000 n
trailer
<<
/Size 4
/Root 1 0 R
>>
startxref
219
%%EOF`

  let supported = false

  try {
    await convertPDFToImages(Buffer.from(probeSource))
    supported = true
  } catch (error) {
    console.warn('PDF conversion support check failed:', error)
  }

  return supported
}
|
Reference in New Issue
Block a user