import fs from 'fs';
import zlib from 'zlib';

// tesseract.js, pdf2pic and sharp are loaded on demand with dynamic import()
// inside the OCR code paths instead of at module load. They are heavy, partly
// native packages that only the OCR fallback needs, so plain text extraction
// keeps working in environments where they are not installed.

/** Directory handed to pdf2pic as its save path and swept after every OCR run. */
const OCR_TEMP_DIR = './temp';

/** Words recognised below this Tesseract confidence (0-100) are dropped. */
const MIN_WORD_CONFIDENCE = 30;

/** OCR output of this many characters or fewer is treated as a failed extraction. */
const MIN_OCR_TEXT_LENGTH = 100;

/** A decompressed stream needs more than this share of letters to be scanned for strings. */
const MIN_STREAM_READABILITY = 0.1;

/** Below this share of letters the stream-extracted text is discarded in favour of OCR. */
const MIN_MEANINGFUL_RATIO = 0.3;

/** Letters counted by the text-quality heuristics (ASCII plus Spanish accented letters). */
const LETTER_PATTERN = /[a-zA-ZáéíóúñÁÉÍÓÚÑ]/g;

/**
 * A PDF literal string: "(" ... ")" where a backslash escapes the next
 * character and one level of balanced nested parentheses is allowed.
 */
const LITERAL_STRING_PATTERN = /\((?:\\[\s\S]|[^\\()]|\((?:\\[\s\S]|[^\\()])*\))*\)/g;

/** Literal strings plus hexadecimal strings ("<48656C6C6F>"), matched in document order. */
const CONTENT_STRING_PATTERN = new RegExp(`${LITERAL_STRING_PATTERN.source}|<[0-9A-Fa-f]+>`, 'g');

/** Single-character escapes that map to a control character. */
const CONTROL_ESCAPES = new Map([
  ['n', '\n'],
  ['r', '\r'],
  ['t', '\t'],
  ['b', '\b'],
  ['f', '\f'],
]);

/**
 * Resolve the escape sequences of a PDF literal string in a single pass.
 * Handles control escapes, octal codes (\ddd), line continuations
 * (backslash + end of line) and escaped delimiters; for any other escaped
 * character the backslash is simply dropped.
 * @param {string} raw - String content without the surrounding parentheses
 * @returns {string} Unescaped text
 */
function decodeLiteralString(raw) {
  return raw.replace(
    /\\(?:([0-7]{1,3})|(\r\n|\r|\n)|([\s\S]))/g,
    (whole, octal, lineBreak, char) => {
      if (octal !== undefined) {
        // Octal escapes denote one byte; overflow above 0xFF is discarded.
        return String.fromCharCode(Number.parseInt(octal, 8) & 0xff);
      }
      if (lineBreak !== undefined) {
        return '';
      }
      return CONTROL_ESCAPES.has(char) ? CONTROL_ESCAPES.get(char) : char;
    },
  );
}

/**
 * Decode the digits of a PDF hexadecimal string, one character per byte.
 * An odd number of digits is completed with a trailing 0.
 * @param {string} hex - Hex digits without the surrounding angle brackets
 * @returns {string} Decoded text
 */
function decodeHexString(hex) {
  const padded = hex.length % 2 === 0 ? hex : `${hex}0`;
  return Buffer.from(padded, 'hex').toString('latin1');
}

/**
 * Find every string token matched by `pattern` and decode it.
 * @param {string} text - Text to scan
 * @param {RegExp} pattern - Global pattern matching "(...)" and/or "<...>" tokens
 * @returns {string[]} Decoded strings in document order
 */
function extractStrings(text, pattern) {
  const tokens = text.match(pattern) || [];
  return tokens.map((token) => {
    const inner = token.slice(1, -1);
    return token[0] === '<' ? decodeHexString(inner) : decodeLiteralString(inner);
  });
}

/**
 * Share of characters in `text` that are letters.
 * @param {string} text - Text to measure
 * @returns {number} Ratio between 0 and 1 (0 for empty text)
 */
function letterRatio(text) {
  const letters = text.match(LETTER_PATTERN);
  return letters ? letters.length / text.length : 0;
}

/** Load sharp on first use. */
async function loadSharp() {
  const sharpModule = await import('sharp');
  return sharpModule.default;
}

/** Best-effort removal of the "page*" files left in the OCR temp directory. */
function cleanUpOcrTempFiles() {
  try {
    if (!fs.existsSync(OCR_TEMP_DIR)) {
      return;
    }
    for (const file of fs.readdirSync(OCR_TEMP_DIR)) {
      if (!file.startsWith('page')) {
        continue;
      }
      try {
        fs.unlinkSync(`${OCR_TEMP_DIR}/${file}`);
      } catch (unlinkError) {
        // One undeletable file must not stop the sweep.
        console.warn(`⚠️ Could not delete temp file ${file}:`, unlinkError.message);
      }
    }
    console.log('🧹 Cleaned up temp files');
  } catch (cleanupError) {
    console.log('⚠️ Could not clean up temp files:', cleanupError.message);
  }
}

/**
 * Heuristic text extraction from PDF buffers, with an OCR fallback for
 * documents whose text cannot be recovered from the file structure.
 */
export class PDFExtractor {
  /**
   * Extract text by scanning the raw, undecompressed PDF bytes for literal
   * strings. Every string is emitted once, in document order.
   * @param {Buffer} buffer - PDF file contents
   * @returns {Promise<string>} Strings joined by single spaces; '' on failure
   */
  static async extractText(buffer) {
    try {
      // latin1 maps every byte to exactly one character, so nothing is lost.
      const pdfString = buffer.toString('latin1');
      return extractStrings(pdfString, LITERAL_STRING_PATTERN).join(' ').trim();
    } catch (error) {
      console.error('Error extracting text from PDF:', error);
      return '';
    }
  }

  /**
   * Extract text from the PDF's stream objects. Each stream is inflated
   * (FlateDecode) and, when it looks like text, scanned for literal and
   * hexadecimal strings. Streams that cannot be inflated contribute their
   * printable characters as-is. Falls back to direct extraction when the
   * file has no streams, and to OCR when the result is mostly non-letters
   * or processing throws.
   * @param {Buffer} buffer - PDF file contents
   * @returns {Promise<string>} Extracted text
   */
  static async extractTextFromStream(buffer) {
    try {
      const pdfString = buffer.toString('latin1');

      const streamPattern = /stream\s*\n([\s\S]*?)\nendstream/g;
      const streams = [];
      let found = streamPattern.exec(pdfString);
      while (found !== null) {
        streams.push(found[1]);
        found = streamPattern.exec(pdfString);
      }

      if (streams.length === 0) {
        console.log('No streams found, trying direct extraction');
        return this.extractText(buffer);
      }
      console.log('Found', streams.length, 'streams');

      const pieces = [];
      for (let i = 0; i < streams.length; i++) {
        const streamData = streams[i];
        const streamNumber = i + 1;
        console.log(`Processing stream ${streamNumber}, length:`, streamData.length);

        let content;
        try {
          content = zlib.inflateSync(Buffer.from(streamData, 'latin1')).toString('utf8');
        } catch (decompressError) {
          console.log('Decompression failed for stream', streamNumber, ':', decompressError.message);
          // Not Flate-compressed: keep whatever printable characters it holds.
          pieces.push(streamData.replace(/[^\x20-\x7E\n\r\t]/g, ' '));
          continue;
        }
        console.log('Decompressed text length:', content.length);

        const readabilityRatio = letterRatio(content);
        console.log(`Stream ${streamNumber} readability ratio:`, readabilityRatio.toFixed(3));
        if (readabilityRatio <= MIN_STREAM_READABILITY) {
          console.log(`Skipping stream ${streamNumber} - low readability (${readabilityRatio.toFixed(3)})`);
          continue;
        }

        console.log('First 200 chars of decompressed:', content.substring(0, 200));
        // Strings inside [...] arrays are ordinary string tokens, so one pass
        // over the stream already captures them without the array's numbers.
        const strings = extractStrings(content, CONTENT_STRING_PATTERN);
        console.log('Found', strings.length, 'strings in stream', streamNumber);
        pieces.push(...strings);
      }

      const result = pieces.join(' ').trim();
      console.log('Total extracted text length:', result.length);

      const meaningfulRatio = letterRatio(result);
      console.log('Meaningful text ratio:', meaningfulRatio.toFixed(3));

      // An empty result has ratio 0, so it takes the OCR path as well.
      if (meaningfulRatio < MIN_MEANINGFUL_RATIO) {
        console.log('⚠️ Low meaningful text ratio - trying OCR extraction...');
        return await this.extractTextWithOCR(buffer);
      }

      return result;
    } catch (error) {
      console.error('Error extracting text from PDF stream:', error);
      return await this.extractTextWithOCR(buffer);
    }
  }

  /**
   * Preprocess an image to improve OCR accuracy for blurry/scanned documents:
   * greyscale, contrast normalisation, sharpening and a 3x3 median filter.
   * @param {Buffer} imageBuffer - Image buffer to preprocess
   * @returns {Promise<Buffer>} Preprocessed image, or the original buffer if
   *   preprocessing fails
   */
  static async preprocessImage(imageBuffer) {
    try {
      const sharp = await loadSharp();
      return await sharp(imageBuffer)
        .greyscale()
        .normalize()
        .sharpen({ sigma: 1.5, m1: 1, m2: 3, x1: 3, y2: 15, y3: 15 })
        .median(3)
        .toBuffer();
    } catch (error) {
      console.warn('⚠️ Image preprocessing failed, using original:', error.message);
      return imageBuffer;
    }
  }

  /**
   * Pick a Tesseract page segmentation mode from the spread of pixel
   * intensities in the normalised greyscale image.
   * @param {Buffer} imageBuffer - Image buffer to analyze
   * @returns {Promise<number>} PSM mode: 6 (single block) when the standard
   *   deviation is below 30, 11 (sparse text) when above 60, otherwise 3
   *   (auto); 3 as well when the image cannot be analysed
   */
  static async determinePSMMode(imageBuffer) {
    try {
      const sharp = await loadSharp();

      // sharp computes stats() on the *input* image and ignores queued
      // operations, so the normalised greyscale pixels are rendered first.
      const { data, info } = await sharp(imageBuffer)
        .greyscale()
        .normalize()
        .raw()
        .toBuffer({ resolveWithObject: true });
      const stats = await sharp(data, {
        raw: { width: info.width, height: info.height, channels: info.channels },
      }).stats();

      // sharp names this field `stdev` (all lower case).
      const spread = stats.channels[0].stdev;
      if (!Number.isFinite(spread)) {
        return 3;
      }
      if (spread < 30) {
        return 6;
      }
      if (spread > 60) {
        return 11;
      }
      return 3;
    } catch (error) {
      console.warn('⚠️ Could not determine PSM mode, using auto:', error.message);
      return 3;
    }
  }

  /**
   * Rasterise every page of the PDF and run Tesseract OCR (Spanish + English)
   * on it. A page that fails is logged and skipped. The worker is terminated
   * and temp files are removed whether or not OCR succeeds.
   * @param {Buffer} buffer - PDF file contents
   * @returns {Promise<string>} Recognised text, or a fixed Spanish placeholder
   *   message when OCR fails or yields 100 characters or fewer
   */
  static async extractTextWithOCR(buffer) {
    console.log('🔍 Starting OCR extraction with enhanced preprocessing...');

    let worker = null;

    try {
      const [{ fromBuffer }, { createWorker }] = await Promise.all([
        import('pdf2pic'),
        import('tesseract.js'),
      ]);

      const convert = fromBuffer(buffer, {
        density: 400,
        saveFilename: 'page',
        savePath: OCR_TEMP_DIR,
        format: 'png',
        width: 2000,
        height: 2000,
      });

      const pages = await convert.bulk(-1, { responseType: 'base64' });
      console.log(`📄 Converted PDF to ${pages.length} images for OCR at 400 DPI`);

      if (pages.length === 0) {
        throw new Error('No pages found in PDF');
      }

      worker = await createWorker('spa+eng');
      await worker.setParameters({
        tessedit_ocr_engine_mode: '1',
        tessedit_pageseg_mode: '3',
        tessedit_char_whitelist: '',
      });

      const pageTexts = [];
      let totalConfidence = 0;
      let pagesProcessed = 0;

      for (let i = 0; i < pages.length; i++) {
        console.log(`🔍 Processing page ${i + 1}/${pages.length} with OCR...`);

        try {
          const imageBuffer = Buffer.from(pages[i].base64, 'base64');
          const processedImageBuffer = await this.preprocessImage(imageBuffer);

          // The segmentation mode is chosen from the unprocessed page image.
          const psmMode = await this.determinePSMMode(imageBuffer);
          await worker.setParameters({ tessedit_pageseg_mode: String(psmMode) });

          const imageDataUrl = `data:image/png;base64,${processedImageBuffer.toString('base64')}`;
          const { data } = await worker.recognize(imageDataUrl);

          const words = data.words || [];
          const highConfidenceText = words
            .filter((word) => word.confidence >= MIN_WORD_CONFIDENCE)
            .map((word) => word.text)
            .join(' ');

          // Keep the filtered text only if filtering removed less than half
          // of the page; otherwise trust the unfiltered recognition result.
          const pageText = highConfidenceText.length > data.text.length * 0.5
            ? highConfidenceText
            : data.text;
          pageTexts.push(pageText);

          const pageConfidence = words.length > 0
            ? words.reduce((sum, word) => sum + word.confidence, 0) / words.length
            : 0;
          totalConfidence += pageConfidence;
          pagesProcessed++;

          console.log(`✅ Page ${i + 1}: Extracted ${pageText.length} chars, avg confidence: ${pageConfidence.toFixed(1)}%`);
        } catch (pageError) {
          // Continue with the next page instead of failing completely.
          console.error(`❌ Error processing page ${i + 1}:`, pageError.message);
        }
      }

      const avgConfidence = pagesProcessed > 0 ? totalConfidence / pagesProcessed : 0;
      console.log(`📊 Average OCR confidence: ${avgConfidence.toFixed(1)}%`);

      const finalText = pageTexts.join('\n\n').trim();
      console.log(`✅ OCR extraction completed: ${finalText.length} total characters`);

      if (finalText.length > MIN_OCR_TEXT_LENGTH) {
        console.log('📝 First 200 chars of OCR text:', finalText.substring(0, 200));
        return finalText;
      }

      console.log('⚠️ OCR extracted very little text, falling back to document name');
      return 'Documento procesado con OCR - información limitada extraída';
    } catch (error) {
      console.error('❌ OCR extraction failed:', error);
      return 'Error en procesamiento OCR - documento requiere revisión manual';
    } finally {
      if (worker) {
        try {
          await worker.terminate();
        } catch (terminateError) {
          console.warn('⚠️ Could not terminate OCR worker:', terminateError.message);
        }
      }
      cleanUpOcrTempFiles();
    }
  }
}