// Source listing metadata: 367 lines, 14 KiB, JavaScript
import fs from 'fs';
import path from 'path';
import zlib from 'zlib';

import { fromBuffer } from 'pdf2pic';
import sharp from 'sharp';
import { createWorker } from 'tesseract.js';

/** Directory handed to pdf2pic as `savePath`; also swept for leftover page images after OCR. */
const OCR_TEMP_DIR = './temp';

/** OCR words below this confidence (0-100) are dropped from the filtered page text. */
const MIN_WORD_CONFIDENCE = 30;

/** Single-character escapes of PDF literal strings that map to a control character. */
const LITERAL_ESCAPES = new Map([
  ['n', '\n'],
  ['r', '\r'],
  ['t', '\t'],
  ['b', '\b'],
  ['f', '\f'],
]);

/**
 * Decode the backslash escapes of a PDF literal string body (text between the parentheses).
 *
 * Escapes are resolved in a single pass so that an escaped backslash followed by `n`
 * is not mistaken for a newline. Octal escapes (`\ddd`) are decoded to one byte-valued
 * character; for any other escaped character the backslash is simply dropped, which also
 * covers `\(`, `\)` and `\\`.
 *
 * @param {string} raw - Literal string body, still escaped
 * @returns {string} - Decoded text
 */
function decodeLiteralString(raw) {
  return raw.replace(/\\([0-7]{1,3}|.)/g, (escape, code) => {
    if (/^[0-7]/.test(code)) {
      // Keep only the low byte, a character code cannot exceed 255 here.
      return String.fromCharCode(Number.parseInt(code, 8) & 0xff);
    }
    return LITERAL_ESCAPES.has(code) ? LITERAL_ESCAPES.get(code) : code;
  });
}

/**
 * Decode the digits of a PDF hexadecimal string into byte-valued characters.
 *
 * @param {string} hex - Hex digits, possibly interleaved with whitespace
 * @returns {string} - One character per decoded byte
 */
function decodeHexString(hex) {
  const digits = hex.replace(/\s/g, '');
  // An odd number of digits is completed with a trailing 0 instead of being emitted verbatim.
  const padded = digits.length % 2 === 0 ? digits : `${digits}0`;
  return Buffer.from(padded, 'hex').toString('latin1');
}

/**
 * Extract and decode every single-line, non-nested `( ... )` literal string, in document order.
 *
 * Matching is deliberately limited to one line without nested parentheses so that a stray
 * parenthesis inside binary data cannot swallow large parts of the file.
 *
 * @param {string} source - latin1-decoded PDF data
 * @returns {string[]} - Decoded literal strings
 */
function extractLiteralStrings(source) {
  const literals = source.match(/\((?:\\.|[^\\()\r\n])*\)/g) || [];
  return literals.map((literal) => decodeLiteralString(literal.slice(1, -1)));
}

/**
 * Extract literal `( ... )` and hexadecimal `< ... >` strings from a content stream,
 * in the order they appear.
 *
 * @param {string} content - latin1-decoded content stream
 * @returns {string[]} - Decoded strings
 */
function extractContentStrings(content) {
  const tokens = content.matchAll(/\((?:\\.|[^\\()\r\n])*\)|<([0-9A-Fa-f][0-9A-Fa-f\s]*)>/g);
  const strings = [];
  for (const token of tokens) {
    const [whole, hexDigits] = token;
    strings.push(
      hexDigits === undefined ? decodeLiteralString(whole.slice(1, -1)) : decodeHexString(hexDigits),
    );
  }
  return strings;
}

/**
 * Share of characters in `text` that are ASCII letters or Spanish accented letters.
 *
 * @param {string} text - Text to measure
 * @returns {number} - Ratio in [0, 1]; 0 for empty text
 */
function letterRatio(text) {
  const letters = text.match(/[a-zA-ZáéíóúñÁÉÍÓÚÑ]/g);
  return letters ? letters.length / text.length : 0;
}

/**
 * Best-effort removal of files whose name starts with `page` from the OCR temp directory.
 * Failures are logged and never thrown.
 */
function removeTempPageFiles() {
  try {
    if (!fs.existsSync(OCR_TEMP_DIR)) {
      return;
    }
    const leftovers = fs.readdirSync(OCR_TEMP_DIR).filter((file) => file.startsWith('page'));
    for (const file of leftovers) {
      try {
        fs.unlinkSync(path.join(OCR_TEMP_DIR, file));
      } catch (unlinkError) {
        // A single undeletable file must not abort the sweep.
        console.warn('⚠️ Could not delete temp file:', file, unlinkError.message);
      }
    }
    console.log('🧹 Cleaned up temp files');
  } catch (cleanupError) {
    console.log('⚠️ Could not clean up temp files:', cleanupError.message);
  }
}

/**
 * Heuristic PDF text extraction: direct string scanning, Flate stream decoding,
 * and an OCR fallback built on pdf2pic, sharp and tesseract.js.
 */
export class PDFExtractor {
  /**
   * Extract text by scanning the raw PDF bytes for literal `( ... )` strings.
   *
   * Each string is extracted exactly once, in document order, and its escapes
   * (including `\(`, `\)`, `\\` and octal codes) are decoded.
   *
   * @param {Buffer} buffer - Raw PDF file contents
   * @returns {Promise<string>} - Extracted strings joined by spaces; '' on failure
   */
  static async extractText(buffer) {
    try {
      // latin1 maps every byte to exactly one character, so nothing is lost or merged.
      const pdfString = buffer.toString('latin1');
      return extractLiteralStrings(pdfString).join(' ').trim();
    } catch (error) {
      console.error('Error extracting text from PDF:', error);
      return '';
    }
  }

  /**
   * Extract text from the `stream ... endstream` sections of a PDF.
   *
   * Each stream is inflated with zlib; inflated data that looks readable is scanned
   * for literal and hexadecimal strings. Streams that cannot be inflated contribute
   * their printable ASCII characters. If no stream exists the raw file is scanned via
   * `extractText`; if the result contains too few letters, or anything throws, the
   * document is passed to `extractTextWithOCR`.
   *
   * @param {Buffer} buffer - Raw PDF file contents
   * @returns {Promise<string>} - Extracted text, or the result of the fallback used
   */
  static async extractTextFromStream(buffer) {
    try {
      const pdfString = buffer.toString('latin1');
      const streams = [...pdfString.matchAll(/stream\s*\n([\s\S]*?)\nendstream/g)];

      if (streams.length === 0) {
        console.log('No streams found, trying direct extraction');
        return this.extractText(buffer);
      }

      console.log('Found', streams.length, 'streams');
      const pieces = [];

      for (const [index, [, streamData]] of streams.entries()) {
        const streamNumber = index + 1;
        console.log(`Processing stream ${streamNumber}, length:`, streamData.length);

        let content;
        try {
          // Content-stream strings are byte-oriented; decoding the inflated bytes as latin1
          // keeps single-byte accented letters instead of turning them into U+FFFD.
          content = zlib.inflateSync(Buffer.from(streamData, 'latin1')).toString('latin1');
        } catch (decompressError) {
          console.log('Decompression failed for stream', streamNumber, ':', decompressError.message);
          // Not Flate data: keep whatever printable ASCII the raw stream contains.
          pieces.push(streamData.replace(/[^\x20-\x7E\n\r\t]/g, ' '));
          continue;
        }

        console.log('Decompressed text length:', content.length);
        const readabilityRatio = letterRatio(content);
        console.log(`Stream ${streamNumber} readability ratio:`, readabilityRatio.toFixed(3));

        if (readabilityRatio <= 0.1) {
          console.log(`Skipping stream ${streamNumber} - low readability (${readabilityRatio.toFixed(3)})`);
          continue;
        }

        console.log(`First 200 chars of decompressed:`, content.substring(0, 200));
        const strings = extractContentStrings(content);
        console.log('Found', strings.length, 'text strings in stream', streamNumber);
        pieces.push(...strings);
      }

      const result = pieces.join(' ').trim();
      console.log('Total extracted text length:', result.length);

      const meaningfulRatio = letterRatio(result);
      console.log('Meaningful text ratio:', meaningfulRatio.toFixed(3));

      // An empty result has ratio 0, so it is also routed to OCR here.
      if (meaningfulRatio < 0.3) {
        console.log('⚠️ Low meaningful text ratio - trying OCR extraction...');
        return await this.extractTextWithOCR(buffer);
      }

      return result;
    } catch (error) {
      console.error('Error extracting text from PDF stream:', error);
      return await this.extractTextWithOCR(buffer);
    }
  }

  /**
   * Preprocess image to improve OCR accuracy for blurry/scanned documents
   * @param {Buffer} imageBuffer - Image buffer to preprocess
   * @returns {Promise<Buffer>} - Preprocessed image buffer (the original buffer if preprocessing fails)
   */
  static async preprocessImage(imageBuffer) {
    try {
      return await sharp(imageBuffer)
        .greyscale() // Convert to grayscale for better OCR
        .normalize() // Enhance contrast
        .sharpen({ sigma: 1.5, m1: 1, m2: 3, x1: 3, y2: 15, y3: 15 }) // Sharpen blurry images
        .median(3) // Denoise - remove small artifacts
        .toBuffer();
    } catch (error) {
      console.warn('⚠️ Image preprocessing failed, using original:', error.message);
      return imageBuffer; // Return original if preprocessing fails
    }
  }

  /**
   * Determine optimal PSM (Page Segmentation Mode) based on image characteristics
   *
   * Simplified heuristic on the standard deviation of the normalized greyscale channel.
   *
   * @param {Buffer} imageBuffer - Image buffer to analyze
   * @returns {Promise<number>} - PSM mode (3=auto, 6=single block, 11=sparse text)
   */
  static async determinePSMMode(imageBuffer) {
    try {
      const stats = await sharp(imageBuffer)
        .greyscale()
        .normalize()
        .stats();
      const { stdev } = stats.channels[0];

      // Low deviation suggests a uniform background (single block),
      // high deviation suggests varied content.
      if (stdev < 30) {
        return 6; // Single uniform block of text
      }
      if (stdev > 60) {
        return 11; // Sparse text (scanned documents with lots of whitespace)
      }
      return 3; // Auto (default)
    } catch (error) {
      console.warn('⚠️ Could not determine PSM mode, using auto:', error.message);
      return 3; // Default to auto
    }
  }

  /**
   * Rasterize the PDF with pdf2pic and recognize each page with tesseract.js (Spanish + English).
   *
   * Pages are preprocessed with `preprocessImage`, and the page segmentation mode is chosen
   * per page by `determinePSMMode`. A page that fails is logged and skipped. The worker is
   * terminated and temp page files are removed on every exit path.
   *
   * @param {Buffer} buffer - Raw PDF file contents
   * @returns {Promise<string>} - Recognized text when more than 100 characters were extracted;
   *   otherwise a fixed Spanish placeholder message (a different one when OCR failed outright)
   */
  static async extractTextWithOCR(buffer) {
    console.log('🔍 Starting OCR extraction with enhanced preprocessing...');
    let worker = null;

    try {
      // Convert PDF to images at higher DPI for better quality
      const convert = fromBuffer(buffer, {
        density: 400,
        saveFilename: 'page',
        savePath: OCR_TEMP_DIR,
        format: 'png',
        width: 2000,
        height: 2000,
      });

      const pages = await convert.bulk(-1, { responseType: 'base64' });
      console.log(`📄 Converted PDF to ${pages.length} images for OCR at 400 DPI`);

      if (pages.length === 0) {
        throw new Error('No pages found in PDF');
      }

      worker = await createWorker('spa+eng'); // Spanish + English

      // NOTE(review): tesseract.js may only honour the engine mode when it is passed to
      // createWorker; confirm against the installed version before relying on this setting.
      await worker.setParameters({
        tessedit_ocr_engine_mode: '1', // OEM 1: LSTM engine
        tessedit_pageseg_mode: '3', // PSM 3: Auto (adjusted per page below)
        tessedit_char_whitelist: '', // No character restrictions
      });

      let ocrText = '';
      let confidenceSum = 0;
      let pagesProcessed = 0;

      for (const [index, page] of pages.entries()) {
        const pageNumber = index + 1;
        console.log(`🔍 Processing page ${pageNumber}/${pages.length} with OCR...`);

        try {
          const imageBuffer = Buffer.from(page.base64, 'base64');
          const processedImageBuffer = await this.preprocessImage(imageBuffer);

          // The segmentation mode is derived from the original, unprocessed image.
          const psmMode = await this.determinePSMMode(imageBuffer);
          await worker.setParameters({
            tessedit_pageseg_mode: String(psmMode),
          });

          const imageDataUrl = `data:image/png;base64,${processedImageBuffer.toString('base64')}`;
          const { data } = await worker.recognize(imageDataUrl);

          const words = data.words || [];
          const highConfidenceText = words
            .filter((word) => word.confidence >= MIN_WORD_CONFIDENCE)
            .map((word) => word.text)
            .join(' ');

          // Prefer the filtered text unless filtering discarded more than half of the page.
          const pageText = highConfidenceText.length > data.text.length * 0.5
            ? highConfidenceText
            : data.text;

          ocrText += `${pageText}\n\n`;

          const pageConfidence = words.length > 0
            ? words.reduce((sum, word) => sum + word.confidence, 0) / words.length
            : 0;
          confidenceSum += pageConfidence;
          pagesProcessed += 1;

          console.log(`✅ Page ${pageNumber}: Extracted ${pageText.length} chars, avg confidence: ${pageConfidence.toFixed(1)}%`);
        } catch (pageError) {
          // Continue with next page instead of failing completely
          console.error(`❌ Error processing page ${pageNumber}:`, pageError.message);
        }
      }

      const avgConfidence = pagesProcessed > 0 ? confidenceSum / pagesProcessed : 0;
      console.log(`📊 Average OCR confidence: ${avgConfidence.toFixed(1)}%`);

      const finalText = ocrText.trim();
      console.log(`✅ OCR extraction completed: ${finalText.length} total characters`);

      if (finalText.length > 100) {
        console.log('📝 First 200 chars of OCR text:', finalText.substring(0, 200));
        return finalText;
      }

      console.log('⚠️ OCR extracted very little text, falling back to document name');
      return 'Documento procesado con OCR - información limitada extraída';
    } catch (error) {
      console.error('❌ OCR extraction failed:', error);
      return 'Error en procesamiento OCR - documento requiere revisión manual';
    } finally {
      // Runs on success and failure alike, so neither the worker nor temp files leak,
      // and a failing terminate() cannot discard text that was already extracted.
      if (worker) {
        try {
          await worker.terminate();
        } catch (terminateError) {
          console.warn('⚠️ Could not terminate OCR worker:', terminateError.message);
        }
      }
      removeTempPageFiles();
    }
  }
}