Files
Kontia/pdfExtractor.js
Marcelo Dares 65aaf9275e initial push
2026-03-15 15:03:56 +01:00

367 lines
14 KiB
JavaScript

import fs from 'fs';
import zlib from 'zlib';
import { createWorker } from 'tesseract.js';
import { fromBuffer } from 'pdf2pic';
import sharp from 'sharp';
/**
 * Regex source matching one PDF literal string `( ... )`.
 * Backslash escapes are consumed as pairs, so an escaped `\)` does not end the
 * string. Unescaped nested parentheses are not supported; the innermost
 * balanced pair is matched instead.
 */
const PDF_LITERAL_STRING = String.raw`\((?:\\[\s\S]|[^\\()])*\)`;

/** Matches a literal string, or a hex string `<48656C6C6F>` (digits in group 1). */
const PDF_TEXT_TOKEN = new RegExp(`${PDF_LITERAL_STRING}|<([0-9A-Fa-f]+)>`, 'g');

/** Letters counted as "readable" by the extraction heuristics (ASCII + Spanish accents). */
const LETTER_PATTERN = /[a-zA-ZáéíóúñÁÉÍÓÚÑ]/g;

/** Single-character escapes that map to a control character inside a literal string. */
const SIMPLE_ESCAPES = new Map([
  ['n', '\n'],
  ['r', '\r'],
  ['t', '\t'],
  ['b', '\b'],
  ['f', '\f'],
]);

/**
 * Decode the escape sequences of a PDF literal string body (text between the
 * outer parentheses) in a single pass, so `\\n` yields a backslash followed by
 * `n` rather than a newline.
 * @param {string} raw - String body with escapes still in place
 * @returns {string} Decoded text
 */
function decodePdfLiteral(raw) {
  return raw.replace(/\\([0-7]{1,3}|\r\n|[\s\S])/g, (whole, escaped) => {
    const first = escaped.charCodeAt(0);
    if (first >= 0x30 && first <= 0x37) {
      // Octal character code, e.g. \361 -> "ñ"; high-order overflow is dropped.
      return String.fromCharCode(Number.parseInt(escaped, 8) & 0xff);
    }
    if (escaped === '\r\n' || escaped === '\n' || escaped === '\r') {
      return ''; // Backslash + end-of-line is a line continuation.
    }
    // \( \) \\ and unknown escapes resolve to the escaped character itself.
    return SIMPLE_ESCAPES.has(escaped) ? SIMPLE_ESCAPES.get(escaped) : escaped;
  });
}

/**
 * Decode the digits of a PDF hex string into one character per byte.
 * Odd-length input is returned unchanged.
 * @param {string} hex - Hex digits without the surrounding `<` `>`
 * @returns {string} Decoded text
 */
function decodeHexString(hex) {
  return hex.length % 2 === 0 ? Buffer.from(hex, 'hex').toString('latin1') : hex;
}

/**
 * Collect every literal string in `content`, decoded, in document order.
 * @param {string} content - PDF data decoded as latin1
 * @returns {string[]} Decoded strings
 */
function extractLiteralStrings(content) {
  const pattern = new RegExp(PDF_LITERAL_STRING, 'g');
  return Array.from(content.matchAll(pattern), (match) => decodePdfLiteral(match[0].slice(1, -1)));
}

/**
 * Collect every literal and hex string in `content`, decoded, in document order.
 * @param {string} content - Decompressed stream content
 * @returns {string[]} Decoded strings
 */
function extractTextTokens(content) {
  return Array.from(content.matchAll(PDF_TEXT_TOKEN), (match) => (
    match[1] === undefined
      ? decodePdfLiteral(match[0].slice(1, -1))
      : decodeHexString(match[1])
  ));
}

/**
 * Fraction of characters in `text` that are letters (0 for an empty string).
 * @param {string} text
 * @returns {number} Value between 0 and 1
 */
function letterRatio(text) {
  const letters = text.match(LETTER_PATTERN);
  return letters ? letters.length / text.length : 0;
}

/**
 * Best-effort removal of files whose name starts with "page" from `dir`.
 * Never throws: a failed deletion must not mask the OCR result.
 * @param {string} [dir='./temp'] - Directory to clean
 */
function removeTempPageFiles(dir = './temp') {
  try {
    if (!fs.existsSync(dir)) {
      return;
    }
    const pageFiles = fs.readdirSync(dir).filter((file) => file.startsWith('page'));
    for (const file of pageFiles) {
      try {
        fs.unlinkSync(`${dir}/${file}`);
      } catch (unlinkError) {
        // Deliberately ignored: one undeletable file should not stop the sweep.
      }
    }
    console.log('🧹 Cleaned up temp files');
  } catch (cleanupError) {
    console.log('⚠️ Could not clean up temp files:', cleanupError.message);
  }
}

/**
 * Heuristic text extraction from PDF buffers, with an OCR fallback.
 *
 * This is not a PDF parser: it scans the raw bytes for string objects and
 * FlateDecode streams, and falls back to rasterising the pages and running
 * Tesseract when little readable text is found.
 */
export class PDFExtractor {
  /**
   * Extract the literal strings `( ... )` found anywhere in the raw PDF bytes.
   * Each string is emitted once, in document order, separated by spaces.
   * @param {Buffer} buffer - Raw PDF file contents
   * @returns {Promise<string>} Extracted text, or '' when extraction fails
   */
  static async extractText(buffer) {
    try {
      // latin1 maps every byte to one character, so no byte is lost.
      const pdfString = buffer.toString('latin1');
      return extractLiteralStrings(pdfString).join(' ').trim();
    } catch (error) {
      console.error('Error extracting text from PDF:', error);
      return '';
    }
  }

  /**
   * Extract text from the `stream ... endstream` sections of a PDF.
   *
   * Each stream is inflated with zlib; inflated content with enough letters is
   * scanned for literal and hex strings. Streams that cannot be inflated
   * contribute their printable ASCII characters instead. If the combined
   * result has a letter ratio below 0.3, OCR is used.
   * @param {Buffer} buffer - Raw PDF file contents
   * @returns {Promise<string>} Extracted text (or the OCR result / status message)
   */
  static async extractTextFromStream(buffer) {
    try {
      const pdfString = buffer.toString('latin1');
      const streams = [...pdfString.matchAll(/stream\s*\n([\s\S]*?)\nendstream/g)];
      if (streams.length === 0) {
        console.log('No streams found, trying direct extraction');
        return this.extractText(buffer);
      }
      console.log('Found', streams.length, 'streams');

      const parts = [];
      for (const [index, streamMatch] of streams.entries()) {
        const streamNumber = index + 1;
        const streamData = streamMatch[1];
        try {
          console.log(`Processing stream ${streamNumber}, length:`, streamData.length);

          let decompressedText;
          try {
            // The string came from a latin1 decode, so latin1 restores the original bytes.
            const streamBuffer = Buffer.from(streamData, 'latin1');
            // NOTE(review): content-stream bytes are font-encoded, not necessarily
            // UTF-8; bytes >= 0x80 may decode to U+FFFD here — confirm intent.
            decompressedText = zlib.inflateSync(streamBuffer).toString('utf8');
          } catch (decompressError) {
            console.log('Decompression failed for stream', streamNumber, ':', decompressError.message);
            // Not FlateDecode (or corrupt): keep whatever printable ASCII it holds.
            parts.push(streamData.replace(/[^\x20-\x7E\n\r\t]/g, ' '));
            continue;
          }
          console.log('Decompressed text length:', decompressedText.length);

          const readabilityRatio = letterRatio(decompressedText);
          console.log(`Stream ${streamNumber} readability ratio:`, readabilityRatio.toFixed(3));
          if (readabilityRatio <= 0.1) {
            console.log(`Skipping stream ${streamNumber} - low readability (${readabilityRatio.toFixed(3)})`);
            continue;
          }
          console.log(`First 200 chars of decompressed:`, decompressedText.substring(0, 200));

          // One pass keeps literal and hex strings in document order and emits
          // each exactly once (array brackets add no text of their own).
          const tokens = extractTextTokens(decompressedText);
          console.log('Found', tokens.length, 'text strings in stream', streamNumber);
          parts.push(...tokens);
        } catch (streamError) {
          console.log('Stream processing failed for stream', streamNumber, ':', streamError.message);
        }
      }

      const result = parts.join(' ').trim();
      console.log('Total extracted text length:', result.length);

      const meaningfulRatio = letterRatio(result);
      console.log('Meaningful text ratio:', meaningfulRatio.toFixed(3));
      if (meaningfulRatio < 0.3) {
        console.log('⚠️ Low meaningful text ratio - trying OCR extraction...');
        return await this.extractTextWithOCR(buffer);
      }
      // A ratio >= 0.3 implies at least one letter, so result is non-empty here.
      return result;
    } catch (error) {
      console.error('Error extracting text from PDF stream:', error);
      return await this.extractTextWithOCR(buffer);
    }
  }

  /**
   * Preprocess image to improve OCR accuracy for blurry/scanned documents
   * @param {Buffer} imageBuffer - Image buffer to preprocess
   * @returns {Promise<Buffer>} - Preprocessed image buffer (the original buffer if preprocessing fails)
   */
  static async preprocessImage(imageBuffer) {
    try {
      return await sharp(imageBuffer)
        .greyscale() // Convert to grayscale for better OCR
        .normalize() // Enhance contrast
        .sharpen({ sigma: 1.5, m1: 1, m2: 3, x1: 3, y2: 15, y3: 15 }) // Sharpen blurry images
        .median(3) // Denoise - remove small artifacts
        .toBuffer();
    } catch (error) {
      console.warn('⚠️ Image preprocessing failed, using original:', error.message);
      return imageBuffer; // Return original if preprocessing fails
    }
  }

  /**
   * Determine optimal PSM (Page Segmentation Mode) based on image characteristics
   * @param {Buffer} imageBuffer - Image buffer to analyze
   * @returns {Promise<number>} - PSM mode (3=auto, 6=single block, 11=sparse text)
   */
  static async determinePSMMode(imageBuffer) {
    try {
      // NOTE(review): sharp documents stats() as derived from the original input
      // image, so greyscale()/normalize() may have no effect on these numbers —
      // confirm before re-tuning the thresholds below.
      const stats = await sharp(imageBuffer)
        .greyscale()
        .normalize()
        .stats();
      const stdDev = stats.channels[0].stDev;

      // Low deviation  -> uniform page, treat as a single block of text.
      // High deviation -> varied content, treat as sparse text.
      if (stdDev < 30) {
        return 6; // Single uniform block of text
      }
      if (stdDev > 60) {
        return 11; // Sparse text (scanned documents with lots of whitespace)
      }
      return 3; // Auto (default)
    } catch (error) {
      console.warn('⚠️ Could not determine PSM mode, using auto:', error.message);
      return 3; // Default to auto
    }
  }

  /**
   * Rasterise every page of the PDF and recognise its text with Tesseract
   * (Spanish + English).
   *
   * Pages that fail are skipped. The worker is terminated and temp files are
   * removed on every exit path, so a cleanup failure never discards text that
   * was already recognised.
   * @param {Buffer} buffer - Raw PDF file contents
   * @returns {Promise<string>} Recognised text, or a fixed status message when
   *   fewer than 101 characters were recognised or OCR failed
   */
  static async extractTextWithOCR(buffer) {
    console.log('🔍 Starting OCR extraction with enhanced preprocessing...');
    let worker = null;
    try {
      // Convert PDF to images at higher DPI for better quality
      const convert = fromBuffer(buffer, {
        density: 400, // Increased from 300 to 400 for better OCR accuracy
        saveFilename: "page",
        savePath: "./temp",
        format: "png",
        width: 2000,
        height: 2000
      });
      const result = await convert.bulk(-1, { responseType: "base64" });
      console.log(`📄 Converted PDF to ${result.length} images for OCR at 400 DPI`);
      if (result.length === 0) {
        throw new Error('No pages found in PDF');
      }

      worker = await createWorker('spa+eng'); // Spanish + English
      await worker.setParameters({
        tessedit_ocr_engine_mode: '1', // OEM 1: LSTM engine (best for accuracy)
        tessedit_pageseg_mode: '3', // PSM 3: Auto (will be adjusted per page)
        tessedit_char_whitelist: '', // No character restrictions
      });

      const minConfidence = 30; // Minimum word confidence kept (0-100)
      let ocrText = '';
      let totalConfidence = 0;
      let pagesProcessed = 0;

      // One worker is shared, so pages are recognised sequentially.
      for (let i = 0; i < result.length; i++) {
        console.log(`🔍 Processing page ${i + 1}/${result.length} with OCR...`);
        try {
          const imageBuffer = Buffer.from(result[i].base64, 'base64');
          const processedImageBuffer = await this.preprocessImage(imageBuffer);
          // PSM is chosen from the original image, not the preprocessed one.
          const psmMode = await this.determinePSMMode(imageBuffer);
          await worker.setParameters({
            tessedit_pageseg_mode: String(psmMode)
          });

          const processedBase64 = processedImageBuffer.toString('base64');
          const imageDataUrl = `data:image/png;base64,${processedBase64}`;
          const { data } = await worker.recognize(imageDataUrl);

          const fullText = data.text || '';
          const words = data.words || [];
          const highConfidenceText = words
            .filter((word) => word.confidence >= minConfidence)
            .map((word) => word.text)
            .join(' ');
          // Prefer the filtered text unless filtering removed more than half the page.
          const pageText = highConfidenceText.length > fullText.length * 0.5
            ? highConfidenceText
            : fullText;
          ocrText += pageText + '\n\n';

          const avgConfidence = words.length > 0
            ? words.reduce((sum, w) => sum + w.confidence, 0) / words.length
            : 0;
          totalConfidence += avgConfidence;
          pagesProcessed++;
          console.log(`✅ Page ${i + 1}: Extracted ${pageText.length} chars, avg confidence: ${avgConfidence.toFixed(1)}%`);
        } catch (pageError) {
          console.error(`❌ Error processing page ${i + 1}:`, pageError.message);
          // Continue with next page instead of failing completely
        }
      }

      const avgConfidence = pagesProcessed > 0 ? totalConfidence / pagesProcessed : 0;
      console.log(`📊 Average OCR confidence: ${avgConfidence.toFixed(1)}%`);

      const finalText = ocrText.trim();
      console.log(`✅ OCR extraction completed: ${finalText.length} total characters`);
      if (finalText.length > 100) {
        console.log('📝 First 200 chars of OCR text:', finalText.substring(0, 200));
        return finalText;
      }
      console.log('⚠️ OCR extracted very little text, falling back to document name');
      return 'Documento procesado con OCR - información limitada extraída';
    } catch (error) {
      console.error('❌ OCR extraction failed:', error);
      return 'Error en procesamiento OCR - documento requiere revisión manual';
    } finally {
      // Runs on success and failure alike so neither the worker nor temp files leak.
      if (worker) {
        try {
          await worker.terminate();
        } catch (terminateError) {
          // Ignored: a failed shutdown must not replace the value being returned.
        }
      }
      removeTempPageFiles('./temp');
    }
  }
}