// Kontia/pdfExtractor.js

import fs from 'fs';
import zlib from 'zlib';
import { createWorker } from 'tesseract.js';
import { fromBuffer } from 'pdf2pic';
import sharp from 'sharp';

// ---------------------------------------------------------------------------
// Module-private helpers used by PDFExtractor
// ---------------------------------------------------------------------------

/** Directory handed to pdf2pic as `savePath`; swept for leftover page images after OCR. */
const OCR_TEMP_DIR = './temp';

/** OCR words below this confidence (0-100) are dropped when building the filtered page text. */
const MIN_WORD_CONFIDENCE = 30;

/** Letters counted as "readable" by the text-quality heuristics (ASCII plus Spanish accents). */
const READABLE_LETTERS = /[a-zA-ZáéíóúñÁÉÍÓÚÑ]/g;

/** A `stream ... endstream` section; capture group 1 is the raw stream payload. */
const STREAM_SECTION = /stream\s*\n([\s\S]*?)\nendstream/g;

/**
 * A PDF literal string `( ... )` on a single line. Backslash escapes are honoured
 * (so `\)` does not end the string) and one level of balanced, unescaped inner
 * parentheses is accepted. Capture group 1 is the still-escaped string body.
 */
const LITERAL_STRING = /\(((?:\\.|[^\\()\r\n]|\((?:\\.|[^\\()\r\n])*\))*)\)/g;

/**
 * Literal string (group 1) OR hexadecimal string `<48 65 ...>` (group 2).
 * A single alternation keeps the matches in document order.
 */
const CONTENT_STRING = new RegExp(`${LITERAL_STRING.source}|<([0-9A-Fa-f\\s]+)>`, 'g');

/** Single-character escapes of PDF literal strings that map to control characters. */
const LITERAL_ESCAPES = new Map([
  ['n', '\n'],
  ['r', '\r'],
  ['t', '\t'],
  ['b', '\b'],
  ['f', '\f'],
]);

/**
 * Resolve the backslash escapes of a PDF literal string body in one pass.
 * Handles the control-character escapes, 1-3 digit octal codes, and treats any
 * other escaped character (including `(`, `)` and `\`) as that character.
 * @param {string} raw - String body without the surrounding parentheses
 * @returns {string} The unescaped text
 */
function unescapePdfLiteral(raw) {
  return raw.replace(/\\([0-7]{1,3}|.)/g, (_whole, code) => {
    if (/^[0-7]+$/.test(code)) {
      // Octal character code; only the low byte is kept.
      return String.fromCharCode(Number.parseInt(code, 8) & 0xff);
    }
    return LITERAL_ESCAPES.get(code) ?? code;
  });
}

/**
 * Decode the body of a PDF hexadecimal string to one character per byte.
 * Whitespace between digits is ignored and an odd digit count is padded with
 * a trailing zero.
 * @param {string} rawHex - Text between `<` and `>`
 * @returns {string} Decoded text ('' when there are no digits)
 */
function decodePdfHexString(rawHex) {
  const digits = rawHex.replace(/\s/g, '');
  if (digits.length === 0) {
    return '';
  }
  const padded = digits.length % 2 === 0 ? digits : `${digits}0`;
  // Buffer decoding avoids spreading a potentially huge byte array into a call.
  return Buffer.from(padded, 'hex').toString('latin1');
}

/**
 * Collect every literal and hexadecimal string of a content stream, in the
 * order they appear. Empty strings are skipped.
 * @param {string} content - Decompressed content stream text
 * @returns {string[]} Decoded strings
 */
function extractContentStrings(content) {
  const found = [];
  for (const [, literalBody, hexBody] of content.matchAll(CONTENT_STRING)) {
    const text = literalBody !== undefined
      ? unescapePdfLiteral(literalBody)
      : decodePdfHexString(hexBody);
    if (text.length > 0) {
      found.push(text);
    }
  }
  return found;
}

/**
 * Fraction of characters in `text` that are readable letters.
 * @param {string} text
 * @returns {number} Ratio in [0, 1]; 0 for empty text
 */
function letterRatio(text) {
  const letters = text.match(READABLE_LETTERS);
  return letters ? letters.length / text.length : 0;
}

/**
 * Best-effort removal of files whose name starts with `page` from the OCR temp
 * directory. Never throws; failures are only logged.
 */
function removeOcrTempFiles() {
  try {
    if (!fs.existsSync(OCR_TEMP_DIR)) {
      return;
    }
    const leftovers = fs.readdirSync(OCR_TEMP_DIR).filter((file) => file.startsWith('page'));
    for (const file of leftovers) {
      try {
        fs.unlinkSync(`${OCR_TEMP_DIR}/${file}`);
      } catch (unlinkError) {
        // One undeletable file must not stop the sweep.
        console.log(`⚠️ Could not delete temp file ${file}:`, unlinkError.message);
      }
    }
    console.log('🧹 Cleaned up temp files');
  } catch (cleanupError) {
    console.log('⚠️ Could not clean up temp files:', cleanupError.message);
  }
}

/**
 * Heuristic text extraction for PDF buffers.
 *
 * Three strategies are offered, from cheapest to most expensive:
 *  - `extractText`: scan the raw bytes for literal strings;
 *  - `extractTextFromStream`: inflate `stream` sections and read their strings,
 *    falling back to OCR when the result does not look like text;
 *  - `extractTextWithOCR`: rasterise the pages and run Tesseract on them.
 *
 * None of the methods throw; failures are logged and reported through the
 * return value.
 */
export class PDFExtractor {
  /**
   * Extract the literal strings `( ... )` found anywhere in the raw PDF bytes.
   * Each string is returned once, in file order, with its backslash escapes
   * resolved; strings are joined with single spaces.
   * @param {Buffer} buffer - Raw PDF bytes
   * @returns {Promise<string>} Extracted text, or '' when nothing is found or on error
   */
  static async extractText(buffer) {
    try {
      // latin1 maps every byte to exactly one character, so nothing is lost or merged.
      const pdfString = buffer.toString('latin1');

      const pieces = [];
      for (const [, literalBody] of pdfString.matchAll(LITERAL_STRING)) {
        pieces.push(unescapePdfLiteral(literalBody));
      }

      return pieces.join(' ').trim();
    } catch (error) {
      console.error('Error extracting text from PDF:', error);
      return '';
    }
  }

  /**
   * Extract text from the `stream ... endstream` sections of a PDF.
   *
   * Each stream is inflated with zlib. Streams whose inflated content has a
   * letter ratio above 0.1 are scanned for literal and hexadecimal strings;
   * streams that cannot be inflated contribute their printable ASCII as-is.
   * If the combined text has a letter ratio below 0.3 (which includes the
   * empty result) the method falls back to `extractTextWithOCR`.
   *
   * @param {Buffer} buffer - Raw PDF bytes
   * @returns {Promise<string>} Extracted text, or the result of the fallback extractor
   */
  static async extractTextFromStream(buffer) {
    try {
      const pdfString = buffer.toString('latin1');
      const sections = [...pdfString.matchAll(STREAM_SECTION)];

      if (sections.length === 0) {
        console.log('No streams found, trying direct extraction');
        return this.extractText(buffer);
      }

      console.log('Found', sections.length, 'streams');
      const pieces = [];

      for (const [index, [, streamData]] of sections.entries()) {
        const streamNumber = index + 1;
        try {
          console.log(`Processing stream ${streamNumber}, length:`, streamData.length);

          // Try to decompress with zlib (FlateDecode)
          let content;
          try {
            const inflated = zlib.inflateSync(Buffer.from(streamData, 'latin1'));
            // Decode byte-per-character, like the outer file: content-stream strings
            // are single-byte encoded, and utf8 decoding would turn accented letters
            // into U+FFFD.
            content = inflated.toString('latin1');
          } catch (decompressError) {
            console.log('Decompression failed for stream', streamNumber, ':', decompressError.message);
            // Not deflate data: keep whatever printable ASCII the stream holds.
            pieces.push(streamData.replace(/[^\x20-\x7E\n\r\t]/g, ' '));
            continue;
          }

          console.log('Decompressed text length:', content.length);

          const readabilityRatio = letterRatio(content);
          console.log(`Stream ${streamNumber} readability ratio:`, readabilityRatio.toFixed(3));

          // Only process if it has reasonable readability
          if (readabilityRatio <= 0.1) {
            console.log(`Skipping stream ${streamNumber} - low readability (${readabilityRatio.toFixed(3)})`);
            continue;
          }

          console.log('First 200 chars of decompressed:', content.substring(0, 200));

          const strings = extractContentStrings(content);
          console.log('Found', strings.length, 'strings in stream', streamNumber);
          pieces.push(...strings);
        } catch (streamError) {
          // One broken stream must not abort the whole document.
          console.log('Stream processing failed for stream', streamNumber, ':', streamError.message);
        }
      }

      const result = pieces.join(' ').trim();
      console.log('Total extracted text length:', result.length);

      // Check if we got meaningful text
      const meaningfulRatio = letterRatio(result);
      console.log('Meaningful text ratio:', meaningfulRatio.toFixed(3));

      if (meaningfulRatio < 0.3) {
        console.log('⚠️ Low meaningful text ratio - trying OCR extraction...');
        return await this.extractTextWithOCR(buffer);
      }

      // A ratio of at least 0.3 implies the result is non-empty.
      return result;
    } catch (error) {
      console.error('Error extracting text from PDF stream:', error);
      return await this.extractTextWithOCR(buffer);
    }
  }

  /**
   * Preprocess image to improve OCR accuracy for blurry/scanned documents
   * @param {Buffer} imageBuffer - Image buffer to preprocess
   * @returns {Promise<Buffer>} - Preprocessed image buffer, or the original buffer if preprocessing fails
   */
  static async preprocessImage(imageBuffer) {
    try {
      return await sharp(imageBuffer)
        .greyscale() // Convert to grayscale for better OCR
        .normalize() // Enhance contrast
        .sharpen({ sigma: 1.5, m1: 1, m2: 3, x1: 3, y2: 15, y3: 15 }) // Sharpen blurry images
        .median(3) // Denoise - remove small artifacts
        .toBuffer();
    } catch (error) {
      console.warn('⚠️ Image preprocessing failed, using original:', error.message);
      return imageBuffer; // Return original if preprocessing fails
    }
  }

  /**
   * Determine optimal PSM (Page Segmentation Mode) based on image characteristics
   *
   * The decision uses the standard deviation of the first channel reported by
   * sharp: below 30 selects 6, above 60 selects 11, anything else selects 3.
   *
   * @param {Buffer} imageBuffer - Image buffer to analyze
   * @returns {Promise<number>} - PSM mode (3=auto, 6=single block, 11=sparse text)
   */
  static async determinePSMMode(imageBuffer) {
    try {
      // NOTE(review): sharp's stats() may be computed from the input image rather
      // than from the greyscale/normalize pipeline - confirm against the sharp docs.
      const stats = await sharp(imageBuffer)
        .greyscale()
        .normalize()
        .stats();

      // sharp exposes the per-channel standard deviation as `stdev`.
      const { stdev } = stats.channels[0];

      if (stdev < 30) {
        return 6; // Low variation: treat the page as a single uniform block of text
      }
      if (stdev > 60) {
        return 11; // High variation: sparse text (scanned documents with lots of whitespace)
      }

      return 3; // Auto (default)
    } catch (error) {
      console.warn('⚠️ Could not determine PSM mode, using auto:', error.message);
      return 3; // Default to auto
    }
  }

  /**
   * Rasterise every page of the PDF and recognise its text with Tesseract
   * (Spanish + English).
   *
   * Pages are processed one by one; a page that fails is logged and skipped.
   * The worker is terminated and the temp directory is swept whether or not
   * the extraction succeeds.
   *
   * @param {Buffer} buffer - Raw PDF bytes
   * @returns {Promise<string>} The recognised text when more than 100 characters
   *   were extracted; otherwise a fixed Spanish placeholder message. A different
   *   fixed message is returned when the OCR pipeline itself fails.
   */
  static async extractTextWithOCR(buffer) {
    console.log('🔍 Starting OCR extraction with enhanced preprocessing...');
    let worker = null;

    try {
      // Convert PDF to images at higher DPI for better quality
      const convert = fromBuffer(buffer, {
        density: 400,
        saveFilename: 'page',
        savePath: OCR_TEMP_DIR,
        format: 'png',
        width: 2000,
        height: 2000,
      });

      // -1 converts every page
      const pages = await convert.bulk(-1, { responseType: 'base64' });
      console.log(`📄 Converted PDF to ${pages.length} images for OCR at 400 DPI`);

      if (pages.length === 0) {
        throw new Error('No pages found in PDF');
      }

      worker = await createWorker('spa+eng'); // Spanish + English

      // NOTE(review): some tesseract.js versions only accept the engine mode at
      // worker creation - confirm this parameter takes effect here.
      await worker.setParameters({
        tessedit_ocr_engine_mode: '1', // OEM 1: LSTM engine
        tessedit_pageseg_mode: '3', // PSM 3: Auto (adjusted per page below)
        tessedit_char_whitelist: '', // No character restrictions
      });

      let ocrText = '';
      let totalConfidence = 0;
      let pagesProcessed = 0;

      for (const [index, page] of pages.entries()) {
        const pageNumber = index + 1;
        console.log(`🔍 Processing page ${pageNumber}/${pages.length} with OCR...`);

        try {
          const imageBuffer = Buffer.from(page.base64, 'base64');

          // Preprocess image to improve OCR accuracy
          const processedImageBuffer = await this.preprocessImage(imageBuffer);

          // Determine optimal PSM mode for this page (use original for analysis)
          const psmMode = await this.determinePSMMode(imageBuffer);
          await worker.setParameters({
            tessedit_pageseg_mode: String(psmMode),
          });

          // tesseract.js takes the image as a base64 data URL
          const imageDataUrl = `data:image/png;base64,${processedImageBuffer.toString('base64')}`;
          const { data } = await worker.recognize(imageDataUrl);

          const words = data.words || [];
          const highConfidenceText = words
            .filter((word) => word.confidence >= MIN_WORD_CONFIDENCE)
            .map((word) => word.text)
            .join(' ');

          // Prefer the filtered text unless filtering removed more than half of the page.
          const pageText = highConfidenceText.length > data.text.length * 0.5
            ? highConfidenceText
            : data.text;

          ocrText += `${pageText}\n\n`;

          // Track confidence for reporting
          const pageConfidence = words.length > 0
            ? words.reduce((sum, word) => sum + word.confidence, 0) / words.length
            : 0;
          totalConfidence += pageConfidence;
          pagesProcessed += 1;

          console.log(`✅ Page ${pageNumber}: Extracted ${pageText.length} chars, avg confidence: ${pageConfidence.toFixed(1)}%`);
        } catch (pageError) {
          // Continue with next page instead of failing completely
          console.error(`❌ Error processing page ${pageNumber}:`, pageError.message);
        }
      }

      const avgConfidence = pagesProcessed > 0 ? totalConfidence / pagesProcessed : 0;
      console.log(`📊 Average OCR confidence: ${avgConfidence.toFixed(1)}%`);

      const finalText = ocrText.trim();
      console.log(`✅ OCR extraction completed: ${finalText.length} total characters`);

      if (finalText.length > 100) {
        console.log('📝 First 200 chars of OCR text:', finalText.substring(0, 200));
        return finalText;
      }

      console.log('⚠️ OCR extracted very little text, falling back to document name');
      return 'Documento procesado con OCR - información limitada extraída';
    } catch (error) {
      console.error('❌ OCR extraction failed:', error);
      return 'Error en procesamiento OCR - documento requiere revisión manual';
    } finally {
      // Release the worker and the temp images on every exit path. A failing
      // terminate() must not replace text that was already recognised.
      if (worker) {
        try {
          await worker.terminate();
        } catch (terminateError) {
          console.warn('⚠️ Could not terminate OCR worker:', terminateError.message);
        }
      }
      removeOcrTempFiles();
    }
  }
}