initial push
This commit is contained in:
344
scripts/analyze-acta-ai.mjs
Normal file
344
scripts/analyze-acta-ai.mjs
Normal file
@@ -0,0 +1,344 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { spawn } from "node:child_process";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { readFile, unlink, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { PDFParse } from "pdf-parse";
|
||||
|
||||
// Default OpenAI-compatible API root; overridable via OPENAI_API_BASE_URL.
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
// Chat model used for extraction; overridable via OPENAI_ACTA_MODEL.
const DEFAULT_OPENAI_MODEL = process.env.OPENAI_ACTA_MODEL?.trim() || "gpt-4.1-mini";
// Tesseract language spec passed to ocrmypdf (Spanish + English).
const DEFAULT_OCR_LANGUAGE = "spa+eng";
// Below this many total characters, direct PDF text extraction is treated as unusable.
const MIN_DOC_CHARS = 200;
// Below this many characters per page, direct PDF text extraction is treated as unusable.
const MIN_CHARS_PER_PAGE = 50;
|
||||
|
||||
/**
 * Minimal .env loader: reads KEY=VALUE pairs from ./.env (when present)
 * into process.env. Supports #-comments and single- or double-quoted
 * values, and never overwrites a variable that already has a truthy value.
 */
function loadDotEnv() {
  const envFile = path.resolve(process.cwd(), ".env");
  if (!existsSync(envFile)) {
    return;
  }

  for (const rawLine of readFileSync(envFile, "utf8").split(/\r?\n/)) {
    const entry = rawLine.trim();
    if (!entry || entry.startsWith("#")) {
      continue; // blank line or comment
    }

    const eq = entry.indexOf("=");
    if (eq <= 0) {
      continue; // no "=" or empty key
    }

    const key = entry.slice(0, eq).trim();
    let value = entry.slice(eq + 1).trim();

    // Strip one layer of matching surrounding quotes, if any.
    const doubleQuoted = value.startsWith('"') && value.endsWith('"');
    const singleQuoted = value.startsWith("'") && value.endsWith("'");
    if (doubleQuoted || singleQuoted) {
      value = value.slice(1, -1);
    }

    if (key && !process.env[key]) {
      process.env[key] = value;
    }
  }
}

// Populate process.env before any configuration constants below are read.
loadDotEnv();
|
||||
|
||||
/** Print CLI usage help to stdout. */
function usage() {
  const help = "Usage: node scripts/analyze-acta-ai.mjs <path-to-pdf> [--out output.json]";
  console.log(help);
}
|
||||
|
||||
/**
 * Parse CLI arguments: the first positional is the PDF path, and an
 * optional `--out <file>` selects a JSON output path.
 * @param {string[]} argv - process.argv minus the node/script entries.
 * @returns {{ filePath: string | undefined, outPath: string | null }}
 */
function parseArgs(argv) {
  const [filePath, ...rest] = argv;
  let outPath = null;

  let cursor = 0;
  while (cursor < rest.length) {
    if (rest[cursor] === "--out") {
      // A missing value after --out leaves outPath as null.
      outPath = rest[cursor + 1] ?? null;
      cursor += 2;
    } else {
      cursor += 1;
    }
  }

  return { filePath, outPath };
}
|
||||
|
||||
/**
 * Clean raw PDF text: strip NULs, normalize newlines, drop "-- n of m --"
 * and "Página n de m" page markers, collapse intra-line whitespace, and
 * remove blank lines.
 * @param {string} rawText - Text as emitted by the PDF parser.
 * @returns {string} compact, newline-separated text.
 */
function normalizeText(rawText) {
  const withoutMarkers = rawText
    .replace(/\u0000/g, " ")
    .replace(/\r\n?/g, "\n")
    .replace(/\n?\s*--\s*\d+\s*of\s*\d+\s*--\s*\n?/gi, "\n")
    .replace(/\n?\s*p[áa]gina\s+\d+\s+de\s+\d+\s*\n?/gi, "\n");

  const compactLines = [];
  for (const line of withoutMarkers.split("\n")) {
    const compact = line.replace(/\s+/g, " ").trim();
    if (compact) {
      compactLines.push(compact);
    }
  }

  return compactLines.join("\n").trim();
}
|
||||
|
||||
/**
 * Extract plain text from a PDF buffer via pdf-parse.
 * @param {Buffer} buffer - Raw PDF bytes.
 * @returns {Promise<{ text: string, numPages: number }>} normalized text
 *   plus the reported page count (0 when the parser reports none).
 */
async function extractTextFromPdfBuffer(buffer) {
  const parser = new PDFParse({ data: buffer });
  try {
    const parsed = await parser.getText();
    const rawText = typeof parsed.text === "string" ? parsed.text : "";
    const pageTotal = parsed.total;
    const hasPageCount = typeof pageTotal === "number" && Number.isFinite(pageTotal);
    return {
      text: normalizeText(rawText),
      numPages: hasPageCount ? pageTotal : 0,
    };
  } finally {
    // Always release parser resources, even when getText() rejects.
    await parser.destroy().catch(() => undefined);
  }
}
|
||||
|
||||
/**
 * Decide whether extracted text is too sparse to trust, so an OCR pass is
 * warranted: either too few characters overall, or too few per page.
 * @param {string} text - Normalized extracted text.
 * @param {number} numPages - Reported page count (0 allowed).
 * @returns {boolean} true when OCR should be attempted.
 */
function shouldApplyOcr(text, numPages) {
  const charCount = text.trim().length;
  const pageCount = Math.max(numPages, 1); // avoid division by zero
  if (charCount < MIN_DOC_CHARS) {
    return true;
  }
  return charCount / pageCount < MIN_CHARS_PER_PAGE;
}
|
||||
|
||||
/**
 * Run an external command, buffering its stdout and stderr.
 * @param {string} command - Executable name or path.
 * @param {string[]} args - Command-line arguments.
 * @returns {Promise<{ stdout: string, stderr: string }>} resolves on exit
 *   code 0; rejects with a descriptive Error otherwise.
 */
function runCommand(command, args) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ["ignore", "pipe", "pipe"] });
    const captured = { stdout: "", stderr: "" };

    child.stdout.on("data", (chunk) => {
      captured.stdout += chunk.toString();
    });
    child.stderr.on("data", (chunk) => {
      captured.stderr += chunk.toString();
    });

    // Covers spawn failures such as the binary not being installed.
    child.on("error", reject);

    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(`${command} failed with code ${code}: ${captured.stderr}`));
        return;
      }
      resolve(captured);
    });
  });
}
|
||||
|
||||
/**
 * Build a unique scratch .pdf path under the OS temp directory.
 * @param {string} [prefix="acta-ai"] - Filename prefix.
 * @returns {string} absolute path; the file is not created.
 */
function createTempPdfPath(prefix = "acta-ai") {
  // Timestamp + UUID keeps names unique even across rapid calls.
  const uniqueName = [prefix, Date.now(), randomUUID()].join("-");
  return path.join(tmpdir(), `${uniqueName}.pdf`);
}
|
||||
|
||||
/**
 * OCR a PDF with ocrmypdf and extract the resulting text layer.
 *
 * Bug fix: the original invocation passed both `--skip-text` and
 * `--force-ocr`, which ocrmypdf rejects as mutually exclusive options, so
 * the OCR fallback could never succeed. Only `--force-ocr` is kept, since
 * this path runs precisely when the document has no usable text layer.
 *
 * @param {Buffer} originalBuffer - Raw PDF bytes.
 * @param {string} [lang=DEFAULT_OCR_LANGUAGE] - Tesseract language spec.
 * @returns {Promise<{ text: string, numPages: number }>} extracted text.
 */
async function runOcrAndExtractText(originalBuffer, lang = DEFAULT_OCR_LANGUAGE) {
  const inputPath = createTempPdfPath("acta-input");
  const outputPath = createTempPdfPath("acta-output");

  try {
    await writeFile(inputPath, originalBuffer);
    // --force-ocr rasterizes every page and OCRs it, replacing any text layer.
    await runCommand("ocrmypdf", ["--force-ocr", "-l", lang, inputPath, outputPath]);

    const ocrBuffer = await readFile(outputPath);
    return await extractTextFromPdfBuffer(ocrBuffer);
  } finally {
    // Best-effort cleanup of both scratch files.
    await Promise.all([
      unlink(inputPath).catch(() => undefined),
      unlink(outputPath).catch(() => undefined),
    ]);
  }
}
|
||||
|
||||
/**
 * Read a positive integer from an environment variable.
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Used when unset, non-numeric, or <= 0.
 * @returns {number} the parsed value or the fallback.
 */
function parseIntegerEnv(name, fallback) {
  const raw = (process.env[name] ?? "").trim();
  const value = Number.parseInt(raw, 10);
  if (Number.isFinite(value) && value > 0) {
    return value;
  }
  return fallback;
}
|
||||
|
||||
/**
 * Resolve the OpenAI API key from the environment, preferring
 * OPENAI_API_KEY, then API_KEY, then api_key.
 * @returns {string} the trimmed key, or "" when none is set.
 */
function getOpenAiApiKey() {
  const candidates = [process.env.OPENAI_API_KEY, process.env.API_KEY, process.env.api_key];
  for (const candidate of candidates) {
    const key = candidate?.trim();
    if (key) {
      return key;
    }
  }
  return "";
}
|
||||
|
||||
/**
 * Cap acta text at OPENAI_ACTA_MAX_CHARS characters (default 45000),
 * appending an explicit truncation marker when clipping occurs.
 * @param {string} text - Full extracted document text.
 * @returns {string} the original or clipped text.
 */
function clampActaText(text) {
  const maxChars = parseIntegerEnv("OPENAI_ACTA_MAX_CHARS", 45_000);
  if (text.length > maxChars) {
    const clipped = text.slice(0, maxChars);
    return `${clipped}\n\n[TEXT_TRUNCATED_TO_${maxChars}_CHARS]`;
  }
  return text;
}
|
||||
|
||||
/**
 * Best-effort JSON extraction from a model reply: try the raw text first,
 * then the text with markdown code fences stripped, then the outermost
 * `{...}` slice. Throws when no candidate parses.
 * @param {string} rawContent - Raw chat-completion message content.
 * @returns {*} the parsed JSON value.
 * @throws {Error} when no valid JSON can be recovered.
 */
function extractJsonObject(rawContent) {
  // JSON.parse never yields undefined, so undefined marks "did not parse".
  const tryParse = (candidate) => {
    try {
      return JSON.parse(candidate);
    } catch {
      return undefined;
    }
  };

  const trimmed = rawContent.trim();
  const direct = tryParse(trimmed);
  if (direct !== undefined) {
    return direct;
  }

  const withoutFence = trimmed
    .replace(/^```json\s*/i, "")
    .replace(/^```\s*/i, "")
    .replace(/\s*```$/i, "")
    .trim();
  const unfenced = tryParse(withoutFence);
  if (unfenced !== undefined) {
    return unfenced;
  }

  const firstBrace = withoutFence.indexOf("{");
  const lastBrace = withoutFence.lastIndexOf("}");
  if (firstBrace >= 0 && lastBrace > firstBrace) {
    // Let a parse failure here propagate: there is nothing left to try.
    return JSON.parse(withoutFence.slice(firstBrace, lastBrace + 1));
  }

  throw new Error("AI response did not contain valid JSON.");
}
|
||||
|
||||
/**
 * Send the acta text to an OpenAI-compatible chat-completions endpoint and
 * return the structured extraction.
 *
 * Configuration (all via environment): API key from OPENAI_API_KEY /
 * API_KEY / api_key, base URL from OPENAI_API_BASE_URL, request timeout
 * from OPENAI_ACTA_TIMEOUT_MS (default 60s).
 *
 * @param {string} fullText - Normalized acta text (clamped before sending).
 * @returns {Promise<object>} `{ model, usage, fields, lookupDictionary, ... }`
 *   where fields/lookupDictionary come from the model's JSON reply.
 * @throws {Error} when the key is missing, the HTTP call fails, the reply
 *   has no text content, or the JSON lacks the required keys.
 */
async function extractWithAi(fullText) {
  const apiKey = getOpenAiApiKey();
  if (!apiKey) {
    throw new Error("OpenAI API key is missing (OPENAI_API_KEY or api_key).");
  }

  // Strip any trailing slashes so `${baseUrl}/chat/completions` is well-formed.
  const baseUrl = (process.env.OPENAI_API_BASE_URL?.trim() || DEFAULT_OPENAI_BASE_URL).replace(/\/+$/, "");
  const timeoutMs = parseIntegerEnv("OPENAI_ACTA_TIMEOUT_MS", 60_000);
  const controller = new AbortController();
  // Abort the fetch when the deadline passes; cleared in the finally below.
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  // Spanish prompts are part of the runtime contract — do not translate.
  const systemPrompt = [
    "Eres analista legal experto en actas constitutivas mexicanas.",
    "Devuelve exclusivamente JSON valido, sin markdown.",
    "Si un campo no aparece, usa null.",
    "No inventes datos.",
    "rfc en fields debe ser null.",
    "lookupDictionary.version debe ser exactamente 'mx_acta_constitutiva_reference_v1'.",
  ].join(" ");

  const userPrompt = [
    "Extrae un objeto JSON con dos claves: fields y lookupDictionary.",
    "fields: name, rfc, legalRepresentative, incorporationDate, deedNumber, notaryName, fiscalAddress, businessPurpose, stateOfIncorporation.",
    "lookupDictionary: estructura completa del diccionario de acta.",
    "Texto:",
    clampActaText(fullText),
  ].join("\n\n");

  try {
    const response = await fetch(`${baseUrl}/chat/completions`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model: DEFAULT_OPENAI_MODEL,
        temperature: 0,
        // Forces the model to emit a single JSON object.
        response_format: { type: "json_object" },
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: userPrompt },
        ],
      }),
      signal: controller.signal,
    });

    // Non-JSON error bodies fall back to {} so the status check below still runs.
    const payload = await response.json().catch(() => ({}));
    if (!response.ok) {
      const message = payload?.error?.message ? ` ${payload.error.message}` : "";
      throw new Error(`OpenAI request failed with ${response.status}.${message}`);
    }

    const content = payload?.choices?.[0]?.message?.content;
    if (typeof content !== "string" || !content.trim()) {
      throw new Error("OpenAI did not return text content.");
    }

    const parsed = extractJsonObject(content);
    if (!parsed || typeof parsed !== "object" || !parsed.fields || !parsed.lookupDictionary) {
      throw new Error("OpenAI JSON missing required keys: fields/lookupDictionary.");
    }

    return {
      model: payload?.model ?? DEFAULT_OPENAI_MODEL,
      usage: {
        promptTokens: payload?.usage?.prompt_tokens ?? null,
        completionTokens: payload?.usage?.completion_tokens ?? null,
        totalTokens: payload?.usage?.total_tokens ?? null,
      },
      // Spreads fields/lookupDictionary (and any extra keys) into the result.
      ...parsed,
    };
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Extract text from a PDF, preferring the embedded text layer and falling
 * back to OCR when direct extraction fails or yields too little text.
 * @param {string} filePath - Path to the PDF file.
 * @returns {Promise<{ text: string, methodUsed: string, numPages: number, warnings: string[] }>}
 * @throws {Error} when OCR also fails to produce usable text.
 */
async function analyzePdfWithOcrFallback(filePath) {
  const buffer = await readFile(filePath);
  const warnings = [];

  let direct;
  try {
    direct = await extractTextFromPdfBuffer(buffer);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    warnings.push(`Direct extraction failed: ${reason}`);
  }

  const directIsUsable = direct !== undefined && !shouldApplyOcr(direct.text, direct.numPages);
  if (directIsUsable) {
    return {
      text: direct.text,
      methodUsed: "direct",
      numPages: direct.numPages,
      warnings,
    };
  }

  warnings.push("Direct extraction was short; OCR attempted.");

  const ocr = await runOcrAndExtractText(buffer);
  if (shouldApplyOcr(ocr.text, ocr.numPages)) {
    throw new Error("OCR completed but usable text was still not detected.");
  }

  return {
    text: ocr.text,
    methodUsed: "ocr",
    numPages: ocr.numPages,
    warnings,
  };
}
|
||||
|
||||
/**
 * CLI entry point: parse arguments, extract the acta text (direct or OCR
 * fallback), run the AI extraction, and emit the combined result as JSON
 * to stdout or to the --out file.
 */
async function main() {
  const { filePath, outPath } = parseArgs(process.argv.slice(2));
  if (!filePath) {
    usage();
    process.exitCode = 1;
    return;
  }

  const absolute = path.resolve(filePath);
  // OCR fallback must complete before the AI call, which consumes its text.
  const analysis = await analyzePdfWithOcrFallback(absolute);
  const ai = await extractWithAi(analysis.text);

  const result = {
    ok: true,
    file: absolute,
    methodUsed: analysis.methodUsed,
    numPages: analysis.numPages,
    warnings: analysis.warnings,
    extractionEngine: "ai",
    aiModel: ai.model,
    aiUsage: ai.usage,
    fields: ai.fields,
    lookupDictionary: ai.lookupDictionary,
  };

  const serialized = JSON.stringify(result, null, 2);
  if (outPath) {
    const resolvedOut = path.resolve(outPath);
    await writeFile(resolvedOut, `${serialized}\n`, "utf8");
    console.log(`Result written to ${resolvedOut}`);
    return;
  }

  console.log(serialized);
}
|
||||
|
||||
// Run the CLI. Report failures as a plain message on stderr (no stack
// trace) and signal failure via the exit code.
main().catch((error) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user