Files
Kontia/scripts/analyze-acta-ai.mjs
Marcelo Dares 65aaf9275e initial push
2026-03-15 15:03:56 +01:00

345 lines
9.8 KiB
JavaScript

#!/usr/bin/env node
import { randomUUID } from "node:crypto";
import { spawn } from "node:child_process";
import { existsSync, readFileSync } from "node:fs";
import { readFile, unlink, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import path from "node:path";
import { PDFParse } from "pdf-parse";
// API base URL used by extractWithAi() when OPENAI_API_BASE_URL is not set.
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
// Chat model name. OPENAI_ACTA_MODEL is read from process.env right here, at module
// load, which happens before the loadDotEnv() call further down this file runs, so a
// value that exists only in .env is not reflected in this constant.
const DEFAULT_OPENAI_MODEL = process.env.OPENAI_ACTA_MODEL?.trim() || "gpt-4.1-mini";
// Default value for the language argument passed to ocrmypdf via "-l".
const DEFAULT_OCR_LANGUAGE = "spa+eng";
// Thresholds used by shouldApplyOcr(): text below either the whole-document
// character count or the per-page average makes the caller fall back to OCR.
const MIN_DOC_CHARS = 200;
const MIN_CHARS_PER_PAGE = 50;
/**
 * Minimal .env loader: reads `<cwd>/.env` (if present) and copies each
 * `KEY=value` entry into process.env.
 *
 * - Blank lines, lines starting with "#", and lines without a key before "=" are skipped.
 * - A value wrapped in matching double or single quotes has the quotes removed.
 * - A variable that already holds a non-empty value in process.env is left untouched.
 */
function loadDotEnv() {
  const dotEnvFile = path.resolve(process.cwd(), ".env");
  if (existsSync(dotEnvFile)) {
    readFileSync(dotEnvFile, "utf8")
      .split(/\r?\n/)
      .map((rawLine) => rawLine.trim())
      .filter((entry) => entry !== "" && !entry.startsWith("#"))
      .forEach((entry) => {
        const equalsAt = entry.indexOf("=");
        if (equalsAt < 1) {
          // No "=" at all, or nothing in front of it: not a usable assignment.
          return;
        }
        const name = entry.slice(0, equalsAt).trim();
        const rawValue = entry.slice(equalsAt + 1).trim();
        const isQuoted = ['"', "'"].some(
          (quote) => rawValue.startsWith(quote) && rawValue.endsWith(quote),
        );
        const value = isQuoted ? rawValue.slice(1, -1) : rawValue;
        if (name && !process.env[name]) {
          process.env[name] = value;
        }
      });
  }
}
// Populate process.env from .env as soon as the module is evaluated.
loadDotEnv();
/** Prints the one-line command synopsis to stdout. */
function usage() {
  const synopsis = "Usage: node scripts/analyze-acta-ai.mjs <path-to-pdf> [--out output.json]";
  console.log(synopsis);
}
/**
 * Parses the command-line arguments that follow the script name.
 *
 * The input PDF is the first argument that is not part of an `--out <path>`
 * pair, so `--out` may appear before or after the PDF path. When `--out` is
 * given several times the last one wins; `--out` without a following value
 * yields `outPath: null`.
 *
 * @param {string[]} argv - Arguments without the node/script prefix.
 * @returns {{ filePath: string | undefined, outPath: string | null }}
 */
function parseArgs(argv) {
  const args = [...argv];
  let filePath;
  let outPath = null;
  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index];
    if (arg === "--out") {
      outPath = args[index + 1] ?? null;
      // Skip the value so it is not mistaken for the positional PDF path.
      index += 1;
    } else if (filePath === undefined) {
      filePath = arg;
    }
  }
  return { filePath, outPath };
}
/**
 * Cleans text extracted from a PDF.
 *
 * NUL characters become spaces, CR/CRLF become LF, page markers of the form
 * "-- N of M --" and "página N de M" are removed, whitespace runs inside a
 * line collapse to one space, and empty lines are dropped.
 *
 * @param {string} rawText - Text as returned by the PDF parser.
 * @returns {string} Normalized text with lines joined by "\n".
 */
function normalizeText(rawText) {
  let working = rawText.replace(/\u0000/g, " ");
  working = working.replace(/\r\n?/g, "\n");
  working = working.replace(/\n?\s*--\s*\d+\s*of\s*\d+\s*--\s*\n?/gi, "\n");
  working = working.replace(/\n?\s*p[áa]gina\s+\d+\s+de\s+\d+\s*\n?/gi, "\n");

  const keptLines = [];
  for (const rawLine of working.split("\n")) {
    const collapsed = rawLine.replace(/\s+/g, " ").trim();
    if (collapsed) {
      keptLines.push(collapsed);
    }
  }
  return keptLines.join("\n").trim();
}
/**
 * Extracts and normalizes the text layer of a PDF held in memory.
 *
 * @param {*} buffer - PDF bytes, handed to PDFParse as its `data` option.
 * @returns {Promise<{ text: string, numPages: number }>} Normalized text and the
 *   page total reported by the parser (0 when it is not a finite number).
 */
async function extractTextFromPdfBuffer(buffer) {
  const pdfParser = new PDFParse({ data: buffer });
  try {
    const result = await pdfParser.getText();
    const hasStringText = typeof result.text === "string";
    const text = normalizeText(hasStringText ? result.text : "");
    const hasValidTotal = typeof result.total === "number" && Number.isFinite(result.total);
    return { text, numPages: hasValidTotal ? result.total : 0 };
  } finally {
    // Cleanup is best-effort: a failing destroy() must not mask the extraction outcome.
    await pdfParser.destroy().catch(() => undefined);
  }
}
/**
 * Decides whether extracted text is too sparse to be trusted without OCR.
 *
 * @param {string} text - Extracted text.
 * @param {number} numPages - Page count; values below 1 are treated as 1.
 * @returns {boolean} True when the trimmed text is shorter than MIN_DOC_CHARS
 *   or averages fewer than MIN_CHARS_PER_PAGE characters per page.
 */
function shouldApplyOcr(text, numPages) {
  const charCount = text.trim().length;
  if (charCount < MIN_DOC_CHARS) {
    return true;
  }
  const averagePerPage = charCount / Math.max(numPages, 1);
  return averagePerPage < MIN_CHARS_PER_PAGE;
}
/**
 * Runs an external command (no shell) and collects its output.
 *
 * @param {string} command - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @returns {Promise<{ stdout: string, stderr: string }>} Resolves when the
 *   process exits with code 0.
 * @throws {Error} Rejects with the spawn error when the process cannot be
 *   started, or with an Error containing the exit code (or terminating signal)
 *   and captured stderr when it does not exit with code 0.
 */
function runCommand(command, args) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ["ignore", "pipe", "pipe"] });
    // Decode through the stream so a multi-byte UTF-8 character that is split
    // across two chunks is reassembled instead of turning into U+FFFD.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    child.on("close", (code, signal) => {
      if (code === 0) {
        resolve({ stdout, stderr });
        return;
      }
      // code is null when the child was killed by a signal; report the signal instead.
      const reason = code === null ? `was terminated by signal ${signal}` : `failed with code ${code}`;
      reject(new Error(`${command} ${reason}: ${stderr}`));
    });
  });
}
/**
 * Builds a unique path for a temporary PDF inside the OS temp directory.
 * No file is created.
 *
 * @param {string} [prefix="acta-ai"] - Leading part of the file name.
 * @returns {string} `<tmpdir>/<prefix>-<epoch ms>-<uuid>.pdf`
 */
function createTempPdfPath(prefix = "acta-ai") {
  const timestamp = Date.now();
  const uniqueId = randomUUID();
  const fileName = `${prefix}-${timestamp}-${uniqueId}.pdf`;
  return path.join(tmpdir(), fileName);
}
/**
 * Runs the PDF through the `ocrmypdf` command-line tool and extracts the text
 * of the OCR'd result.
 *
 * The input is written to a temporary file, ocrmypdf writes a second temporary
 * file, and both are removed afterwards whether or not OCR succeeded.
 *
 * @param {*} originalBuffer - PDF bytes to OCR (written with writeFile).
 * @param {string} [lang=DEFAULT_OCR_LANGUAGE] - Value for ocrmypdf's `-l` option.
 * @returns {Promise<{ text: string, numPages: number }>} Result of
 *   extractTextFromPdfBuffer() on the OCR output.
 * @throws {Error} When ocrmypdf cannot be started or exits unsuccessfully, or
 *   when reading/parsing the output fails.
 */
async function runOcrAndExtractText(originalBuffer, lang = DEFAULT_OCR_LANGUAGE) {
  const inputPath = createTempPdfPath("acta-input");
  const outputPath = createTempPdfPath("acta-output");
  try {
    await writeFile(inputPath, originalBuffer);
    // ocrmypdf treats --skip-text and --force-ocr as mutually exclusive and
    // refuses to run when both are given. Only --force-ocr is passed: this path
    // is taken when the existing text layer was unusable, so every page is OCR'd.
    await runCommand("ocrmypdf", ["--force-ocr", "-l", lang, inputPath, outputPath]);
    const ocrBuffer = await readFile(outputPath);
    return await extractTextFromPdfBuffer(ocrBuffer);
  } finally {
    // Best-effort cleanup; the output file may not exist if ocrmypdf failed.
    await Promise.all([unlink(inputPath).catch(() => undefined), unlink(outputPath).catch(() => undefined)]);
  }
}
/**
 * Reads a positive integer from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Returned when the variable is unset, does not
 *   parse as a base-10 integer, or is not greater than zero.
 * @returns {number} The parsed value or `fallback`.
 */
function parseIntegerEnv(name, fallback) {
  const rawValue = process.env[name] ?? "";
  const candidate = Number.parseInt(rawValue.trim(), 10);
  if (!Number.isFinite(candidate) || candidate <= 0) {
    return fallback;
  }
  return candidate;
}
/**
 * Looks up the API key in the environment.
 *
 * @returns {string} The first non-blank (trimmed) value among OPENAI_API_KEY,
 *   API_KEY and api_key, checked in that order, or "" when none is set.
 */
function getOpenAiApiKey() {
  for (const variable of ["OPENAI_API_KEY", "API_KEY", "api_key"]) {
    const candidate = process.env[variable]?.trim();
    if (candidate) {
      return candidate;
    }
  }
  return "";
}
/**
 * Limits the text sent to the model to OPENAI_ACTA_MAX_CHARS characters
 * (default 45,000).
 *
 * @param {string} text - Full document text.
 * @returns {string} `text` unchanged when it fits; otherwise its first
 *   `maxChars` characters followed by a blank line and a truncation marker.
 */
function clampActaText(text) {
  const limit = parseIntegerEnv("OPENAI_ACTA_MAX_CHARS", 45_000);
  const fitsWithinLimit = text.length <= limit;
  return fitsWithinLimit ? text : `${text.slice(0, limit)}\n\n[TEXT_TRUNCATED_TO_${limit}_CHARS]`;
}
/**
 * Parses JSON out of a model reply that may be wrapped in a Markdown code
 * fence or surrounded by extra text.
 *
 * Candidates are tried in order: the trimmed reply, the reply with a leading
 * and trailing ``` fence removed, and finally the span from the first "{" to
 * the last "}".
 *
 * @param {string} rawContent - Raw text returned by the model.
 * @returns {*} The value produced by JSON.parse for the first candidate that parses.
 * @throws {Error} "AI response did not contain valid JSON." when no candidate
 *   parses; the last parse error is attached as `cause`.
 */
function extractJsonObject(rawContent) {
  const trimmed = rawContent.trim();
  const withoutFence = trimmed
    .replace(/^```json\s*/i, "")
    .replace(/^```\s*/i, "")
    .replace(/\s*```$/i, "")
    .trim();
  const candidates = [trimmed, withoutFence];

  const firstBrace = withoutFence.indexOf("{");
  const lastBrace = withoutFence.lastIndexOf("}");
  if (firstBrace >= 0 && lastBrace > firstBrace) {
    candidates.push(withoutFence.slice(firstBrace, lastBrace + 1));
  }

  let lastError;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch (error) {
      // Remember the failure and fall through to the next, more lenient candidate.
      lastError = error;
    }
  }
  throw new Error("AI response did not contain valid JSON.", { cause: lastError });
}
/**
 * Sends the document text to the chat-completions endpoint and returns the
 * structured extraction.
 *
 * Configuration comes from the environment at call time: the API key via
 * getOpenAiApiKey(), OPENAI_API_BASE_URL, OPENAI_ACTA_MODEL and
 * OPENAI_ACTA_TIMEOUT_MS (default 60,000 ms).
 *
 * @param {string} fullText - Document text; clamped by clampActaText() before sending.
 * @returns {Promise<object>} The parsed model JSON (which must contain `fields`
 *   and `lookupDictionary`) plus `model` and `usage` metadata taken from the
 *   API response.
 * @throws {Error} When the API key is missing, the request times out, the API
 *   answers with a non-2xx status, the reply has no text content, or the reply
 *   JSON lacks `fields`/`lookupDictionary`.
 */
async function extractWithAi(fullText) {
  const apiKey = getOpenAiApiKey();
  if (!apiKey) {
    throw new Error("OpenAI API key is missing (OPENAI_API_KEY or api_key).");
  }
  const baseUrl = (process.env.OPENAI_API_BASE_URL?.trim() || DEFAULT_OPENAI_BASE_URL).replace(/\/+$/, "");
  // Resolve the model now rather than relying only on the module-level default,
  // which is computed before .env has been loaded into process.env.
  const model = process.env.OPENAI_ACTA_MODEL?.trim() || DEFAULT_OPENAI_MODEL;
  const timeoutMs = parseIntegerEnv("OPENAI_ACTA_TIMEOUT_MS", 60_000);
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  const systemPrompt = [
    "Eres analista legal experto en actas constitutivas mexicanas.",
    "Devuelve exclusivamente JSON valido, sin markdown.",
    "Si un campo no aparece, usa null.",
    "No inventes datos.",
    "rfc en fields debe ser null.",
    "lookupDictionary.version debe ser exactamente 'mx_acta_constitutiva_reference_v1'.",
  ].join(" ");
  const userPrompt = [
    "Extrae un objeto JSON con dos claves: fields y lookupDictionary.",
    "fields: name, rfc, legalRepresentative, incorporationDate, deedNumber, notaryName, fiscalAddress, businessPurpose, stateOfIncorporation.",
    "lookupDictionary: estructura completa del diccionario de acta.",
    "Texto:",
    clampActaText(fullText),
  ].join("\n\n");
  try {
    const response = await fetch(`${baseUrl}/chat/completions`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model,
        temperature: 0,
        response_format: { type: "json_object" },
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: userPrompt },
        ],
      }),
      signal: controller.signal,
    });
    // A body that is not JSON is treated as empty so the status check below
    // still produces the error for failed requests.
    const payload = await response.json().catch(() => ({}));
    if (!response.ok) {
      const message = payload?.error?.message ? ` ${payload.error.message}` : "";
      throw new Error(`OpenAI request failed with ${response.status}.${message}`);
    }
    const content = payload?.choices?.[0]?.message?.content;
    if (typeof content !== "string" || !content.trim()) {
      throw new Error("OpenAI did not return text content.");
    }
    const parsed = extractJsonObject(content);
    if (!parsed || typeof parsed !== "object" || !parsed.fields || !parsed.lookupDictionary) {
      throw new Error("OpenAI JSON missing required keys: fields/lookupDictionary.");
    }
    return {
      // Spread first: `model`/`usage` keys invented by the model must not
      // overwrite the metadata reported by the API itself.
      ...parsed,
      model: payload?.model ?? model,
      usage: {
        promptTokens: payload?.usage?.prompt_tokens ?? null,
        completionTokens: payload?.usage?.completion_tokens ?? null,
        totalTokens: payload?.usage?.total_tokens ?? null,
      },
    };
  } catch (error) {
    if (controller.signal.aborted) {
      // The only abort source is the timer above, so report it as a timeout.
      throw new Error(`OpenAI request timed out after ${timeoutMs} ms.`, { cause: error });
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Extracts text from a PDF file, falling back to OCR when the embedded text
 * layer is missing, unreadable, or too sparse.
 *
 * @param {string} filePath - Path of the PDF to read.
 * @returns {Promise<{ text: string, methodUsed: "direct" | "ocr", numPages: number, warnings: string[] }>}
 *   The usable text, which path produced it, the page count, and warnings
 *   describing why OCR was needed (empty for a clean direct extraction).
 * @throws {Error} When the file cannot be read, when OCR itself fails, or when
 *   the OCR output is still too sparse according to shouldApplyOcr().
 */
async function analyzePdfWithOcrFallback(filePath) {
  const buffer = await readFile(filePath);
  const warnings = [];
  let direct;
  try {
    direct = await extractTextFromPdfBuffer(buffer);
  } catch (error) {
    // A parser failure is not fatal: record it and let OCR have a go.
    warnings.push(`Direct extraction failed: ${error instanceof Error ? error.message : String(error)}`);
  }
  if (direct && !shouldApplyOcr(direct.text, direct.numPages)) {
    return {
      text: direct.text,
      methodUsed: "direct",
      numPages: direct.numPages,
      warnings,
    };
  }
  if (direct) {
    // Only claim the text was "short" when direct extraction actually produced
    // a result; the failure case already has its own warning above.
    warnings.push("Direct extraction was short; OCR attempted.");
  }
  const ocr = await runOcrAndExtractText(buffer);
  if (shouldApplyOcr(ocr.text, ocr.numPages)) {
    throw new Error("OCR completed but usable text was still not detected.");
  }
  return {
    text: ocr.text,
    methodUsed: "ocr",
    numPages: ocr.numPages,
    warnings,
  };
}
/**
 * CLI entry point: parses arguments, extracts the PDF text (with OCR
 * fallback), asks the model for the structured data, and emits the result as
 * pretty-printed JSON — to the `--out` file when given, otherwise to stdout.
 *
 * Without a PDF path it prints the usage line and sets the exit code to 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cli = parseArgs(process.argv.slice(2));
  if (!cli.filePath) {
    usage();
    process.exitCode = 1;
    return;
  }

  const pdfPath = path.resolve(cli.filePath);
  const extraction = await analyzePdfWithOcrFallback(pdfPath);
  const aiResult = await extractWithAi(extraction.text);

  const serialized = JSON.stringify(
    {
      ok: true,
      file: pdfPath,
      methodUsed: extraction.methodUsed,
      numPages: extraction.numPages,
      warnings: extraction.warnings,
      extractionEngine: "ai",
      aiModel: aiResult.model,
      aiUsage: aiResult.usage,
      fields: aiResult.fields,
      lookupDictionary: aiResult.lookupDictionary,
    },
    null,
    2,
  );

  if (!cli.outPath) {
    console.log(serialized);
    return;
  }
  const destination = path.resolve(cli.outPath);
  await writeFile(destination, `${serialized}\n`, "utf8");
  console.log(`Result written to ${destination}`);
}
// Run the CLI. On failure print only the error message (no stack trace) to
// stderr and flag a non-zero exit code.
main().catch((failure) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exitCode = 1;
});