initial push
This commit is contained in:
344
scripts/analyze-acta-ai.mjs
Normal file
344
scripts/analyze-acta-ai.mjs
Normal file
@@ -0,0 +1,344 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { spawn } from "node:child_process";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { readFile, unlink, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { PDFParse } from "pdf-parse";
|
||||
|
||||
// Default OpenAI-compatible API root; overridable via OPENAI_API_BASE_URL.
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
// Chat model used for extraction; overridable via OPENAI_ACTA_MODEL.
const DEFAULT_OPENAI_MODEL = process.env.OPENAI_ACTA_MODEL?.trim() || "gpt-4.1-mini";
// Tesseract language spec passed to ocrmypdf (Spanish + English).
const DEFAULT_OCR_LANGUAGE = "spa+eng";
// Below this many total characters, direct PDF text extraction is treated as unusable.
const MIN_DOC_CHARS = 200;
// Below this many characters per page, direct PDF text extraction is treated as unusable.
const MIN_CHARS_PER_PAGE = 50;
|
||||
|
||||
/**
 * Minimal .env loader: reads KEY=VALUE pairs from ./.env (when present)
 * into process.env. Supports #-comments and single- or double-quoted
 * values, and never overwrites a variable that already has a truthy value.
 */
function loadDotEnv() {
  const envFile = path.resolve(process.cwd(), ".env");
  if (!existsSync(envFile)) {
    return;
  }

  for (const rawLine of readFileSync(envFile, "utf8").split(/\r?\n/)) {
    const entry = rawLine.trim();
    if (!entry || entry.startsWith("#")) {
      continue; // blank line or comment
    }

    const eq = entry.indexOf("=");
    if (eq <= 0) {
      continue; // no "=" or empty key
    }

    const key = entry.slice(0, eq).trim();
    let value = entry.slice(eq + 1).trim();

    // Strip one layer of matching surrounding quotes, if any.
    const doubleQuoted = value.startsWith('"') && value.endsWith('"');
    const singleQuoted = value.startsWith("'") && value.endsWith("'");
    if (doubleQuoted || singleQuoted) {
      value = value.slice(1, -1);
    }

    if (key && !process.env[key]) {
      process.env[key] = value;
    }
  }
}

// Populate process.env before any configuration constants below are read.
loadDotEnv();
|
||||
|
||||
/** Print CLI usage help to stdout. */
function usage() {
  const help = "Usage: node scripts/analyze-acta-ai.mjs <path-to-pdf> [--out output.json]";
  console.log(help);
}
|
||||
|
||||
/**
 * Parse CLI arguments: the first positional is the PDF path, and an
 * optional `--out <file>` selects a JSON output path.
 * @param {string[]} argv - process.argv minus the node/script entries.
 * @returns {{ filePath: string | undefined, outPath: string | null }}
 */
function parseArgs(argv) {
  const [filePath, ...rest] = argv;
  let outPath = null;

  let cursor = 0;
  while (cursor < rest.length) {
    if (rest[cursor] === "--out") {
      // A missing value after --out leaves outPath as null.
      outPath = rest[cursor + 1] ?? null;
      cursor += 2;
    } else {
      cursor += 1;
    }
  }

  return { filePath, outPath };
}
|
||||
|
||||
/**
 * Clean raw PDF text: strip NULs, normalize newlines, drop "-- n of m --"
 * and "Página n de m" page markers, collapse intra-line whitespace, and
 * remove blank lines.
 * @param {string} rawText - Text as emitted by the PDF parser.
 * @returns {string} compact, newline-separated text.
 */
function normalizeText(rawText) {
  const withoutMarkers = rawText
    .replace(/\u0000/g, " ")
    .replace(/\r\n?/g, "\n")
    .replace(/\n?\s*--\s*\d+\s*of\s*\d+\s*--\s*\n?/gi, "\n")
    .replace(/\n?\s*p[áa]gina\s+\d+\s+de\s+\d+\s*\n?/gi, "\n");

  const compactLines = [];
  for (const line of withoutMarkers.split("\n")) {
    const compact = line.replace(/\s+/g, " ").trim();
    if (compact) {
      compactLines.push(compact);
    }
  }

  return compactLines.join("\n").trim();
}
|
||||
|
||||
/**
 * Extract plain text from a PDF buffer via pdf-parse.
 * @param {Buffer} buffer - Raw PDF bytes.
 * @returns {Promise<{ text: string, numPages: number }>} normalized text
 *   plus the reported page count (0 when the parser reports none).
 */
async function extractTextFromPdfBuffer(buffer) {
  const parser = new PDFParse({ data: buffer });
  try {
    const parsed = await parser.getText();
    const rawText = typeof parsed.text === "string" ? parsed.text : "";
    const pageTotal = parsed.total;
    const hasPageCount = typeof pageTotal === "number" && Number.isFinite(pageTotal);
    return {
      text: normalizeText(rawText),
      numPages: hasPageCount ? pageTotal : 0,
    };
  } finally {
    // Always release parser resources, even when getText() rejects.
    await parser.destroy().catch(() => undefined);
  }
}
|
||||
|
||||
/**
 * Decide whether extracted text is too sparse to trust, so an OCR pass is
 * warranted: either too few characters overall, or too few per page.
 * @param {string} text - Normalized extracted text.
 * @param {number} numPages - Reported page count (0 allowed).
 * @returns {boolean} true when OCR should be attempted.
 */
function shouldApplyOcr(text, numPages) {
  const charCount = text.trim().length;
  const pageCount = Math.max(numPages, 1); // avoid division by zero
  if (charCount < MIN_DOC_CHARS) {
    return true;
  }
  return charCount / pageCount < MIN_CHARS_PER_PAGE;
}
|
||||
|
||||
/**
 * Run an external command, buffering its stdout and stderr.
 * @param {string} command - Executable name or path.
 * @param {string[]} args - Command-line arguments.
 * @returns {Promise<{ stdout: string, stderr: string }>} resolves on exit
 *   code 0; rejects with a descriptive Error otherwise.
 */
function runCommand(command, args) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ["ignore", "pipe", "pipe"] });
    const captured = { stdout: "", stderr: "" };

    child.stdout.on("data", (chunk) => {
      captured.stdout += chunk.toString();
    });
    child.stderr.on("data", (chunk) => {
      captured.stderr += chunk.toString();
    });

    // Covers spawn failures such as the binary not being installed.
    child.on("error", reject);

    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(`${command} failed with code ${code}: ${captured.stderr}`));
        return;
      }
      resolve(captured);
    });
  });
}
|
||||
|
||||
/**
 * Build a unique scratch .pdf path under the OS temp directory.
 * @param {string} [prefix="acta-ai"] - Filename prefix.
 * @returns {string} absolute path; the file is not created.
 */
function createTempPdfPath(prefix = "acta-ai") {
  // Timestamp + UUID keeps names unique even across rapid calls.
  const uniqueName = [prefix, Date.now(), randomUUID()].join("-");
  return path.join(tmpdir(), `${uniqueName}.pdf`);
}
|
||||
|
||||
/**
 * OCR a PDF with ocrmypdf and extract the resulting text layer.
 *
 * Bug fix: the original invocation passed both `--skip-text` and
 * `--force-ocr`, which ocrmypdf rejects as mutually exclusive options, so
 * the OCR fallback could never succeed. Only `--force-ocr` is kept, since
 * this path runs precisely when the document has no usable text layer.
 *
 * @param {Buffer} originalBuffer - Raw PDF bytes.
 * @param {string} [lang=DEFAULT_OCR_LANGUAGE] - Tesseract language spec.
 * @returns {Promise<{ text: string, numPages: number }>} extracted text.
 */
async function runOcrAndExtractText(originalBuffer, lang = DEFAULT_OCR_LANGUAGE) {
  const inputPath = createTempPdfPath("acta-input");
  const outputPath = createTempPdfPath("acta-output");

  try {
    await writeFile(inputPath, originalBuffer);
    // --force-ocr rasterizes every page and OCRs it, replacing any text layer.
    await runCommand("ocrmypdf", ["--force-ocr", "-l", lang, inputPath, outputPath]);

    const ocrBuffer = await readFile(outputPath);
    return await extractTextFromPdfBuffer(ocrBuffer);
  } finally {
    // Best-effort cleanup of both scratch files.
    await Promise.all([
      unlink(inputPath).catch(() => undefined),
      unlink(outputPath).catch(() => undefined),
    ]);
  }
}
|
||||
|
||||
/**
 * Read a positive integer from an environment variable.
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Used when unset, non-numeric, or <= 0.
 * @returns {number} the parsed value or the fallback.
 */
function parseIntegerEnv(name, fallback) {
  const raw = (process.env[name] ?? "").trim();
  const value = Number.parseInt(raw, 10);
  if (Number.isFinite(value) && value > 0) {
    return value;
  }
  return fallback;
}
|
||||
|
||||
/**
 * Resolve the OpenAI API key from the environment, preferring
 * OPENAI_API_KEY, then API_KEY, then api_key.
 * @returns {string} the trimmed key, or "" when none is set.
 */
function getOpenAiApiKey() {
  const candidates = [process.env.OPENAI_API_KEY, process.env.API_KEY, process.env.api_key];
  for (const candidate of candidates) {
    const key = candidate?.trim();
    if (key) {
      return key;
    }
  }
  return "";
}
|
||||
|
||||
/**
 * Cap acta text at OPENAI_ACTA_MAX_CHARS characters (default 45000),
 * appending an explicit truncation marker when clipping occurs.
 * @param {string} text - Full extracted document text.
 * @returns {string} the original or clipped text.
 */
function clampActaText(text) {
  const maxChars = parseIntegerEnv("OPENAI_ACTA_MAX_CHARS", 45_000);
  if (text.length > maxChars) {
    const clipped = text.slice(0, maxChars);
    return `${clipped}\n\n[TEXT_TRUNCATED_TO_${maxChars}_CHARS]`;
  }
  return text;
}
|
||||
|
||||
/**
 * Best-effort JSON extraction from a model reply: try the raw text first,
 * then the text with markdown code fences stripped, then the outermost
 * `{...}` slice. Throws when no candidate parses.
 * @param {string} rawContent - Raw chat-completion message content.
 * @returns {*} the parsed JSON value.
 * @throws {Error} when no valid JSON can be recovered.
 */
function extractJsonObject(rawContent) {
  // JSON.parse never yields undefined, so undefined marks "did not parse".
  const tryParse = (candidate) => {
    try {
      return JSON.parse(candidate);
    } catch {
      return undefined;
    }
  };

  const trimmed = rawContent.trim();
  const direct = tryParse(trimmed);
  if (direct !== undefined) {
    return direct;
  }

  const withoutFence = trimmed
    .replace(/^```json\s*/i, "")
    .replace(/^```\s*/i, "")
    .replace(/\s*```$/i, "")
    .trim();
  const unfenced = tryParse(withoutFence);
  if (unfenced !== undefined) {
    return unfenced;
  }

  const firstBrace = withoutFence.indexOf("{");
  const lastBrace = withoutFence.lastIndexOf("}");
  if (firstBrace >= 0 && lastBrace > firstBrace) {
    // Let a parse failure here propagate: there is nothing left to try.
    return JSON.parse(withoutFence.slice(firstBrace, lastBrace + 1));
  }

  throw new Error("AI response did not contain valid JSON.");
}
|
||||
|
||||
/**
 * Send the acta text to an OpenAI-compatible chat-completions endpoint and
 * return the structured extraction.
 *
 * Configuration (all via environment): API key from OPENAI_API_KEY /
 * API_KEY / api_key, base URL from OPENAI_API_BASE_URL, request timeout
 * from OPENAI_ACTA_TIMEOUT_MS (default 60s).
 *
 * @param {string} fullText - Normalized acta text (clamped before sending).
 * @returns {Promise<object>} `{ model, usage, fields, lookupDictionary, ... }`
 *   where fields/lookupDictionary come from the model's JSON reply.
 * @throws {Error} when the key is missing, the HTTP call fails, the reply
 *   has no text content, or the JSON lacks the required keys.
 */
async function extractWithAi(fullText) {
  const apiKey = getOpenAiApiKey();
  if (!apiKey) {
    throw new Error("OpenAI API key is missing (OPENAI_API_KEY or api_key).");
  }

  // Strip any trailing slashes so `${baseUrl}/chat/completions` is well-formed.
  const baseUrl = (process.env.OPENAI_API_BASE_URL?.trim() || DEFAULT_OPENAI_BASE_URL).replace(/\/+$/, "");
  const timeoutMs = parseIntegerEnv("OPENAI_ACTA_TIMEOUT_MS", 60_000);
  const controller = new AbortController();
  // Abort the fetch when the deadline passes; cleared in the finally below.
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  // Spanish prompts are part of the runtime contract — do not translate.
  const systemPrompt = [
    "Eres analista legal experto en actas constitutivas mexicanas.",
    "Devuelve exclusivamente JSON valido, sin markdown.",
    "Si un campo no aparece, usa null.",
    "No inventes datos.",
    "rfc en fields debe ser null.",
    "lookupDictionary.version debe ser exactamente 'mx_acta_constitutiva_reference_v1'.",
  ].join(" ");

  const userPrompt = [
    "Extrae un objeto JSON con dos claves: fields y lookupDictionary.",
    "fields: name, rfc, legalRepresentative, incorporationDate, deedNumber, notaryName, fiscalAddress, businessPurpose, stateOfIncorporation.",
    "lookupDictionary: estructura completa del diccionario de acta.",
    "Texto:",
    clampActaText(fullText),
  ].join("\n\n");

  try {
    const response = await fetch(`${baseUrl}/chat/completions`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model: DEFAULT_OPENAI_MODEL,
        temperature: 0,
        // Forces the model to emit a single JSON object.
        response_format: { type: "json_object" },
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: userPrompt },
        ],
      }),
      signal: controller.signal,
    });

    // Non-JSON error bodies fall back to {} so the status check below still runs.
    const payload = await response.json().catch(() => ({}));
    if (!response.ok) {
      const message = payload?.error?.message ? ` ${payload.error.message}` : "";
      throw new Error(`OpenAI request failed with ${response.status}.${message}`);
    }

    const content = payload?.choices?.[0]?.message?.content;
    if (typeof content !== "string" || !content.trim()) {
      throw new Error("OpenAI did not return text content.");
    }

    const parsed = extractJsonObject(content);
    if (!parsed || typeof parsed !== "object" || !parsed.fields || !parsed.lookupDictionary) {
      throw new Error("OpenAI JSON missing required keys: fields/lookupDictionary.");
    }

    return {
      model: payload?.model ?? DEFAULT_OPENAI_MODEL,
      usage: {
        promptTokens: payload?.usage?.prompt_tokens ?? null,
        completionTokens: payload?.usage?.completion_tokens ?? null,
        totalTokens: payload?.usage?.total_tokens ?? null,
      },
      // Spreads fields/lookupDictionary (and any extra keys) into the result.
      ...parsed,
    };
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Extract text from a PDF, preferring the embedded text layer and falling
 * back to OCR when direct extraction fails or yields too little text.
 * @param {string} filePath - Path to the PDF file.
 * @returns {Promise<{ text: string, methodUsed: string, numPages: number, warnings: string[] }>}
 * @throws {Error} when OCR also fails to produce usable text.
 */
async function analyzePdfWithOcrFallback(filePath) {
  const buffer = await readFile(filePath);
  const warnings = [];

  let direct;
  try {
    direct = await extractTextFromPdfBuffer(buffer);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    warnings.push(`Direct extraction failed: ${reason}`);
  }

  const directIsUsable = direct !== undefined && !shouldApplyOcr(direct.text, direct.numPages);
  if (directIsUsable) {
    return {
      text: direct.text,
      methodUsed: "direct",
      numPages: direct.numPages,
      warnings,
    };
  }

  warnings.push("Direct extraction was short; OCR attempted.");

  const ocr = await runOcrAndExtractText(buffer);
  if (shouldApplyOcr(ocr.text, ocr.numPages)) {
    throw new Error("OCR completed but usable text was still not detected.");
  }

  return {
    text: ocr.text,
    methodUsed: "ocr",
    numPages: ocr.numPages,
    warnings,
  };
}
|
||||
|
||||
/**
 * CLI entry point: parse arguments, extract the acta text (direct or OCR
 * fallback), run the AI extraction, and emit the combined result as JSON
 * to stdout or to the --out file.
 */
async function main() {
  const { filePath, outPath } = parseArgs(process.argv.slice(2));
  if (!filePath) {
    usage();
    process.exitCode = 1;
    return;
  }

  const absolute = path.resolve(filePath);
  // OCR fallback must complete before the AI call, which consumes its text.
  const analysis = await analyzePdfWithOcrFallback(absolute);
  const ai = await extractWithAi(analysis.text);

  const result = {
    ok: true,
    file: absolute,
    methodUsed: analysis.methodUsed,
    numPages: analysis.numPages,
    warnings: analysis.warnings,
    extractionEngine: "ai",
    aiModel: ai.model,
    aiUsage: ai.usage,
    fields: ai.fields,
    lookupDictionary: ai.lookupDictionary,
  };

  const serialized = JSON.stringify(result, null, 2);
  if (outPath) {
    const resolvedOut = path.resolve(outPath);
    await writeFile(resolvedOut, `${serialized}\n`, "utf8");
    console.log(`Result written to ${resolvedOut}`);
    return;
  }

  console.log(serialized);
}
|
||||
|
||||
// Run the CLI. Report failures as a plain message on stderr (no stack
// trace) and signal failure via the exit code.
main().catch((error) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user