initial push

This commit is contained in:
Marcelo Dares
2026-03-15 15:03:56 +01:00
parent d48b9d5352
commit 65aaf9275e
146 changed files with 70245 additions and 100 deletions

344
scripts/analyze-acta-ai.mjs Normal file
View File

@@ -0,0 +1,344 @@
#!/usr/bin/env node
import { randomUUID } from "node:crypto";
import { spawn } from "node:child_process";
import { existsSync, readFileSync } from "node:fs";
import { readFile, unlink, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import path from "node:path";
import { PDFParse } from "pdf-parse";
// Root of the OpenAI-compatible API; extractWithAi() falls back to it when OPENAI_API_BASE_URL is unset.
const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
// Model name sent with the chat-completions request; OPENAI_ACTA_MODEL overrides the built-in fallback.
// NOTE(review): this is evaluated at module load, before the loadDotEnv() call further down the file,
// so a value that exists only in .env is not visible here — confirm whether that is intended.
const DEFAULT_OPENAI_MODEL = process.env.OPENAI_ACTA_MODEL?.trim() || "gpt-4.1-mini";
// Default language list handed to ocrmypdf through its -l option.
const DEFAULT_OCR_LANGUAGE = "spa+eng";
// shouldApplyOcr() considers extracted text insufficient when it falls below either threshold.
const MIN_DOC_CHARS = 200;
const MIN_CHARS_PER_PAGE = 50;
/**
 * Reads `.env` from the current working directory (if present) and copies its
 * KEY=VALUE pairs into `process.env`.
 *
 * Blank lines, `#` comments and lines without a key before `=` are skipped.
 * A value wrapped in matching single or double quotes has the quotes removed.
 * Variables that already hold a non-empty value in `process.env` are left alone.
 */
function loadDotEnv() {
  const dotEnvFile = path.resolve(process.cwd(), ".env");
  if (!existsSync(dotEnvFile)) {
    return;
  }

  const isWrappedIn = (text, quote) => text.startsWith(quote) && text.endsWith(quote);

  readFileSync(dotEnvFile, "utf8")
    .split(/\r?\n/)
    .map((rawLine) => rawLine.trim())
    .filter((entry) => entry !== "" && !entry.startsWith("#"))
    .forEach((entry) => {
      const eqAt = entry.indexOf("=");
      if (eqAt <= 0) {
        // No "=" at all, or nothing in front of it: not an assignment.
        return;
      }
      const name = entry.slice(0, eqAt).trim();
      const rest = entry.slice(eqAt + 1).trim();
      const parsedValue = isWrappedIn(rest, '"') || isWrappedIn(rest, "'") ? rest.slice(1, -1) : rest;
      if (name && !process.env[name]) {
        process.env[name] = parsedValue;
      }
    });
}
loadDotEnv();
/** Prints the one-line command synopsis to stdout. */
function usage() {
  const synopsis = "Usage: node scripts/analyze-acta-ai.mjs <path-to-pdf> [--out output.json]";
  console.log(synopsis);
}
/**
 * Parses the script's command-line arguments.
 *
 * The first argument that is not part of an `--out <path>` pair is the PDF
 * path, so `--out` may appear before or after it. Further positional
 * arguments are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{ filePath: string | undefined, outPath: string | null }}
 *   `filePath` is undefined when no positional argument was given; `outPath`
 *   is null when `--out` is absent or has no value.
 */
function parseArgs(argv) {
  let filePath;
  let outPath = null;
  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    if (arg === "--out") {
      outPath = argv[index + 1] ?? null;
      // Skip the value so it is never mistaken for the PDF path.
      index += 1;
      continue;
    }
    if (filePath === undefined) {
      filePath = arg;
    }
  }
  return { filePath, outPath };
}
/**
 * Cleans text extracted from a PDF.
 *
 * NUL characters become spaces, CR/CRLF become LF, page markers of the form
 * "-- 3 of 10 --" and "página 3 de 10" are removed, whitespace runs inside a
 * line collapse to one space, and empty lines are dropped.
 *
 * @param {string} rawText - Text as returned by the PDF parser.
 * @returns {string} Normalized text with one non-empty line per "\n".
 */
function normalizeText(rawText) {
  const withoutPageMarkers = rawText
    .replace(/\u0000/g, " ")
    .replace(/\r\n?/g, "\n")
    .replace(/\n?\s*--\s*\d+\s*of\s*\d+\s*--\s*\n?/gi, "\n")
    .replace(/\n?\s*p[áa]gina\s+\d+\s+de\s+\d+\s*\n?/gi, "\n");

  const keptLines = [];
  for (const rawLine of withoutPageMarkers.split("\n")) {
    const collapsed = rawLine.replace(/\s+/g, " ").trim();
    if (collapsed) {
      keptLines.push(collapsed);
    }
  }
  return keptLines.join("\n").trim();
}
/**
 * Extracts and normalizes the text layer of a PDF held in memory.
 *
 * @param {Buffer} buffer - PDF file contents.
 * @returns {Promise<{ text: string, numPages: number }>} Normalized text and
 *   the page total reported by the parser (0 when it is not a finite number).
 */
async function extractTextFromPdfBuffer(buffer) {
  const pdf = new PDFParse({ data: buffer });
  try {
    const result = await pdf.getText();
    const extracted = result.text;
    const pageTotal = result.total;
    return {
      text: normalizeText(typeof extracted === "string" ? extracted : ""),
      // Number.isFinite is false for every non-number, so no separate typeof check is needed.
      numPages: Number.isFinite(pageTotal) ? pageTotal : 0,
    };
  } finally {
    // Best-effort cleanup: a failing destroy() must not mask the extraction outcome.
    await pdf.destroy().catch(() => undefined);
  }
}
/**
 * Decides whether extracted text is too sparse to trust, meaning OCR is needed.
 *
 * @param {string} text - Extracted document text.
 * @param {number} numPages - Page count; values below 1 are treated as 1.
 * @returns {boolean} True when the document has fewer than MIN_DOC_CHARS
 *   characters overall or fewer than MIN_CHARS_PER_PAGE characters per page.
 */
function shouldApplyOcr(text, numPages) {
  const usableChars = text.trim().length;
  if (usableChars < MIN_DOC_CHARS) {
    return true;
  }
  const pageCount = Math.max(numPages, 1);
  return usableChars / pageCount < MIN_CHARS_PER_PAGE;
}
/**
 * Runs an external command and collects its output.
 *
 * @param {string} command - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @returns {Promise<{ stdout: string, stderr: string }>} Resolves with the
 *   captured output when the process exits with code 0.
 * @throws {Error} Rejects with the spawn error (e.g. ENOENT) or, on a non-zero
 *   exit, with an Error whose message carries the exit code and stderr.
 */
function runCommand(command, args) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ["ignore", "pipe", "pipe"] });

    // Decode through the stream's own decoder: calling toString() on each raw
    // chunk corrupts any multi-byte UTF-8 character that straddles two chunks.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    child.on("error", reject);
    child.on("close", (code) => {
      if (code === 0) {
        resolve({ stdout, stderr });
        return;
      }
      reject(new Error(`${command} failed with code ${code}: ${stderr}`));
    });
  });
}
/**
 * Builds a unique `.pdf` path inside the OS temp directory. No file is created.
 *
 * @param {string} [prefix="acta-ai"] - Leading part of the file name.
 * @returns {string} `<tmpdir>/<prefix>-<epoch ms>-<uuid>.pdf`
 */
function createTempPdfPath(prefix = "acta-ai") {
  const uniqueSuffix = [Date.now(), randomUUID()].join("-");
  const fileName = `${prefix}-${uniqueSuffix}.pdf`;
  return path.join(tmpdir(), fileName);
}
/**
 * Runs `ocrmypdf` over a PDF and extracts the text of the OCR'd result.
 *
 * The PDF is written to a temp file, OCR'd into a second temp file, and both
 * files are removed afterwards regardless of the outcome.
 *
 * @param {Buffer} originalBuffer - Contents of the source PDF.
 * @param {string} [lang=DEFAULT_OCR_LANGUAGE] - Language list passed to `ocrmypdf -l`.
 * @returns {Promise<{ text: string, numPages: number }>} Text and page count
 *   extracted from the OCR output.
 * @throws {Error} When ocrmypdf cannot be started or exits non-zero, or when
 *   the OCR output cannot be read or parsed.
 */
async function runOcrAndExtractText(originalBuffer, lang = DEFAULT_OCR_LANGUAGE) {
  const inputPath = createTempPdfPath("acta-input");
  const outputPath = createTempPdfPath("acta-output");
  try {
    await writeFile(inputPath, originalBuffer);
    // ocrmypdf rejects --skip-text combined with --force-ocr (they are mutually
    // exclusive), so only one may be passed. --force-ocr is kept because this
    // path runs precisely when the existing text layer is missing or unusable.
    await runCommand("ocrmypdf", ["--force-ocr", "-l", lang, inputPath, outputPath]);
    const ocrBuffer = await readFile(outputPath);
    return await extractTextFromPdfBuffer(ocrBuffer);
  } finally {
    // Best-effort cleanup; the output file may not exist if OCR failed.
    await Promise.all([unlink(inputPath).catch(() => undefined), unlink(outputPath).catch(() => undefined)]);
  }
}
/**
 * Reads a positive integer from an environment variable.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Returned when the variable is unset, not an
 *   integer prefix, or not greater than zero.
 * @returns {number} The parsed value or `fallback`.
 */
function parseIntegerEnv(name, fallback) {
  const rawValue = process.env[name] ?? "";
  const candidate = Number.parseInt(rawValue.trim(), 10);
  if (!Number.isFinite(candidate) || candidate <= 0) {
    return fallback;
  }
  return candidate;
}
/**
 * Looks up the OpenAI API key.
 *
 * @returns {string} The first non-blank, trimmed value among OPENAI_API_KEY,
 *   API_KEY and api_key (in that order), or "" when none is set.
 */
function getOpenAiApiKey() {
  for (const variable of ["OPENAI_API_KEY", "API_KEY", "api_key"]) {
    const candidate = process.env[variable]?.trim();
    if (candidate) {
      return candidate;
    }
  }
  return "";
}
/**
 * Caps the document text sent to the model.
 *
 * The limit comes from OPENAI_ACTA_MAX_CHARS (default 45 000). Longer text is
 * cut at the limit and a marker naming the limit is appended.
 *
 * @param {string} text - Full document text.
 * @returns {string} `text` unchanged when short enough, otherwise the
 *   truncated text followed by the truncation marker.
 */
function clampActaText(text) {
  const limit = parseIntegerEnv("OPENAI_ACTA_MAX_CHARS", 45_000);
  const fitsWithinLimit = text.length <= limit;
  return fitsWithinLimit ? text : `${text.slice(0, limit)}\n\n[TEXT_TRUNCATED_TO_${limit}_CHARS]`;
}
/**
 * Parses JSON out of a model reply that may be wrapped in extra text.
 *
 * Three candidates are tried in order: the trimmed reply as-is, the reply with
 * a leading/trailing Markdown code fence removed, and the span from the first
 * "{" to the last "}".
 *
 * @param {string} rawContent - Raw reply text.
 * @returns {unknown} The first candidate that parses as JSON.
 * @throws {Error} "AI response did not contain valid JSON." when no candidate
 *   parses; the last parser error is attached as `cause`.
 */
function extractJsonObject(rawContent) {
  const trimmed = rawContent.trim();
  const withoutFence = trimmed
    .replace(/^```json\s*/i, "")
    .replace(/^```\s*/i, "")
    .replace(/\s*```$/i, "")
    .trim();

  const candidates = [trimmed, withoutFence];
  const firstBrace = withoutFence.indexOf("{");
  const lastBrace = withoutFence.lastIndexOf("}");
  if (firstBrace >= 0 && lastBrace > firstBrace) {
    candidates.push(withoutFence.slice(firstBrace, lastBrace + 1));
  }

  let lastError;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate);
    } catch (error) {
      // Remember the failure and fall through to the next, looser candidate.
      lastError = error;
    }
  }
  // Every failure mode reports the same error, including a malformed brace span.
  throw new Error("AI response did not contain valid JSON.", { cause: lastError });
}
/**
 * Sends the acta text to an OpenAI-compatible chat-completions endpoint and
 * returns the structured data found in the reply.
 *
 * Configuration read at call time: the API key (getOpenAiApiKey),
 * OPENAI_API_BASE_URL, OPENAI_ACTA_MODEL and OPENAI_ACTA_TIMEOUT_MS
 * (default 60 000 ms).
 *
 * @param {string} fullText - Document text; clamped by clampActaText() before sending.
 * @returns {Promise<object>} The parsed reply (which must contain `fields` and
 *   `lookupDictionary`) plus `model` and `usage` taken from the API response.
 * @throws {Error} When the API key is missing, the request times out, the API
 *   answers with a non-2xx status, or the reply lacks usable JSON.
 */
async function extractWithAi(fullText) {
  const apiKey = getOpenAiApiKey();
  if (!apiKey) {
    throw new Error("OpenAI API key is missing (OPENAI_API_KEY or api_key).");
  }
  const baseUrl = (process.env.OPENAI_API_BASE_URL?.trim() || DEFAULT_OPENAI_BASE_URL).replace(/\/+$/, "");
  // Re-read the override here: DEFAULT_OPENAI_MODEL is computed before .env is
  // loaded, so a model configured only in .env would otherwise be ignored.
  const model = process.env.OPENAI_ACTA_MODEL?.trim() || DEFAULT_OPENAI_MODEL;
  const timeoutMs = parseIntegerEnv("OPENAI_ACTA_TIMEOUT_MS", 60_000);
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  const systemPrompt = [
    "Eres analista legal experto en actas constitutivas mexicanas.",
    "Devuelve exclusivamente JSON valido, sin markdown.",
    "Si un campo no aparece, usa null.",
    "No inventes datos.",
    "rfc en fields debe ser null.",
    "lookupDictionary.version debe ser exactamente 'mx_acta_constitutiva_reference_v1'.",
  ].join(" ");
  const userPrompt = [
    "Extrae un objeto JSON con dos claves: fields y lookupDictionary.",
    "fields: name, rfc, legalRepresentative, incorporationDate, deedNumber, notaryName, fiscalAddress, businessPurpose, stateOfIncorporation.",
    "lookupDictionary: estructura completa del diccionario de acta.",
    "Texto:",
    clampActaText(fullText),
  ].join("\n\n");

  try {
    const response = await fetch(`${baseUrl}/chat/completions`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model,
        temperature: 0,
        response_format: { type: "json_object" },
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: userPrompt },
        ],
      }),
      signal: controller.signal,
    });
    // A non-JSON error body must not hide the HTTP status, hence the fallback.
    const payload = await response.json().catch(() => ({}));
    if (!response.ok) {
      const message = payload?.error?.message ? ` ${payload.error.message}` : "";
      throw new Error(`OpenAI request failed with ${response.status}.${message}`);
    }
    const content = payload?.choices?.[0]?.message?.content;
    if (typeof content !== "string" || !content.trim()) {
      throw new Error("OpenAI did not return text content.");
    }
    const parsed = extractJsonObject(content);
    if (!parsed || typeof parsed !== "object" || !parsed.fields || !parsed.lookupDictionary) {
      throw new Error("OpenAI JSON missing required keys: fields/lookupDictionary.");
    }
    return {
      // Spread the model-generated object first so stray "model"/"usage" keys
      // in it can never overwrite the metadata reported by the API.
      ...parsed,
      model: payload?.model ?? model,
      usage: {
        promptTokens: payload?.usage?.prompt_tokens ?? null,
        completionTokens: payload?.usage?.completion_tokens ?? null,
        totalTokens: payload?.usage?.total_tokens ?? null,
      },
    };
  } catch (error) {
    if (controller.signal.aborted) {
      // Only the timer above aborts this controller, so an abort means a timeout.
      throw new Error(`OpenAI request timed out after ${timeoutMs} ms.`, { cause: error });
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Extracts text from a PDF file, falling back to OCR when the embedded text
 * layer is missing, too short, or unreadable.
 *
 * @param {string} filePath - Path of the PDF to analyze.
 * @returns {Promise<{ text: string, methodUsed: "direct" | "ocr", numPages: number, warnings: string[] }>}
 * @throws {Error} When the file cannot be read, OCR fails, or OCR still yields
 *   too little text.
 */
async function analyzePdfWithOcrFallback(filePath) {
  const buffer = await readFile(filePath);
  const warnings = [];

  let direct;
  try {
    direct = await extractTextFromPdfBuffer(buffer);
  } catch (error) {
    // A parser failure is not fatal: record it and let OCR have a go.
    warnings.push(`Direct extraction failed: ${error instanceof Error ? error.message : String(error)}`);
  }

  if (direct && !shouldApplyOcr(direct.text, direct.numPages)) {
    return {
      text: direct.text,
      methodUsed: "direct",
      numPages: direct.numPages,
      warnings,
    };
  }

  // Only claim the text was "short" when extraction actually produced text;
  // a failed extraction already has its own warning above.
  if (direct) {
    warnings.push("Direct extraction was short; OCR attempted.");
  }

  const ocr = await runOcrAndExtractText(buffer);
  if (shouldApplyOcr(ocr.text, ocr.numPages)) {
    throw new Error("OCR completed but usable text was still not detected.");
  }
  return {
    text: ocr.text,
    methodUsed: "ocr",
    numPages: ocr.numPages,
    warnings,
  };
}
/**
 * Script entry point: extracts text from the PDF named on the command line,
 * asks the model for structured data, and emits the combined result as JSON,
 * either to the `--out` file or to stdout.
 *
 * With no PDF path it prints the usage line and sets exit code 1.
 */
async function main() {
  const cli = parseArgs(process.argv.slice(2));
  if (!cli.filePath) {
    usage();
    process.exitCode = 1;
    return;
  }

  const absolute = path.resolve(cli.filePath);
  const analysis = await analyzePdfWithOcrFallback(absolute);
  const ai = await extractWithAi(analysis.text);

  const { methodUsed, numPages, warnings } = analysis;
  const result = {
    ok: true,
    file: absolute,
    methodUsed,
    numPages,
    warnings,
    extractionEngine: "ai",
    aiModel: ai.model,
    aiUsage: ai.usage,
    fields: ai.fields,
    lookupDictionary: ai.lookupDictionary,
  };
  const serialized = JSON.stringify(result, null, 2);

  if (!cli.outPath) {
    console.log(serialized);
    return;
  }
  const destination = path.resolve(cli.outPath);
  await writeFile(destination, `${serialized}\n`, "utf8");
  console.log(`Result written to ${destination}`);
}

// Any failure is reported as a single message line and a non-zero exit code.
main().catch((failure) => {
  const reason = failure instanceof Error ? failure.message : String(failure);
  console.error(reason);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,173 @@
import { spawnSync } from "node:child_process";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";
/**
 * Collapses every whitespace run to one space and strips both ends.
 *
 * @param {string} value - Text to tidy.
 * @returns {string} The tidied text.
 */
function normalizeSpaces(value) {
  const words = value.split(/\s+/).filter(Boolean);
  return words.join(" ");
}
// Hand-maintained portal overrides, keyed by "<stateCode>-<municipalityCode>".
// parseMunicipalityRows() spreads a matching entry over the parsed row, so every
// field listed here replaces the generic default for that municipality.
const knownOpenPortalMappings = {
"19-019": {
openPortalUrl: "https://licitaciones.sanpedro.gob.mx/Default.aspx?Year=2026&T=1&Pro=1",
openPortalType: "SAN_PEDRO_ASPX",
openSyncIntervalDays: 7,
},
};
/**
 * Parses the municipality table out of layout-preserving text.
 *
 * A table row looks like
 * `<2-digit state code> <state name> <3-digit municipality code>[*] <municipality name> <area>`
 * where `<area>` is one of the two known geographic areas. A row whose
 * municipality name wraps is recognised as a line without the area, followed
 * by an indented line that ends with the area; the two name fragments are
 * joined.
 *
 * Rows are de-duplicated by state+municipality code (the last occurrence
 * wins), overlaid with `knownOpenPortalMappings`, and sorted by state code and
 * then municipality code.
 *
 * @param {string} text - Source text containing the table.
 * @returns {object[]} Municipality records with the default portal settings filled in.
 */
function parseMunicipalityRows(text) {
  const areaPattern = "(Resto del País|Zona Libre de la Frontera Norte)";
  const rowRegex = new RegExp(`^\\s*(\\d{2})\\s+(.+?)\\s+(\\d{3})\\*?\\s+(.+?)\\s+${areaPattern}\\s*$`);
  const pendingRegex = /^\s*(\d{2})\s+(.+?)\s+(\d{3})\*?\s+(.+?)\s*$/;
  const continuationRegex = new RegExp(`^\\s+(.+?)\\s+${areaPattern}\\s*$`);

  // Single place that defines the shape and default settings of a record.
  const buildRow = (stateCode, stateName, municipalityCode, municipalityName, areaGeografica) => ({
    stateCode,
    stateName,
    municipalityCode: municipalityCode.padStart(3, "0"),
    municipalityName,
    areaGeografica,
    openPortalUrl: null,
    openPortalType: "GENERIC",
    openSyncIntervalDays: 7,
    pntSubjectId: null,
    pntEntityId: null,
    pntSectorId: null,
    pntEntryUrl: null,
    backupUrl: null,
    scrapingEnabled: true,
    isActive: true,
  });

  const rows = [];
  let pendingRow = null;
  for (const line of text.split(/\r?\n/)) {
    // A wrapped row may only be completed by the line right after it.
    const carried = pendingRow;
    pendingRow = null;

    const fullRow = line.match(rowRegex);

    // An indented complete row also fits the continuation pattern, so a line
    // is treated as a continuation only when it is not a row in its own right.
    if (carried && !fullRow) {
      const continuation = line.match(continuationRegex);
      if (continuation) {
        rows.push(
          buildRow(
            carried.stateCode,
            carried.stateName,
            carried.municipalityCode,
            // Join both halves of the wrapped name; keeping only the second
            // line would drop the beginning of the municipality name.
            normalizeSpaces(`${carried.municipalityNameStart} ${continuation[1]}`),
            normalizeSpaces(continuation[2]),
          ),
        );
        continue;
      }
    }

    if (fullRow) {
      rows.push(
        buildRow(
          fullRow[1],
          normalizeSpaces(fullRow[2]),
          fullRow[3],
          normalizeSpaces(fullRow[4]),
          normalizeSpaces(fullRow[5]),
        ),
      );
      continue;
    }

    const wrapped = line.match(pendingRegex);
    if (wrapped) {
      pendingRow = {
        stateCode: wrapped[1],
        stateName: normalizeSpaces(wrapped[2]),
        municipalityCode: wrapped[3],
        municipalityNameStart: wrapped[4],
      };
    }
  }

  const deduped = new Map();
  for (const row of rows) {
    deduped.set(`${row.stateCode}-${row.municipalityCode}`, row);
  }

  const merged = [...deduped.entries()].map(([key, row]) => {
    const known = knownOpenPortalMappings[key];
    return known ? { ...row, ...known } : row;
  });

  return merged.sort((a, b) => {
    if (a.stateCode !== b.stateCode) {
      return a.stateCode.localeCompare(b.stateCode, "es");
    }
    return a.municipalityCode.localeCompare(b.municipalityCode, "es");
  });
}
/**
 * Loads the source text for the parser.
 *
 * A path ending in `.txt` (case-insensitive) is read directly as UTF-8;
 * anything else is converted with `pdftotext -layout`.
 *
 * @param {string} inputPath - Path of a `.txt` file or a PDF.
 * @returns {Promise<string>} The file text or the pdftotext output.
 * @throws {Error} The spawn error when pdftotext cannot be started, otherwise
 *   an Error carrying pdftotext's stderr (or a generic message) when it
 *   produced no text.
 */
async function readSourceText(inputPath) {
  const isPlainText = inputPath.toLowerCase().endsWith(".txt");
  if (isPlainText) {
    return readFile(inputPath, "utf8");
  }

  const { status, stdout, stderr, error } = spawnSync("pdftotext", ["-layout", inputPath, "-"], {
    encoding: "utf8",
    maxBuffer: 1024 * 1024 * 20,
  });

  const producedText = status === 0 && typeof stdout === "string" && stdout.trim();
  if (producedText) {
    return stdout;
  }
  if (error) {
    throw error;
  }
  throw new Error(stderr || "Failed to extract text from PDF.");
}
/**
 * Script entry point: converts the municipality source (argv[2], default
 * `Estructura_municipal_dic22.pdf` in the working directory) into a JSON file
 * (argv[3], default `prisma/data/municipalities.json`), creating the output
 * directory when needed.
 *
 * @returns {Promise<void>} Rejects when the source cannot be read or no rows were parsed.
 */
async function run() {
  const inputPdf = process.argv[2] || path.join(process.cwd(), "Estructura_municipal_dic22.pdf");
  const outputJson = process.argv[3] || path.join(process.cwd(), "prisma", "data", "municipalities.json");

  const rawText = await readSourceText(inputPdf);
  const rows = parseMunicipalityRows(rawText);
  if (!rows.length) {
    throw new Error("No municipality rows parsed from source text.");
  }

  await mkdir(path.dirname(outputJson), { recursive: true });
  await writeFile(outputJson, `${JSON.stringify(rows, null, 2)}\n`, "utf8");
  console.log(`Parsed municipalities: ${rows.length}`);
  console.log(`Output: ${outputJson}`);
}

// Any failure is reported as a single message line and a non-zero exit code.
run().catch((failure) => {
  const reason = failure instanceof Error ? failure.message : String(failure);
  console.error(reason);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,195 @@
import { readFileSync, existsSync } from "node:fs";
import { resolve } from "node:path";
// Request timeout in milliseconds, used by parseArgs() when LICITAYA_TIMEOUT_MS does not supply one.
const DEFAULT_TIMEOUT_MS = 20000;
/**
 * Copies KEY=VALUE pairs from a dotenv file into `process.env`.
 *
 * A missing file is ignored. Blank lines, `#` comments, lines without `=` and
 * lines with an empty key are skipped. Variables already defined in the
 * environment (even as an empty string) are never overwritten. A value
 * wrapped in matching single or double quotes has the quotes removed.
 *
 * @param {string} filePath - Path of the dotenv file.
 */
function loadDotenv(filePath) {
  if (!existsSync(filePath)) {
    return;
  }

  const stripQuotes = (text) => {
    const doubleQuoted = text.startsWith('"') && text.endsWith('"');
    const singleQuoted = text.startsWith("'") && text.endsWith("'");
    return doubleQuoted || singleQuoted ? text.slice(1, -1) : text;
  };

  const entries = readFileSync(filePath, "utf8")
    .split(/\r?\n/)
    .map((rawLine) => rawLine.trim());

  for (const entry of entries) {
    if (entry === "" || entry.startsWith("#")) {
      continue;
    }
    const eqAt = entry.indexOf("=");
    if (eqAt < 0) {
      continue;
    }
    const name = entry.slice(0, eqAt).trim();
    if (name === "" || process.env[name] !== undefined) {
      continue;
    }
    process.env[name] = stripQuotes(entry.slice(eqAt + 1).trim());
  }
}
/**
 * Builds the request settings from LICITAYA_* environment variables, then
 * applies command-line overrides.
 *
 * Recognised flags (each takes a value): `--base-url`, `--endpoint`,
 * `--accept`, `--method` (upper-cased) and `--timeout` (milliseconds).
 * A flag without a value is ignored. A timeout that is not a positive integer
 * is ignored, whether it comes from the environment or from the command line.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{ baseUrl: string | undefined, endpoint: string | undefined, accept: string, method: string, timeoutMs: number }}
 */
function parseArgs(argv) {
  // Apply the same "positive integer" rule to the environment value as to the
  // flag: a negative timeout would make the abort timer fire immediately.
  const envTimeout = Number.parseInt(process.env.LICITAYA_TIMEOUT_MS || "", 10);
  const args = {
    baseUrl: process.env.LICITAYA_BASE_URL,
    endpoint: process.env.LICITAYA_TEST_ENDPOINT,
    accept: process.env.LICITAYA_ACCEPT || "application/json",
    method: "GET",
    timeoutMs: Number.isFinite(envTimeout) && envTimeout > 0 ? envTimeout : DEFAULT_TIMEOUT_MS,
  };

  // Flags whose value is stored verbatim under the mapped property.
  const verbatimFlags = {
    "--base-url": "baseUrl",
    "--endpoint": "endpoint",
    "--accept": "accept",
  };

  for (let i = 0; i < argv.length; i += 1) {
    const current = argv[i];
    const next = argv[i + 1];
    if (!next) {
      // Every flag needs a value; without one there is nothing to apply.
      continue;
    }
    if (Object.hasOwn(verbatimFlags, current)) {
      args[verbatimFlags[current]] = next;
      i += 1;
      continue;
    }
    if (current === "--method") {
      args.method = next.toUpperCase();
      i += 1;
      continue;
    }
    if (current === "--timeout") {
      const parsed = Number.parseInt(next, 10);
      if (Number.isFinite(parsed) && parsed > 0) {
        args.timeoutMs = parsed;
      }
      i += 1;
    }
  }
  return args;
}
/**
 * Resolves the request URL.
 *
 * An endpoint that is already an absolute http(s) URL is used as-is. Otherwise
 * it is appended to `baseUrl`, keeping any path the base URL already has.
 *
 * @param {string | undefined} baseUrl - API root; required for relative endpoints.
 * @param {string | undefined} endpoint - Absolute URL or path relative to `baseUrl`.
 * @returns {URL} The resolved URL.
 * @throws {Error} When the endpoint is missing or still contains `<`/`>`
 *   placeholders, or when a relative endpoint is given without a base URL.
 */
function buildUrl(baseUrl, endpoint) {
  if (!endpoint) {
    throw new Error("Missing LICITAYA_TEST_ENDPOINT (or --endpoint).");
  }

  const hasPlaceholder = ["<", ">"].some((marker) => endpoint.includes(marker));
  if (hasPlaceholder) {
    throw new Error(
      "LICITAYA_TEST_ENDPOINT still contains placeholders. Use a real path such as /tender/search or /tender/<tenderId>.",
    );
  }

  const isAbsolute = /^https?:\/\//i.test(endpoint);
  if (isAbsolute) {
    return new URL(endpoint);
  }

  if (!baseUrl) {
    throw new Error("Missing LICITAYA_BASE_URL (or --base-url).");
  }

  // The base needs a trailing slash and the endpoint must not start with one;
  // otherwise URL resolution would discard the base URL's own path.
  const base = baseUrl.replace(/\/?$/, "/");
  const relative = endpoint.replace(/^\//, "");
  return new URL(relative, base);
}
/**
 * Produces a printable preview of a response body.
 *
 * A body that is declared as JSON, or that starts with `{` or `[`, is
 * pretty-printed in full when it parses. Everything else, including JSON that
 * fails to parse, is cut to the first 3000 characters.
 *
 * @param {string} rawBody - Response body text.
 * @param {string} contentType - Value of the Content-Type header ("" when absent).
 * @returns {string} The preview, or "(empty body)" for a blank body.
 */
function previewBody(rawBody, contentType) {
  const body = rawBody.trim();
  if (body === "") {
    return "(empty body)";
  }

  const truncated = () => body.slice(0, 3000);
  const looksLikeJson =
    contentType.includes("application/json") || body.startsWith("{") || body.startsWith("[");
  if (!looksLikeJson) {
    return truncated();
  }

  try {
    return JSON.stringify(JSON.parse(body), null, 2);
  } catch {
    return truncated();
  }
}
// Populate process.env from ./.env first so the lookups below can see values defined in the file.
loadDotenv(resolve(process.cwd(), ".env"));
// The key may be supplied under any of three variable names; the first non-empty one is used.
const apiKey = process.env.LICITAYA_API_KEY || process.env.X_API_KEY || process.env.X_API_KEY_LICITAYA;
if (!apiKey) {
console.error("Missing API key. Set LICITAYA_API_KEY in .env or shell env.");
process.exit(1);
}
// Request settings: LICITAYA_* environment defaults, overridden by command-line flags.
const args = parseArgs(process.argv.slice(2));
let url;
try {
url = buildUrl(args.baseUrl, args.endpoint);
} catch (error) {
// buildUrl throws for a missing or placeholder endpoint and for a missing base URL.
console.error(error instanceof Error ? error.message : String(error));
process.exit(1);
}
// Abort the request when it has not completed within args.timeoutMs.
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), args.timeoutMs);
try {
const response = await fetch(url, {
method: args.method,
headers: {
"X-API-KEY": apiKey,
Accept: args.accept,
},
signal: controller.signal,
});
const contentType = response.headers.get("content-type") || "";
const rawBody = await response.text();
const bodyPreview = previewBody(rawBody, contentType);
// Print a short report of the exchange to stdout.
console.log(`URL: ${url.toString()}`);
console.log(`Method: ${args.method}`);
console.log(`Status: ${response.status} ${response.statusText}`);
console.log(`Content-Type: ${contentType || "(none)"}`);
console.log("--- Response Preview ---");
console.log(bodyPreview);
// NOTE(review): a 404 from the search path is reported as "no matches" rather than a missing route —
// presumably how the API signals an empty result; confirm against the API documentation.
if (response.status === 404 && url.pathname.endsWith("/tender/search")) {
console.error("No tenders matched the current filters. Try a broader keyword or fewer filters.");
}
// Any non-2xx status makes the script exit with code 1.
if (!response.ok) {
process.exit(1);
}
} catch (error) {
// An AbortError here comes from the timeout timer above, the only caller of controller.abort().
if (error instanceof Error && error.name === "AbortError") {
console.error(`Request timed out after ${args.timeoutMs} ms.`);
} else {
console.error(error instanceof Error ? error.message : String(error));
}
process.exit(1);
} finally {
// Clear the pending timer so it cannot keep the process alive after a successful request.
clearTimeout(timeout);
}