import { spawnSync } from "node:child_process";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";

/**
 * Collapses every run of whitespace into a single space and trims both ends.
 *
 * @param {string} value - Raw text fragment.
 * @returns {string} The fragment with normalized spacing.
 */
function normalizeSpaces(value) {
  return value.replace(/\s+/g, " ").trim();
}

/**
 * Portal overrides keyed by `${stateCode}-${municipalityCode}`. The fields of a
 * matching entry replace the generic defaults of the parsed row.
 */
const knownOpenPortalMappings = {
  "19-019": {
    openPortalUrl: "https://licitaciones.sanpedro.gob.mx/Default.aspx?Year=2026&T=1&Pro=1",
    openPortalType: "SAN_PEDRO_ASPX",
    openSyncIntervalDays: 7,
  },
};

// The two area labels that terminate a catalogue row (one capturing group).
const areaPattern = "(Resto del País|Zona Libre de la Frontera Norte)";

// Complete row: state code, state name, municipality code (an optional trailing
// "*" is accepted and dropped), municipality name, area label.
const rowPattern = new RegExp(
  `^\\s*(\\d{2})\\s+(.+?)\\s+(\\d{3})\\*?\\s+(.+?)\\s+${areaPattern}\\s*$`,
);

// Row start without an area label: the row is expected to finish on the next line.
const pendingRowPattern = /^\s*(\d{2})\s+(.+?)\s+(\d{3})\*?\s+(.+?)\s*$/;

// Indented second line of a wrapped row: remaining name text, then the area label.
const continuationPattern = new RegExp(`^\\s+(.+?)\\s+${areaPattern}\\s*$`);

/**
 * Builds one output record from the parsed columns, filling every remaining
 * field with its generic default. Key order is kept stable because it is the
 * key order of the emitted JSON.
 *
 * @param {{stateCode: string, stateName: string, municipalityCode: string,
 *   municipalityName: string, areaGeografica: string}} columns - Parsed columns.
 * @returns {object} A municipality record with default portal settings.
 */
function buildMunicipalityRow({
  stateCode,
  stateName,
  municipalityCode,
  municipalityName,
  areaGeografica,
}) {
  return {
    stateCode,
    stateName: normalizeSpaces(stateName),
    municipalityCode,
    municipalityName: normalizeSpaces(municipalityName),
    areaGeografica: normalizeSpaces(areaGeografica),
    openPortalUrl: null,
    openPortalType: "GENERIC",
    openSyncIntervalDays: 7,
    pntSubjectId: null,
    pntEntityId: null,
    pntSectorId: null,
    pntEntryUrl: null,
    backupUrl: null,
    scrapingEnabled: true,
    isActive: true,
  };
}

/**
 * Parses the layout-preserving text of the municipality catalogue into records.
 *
 * A record comes either from a single complete line or from a row that wraps
 * onto a second line (row start without area label, followed by an indented
 * line that ends with the area label). Records are de-duplicated by
 * `${stateCode}-${municipalityCode}` (the last occurrence wins), merged with
 * `knownOpenPortalMappings`, and sorted by state code, then municipality code.
 *
 * @param {string} text - Extracted catalogue text.
 * @returns {object[]} Sorted municipality records; empty when nothing matched.
 */
function parseMunicipalityRows(text) {
  const rows = [];
  let pendingRow = null;

  for (const line of text.split(/\r?\n/)) {
    // A pending row may only be completed by the line that directly follows it.
    const previousPending = pendingRow;
    pendingRow = null;

    // Complete rows are checked first: an indented complete row also satisfies
    // the continuation pattern and must not be absorbed into a pending row.
    const rowMatch = line.match(rowPattern);
    if (rowMatch) {
      const [, stateCode, stateName, municipalityCode, municipalityName, areaGeografica] =
        rowMatch;
      rows.push(
        buildMunicipalityRow({
          stateCode,
          stateName,
          municipalityCode,
          municipalityName,
          areaGeografica,
        }),
      );
      continue;
    }

    if (previousPending) {
      const continuation = line.match(continuationPattern);
      if (continuation) {
        const { municipalityNameStart, ...rowStart } = previousPending;
        rows.push(
          buildMunicipalityRow({
            ...rowStart,
            // The name is split across both lines; keep both fragments.
            // NOTE(review): assumes the text after the municipality code on the
            // first line is the beginning of the name — confirm against the PDF.
            municipalityName: `${municipalityNameStart} ${continuation[1]}`,
            areaGeografica: continuation[2],
          }),
        );
        continue;
      }
    }

    const pendingMatch = line.match(pendingRowPattern);
    if (pendingMatch) {
      const [, stateCode, stateName, municipalityCode, municipalityNameStart] = pendingMatch;
      pendingRow = { stateCode, stateName, municipalityCode, municipalityNameStart };
    }
  }

  // Last occurrence of a key wins.
  const rowsByKey = new Map();
  for (const row of rows) {
    rowsByKey.set(`${row.stateCode}-${row.municipalityCode}`, row);
  }

  const merged = [...rowsByKey].map(([key, row]) => {
    const known = knownOpenPortalMappings[key];
    return known ? { ...row, ...known } : row;
  });

  return merged.sort(
    (a, b) =>
      a.stateCode.localeCompare(b.stateCode, "es") ||
      a.municipalityCode.localeCompare(b.municipalityCode, "es"),
  );
}

/**
 * Returns the catalogue text for the given input file. Paths ending in `.txt`
 * are read directly as UTF-8; any other path is passed to the external
 * `pdftotext -layout` command and its standard output is returned.
 *
 * @param {string} inputPath - Path to a `.txt` file or a PDF.
 * @returns {Promise<string>} The source text.
 * @throws {Error} The spawn error when the command could not be started,
 *   otherwise an error carrying the command's stderr (or a generic message)
 *   when it failed or produced no text.
 */
async function readSourceText(inputPath) {
  if (inputPath.toLowerCase().endsWith(".txt")) {
    return readFile(inputPath, "utf8");
  }

  const result = spawnSync("pdftotext", ["-layout", inputPath, "-"], {
    encoding: "utf8",
    maxBuffer: 1024 * 1024 * 20,
  });

  if (result.status === 0 && typeof result.stdout === "string" && result.stdout.trim()) {
    return result.stdout;
  }

  if (result.error) {
    throw result.error;
  }

  throw new Error(result.stderr || "Failed to extract text from PDF.");
}

/**
 * Script entry point. Reads the source named by the first CLI argument
 * (default: `Estructura_municipal_dic22.pdf` in the working directory), parses
 * it and writes the records as pretty-printed JSON to the path named by the
 * second CLI argument (default: `prisma/data/municipalities.json` under the
 * working directory), creating the output directory if needed.
 *
 * @returns {Promise<void>}
 * @throws {Error} When no rows could be parsed, or when reading/writing fails.
 */
async function run() {
  const inputPdf = process.argv[2] || path.join(process.cwd(), "Estructura_municipal_dic22.pdf");
  const outputJson =
    process.argv[3] || path.join(process.cwd(), "prisma", "data", "municipalities.json");

  const rawText = await readSourceText(inputPdf);
  const rows = parseMunicipalityRows(rawText);
  if (!rows.length) {
    throw new Error("No municipality rows parsed from source text.");
  }

  await mkdir(path.dirname(outputJson), { recursive: true });
  await writeFile(outputJson, `${JSON.stringify(rows, null, 2)}\n`, "utf8");

  console.log(`Parsed municipalities: ${rows.length}`);
  console.log(`Output: ${outputJson}`);
}

run().catch((error) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exitCode = 1;
});