initial push

This commit is contained in:
Marcelo Dares
2026-03-15 15:03:56 +01:00
parent d48b9d5352
commit 65aaf9275e
146 changed files with 70245 additions and 100 deletions

View File

@@ -0,0 +1,173 @@
import { spawnSync } from "node:child_process";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";
/**
 * Collapses every run of whitespace in a string to a single space and
 * removes leading/trailing whitespace.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} The text with single-space separators and no outer whitespace.
 */
function normalizeSpaces(value) {
  // Splitting on whitespace runs leaves empty strings at the edges when the
  // input starts or ends with whitespace; dropping them performs the trim.
  const words = value.split(/\s+/).filter((word) => word !== "");
  return words.join(" ");
}
// Hand-maintained portal settings for specific municipalities.
// Keys have the form "<stateCode>-<municipalityCode>", the same key that
// parseMunicipalityRows builds for each parsed row. When a key matches, the
// fields listed here are spread over that row's default values.
const knownOpenPortalMappings = {
"19-019": {
openPortalUrl: "https://licitaciones.sanpedro.gob.mx/Default.aspx?Year=2026&T=1&Pro=1",
openPortalType: "SAN_PEDRO_ASPX",
openSyncIntervalDays: 7,
},
};
/**
 * Builds one municipality record: the parsed identity fields followed by the
 * default portal/scraping settings every row starts with.
 *
 * @param {{stateCode: string, stateName: string, municipalityCode: string,
 *   municipalityName: string, areaGeografica: string}} parsed - Fields read from the source text.
 * @returns {object} A new record; key order is kept stable because the result is serialized to JSON.
 */
function buildMunicipalityRow({ stateCode, stateName, municipalityCode, municipalityName, areaGeografica }) {
  return {
    stateCode,
    stateName,
    municipalityCode,
    municipalityName,
    areaGeografica,
    openPortalUrl: null,
    openPortalType: "GENERIC",
    openSyncIntervalDays: 7,
    pntSubjectId: null,
    pntEntityId: null,
    pntSectorId: null,
    pntEntryUrl: null,
    backupUrl: null,
    scrapingEnabled: true,
    isActive: true,
  };
}

/**
 * Parses the layout-preserving text of the municipality table into records.
 *
 * A complete row has the shape
 *   <2-digit state code> <state name> <3-digit municipality code>[*] <municipality name> <area>
 * where <area> is one of the two literal area names below. A row may also be
 * wrapped over two lines: the first line carries the codes and the start of
 * the municipality name, the second (indented) line carries the rest of the
 * name followed by the area.
 *
 * Rows sharing the same state/municipality code are de-duplicated (the last
 * occurrence wins), entries present in `knownOpenPortalMappings` are merged
 * over the defaults, and the result is sorted by state code and then by
 * municipality code.
 *
 * @param {string} text - Raw text of the source document.
 * @returns {object[]} Sorted municipality records (empty when nothing matched).
 */
function parseMunicipalityRows(text) {
  const areaRegex = "(Resto del País|Zona Libre de la Frontera Norte)";
  const rowRegex = new RegExp(`^\\s*(\\d{2})\\s+(.+?)\\s+(\\d{3})\\*?\\s+(.+?)\\s+${areaRegex}\\s*$`);
  const pendingRegex = /^\s*(\d{2})\s+(.+?)\s+(\d{3})\*?\s+(.+?)\s*$/;
  const continuationRegex = new RegExp(`^\\s+(.+?)\\s+${areaRegex}\\s*$`);

  const rows = [];
  // First half of a wrapped row, waiting for the line that carries the area.
  let pendingRow = null;

  for (const line of text.split(/\r?\n/)) {
    // A complete row is tested first: continuationRegex would also match an
    // indented complete row, so checking it first would glue that row onto an
    // unfinished pending one.
    const match = line.match(rowRegex);
    if (match) {
      pendingRow = null;
      rows.push(
        buildMunicipalityRow({
          stateCode: match[1],
          stateName: normalizeSpaces(match[2]),
          municipalityCode: match[3],
          municipalityName: normalizeSpaces(match[4]),
          areaGeografica: normalizeSpaces(match[5]),
        }),
      );
      continue;
    }

    if (pendingRow) {
      // A pending row is only valid for the line immediately after it.
      const wrapped = pendingRow;
      pendingRow = null;
      const continuation = line.match(continuationRegex);
      if (continuation) {
        rows.push(
          buildMunicipalityRow({
            stateCode: wrapped.stateCode,
            stateName: wrapped.stateName,
            municipalityCode: wrapped.municipalityCode,
            // Join both halves of the wrapped name. The second half may be
            // blank when only the area wrapped; normalizeSpaces trims that.
            municipalityName: normalizeSpaces(`${wrapped.municipalityNameStart} ${continuation[1]}`),
            areaGeografica: normalizeSpaces(continuation[2]),
          }),
        );
        continue;
      }
      // Not a continuation: the unfinished row is discarded (it has no area)
      // and the current line is examined as a possible new wrapped row.
    }

    const pendingMatch = line.match(pendingRegex);
    if (pendingMatch) {
      pendingRow = {
        stateCode: pendingMatch[1],
        stateName: normalizeSpaces(pendingMatch[2]),
        municipalityCode: pendingMatch[3],
        municipalityNameStart: normalizeSpaces(pendingMatch[4]),
      };
    }
  }

  // De-duplicate on "<stateCode>-<municipalityCode>"; later rows replace earlier ones.
  const deduped = new Map();
  for (const row of rows) {
    deduped.set(`${row.stateCode}-${row.municipalityCode}`, row);
  }

  const merged = [...deduped.entries()].map(([key, row]) => {
    const known = knownOpenPortalMappings[key];
    return known ? { ...row, ...known } : row;
  });

  return merged.sort((a, b) => {
    if (a.stateCode !== b.stateCode) {
      return a.stateCode.localeCompare(b.stateCode, "es");
    }
    return a.municipalityCode.localeCompare(b.municipalityCode, "es");
  });
}
/**
 * Loads the source document as text.
 *
 * Paths ending in ".txt" (case-insensitive) are read directly as UTF-8. Any
 * other path is passed to the external `pdftotext` command with `-layout`,
 * writing to stdout.
 *
 * @param {string} inputPath - Path to a .txt file or a file `pdftotext` can read.
 * @returns {Promise<string>} The document text.
 * @throws {Error} The spawn error when the command could not be run, otherwise
 *   an error carrying the command's stderr (or a generic message) when it
 *   exited non-zero or produced only whitespace.
 */
async function readSourceText(inputPath) {
  const isPlainText = inputPath.toLowerCase().endsWith(".txt");
  if (isPlainText) {
    return readFile(inputPath, "utf8");
  }

  const { status, stdout, stderr, error } = spawnSync(
    "pdftotext",
    ["-layout", inputPath, "-"],
    {
      encoding: "utf8",
      // 20 MiB of captured output.
      maxBuffer: 20 * 1024 * 1024,
    },
  );

  const hasText = status === 0 && typeof stdout === "string" && Boolean(stdout.trim());
  if (hasText) {
    return stdout;
  }
  if (error) {
    throw error;
  }
  throw new Error(stderr || "Failed to extract text from PDF.");
}
/**
 * Script entry point: reads the source document, parses the municipality
 * rows and writes them as pretty-printed JSON.
 *
 * The input path comes from the first CLI argument and the output path from
 * the second; each falls back to a default under the current working
 * directory when missing or empty.
 *
 * @returns {Promise<void>} Settles once the JSON file is written and the summary is logged.
 * @throws {Error} When no rows were parsed, or when reading/writing fails.
 */
async function run() {
  const [, , inputArg, outputArg] = process.argv;
  const inputPdf = inputArg || path.join(process.cwd(), "Estructura_municipal_dic22.pdf");
  const outputJson = outputArg || path.join(process.cwd(), "prisma", "data", "municipalities.json");

  const rawText = await readSourceText(inputPdf);
  const rows = parseMunicipalityRows(rawText);
  if (!rows.length) {
    throw new Error("No municipality rows parsed from source text.");
  }

  // Create the destination directory first so a fresh checkout works.
  await mkdir(path.dirname(outputJson), { recursive: true });
  const serialized = `${JSON.stringify(rows, null, 2)}\n`;
  await writeFile(outputJson, serialized, "utf8");

  console.log(`Parsed municipalities: ${rows.length}`);
  console.log(`Output: ${outputJson}`);
}
// Start the script. On failure, print only the error message (or the
// stringified value when it is not an Error) and set a non-zero exit code
// instead of calling process.exit(), so the process ends on its own.
run().catch((error) => {
console.error(error instanceof Error ? error.message : String(error));
process.exitCode = 1;
});