174 lines
4.7 KiB
JavaScript
174 lines
4.7 KiB
JavaScript
import { spawnSync } from "node:child_process";
|
|
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
import path from "node:path";
|
|
|
|
/**
 * Collapses every run of whitespace in `value` into a single space and
 * removes leading/trailing whitespace.
 *
 * @param {string} value - Text to clean up.
 * @returns {string} The text with single-space separators and no outer padding.
 */
function normalizeSpaces(value) {
  // Trimming first guarantees the split yields no empty edge tokens;
  // an all-whitespace input becomes [""] and joins back to "".
  const tokens = value.trim().split(/\s+/);
  return tokens.join(" ");
}
|
|
|
|
/**
 * Portal settings that override the generic defaults for specific
 * municipalities. Each entry is keyed by `${stateCode}-${municipalityCode}`,
 * the same key that parseMunicipalityRows builds when merging.
 */
const knownOpenPortalMappings = Object.fromEntries([
  [
    "19-019",
    {
      openPortalUrl: "https://licitaciones.sanpedro.gob.mx/Default.aspx?Year=2026&T=1&Pro=1",
      openPortalType: "SAN_PEDRO_ASPX",
      openSyncIntervalDays: 7,
    },
  ],
]);
|
|
|
|
/**
 * Builds one municipality record, filling every portal/scraping field with
 * its generic default.
 *
 * @param {string} stateCode - Two-digit state code.
 * @param {string} stateName - State name.
 * @param {string} municipalityCode - Three-digit municipality code.
 * @param {string} municipalityName - Municipality name.
 * @param {string} areaGeografica - Geographic area label.
 * @returns {object} A new record with the default portal settings.
 */
function createMunicipalityRow(stateCode, stateName, municipalityCode, municipalityName, areaGeografica) {
  return {
    stateCode,
    stateName,
    municipalityCode,
    municipalityName,
    areaGeografica,
    openPortalUrl: null,
    openPortalType: "GENERIC",
    openSyncIntervalDays: 7,
    pntSubjectId: null,
    pntEntityId: null,
    pntSectorId: null,
    pntEntryUrl: null,
    backupUrl: null,
    scrapingEnabled: true,
    isActive: true,
  };
}

/**
 * Parses layout-preserved text into municipality records.
 *
 * A complete row looks like
 * `<2-digit state code> <state> <3-digit municipality code>[*] <municipality> <area>`,
 * where `<area>` is one of the two literal labels below. When the area label
 * is missing from a row, the row is held as "pending" and completed only if
 * the very next line is an indented continuation ending in an area label;
 * otherwise the pending row is dropped.
 *
 * Rows sharing the same `${stateCode}-${municipalityCode}` key are
 * de-duplicated (the last occurrence wins), merged with any matching entry in
 * `knownOpenPortalMappings`, and returned sorted by state code, then
 * municipality code.
 *
 * @param {string} text - Raw text, one table row per line.
 * @returns {object[]} Sorted, de-duplicated municipality records.
 */
function parseMunicipalityRows(text) {
  const areaPattern = "(Resto del País|Zona Libre de la Frontera Norte)";
  const rowRegex = new RegExp(`^\\s*(\\d{2})\\s+(.+?)\\s+(\\d{3})\\*?\\s+(.+?)\\s+${areaPattern}\\s*$`);
  const pendingRegex = /^\s*(\d{2})\s+(.+?)\s+(\d{3})\*?\s+(.+?)\s*$/;
  const continuationRegex = new RegExp(`^\\s+(.+?)\\s+${areaPattern}\\s*$`);
  const startsWithStateCode = /^\s*\d{2}\s+/;

  // Keyed by `${stateCode}-${municipalityCode}`; a later row replaces an
  // earlier one with the same key.
  const rowsByKey = new Map();
  const addRow = (row) => {
    rowsByKey.set(`${row.stateCode}-${row.municipalityCode}`, row);
  };

  let pendingRow = null;

  for (const line of text.split(/\r?\n/)) {
    if (pendingRow) {
      const held = pendingRow;
      // A pending row only survives for one line, matched or not.
      pendingRow = null;

      const continuation = line.match(continuationRegex);

      if (continuation) {
        // The name was split over two lines: keep the fragment from the first
        // line instead of discarding it, so the full name is emitted.
        // NOTE(review): assumes the trailing text of the first line is the
        // start of the municipality name — confirm against the source PDF.
        const fullName = normalizeSpaces(`${held.nameStart} ${continuation[1]}`);

        addRow(
          createMunicipalityRow(
            held.stateCode,
            held.stateName,
            held.municipalityCode,
            fullName,
            normalizeSpaces(continuation[2]),
          ),
        );
        continue;
      }
      // Not a continuation: fall through and treat the line as a fresh row.
    }

    if (!startsWithStateCode.test(line)) {
      continue;
    }

    const match = line.match(rowRegex);

    if (match) {
      addRow(
        createMunicipalityRow(
          match[1],
          normalizeSpaces(match[2]),
          match[3],
          normalizeSpaces(match[4]),
          normalizeSpaces(match[5]),
        ),
      );
      continue;
    }

    const pendingMatch = line.match(pendingRegex);

    if (pendingMatch) {
      pendingRow = {
        stateCode: pendingMatch[1],
        stateName: normalizeSpaces(pendingMatch[2]),
        municipalityCode: pendingMatch[3],
        nameStart: normalizeSpaces(pendingMatch[4]),
      };
    }
  }

  const merged = [...rowsByKey.entries()].map(([key, row]) => {
    const known = knownOpenPortalMappings[key];

    return known ? { ...row, ...known } : row;
  });

  return merged.sort((a, b) => {
    if (a.stateCode !== b.stateCode) {
      return a.stateCode.localeCompare(b.stateCode, "es");
    }

    return a.municipalityCode.localeCompare(b.municipalityCode, "es");
  });
}
|
|
|
|
/**
 * Loads the raw text to parse.
 *
 * Paths ending in ".txt" (case-insensitive) are read directly as UTF-8.
 * Any other path is handed to the external `pdftotext` command with the
 * `-layout` flag, writing to stdout, and the captured output is returned.
 *
 * @param {string} inputPath - Path to a .txt file or to a file for pdftotext.
 * @returns {Promise<string>} The file contents or the extracted text.
 * @throws The spawn error if the command could not be run, otherwise an Error
 *   carrying the command's stderr (or a generic message) when extraction
 *   fails or produces only whitespace.
 */
async function readSourceText(inputPath) {
  const isPlainText = inputPath.toLowerCase().endsWith(".txt");

  if (isPlainText) {
    return readFile(inputPath, "utf8");
  }

  const { status, stdout, stderr, error } = spawnSync("pdftotext", ["-layout", inputPath, "-"], {
    encoding: "utf8",
    maxBuffer: 1024 * 1024 * 20,
  });

  // Success requires a zero exit status and non-blank output.
  const hasUsableOutput = status === 0 && typeof stdout === "string" && stdout.trim();

  if (hasUsableOutput) {
    return stdout;
  }

  // Prefer the spawn failure (e.g. the command is missing) over stderr.
  if (error) {
    throw error;
  }

  throw new Error(stderr || "Failed to extract text from PDF.");
}
|
|
|
|
/**
 * Script entry point: reads the source document, parses the municipality
 * rows, and writes them as pretty-printed JSON.
 *
 * The input path comes from the first CLI argument and the output path from
 * the second; each falls back to a default under the current working
 * directory when the argument is absent or empty.
 *
 * @returns {Promise<void>} Resolves once the JSON file is written and the
 *   summary is logged; rejects if reading, parsing, or writing fails.
 */
async function run() {
  const workingDir = process.cwd();
  const inputPdf = process.argv[2] || path.join(workingDir, "Estructura_municipal_dic22.pdf");
  const outputJson = process.argv[3] || path.join(workingDir, "prisma", "data", "municipalities.json");

  const rawText = await readSourceText(inputPdf);
  const rows = parseMunicipalityRows(rawText);

  if (rows.length === 0) {
    throw new Error("No municipality rows parsed from source text.");
  }

  // Make sure the destination folder exists before writing.
  await mkdir(path.dirname(outputJson), { recursive: true });

  const serialized = `${JSON.stringify(rows, null, 2)}\n`;
  await writeFile(outputJson, serialized, "utf8");

  console.log(`Parsed municipalities: ${rows.length}`);
  console.log(`Output: ${outputJson}`);
}
|
|
|
|
// Start the script; on failure print only the message and mark the process
// as failed without forcing an immediate exit.
run().catch((failure) => {
  const message = failure instanceof Error ? failure.message : String(failure);

  console.error(message);
  process.exitCode = 1;
});
|