// scripts/extract-municipalities-from-pdf.mjs
import { spawnSync } from "node:child_process";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";

/**
 * Collapses every run of whitespace into a single space and strips both ends.
 * @param {string} value - Raw text extracted from the PDF layout.
 * @returns {string} The normalized, single-spaced string.
 */
function normalizeSpaces(value) {
  return value.trim().replace(/\s+/g, " ");
}
/**
 * Hand-curated open-portal overrides, keyed by "<stateCode>-<municipalityCode>"
 * (e.g. "19-019"). When a parsed row's key matches, these fields are spread
 * over the generic defaults in parseMunicipalityRows.
 * Frozen so the shared module-level constant cannot be mutated at runtime.
 */
const knownOpenPortalMappings = Object.freeze({
  "19-019": Object.freeze({
    openPortalUrl: "https://licitaciones.sanpedro.gob.mx/Default.aspx?Year=2026&T=1&Pro=1",
    openPortalType: "SAN_PEDRO_ASPX",
    openSyncIntervalDays: 7,
  }),
});
/**
 * Parses pdftotext -layout output of the INEGI municipal-structure table into
 * row objects (state code/name, municipality code/name, geographic area),
 * merged with any hand-curated portal overrides from knownOpenPortalMappings.
 *
 * Handles rows whose municipality name wraps onto a second line: the first
 * line (no area column) is held as `pendingRow`, and the indented
 * continuation line supplies the name tail plus the area.
 *
 * @param {string} text - Raw layout-preserving text of the source table.
 * @returns {Array<object>} Deduplicated rows sorted by state then municipality code.
 */
function parseMunicipalityRows(text) {
  const lines = text.split(/\r?\n/);
  const areaRegex = "(Resto del País|Zona Libre de la Frontera Norte)";
  // Full row: 2-digit state, state name, 3-digit municipality (optional '*'),
  // municipality name, then the area column at end of line.
  const rowRegex = new RegExp(`^\\s*(\\d{2})\\s+(.+?)\\s+(\\d{3})\\*?\\s+(.+?)\\s+${areaRegex}\\s*$`);
  // Same shape but with no area column — the row wrapped onto the next line.
  const pendingRegex = /^\s*(\d{2})\s+(.+?)\s+(\d{3})\*?\s+(.+?)\s*$/;
  // Indented continuation: tail of the municipality name plus the area.
  const continuationRegex = new RegExp(`^\\s+(.+?)\\s+${areaRegex}\\s*$`);

  // Builds one output row with the generic default portal/PNT fields.
  const buildRow = (stateCode, stateName, municipalityCode, municipalityName, areaGeografica) => ({
    stateCode,
    stateName,
    municipalityCode,
    municipalityName,
    areaGeografica,
    openPortalUrl: null,
    openPortalType: "GENERIC",
    openSyncIntervalDays: 7,
    pntSubjectId: null,
    pntEntityId: null,
    pntSectorId: null,
    pntEntryUrl: null,
    backupUrl: null,
    scrapingEnabled: true,
    isActive: true,
  });

  const rows = [];
  let pendingRow = null;

  for (const line of lines) {
    if (pendingRow) {
      const continuation = line.match(continuationRegex);

      if (continuation) {
        // FIX: prepend the partial name captured on the first line instead of
        // discarding it — wrapped names previously lost their leading words.
        rows.push(
          buildRow(
            pendingRow.stateCode,
            pendingRow.stateName,
            pendingRow.municipalityCode,
            normalizeSpaces(`${pendingRow.municipalityNamePrefix ?? ""} ${continuation[1] ?? ""}`),
            normalizeSpaces(continuation[2] ?? ""),
          ),
        );
        pendingRow = null;
        continue;
      }

      // No continuation arrived; drop the pending fragment and re-examine the line.
      pendingRow = null;
    }

    // Only lines opening with a 2-digit state code can start a row.
    if (!/^\s*\d{2}\s+/.test(line)) {
      continue;
    }

    const match = line.match(rowRegex);

    if (!match) {
      const pendingMatch = line.match(pendingRegex);

      if (pendingMatch) {
        pendingRow = {
          stateCode: pendingMatch[1],
          stateName: normalizeSpaces(pendingMatch[2] ?? ""),
          municipalityCode: (pendingMatch[3] ?? "").padStart(3, "0"),
          municipalityNamePrefix: normalizeSpaces(pendingMatch[4] ?? ""),
        };
      }

      continue;
    }

    rows.push(
      buildRow(
        match[1],
        normalizeSpaces(match[2] ?? ""),
        (match[3] ?? "").padStart(3, "0"),
        normalizeSpaces(match[4] ?? ""),
        normalizeSpaces(match[5] ?? ""),
      ),
    );
  }

  // Deduplicate by state+municipality key; the last occurrence wins.
  const deduped = new Map();

  for (const row of rows) {
    deduped.set(`${row.stateCode}-${row.municipalityCode}`, row);
  }

  // Overlay hand-curated portal metadata where a mapping exists.
  const merged = [...deduped.values()].map((row) => {
    const known = knownOpenPortalMappings[`${row.stateCode}-${row.municipalityCode}`];
    return known ? { ...row, ...known } : row;
  });

  return merged.sort((a, b) => {
    if (a.stateCode !== b.stateCode) {
      return a.stateCode.localeCompare(b.stateCode, "es");
    }

    return a.municipalityCode.localeCompare(b.municipalityCode, "es");
  });
}
/**
 * Loads the source table text: plain .txt files are read directly, anything
 * else is handed to the `pdftotext -layout` CLI and its stdout returned.
 * @param {string} inputPath - Path to a .txt or PDF file.
 * @returns {Promise<string>} The extracted text.
 * @throws {Error} When pdftotext fails to launch or produces no usable output.
 */
async function readSourceText(inputPath) {
  // Plain-text inputs bypass pdftotext entirely.
  if (inputPath.toLowerCase().endsWith(".txt")) {
    return readFile(inputPath, "utf8");
  }

  const extraction = spawnSync("pdftotext", ["-layout", inputPath, "-"], {
    encoding: "utf8",
    maxBuffer: 1024 * 1024 * 20,
  });

  const succeeded =
    extraction.status === 0 &&
    typeof extraction.stdout === "string" &&
    Boolean(extraction.stdout.trim());

  if (succeeded) {
    return extraction.stdout;
  }

  // Spawn-level failures (e.g. pdftotext not installed) surface as-is.
  if (extraction.error) {
    throw extraction.error;
  }

  throw new Error(extraction.stderr || "Failed to extract text from PDF.");
}
/**
 * Entry point: extracts the municipality table from the input PDF (or .txt)
 * and writes it as pretty-printed JSON to the output path.
 *
 * CLI: node extract-municipalities-from-pdf.mjs [inputPdf] [outputJson]
 * Defaults: ./Estructura_municipal_dic22.pdf → ./prisma/data/municipalities.json
 *
 * @returns {Promise<void>} Resolves after the JSON file has been written.
 * @throws {Error} When no rows could be parsed from the source text.
 */
async function run() {
  const inputPdf = process.argv[2] || path.join(process.cwd(), "Estructura_municipal_dic22.pdf");
  const outputJson = process.argv[3] || path.join(process.cwd(), "prisma", "data", "municipalities.json");

  const rawText = await readSourceText(inputPdf);
  const rows = parseMunicipalityRows(rawText);

  if (!rows.length) {
    throw new Error("No municipality rows parsed from source text.");
  }

  await mkdir(path.dirname(outputJson), { recursive: true });
  await writeFile(outputJson, `${JSON.stringify(rows, null, 2)}\n`, "utf8");

  console.log(`Parsed municipalities: ${rows.length}`);
  console.log(`Output: ${outputJson}`);
}
// Kick off the script; on failure, print a readable message and mark the
// process as failed without abruptly terminating pending I/O.
run().catch((error) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(message);
  process.exitCode = 1;
});