fifteen/worker/src/jobs/download-pbf.ts

126 lines
3.9 KiB
TypeScript

import type { Job } from "bullmq";
import { once } from "events";
import { createWriteStream, mkdirSync, statSync, renameSync, rmSync } from "fs";
import { Writable } from "stream";
import type { JobProgress } from "@transportationer/shared";
/** Payload of a `download-pbf` job, as read by `handleDownloadPbf`. */
export type DownloadPbfData = {
/** Literal tag identifying this payload as a download-pbf job. */
type: "download-pbf";
/** Used to build the output file name: `<OSM_DATA_DIR>/<citySlug>-latest.osm.pbf`. */
citySlug: string;
/** URL to fetch; the handler rejects it unless it matches `ALLOWED_PATTERN`. */
geofabrikUrl: string;
/**
 * Optional total size in bytes used for progress reporting. When present it
 * takes precedence over the response's `content-length` header.
 */
expectedBytes?: number;
};
// Allow-list for download URLs: HTTPS on download.geofabrik.de only, a path
// made of word characters, `/` and `-` (so no `.` and therefore no `..`
// segments), ending in `-latest.osm.pbf`.
const ALLOWED_PATTERN =
/^https:\/\/download\.geofabrik\.de\/[\w][\w/-]+-latest\.osm\.pbf$/;
// Directory the PBF files are written to. Taken from the OSM_DATA_DIR
// environment variable, falling back to "/data/osm" when it is unset.
const OSM_DATA_DIR = process.env.OSM_DATA_DIR ?? "/data/osm";
/**
 * Job handler that downloads a Geofabrik `.osm.pbf` extract to
 * `<OSM_DATA_DIR>/<citySlug>-latest.osm.pbf`.
 *
 * The body is streamed into a job-specific temp file and renamed onto the
 * final path only once it is completely written, so other readers see either
 * a complete file or no file. If a non-empty file already exists at the final
 * path the download is skipped.
 *
 * On any failure the HTTP body is cancelled, the file stream is destroyed and
 * the temp file is removed before the error is rethrown.
 *
 * @param job - Job whose `data` carries the city slug, the Geofabrik URL and
 *   an optional expected size used for progress reporting.
 * @returns Resolves once the file is in place (or was already present).
 * @throws Error when the URL is not an allowed Geofabrik PBF URL, when the
 *   server answers with a non-OK status or no body, when the body is empty,
 *   or when reading the response / writing the file fails.
 */
export async function handleDownloadPbf(
  job: Job<DownloadPbfData>,
): Promise<void> {
  const { citySlug, geofabrikUrl, expectedBytes } = job.data;

  if (!ALLOWED_PATTERN.test(geofabrikUrl)) {
    throw new Error(`Rejected URL (must be a Geofabrik PBF): ${geofabrikUrl}`);
  }

  mkdirSync(OSM_DATA_DIR, { recursive: true });

  const outputPath = `${OSM_DATA_DIR}/${citySlug}-latest.osm.pbf`;
  // Use job.id in the tmp path so two concurrent download-pbf jobs for the
  // same city (one under extract-pois, one under build-valhalla) don't write
  // to the same file and corrupt each other.
  const tmpPath = `${outputPath}.${job.id}.tmp`;

  // Idempotency: skip if a complete file is already on disk (supports
  // parallel download-pbf instances for the same city PBF). The stat lookup
  // is kept separate from the progress update so that a failing
  // updateProgress is reported instead of silently triggering a re-download.
  const existingSize = existingFileSize(outputPath);
  if (existingSize > 0) {
    await job.updateProgress({
      stage: "Downloading PBF",
      pct: 100,
      message: `Already on disk: ${outputPath} (${(existingSize / 1_048_576).toFixed(1)} MB)`,
    } satisfies JobProgress);
    return;
  }

  await job.updateProgress({
    stage: "Downloading PBF",
    pct: 0,
    message: `Starting download from Geofabrik…`,
  } satisfies JobProgress);

  const response = await fetch(geofabrikUrl, {
    headers: { "User-Agent": "Transportationer/1.0" },
  });
  if (!response.ok || !response.body) {
    // Release the connection instead of leaving an unread body behind.
    await response.body?.cancel().catch(() => {});
    throw new Error(`HTTP ${response.status} from ${geofabrikUrl}`);
  }

  // A missing or malformed content-length must not leak NaN into progress.
  const headerBytes = Number.parseInt(
    response.headers.get("content-length") ?? "",
    10,
  );
  const totalBytes =
    expectedBytes ?? (Number.isFinite(headerBytes) ? headerBytes : 0);

  let downloaded = 0;
  let lastPct = -1;

  // Write to a temp path; rename to final path on completion so concurrent
  // instances see a complete file or nothing (never a partial file).
  const fileStream = createWriteStream(tmpPath);
  const reader = response.body.getReader();

  // The listener is attached before any data flows so a write-side failure
  // (e.g. EACCES, ENOSPC) is never an unhandled 'error' event. It records the
  // first error and cancels the reader so a pending read() resolves and the
  // loop below can rethrow it.
  let writeError: Error | undefined;
  fileStream.on("error", (err) => {
    if (!writeError) writeError = err;
    void reader.cancel(err).catch(() => {});
  });

  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (writeError) throw writeError;
      if (done) break;
      if (!value) continue;

      downloaded += value.byteLength;
      // Clamp: expectedBytes / content-length may underestimate the body.
      const pct =
        totalBytes > 0
          ? Math.min(100, Math.floor((downloaded / totalBytes) * 100))
          : 0;
      if (pct !== lastPct) {
        lastPct = pct;
        // Progress is best-effort; a failed update must not abort the download.
        void job
          .updateProgress({
            stage: "Downloading PBF",
            pct,
            message: `${(downloaded / 1_048_576).toFixed(1)} MB${totalBytes ? ` / ${(totalBytes / 1_048_576).toFixed(1)} MB` : ""}`,
            bytesDownloaded: downloaded,
            totalBytes,
          } satisfies JobProgress)
          .catch(() => {});
      }

      await writeChunk(fileStream, value);
    }

    // Wait for 'close' (not just 'finish') so the descriptor is released
    // before the rename. `once` rejects if 'error' is emitted first.
    const closed = once(fileStream, "close");
    fileStream.end();
    await closed;

    // An empty file would be treated as "missing" by the idempotency check
    // above, so refuse to promote it.
    if (downloaded === 0) {
      throw new Error(`Empty response body from ${geofabrikUrl}`);
    }

    // Atomically promote the temp file to the final path.
    renameSync(tmpPath, outputPath);
  } catch (err: unknown) {
    await reader.cancel().catch(() => {});
    fileStream.destroy();
    try {
      rmSync(tmpPath, { force: true });
    } catch {
      // Best-effort cleanup; the original failure is what matters.
    }
    // Prefer the underlying write failure over any secondary error.
    throw writeError ?? err;
  }

  await job.updateProgress({
    stage: "Downloading PBF",
    pct: 100,
    message: `Download complete: ${outputPath}`,
  } satisfies JobProgress);
}

/** Size in bytes of the file at `path`, or 0 when it cannot be stat'ed. */
function existingFileSize(path: string): number {
  try {
    return statSync(path).size;
  } catch {
    // File doesn't exist (or is not accessible) — treat as missing.
    return 0;
  }
}

/** Write one chunk, waiting for 'drain' when the stream's buffer is full. */
async function writeChunk(stream: Writable, chunk: Uint8Array): Promise<void> {
  if (stream.destroyed) {
    // A destroyed stream never emits 'drain'; fail instead of waiting forever.
    throw new Error("Output stream was closed before the download finished");
  }
  if (!stream.write(chunk)) {
    await once(stream, "drain");
  }
}