/**
 * Download and filter a GTFS feed per city so Valhalla can build transit tiles.
 *
 * The raw (unfiltered) feed is downloaded once and cached in
 * GTFS_DATA_DIR/raw/{cacheKey}/. Subsequent calls for other cities re-use the
 * raw cache without re-downloading.
 *
 * A per-city filtered feed is written to GTFS_DATA_DIR/{citySlug}/feed/ and
 * clipped to the city's bounding box. This directory is the transit_feeds_dir
 * for that city's valhalla_ingest_transit + valhalla_build_tiles run.
 *
 * Source: https://download.gtfs.de/germany/nv_free/latest.zip
 */
import { createHash } from "crypto";
import type { Job } from "bullmq";
import {
  createReadStream,
  createWriteStream,
  existsSync,
  mkdirSync,
  readdirSync,
  renameSync,
  rmSync,
  readFileSync,
  writeFileSync,
  copyFileSync,
} from "fs";
import { mkdir } from "fs/promises";
import { pipeline } from "stream/promises";
import { Readable } from "stream";
import { createInterface } from "readline";
import * as path from "path";
import unzipper from "unzipper";
import type { JobProgress } from "@transportationer/shared";

export type DownloadGtfsData = {
  type: "download-gtfs";
  url: string;
  citySlug: string;
  /** City bbox [minLng, minLat, maxLng, maxLat] already including buffer. */
  bbox: [number, number, number, number];
  /** Re-download even if raw data already exists (default: false) */
  force?: boolean;
};

/** Bounding box as [minLng, minLat, maxLng, maxLat]. */
type Bbox = [number, number, number, number];

const GTFS_DATA_DIR = process.env.GTFS_DATA_DIR ?? "/data/valhalla/gtfs";

/**
 * Cache key derived from feed URL + current date (YYYY-MM-DD, UTC).
 * Including the date ensures feeds are re-downloaded daily.
 *
 * The result changes at midnight UTC, so a job must resolve the key once and
 * pass it to the path helpers below instead of letting each helper recompute it.
 */
function urlCacheKey(url: string): string {
  const today = new Date().toISOString().slice(0, 10);
  return createHash("sha256").update(`${url}:${today}`).digest("hex").slice(0, 8);
}

/**
 * Remove prior-day raw-cache dirs for the same source URL.
 *
 * Best-effort: a directory that cannot be removed is logged and skipped.
 *
 * @param url        Feed URL whose stale caches should be removed.
 * @param currentKey Cache key of the directory that must be kept.
 */
function pruneStaleRawDirs(url: string, currentKey: string): void {
  const rawBase = `${GTFS_DATA_DIR}/raw`;
  if (!existsSync(rawBase)) return;
  for (const entry of readdirSync(rawBase)) {
    if (entry === currentKey) continue;
    // Only directories carrying a valid marker for this exact URL are touched.
    const marker = readMarkerFile(path.join(rawBase, entry, ".source"), isRawMarker);
    if (marker?.source !== url) continue;
    try {
      rmSync(path.join(rawBase, entry), { recursive: true, force: true });
      console.log(`[download-gtfs] Pruned stale raw cache: ${entry}`);
    } catch (err) {
      console.warn(`[download-gtfs] Could not prune stale raw cache ${entry}:`, err);
    }
  }
}

/**
 * Per-URL raw (unfiltered) feed directory — one per distinct feed source.
 *
 * @param cacheKey Pre-resolved cache key; defaults to today's key for `url`.
 */
function rawDir(url: string, cacheKey: string = urlCacheKey(url)): string {
  return `${GTFS_DATA_DIR}/raw/${cacheKey}`;
}

/** Location of the temporary downloaded zip for a feed URL. */
function zipPath(url: string, cacheKey: string = urlCacheKey(url)): string {
  return `${GTFS_DATA_DIR}/${cacheKey}.zip`;
}

/** Location of the `.source` marker inside the raw feed directory. */
function rawMarkerPath(url: string, cacheKey: string = urlCacheKey(url)): string {
  return `${rawDir(url, cacheKey)}/.source`;
}

/**
 * Bump this when the filtering algorithm changes in a way that produces
 * different output from the same source + bbox. Forces a re-filter on the
 * existing raw data without re-downloading.
 */
const FILTER_VERSION = 3;

interface RawMarker {
  source: string;
}

interface CityMarker {
  source: string;
  bbox: Bbox;
  filterVersion: number;
}

/** Runtime check that parsed JSON has the shape of a {@link RawMarker}. */
function isRawMarker(value: unknown): value is RawMarker {
  if (typeof value !== "object" || value === null) return false;
  return typeof (value as { source?: unknown }).source === "string";
}

/** Runtime check that parsed JSON has the shape of a {@link CityMarker}. */
function isCityMarker(value: unknown): value is CityMarker {
  if (typeof value !== "object" || value === null) return false;
  const m = value as { source?: unknown; bbox?: unknown; filterVersion?: unknown };
  const bbox = m.bbox;
  return (
    typeof m.source === "string" &&
    typeof m.filterVersion === "number" &&
    Array.isArray(bbox) &&
    bbox.length === 4 &&
    bbox.every((n) => typeof n === "number")
  );
}

/**
 * Read and validate a JSON marker file.
 *
 * @returns The parsed marker, or `null` when the file is missing, unreadable,
 *          not valid JSON, or does not pass `isValid`.
 */
function readMarkerFile<T>(filePath: string, isValid: (value: unknown) => value is T): T | null {
  if (!existsSync(filePath)) return null;
  try {
    const parsed: unknown = JSON.parse(readFileSync(filePath, "utf8"));
    return isValid(parsed) ? parsed : null;
  } catch {
    // A corrupt marker is treated the same as a missing one.
    return null;
  }
}

function readRawMarker(url: string, cacheKey: string = urlCacheKey(url)): RawMarker | null {
  return readMarkerFile(rawMarkerPath(url, cacheKey), isRawMarker);
}

function cityFeedDir(citySlug: string): string {
  return `${GTFS_DATA_DIR}/${citySlug}/feed`;
}

function cityMarkerPath(citySlug: string): string {
  return `${cityFeedDir(citySlug)}/.source`;
}

function readCityMarker(citySlug: string): CityMarker | null {
  return readMarkerFile(cityMarkerPath(citySlug), isCityMarker);
}

function writeCityMarker(citySlug: string, source: string, bbox: Bbox): void {
  const marker: CityMarker = { source, bbox, filterVersion: FILTER_VERSION };
  writeFileSync(cityMarkerPath(citySlug), JSON.stringify(marker));
}

function bboxEqual(a: Bbox, b: Bbox): boolean {
  return a[0] === b[0] && a[1] === b[1] && a[2] === b[2] && a[3] === b[3];
}

// ─── GTFS bbox filter ─────────────────────────────────────────────────────────

/**
 * Split one CSV line into unquoted field values.
 * Handles double-quoted fields, embedded commas and `""` escapes; lines without
 * any quote character take a fast `split(",")` path.
 */
function splitCsv(line: string): string[] {
  if (!line.includes('"')) return line.split(",");
  const result: string[] = [];
  let current = "";
  let inQuotes = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (ch === '"') {
      if (inQuotes && line[i + 1] === '"') {
        // Escaped quote inside a quoted field.
        current += '"';
        i++;
      } else {
        inQuotes = !inQuotes;
      }
    } else if (ch === "," && !inQuotes) {
      result.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  result.push(current);
  return result;
}

/** Map each header column name (trimmed, BOM removed) to its zero-based index. */
function colIndex(header: string): Map<string, number> {
  return new Map<string, number>(
    splitCsv(header).map((c, i): [string, number] => [c.trim().replace(/^\uFEFF/, ""), i]),
  );
}

/** True when (lat, lon) lies inside `bbox`, edges inclusive. NaN coordinates are outside. */
function inBbox(lat: number, lon: number, bbox: Bbox): boolean {
  const [minLng, minLat, maxLng, maxLat] = bbox;
  return lat >= minLat && lat <= maxLat && lon >= minLng && lon <= maxLng;
}

/** Read a text file fully and return its non-blank lines. */
function readCsvLines(filePath: string): string[] {
  return readFileSync(filePath, "utf8")
    .split(/\r?\n/)
    .filter((l) => l.trim());
}

/** A CSV file held in memory: raw header line, column index and raw data lines. */
interface CsvTable {
  header: string;
  idx: Map<string, number>;
  rows: string[];
}

/** Load a CSV file that fits in memory. Returns `null` when it has no header line. */
function loadCsv(filePath: string): CsvTable | null {
  const lines = readCsvLines(filePath);
  const header = lines[0];
  if (header === undefined) return null;
  return { header, idx: colIndex(header), rows: lines.slice(1) };
}

/**
 * Return the raw rows of `table` whose `idColumn` value is contained in `validIds`.
 * `onKept` is invoked with the parsed fields of every retained row.
 */
function filterRowsById(
  table: CsvTable,
  idColumn: string,
  validIds: ReadonlySet<string>,
  onKept?: (idx: Map<string, number>, fields: string[]) => void,
): string[] {
  // A missing column resolves to index -1, whose lookup yields "" and matches nothing.
  const col = table.idx.get(idColumn) ?? -1;
  return table.rows.filter((row) => {
    const fields = splitCsv(row);
    if (!validIds.has(fields[col] ?? "")) return false;
    onKept?.(table.idx, fields);
    return true;
  });
}

/** Write a header line plus data rows as a newline-terminated CSV file. */
function writeCsv(destPath: string, header: string, rows: string[]): void {
  writeFileSync(destPath, [header, ...rows].join("\n") + "\n");
}

/**
 * Copy `srcPath` to `destPath`, keeping only rows whose `idColumn` is in `validIds`.
 * A missing or completely empty source produces no output; a header-only source
 * produces a header-only output.
 */
function writeFilteredById(
  srcPath: string,
  destPath: string,
  idColumn: string,
  validIds: ReadonlySet<string>,
  onKept?: (idx: Map<string, number>, fields: string[]) => void,
): void {
  if (!existsSync(srcPath)) return;
  const table = loadCsv(srcPath);
  if (table === null) return;
  writeCsv(destPath, table.header, filterRowsById(table, idColumn, validIds, onKept));
}

/**
 * Filter a small CSV file in place.
 *
 * @param filePath File to rewrite; left untouched when missing or without data rows.
 * @param keepRow  Decides per row whether it is retained.
 * @param onKept   Optional hook invoked for every retained row.
 */
function filterSmallCsv(
  filePath: string,
  keepRow: (idx: Map<string, number>, fields: string[]) => boolean,
  onKept?: (idx: Map<string, number>, fields: string[]) => void,
): void {
  if (!existsSync(filePath)) return;
  const table = loadCsv(filePath);
  if (table === null || table.rows.length === 0) return;
  const kept = table.rows.filter((row) => {
    const fields = splitCsv(row);
    if (!keepRow(table.idx, fields)) return false;
    if (onKept) onKept(table.idx, fields);
    return true;
  });
  writeCsv(filePath, table.header, kept);
}

/**
 * Stream `srcPath` line by line into `destPath`, always keeping the header and
 * keeping a data line only when the row filter accepts it.
 *
 * Output goes to `destPath + ".tmp"` first and is renamed on success, so a
 * failed run never leaves a truncated destination file. `pipeline` handles
 * write backpressure and surfaces write errors as a rejection.
 *
 * @param makeRowFilter Called once with the header's column index; returns the
 *                      predicate applied to every subsequent raw line.
 */
async function streamFilterCsv(
  srcPath: string,
  destPath: string,
  makeRowFilter: (idx: Map<string, number>) => (line: string) => boolean,
): Promise<void> {
  const tmpPath = destPath + ".tmp";

  async function* keptLines(): AsyncGenerator<string> {
    let keep: ((line: string) => boolean) | null = null;
    const rl = createInterface({ input: createReadStream(srcPath), crlfDelay: Infinity });
    for await (const line of rl) {
      if (!line.trim()) continue;
      if (keep === null) {
        // First non-blank line is the header.
        keep = makeRowFilter(colIndex(line));
        yield line + "\n";
      } else if (keep(line)) {
        yield line + "\n";
      }
    }
  }

  try {
    await pipeline(Readable.from(keptLines()), createWriteStream(tmpPath));
  } catch (err) {
    rmSync(tmpPath, { force: true });
    throw err;
  }
  renameSync(tmpPath, destPath);
}

/**
 * Stream-filter a large CSV file on a single column.
 *
 * @param srcPath      Source file; nothing happens when it does not exist.
 * @param destPath     Destination file (written via a temporary file).
 * @param keepRow      Receives the resolved target column index and the raw line.
 * @param getTargetCol Resolves the target column index from the header index.
 */
async function filterLargeCsv(
  srcPath: string,
  destPath: string,
  keepRow: (targetCol: number, line: string) => boolean,
  getTargetCol: (idx: Map<string, number>) => number,
): Promise<void> {
  if (!existsSync(srcPath)) return;
  await streamFilterCsv(srcPath, destPath, (idx) => {
    const targetCol = getTargetCol(idx);
    return (line) => keepRow(targetCol, line);
  });
}

/**
 * Filter raw GTFS feed to a single city bbox.
 * Reads from rawDir, writes to destDir.
 *
 * Retained data: trips with at least two stops inside the bbox, their
 * stop_times restricted to bbox stops, the stops those stop_times reference,
 * and the routes / services / agencies / shapes those trips reference.
 *
 * Also writes .stops_bbox for build-valhalla: the extent of all stops located
 * inside the city bbox (computed before trip filtering).
 *
 * Returns early without writing anything when stops.txt or stop_times.txt is
 * missing or no stop lies inside the bbox.
 */
async function filterGtfsForCity(
  rawDir: string,
  destDir: string,
  bbox: Bbox,
): Promise<void> {
  console.log(`[download-gtfs] Filtering GTFS to bbox [${bbox.map((v) => v.toFixed(3)).join(",")}] → ${destDir}`);

  const stopsPath = path.join(rawDir, "stops.txt");
  if (!existsSync(stopsPath)) {
    console.log("[download-gtfs] No stops.txt in raw dir — skipping filter");
    return;
  }

  // ── Step 1: collect bbox stop IDs ──────────────────────────────────────────
  const stopsTable = loadCsv(stopsPath);
  const bboxStopIds = new Set<string>();
  let seedMinLng = Infinity;
  let seedMinLat = Infinity;
  let seedMaxLng = -Infinity;
  let seedMaxLat = -Infinity;
  if (stopsTable !== null) {
    const stopIdCol = stopsTable.idx.get("stop_id") ?? -1;
    const latCol = stopsTable.idx.get("stop_lat") ?? -1;
    const lonCol = stopsTable.idx.get("stop_lon") ?? -1;
    for (const row of stopsTable.rows) {
      const fields = splitCsv(row);
      const lat = parseFloat(fields[latCol] ?? "NaN");
      const lon = parseFloat(fields[lonCol] ?? "NaN");
      if (!inBbox(lat, lon, bbox)) continue;
      bboxStopIds.add(fields[stopIdCol] ?? "");
      if (isFinite(lat) && isFinite(lon)) {
        seedMinLat = Math.min(seedMinLat, lat);
        seedMaxLat = Math.max(seedMaxLat, lat);
        seedMinLng = Math.min(seedMinLng, lon);
        seedMaxLng = Math.max(seedMaxLng, lon);
      }
    }
  }
  console.log(`[download-gtfs] ${bboxStopIds.size} stops in bbox`);

  if (stopsTable === null || bboxStopIds.size === 0) {
    console.warn("[download-gtfs] No stops found in bbox — GTFS filter skipped");
    return;
  }

  // ── Step 2a: collect trip_ids with ≥2 bbox stops ───────────────────────────
  const stopTimesRaw = path.join(rawDir, "stop_times.txt");
  if (!existsSync(stopTimesRaw)) {
    console.log("[download-gtfs] No stop_times.txt — skipping");
    return;
  }

  const tripBboxStopCount = new Map<string, number>();
  {
    let cols: { trip: number; stop: number } | null = null;
    const rl = createInterface({ input: createReadStream(stopTimesRaw), crlfDelay: Infinity });
    for await (const line of rl) {
      if (!line.trim()) continue;
      if (cols === null) {
        const idx = colIndex(line);
        cols = { trip: idx.get("trip_id") ?? -1, stop: idx.get("stop_id") ?? -1 };
        continue;
      }
      // Quote-aware split so IDs compare equal to those parsed from stops.txt / trips.txt.
      const fields = splitCsv(line);
      if (!bboxStopIds.has(fields[cols.stop] ?? "")) continue;
      const tripId = fields[cols.trip] ?? "";
      tripBboxStopCount.set(tripId, (tripBboxStopCount.get(tripId) ?? 0) + 1);
    }
  }
  const validTripIds = new Set<string>();
  for (const [tripId, count] of tripBboxStopCount) {
    if (count >= 2) validTripIds.add(tripId);
  }
  console.log(`[download-gtfs] ${validTripIds.size} trips with ≥2 bbox stops`);

  // ── Step 2b: write filtered stop_times (bbox stops on valid trips only) ─────
  // Single streaming pass; also records which stops are actually referenced.
  const allTripStopIds = new Set<string>();
  await streamFilterCsv(stopTimesRaw, path.join(destDir, "stop_times.txt"), (idx) => {
    const tripIdCol = idx.get("trip_id") ?? -1;
    const stopIdCol = idx.get("stop_id") ?? -1;
    return (line) => {
      const fields = splitCsv(line);
      const stopId = fields[stopIdCol] ?? "";
      if (!validTripIds.has(fields[tripIdCol] ?? "") || !bboxStopIds.has(stopId)) return false;
      allTripStopIds.add(stopId);
      return true;
    };
  });

  // ── Step 3: filter stops.txt (only stops that appear in final stop_times) ───
  writeCsv(
    path.join(destDir, "stops.txt"),
    stopsTable.header,
    filterRowsById(stopsTable, "stop_id", allTripStopIds),
  );

  if (isFinite(seedMinLat)) {
    const stopsBbox: Bbox = [seedMinLng, seedMinLat, seedMaxLng, seedMaxLat];
    writeFileSync(path.join(destDir, ".stops_bbox"), JSON.stringify(stopsBbox));
    console.log(`[download-gtfs] Transit stops bbox: [${stopsBbox.map((v) => v.toFixed(3)).join(", ")}]`);
  }

  // ── Step 4: filter trips.txt → collect route/service/shape IDs ─────────────
  const validRouteIds = new Set<string>();
  const validServiceIds = new Set<string>();
  const validShapeIds = new Set<string>();
  writeFilteredById(
    path.join(rawDir, "trips.txt"),
    path.join(destDir, "trips.txt"),
    "trip_id",
    validTripIds,
    (idx, fields) => {
      validRouteIds.add(fields[idx.get("route_id") ?? -1] ?? "");
      validServiceIds.add(fields[idx.get("service_id") ?? -1] ?? "");
      const shapeId = fields[idx.get("shape_id") ?? -1] ?? "";
      if (shapeId) validShapeIds.add(shapeId);
    },
  );

  // ── Step 5: filter routes, agency, calendar, calendar_dates ────────────────
  // Agency IDs are collected from the retained routes so agency.txt can be filtered.
  const validAgencyIds = new Set<string>();
  writeFilteredById(
    path.join(rawDir, "routes.txt"),
    path.join(destDir, "routes.txt"),
    "route_id",
    validRouteIds,
    (idx, fields) => {
      const agencyId = fields[idx.get("agency_id") ?? -1] ?? "";
      if (agencyId) validAgencyIds.add(agencyId);
    },
  );

  {
    const src = path.join(rawDir, "agency.txt");
    const dest = path.join(destDir, "agency.txt");
    if (validAgencyIds.size === 0) {
      // agency_id is optional in single-agency feeds: with no IDs to match on,
      // filtering would drop every agency, so keep the file as is.
      if (existsSync(src)) copyFileSync(src, dest);
    } else {
      writeFilteredById(src, dest, "agency_id", validAgencyIds);
    }
  }

  for (const file of ["calendar.txt", "calendar_dates.txt"]) {
    writeFilteredById(path.join(rawDir, file), path.join(destDir, file), "service_id", validServiceIds);
  }

  // ── Step 5b: copy feed_info.txt verbatim (not filterable, Valhalla may need it) ──
  {
    const src = path.join(rawDir, "feed_info.txt");
    if (existsSync(src)) copyFileSync(src, path.join(destDir, "feed_info.txt"));
  }

  // ── Step 6: shapes.txt (large — stream-filter) ─────────────────────────────
  if (validShapeIds.size > 0) {
    await filterLargeCsv(
      path.join(rawDir, "shapes.txt"),
      path.join(destDir, "shapes.txt"),
      (col, line) => validShapeIds.has(splitCsv(line)[col] ?? ""),
      (idx) => idx.get("shape_id") ?? -1,
    );
  }

  console.log(
    `[download-gtfs] Filter complete: ${allTripStopIds.size} stops, ` +
      `${validTripIds.size} trips, ${validRouteIds.size} routes`,
  );
}

// ─── Job handler ──────────────────────────────────────────────────────────────

/** True when `dir` exists and contains at least one `.txt` file. */
function hasTxtFiles(dir: string): boolean {
  return existsSync(dir) && readdirSync(dir).some((f) => f.endsWith(".txt"));
}

/**
 * Download `url` to `zipFile`, reporting progress (5–55 %) on the job.
 *
 * @throws Error when the HTTP response is not OK or has no body; also rejects
 *         when the request times out (10 minutes) or the write fails.
 */
async function downloadZip(job: Job<DownloadGtfsData>, url: string, zipFile: string): Promise<void> {
  const response = await fetch(url, { signal: AbortSignal.timeout(600_000) });
  if (!response.ok || !response.body) {
    throw new Error(`Failed to download GTFS: HTTP ${response.status} ${response.statusText}`);
  }

  const totalBytes = Number(response.headers.get("content-length") ?? 0);
  let downloadedBytes = 0;
  let lastReportedPct = 5;

  const nodeReadable = Readable.fromWeb(response.body as Parameters<typeof Readable.fromWeb>[0]);
  nodeReadable.on("data", (chunk: Buffer) => {
    downloadedBytes += chunk.length;
    if (totalBytes <= 0) return;
    const pct = Math.min(55, 5 + Math.round((downloadedBytes / totalBytes) * 50));
    // Report only every 5 percentage points.
    if (pct <= lastReportedPct + 4) return;
    lastReportedPct = pct;
    // Fire-and-forget: a failed progress update must not abort the download
    // or surface as an unhandled rejection.
    void Promise.resolve(
      job.updateProgress({
        stage: "Downloading GTFS",
        pct,
        message: `Downloading… ${(downloadedBytes / 1024 / 1024).toFixed(1)} / ${(totalBytes / 1024 / 1024).toFixed(1)} MB`,
        bytesDownloaded: downloadedBytes,
        totalBytes,
      } satisfies JobProgress),
    ).catch((err: unknown) => {
      console.warn("[download-gtfs] Progress update failed:", err);
    });
  });

  await pipeline(nodeReadable, createWriteStream(zipFile));
  console.log(`[download-gtfs] Downloaded ${(downloadedBytes / 1024 / 1024).toFixed(1)} MB`);
}

/**
 * Extract every file entry of `zipFile` into a freshly created `targetDir`.
 * Entries are flattened to their base name, so nothing is written outside
 * `targetDir` regardless of the paths stored in the archive.
 */
async function extractZip(zipFile: string, targetDir: string): Promise<void> {
  if (existsSync(targetDir)) rmSync(targetDir, { recursive: true, force: true });
  mkdirSync(targetDir, { recursive: true });

  const zip = unzipper.Parse({ forceStream: true });
  createReadStream(zipFile).pipe(zip);
  for await (const entry of zip) {
    const e = entry as unzipper.Entry;
    if (e.type === "Directory") {
      e.autodrain();
      continue;
    }
    const destPath = path.join(targetDir, path.basename(e.path));
    await mkdir(path.dirname(destPath), { recursive: true });
    await pipeline(e as unknown as NodeJS.ReadableStream, createWriteStream(destPath));
  }
}

/**
 * BullMQ handler: make sure the raw feed for `job.data.url` is cached, then
 * write the bbox-filtered per-city feed and its marker.
 *
 * Skips all work when the per-city feed already matches source, bbox and
 * FILTER_VERSION (unless `force` is set). Re-downloads when `force` is set,
 * the raw cache is missing, or its marker names a different source.
 */
export async function handleDownloadGtfs(job: Job<DownloadGtfsData>): Promise<void> {
  const { url, citySlug, bbox, force = false } = job.data;
  const effectiveSource = url;

  const destDir = cityFeedDir(citySlug);
  const cityMarker = readCityMarker(citySlug);

  // ── Check if per-city feed is already up to date ───────────────────────────
  const cityDataExists = hasTxtFiles(destDir);
  if (
    !force &&
    cityDataExists &&
    cityMarker !== null &&
    cityMarker.source === effectiveSource &&
    cityMarker.filterVersion === FILTER_VERSION &&
    bboxEqual(cityMarker.bbox, bbox)
  ) {
    console.log(`[download-gtfs] Per-city feed for ${citySlug} is up to date, skipping`);
    await job.updateProgress({ stage: "Downloading GTFS", pct: 100, message: "Feed up to date." } satisfies JobProgress);
    return;
  }

  // ── Ensure raw feed is present ─────────────────────────────────────────────
  // The cache key is date-dependent: resolve it once so every path used by this
  // job refers to the same directory even if the job runs across midnight UTC.
  const cacheKey = urlCacheKey(url);
  const GTFS_RAW_DIR = rawDir(url, cacheKey);
  const GTFS_ZIP_PATH = zipPath(url, cacheKey);
  const rawMarker = readRawMarker(url, cacheKey);
  const rawExists = hasTxtFiles(GTFS_RAW_DIR);

  if (force || !rawExists || rawMarker?.source !== effectiveSource) {
    await job.updateProgress({ stage: "Downloading GTFS", pct: 5, message: `Downloading GTFS feed…` } satisfies JobProgress);

    mkdirSync(GTFS_DATA_DIR, { recursive: true });

    try {
      await downloadZip(job, url, GTFS_ZIP_PATH);

      await job.updateProgress({ stage: "Downloading GTFS", pct: 60, message: "Extracting GTFS feed…" } satisfies JobProgress);

      await extractZip(GTFS_ZIP_PATH, GTFS_RAW_DIR);
    } finally {
      // The zip is only a transport artefact; never leave it behind, even on failure.
      rmSync(GTFS_ZIP_PATH, { force: true });
    }

    const extractedFiles = readdirSync(GTFS_RAW_DIR);
    console.log(`[download-gtfs] Extracted ${extractedFiles.length} files to ${GTFS_RAW_DIR}`);

    // The marker is written last so an interrupted extraction is not mistaken for a valid cache.
    const marker: RawMarker = { source: effectiveSource };
    writeFileSync(rawMarkerPath(url, cacheKey), JSON.stringify(marker));
    pruneStaleRawDirs(url, cacheKey);
  } else {
    console.log(`[download-gtfs] Raw feed already present (source=${effectiveSource})`);
    await job.updateProgress({ stage: "Downloading GTFS", pct: 60, message: "Using cached raw feed." } satisfies JobProgress);
  }

  // ── Filter raw feed for this city ──────────────────────────────────────────
  await job.updateProgress({ stage: "Downloading GTFS", pct: 65, message: `Filtering GTFS for ${citySlug}…` } satisfies JobProgress);

  if (existsSync(destDir)) rmSync(destDir, { recursive: true, force: true });
  mkdirSync(destDir, { recursive: true });

  await filterGtfsForCity(GTFS_RAW_DIR, destDir, bbox);
  writeCityMarker(citySlug, effectiveSource, bbox);

  await job.updateProgress({ stage: "Downloading GTFS", pct: 100, message: `GTFS ready for ${citySlug}.` } satisfies JobProgress);
}