// fifteen/worker/src/jobs/download-gtfs.ts

/**
* Download and filter a GTFS feed per city so Valhalla can build transit tiles.
*
* The raw (unfiltered) feed is downloaded once and cached in GTFS_DATA_DIR/raw/.
* Subsequent calls for other cities re-use the raw cache without re-downloading.
*
* A per-city filtered feed is written to GTFS_DATA_DIR/{citySlug}/feed/ and
* clipped to the city's bounding box. This directory is the transit_feeds_dir
* for that city's valhalla_ingest_transit + valhalla_build_tiles run.
*
* Source: https://download.gtfs.de/germany/nv_free/latest.zip
*/
import type { Job } from "bullmq";
import { createHash } from "crypto";
import { once } from "events";
import {
  copyFileSync,
  createReadStream,
  createWriteStream,
  existsSync,
  mkdirSync,
  readdirSync,
  readFileSync,
  renameSync,
  rmSync,
  writeFileSync,
} from "fs";
import { mkdir } from "fs/promises";
import * as path from "path";
import { createInterface } from "readline";
import { Readable } from "stream";
import { pipeline } from "stream/promises";
import unzipper from "unzipper";
import type { JobProgress } from "@transportationer/shared";
/** Payload for the download-gtfs job (see handleDownloadGtfs). */
export type DownloadGtfsData = {
  /** Job discriminator. */
  type: "download-gtfs";
  /** Source URL of the GTFS zip to download (e.g. the nationwide feed). */
  url: string;
  /** City identifier; filtered feed is written to GTFS_DATA_DIR/{citySlug}/feed. */
  citySlug: string;
  /** City bbox [minLng, minLat, maxLng, maxLat] already including buffer. */
  bbox: [number, number, number, number];
  /** Re-download even if raw data already exists (default: false) */
  force?: boolean;
};
/** Root directory for all GTFS artifacts (raw caches + per-city feeds). */
const GTFS_DATA_DIR = process.env.GTFS_DATA_DIR ?? "/data/valhalla/gtfs";

/**
 * Short (8 hex chars) cache key for a feed URL, salted with today's date
 * (YYYY-MM-DD). The date salt makes every raw cache expire daily: the next
 * day's key differs, so the feed is re-downloaded and the old dir pruned.
 */
function urlCacheKey(url: string): string {
  const datePart = new Date().toISOString().slice(0, 10);
  const digest = createHash("sha256").update(`${url}:${datePart}`).digest("hex");
  return digest.slice(0, 8);
}
/**
 * Remove prior-day raw-cache dirs for the same source URL.
 * Every raw dir carries a `.source` marker naming its feed URL; any sibling
 * of `currentKey` whose marker matches `url` is stale and deleted.
 * Unreadable/invalid markers (and rm failures) are ignored — best effort.
 */
function pruneStaleRawDirs(url: string, currentKey: string): void {
  const rawBase = `${GTFS_DATA_DIR}/raw`;
  if (!existsSync(rawBase)) return;
  const candidates = readdirSync(rawBase).filter((entry) => entry !== currentKey);
  for (const entry of candidates) {
    const markerFile = path.join(rawBase, entry, ".source");
    if (!existsSync(markerFile)) continue;
    try {
      const marker = JSON.parse(readFileSync(markerFile, "utf8")) as RawMarker;
      if (marker.source !== url) continue;
      rmSync(path.join(rawBase, entry), { recursive: true, force: true });
      console.log(`[download-gtfs] Pruned stale raw cache: ${entry}`);
    } catch {
      /* best-effort cleanup — ignore parse/rm errors */
    }
  }
}
/** Per-URL raw (unfiltered) feed directory — one per distinct feed source. */
function rawDir(url: string): string {
  return `${GTFS_DATA_DIR}/raw/${urlCacheKey(url)}`;
}

/** Temporary on-disk location for the downloaded (still zipped) feed. */
function zipPath(url: string): string {
  return `${GTFS_DATA_DIR}/${urlCacheKey(url)}.zip`;
}

/** Marker file recording which source URL a raw dir was extracted from. */
function rawMarkerPath(url: string): string {
  return `${rawDir(url)}/.source`;
}
/**
* Bump this when the filtering algorithm changes in a way that produces
* different output from the same source + bbox. Forces a re-filter on the
* existing raw data without re-downloading.
*/
const FILTER_VERSION = 3;
interface RawMarker { source: string }
interface CityMarker { source: string; bbox: [number, number, number, number]; filterVersion: number }
function readRawMarker(url: string): RawMarker | null {
const p = rawMarkerPath(url);
if (!existsSync(p)) return null;
try { return JSON.parse(readFileSync(p, "utf8")) as RawMarker; } catch { return null; }
}
/** Filtered, bbox-clipped feed dir for one city (Valhalla's transit_feeds_dir). */
function cityFeedDir(citySlug: string): string {
  return `${GTFS_DATA_DIR}/${citySlug}/feed`;
}

/** Marker file recording source URL + bbox + filter version of a city feed. */
function cityMarkerPath(citySlug: string): string {
  return `${cityFeedDir(citySlug)}/.source`;
}

/** Parse the city-feed marker; null when absent or unreadable. */
function readCityMarker(citySlug: string): CityMarker | null {
  const markerPath = cityMarkerPath(citySlug);
  if (!existsSync(markerPath)) return null;
  try {
    return JSON.parse(readFileSync(markerPath, "utf8")) as CityMarker;
  } catch {
    return null;
  }
}

/** Record what the city feed was built from, so future runs can skip work. */
function writeCityMarker(citySlug: string, source: string, bbox: [number, number, number, number]): void {
  const marker: CityMarker = { source, bbox, filterVersion: FILTER_VERSION };
  writeFileSync(cityMarkerPath(citySlug), JSON.stringify(marker));
}
/** Exact component-wise equality of two [minLng, minLat, maxLng, maxLat] boxes. */
function bboxEqual(a: [number, number, number, number], b: [number, number, number, number]): boolean {
  for (let i = 0; i < 4; i++) {
    if (a[i] !== b[i]) return false;
  }
  return true;
}
// ─── GTFS bbox filter ─────────────────────────────────────────────────────────
/**
 * Split one CSV line into fields, honoring RFC-4180-style double quotes:
 * commas inside quotes are literal, and `""` inside a quoted field is an
 * escaped quote. Lines without any quote char take a plain-split fast path.
 */
function splitCsv(line: string): string[] {
  if (!line.includes('"')) return line.split(",");
  const fields: string[] = [];
  let buf = "";
  let quoted = false;
  let i = 0;
  while (i < line.length) {
    const ch = line[i];
    if (ch === '"') {
      if (quoted && line[i + 1] === '"') {
        buf += '"'; // escaped quote inside a quoted field
        i += 2;
        continue;
      }
      quoted = !quoted;
    } else if (ch === "," && !quoted) {
      fields.push(buf);
      buf = "";
    } else {
      buf += ch;
    }
    i++;
  }
  fields.push(buf);
  return fields;
}

/** Map header-cell name → column index, trimming and stripping a leading BOM. */
function colIndex(header: string): Map<string, number> {
  const cells = splitCsv(header);
  return new Map(cells.map((cell, i) => [cell.trim().replace(/^\uFEFF/, ""), i]));
}
/**
 * True when (lat, lon) lies inside the inclusive [minLng, minLat, maxLng, maxLat]
 * box. NaN coordinates always fail every comparison and so return false.
 */
function inBbox(lat: number, lon: number, bbox: [number, number, number, number]): boolean {
  const [minLng, minLat, maxLng, maxLat] = bbox;
  const latOk = lat >= minLat && lat <= maxLat;
  const lonOk = lon >= minLng && lon <= maxLng;
  return latOk && lonOk;
}
/**
 * Filter a small CSV file in place: the header plus every row for which
 * `keepRow` returns true survive. `onKept` is invoked for each retained row
 * (e.g. to collect IDs). The whole file is loaded into memory — use
 * filterLargeCsv for big files. Missing and header-only files are untouched.
 */
function filterSmallCsv(
  filePath: string,
  keepRow: (idx: Map<string, number>, fields: string[]) => boolean,
  onKept?: (idx: Map<string, number>, fields: string[]) => void,
): void {
  if (!existsSync(filePath)) return;
  const rows = readFileSync(filePath, "utf8").split(/\r?\n/).filter((l) => l.trim());
  if (rows.length < 2) return;
  const idx = colIndex(rows[0]);
  const kept = rows.filter((row, i) => {
    if (i === 0) return true; // header always survives
    const fields = splitCsv(row);
    if (!keepRow(idx, fields)) return false;
    onKept?.(idx, fields);
    return true;
  });
  writeFileSync(filePath, kept.join("\n") + "\n");
}
/**
 * Stream-filter a large CSV from srcPath to destPath: the header plus every
 * row for which `keepRow` returns true survive. `getTargetCol` receives the
 * parsed header index and returns the column number that `keepRow` should
 * inspect. Writes to a `.tmp` sibling and renames atomically on success.
 * Missing source files are silently skipped.
 */
async function filterLargeCsv(
  srcPath: string,
  destPath: string,
  keepRow: (targetCol: number, line: string) => boolean,
  getTargetCol: (idx: Map<string, number>) => number,
): Promise<void> {
  if (!existsSync(srcPath)) return;
  const tmpPath = destPath + ".tmp";
  const writer = createWriteStream(tmpPath);
  // Honor backpressure: without waiting for "drain" when write() returns
  // false, a multi-hundred-MB file could be buffered almost entirely in RAM.
  const write = async (chunk: string): Promise<void> => {
    if (!writer.write(chunk)) await once(writer, "drain");
  };
  let isFirst = true;
  let targetCol = -1;
  const rl = createInterface({ input: createReadStream(srcPath), crlfDelay: Infinity });
  for await (const line of rl) {
    if (!line.trim()) continue;
    if (isFirst) {
      isFirst = false;
      targetCol = getTargetCol(colIndex(line));
      await write(line + "\n");
      continue;
    }
    if (keepRow(targetCol, line)) await write(line + "\n");
  }
  // Flush and close before the atomic rename.
  await new Promise<void>((resolve, reject) =>
    writer.end((err?: unknown) => (err ? reject(err) : resolve())),
  );
  renameSync(tmpPath, destPath);
}
/**
 * Filter the raw GTFS feed down to a single city bbox.
 *
 * Reads from rawDir, writes a self-consistent feed to destDir:
 *   1. stops inside the bbox are identified;
 *   2. trips serving ≥2 bbox stops are kept (a useful trip needs at least a
 *      boarding and an alighting stop inside the city);
 *   3. stop_times / stops / trips / routes / agency / calendar(_dates) /
 *      shapes are filtered to the surviving trip set; feed_info is copied.
 *
 * Also writes .stops_bbox (tight bbox of retained stops) for build-valhalla.
 * Bails out early (leaving destDir possibly incomplete) when the raw feed
 * lacks stops.txt / stop_times.txt or when no stop falls inside the bbox.
 */
async function filterGtfsForCity(
  rawDir: string,
  destDir: string,
  bbox: [number, number, number, number],
): Promise<void> {
  console.log(`[download-gtfs] Filtering GTFS to bbox [${bbox.map((v) => v.toFixed(3)).join(",")}] → ${destDir}`);
  const stopsPath = path.join(rawDir, "stops.txt");
  if (!existsSync(stopsPath)) {
    console.log("[download-gtfs] No stops.txt in raw dir — skipping filter");
    return;
  }

  // ── Step 1: collect bbox stop IDs + tight bbox of those stops ─────────────
  const bboxStopIds = new Set<string>();
  let seedMinLng = Infinity, seedMinLat = Infinity, seedMaxLng = -Infinity, seedMaxLat = -Infinity;
  {
    const lines = readFileSync(stopsPath, "utf8").split(/\r?\n/).filter((l) => l.trim());
    if (lines.length >= 2) {
      const idx = colIndex(lines[0]);
      const stopIdCol = idx.get("stop_id") ?? -1;
      const latCol = idx.get("stop_lat") ?? -1;
      const lonCol = idx.get("stop_lon") ?? -1;
      for (let i = 1; i < lines.length; i++) {
        const fields = splitCsv(lines[i]);
        const lat = parseFloat(fields[latCol] ?? "NaN");
        const lon = parseFloat(fields[lonCol] ?? "NaN");
        if (inBbox(lat, lon, bbox)) {
          bboxStopIds.add(fields[stopIdCol] ?? "");
          // inBbox already rejects NaN, so lat/lon are finite here; the guard
          // stays as cheap insurance against future inBbox changes.
          if (isFinite(lat) && isFinite(lon)) {
            seedMinLat = Math.min(seedMinLat, lat); seedMaxLat = Math.max(seedMaxLat, lat);
            seedMinLng = Math.min(seedMinLng, lon); seedMaxLng = Math.max(seedMaxLng, lon);
          }
        }
      }
    }
  }
  console.log(`[download-gtfs] ${bboxStopIds.size} stops in bbox`);
  if (bboxStopIds.size === 0) {
    console.warn("[download-gtfs] No stops found in bbox — GTFS filter skipped");
    return;
  }

  // ── Step 2a: collect trip_ids with ≥2 bbox stops ──────────────────────────
  const stopTimesRaw = path.join(rawDir, "stop_times.txt");
  if (!existsSync(stopTimesRaw)) {
    console.log("[download-gtfs] No stop_times.txt — skipping");
    return;
  }
  const tripBboxStopCount = new Map<string, number>();
  {
    let stopIdCol = -1, tripIdCol = -1, isFirst = true;
    const rl = createInterface({ input: createReadStream(stopTimesRaw), crlfDelay: Infinity });
    for await (const line of rl) {
      if (!line.trim()) continue;
      if (isFirst) {
        isFirst = false;
        const idx = colIndex(line);
        stopIdCol = idx.get("stop_id") ?? -1;
        tripIdCol = idx.get("trip_id") ?? -1;
        continue;
      }
      // Plain split(",") — assumes stop_times.txt has no quoted fields
      // (splitCsv would be far slower on this, the largest GTFS file).
      const fields = line.split(",");
      const stopId = fields[stopIdCol] ?? "";
      if (bboxStopIds.has(stopId)) {
        const tripId = fields[tripIdCol] ?? "";
        tripBboxStopCount.set(tripId, (tripBboxStopCount.get(tripId) ?? 0) + 1);
      }
    }
  }
  const validTripIds = new Set<string>();
  for (const [tripId, count] of tripBboxStopCount) {
    if (count >= 2) validTripIds.add(tripId);
  }
  console.log(`[download-gtfs] ${validTripIds.size} trips with ≥2 bbox stops`);

  // ── Step 2b: write filtered stop_times in ONE streaming pass ──────────────
  // Keep rows whose trip survived AND whose stop is inside the bbox, and
  // collect the stop IDs that actually remain (drives the stops.txt filter).
  const allTripStopIds = new Set<string>();
  {
    const destPath = path.join(destDir, "stop_times.txt");
    const tmpPath = destPath + ".tmp";
    const writer = createWriteStream(tmpPath);
    // Honor backpressure so the huge stop_times file isn't buffered in RAM.
    const write = async (chunk: string): Promise<void> => {
      if (!writer.write(chunk)) await once(writer, "drain");
    };
    let isFirst = true;
    let tripIdCol = -1, stopIdCol = -1;
    const rl = createInterface({ input: createReadStream(stopTimesRaw), crlfDelay: Infinity });
    for await (const line of rl) {
      if (!line.trim()) continue;
      if (isFirst) {
        isFirst = false;
        const idx = colIndex(line);
        tripIdCol = idx.get("trip_id") ?? -1;
        stopIdCol = idx.get("stop_id") ?? -1;
        await write(line + "\n");
        continue;
      }
      const fields = line.split(",");
      const tripId = fields[tripIdCol] ?? "";
      const stopId = fields[stopIdCol] ?? "";
      if (validTripIds.has(tripId) && bboxStopIds.has(stopId)) {
        allTripStopIds.add(stopId);
        await write(line + "\n");
      }
    }
    await new Promise<void>((resolve, reject) =>
      writer.end((err?: unknown) => (err ? reject(err) : resolve())),
    );
    renameSync(tmpPath, destPath);
  }

  // ── Step 3: filter stops.txt (only stops that appear in final stop_times) ──
  {
    const dest = path.join(destDir, "stops.txt");
    const lines = readFileSync(stopsPath, "utf8").split(/\r?\n/).filter((l) => l.trim());
    if (lines.length >= 2) {
      const idx = colIndex(lines[0]);
      const stopIdCol = idx.get("stop_id") ?? -1;
      const out = [lines[0]];
      for (let i = 1; i < lines.length; i++) {
        const fields = splitCsv(lines[i]);
        if (allTripStopIds.has(fields[stopIdCol] ?? "")) out.push(lines[i]);
      }
      writeFileSync(dest, out.join("\n") + "\n");
    }
  }

  // Tight bbox of the bbox-matching stops, consumed by the build-valhalla job.
  if (isFinite(seedMinLat)) {
    const stopsBbox: [number, number, number, number] = [seedMinLng, seedMinLat, seedMaxLng, seedMaxLat];
    writeFileSync(path.join(destDir, ".stops_bbox"), JSON.stringify(stopsBbox));
    console.log(`[download-gtfs] Transit stops bbox: [${stopsBbox.map((v) => v.toFixed(3)).join(", ")}]`);
  }

  // ── Step 4: filter trips.txt → collect route/service/shape IDs ────────────
  const validRouteIds = new Set<string>();
  const validServiceIds = new Set<string>();
  const validShapeIds = new Set<string>();
  {
    const src = path.join(rawDir, "trips.txt");
    const dest = path.join(destDir, "trips.txt");
    if (existsSync(src)) {
      const lines = readFileSync(src, "utf8").split(/\r?\n/).filter((l) => l.trim());
      if (lines.length >= 2) {
        const idx = colIndex(lines[0]);
        const tripIdCol = idx.get("trip_id") ?? -1;
        const routeIdCol = idx.get("route_id") ?? -1;
        const serviceIdCol = idx.get("service_id") ?? -1;
        const shapeIdCol = idx.get("shape_id") ?? -1;
        const out = [lines[0]];
        for (let i = 1; i < lines.length; i++) {
          const fields = splitCsv(lines[i]);
          if (validTripIds.has(fields[tripIdCol] ?? "")) {
            out.push(lines[i]);
            validRouteIds.add(fields[routeIdCol] ?? "");
            validServiceIds.add(fields[serviceIdCol] ?? "");
            const shapeId = fields[shapeIdCol] ?? ""; // shape_id is optional in GTFS
            if (shapeId) validShapeIds.add(shapeId);
          }
        }
        writeFileSync(dest, out.join("\n") + "\n");
      }
    }
  }

  // ── Step 5: filter routes, calendar, calendar_dates; copy agency/feed_info ──
  // Collect agency IDs from the filtered routes so we can filter agency.txt.
  const validAgencyIds = new Set<string>();
  {
    const src = path.join(rawDir, "routes.txt");
    if (existsSync(src)) {
      const lines = readFileSync(src, "utf8").split(/\r?\n/).filter((l) => l.trim());
      if (lines.length >= 2) {
        const idx = colIndex(lines[0]);
        const routeIdCol = idx.get("route_id") ?? -1;
        const agencyIdCol = idx.get("agency_id") ?? -1;
        for (let i = 1; i < lines.length; i++) {
          const fields = splitCsv(lines[i]);
          if (validRouteIds.has(fields[routeIdCol] ?? "")) {
            const aid = fields[agencyIdCol] ?? "";
            if (aid) validAgencyIds.add(aid);
          }
        }
      }
    }
  }
  // Each of these files is small enough to load whole; keep rows whose ID
  // column matches the corresponding valid-ID set collected above.
  for (const [file, idCol, validIds] of [
    ["agency.txt", "agency_id", validAgencyIds],
    ["routes.txt", "route_id", validRouteIds],
    ["calendar.txt", "service_id", validServiceIds],
    ["calendar_dates.txt", "service_id", validServiceIds],
  ] as const) {
    const src = path.join(rawDir, file);
    const dest = path.join(destDir, file);
    if (!existsSync(src)) continue;
    const lines = readFileSync(src, "utf8").split(/\r?\n/).filter((l) => l.trim());
    if (lines.length < 2) { writeFileSync(dest, lines[0] + "\n"); continue; }
    const idx = colIndex(lines[0]);
    const col = idx.get(idCol) ?? -1;
    const out = [lines[0]];
    for (let i = 1; i < lines.length; i++) {
      const fields = splitCsv(lines[i]);
      if ((validIds as Set<string>).has(fields[col] ?? "")) out.push(lines[i]);
    }
    writeFileSync(dest, out.join("\n") + "\n");
  }

  // ── Step 5b: copy feed_info.txt verbatim (not filterable, Valhalla may need it) ──
  {
    const src = path.join(rawDir, "feed_info.txt");
    if (existsSync(src)) copyFileSync(src, path.join(destDir, "feed_info.txt"));
  }

  // ── Step 6: shapes.txt (large — stream-filter by shape_id) ────────────────
  if (validShapeIds.size > 0) {
    await filterLargeCsv(
      path.join(rawDir, "shapes.txt"),
      path.join(destDir, "shapes.txt"),
      (col, line) => validShapeIds.has(line.split(",")[col] ?? ""),
      (idx) => idx.get("shape_id") ?? -1,
    );
  }

  console.log(
    `[download-gtfs] Filter complete: ${allTripStopIds.size} stops, ` +
    `${validTripIds.size} trips, ${validRouteIds.size} routes`,
  );
}
// ─── Job handler ──────────────────────────────────────────────────────────────
/**
 * Job handler: ensure a bbox-filtered GTFS feed exists for `citySlug`.
 *
 * Fast path: when the per-city feed was already built from the same source
 * URL, same bbox, and same FILTER_VERSION, the job is a no-op.
 * Otherwise the raw feed is downloaded (or reused from the date-salted raw
 * cache), extracted, and filtered to the city's bbox.
 *
 * Progress is reported via job.updateProgress: 5–55 download, 60 extract,
 * 65 filter, 100 done.
 */
export async function handleDownloadGtfs(job: Job<DownloadGtfsData>): Promise<void> {
  const { url, citySlug, bbox, force = false } = job.data;
  const effectiveSource = url;
  const destDir = cityFeedDir(citySlug);
  const cityMarker = readCityMarker(citySlug);
  // ── Check if per-city feed is already up to date ───────────────────────────
  // "Up to date" = at least one .txt present AND marker matches source, bbox
  // and filter version. Any mismatch (or force) rebuilds the city feed.
  const cityDataExists = existsSync(destDir) && readdirSync(destDir).some((f) => f.endsWith(".txt"));
  if (!force && cityDataExists && cityMarker?.source === effectiveSource &&
      cityMarker?.filterVersion === FILTER_VERSION && bboxEqual(cityMarker.bbox, bbox)) {
    console.log(`[download-gtfs] Per-city feed for ${citySlug} is up to date, skipping`);
    await job.updateProgress({ stage: "Downloading GTFS", pct: 100, message: "Feed up to date." } satisfies JobProgress);
    return;
  }
  // ── Ensure raw feed is present ─────────────────────────────────────────────
  // rawDir/zipPath embed a date-salted cache key (urlCacheKey), so "raw
  // exists" is naturally false on the first run of each day → daily refresh.
  const GTFS_RAW_DIR = rawDir(url);
  const GTFS_ZIP_PATH = zipPath(url);
  const rawMarker = readRawMarker(url);
  const rawExists = existsSync(GTFS_RAW_DIR) && readdirSync(GTFS_RAW_DIR).some((f) => f.endsWith(".txt"));
  if (force || !rawExists || rawMarker?.source !== effectiveSource) {
    await job.updateProgress({ stage: "Downloading GTFS", pct: 5, message: `Downloading GTFS feed…` } satisfies JobProgress);
    mkdirSync(GTFS_DATA_DIR, { recursive: true });
    // 600_000 ms = 10-minute overall timeout for the whole download.
    const response = await fetch(url, { signal: AbortSignal.timeout(600_000) });
    if (!response.ok || !response.body) {
      throw new Error(`Failed to download GTFS: HTTP ${response.status} ${response.statusText}`);
    }
    // content-length may be absent (0) — progress reporting is skipped then.
    const totalBytes = Number(response.headers.get("content-length") ?? 0);
    let downloadedBytes = 0;
    let lastReportedPct = 5;
    const nodeReadable = Readable.fromWeb(response.body as Parameters<typeof Readable.fromWeb>[0]);
    // Map download fraction onto the 5–55 pct band; report at most every
    // 5 points. updateProgress here is fire-and-forget (void).
    nodeReadable.on("data", (chunk: Buffer) => {
      downloadedBytes += chunk.length;
      if (totalBytes > 0) {
        const pct = Math.min(55, 5 + Math.round((downloadedBytes / totalBytes) * 50));
        if (pct > lastReportedPct + 4) {
          lastReportedPct = pct;
          void job.updateProgress({
            stage: "Downloading GTFS", pct,
            message: `Downloading… ${(downloadedBytes / 1024 / 1024).toFixed(1)} / ${(totalBytes / 1024 / 1024).toFixed(1)} MB`,
            bytesDownloaded: downloadedBytes, totalBytes,
          } satisfies JobProgress);
        }
      }
    });
    await pipeline(nodeReadable, createWriteStream(GTFS_ZIP_PATH));
    console.log(`[download-gtfs] Downloaded ${(downloadedBytes / 1024 / 1024).toFixed(1)} MB`);
    await job.updateProgress({ stage: "Downloading GTFS", pct: 60, message: "Extracting GTFS feed…" } satisfies JobProgress);
    // Wipe any stale raw dir before extracting into it.
    if (existsSync(GTFS_RAW_DIR)) rmSync(GTFS_RAW_DIR, { recursive: true, force: true });
    mkdirSync(GTFS_RAW_DIR, { recursive: true });
    const zip = unzipper.Parse({ forceStream: true });
    createReadStream(GTFS_ZIP_PATH).pipe(zip);
    for await (const entry of zip) {
      const e = entry as unzipper.Entry;
      // path.basename() flattens nested zip layouts AND blocks zip-slip
      // path traversal — entries can never escape GTFS_RAW_DIR.
      const destPath = path.join(GTFS_RAW_DIR, path.basename(e.path));
      if (e.type === "Directory") { e.autodrain(); continue; }
      await mkdir(path.dirname(destPath), { recursive: true });
      await pipeline(e as unknown as NodeJS.ReadableStream, createWriteStream(destPath));
    }
    rmSync(GTFS_ZIP_PATH, { force: true }); // zip no longer needed once extracted
    const extractedFiles = readdirSync(GTFS_RAW_DIR);
    console.log(`[download-gtfs] Extracted ${extractedFiles.length} files to ${GTFS_RAW_DIR}`);
    // Record the source, then evict prior-day raw caches for this URL.
    writeFileSync(rawMarkerPath(url), JSON.stringify({ source: effectiveSource }));
    pruneStaleRawDirs(url, urlCacheKey(url));
  } else {
    console.log(`[download-gtfs] Raw feed already present (source=${effectiveSource})`);
    await job.updateProgress({ stage: "Downloading GTFS", pct: 60, message: "Using cached raw feed." } satisfies JobProgress);
  }
  // ── Filter raw feed for this city ──────────────────────────────────────────
  // Rebuild the city dir from scratch so no stale files survive a re-filter;
  // the marker is written only after filtering succeeds.
  await job.updateProgress({ stage: "Downloading GTFS", pct: 65, message: `Filtering GTFS for ${citySlug}` } satisfies JobProgress);
  if (existsSync(destDir)) rmSync(destDir, { recursive: true, force: true });
  mkdirSync(destDir, { recursive: true });
  await filterGtfsForCity(GTFS_RAW_DIR, destDir, bbox);
  writeCityMarker(citySlug, effectiveSource, bbox);
  await job.updateProgress({ stage: "Downloading GTFS", pct: 100, message: `GTFS ready for ${citySlug}.` } satisfies JobProgress);
}