fix: actually save multiple POIs for subcategories

2026-03-06 14:18:01 +01:00 · 2026-03-06 14:18:01 +01:00 · 25c67b2536
commit 25c67b2536
parent fcad3d867a
4 changed files with 73 additions and 55 deletions
--- a/infra/schema.sql
+++ b/infra/schema.sql
@ -94,11 +94,14 @@ ALTER TABLE grid_scores ADD COLUMN IF NOT EXISTS profile TEXT NOT NULL DEFAULT '
 CREATE INDEX IF NOT EXISTS idx_grid_scores_lookup
  ON grid_scores (grid_point_id, travel_mode, threshold_min, profile);

-- ─── Nearest POI per subcategory per grid point ───────────────────────────────
-- Populated by compute-scores job. Stores the nearest (by routing time) POI for
-- each subcategory at each grid point, for each travel mode. Threshold-independent.
+-- ─── Nearest POIs per subcategory per grid point ─────────────────────────────
+-- Populated by compute-routing / compute-transit jobs. Stores up to K nearest
+-- POIs per (grid_point, category, subcategory, travel_mode) so that the scoring
+-- phase can apply the complement-product formula across multiple POIs, giving
+-- diminishing returns for POI diversity within a subcategory.

 CREATE TABLE IF NOT EXISTS grid_poi_details (
+  id               BIGSERIAL PRIMARY KEY,
  grid_point_id    BIGINT NOT NULL REFERENCES grid_points(id) ON DELETE CASCADE,
  category         TEXT NOT NULL,
  subcategory      TEXT NOT NULL,
@ -107,13 +110,27 @@ CREATE TABLE IF NOT EXISTS grid_poi_details (
  nearest_poi_name TEXT,
  distance_m       FLOAT,
  travel_time_s    FLOAT,
-  computed_at      TIMESTAMPTZ NOT NULL DEFAULT now(),
-  PRIMARY KEY (grid_point_id, category, subcategory, travel_mode)
+  computed_at      TIMESTAMPTZ NOT NULL DEFAULT now()
 );

 CREATE INDEX IF NOT EXISTS idx_grid_poi_details_lookup
  ON grid_poi_details (grid_point_id, travel_mode);

+-- Migration: replace old composite PK with surrogate id for diversity scoring
+DO $$
+BEGIN
+  IF NOT EXISTS (
+    SELECT 1 FROM information_schema.columns
+    WHERE table_name = 'grid_poi_details' AND column_name = 'id'
+  ) THEN
+    -- Old single-POI-per-subcategory data is incompatible; clear it
+    DELETE FROM grid_scores;
+    DELETE FROM grid_poi_details;
+    ALTER TABLE grid_poi_details DROP CONSTRAINT IF EXISTS grid_poi_details_pkey;
+    ALTER TABLE grid_poi_details ADD COLUMN id BIGSERIAL PRIMARY KEY;
+  END IF;
+END $$;
+
 -- ─── Isochrone cache ──────────────────────────────────────────────────────────

 CREATE TABLE IF NOT EXISTS isochrone_cache (
--- a/worker/src/jobs/compute-routing.ts
+++ b/worker/src/jobs/compute-routing.ts
@ -10,8 +10,9 @@ export type ComputeRoutingData = {
  category: string;
 };

-/** Number of nearest POI candidates per grid point. */
-const K = 6;
+/** Number of nearest POI candidates per grid point (across all subcategories).
+ *  Higher K means more diversity candidates for the complement-product formula. */
+const K = 15;
 /** Grid points per Valhalla matrix call. */
 const BATCH_SIZE = 5;
 /** Concurrent Valhalla calls within this job. */
@ -55,14 +56,15 @@ export async function handleComputeRouting(job: Job<ComputeRoutingData>): Promis
  `);
  if (count === 0) return;

-  // Nearest POI per (gridPointId, subcategory).
-  const result = new Map<string, Map<string, {
+  // All POI candidates per grid point — multiple per subcategory for diversity scoring.
+  const rows: Array<{
+    gpId: string;
+    subcategory: string;
    poiId: string;
    poiName: string | null;
    distM: number;
    timeS: number | null;
-  }>>();
-  for (const gp of gridPoints) result.set(gp.id, new Map());
+  }> = [];

  const batches: Array<{ id: string; lat: number; lng: number }[]> = [];
  for (let i = 0; i < gridPoints.length; i += BATCH_SIZE) {
@ -141,17 +143,16 @@ export async function handleComputeRouting(job: Job<ComputeRoutingData>): Promis
      const knn = gpKnn.get(gp.id);
      if (!knn || knn.length === 0) continue;

-      const subcatMap = result.get(gp.id)!;
      for (const row of knn) {
-        if (!subcatMap.has(row.subcategory)) {
-          const idx = targetIdx.get(row.poi_id);
-          subcatMap.set(row.subcategory, {
-            poiId: row.poi_id,
-            poiName: row.poi_name,
-            distM: row.dist_m,
-            timeS: idx !== undefined ? (matrix[bi]?.[idx] ?? null) : null,
-          });
-        }
+        const idx = targetIdx.get(row.poi_id);
+        rows.push({
+          gpId: gp.id,
+          subcategory: row.subcategory,
+          poiId: row.poi_id,
+          poiName: row.poi_name,
+          distM: row.dist_m,
+          timeS: idx !== undefined ? (matrix[bi]?.[idx] ?? null) : null,
+        });
      }
    }

@ -162,24 +163,25 @@ export async function handleComputeRouting(job: Job<ComputeRoutingData>): Promis
    } satisfies JobProgress);
  });

-  // Bulk-insert nearest POI per subcategory into grid_poi_details.
-  const gpIdArr: string[] = [];
-  const subcatArr: string[] = [];
-  const poiIdArr: (string | null)[] = [];
-  const poiNameArr: (string | null)[] = [];
-  const distArr: (number | null)[] = [];
-  const timeArr: (number | null)[] = [];
+  if (rows.length === 0) return;

-  for (const [gpId, subcatMap] of result) {
-    for (const [subcategory, detail] of subcatMap) {
-      gpIdArr.push(gpId);
-      subcatArr.push(subcategory);
-      poiIdArr.push(detail.poiId);
-      poiNameArr.push(detail.poiName);
-      distArr.push(detail.distM);
-      timeArr.push(detail.timeS);
-    }
-  }
+  // Delete stale rows for this (city, mode, category) before re-inserting.
+  await Promise.resolve(sql`
+    DELETE FROM grid_poi_details gpd
+    USING grid_points gp
+    WHERE gpd.grid_point_id = gp.id
+      AND gp.city_slug = ${citySlug}
+      AND gpd.category = ${category}
+      AND gpd.travel_mode = ${mode}
+  `);
+
+  // Bulk-insert all POI candidates (multiple per subcategory for diversity scoring).
+  const gpIdArr  = rows.map((r) => r.gpId);
+  const subcatArr = rows.map((r) => r.subcategory);
+  const poiIdArr  = rows.map((r) => r.poiId);
+  const poiNameArr = rows.map((r) => r.poiName);
+  const distArr   = rows.map((r) => r.distM);
+  const timeArr   = rows.map((r) => r.timeS);

  for (let i = 0; i < gpIdArr.length; i += INSERT_CHUNK) {
    const end = Math.min(i + INSERT_CHUNK, gpIdArr.length);
@ -193,7 +195,7 @@ export async function handleComputeRouting(job: Job<ComputeRoutingData>): Promis
        ${category},
        subcat,
        ${mode},
-        CASE WHEN poi_id IS NULL THEN NULL ELSE poi_id::bigint END,
+        poi_id::bigint,
        poi_name,
        dist,
        time_s
@ -205,13 +207,6 @@ export async function handleComputeRouting(job: Job<ComputeRoutingData>): Promis
        ${distArr.slice(i, end)}::float8[],
        ${timeArr.slice(i, end)}::float8[]
      ) AS t(gp_id, subcat, poi_id, poi_name, dist, time_s)
-      ON CONFLICT (grid_point_id, category, subcategory, travel_mode)
-      DO UPDATE SET
-        nearest_poi_id   = EXCLUDED.nearest_poi_id,
-        nearest_poi_name = EXCLUDED.nearest_poi_name,
-        distance_m       = EXCLUDED.distance_m,
-        travel_time_s    = EXCLUDED.travel_time_s,
-        computed_at      = now()
    `);
  }
 }
--- a/worker/src/jobs/compute-scores.ts
+++ b/worker/src/jobs/compute-scores.ts
@ -185,14 +185,16 @@ export async function handleComputeScores(
      WHERE gp.city_slug = ${citySlug}
    ),
    fifteen_subcat AS (
-      -- "fifteen" mode: best (lowest) travel time across walking / cycling / transit
+      -- "fifteen" mode: for each unique POI, take the best time across
+      -- walking / cycling / transit so each POI contributes independently
+      -- to the complement-product formula (preserving diversity).
      SELECT
        grid_point_id, category, subcategory,
        'fifteen'::text AS travel_mode,
        MIN(travel_time_s) AS travel_time_s
      FROM base
      WHERE travel_mode IN ('walking', 'cycling', 'transit')
-      GROUP BY grid_point_id, category, subcategory
+      GROUP BY grid_point_id, category, subcategory, nearest_poi_id
    ),
    all_subcat AS (
      SELECT grid_point_id, category, subcategory, travel_mode, travel_time_s FROM base
--- a/worker/src/jobs/compute-transit.ts
+++ b/worker/src/jobs/compute-transit.ts
@ -116,6 +116,16 @@ export async function handleComputeTransit(job: Job<ComputeTransitData>): Promis

  if (gridPoints.length === 0) return;

+  // Delete stale transit rows before recomputing (no ON CONFLICT since multiple
+  // rows per subcategory are allowed with the diversity-scoring schema).
+  await Promise.resolve(sql`
+    DELETE FROM grid_poi_details gpd
+    USING grid_points gp
+    WHERE gpd.grid_point_id = gp.id
+      AND gp.city_slug = ${citySlug}
+      AND gpd.travel_mode = 'transit'
+  `);
+
  await job.updateProgress({
    stage: "Transit routing",
    pct: 1,
@ -245,13 +255,7 @@ export async function handleComputeTransit(job: Job<ComputeTransitData>): Promis
        ${distArr.slice(i, end)}::float8[],
        ${timeArr.slice(i, end)}::float8[]
      ) AS t(gp_id, cat, subcat, poi_id, poi_name, dist, time_s)
-      ON CONFLICT (grid_point_id, category, subcategory, travel_mode)
-      DO UPDATE SET
-        nearest_poi_id   = EXCLUDED.nearest_poi_id,
-        nearest_poi_name = EXCLUDED.nearest_poi_name,
-        distance_m       = EXCLUDED.distance_m,
-        travel_time_s    = EXCLUDED.travel_time_s,
-        computed_at      = now()
+      -- No ON CONFLICT: stale rows were deleted at job start
    `);
  }