summaryrefslogtreecommitdiff
path: root/app/lib/scraper-runner.server.ts
blob: 012ff9543383c3d77e028410adc81a78cba0e623 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import { randomUUID } from "crypto";
import {
  upsertVenue,
  upsertEvent,
  insertScrapeLog,
  updateScrapeLog,
  type ScrapeLog,
} from "./db.server";
import { generateVenueMarkdown, generateAllVenueMarkdown } from "./markdown-writer.server";
import { closeBrowser } from "./playwright.server";
import { ALL_SCRAPERS } from "~/scrapers/index";
import type { EventInput } from "./db.server";

// Number of days ahead of today (inclusive) that events are kept.
const SCRAPE_WINDOW_DAYS = 35;

/** Formats a Date as YYYY-MM-DD using LOCAL calendar components. */
function toLocalDateString(d: Date): string {
  const year = d.getFullYear();
  const month = String(d.getMonth() + 1).padStart(2, "0");
  const day = String(d.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}

/**
 * Returns the inclusive scrape window [today, today + SCRAPE_WINDOW_DAYS]
 * as YYYY-MM-DD strings based on the server's local calendar.
 *
 * Bug fix: the previous version used `toISOString()`, which renders the
 * date in UTC. After `setHours(0,0,0,0)` pins *local* midnight, any
 * timezone ahead of UTC produced yesterday's date, shifting the whole
 * window back one day relative to the local calendar.
 */
function scrapeWindow(): { from: string; to: string } {
  const from = new Date();
  from.setHours(0, 0, 0, 0); // local midnight today
  const to = new Date(from);
  to.setDate(to.getDate() + SCRAPE_WINDOW_DAYS); // setDate handles month/year rollover
  return {
    from: toLocalDateString(from),
    to: toLocalDateString(to),
  };
}

/**
 * True when the event's date (YYYY-MM-DD) lies in the inclusive [from, to]
 * range. Relies on lexicographic ordering of ISO-8601 date strings, which
 * matches chronological ordering.
 */
function withinWindow(event: EventInput, from: string, to: string): boolean {
  const { date } = event;
  return !(date < from || date > to);
}

/** Outcome of one scraper execution for a single venue. */
export interface ScrapeResult {
  /** Correlates all results/log rows produced by the same run invocation. */
  run_id: string;
  venue_id: string;
  /** Falls back to the venue ID when the scraper itself was not found. */
  venue_name: string;
  /** "ok" when the scrape completed; "error" when it threw. */
  status: "ok" | "error";
  /** Number of events upserted into the DB; 0 on error. */
  events_saved: number;
  /** Error message; present only when status is "error". */
  error?: string;
}

/**
 * Fire-and-forget: start all scrapers in the background, return run_id immediately.
 *
 * Bug fix: the promise was discarded with `void`, so a rejection escaping
 * `runAllScrapers` (e.g. from `closeBrowser()`) became an unhandled promise
 * rejection, which terminates modern Node processes. Log it instead.
 */
export function startAllScrapersAsync(): string {
  const run_id = randomUUID();
  // Don't await — runs in background; swallow-and-log so the process survives.
  runAllScrapers(run_id).catch((err: unknown) => {
    console.error(`[scraper-runner] background run ${run_id} failed:`, err);
  });
  return run_id;
}

/**
 * Fire-and-forget variant of runScraper for a single venue; returns run_id
 * immediately.
 *
 * Bug fix: as with startAllScrapersAsync, the discarded promise could reject
 * unhandled (e.g. `closeBrowser()` throwing in runScraper's finally) and
 * crash the process. Attach a logging catch.
 */
export function startScraperAsync(venueId: string): string {
  const run_id = randomUUID();
  runScraper(venueId, run_id).catch((err: unknown) => {
    console.error(`[scraper-runner] background run ${run_id} (${venueId}) failed:`, err);
  });
  return run_id;
}

/**
 * Runs all scrapers sequentially, writing per-venue logs to the DB. Can be
 * awaited (e.g. from CLI). One scraper failing does not stop the rest: its
 * error is recorded in the scrape log and in the returned results.
 *
 * Bug fix: `closeBrowser()` was not in a `finally`, so an exception from
 * `upsertVenue`/`insertScrapeLog` (outside the inner try) or from
 * `generateAllVenueMarkdown` leaked the shared Playwright browser.
 * `runScraper` already closes it via `finally`; this now matches.
 *
 * @param run_id Correlates all log rows/results from this invocation.
 * @returns One ScrapeResult per scraper, in ALL_SCRAPERS order.
 */
export async function runAllScrapers(run_id = randomUUID()): Promise<ScrapeResult[]> {
  const results: ScrapeResult[] = [];
  const successIds: string[] = [];

  try {
    for (const scraper of ALL_SCRAPERS) {
      const { venue } = scraper;
      upsertVenue(venue.id, venue.name, venue.url, venue.area);
      const logId = insertScrapeLog(run_id, venue.id, venue.name);

      try {
        const { from, to } = scrapeWindow();
        const events = (await scraper.scrape()).filter((e) => withinWindow(e, from, to));
        for (const event of events) {
          upsertEvent(event);
        }
        updateScrapeLog(logId, "ok", events.length);
        successIds.push(venue.id);
        results.push({ run_id, venue_id: venue.id, venue_name: venue.name, status: "ok", events_saved: events.length });
      } catch (err) {
        const error = err instanceof Error ? err.message : String(err);
        updateScrapeLog(logId, "error", 0, error);
        results.push({ run_id, venue_id: venue.id, venue_name: venue.name, status: "error", events_saved: 0, error });
      }
    }

    // Regenerate markdown only for venues whose scrape succeeded.
    generateAllVenueMarkdown(successIds);
  } finally {
    // Always release the shared Playwright browser if any scraper opened it.
    await closeBrowser();
  }
  return results;
}

/**
 * Runs a single scraper identified by venue ID, logging the outcome to the
 * DB and regenerating that venue's markdown on success. The shared
 * Playwright browser is always closed afterwards.
 */
export async function runScraper(venueId: string, run_id = randomUUID()): Promise<ScrapeResult> {
  const scraper = ALL_SCRAPERS.find((candidate) => candidate.venue.id === venueId);
  if (scraper === undefined) {
    return {
      run_id,
      venue_id: venueId,
      venue_name: venueId,
      status: "error",
      events_saved: 0,
      error: "Scraper not found",
    };
  }

  const venue = scraper.venue;
  upsertVenue(venue.id, venue.name, venue.url, venue.area);
  const logId = insertScrapeLog(run_id, venue.id, venue.name);

  // Fields shared by both the success and error results.
  const base = { run_id, venue_id: venue.id, venue_name: venue.name };

  try {
    const window = scrapeWindow();
    const scraped = await scraper.scrape();
    const events = scraped.filter((e) => withinWindow(e, window.from, window.to));
    events.forEach((event) => upsertEvent(event));
    updateScrapeLog(logId, "ok", events.length);
    generateVenueMarkdown(venue.id);
    return { ...base, status: "ok", events_saved: events.length };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    updateScrapeLog(logId, "error", 0, message);
    return { ...base, status: "error", events_saved: 0, error: message };
  } finally {
    // Close the shared browser whether the scrape succeeded or failed.
    await closeBrowser();
  }
}