From 538fd636e25595d88a958344d285c0e7cf44e530 Mon Sep 17 00:00:00 2001 From: yyamashita Date: Wed, 6 May 2026 22:24:38 +0900 Subject: Async scraping, scrape_logs, and CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background scraping: - POST /api/scrape returns 202 immediately with run_id; scraping runs async - GET /api/scrape-status?run_id=xxx polls for results per venue - scrape_logs table: per-venue status (running/ok/error), events_saved, error, timestamps CLI (npm run scrape): - npm run scrape — 全会場をスクレイプ、結果を色付きで出力 - npm run scrape liquid-room — 特定会場のみ - npm run scrape -- --list — 登録済み会場一覧を表示 - エラー時は exit code 1 + エラーメッセージを dim 表示 Venues page: - 最終スクレイプ日時・成否をインラインで表示 - 会場ごとの「更新」ボタンを追加 Bug fix: upsertEvent に description/optional fields のデフォルト値を設定し better-sqlite3 の "Missing named parameter" エラーを解消 Co-Authored-By: Claude Sonnet 4.6 --- app/lib/scraper-runner.server.ts | 81 ++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 37 deletions(-) (limited to 'app/lib/scraper-runner.server.ts') diff --git a/app/lib/scraper-runner.server.ts b/app/lib/scraper-runner.server.ts index 191dd00..87dd16c 100644 --- a/app/lib/scraper-runner.server.ts +++ b/app/lib/scraper-runner.server.ts @@ -1,9 +1,16 @@ -import { upsertVenue, upsertEvent } from "./db.server"; +import { randomUUID } from "crypto"; +import { + upsertVenue, + upsertEvent, + insertScrapeLog, + updateScrapeLog, + type ScrapeLog, +} from "./db.server"; import { generateVenueMarkdown, generateAllVenueMarkdown } from "./markdown-writer.server"; import { ALL_SCRAPERS } from "~/scrapers/index"; import type { EventInput } from "./db.server"; -const SCRAPE_WINDOW_DAYS = 35; // ~1 month +const SCRAPE_WINDOW_DAYS = 35; function scrapeWindow(): { from: string; to: string } { const from = new Date(); @@ -21,81 +28,81 @@ function withinWindow(event: EventInput, from: string, to: string): boolean { } export interface ScrapeResult { + run_id: string; venue_id: 
string; venue_name: string; + status: "ok" | "error"; events_saved: number; - markdown_path?: string; error?: string; } -export async function runAllScrapers(): Promise<ScrapeResult[]> { +/** Fire-and-forget: start all scrapers in the background, return run_id immediately. */ +export function startAllScrapersAsync(): string { + const run_id = randomUUID(); + // Don't await — runs in background + void runAllScrapers(run_id); + return run_id; +} + +export function startScraperAsync(venueId: string): string { + const run_id = randomUUID(); + void runScraper(venueId, run_id); + return run_id; +} + +/** Runs all scrapers, writes logs to DB. Can be awaited (e.g. from CLI). */ +export async function runAllScrapers(run_id = randomUUID()): Promise<ScrapeResult[]> { const results: ScrapeResult[] = []; const successIds: string[] = []; for (const scraper of ALL_SCRAPERS) { const { venue } = scraper; upsertVenue(venue.id, venue.name, venue.url, venue.area); + const logId = insertScrapeLog(run_id, venue.id, venue.name); try { const { from, to } = scrapeWindow(); - const events = (await scraper.scrape()).filter((e) => - withinWindow(e, from, to) - ); + const events = (await scraper.scrape()).filter((e) => withinWindow(e, from, to)); for (const event of events) { upsertEvent(event); } + updateScrapeLog(logId, "ok", events.length); successIds.push(venue.id); - results.push({ - venue_id: venue.id, - venue_name: venue.name, - events_saved: events.length, - }); + results.push({ run_id, venue_id: venue.id, venue_name: venue.name, status: "ok", events_saved: events.length }); } catch (err) { - results.push({ - venue_id: venue.id, - venue_name: venue.name, - events_saved: 0, - error: err instanceof Error ? err.message : String(err), - }); + const error = err instanceof Error ? 
err.message : String(err); + updateScrapeLog(logId, "error", 0, error); + results.push({ run_id, venue_id: venue.id, venue_name: venue.name, status: "error", events_saved: 0, error }); } } - // Generate Markdown files for all venues that scraped successfully generateAllVenueMarkdown(successIds); - return results; } -export async function runScraper(venueId: string): Promise<ScrapeResult> { +/** Runs a single scraper by venue ID. */ +export async function runScraper(venueId: string, run_id = randomUUID()): Promise<ScrapeResult> { const scraper = ALL_SCRAPERS.find((s) => s.venue.id === venueId); if (!scraper) { - return { venue_id: venueId, venue_name: venueId, events_saved: 0, error: "Scraper not found" }; + return { run_id, venue_id: venueId, venue_name: venueId, status: "error", events_saved: 0, error: "Scraper not found" }; } const { venue } = scraper; upsertVenue(venue.id, venue.name, venue.url, venue.area); + const logId = insertScrapeLog(run_id, venue.id, venue.name); try { const { from, to } = scrapeWindow(); - const events = (await scraper.scrape()).filter((e) => - withinWindow(e, from, to) - ); + const events = (await scraper.scrape()).filter((e) => withinWindow(e, from, to)); for (const event of events) { upsertEvent(event); } + updateScrapeLog(logId, "ok", events.length); generateVenueMarkdown(venue.id); - return { - venue_id: venue.id, - venue_name: venue.name, - events_saved: events.length, - markdown_path: `events/${venue.id}.md`, - }; + return { run_id, venue_id: venue.id, venue_name: venue.name, status: "ok", events_saved: events.length }; } catch (err) { - return { - venue_id: venue.id, - venue_name: venue.name, - events_saved: 0, - error: err instanceof Error ? err.message : String(err), - }; + const error = err instanceof Error ? err.message : String(err); + updateScrapeLog(logId, "error", 0, error); + return { run_id, venue_id: venue.id, venue_name: venue.name, status: "error", events_saved: 0, error }; } } -- cgit v1.2.3