From 538fd636e25595d88a958344d285c0e7cf44e530 Mon Sep 17 00:00:00 2001
From: yyamashita
Date: Wed, 6 May 2026 22:24:38 +0900
Subject: Async scraping, scrape_logs, and CLI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Background scraping:
- POST /api/scrape returns 202 immediately with a run_id; scraping runs async
- GET /api/scrape-status?run_id=xxx polls for per-venue results
- scrape_logs table: per-venue status (running/ok/error), events_saved, error, timestamps

CLI (npm run scrape):
- npm run scrape               scrapes all venues and prints colored per-venue results
- npm run scrape liquid-room   scrapes only the given venue
- npm run scrape -- --list     lists the registered venues
- on error: exit code 1, with the error message printed dimmed

Venues page:
- show each venue's last scrape time and result inline
- add a per-venue 「更新」 (refresh) button

Bug fix: upsertEvent now fills in defaults for description and the other
optional fields, which fixes better-sqlite3's "Missing named parameter" error.
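
For reference, the intended client flow against these two endpoints, as a
minimal TypeScript polling sketch (illustrative only, not shipped with this
commit; the response fields match the routes added below):

// Minimal polling sketch (illustrative only, not part of this commit).
// Field names match the /api/scrape and /api/scrape-status responses below.
async function scrapeAndWait(baseUrl: string) {
  // GET /api/scrape kicks off all venues in the background and answers 202
  // with { run_id, status: "started" }; add ?venue_id=... for a single venue.
  const started = await fetch(`${baseUrl}/api/scrape`);
  const { run_id } = (await started.json()) as { run_id: string };

  // Poll /api/scrape-status until no venue is still "running".
  for (;;) {
    const res = await fetch(`${baseUrl}/api/scrape-status?run_id=${run_id}`);
    const { running, results } = (await res.json()) as {
      running: boolean;
      results: Array<{
        venue_name: string;
        status: "running" | "ok" | "error";
        events_saved: number;
        error: string | null;
      }>;
    };
    if (!running) return results;
    await new Promise((resolve) => setTimeout(resolve, 2000)); // 2s between polls
  }
}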

Co-Authored-By: Claude Sonnet 4.6
---
 app/lib/db.server.ts             | 102 +++++++++++++++++++++++++++++++++-
 app/lib/scraper-runner.server.ts |  81 ++++++++++++++-------------
 app/routes.ts                    |   1 +
 app/routes/api.scrape-status.ts  |  16 ++++++
 app/routes/api.scrape.ts         |  34 ++++++------
 app/routes/venues.tsx            | 115 ++++++++++++++++++++++++++++-----------
 6 files changed, 263 insertions(+), 86 deletions(-)
 create mode 100644 app/routes/api.scrape-status.ts
(limited to 'app')

diff --git a/app/lib/db.server.ts b/app/lib/db.server.ts
index 0c55991..26735c6 100644
--- a/app/lib/db.server.ts
+++ b/app/lib/db.server.ts
@@ -45,6 +45,21 @@ function initSchema(db: Database.Database) {
     CREATE INDEX IF NOT EXISTS idx_events_date ON events(date);
     CREATE INDEX IF NOT EXISTS idx_events_venue_id ON events(venue_id);
+
+    CREATE TABLE IF NOT EXISTS scrape_logs (
+      id INTEGER PRIMARY KEY AUTOINCREMENT,
+      run_id TEXT NOT NULL,
+      venue_id TEXT NOT NULL,
+      venue_name TEXT NOT NULL,
+      status TEXT NOT NULL DEFAULT 'running', -- running | ok | error
+      events_saved INTEGER NOT NULL DEFAULT 0,
+      error TEXT,
+      started_at TEXT NOT NULL DEFAULT (datetime('now')),
+      finished_at TEXT
+    );
+
+    CREATE INDEX IF NOT EXISTS idx_scrape_logs_run_id ON scrape_logs(run_id);
+    CREATE INDEX IF NOT EXISTS idx_scrape_logs_venue_id ON scrape_logs(venue_id);
   `);
 }
 
@@ -102,7 +117,19 @@ export function upsertVenue(
     .run(id, name, url, area ?? null);
 }
 
-export function upsertEvent(event: EventInput) {
+export function upsertEvent(raw: EventInput) {
+  // Ensure all named parameters exist (better-sqlite3 requires them all)
+  const event = {
+    artist: null,
+    start_time: null,
+    open_time: null,
+    ticket_url: null,
+    price: null,
+    image_url: null,
+    description: null,
+    source_url: null,
+    ...raw,
+  };
   getDb()
     .prepare(
       `INSERT INTO events
@@ -190,3 +217,76 @@ export function getVenues(): Venue[] {
     )
     .all() as Venue[];
 }
+
+// ---------- Scrape logs ----------
+
+export interface ScrapeLog {
+  id: number;
+  run_id: string;
+  venue_id: string;
+  venue_name: string;
+  status: "running" | "ok" | "error";
+  events_saved: number;
+  error: string | null;
+  started_at: string;
+  finished_at: string | null;
+}
+
+export function insertScrapeLog(
+  run_id: string,
+  venue_id: string,
+  venue_name: string
+): number {
+  const result = getDb()
+    .prepare(
+      `INSERT INTO scrape_logs (run_id, venue_id, venue_name, status)
+       VALUES (?, ?, ?, 'running')`
+    )
+    .run(run_id, venue_id, venue_name);
+  return result.lastInsertRowid as number;
+}
+
+export function updateScrapeLog(
+  id: number,
+  status: "ok" | "error",
+  events_saved: number,
+  error?: string
+) {
+  getDb()
+    .prepare(
+      `UPDATE scrape_logs
+       SET status = ?, events_saved = ?, error = ?, finished_at = datetime('now')
+       WHERE id = ?`
+    )
+    .run(status, events_saved, error ?? null, id);
+}
+
+export function getLatestScrapeRun(): ScrapeLog[] {
+  return getDb()
+    .prepare(
+      `SELECT * FROM scrape_logs
+       WHERE run_id = (SELECT run_id FROM scrape_logs ORDER BY started_at DESC LIMIT 1)
+       ORDER BY id ASC`
+    )
+    .all() as ScrapeLog[];
+}
+
+export function getScrapeRunById(run_id: string): ScrapeLog[] {
+  return getDb()
+    .prepare(
+      "SELECT * FROM scrape_logs WHERE run_id = ? ORDER BY id ASC"
+    )
+    .all(run_id) as ScrapeLog[];
+}
+
+export function getLastScrapePerVenue(): ScrapeLog[] {
+  return getDb()
+    .prepare(
+      `SELECT s.* FROM scrape_logs s
+       INNER JOIN (
+         SELECT venue_id, MAX(started_at) AS latest FROM scrape_logs GROUP BY venue_id
+       ) t ON s.venue_id = t.venue_id AND s.started_at = t.latest
+       ORDER BY s.venue_name ASC`
+    )
+    .all() as ScrapeLog[];
+}
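
On the upsertEvent change above: better-sqlite3 refuses to run a prepared
statement unless every named parameter referenced by the SQL is present on the
bound object, even when the value should simply be NULL, which is what produced
the "Missing named parameter" error. A standalone sketch of the failure and the
defaults-spread fix (illustrative only, in-memory database, not code from this
patch):

import Database from "better-sqlite3";

// Illustrative only. better-sqlite3 requires every named parameter referenced
// by the SQL to be present on the bound object, even if its value is null.
const db = new Database(":memory:");
db.exec("CREATE TABLE events (title TEXT, description TEXT)");

const insert = db.prepare(
  "INSERT INTO events (title, description) VALUES (@title, @description)"
);

// Throws TypeError: Missing named parameter "description"
// insert.run({ title: "Opening night" });

// Works once the missing keys are filled with defaults, which is what
// upsertEvent now does by spreading the raw input over a defaults object.
insert.run({ description: null, title: "Opening night" });
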
diff --git a/app/lib/scraper-runner.server.ts b/app/lib/scraper-runner.server.ts
index 191dd00..87dd16c 100644
--- a/app/lib/scraper-runner.server.ts
+++ b/app/lib/scraper-runner.server.ts
@@ -1,9 +1,16 @@
-import { upsertVenue, upsertEvent } from "./db.server";
+import { randomUUID } from "crypto";
+import {
+  upsertVenue,
+  upsertEvent,
+  insertScrapeLog,
+  updateScrapeLog,
+  type ScrapeLog,
+} from "./db.server";
 import { generateVenueMarkdown, generateAllVenueMarkdown } from "./markdown-writer.server";
 import { ALL_SCRAPERS } from "~/scrapers/index";
 import type { EventInput } from "./db.server";
 
-const SCRAPE_WINDOW_DAYS = 35; // ~1 month
+const SCRAPE_WINDOW_DAYS = 35;
 
 function scrapeWindow(): { from: string; to: string } {
   const from = new Date();
@@ -21,81 +28,81 @@ function withinWindow(event: EventInput, from: string, to: string): boolean {
 }
 
 export interface ScrapeResult {
+  run_id: string;
   venue_id: string;
   venue_name: string;
+  status: "ok" | "error";
   events_saved: number;
-  markdown_path?: string;
   error?: string;
 }
 
-export async function runAllScrapers(): Promise<ScrapeResult[]> {
+/** Fire-and-forget: start all scrapers in the background, return run_id immediately. */
+export function startAllScrapersAsync(): string {
+  const run_id = randomUUID();
+  // Don't await — runs in background
+  void runAllScrapers(run_id);
+  return run_id;
+}
+
+export function startScraperAsync(venueId: string): string {
+  const run_id = randomUUID();
+  void runScraper(venueId, run_id);
+  return run_id;
+}
+
+/** Runs all scrapers, writes logs to DB. Can be awaited (e.g. from CLI). */
+export async function runAllScrapers(run_id = randomUUID()): Promise<ScrapeResult[]> {
   const results: ScrapeResult[] = [];
   const successIds: string[] = [];
 
   for (const scraper of ALL_SCRAPERS) {
     const { venue } = scraper;
     upsertVenue(venue.id, venue.name, venue.url, venue.area);
+    const logId = insertScrapeLog(run_id, venue.id, venue.name);
 
     try {
       const { from, to } = scrapeWindow();
-      const events = (await scraper.scrape()).filter((e) =>
-        withinWindow(e, from, to)
-      );
+      const events = (await scraper.scrape()).filter((e) => withinWindow(e, from, to));
       for (const event of events) {
         upsertEvent(event);
       }
+      updateScrapeLog(logId, "ok", events.length);
       successIds.push(venue.id);
-      results.push({
-        venue_id: venue.id,
-        venue_name: venue.name,
-        events_saved: events.length,
-      });
+      results.push({ run_id, venue_id: venue.id, venue_name: venue.name, status: "ok", events_saved: events.length });
     } catch (err) {
-      results.push({
-        venue_id: venue.id,
-        venue_name: venue.name,
-        events_saved: 0,
-        error: err instanceof Error ? err.message : String(err),
-      });
+      const error = err instanceof Error ? err.message : String(err);
+      updateScrapeLog(logId, "error", 0, error);
+      results.push({ run_id, venue_id: venue.id, venue_name: venue.name, status: "error", events_saved: 0, error });
     }
   }
 
-  // Generate Markdown files for all venues that scraped successfully
   generateAllVenueMarkdown(successIds);
-
   return results;
 }
 
-export async function runScraper(venueId: string): Promise<ScrapeResult> {
+/** Runs a single scraper by venue ID. */
+export async function runScraper(venueId: string, run_id = randomUUID()): Promise<ScrapeResult> {
   const scraper = ALL_SCRAPERS.find((s) => s.venue.id === venueId);
   if (!scraper) {
-    return { venue_id: venueId, venue_name: venueId, events_saved: 0, error: "Scraper not found" };
+    return { run_id, venue_id: venueId, venue_name: venueId, status: "error", events_saved: 0, error: "Scraper not found" };
   }
 
   const { venue } = scraper;
   upsertVenue(venue.id, venue.name, venue.url, venue.area);
+  const logId = insertScrapeLog(run_id, venue.id, venue.name);
 
   try {
     const { from, to } = scrapeWindow();
-    const events = (await scraper.scrape()).filter((e) =>
-      withinWindow(e, from, to)
-    );
+    const events = (await scraper.scrape()).filter((e) => withinWindow(e, from, to));
    for (const event of events) {
       upsertEvent(event);
     }
+    updateScrapeLog(logId, "ok", events.length);
     generateVenueMarkdown(venue.id);
-    return {
-      venue_id: venue.id,
-      venue_name: venue.name,
-      events_saved: events.length,
-      markdown_path: `events/${venue.id}.md`,
-    };
+    return { run_id, venue_id: venue.id, venue_name: venue.name, status: "ok", events_saved: events.length };
   } catch (err) {
-    return {
-      venue_id: venue.id,
-      venue_name: venue.name,
-      events_saved: 0,
-      error: err instanceof Error ? err.message : String(err),
-    };
+    const error = err instanceof Error ? err.message : String(err);
+    updateScrapeLog(logId, "error", 0, error);
+    return { run_id, venue_id: venue.id, venue_name: venue.name, status: "error", events_saved: 0, error };
   }
 }
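
The npm run scrape CLI described in the commit message lives outside app/ and
is therefore not shown in this diff (limited to 'app'). A rough sketch of how
such an entry point could drive the runners exported above; the file location,
import paths, and ANSI coloring are assumptions, not the commit's actual
script:

// Hypothetical sketch of the npm-run-scrape entry point; the real script sits
// outside app/ and is not part of this diff. Only the runner API shown above
// is relied on; paths and colors are assumed.
import { runAllScrapers, runScraper } from "../app/lib/scraper-runner.server";
import { ALL_SCRAPERS } from "../app/scrapers/index";

const dim = (s: string) => `\x1b[2m${s}\x1b[0m`;
const green = (s: string) => `\x1b[32m${s}\x1b[0m`;
const red = (s: string) => `\x1b[31m${s}\x1b[0m`;

async function main() {
  const arg = process.argv[2];

  if (arg === "--list") {
    for (const s of ALL_SCRAPERS) console.log(`${s.venue.id}  ${s.venue.name}`);
    return;
  }

  // One venue when an id is given, otherwise every registered scraper.
  const results = arg ? [await runScraper(arg)] : await runAllScrapers();

  let failed = false;
  for (const r of results) {
    if (r.status === "ok") {
      console.log(`${green("ok")} ${r.venue_name}: ${r.events_saved} events`);
    } else {
      failed = true;
      console.log(`${red("error")} ${r.venue_name}`);
      if (r.error) console.log(dim(`  ${r.error}`));
    }
  }
  if (failed) process.exit(1);
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
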
diff --git a/app/routes.ts b/app/routes.ts
index 028da16..c0096e1 100644
--- a/app/routes.ts
+++ b/app/routes.ts
@@ -8,4 +8,5 @@ export default [
   ]),
   route("venues", "routes/venues.tsx"),
   route("api/scrape", "routes/api.scrape.ts"),
+  route("api/scrape-status", "routes/api.scrape-status.ts"),
 ] satisfies RouteConfig;
diff --git a/app/routes/api.scrape-status.ts b/app/routes/api.scrape-status.ts
new file mode 100644
index 0000000..28d08d4
--- /dev/null
+++ b/app/routes/api.scrape-status.ts
@@ -0,0 +1,16 @@
+/**
+ * GET /api/scrape-status?run_id=xxx   returns the results for that run
+ * GET /api/scrape-status              returns the results of the latest run
+ */
+import type { Route } from "./+types/api.scrape-status";
+import { getScrapeRunById, getLatestScrapeRun } from "~/lib/db.server";
+
+export async function loader({ request }: Route.LoaderArgs) {
+  const url = new URL(request.url);
+  const run_id = url.searchParams.get("run_id");
+
+  const logs = run_id ? getScrapeRunById(run_id) : getLatestScrapeRun();
+  const running = logs.some((l) => l.status === "running");
+
+  return Response.json({ running, results: logs });
+}
diff --git a/app/routes/api.scrape.ts b/app/routes/api.scrape.ts
index 4071985..f9daa5c 100644
--- a/app/routes/api.scrape.ts
+++ b/app/routes/api.scrape.ts
@@ -1,37 +1,37 @@
 /**
- * Resource route: POST /api/scrape
- * Triggers scraping for all venues (or a specific one via ?venue_id=xxx).
- * Returns JSON results and redirects back if called from a form.
+ * Resource route: /api/scrape
+ *
+ * POST (form action)   starts scraping in the background, returns 202 immediately
+ * GET ?venue_id=xxx    starts a single venue in the background
+ * GET (no params)      starts all venues in the background
+ *
+ * Check status via /api/scrape-status?run_id=xxx
  */
 import { redirect } from "react-router";
 import type { Route } from "./+types/api.scrape";
-import { runAllScrapers, runScraper } from "~/lib/scraper-runner.server";
+import { startAllScrapersAsync, startScraperAsync } from "~/lib/scraper-runner.server";
 
 export async function action({ request }: Route.ActionArgs) {
   const formData = await request.formData();
   const venueId = formData.get("venue_id");
 
-  const results = venueId
-    ? [await runScraper(String(venueId))]
-    : await runAllScrapers();
+  const run_id = venueId
+    ? startScraperAsync(String(venueId))
+    : startAllScrapersAsync();
 
-  // If called from a browser form, redirect back
   const referer = request.headers.get("Referer");
-  if (referer) {
-    return redirect(referer);
-  }
+  if (referer) return redirect(referer);
 
-  return Response.json({ results });
+  return Response.json({ run_id, status: "started" }, { status: 202 });
 }
 
-// Allow GET for quick testing in the browser
 export async function loader({ request }: Route.LoaderArgs) {
   const url = new URL(request.url);
   const venueId = url.searchParams.get("venue_id");
 
-  const results = venueId
-    ? [await runScraper(venueId)]
-    : await runAllScrapers();
+  const run_id = venueId
+    ? startScraperAsync(venueId)
+    : startAllScrapersAsync();
 
-  return Response.json({ results });
+  return Response.json({ run_id, status: "started" }, { status: 202 });
 }
diff --git a/app/routes/venues.tsx b/app/routes/venues.tsx
index 23b052f..affa72a 100644
--- a/app/routes/venues.tsx
+++ b/app/routes/venues.tsx
@@ -1,17 +1,19 @@
-import { useLoaderData, Link } from "react-router";
+import { useLoaderData, Link, Form } from "react-router";
 import type { Route } from "./+types/venues";
-import { getVenues } from "~/lib/db.server";
+import { getVenues, getLastScrapePerVenue, type ScrapeLog } from "~/lib/db.server";
 import { getScraperIds } from "~/lib/venue-meta.server";
 
 export async function loader(_: Route.LoaderArgs) {
   const venues = getVenues();
   const scraperIds = getScraperIds();
-  return { venues, scraperIds };
+  const scrapeStatus = getLastScrapePerVenue();
+  return { venues, scraperIds, scrapeStatus };
 }
 
 export default function Venues() {
-  const { venues, scraperIds: scraperIdList } = useLoaderData<typeof loader>();
+  const { venues, scraperIds: scraperIdList, scrapeStatus } = useLoaderData<typeof loader>();
   const scraperIds = new Set(scraperIdList);
+  const statusMap = new Map(scrapeStatus.map((s) => [s.venue_id, s]));
 
   return (
@@ -26,43 +28,94 @@ export default function Venues() {
 [Markup note: the JSX element tags and attributes in this hunk did not survive
 the HTML-to-text capture of this page; only the text nodes, expressions, JSX
 comments, and +/- markers below could be recovered.]
-          会場一覧
-          現在 {scraperIdList.length} 会場のスクレイパーが登録されています。
-          新しい会場を追加するには app/scrapers/ に
-          モジュールを追加して index.ts に登録してください。
+          会場一覧
+          現在 {scraperIdList.length} 会場のスクレイパーが登録されています。
 
       {venues.length === 0 ? (
-        まだ会場データがありません。「情報を更新」してください。
+        まだ会場データがありません。「全会場を更新」してください。
       ) : (
-        {venues.map((v) => (
-          {v.name}
-          {v.area && {v.area}}
+        {venues.map((v) => {
+          const log = statusMap.get(v.id);
+          return (
+            {/* venue name + area */}
+            {v.name}
+            {v.area && {v.area}}
+            {/* event count */}
+            {v.event_count ?? 0} 件
+            {/* last scrape status */}
+            {log ? (
+              [ScrapeStatus badge: markup lost]
+            ) : (
+              未実行
+            )}
+            {/* per-venue refresh button */}
             {scraperIds.has(v.id) && (
-              スクレイパー登録済
+              [refresh Form/button: markup lost]
             )}
-          {v.event_count ?? 0}
-        ))}
+          );
+        })}
       )}
   );
 }
+
+function ScrapeStatus({ log }: { log: ScrapeLog }) {
+  if (log.status === "running") {
+    return ⟳ 実行中...;
+  }
+  if (log.status === "error") {
+    return (
+      ✖ エラー
+    );
+  }
+  const time = log.finished_at?.slice(0, 16).replace("T", " ") ?? "";
+  return (
+    ✔ {time}
+  );
+}
--
cgit v1.2.3