summaryrefslogtreecommitdiff
path: root/app/scrapers
diff options
context:
space:
mode:
author yyamashita <yyamashita@mosquit.one> 2026-05-07 10:16:43 +0900
committer yyamashita <yyamashita@mosquit.one> 2026-05-07 10:16:43 +0900
commit 0cd5fb770ca9bd3f304d9556a4b33a4ad4f45e7e (patch)
tree aa316c21d7195689d87669338373d83b0b6ac3fb /app/scrapers
parent 538fd636e25595d88a958344d285c0e7cf44e530 (diff)
Playwright scraping for FLAT/Pitbar; web UI display-only
- Install Playwright + Chromium; add shared browser singleton (playwright.server.ts)
- Rewrite flat-nishiogikubo scraper: Wix calendar via headless browser, month navigation via date picker, extracts .WPczEB/.ExCBIq selectors
- Rewrite pitbar-nishiogikubo scraper: freecalend.com via headless browser, parses cal-{member}-{year}-{month}-{day} cell IDs
- scraper-runner: close shared browser after each run with closeBrowser()
- Remove all scrape trigger buttons from web UI (events index, venues page); remove /api/scrape and /api/scrape-status from routes.ts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'app/scrapers')
-rw-r--r-- app/scrapers/flat-nishiogikubo.ts 142
-rw-r--r-- app/scrapers/pitbar-nishiogikubo.ts 169
2 files changed, 222 insertions, 89 deletions
diff --git a/app/scrapers/flat-nishiogikubo.ts b/app/scrapers/flat-nishiogikubo.ts
index 03cc70c..da6752f 100644
--- a/app/scrapers/flat-nishiogikubo.ts
+++ b/app/scrapers/flat-nishiogikubo.ts
@@ -1,17 +1,20 @@
/**
* FLAT 西荻窪 — https://www.flat.rinky.info/schedule
*
- * ⚠️ Wix サイトのためクライアントサイド JS レンダリング。
- * 静的 fetch ではイベントデータを取得できない。
+ * Wix イベントカレンダー。JS レンダリングが必要なため Playwright を使用。
*
- * 代替案:
- * - Playwright/Puppeteer でヘッドレスブラウザを使用
- * - Wix Events API (要サイトオーナーによる API キー発行)
+ * DOM 構造:
+ * [data-hook="calendar-cell-<UTC ISO>"] ← 各日付セル
+ * .WPczEB → 開始時刻
+ * .ExCBIq → イベントタイトル
+ * aria-label が "イベントなし" のセルはスキップ
*
- * 現在は空配列を返す(エラーにはしない)。
+ * 月ナビ: calendar-date-picker-button を開いて datepicker-right-arrow で翌月へ。
*/
+import type { Page } from "playwright";
import type { Scraper, VenueMeta } from "./base";
import type { EventInput } from "~/lib/db.server";
+import { getBrowser } from "~/lib/playwright.server";
export const venue: VenueMeta = {
id: "flat-nishiogikubo",
@@ -20,14 +23,129 @@ export const venue: VenueMeta = {
area: "西荻窪",
};
+/** Public schedule page of FLAT Nishi-Ogikubo; also stored as every event's source URL. */
+const SCHEDULE_URL = "https://www.flat.rinky.info/schedule";
+
+/**
+ * Collects the events shown in the month grid currently rendered on `page`.
+ *
+ * Every element whose `data-hook` starts with `calendar-cell-` is inspected.
+ * Cells whose aria-label contains "イベントなし" are skipped. The remainder of
+ * the data-hook is parsed as a timestamp and shifted by +9h to obtain the JST
+ * calendar date. Titles (`.ExCBIq`) are paired with times (`.WPczEB`) by index;
+ * a title with no time at the same index, or with an empty one, gets a null
+ * start time.
+ *
+ * @param page Playwright page that already displays the calendar.
+ * @returns One entry per non-empty title found in the grid.
+ */
+async function extractMonthEvents(page: Page): Promise<EventInput[]> {
+  const JST_OFFSET_MS = 9 * 3_600_000;
+  const collected: EventInput[] = [];
+
+  for (const dayCell of await page.locator('[data-hook^="calendar-cell-"]').all()) {
+    const label = await dayCell.getAttribute("aria-label");
+    if ((label ?? "").includes("イベントなし")) continue;
+
+    const hook = await dayCell.getAttribute("data-hook");
+    const stamp = (hook ?? "").replace("calendar-cell-", "");
+    if (stamp === "") continue;
+
+    // The hook carries a UTC timestamp; move it to JST (UTC+9) before taking the date part.
+    const epochMs = new Date(stamp).getTime();
+    if (Number.isNaN(epochMs)) continue;
+    const jstDay = new Date(epochMs + JST_OFFSET_MS).toISOString().slice(0, 10);
+
+    const times = dayCell.locator(".WPczEB");
+    const titles = dayCell.locator(".ExCBIq");
+    const titleTotal = await titles.count();
+    const timeTotal = await times.count();
+
+    for (let idx = 0; idx < titleTotal; idx += 1) {
+      const heading = (await titles.nth(idx).innerText()).trim();
+      if (heading === "") continue;
+
+      let startTime: string | null = null;
+      if (idx < timeTotal) {
+        const rawTime = (await times.nth(idx).innerText()).trim();
+        startTime = rawTime === "" ? null : rawTime;
+      }
+
+      collected.push({
+        venue_id: venue.id,
+        title: heading,
+        date: jstDay,
+        start_time: startTime,
+        source_url: SCHEDULE_URL,
+      });
+    }
+  }
+
+  return collected;
+}
+
+/**
+ * Switches the calendar on `page` to another month by driving its date picker.
+ *
+ * Opens the picker, then clicks the right or left arrow one month at a time
+ * (at most 6 steps) until the picker's year and month labels match the target.
+ * Finally clicks the first button whose aria-label contains "<year>年<month>月";
+ * if no such button exists the picker is dismissed with Escape. The function
+ * then waits a fixed 2 s before returning.
+ *
+ * @param page Playwright page showing the calendar.
+ * @param targetYYYYMM Target month formatted as "YYYY-MM".
+ */
+async function navigateToMonth(page: Page, targetYYYYMM: string): Promise<void> {
+  const [targetYear, targetMonth] = targetYYYYMM.split("-").map(Number);
+
+  // Month labels as shown by the picker, mapped to 1-based month numbers.
+  // Built once here instead of on every loop iteration.
+  const monthByLabel: Record<string, number> = {
+    "1月": 1, "2月": 2, "3月": 3, "4月": 4, "5月": 5, "6月": 6,
+    "7月": 7, "8月": 8, "9月": 9, "10月": 10, "11月": 11, "12月": 12,
+  };
+
+  // Open the date picker and give it a moment to render.
+  await page.click('[data-hook="calendar-date-picker-button"]');
+  await page.waitForTimeout(500);
+
+  // Step toward the target month, re-reading the picker labels after every click.
+  for (let attempt = 0; attempt < 6; attempt++) {
+    const monthText = await page.locator('[data-hook="datepicker-month-dropdown-button"]').innerText();
+    const yearText = await page.locator('[data-hook="datepicker-year-dropdown-button"]').innerText();
+
+    const currentYear = parseInt(yearText, 10);
+    // An unrecognised label counts as month 0, which makes the loop step forward.
+    const currentMonth = monthByLabel[monthText.trim()] ?? 0;
+
+    if (currentYear === targetYear && currentMonth === targetMonth) break;
+
+    const diff = (targetYear * 12 + targetMonth) - (currentYear * 12 + currentMonth);
+    if (diff > 0) {
+      await page.click('[data-hook="datepicker-right-arrow"]');
+    } else {
+      await page.click('[data-hook="datepicker-left-arrow"]');
+    }
+    await page.waitForTimeout(300);
+  }
+
+  // Pick any day of the target month: its aria-label contains "<year>年<month>月".
+  const targetPrefix = `${targetYear}年${targetMonth}月`;
+  const [firstDayBtn] = await page.locator(`button[aria-label*="${targetPrefix}"]`).all();
+  if (firstDayBtn) {
+    await firstDayBtn.click();
+  } else {
+    // Fallback: press Escape to close the picker.
+    await page.keyboard.press("Escape");
+  }
+  // Fixed pause after the selection before the caller reads the grid.
+  await page.waitForTimeout(2000);
+}
+
export const scraper: Scraper = {
venue,
async scrape(): Promise<EventInput[]> {
- // Wix renders events with JavaScript; static fetch returns an empty calendar.
- // TODO: Replace with a headless browser implementation (e.g. Playwright).
- throw new Error(
- "FLAT 西荻窪 は Wix サイトのため JS レンダリングが必要です。" +
- "ヘッドレスブラウザ(Playwright 等)への移行が必要です。"
- );
+    // Flow: open the schedule page in the shared browser, read the month that
+    // is shown, switch to the following month, read again, then drop repeated
+    // (date, title) pairs. The page is always closed, even when a step throws.
+    const page = await (await getBrowser()).newPage();
+
+    try {
+      await page.goto(SCHEDULE_URL, {
+        waitUntil: "domcontentloaded",
+        timeout: 30_000,
+      });
+      // Fixed pause so the client-side calendar can render.
+      await page.waitForTimeout(5_000);
+
+      // Month displayed on load.
+      const thisMonth = await extractMonthEvents(page);
+
+      // Following month, so the 35-day window is covered.
+      const today = new Date();
+      const following = new Date(today.getFullYear(), today.getMonth() + 1, 1);
+      const monthPart = String(following.getMonth() + 1).padStart(2, "0");
+      await navigateToMonth(page, `${following.getFullYear()}-${monthPart}`);
+      const nextMonth = await extractMonthEvents(page);
+
+      // Keep the first occurrence of each date + title pair, in encounter order.
+      const firstByKey = new Map<string, EventInput>();
+      for (const entry of [...thisMonth, ...nextMonth]) {
+        const key = `${entry.date}|${entry.title}`;
+        if (!firstByKey.has(key)) firstByKey.set(key, entry);
+      }
+      return [...firstByKey.values()];
+    } finally {
+      await page.close();
+    }
},
};
diff --git a/app/scrapers/pitbar-nishiogikubo.ts b/app/scrapers/pitbar-nishiogikubo.ts
index 5c70023..54d25d5 100644
--- a/app/scrapers/pitbar-nishiogikubo.ts
+++ b/app/scrapers/pitbar-nishiogikubo.ts
@@ -1,18 +1,19 @@
/**
- * Pitbar 西荻窪 — http://freecalend.com/open/mem25771_date{YYYYMM}
+ * Pitbar 西荻窪 — http://freecalend.com/open/mem25771
*
- * スケジュールは Ameblo (https://ameblo.jp/pitbar/) 経由で
- * freecalend.com に掲載されているが、自動リクエストをブロックしている。
+ * freecalend.com は静的 fetch をブロックするため Playwright を使用。
*
- * 代替案:
- * - User-Agent を設定したヘッドレスブラウザで freecalend を取得
- * - 公式 Instagram / X (@pitbar_nishiogi) の投稿を取得
- * - 手動でイベントを登録する管理画面を用意する
+ * DOM 構造:
+ * id="cal-25771-{year}-{month}-{day}" ← 日付セル
+ * テキストは: 日数字 + "M.D(day)" + イベント本文
+ * id="sitatumari-25771-..." ← 区切り (スキップ)
*
- * 月ごとの URL パターン: http://freecalend.com/open/mem25771_date{YYYYMM}
+ * open / start 時刻は "open HH:MM" / "start HH:MM" テキストから取得。
*/
+import type { Page } from "playwright";
import type { Scraper, VenueMeta } from "./base";
import type { EventInput } from "~/lib/db.server";
+import { getBrowser } from "~/lib/playwright.server";
export const venue: VenueMeta = {
id: "pitbar-nishiogikubo",
@@ -21,81 +22,95 @@ export const venue: VenueMeta = {
area: "西荻窪",
};
-const FREECALEND_MEMBER = "25771";
+/** Public freecalend.com calendar page for Pitbar; also stored as every event's source URL. */
+const CALENDAR_URL = "http://freecalend.com/open/mem25771";
+/** Member id that appears in the calendar cell ids (`cal-<id>-<year>-<month>-<day>`). */
+const MEMBER_ID = "25771";
+
+/** Matches a title line that marks a bar-opening entry rather than a live event. */
+const BAR_ONLY_RE = /^BAR営業/;
+
+/**
+ * Reads the calendar cells rendered on `page` and turns them into events.
+ *
+ * Cells are the elements whose id starts with `cal-<MEMBER_ID>-`; the date is
+ * taken from the id and zero-padded to `YYYY-MM-DD`. Within a cell's text the
+ * first two non-empty lines are discarded and the next one becomes the title.
+ * Cells with no remaining text or with a title matching `BAR_ONLY_RE` are
+ * skipped. Open/start times, a price line and "■"-prefixed artist lines are
+ * picked out of the remaining text.
+ *
+ * @param page Playwright page that already displays the calendar.
+ * @param dateFrom Inclusive lower bound, `YYYY-MM-DD`.
+ * @param dateTo Inclusive upper bound, `YYYY-MM-DD`.
+ * @returns Events whose date lies within `[dateFrom, dateTo]`.
+ */
+async function extractEvents(page: Page, dateFrom: string, dateTo: string): Promise<EventInput[]> {
+  // This callback runs inside the browser, so it may only use its own argument.
+  const rawCells = await page.evaluate((memberId: string) => {
+    const found: { date: string; text: string }[] = [];
+    const nodes = document.querySelectorAll(`[id^="${`cal-${memberId}-`}"]`);
+    for (const node of Array.from(nodes)) {
+      // id: cal-25771-YYYY-M-D
+      const [, , year, month, day] = (node.getAttribute("id") ?? "").split("-");
+      if (!year || !month || !day) continue;
+      found.push({
+        date: `${year}-${month.padStart(2, "0")}-${day.padStart(2, "0")}`,
+        text: node.textContent?.trim() ?? "",
+      });
+    }
+    return found;
+  }, MEMBER_ID);
+
+  const results: EventInput[] = [];
+
+  for (const { date, text } of rawCells) {
+    // Zero-padded ISO dates compare correctly as plain strings.
+    const outsideWindow = date < dateFrom || date > dateTo;
+    if (outsideWindow) continue;
+
+    // Line 1 is the day number and line 2 is "M.D(dayname)"; the rest is content.
+    const body = text
+      .split(/\n/)
+      .map((line) => line.trim())
+      .filter((line) => line !== "")
+      .slice(2);
+
+    const title = body[0];
+    if (!title || BAR_ONLY_RE.test(title)) continue;
+
+    const joined = body.join("\n");
+    const openHit = /open\s+(\d{1,2}:\d{2})/i.exec(joined);
+    const startHit = /start\s+(\d{1,2}:\d{2})/i.exec(joined);
+
+    // Price: the first line mentioning adv/door/前売 together with "yen" or "円".
+    const priceHit = /((?:adv|door|前売)[^\n]*(?:yen|円)[^\n]*)/i.exec(joined);
+
+    // Artists: every line starting with ■, marker removed.
+    const artistNames: string[] = [];
+    for (const line of body) {
+      if (line.startsWith("■")) artistNames.push(line.slice(1).trim());
+    }
+
+    results.push({
+      venue_id: venue.id,
+      title,
+      artist: artistNames.join("、") || null,
+      date,
+      open_time: openHit?.[1] ?? null,
+      start_time: startHit?.[1] ?? null,
+      price: priceHit?.[1]?.trim() ?? null,
+      source_url: CALENDAR_URL,
+    });
+  }
+
+  return results;
+}
export const scraper: Scraper = {
venue,
async scrape(): Promise<EventInput[]> {
- const months = upcomingMonths(2);
- const events: EventInput[] = [];
-
- for (const ym of months) {
- const url = `http://freecalend.com/open/mem${FREECALEND_MEMBER}_date${ym}`;
- const res = await fetch(url, {
- headers: {
- "User-Agent":
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/124 Safari/537.36",
- Referer: "https://ameblo.jp/pitbar/",
- },
- redirect: "follow",
- });
- if (!res.ok) continue;
-
- const html = await res.text();
- if (!html.trim()) continue;
-
- // freecalend は HTML テーブルカレンダー形式
- // <td class="day_..."> 内にイベント名と時刻が入る
- const { load } = await import("cheerio");
- const $ = load(html);
-
- $("td[class*='day_']").each((_, el) => {
- const $el = $(el);
- const text = $el.text().trim();
- if (!text || /^\d+$/.test(text)) return; // 日付のみのセルはスキップ
-
- const dayMatch = $el.attr("class")?.match(/day_(\d+)/);
- if (!dayMatch) return;
- const day = dayMatch[1].padStart(2, "0");
- const date = `${ym.slice(0, 4)}-${ym.slice(4)}-${day}`;
-
- const lines = text.split(/[\n\r]+/).map((l) => l.trim()).filter(Boolean);
- const title = lines[0] ?? text.slice(0, 100);
-
- const timeMatch = text.match(/(\d{1,2}:\d{2})/g);
- const openTime = timeMatch?.[0] ?? null;
- const startTime = timeMatch?.[1] ?? null;
-
- events.push({
- venue_id: venue.id,
- title,
- date,
- open_time: openTime,
- start_time: startTime,
- source_url: url,
- });
+    // Flow: open the calendar in the shared browser, wait for it to render,
+    // then extract the events dated from today through today + 35 days.
+    // The page is always closed, even when a step throws.
+    const browser = await getBrowser();
+    const page = await browser.newPage();
+
+    try {
+      await page.goto(CALENDAR_URL, {
+        waitUntil: "domcontentloaded",
+        timeout: 20_000,
});
- }
+      // Fixed pause so the calendar can finish rendering.
+      await page.waitForTimeout(5_000);
- if (events.length === 0) {
- throw new Error(
- "Pitbar freecalend からデータを取得できませんでした。" +
- "freecalend.com が自動リクエストをブロックしている可能性があります。"
- );
- }
+      // toISOString() reports the UTC day. The calendar is presumably dated in
+      // Japan time (Tokyo venue; the FLAT scraper normalises to JST as well),
+      // so shift by +9h first. Otherwise the window starts a day early between
+      // 00:00 and 08:59 JST.
+      const JST_OFFSET_MS = 9 * 3_600_000;
+      const nowJstMs = Date.now() + JST_OFFSET_MS;
+      const dateFrom = new Date(nowJstMs).toISOString().slice(0, 10);
+      const dateTo = new Date(nowJstMs + 35 * 86_400_000).toISOString().slice(0, 10);
- return events;
+      return await extractEvents(page, dateFrom, dateTo);
+    } finally {
+      await page.close();
+    }
},
};
-
-function upcomingMonths(count: number): string[] {
- const months: string[] = [];
- const now = new Date();
- for (let i = 0; i < count; i++) {
- const d = new Date(now.getFullYear(), now.getMonth() + i, 1);
- const y = d.getFullYear();
- const m = String(d.getMonth() + 1).padStart(2, "0");
- months.push(`${y}${m}`);
- }
- return months;
-}