diff options
Diffstat (limited to 'app/scrapers/shibuya-o.ts')
| -rw-r--r-- | app/scrapers/shibuya-o.ts | 153 |
1 files changed, 93 insertions, 60 deletions
diff --git a/app/scrapers/shibuya-o.ts b/app/scrapers/shibuya-o.ts index 1ad8d8c..3d6f192 100644 --- a/app/scrapers/shibuya-o.ts +++ b/app/scrapers/shibuya-o.ts @@ -1,8 +1,15 @@ /** - * Shibuya O-East / O-West / O-Crest / O-Nest (渋谷) - * https://www.shibuya-o.com/schedule/ + * 渋谷 O-EAST / O-WEST / O-Crest / O-nest — https://shibuya-o.com * - * The page uses a unified schedule listing for all O venues. + * 各ベニューのスケジュールページを個別に取得して統合する。 + * DOM 構造 (共通): + * <div class="p-scheduled-card"> + * <a href="https://shibuya-o.com/{venue}/schedule/{slug}/"> + * <span class="p-scheduled-card__date-item">05 / 01</span> + * <span class="p-scheduled-card__date-open">OPEN 18:00 / START 19:00</span> + * <span class="p-scheduled-card__title-main">タイトル</span> + * <li class="p-scheduled-card__artist-item">アーティスト</li> + * 年は nav リンク <a href="/east/schedule/?y=2026&m=6"> から取得。 */ import * as cheerio from "cheerio"; import type { Scraper, VenueMeta } from "./base"; @@ -10,73 +17,99 @@ import type { EventInput } from "~/lib/db.server"; export const venue: VenueMeta = { id: "shibuya-o", - name: "渋谷 O-EAST / O-WEST", - url: "https://www.shibuya-o.com", + name: "渋谷 O-EAST / O-WEST / O-Crest / O-nest", + url: "https://shibuya-o.com", area: "渋谷", }; -export const scraper: Scraper = { - venue, - async scrape(): Promise<EventInput[]> { - const res = await fetch("https://www.shibuya-o.com/schedule/"); - if (!res.ok) throw new Error(`HTTP ${res.status}`); - const html = await res.text(); - const $ = cheerio.load(html); - const events: EventInput[] = []; +const SUB_VENUES = ["east", "west", "crest", "nest"]; +const BASE = "https://shibuya-o.com"; - $(".schedule_list li, .c-schedule__item, .event-item").each((_, el) => { - const $el = $(el); +async function scrapeVenue(subVenue: string): Promise<EventInput[]> { + const url = `${BASE}/${subVenue}/schedule/`; + const res = await fetch(url); + if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`); + const $ = cheerio.load(await res.text()); + const events: EventInput[] = []; - const title = $el.find(".schedule_title, .title, h3").first().text().trim(); - if (!title) return; + // Extract year: try "next" nav link (?y=YYYY&m=MM) + const nextHref = $("a[href*='?y='][href*='&m=']").last().attr("href") ?? ""; + const nextYearMatch = nextHref.match(/[?&]y=(\d{4})/); + const nextMonthMatch = nextHref.match(/[?&]m=(\d{1,2})/); + const currentMonthRaw = $("div.p-schedule__month").first().text().trim(); + const currentMonth = parseInt(currentMonthRaw, 10); - const rawDate = - $el.find(".schedule_date, .date, time").first().text().trim() || - $el.find("time").attr("datetime") || - ""; - const date = parseJapaneseDate(rawDate); - if (!date) return; + let year = new Date().getFullYear(); + if (nextYearMatch && nextMonthMatch) { + const nextYear = parseInt(nextYearMatch[1], 10); + const nextMonth = parseInt(nextMonthMatch[1], 10); + // If next month == current month + 1 (normal case), year == nextYear + // If current month == 12 and next month == 1, year == nextYear - 1 + year = nextMonth === currentMonth + 1 ? nextYear : nextYear - 1; + } - const hall = $el.find(".schedule_hall, .hall, .venue-name").first().text().trim() || null; - const timeText = $el.find(".schedule_time, .time").first().text(); - const openMatch = timeText.match(/OPEN[:: ]*(\d{2}:\d{2})/i); - const startMatch = timeText.match(/START[:: ]*(\d{2}:\d{2})/i); + $("div.p-scheduled-card").each((_, el) => { + const $el = $(el); - const detailHref = $el.find("a[href]").first().attr("href") ?? null; + const dateRaw = $el.find("span.p-scheduled-card__date-item").first().text().trim(); + // "05 / 01" → month=5, day=1 + const dateMatch = dateRaw.match(/(\d{1,2})\s*\/\s*(\d{1,2})/); + if (!dateMatch) return; + const month = parseInt(dateMatch[1], 10); + const day = parseInt(dateMatch[2], 10); + if (!currentMonth || !month) return; + // Handle year rollover (December cards on January page, etc.) + const cardYear = month < currentMonth ? year + 1 : year; + const date = `${cardYear}-${String(month).padStart(2, "0")}-${String(day).padStart(2, "0")}`; - events.push({ - venue_id: venue.id, - title, - artist: hall, - date, - open_time: openMatch?.[1] ?? null, - start_time: startMatch?.[1] ?? null, - ticket_url: - $el.find("a[href*='eplus'], a[href*='lawson'], a[href*='ticket']").first().attr("href") ?? null, - image_url: $el.find("img").first().attr("src") - ? absoluteUrl($el.find("img").first().attr("src")!, venue.url) - : null, - source_url: detailHref ? absoluteUrl(detailHref, venue.url) : null, - }); - }); + const title = $el.find("span.p-scheduled-card__title-main").first().text().trim(); + if (!title) return; - return events; - }, -}; + const openText = $el.find("span.p-scheduled-card__date-open").first().text().trim(); + const openMatch = openText.match(/OPEN\s*(\d{2}:\d{2})/i); + const startMatch = openText.match(/START\s*(\d{2}:\d{2})/i); -function parseJapaneseDate(raw: string): string | null { - const m = - raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) || - raw.match(/(\d{1,2})[./月](\d{1,2})/); - if (!m) return null; - if (m.length === 4) { - return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`; - } - const year = new Date().getFullYear(); - return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`; -} + const artists = $el.find("li.p-scheduled-card__artist-item") + .map((_, s) => $(s).text().trim()).get().join("、") || null; + + const detailHref = $el.closest("a[href]").attr("href") ?? + $el.find("a[href]").first().attr("href") ?? null; + const sourceUrl = detailHref + ? (detailHref.startsWith("http") ? detailHref : `${BASE}${detailHref}`) + : null; -function absoluteUrl(url: string, base: string): string { - if (url.startsWith("http")) return url; - return url.startsWith("/") ? base + url : `${base}/${url}`; + const imageUrl = $el.find("figure img").first().attr("src") ?? null; + + events.push({ + venue_id: venue.id, + title, + artist: artists, + date, + open_time: openMatch?.[1] ?? null, + start_time: startMatch?.[1] ?? null, + image_url: imageUrl, + source_url: sourceUrl, + }); + }); + + return events; } + +export const scraper: Scraper = { + venue, + async scrape(): Promise<EventInput[]> { + const results = await Promise.allSettled(SUB_VENUES.map(scrapeVenue)); + const all: EventInput[] = []; + for (const r of results) { + if (r.status === "fulfilled") all.push(...r.value); + } + // Deduplicate by date + title + const seen = new Set<string>(); + return all.filter((e) => { + const key = `${e.date}|${e.title}`; + if (seen.has(key)) return false; + seen.add(key); + return true; + }); + }, +}; |
