summaryrefslogtreecommitdiff
path: root/app/scrapers/mod-shibasaki.ts
diff options
context:
space:
mode:
author: yyamashita <yyamashita@mosquit.one> 2026-05-07 19:27:50 +0900
committer: yyamashita <yyamashita@mosquit.one> 2026-05-07 19:27:50 +0900
commit: d5e975b601e70adf901c8e1eb7e61f0388941195 (patch)
tree: f1778ff15b6540b44c354cb76c44aac795448c4a /app/scrapers/mod-shibasaki.ts
parent: bffc2c74408ff7163cea0c0392dfc4b15c620a5f (diff)
Add 5 new venue scrapers; extract artist info for WARP, shibuya-o, MOON STEP, mod
New scrapers: Fever 下北沢, Nine Spices 下北沢, 西荻窪 JAM, mod 柴崎, 中野 MOON STEP

Artist extraction added/fixed:
- warp-kichijoji: parse div.w-flyer (clone + remove nested notes-wrapper)
- shibuya-o: rewrite to scrape each sub-venue; artist from li.p-scheduled-card__artist-item
- moon-step-nakano: parse 出演 section from WordPress API description HTML
- mod-shibasaki: fetch individual event pages in parallel; handle live:/出演:/・ bullet formats

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'app/scrapers/mod-shibasaki.ts')
-rw-r--r-- app/scrapers/mod-shibasaki.ts 124
1 files changed, 124 insertions, 0 deletions
diff --git a/app/scrapers/mod-shibasaki.ts b/app/scrapers/mod-shibasaki.ts
new file mode 100644
index 0000000..0e2a96b
--- /dev/null
+++ b/app/scrapers/mod-shibasaki.ts
@@ -0,0 +1,124 @@
+/**
+ * shibasaki mod (調布市柴崎) — https://shibasakimod.com/schedule
+ *
+ * Squarespace イベントリスト。静的 HTML として配信される。
+ * 一覧ページに出演者情報はないため、各イベントの詳細ページを並列取得する。
+ * DOM 構造 (一覧):
+ * <article class="eventlist-event eventlist-event--upcoming">
+ * <time class="event-date" datetime="YYYY-MM-DD">
+ * <h1 class="eventlist-title"><a class="eventlist-title-link" href="/schedule/YYYYMMDD">
+ * DOM 構造 (詳細):
+ * <div class="sqs-html-content"><p style="white-space:pre-wrap;">
+ * ...
+ * live:\nアーティスト名\n... (または 出演:)
+ */
+import * as cheerio from "cheerio";
+import type { Scraper, VenueMeta } from "./base";
+import type { EventInput } from "~/lib/db.server";
+
+/**
+ * Static metadata for the shibasaki mod venue.
+ * `id` is reused as `venue_id` on every event the scraper returns, and `url`
+ * is the site origin that relative event links are resolved against.
+ */
+export const venue: VenueMeta = {
+ id: "mod-shibasaki",
+ name: "shibasaki mod",
+ url: "https://shibasakimod.com",
+ area: "柴崎",
+};
+
+/** Event list page that `scrape()` downloads to discover upcoming events. */
+const SCHEDULE_URL = "https://shibasakimod.com/schedule";
+
+/**
+ * Fields read from one entry of the schedule list page, collected before the
+ * per-event detail page is fetched for artist information.
+ */
+interface EventStub {
+ /** Event date taken from the `datetime` attribute; always `YYYY-MM-DD`. */
+ date: string;
+ /** Trimmed text of the event title link; never empty. */
+ title: string;
+ /** Trimmed text of the localized start-time element, or null when absent/empty. */
+ startTime: string | null;
+ /** Absolute URL of the event detail page, or null when the entry has no link. */
+ sourceUrl: string | null;
+ /** Image URL from `data-src` (preferred) or `src`, or null when there is no image. */
+ imageUrl: string | null;
+}
+
+/**
+ * Matches the label that introduces the lineup ("live:", "出演:" or "act:", with
+ * an ASCII or full-width colon) and captures everything after it. The lookbehind
+ * stops the label from matching inside a longer Latin word such as "contact:"
+ * or "olive:".
+ */
+const LINEUP_LABEL = /(?<![a-z])(?:live|出演|act)[::]\s*([\s\S]+)/i;
+
+/** A line containing a clock time, a price mark or ticket/reservation wording ends the lineup. */
+const LINEUP_END = /\d{1,2}:\d{2}|[¥¥]|yen|ticket|チケット|予約|adv|door/i;
+
+/** Leading list bullet (katakana middle dot or round bullet) plus following spaces. */
+const BULLET = /^[・•]\s*/;
+
+/**
+ * Collects artist names from the text that follows a lineup label or bullet.
+ *
+ * Each line is trimmed, stripped of `@handle` tokens and of a leading bullet.
+ * Blank lines and bare URLs are skipped; the first line that looks like
+ * time/price/ticket information stops the scan.
+ *
+ * @param section Text starting at the first candidate artist line.
+ * @returns Artist names in page order (possibly empty).
+ */
+function extractArtists(section: string): string[] {
+  const artists: string[] = [];
+  for (const raw of section.split("\n")) {
+    // Trim before stripping the bullet so indented bullet lines are cleaned too.
+    const line = raw.replace(/\s*@\S+/g, "").trim().replace(BULLET, "").trim();
+    if (!line) continue;
+    if (/^https?:\/\//i.test(line)) continue; // social links placed between artists
+    if (LINEUP_END.test(line)) break;
+    artists.push(line);
+  }
+  return artists;
+}
+
+/**
+ * Extracts the lineup from the plain text of an event detail page.
+ *
+ * Tries the labelled form first ("live:" / "出演:" / "act:"); if that yields
+ * nothing, falls back to scanning from the first bulleted line.
+ *
+ * @param text Newline-separated text of the event description.
+ * @returns Artist names joined with " / ", or null when none were found.
+ */
+function parseLineup(text: string): string | null {
+  const labelled = LINEUP_LABEL.exec(text);
+  if (labelled) {
+    const artists = extractArtists(labelled[1] ?? "");
+    if (artists.length > 0) return artists.join(" / ");
+  }
+
+  // Fallback: pages that list performers as bullets without any label.
+  const lines = text.split("\n");
+  const firstBullet = lines.findIndex((l) => BULLET.test(l.trim()));
+  if (firstBullet >= 0) {
+    const artists = extractArtists(lines.slice(firstBullet).join("\n"));
+    if (artists.length > 0) return artists.join(" / ");
+  }
+  return null;
+}
+
+/**
+ * Downloads one event detail page and extracts its lineup.
+ *
+ * This is best-effort enrichment: a non-OK response, a network error or a
+ * parsing failure all yield null instead of throwing, so a single bad page
+ * cannot fail the whole scrape.
+ *
+ * @param url Absolute URL of the event detail page.
+ * @returns Artist names joined with " / ", or null when unavailable.
+ */
+async function fetchArtist(url: string): Promise<string | null> {
+  try {
+    const res = await fetch(url);
+    if (!res.ok) return null;
+    const $ = cheerio.load(await res.text());
+    // Insert newlines at block boundaries so .text() keeps one entry per line.
+    $(".sqs-html-content br").replaceWith("\n");
+    $(".sqs-html-content p").each((_, el) => {
+      $(el).append("\n");
+    });
+    return parseLineup($(".sqs-html-content").text());
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Resolves an href from the schedule page against the venue origin.
+ *
+ * Uses the URL constructor so root-relative ("/schedule/x"), path-relative
+ * ("schedule/x"), protocol-relative ("//host/x") and absolute hrefs all
+ * resolve correctly.
+ *
+ * @param href Raw `href` attribute value, if any.
+ * @returns Absolute URL, or null when the href is missing, empty or unparseable.
+ */
+function toAbsoluteUrl(href: string | undefined): string | null {
+  if (!href) return null;
+  try {
+    return new URL(href, venue.url).href;
+  } catch {
+    return null;
+  }
+}
+
+/** Scraper for shibasaki mod: reads the schedule list, then each event page for artists. */
+export const scraper: Scraper = {
+  venue,
+  /**
+   * Scrapes upcoming events from the schedule page.
+   *
+   * Entries without a `YYYY-MM-DD` date or without a title are skipped.
+   * Detail pages are fetched in parallel to fill in `artist`; a failed detail
+   * fetch leaves `artist` null. `open_time` and `price` are always null.
+   *
+   * @returns One event per valid upcoming entry, in page order.
+   * @throws Error with message `HTTP <status>` when the schedule page response is not OK.
+   */
+  async scrape(): Promise<EventInput[]> {
+    const res = await fetch(SCHEDULE_URL);
+    if (!res.ok) throw new Error(`HTTP ${res.status}`);
+    const $ = cheerio.load(await res.text());
+    const stubs: EventStub[] = [];
+
+    $("article.eventlist-event--upcoming").each((_, el) => {
+      const $el = $(el);
+
+      // Only accept a machine-readable date; anything else cannot be stored reliably.
+      const date = $el.find("time.event-date").first().attr("datetime") ?? "";
+      if (!/^\d{4}-\d{2}-\d{2}$/.test(date)) return;
+
+      const title = $el.find("h1.eventlist-title a, h2.eventlist-title a").first().text().trim();
+      if (!title) return;
+
+      const startTime = $el.find("time.event-time-localized-start").first().text().trim() || null;
+
+      const sourceUrl = toAbsoluteUrl($el.find("a.eventlist-title-link").first().attr("href"));
+
+      // Prefer data-src, falling back to src when the image has no data-src attribute.
+      const imageUrl =
+        $el.find("img[data-src]").first().attr("data-src") ??
+        $el.find("img[src]").first().attr("src") ??
+        null;
+
+      stubs.push({ date, title, startTime, sourceUrl, imageUrl });
+    });
+
+    // The list page has no performer info, so fetch every detail page in parallel.
+    // fetchArtist never rejects, so Promise.all cannot fail on a single bad page.
+    const artists = await Promise.all(
+      stubs.map((s) => (s.sourceUrl ? fetchArtist(s.sourceUrl) : Promise.resolve(null)))
+    );
+
+    return stubs.map((s, i) => ({
+      venue_id: venue.id,
+      title: s.title,
+      date: s.date,
+      open_time: null,
+      start_time: s.startTime,
+      price: null,
+      image_url: s.imageUrl,
+      source_url: s.sourceUrl,
+      artist: artists[i] ?? null,
+    }));
+  },
+};