summary | refs | log | tree | commit | diff
path: root/app/scrapers
diff options
context:
space:
mode:
author yyamashita <yyamashita@mosquit.one> 2026-05-07 22:06:24 +0900
committer yyamashita <yyamashita@mosquit.one> 2026-05-07 22:06:24 +0900
commit b4823d4c124160dcba8e5fed1e424cf2cc12e72c (patch)
tree 573de8cf82704320b3f308eaa0112e590e5f0aa5 /app/scrapers
parent b8537eabe94b24e8530b4c1511456dc94cf8ec4c (diff)
Fix scrapers returning 0 events for 4 venues
Rewrote selectors to match actual HTML structure after inspecting each site:

- LIQUID ROOM: article selector, date extracted from URL (_YYYYMMDD suffix)
- WWW/WWW X: article.column selector, month from li.month nav, day rollover detection
- 新宿 LOFT: section.block_schedule_list links, full date from time div.year/month/day
- CLUB QUATTRO: li[data-event-date] with ISO date attribute, jp-label time parsing

Result: 0件 → LIQUID ROOM 25件 / WWW 48件 / LOFT 13件 / QUATTRO 24件

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'app/scrapers')
-rw-r--r-- app/scrapers/club-quattro.ts 59
-rw-r--r-- app/scrapers/liquid-room.ts 101
-rw-r--r-- app/scrapers/shinjuku-loft.ts 86
-rw-r--r-- app/scrapers/www-shibuya.ts 70
4 files changed, 132 insertions, 184 deletions
diff --git a/app/scrapers/club-quattro.ts b/app/scrapers/club-quattro.ts
index ae903bc..946b9a4 100644
--- a/app/scrapers/club-quattro.ts
+++ b/app/scrapers/club-quattro.ts
@@ -1,6 +1,3 @@
-/**
- * Club Quattro 渋谷 — https://www.club-quattro.com/shibuya/schedule/
- */
import * as cheerio from "cheerio";
import type { Scraper, VenueMeta } from "./base";
import type { EventInput } from "~/lib/db.server";
@@ -21,38 +18,40 @@ export const scraper: Scraper = {
const $ = cheerio.load(html);
const events: EventInput[] = [];
- $(".schedule-list__item, .c-event, li.event").each((_, el) => {
+ $("li[data-event-date]").each((_, el) => {
const $el = $(el);
- const title = $el.find(".schedule-list__title, .event-name, h3, h2").first().text().trim();
+ const date = $el.attr("data-event-date") ?? "";
+ if (!date) return;
+
+ const title = $el.find("p.txt-02").text().trim();
if (!title) return;
- const rawDate =
- $el.find(".schedule-list__date, .event-date, time").first().text().trim() ||
- $el.find("time").attr("datetime") ||
- "";
- const date = parseJapaneseDate(rawDate);
- if (!date) return;
+ const artist = $el.find("p.txt-01 span").text().trim() || null;
- const timeText = $el.find(".schedule-list__time, .time-info").first().text();
- const openMatch = timeText.match(/OPEN[:: ]*(\d{2}:\d{2})/i);
- const startMatch = timeText.match(/START[:: ]*(\d{2}:\d{2})/i);
+ let openTime: string | null = null;
+ let startTime: string | null = null;
+ $el.find("dl.detail-list .bundle").each((_, bundle) => {
+ const label = $(bundle).find("dt").text().trim();
+ if (label.includes("開場") || label.includes("開演")) {
+ const times = $(bundle).find("dd").text().trim().match(/\d{2}:\d{2}/g) ?? [];
+ openTime = times[0] ?? null;
+ startTime = times[1] ?? null;
+ }
+ });
- const detailHref = $el.find("a[href]").first().attr("href") ?? null;
+ const href = $el.find("a").first().attr("href") ?? null;
+ const imageSrc = $el.find(".front img").attr("src") ?? null;
events.push({
venue_id: venue.id,
title,
- artist: $el.find(".schedule-list__artist, .artist-name").first().text().trim() || null,
+ artist,
date,
- open_time: openMatch?.[1] ?? null,
- start_time: startMatch?.[1] ?? null,
- ticket_url:
- $el.find("a[href*='eplus'], a[href*='pia'], a[href*='ticket']").first().attr("href") ?? null,
- image_url: $el.find("img").first().attr("src")
- ? absoluteUrl($el.find("img").first().attr("src")!, venue.url)
- : null,
- source_url: detailHref ? absoluteUrl(detailHref, venue.url) : null,
+ open_time: openTime,
+ start_time: startTime,
+ image_url: imageSrc ? absoluteUrl(imageSrc, venue.url) : null,
+ source_url: href ? absoluteUrl(href, venue.url) : null,
});
});
@@ -60,18 +59,6 @@ export const scraper: Scraper = {
},
};
-function parseJapaneseDate(raw: string): string | null {
- const m =
- raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) ||
- raw.match(/(\d{1,2})[./月](\d{1,2})/);
- if (!m) return null;
- if (m.length === 4) {
- return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`;
- }
- const year = new Date().getFullYear();
- return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`;
-}
-
function absoluteUrl(url: string, base: string): string {
if (url.startsWith("http")) return url;
return url.startsWith("/") ? base + url : `${base}/${url}`;
diff --git a/app/scrapers/liquid-room.ts b/app/scrapers/liquid-room.ts
index b497759..f577ee6 100644
--- a/app/scrapers/liquid-room.ts
+++ b/app/scrapers/liquid-room.ts
@@ -1,9 +1,3 @@
-/**
- * Liquid Room (恵比寿) — https://www.liquidroom.net/schedule
- *
- * The schedule page lists events with JSON-LD or HTML data.
- * Structure: <div class="p-schedule__item"> contains date, title, etc.
- */
import * as cheerio from "cheerio";
import type { Scraper, VenueMeta } from "./base";
import type { EventInput } from "~/lib/db.server";
@@ -24,64 +18,53 @@ export const scraper: Scraper = {
const $ = cheerio.load(html);
const events: EventInput[] = [];
- $("article.p-schedule__item, .schedule-list__item, .c-event-item").each(
- (_, el) => {
- const $el = $(el);
+ $("article").each((_, el) => {
+ const $el = $(el);
- const title =
- $el.find(".p-schedule__title, .event-title, h3, h2").first().text().trim();
- if (!title) return;
+ const href = $el.find("a.s_link").attr("href") ?? "";
+ // Date is encoded in the URL: e.g. /schedule/eventname_20260501
+ const dateMatch = href.match(/_(\d{4})(\d{2})(\d{2})$/);
+ if (!dateMatch) return;
+ const date = `${dateMatch[1]}-${dateMatch[2]}-${dateMatch[3]}`;
- const dateStr =
- $el.find(".p-schedule__date, .event-date, time").first().text().trim() ||
- $el.find("time").attr("datetime") ||
- "";
- const date = parseJapaneseDate(dateStr);
- if (!date) return;
+ const h2 = $el.find("h2").first().text().trim();
+ if (!h2) return;
- const artist =
- $el.find(".p-schedule__artist, .artist").first().text().trim() || null;
- const startTime =
- $el.find(".p-schedule__time, .open-time").first().text().trim().match(/\d{2}:\d{2}/)?.[0] ?? null;
- const ticketUrl =
- $el.find("a[href*='ticket'], a[href*='eplus'], a[href*='pia']").first().attr("href") ?? null;
- const imageUrl =
- $el.find("img").first().attr("src") ?? null;
- const sourceUrl =
- $el.find("a").first().attr("href") ?? null;
+ const subtitle = $el.find("p.subtitle").first().text().trim();
+ // h2 is the artist/band name; subtitle (if present) is the event title
+ const title = subtitle || h2;
+ const artist = subtitle ? h2 : null;
- events.push({
- venue_id: venue.id,
- title,
- artist,
- date,
- start_time: startTime,
- ticket_url: ticketUrl,
- image_url: imageUrl ? absoluteUrl(imageUrl, venue.url) : null,
- source_url: sourceUrl ? absoluteUrl(sourceUrl, venue.url) : null,
- });
- }
- );
+ const openTime =
+ $el
+ .find("dl")
+ .filter((_, dl) => $(dl).find("dt").text().includes("OPEN"))
+ .find("dd")
+ .text()
+ .trim()
+ .match(/\d{2}:\d{2}/)?.[0] ?? null;
+
+ const startTime =
+ $el
+ .find("dl")
+ .filter((_, dl) => $(dl).find("dt").text().includes("START"))
+ .find("dd")
+ .text()
+ .trim()
+ .match(/\d{2}:\d{2}/)?.[0] ?? null;
+
+ events.push({
+ venue_id: venue.id,
+ title,
+ artist,
+ date,
+ open_time: openTime,
+ start_time: startTime,
+ image_url: $el.find("div.left img").attr("src") ?? null,
+ source_url: href,
+ });
+ });
return events;
},
};
-
-function parseJapaneseDate(raw: string): string | null {
- // Handles "2025.06.15" "2025/06/15" "2025年06月15日" "06.15" formats
- const m =
- raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) ||
- raw.match(/(\d{1,2})[./月](\d{1,2})/);
- if (!m) return null;
- if (m.length === 4) {
- return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`;
- }
- const year = new Date().getFullYear();
- return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`;
-}
-
-function absoluteUrl(url: string, base: string): string {
- if (url.startsWith("http")) return url;
- if (url.startsWith("/")) return base + url;
- return base + "/" + url;
-}
diff --git a/app/scrapers/shinjuku-loft.ts b/app/scrapers/shinjuku-loft.ts
index 8a64761..d5602e7 100644
--- a/app/scrapers/shinjuku-loft.ts
+++ b/app/scrapers/shinjuku-loft.ts
@@ -1,8 +1,3 @@
-/**
- * 新宿 LOFT — https://www.loft-prj.co.jp/schedule/loft
- *
- * The schedule page renders events inside `.eventlist` items.
- */
import * as cheerio from "cheerio";
import type { Scraper, VenueMeta } from "./base";
import type { EventInput } from "~/lib/db.server";
@@ -22,59 +17,48 @@ export const scraper: Scraper = {
const html = await res.text();
const $ = cheerio.load(html);
const events: EventInput[] = [];
+ const seen = new Set<string>();
- $(".eventlist__item, .schedule-item, .event_list li").each((_, el) => {
- const $el = $(el);
+ $("section.block_schedule_list a[href*='/schedule/loft/schedule/']").each(
+ (_, el) => {
+ const $el = $(el);
+ const href = $el.attr("href") ?? "";
+ if (seen.has(href)) return;
+ seen.add(href);
- const title = $el.find(".eventlist__title, .event-title, h3, h2").first().text().trim();
- if (!title) return;
+ const year = $el.find("time div.year").text().trim();
+ const month = $el.find("time div.month").text().trim();
+ const day = $el.find("time div.day").text().trim();
+ if (!year || !month || !day) return;
+ const date = `${year}-${month.padStart(2, "0")}-${day.padStart(2, "0")}`;
- const rawDate =
- $el.find(".eventlist__date, .event-date, time").first().text().trim() ||
- $el.find("time").attr("datetime") ||
- "";
- const date = parseJapaneseDate(rawDate);
- if (!date) return;
+ const title = $el.find(".c_title span").text().trim();
+ if (!title) return;
- const timeText = $el.find(".eventlist__time, .time").first().text();
- const openMatch = timeText.match(/OPEN[:: ]*(\d{2}:\d{2})/i);
- const startMatch = timeText.match(/START[:: ]*(\d{2}:\d{2})/i);
+ const timeText = $el.find(".open").text().trim();
+ const openMatch = timeText.match(/OPEN\s*(\d{2}:\d{2})/i);
+ const startMatch = timeText.match(/START\s*(\d{2}:\d{2})/i);
- const detailHref = $el.find("a[href]").first().attr("href") ?? null;
+ const artists = $el
+ .find("ul.artist_tag li")
+ .map((_, li) => $(li).text().trim())
+ .get()
+ .filter((a) => a !== "...");
+ const artist = artists.join(" / ") || null;
- events.push({
- venue_id: venue.id,
- title,
- artist: $el.find(".eventlist__artist, .artist").first().text().trim() || null,
- date,
- open_time: openMatch?.[1] ?? null,
- start_time: startMatch?.[1] ?? null,
- ticket_url:
- $el.find("a[href*='eplus'], a[href*='pia'], a[href*='ticket']").first().attr("href") ?? null,
- image_url: $el.find("img").first().attr("src")
- ? absoluteUrl($el.find("img").first().attr("src")!, venue.url)
- : null,
- source_url: detailHref ? absoluteUrl(detailHref, venue.url) : null,
- });
- });
+ events.push({
+ venue_id: venue.id,
+ title,
+ artist,
+ date,
+ open_time: openMatch?.[1] ?? null,
+ start_time: startMatch?.[1] ?? null,
+ image_url: $el.find("span.bg").attr("data-bg") ?? null,
+ source_url: href,
+ });
+ }
+ );
return events;
},
};
-
-function parseJapaneseDate(raw: string): string | null {
- const m =
- raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) ||
- raw.match(/(\d{1,2})[./月](\d{1,2})/);
- if (!m) return null;
- if (m.length === 4) {
- return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`;
- }
- const year = new Date().getFullYear();
- return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`;
-}
-
-function absoluteUrl(url: string, base: string): string {
- if (url.startsWith("http")) return url;
- return url.startsWith("/") ? base + url : `${base}/${url}`;
-}
diff --git a/app/scrapers/www-shibuya.ts b/app/scrapers/www-shibuya.ts
index 905fc61..d561332 100644
--- a/app/scrapers/www-shibuya.ts
+++ b/app/scrapers/www-shibuya.ts
@@ -1,6 +1,3 @@
-/**
- * WWW / WWW X (渋谷) — https://www-shibuya.jp/schedule/
- */
import * as cheerio from "cheerio";
import type { Scraper, VenueMeta } from "./base";
import type { EventInput } from "~/lib/db.server";
@@ -21,39 +18,48 @@ export const scraper: Scraper = {
const $ = cheerio.load(html);
const events: EventInput[] = [];
- $(".schedule-list li, .p-schedule-item, article").each((_, el) => {
+ // Month from nav: "202605May" → year=2026, month=5
+ const monthText = $("li.month").first().text().trim();
+ const monthMatch = monthText.match(/(\d{4})(\d{2})/);
+ let year = monthMatch ? parseInt(monthMatch[1]) : new Date().getFullYear();
+ let month = monthMatch ? parseInt(monthMatch[2]) : new Date().getMonth() + 1;
+ let prevDay = 0;
+
+ $("article.column").each((_, el) => {
const $el = $(el);
- const title = $el.find(".schedule-title, .title, h3, h2").first().text().trim();
- if (!title) return;
+ const day = parseInt($el.find(".date .day").text().trim(), 10);
+ if (!day) return;
- const rawDate =
- $el.find(".schedule-date, .date, time").first().text().trim() ||
- $el.find("time").attr("datetime") ||
- "";
- const date = parseJapaneseDate(rawDate);
- if (!date) return;
+ // Detect month rollover when day numbers reset
+ if (prevDay > 0 && day < prevDay) {
+ month++;
+ if (month > 12) {
+ month = 1;
+ year++;
+ }
+ }
+ prevDay = day;
- const timeText = $el.find(".schedule-time, .time").first().text();
- const openMatch = timeText.match(/OPEN\s*(\d{2}:\d{2})/i);
- const startMatch = timeText.match(/START\s*(\d{2}:\d{2})/i);
+ const date = `${year}-${String(month).padStart(2, "0")}-${String(day).padStart(2, "0")}`;
+
+ const title = $el.find("h3").text().trim();
+ if (!title) return;
- const detailHref = $el.find("a").first().attr("href") ?? null;
+ const timeText = $el.find(".openstart").text();
+ const times = timeText.match(/\d{2}:\d{2}/g) ?? [];
+
+ const href = $el.find("a").first().attr("href") ?? null;
events.push({
venue_id: venue.id,
title,
- artist: $el.find(".artist").first().text().trim() || null,
+ artist: null,
date,
- open_time: openMatch?.[1] ?? null,
- start_time: startMatch?.[1] ?? null,
- ticket_url:
- $el.find("a[href*='eplus'], a[href*='pia'], a[href*='ticket']").first().attr("href") ?? null,
- image_url:
- $el.find("img").first().attr("src")
- ? absoluteUrl($el.find("img").first().attr("src")!, venue.url)
- : null,
- source_url: detailHref ? absoluteUrl(detailHref, venue.url) : null,
+ open_time: times[0] ?? null,
+ start_time: times[1] ?? null,
+ image_url: null,
+ source_url: href ? absoluteUrl(href, venue.url) : null,
});
});
@@ -61,18 +67,6 @@ export const scraper: Scraper = {
},
};
-function parseJapaneseDate(raw: string): string | null {
- const m =
- raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) ||
- raw.match(/(\d{1,2})[./月](\d{1,2})/);
- if (!m) return null;
- if (m.length === 4) {
- return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`;
- }
- const year = new Date().getFullYear();
- return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`;
-}
-
function absoluteUrl(url: string, base: string): string {
if (url.startsWith("http")) return url;
return url.startsWith("/") ? base + url : `${base}/${url}`;