From b4823d4c124160dcba8e5fed1e424cf2cc12e72c Mon Sep 17 00:00:00 2001 From: yyamashita Date: Thu, 7 May 2026 22:06:24 +0900 Subject: Fix scrapers returning 0 events for 4 venues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrote selectors to match actual HTML structure after inspecting each site: - LIQUID ROOM: article selector, date extracted from URL (_YYYYMMDD suffix) - WWW/WWW X: article.column selector, month from li.month nav, day rollover detection - 新宿 LOFT: section.block_schedule_list links, full date from time div.year/month/day - CLUB QUATTRO: li[data-event-date] with ISO date attribute, jp-label time parsing Result: 0件 → LIQUID ROOM 25件 / WWW 48件 / LOFT 13件 / QUATTRO 24件 Co-Authored-By: Claude Sonnet 4.6 --- app/scrapers/club-quattro.ts | 59 ++++++++++-------------- app/scrapers/liquid-room.ts | 101 ++++++++++++++++++------------------------ app/scrapers/shinjuku-loft.ts | 86 +++++++++++++++-------------------- app/scrapers/www-shibuya.ts | 70 +++++++++++++---------------- 4 files changed, 132 insertions(+), 184 deletions(-) (limited to 'app/scrapers') diff --git a/app/scrapers/club-quattro.ts b/app/scrapers/club-quattro.ts index ae903bc..946b9a4 100644 --- a/app/scrapers/club-quattro.ts +++ b/app/scrapers/club-quattro.ts @@ -1,6 +1,3 @@ -/** - * Club Quattro 渋谷 — https://www.club-quattro.com/shibuya/schedule/ - */ import * as cheerio from "cheerio"; import type { Scraper, VenueMeta } from "./base"; import type { EventInput } from "~/lib/db.server"; @@ -21,38 +18,40 @@ export const scraper: Scraper = { const $ = cheerio.load(html); const events: EventInput[] = []; - $(".schedule-list__item, .c-event, li.event").each((_, el) => { + $("li[data-event-date]").each((_, el) => { const $el = $(el); - const title = $el.find(".schedule-list__title, .event-name, h3, h2").first().text().trim(); + const date = $el.attr("data-event-date") ?? ""; + if (!date) return; + + const title = $el.find("p.txt-02").text().trim(); if (!title) return; - const rawDate = - $el.find(".schedule-list__date, .event-date, time").first().text().trim() || - $el.find("time").attr("datetime") || - ""; - const date = parseJapaneseDate(rawDate); - if (!date) return; + const artist = $el.find("p.txt-01 span").text().trim() || null; - const timeText = $el.find(".schedule-list__time, .time-info").first().text(); - const openMatch = timeText.match(/OPEN[:: ]*(\d{2}:\d{2})/i); - const startMatch = timeText.match(/START[:: ]*(\d{2}:\d{2})/i); + let openTime: string | null = null; + let startTime: string | null = null; + $el.find("dl.detail-list .bundle").each((_, bundle) => { + const label = $(bundle).find("dt").text().trim(); + if (label.includes("開場") || label.includes("開演")) { + const times = $(bundle).find("dd").text().trim().match(/\d{2}:\d{2}/g) ?? []; + openTime = times[0] ?? null; + startTime = times[1] ?? null; + } + }); - const detailHref = $el.find("a[href]").first().attr("href") ?? null; + const href = $el.find("a").first().attr("href") ?? null; + const imageSrc = $el.find(".front img").attr("src") ?? null; events.push({ venue_id: venue.id, title, - artist: $el.find(".schedule-list__artist, .artist-name").first().text().trim() || null, + artist, date, - open_time: openMatch?.[1] ?? null, - start_time: startMatch?.[1] ?? null, - ticket_url: - $el.find("a[href*='eplus'], a[href*='pia'], a[href*='ticket']").first().attr("href") ?? null, - image_url: $el.find("img").first().attr("src") - ? absoluteUrl($el.find("img").first().attr("src")!, venue.url) - : null, - source_url: detailHref ? absoluteUrl(detailHref, venue.url) : null, + open_time: openTime, + start_time: startTime, + image_url: imageSrc ? absoluteUrl(imageSrc, venue.url) : null, + source_url: href ? absoluteUrl(href, venue.url) : null, }); }); @@ -60,18 +59,6 @@ export const scraper: Scraper = { }, }; -function parseJapaneseDate(raw: string): string | null { - const m = - raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) || - raw.match(/(\d{1,2})[./月](\d{1,2})/); - if (!m) return null; - if (m.length === 4) { - return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`; - } - const year = new Date().getFullYear(); - return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`; -} - function absoluteUrl(url: string, base: string): string { if (url.startsWith("http")) return url; return url.startsWith("/") ? base + url : `${base}/${url}`; diff --git a/app/scrapers/liquid-room.ts b/app/scrapers/liquid-room.ts index b497759..f577ee6 100644 --- a/app/scrapers/liquid-room.ts +++ b/app/scrapers/liquid-room.ts @@ -1,9 +1,3 @@ -/** - * Liquid Room (恵比寿) — https://www.liquidroom.net/schedule - * - * The schedule page lists events with JSON-LD or HTML data. - * Structure:
contains date, title, etc. - */ import * as cheerio from "cheerio"; import type { Scraper, VenueMeta } from "./base"; import type { EventInput } from "~/lib/db.server"; @@ -24,64 +18,53 @@ export const scraper: Scraper = { const $ = cheerio.load(html); const events: EventInput[] = []; - $("article.p-schedule__item, .schedule-list__item, .c-event-item").each( - (_, el) => { - const $el = $(el); + $("article").each((_, el) => { + const $el = $(el); - const title = - $el.find(".p-schedule__title, .event-title, h3, h2").first().text().trim(); - if (!title) return; + const href = $el.find("a.s_link").attr("href") ?? ""; + // Date is encoded in the URL: e.g. /schedule/eventname_20260501 + const dateMatch = href.match(/_(\d{4})(\d{2})(\d{2})$/); + if (!dateMatch) return; + const date = `${dateMatch[1]}-${dateMatch[2]}-${dateMatch[3]}`; - const dateStr = - $el.find(".p-schedule__date, .event-date, time").first().text().trim() || - $el.find("time").attr("datetime") || - ""; - const date = parseJapaneseDate(dateStr); - if (!date) return; + const h2 = $el.find("h2").first().text().trim(); + if (!h2) return; - const artist = - $el.find(".p-schedule__artist, .artist").first().text().trim() || null; - const startTime = - $el.find(".p-schedule__time, .open-time").first().text().trim().match(/\d{2}:\d{2}/)?.[0] ?? null; - const ticketUrl = - $el.find("a[href*='ticket'], a[href*='eplus'], a[href*='pia']").first().attr("href") ?? null; - const imageUrl = - $el.find("img").first().attr("src") ?? null; - const sourceUrl = - $el.find("a").first().attr("href") ?? null; + const subtitle = $el.find("p.subtitle").first().text().trim(); + // h2 is the artist/band name; subtitle (if present) is the event title + const title = subtitle || h2; + const artist = subtitle ? h2 : null; - events.push({ - venue_id: venue.id, - title, - artist, - date, - start_time: startTime, - ticket_url: ticketUrl, - image_url: imageUrl ? absoluteUrl(imageUrl, venue.url) : null, - source_url: sourceUrl ? absoluteUrl(sourceUrl, venue.url) : null, - }); - } - ); + const openTime = + $el + .find("dl") + .filter((_, dl) => $(dl).find("dt").text().includes("OPEN")) + .find("dd") + .text() + .trim() + .match(/\d{2}:\d{2}/)?.[0] ?? null; + + const startTime = + $el + .find("dl") + .filter((_, dl) => $(dl).find("dt").text().includes("START")) + .find("dd") + .text() + .trim() + .match(/\d{2}:\d{2}/)?.[0] ?? null; + + events.push({ + venue_id: venue.id, + title, + artist, + date, + open_time: openTime, + start_time: startTime, + image_url: $el.find("div.left img").attr("src") ?? null, + source_url: href, + }); + }); return events; }, }; - -function parseJapaneseDate(raw: string): string | null { - // Handles "2025.06.15" "2025/06/15" "2025年06月15日" "06.15" formats - const m = - raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) || - raw.match(/(\d{1,2})[./月](\d{1,2})/); - if (!m) return null; - if (m.length === 4) { - return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`; - } - const year = new Date().getFullYear(); - return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`; -} - -function absoluteUrl(url: string, base: string): string { - if (url.startsWith("http")) return url; - if (url.startsWith("/")) return base + url; - return base + "/" + url; -} diff --git a/app/scrapers/shinjuku-loft.ts b/app/scrapers/shinjuku-loft.ts index 8a64761..d5602e7 100644 --- a/app/scrapers/shinjuku-loft.ts +++ b/app/scrapers/shinjuku-loft.ts @@ -1,8 +1,3 @@ -/** - * 新宿 LOFT — https://www.loft-prj.co.jp/schedule/loft - * - * The schedule page renders events inside `.eventlist` items. - */ import * as cheerio from "cheerio"; import type { Scraper, VenueMeta } from "./base"; import type { EventInput } from "~/lib/db.server"; @@ -22,59 +17,48 @@ export const scraper: Scraper = { const html = await res.text(); const $ = cheerio.load(html); const events: EventInput[] = []; + const seen = new Set(); - $(".eventlist__item, .schedule-item, .event_list li").each((_, el) => { - const $el = $(el); + $("section.block_schedule_list a[href*='/schedule/loft/schedule/']").each( + (_, el) => { + const $el = $(el); + const href = $el.attr("href") ?? ""; + if (seen.has(href)) return; + seen.add(href); - const title = $el.find(".eventlist__title, .event-title, h3, h2").first().text().trim(); - if (!title) return; + const year = $el.find("time div.year").text().trim(); + const month = $el.find("time div.month").text().trim(); + const day = $el.find("time div.day").text().trim(); + if (!year || !month || !day) return; + const date = `${year}-${month.padStart(2, "0")}-${day.padStart(2, "0")}`; - const rawDate = - $el.find(".eventlist__date, .event-date, time").first().text().trim() || - $el.find("time").attr("datetime") || - ""; - const date = parseJapaneseDate(rawDate); - if (!date) return; + const title = $el.find(".c_title span").text().trim(); + if (!title) return; - const timeText = $el.find(".eventlist__time, .time").first().text(); - const openMatch = timeText.match(/OPEN[:: ]*(\d{2}:\d{2})/i); - const startMatch = timeText.match(/START[:: ]*(\d{2}:\d{2})/i); + const timeText = $el.find(".open").text().trim(); + const openMatch = timeText.match(/OPEN\s*(\d{2}:\d{2})/i); + const startMatch = timeText.match(/START\s*(\d{2}:\d{2})/i); - const detailHref = $el.find("a[href]").first().attr("href") ?? null; + const artists = $el + .find("ul.artist_tag li") + .map((_, li) => $(li).text().trim()) + .get() + .filter((a) => a !== "..."); + const artist = artists.join(" / ") || null; - events.push({ - venue_id: venue.id, - title, - artist: $el.find(".eventlist__artist, .artist").first().text().trim() || null, - date, - open_time: openMatch?.[1] ?? null, - start_time: startMatch?.[1] ?? null, - ticket_url: - $el.find("a[href*='eplus'], a[href*='pia'], a[href*='ticket']").first().attr("href") ?? null, - image_url: $el.find("img").first().attr("src") - ? absoluteUrl($el.find("img").first().attr("src")!, venue.url) - : null, - source_url: detailHref ? absoluteUrl(detailHref, venue.url) : null, - }); - }); + events.push({ + venue_id: venue.id, + title, + artist, + date, + open_time: openMatch?.[1] ?? null, + start_time: startMatch?.[1] ?? null, + image_url: $el.find("span.bg").attr("data-bg") ?? null, + source_url: href, + }); + } + ); return events; }, }; - -function parseJapaneseDate(raw: string): string | null { - const m = - raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) || - raw.match(/(\d{1,2})[./月](\d{1,2})/); - if (!m) return null; - if (m.length === 4) { - return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`; - } - const year = new Date().getFullYear(); - return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`; -} - -function absoluteUrl(url: string, base: string): string { - if (url.startsWith("http")) return url; - return url.startsWith("/") ? base + url : `${base}/${url}`; -} diff --git a/app/scrapers/www-shibuya.ts b/app/scrapers/www-shibuya.ts index 905fc61..d561332 100644 --- a/app/scrapers/www-shibuya.ts +++ b/app/scrapers/www-shibuya.ts @@ -1,6 +1,3 @@ -/** - * WWW / WWW X (渋谷) — https://www-shibuya.jp/schedule/ - */ import * as cheerio from "cheerio"; import type { Scraper, VenueMeta } from "./base"; import type { EventInput } from "~/lib/db.server"; @@ -21,39 +18,48 @@ export const scraper: Scraper = { const $ = cheerio.load(html); const events: EventInput[] = []; - $(".schedule-list li, .p-schedule-item, article").each((_, el) => { + // Month from nav: "202605May" → year=2026, month=5 + const monthText = $("li.month").first().text().trim(); + const monthMatch = monthText.match(/(\d{4})(\d{2})/); + let year = monthMatch ? parseInt(monthMatch[1]) : new Date().getFullYear(); + let month = monthMatch ? parseInt(monthMatch[2]) : new Date().getMonth() + 1; + let prevDay = 0; + + $("article.column").each((_, el) => { const $el = $(el); - const title = $el.find(".schedule-title, .title, h3, h2").first().text().trim(); - if (!title) return; + const day = parseInt($el.find(".date .day").text().trim(), 10); + if (!day) return; - const rawDate = - $el.find(".schedule-date, .date, time").first().text().trim() || - $el.find("time").attr("datetime") || - ""; - const date = parseJapaneseDate(rawDate); - if (!date) return; + // Detect month rollover when day numbers reset + if (prevDay > 0 && day < prevDay) { + month++; + if (month > 12) { + month = 1; + year++; + } + } + prevDay = day; - const timeText = $el.find(".schedule-time, .time").first().text(); - const openMatch = timeText.match(/OPEN\s*(\d{2}:\d{2})/i); - const startMatch = timeText.match(/START\s*(\d{2}:\d{2})/i); + const date = `${year}-${String(month).padStart(2, "0")}-${String(day).padStart(2, "0")}`; + + const title = $el.find("h3").text().trim(); + if (!title) return; - const detailHref = $el.find("a").first().attr("href") ?? null; + const timeText = $el.find(".openstart").text(); + const times = timeText.match(/\d{2}:\d{2}/g) ?? []; + + const href = $el.find("a").first().attr("href") ?? null; events.push({ venue_id: venue.id, title, - artist: $el.find(".artist").first().text().trim() || null, + artist: null, date, - open_time: openMatch?.[1] ?? null, - start_time: startMatch?.[1] ?? null, - ticket_url: - $el.find("a[href*='eplus'], a[href*='pia'], a[href*='ticket']").first().attr("href") ?? null, - image_url: - $el.find("img").first().attr("src") - ? absoluteUrl($el.find("img").first().attr("src")!, venue.url) - : null, - source_url: detailHref ? absoluteUrl(detailHref, venue.url) : null, + open_time: times[0] ?? null, + start_time: times[1] ?? null, + image_url: null, + source_url: href ? absoluteUrl(href, venue.url) : null, }); }); @@ -61,18 +67,6 @@ export const scraper: Scraper = { }, }; -function parseJapaneseDate(raw: string): string | null { - const m = - raw.match(/(\d{4})[./年](\d{1,2})[./月](\d{1,2})/) || - raw.match(/(\d{1,2})[./月](\d{1,2})/); - if (!m) return null; - if (m.length === 4) { - return `${m[1]}-${m[2].padStart(2, "0")}-${m[3].padStart(2, "0")}`; - } - const year = new Date().getFullYear(); - return `${year}-${m[1].padStart(2, "0")}-${m[2].padStart(2, "0")}`; -} - function absoluteUrl(url: string, base: string): string { if (url.startsWith("http")) return url; return url.startsWith("/") ? base + url : `${base}/${url}`; -- cgit v1.2.3