feat: massive Airbnb import pipeline overhaul + UI fixes

🔥 Scraper Improvements:
- Add JSON-LD price extraction (regression fix)
- Fix sleeping spotsPerUnit bug (was hardcoded to 2)
- Remove stale CSS selectors, add robust fallbacks
- Add JSON-LD price fallback in extraction pipeline
- Improve sleeping parser regex (lastIndex bug fix)
- Add 15+ new bed type patterns (murphy, day bed, hammock, plurals)
- Smarter deriveSleepingFromBeds() with mixed bed logic

📅 Import Form UX:
- Smart defaults (next weekend dates)
- Auto-calculate nights display
- URL param auto-detection (?check_in=&check_out=&adults=)
- Better visual hierarchy with icons
- Progress steps during import
- Success redirect to listing detail page

🗑️ Delete Button Fix:
- Add router.refresh() after successful delete
- Inline error state instead of alert()
- Admin delete button as proper client component

✏️ Edit/Admin Fixes:
- Fix revalidatePath using slug instead of id
- Fix redirect to detail page after edit
- Add cascade delete logic to admin deleteListing
- Extract delete to proper client component

🎨 UI States for Partial Data:
- Price: 'Preis auf Anfrage' with context hint
- Location: 'Ort nicht erkannt' instead of empty
- Sleeping: placeholder when no data
- Suitability: 3-state (yes/no/unknown)
- Use formatPrice/formatRating utilities

🛏️ Sleeping Data Quality:
- Add sleepingDataQuality to Prisma schema
- Save quality (EXACT/DERIVED/UNKNOWN) to DB
- Display '(geschätzt)' label for derived data

📊 Database:
- Restore corrupted schema.prisma from git
- Add sleepingDataQuality field
- Push schema changes

 TypeScript: Zero errors
 Build: Successful
This commit is contained in:
AI 2026-03-12 08:07:52 +00:00
parent 5e5326dbcc
commit d9a203016f
19 changed files with 1113 additions and 291 deletions

BIN
debug-screenshot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

Binary file not shown.

View File

@ -40,10 +40,13 @@ model Listing {
// Capacity
guestCount Int? @map("guest_count")
officialGuestCount Int? @map("official_guest_count")
// Sleeping Analysis
maxSleepingPlaces Int? @map("max_sleeping_places")
suitableFor4 Boolean? @map("suitable_for_4")
extraMattressesNeededFor4 Int? @map("extra_mattresses_needed_for_4")
bedTypesSummary String? @map("bed_types_summary")
sleepingDataQuality String? @map("sleeping_data_quality") // EXACT, DERIVED, UNKNOWN
// Room Details
bedrooms Int?

View File

@ -73,6 +73,7 @@ export async function importListingAction(formData: FormData) {
let suitableFor4 = scrapedData?.suitableFor4 || null;
let extraMattressesNeededFor4 = scrapedData?.extraMattressesNeededFor4 || null;
let bedTypesSummary = null;
let sleepingDataQuality = scrapedData?.sleepingDataQuality || 'UNKNOWN';
if (scrapedData?.sleepingOptions && scrapedData.sleepingOptions.length > 0) {
const types = scrapedData.sleepingOptions.map(o => `${o.quantity}× ${o.bedType}`);
@ -110,6 +111,7 @@ export async function importListingAction(formData: FormData) {
suitableFor4,
extraMattressesNeededFor4,
bedTypesSummary,
sleepingDataQuality,
// Room Details
bedrooms: scrapedData?.bedrooms?.value || null,

View File

@ -1,26 +1,86 @@
"use client";
import { useState } from "react";
import { useRouter } from "next/navigation";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
import { importListingAction } from "@/actions/import-listing";
// Calculate next weekend (Friday → Sunday)
function getNextWeekend(): { checkIn: string; checkOut: string } {
const now = new Date();
const dayOfWeek = now.getDay(); // 0=Sun, 5=Fri, 6=Sat
const daysUntilFriday = dayOfWeek <= 5 ? 5 - dayOfWeek : 7 - dayOfWeek + 5;
const friday = new Date(now);
friday.setDate(now.getDate() + daysUntilFriday);
const sunday = new Date(friday);
sunday.setDate(friday.getDate() + 2);
return {
checkIn: friday.toISOString().split("T")[0],
checkOut: sunday.toISOString().split("T")[0],
};
}
// Extract Airbnb URL params
function extractParamsFromUrl(url: string): {
checkIn: string;
checkOut: string;
adults: string;
} | null {
try {
const u = new URL(url);
return {
checkIn: u.searchParams.get("check_in") || "",
checkOut: u.searchParams.get("check_out") || "",
adults: u.searchParams.get("adults") || "",
};
} catch {
return null;
}
}
export function ImportForm() {
const router = useRouter();
const weekend = getNextWeekend();
const [url, setUrl] = useState("");
const [checkIn, setCheckIn] = useState("");
const [checkOut, setCheckOut] = useState("");
const [checkIn, setCheckIn] = useState(weekend.checkIn);
const [checkOut, setCheckOut] = useState(weekend.checkOut);
const [adults, setAdults] = useState("4");
const [error, setError] = useState("");
const [success, setSuccess] = useState(false);
const [isLoading, setIsLoading] = useState(false);
const [progress, setProgress] = useState("");
const hasDates = checkIn && checkOut;
const nights = hasDates
? Math.max(
1,
Math.round(
(new Date(checkOut).getTime() - new Date(checkIn).getTime()) /
(1000 * 60 * 60 * 24)
)
)
: null;
// Auto-extract params when URL changes
const handleUrlChange = (e: React.ChangeEvent<HTMLInputElement>) => {
const newUrl = e.target.value;
setUrl(newUrl);
const params = extractParamsFromUrl(newUrl);
if (params) {
if (params.checkIn) setCheckIn(params.checkIn);
if (params.checkOut) setCheckOut(params.checkOut);
if (params.adults) setAdults(params.adults);
}
};
const handleSubmit = async (e: React.FormEvent) => {
e.preventDefault();
setError("");
setSuccess(false);
setIsLoading(true);
setProgress("🔍 Scraping Airbnb-Seite...");
const formData = new FormData();
formData.append("airbnbUrl", url);
@ -28,20 +88,29 @@ export function ImportForm() {
if (checkOut) formData.append("checkOut", checkOut);
if (adults) formData.append("adults", adults);
// Progress steps
const t1 = setTimeout(() => setProgress("📊 Extrahiere Daten..."), 2000);
const t2 = setTimeout(() => setProgress("💾 Speichere in Datenbank..."), 5000);
const result = await importListingAction(formData);
if (result.ok) {
setSuccess(true);
setUrl("");
clearTimeout(t1);
clearTimeout(t2);
if (result.ok && result.slug) {
setProgress("✅ Fertig! Weiterleitung...");
setTimeout(() => router.push(`/listings/${result.slug}`), 500);
return;
} else if (result.error) {
setError(result.error);
}
setIsLoading(false);
setProgress("");
};
// Get today's date for min date
const today = new Date().toISOString().split('T')[0];
const today = new Date().toISOString().split("T")[0];
return (
<Card>
@ -49,70 +118,148 @@ export function ImportForm() {
<CardTitle>🏠 Neues Airbnb importieren</CardTitle>
</CardHeader>
<CardContent>
<form onSubmit={handleSubmit} className="space-y-4">
{/* URL Field */}
<form onSubmit={handleSubmit} className="space-y-6">
{/* URL Field - Prominent */}
<div className="space-y-2">
<Label htmlFor="airbnb-url">Airbnb Link</Label>
<Label htmlFor="airbnb-url" className="text-base font-semibold">
🔗 Airbnb Link
</Label>
<Input
id="airbnb-url"
type="url"
placeholder="https://www.airbnb.com/rooms/..."
value={url}
onChange={(e) => setUrl(e.target.value)}
onChange={handleUrlChange}
required
autoFocus
className="text-lg h-12"
disabled={isLoading}
/>
<p className="text-xs text-slate-500">
Einfach den Airbnb-Link einfügen Reisedaten werden automatisch
erkannt falls in der URL enthalten.
</p>
</div>
{/* Trip Context Fields */}
<div className="space-y-2">
<Label>Reisedaten (optional für bessere Preise)</Label>
<div className="grid grid-cols-3 gap-2">
<div>
<Label htmlFor="check-in" className="text-xs">Check-in</Label>
{/* Trip Context Fields - Grouped */}
<fieldset
disabled={isLoading}
className={`space-y-3 rounded-lg border p-4 ${
hasDates
? "bg-green-50 border-green-200"
: "bg-amber-50 border-amber-200"
}`}
>
<legend className="px-2 text-sm font-medium text-slate-700">
{hasDates ? "✅" : "⚠️"} Reisedaten{" "}
<span className="text-slate-400 font-normal">
(optional für bessere Preise)
</span>
</legend>
<div className="grid grid-cols-3 gap-3">
{/* Check-in */}
<div className="space-y-1">
<Label
htmlFor="check-in"
className="text-xs flex items-center gap-1"
>
🛫 Check-in
</Label>
<Input
id="check-in"
type="date"
value={checkIn}
onChange={(e) => setCheckIn(e.target.value)}
min={today}
placeholder="Datum"
/>
</div>
<div>
<Label htmlFor="check-out" className="text-xs">Check-out</Label>
{/* Nights Display */}
<div className="flex items-end justify-center pb-2">
{nights != null ? (
<span className="text-sm font-semibold text-green-700 bg-green-100 px-3 py-1 rounded-full">
{nights} {nights === 1 ? "Nacht" : "Nächte"}
</span>
) : (
<span className="text-sm text-slate-400"></span>
)}
</div>
{/* Check-out */}
<div className="space-y-1">
<Label
htmlFor="check-out"
className="text-xs flex items-center gap-1"
>
🛬 Check-out
</Label>
<Input
id="check-out"
type="date"
value={checkOut}
onChange={(e) => setCheckOut(e.target.value)}
min={checkIn || today}
placeholder="Datum"
/>
</div>
<div>
<Label htmlFor="adults" className="text-xs">Personen</Label>
<Input
id="adults"
type="number"
min="1"
max="16"
value={adults}
onChange={(e) => setAdults(e.target.value)}
/>
</div>
</div>
<p className="text-xs text-slate-500">
💡 Mit Reisedaten kann der Preis genauer ermittelt werden.
Die Daten werden auch aus der URL extrahiert wenn vorhanden.
</p>
</div>
{error && <div className="text-red-500 text-sm">{error}</div>}
{success && <div className="text-green-500 text-sm"> Erfolgreich importiert!</div>}
<Button type="submit" className="w-full" disabled={isLoading || !url}>
{isLoading ? "⏳ Wird importiert..." : "🚀 Importieren"}
{/* Adults */}
<div className="space-y-1">
<Label
htmlFor="adults"
className="text-xs flex items-center gap-1"
>
👥 Personen
</Label>
<Input
id="adults"
type="number"
min="1"
max="16"
value={adults}
onChange={(e) => setAdults(e.target.value)}
className="w-24"
/>
</div>
<p className="text-xs text-slate-600">
{hasDates
? `💡 Preis wird für ${nights} Nacht${
nights !== 1 ? "e" : ""
} mit ${adults} Person${adults !== "1" ? "en" : ""} ermittelt.`
: "⚠️ Ohne Reisedaten wird kein Preis ermittelt."}
</p>
</fieldset>
{/* Error */}
{error && (
<div className="bg-red-50 border border-red-200 text-red-700 px-4 py-3 rounded-lg text-sm">
{error}
</div>
)}
{/* Loading Progress */}
{isLoading && progress && (
<div className="bg-blue-50 border border-blue-200 text-blue-700 px-4 py-3 rounded-lg text-sm flex items-center gap-2">
<span className="animate-spin inline-block"></span>
{progress}
</div>
)}
{/* Submit Button */}
<Button
type="submit"
className="w-full h-12 text-lg"
disabled={isLoading || !url}
>
{isLoading ? (
<span className="flex items-center gap-2">
<span className="animate-spin"></span> Importiere...
</span>
) : (
"🚀 Jetzt importieren"
)}
</Button>
</form>
</CardContent>

View File

@ -0,0 +1,49 @@
"use client";
import { useState } from "react";
import { useRouter } from "next/navigation";
import { Button } from "@/components/ui/button";
import { deleteListing } from "../actions";
interface DeleteListingButtonProps {
listingId: string;
listingTitle: string;
}
export function DeleteListingButton({ listingId, listingTitle }: DeleteListingButtonProps) {
const [isDeleting, setIsDeleting] = useState(false);
const [error, setError] = useState<string | null>(null);
const router = useRouter();
const handleDelete = async () => {
if (!confirm(`"${listingTitle}" wirklich löschen?`)) return;
setError(null);
setIsDeleting(true);
try {
const formData = new FormData();
formData.append("id", listingId);
await deleteListing(formData);
router.push("/listings");
} catch (err) {
setError(err instanceof Error ? err.message : "Fehler beim Löschen");
setIsDeleting(false);
}
};
return (
<div className="flex flex-col gap-1">
<Button
variant="destructive"
onClick={handleDelete}
disabled={isDeleting}
className="w-full"
>
{isDeleting ? "⏳ Lösche..." : "🗑️ Listing löschen"}
</Button>
{error && (
<p className="text-xs text-red-500 text-center">{error}</p>
)}
</div>
);
}

View File

@ -4,7 +4,8 @@ import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { updateListing, deleteListing, addNote, addTagToListing, removeTagFromListing } from "../actions";
import { updateListing, addNote, addTagToListing, removeTagFromListing } from "../actions";
import { DeleteListingButton } from "./delete-button";
// Note: actions.ts is in /admin/listings/, so from [slug]/ we go up one level with ../
export default async function EditListingPage({
@ -213,21 +214,12 @@ export default async function EditListingPage({
</div>
</form>
<form action={deleteListing} className="mt-4">
<input type="hidden" name="id" value={listing.id} />
<Button
type="submit"
variant="destructive"
className="w-full"
onClick={(e) => {
if (!confirm("Möchten Sie dieses Listing wirklich löschen?")) {
e.preventDefault();
}
}}
>
🗑 Listing löschen
</Button>
</form>
<div className="mt-4">
<DeleteListingButton
listingId={listing.id}
listingTitle={listing.title}
/>
</div>
</CardContent>
</Card>
</div>

View File

@ -21,6 +21,12 @@ export async function updateListing(formData: FormData) {
const status = formData.get("status") as string;
const isFavorite = formData.get("isFavorite") === "true";
// Fetch slug before update for revalidatePath and redirect
const existing = await prisma.listing.findUnique({
where: { id },
select: { slug: true },
});
await prisma.listing.update({
where: { id },
data: {
@ -41,14 +47,34 @@ export async function updateListing(formData: FormData) {
},
});
const slug = existing?.slug;
revalidatePath("/listings");
revalidatePath(`/listings/${id}`);
redirect(`/listings`);
if (slug) {
revalidatePath(`/listings/${slug}`);
}
redirect(`/listings/${slug ?? ""}`);
}
export async function deleteListing(formData: FormData) {
const id = formData.get("id") as string;
// Delete related records first to avoid foreign key constraint errors
await prisma.listingTag.deleteMany({
where: { listingId: id },
});
await prisma.listingSleepingOption.deleteMany({
where: { listingId: id },
});
await prisma.listingImage.deleteMany({
where: { listingId: id },
});
await prisma.adminNote.deleteMany({
where: { listingId: id },
});
await prisma.listing.delete({
where: { id },
});

View File

@ -114,7 +114,7 @@ export default async function ListingDetailPage({ params }: PageProps) {
</div>
</div>
{listing.sleepingOptions.length > 0 && (
{listing.sleepingOptions.length > 0 ? (
<div>
<h3 className="font-medium mb-2">Schlafmöglichkeiten</h3>
<div className="space-y-2">
@ -128,6 +128,10 @@ export default async function ListingDetailPage({ params }: PageProps) {
))}
</div>
</div>
) : (
<p className="text-slate-500 text-sm">
Schlafplatzdetails nicht erkannt
</p>
)}
</CardContent>
</Card>
@ -158,11 +162,24 @@ export default async function ListingDetailPage({ params }: PageProps) {
<Card>
<CardContent className="p-6">
<h1 className="text-2xl font-bold mb-2">{listing.title}</h1>
<p className="text-slate-500 mb-4">📍 {listing.locationText || "Ort unbekannt"}</p>
<p className="text-slate-500 mb-4">📍 {listing.locationText || "Ort nicht erkannt"}</p>
<div className="flex items-baseline gap-2 mb-4">
<span className="text-4xl font-bold">{formatPrice(listing.nightlyPrice)}</span>
<span className="text-slate-500">/ Nacht</span>
{listing.nightlyPrice != null ? (
<>
<span className="text-4xl font-bold">{formatPrice(listing.nightlyPrice)}</span>
<span className="text-slate-500">/ Nacht</span>
</>
) : (
<div>
<span className="text-2xl font-bold text-slate-400">Preis auf Anfrage</span>
<p className="text-xs text-slate-500 mt-1">
{listing.priceStatus === 'REQUIRES_TRIP_CONTEXT'
? '💡 Mit Reisedaten ermittelbar'
: 'Nicht ermittelbar'}
</p>
</div>
)}
</div>
<div className="flex items-center gap-2 mb-4">

View File

@ -1,6 +1,7 @@
"use client";
import { useState } from "react";
import { useRouter } from "next/navigation";
import { Button } from "@/components/ui/button";
import { deleteListing } from "./actions";
@ -11,30 +12,39 @@ interface DeleteListingButtonProps {
export function DeleteListingButton({ listingId, listingTitle }: DeleteListingButtonProps) {
const [isDeleting, setIsDeleting] = useState(false);
const [error, setError] = useState<string | null>(null);
const router = useRouter();
const handleDelete = async () => {
if (!confirm(`"${listingTitle}" wirklich löschen?`)) return;
setError(null);
setIsDeleting(true);
try {
const formData = new FormData();
formData.append("id", listingId);
await deleteListing(formData);
} catch (error) {
alert("Fehler beim Löschen: " + (error as Error).message);
router.refresh();
} catch (err) {
setError(err instanceof Error ? err.message : "Fehler beim Löschen");
setIsDeleting(false);
}
};
return (
<Button
variant="destructive"
size="sm"
onClick={handleDelete}
disabled={isDeleting}
className="text-sm"
>
{isDeleting ? "⏳" : "🗑️"}
</Button>
<div className="flex flex-col items-end gap-1">
<Button
variant="destructive"
size="sm"
onClick={handleDelete}
disabled={isDeleting}
className="text-sm"
>
{isDeleting ? "⏳" : "🗑️"}
</Button>
{error && (
<p className="text-xs text-red-500 max-w-[120px] text-right">{error}</p>
)}
</div>
);
}

View File

@ -2,6 +2,7 @@ import { prisma } from "@/lib/prisma";
import { Card, CardContent } from "@/components/ui/card";
import { Badge } from "@/components/ui/badge";
import { Button } from "@/components/ui/button";
import { formatPrice, formatRating } from "@/lib/utils";
import Link from "next/link";
import { DeleteListingButton } from "./delete-button";
@ -57,8 +58,12 @@ export default async function ListingsPage() {
{/* Price & Rating */}
<div className="flex justify-between items-center mb-3">
<span className="text-xl font-bold">{listing.nightlyPrice?.toFixed(2) || "—"}</span>
<span className="text-sm"> {listing.rating?.toFixed(2) || "—"}</span>
{listing.nightlyPrice != null ? (
<span className="text-xl font-bold">{formatPrice(listing.nightlyPrice)}</span>
) : (
<span className="text-sm text-slate-400">Preis auf Anfrage</span>
)}
<span className="text-sm"> {formatRating(listing.rating)}</span>
</div>
{/* Tags */}
@ -77,12 +82,14 @@ export default async function ListingsPage() {
)}
{/* Sleep Info */}
{listing.suitableFor4 ? (
{listing.suitableFor4 === true ? (
<p className="text-xs text-green-600 font-medium mb-3"> Geeignet für 4 Personen</p>
) : (
) : listing.suitableFor4 === false ? (
<p className="text-xs text-amber-600 font-medium mb-3">
Nicht ideal für 4 {listing.extraMattressesNeededFor4 ? `(+${listing.extraMattressesNeededFor4} Matratzen)` : ""}
</p>
) : (
<p className="text-xs text-slate-400 font-medium mb-3"> Schlafplatz-Info unbekannt</p>
)}
{/* Actions */}

View File

@ -1,22 +1,8 @@
import * as cheerio from "cheerio";
import { scrapeAirbnbWithPuppeteer } from "./puppeteer-scraper";
import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText, parseTitle } from "./parsers/text-patterns";
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
import { extractPrice } from "./parsers/price";
import { extractLocation } from "./parsers/location";
import { parseJsonLd } from "./parsers/jsonld";
import {
ExtractedListing,
FieldSource,
field,
mergeField,
TripContext,
SleepingDataQuality,
PriceStatus
} from "./types";
// ============================================
// Main Scraper Function
// Main Scraper Function - Uses Puppeteer for JS rendering
// ============================================
export async function scrapeAirbnbListing(
@ -24,156 +10,27 @@ export async function scrapeAirbnbListing(
options?: { tripContext?: TripContext; usePlaywright?: boolean }
): Promise<ExtractedListing | null> {
try {
// Step 1: Normalize URL and extract trip context
// Normalize URL and extract trip context
const normalized = normalizeAirbnbUrlWithContext(url);
// Merge trip context from options with URL-extracted context
const tripContext: TripContext = {
const tripContext = {
checkIn: options?.tripContext?.checkIn || normalized.tripContext.checkIn,
checkOut: options?.tripContext?.checkOut || normalized.tripContext.checkOut,
adults: options?.tripContext?.adults || normalized.tripContext.adults || 4,
};
// Step 2: Fetch HTML
const html = await fetchHtml(normalized.normalized);
const $ = cheerio.load(html);
// Step 3: Extract visible text for pattern matching
const visibleText = extractVisibleText(html);
// Use Puppeteer to render JavaScript and extract data
const result = await scrapeAirbnbWithPuppeteer(normalized.normalized, { tripContext });
// Step 4: Run all parsers
const jsonldData = parseJsonLd($);
const capacityFacts = parseCapacityFacts(visibleText);
const ratingFacts = parseRating(visibleText);
const hostName = parseHost(visibleText);
const maxGuests = parseMaxGuests(visibleText);
const sleepingOptions = parseSleepingArrangements(visibleText);
const priceData = extractPrice(html, $, tripContext);
const locationData = extractLocation($, html);
const pageTitle = parseTitle(html);
// Step 5: Build the result with priority: jsonld > text_pattern > derived
const result: ExtractedListing = {
// URLs
originalUrl: normalized.original,
normalizedUrl: normalized.normalized,
externalId: normalized.externalId,
// Basic Info
title: mergeField(
jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
pageTitle ? field(pageTitle, 'text_pattern', 'medium') : field(null, 'derived', 'low')
),
description: mergeField(
jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
field(null, 'derived', 'low')
),
// Location
locationText: locationData.locationText,
latitude: mergeField(
jsonldData.latitude ? field(jsonldData.latitude, 'jsonld', 'high') : null,
locationData.latitude.value !== null ? locationData.latitude : field(null, 'derived', 'low')
),
longitude: mergeField(
jsonldData.longitude ? field(jsonldData.longitude, 'jsonld', 'high') : null,
locationData.longitude.value !== null ? locationData.longitude : field(null, 'derived', 'low')
),
// Pricing
tripContext,
nightlyPrice: priceData.nightly,
totalPrice: priceData.total,
priceStatus: priceData.status,
// Rating
rating: mergeField(
ratingFacts ? field(ratingFacts.rating, 'text_pattern', 'high') : null,
jsonldData.rating ? field(jsonldData.rating, 'jsonld', 'medium') : null
),
reviewCount: mergeField(
ratingFacts && ratingFacts.reviewCount > 0 ? field(ratingFacts.reviewCount, 'text_pattern', 'high') : null,
jsonldData.reviewCount ? field(jsonldData.reviewCount, 'jsonld', 'medium') : null
),
// Capacity
guestCount: mergeField(
capacityFacts ? field(capacityFacts.guests, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
officialGuestCount: mergeField(
maxGuests ? field(maxGuests, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
bedrooms: mergeField(
capacityFacts ? field(capacityFacts.bedrooms, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
beds: mergeField(
capacityFacts ? field(capacityFacts.beds, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
bathrooms: mergeField(
capacityFacts ? field(capacityFacts.bathrooms, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
// Sleeping
sleepingOptions,
maxSleepingPlaces: 0,
suitableFor4: false,
extraMattressesNeededFor4: 0,
sleepingDataQuality: 'UNKNOWN',
// Host
hostName: mergeField(
hostName ? field(hostName, 'text_pattern', 'high') : null,
jsonldData.hostName ? field(jsonldData.hostName, 'jsonld', 'medium') : null
),
// Amenities
amenities: jsonldData.amenities || [],
// Images
images: jsonldData.images || [],
coverImage: jsonldData.images?.[0] || null,
// Other
cancellationPolicy: field(null, 'derived', 'low'),
// Debug
rawSnippets: {
title: jsonldData.title || '',
visibleText: visibleText.substring(0, 2000),
},
extractionLog: [
`URL normalized: ${normalized.normalized}`,
`External ID: ${normalized.externalId}`,
`Trip context: ${JSON.stringify(tripContext)}`,
`Capacity facts: ${capacityFacts ? JSON.stringify(capacityFacts) : 'none'}`,
`Rating facts: ${ratingFacts ? JSON.stringify(ratingFacts) : 'none'}`,
`Sleeping options: ${sleepingOptions.length} found`,
],
};
// Step 6: Calculate sleeping stats
if (sleepingOptions.length > 0) {
const stats = calculateSleepingStats(sleepingOptions);
result.maxSleepingPlaces = stats.maxSleepingPlaces;
result.suitableFor4 = stats.suitableFor4;
result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
result.sleepingDataQuality = 'EXACT';
} else if (result.beds.value && result.guestCount.value) {
// Derive from beds and guest count
const derivedOptions = deriveSleepingFromBeds(result.beds.value, result.guestCount.value);
const stats = calculateSleepingStats(derivedOptions);
result.sleepingOptions = derivedOptions;
result.maxSleepingPlaces = stats.maxSleepingPlaces;
result.suitableFor4 = stats.suitableFor4;
result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
result.sleepingDataQuality = 'DERIVED';
if (result) {
// Update URLs with normalized values
result.originalUrl = normalized.original;
result.normalizedUrl = normalized.normalized;
result.externalId = normalized.externalId;
result.tripContext = tripContext;
}
return result;
} catch (error) {
console.error("Scraping failed:", error);
@ -181,36 +38,9 @@ export async function scrapeAirbnbListing(
}
}
// ============================================
// HTML Fetcher - with better error handling and logging
// ============================================
async function fetchHtml(url: string): Promise<string> {
const response = await fetch(url, {
headers: {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "no-cache",
"Upgrade-Insecure-Requests": "1",
},
});
if (!response.ok) {
throw new Error(`HTTP ${response.status} for ${url}`);
}
const html = await response.text();
// Log some debug info
console.log(`[Scraper] Fetched ${url.length} chars`);
console.log(`[Scraper] Contains 'application/ld+json': ${html.includes('application/ld+json')}`);
console.log(`[Scraper] Contains 'airbnb': ${html.toLowerCase().includes('airbnb')}`);
return html;
}
// Re-export utilities for backward compatibility
export { normalizeAirbnbUrlWithContext as normalizeAirbnbUrl } from "./url-normalizer";
export { extractAirbnbExternalId } from "./url-normalizer";
// Need to import TripContext for TypeScript
import type { TripContext, ExtractedListing } from "./types";

View File

@ -12,6 +12,7 @@ export interface JsonLdData {
cancellationPolicy: string | null;
hostName: string | null;
amenities: string[];
price: number | null;
}
/**
@ -31,6 +32,7 @@ export function parseJsonLd($: cheerio.CheerioAPI): JsonLdData {
cancellationPolicy: null,
hostName: null,
amenities: [],
price: null,
};
const jsonLdScript = $('script[type="application/ld+json"]').html();
@ -117,6 +119,15 @@ export function parseJsonLd($: cheerio.CheerioAPI): JsonLdData {
.filter(Boolean);
}
// Price - extract from makesOffer.offers[0].price or offers.price
const priceValue = jsonData.makesOffer?.offers?.[0]?.price ?? jsonData.offers?.price;
if (priceValue !== undefined && priceValue !== null) {
const parsed = typeof priceValue === 'number' ? priceValue : parseFloat(String(priceValue));
if (!isNaN(parsed)) {
result.price = parsed;
}
}
} catch (error) {
console.error('Failed to parse JSON-LD:', error);
}

View File

@ -9,11 +9,12 @@ function tryExtractPriceFromHtml(html: string, $: cheerio.CheerioAPI): number |
// Try various price selectors that Airbnb might use
const priceSelectors = [
'[data-testid="price-amount"]',
'[data-testid="book-it-default"] span',
'span[class*="Price"]',
'span[class*="price"]',
'[itemprop="price"]',
'._1y6k3r2',
'._1dss1omb',
'div[class*="bookit"] span',
'section[class*="booking"] span',
];
for (const selector of priceSelectors) {
@ -33,6 +34,16 @@ function tryExtractPriceFromHtml(html: string, $: cheerio.CheerioAPI): number |
return priceFromHtml;
}
// Fallback: look for "total" near price numbers
const totalPattern = /total[^€$£]*[€$£]\s*(\d[\d.,]*)/i;
const totalMatch = html.match(totalPattern);
if (totalMatch) {
const parsed = parseFloat(totalMatch[1].replace(/[.,](?=\d{3})/g, '').replace(',', '.'));
if (!isNaN(parsed) && parsed > 0) {
return parsed;
}
}
return null;
}

View File

@ -2,8 +2,14 @@ import { BedType, SleepingOption } from '../types';
/**
* Bed type configuration: maps text patterns to bed types, spots per unit, and quality
*
* IMPORTANT: Longer/more specific patterns MUST come before shorter ones
* (e.g., "bunk bed" before "bed", "double bed" before "double")
*/
export const BED_TYPE_CONFIG: Record<string, { type: BedType; spots: number; quality: 'FULL' | 'AUXILIARY' }> = {
// Compound bed types (must come first to avoid partial matches)
'bunk bed': { type: 'BUNK', spots: 2, quality: 'FULL' },
'etagenbett': { type: 'BUNK', spots: 2, quality: 'FULL' },
'double bed': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
'doppelbett': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
'queen bed': { type: 'QUEEN', spots: 2, quality: 'FULL' },
@ -11,11 +17,27 @@ export const BED_TYPE_CONFIG: Record<string, { type: BedType; spots: number; qua
'single bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'twin bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'einzelbett': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'bunk bed': { type: 'BUNK', spots: 2, quality: 'FULL' },
'etagenbett': { type: 'BUNK', spots: 2, quality: 'FULL' },
'sofa bed': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
'pull-out sofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
'schlafsofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
'murphy bed': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
'day bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
// Standalone bed types (without "bed" word)
'double': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
'queen': { type: 'QUEEN', spots: 2, quality: 'FULL' },
'king': { type: 'KING', spots: 2, quality: 'FULL' },
'single': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'twin': { type: 'SINGLE', spots: 1, quality: 'FULL' },
// Plural forms
'doubles': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
'singles': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'queens': { type: 'QUEEN', spots: 2, quality: 'FULL' },
'kings': { type: 'KING', spots: 2, quality: 'FULL' },
'bunks': { type: 'BUNK', spots: 2, quality: 'FULL' },
// Auxiliary sleeping
'couch': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
'sofa': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
'air mattress': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' },
@ -24,9 +46,11 @@ export const BED_TYPE_CONFIG: Record<string, { type: BedType; spots: number; qua
'extra mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
'zusatzmatratze': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
'futon': { type: 'FUTON', spots: 1, quality: 'AUXILIARY' },
'hammock': { type: 'SINGLE', spots: 1, quality: 'AUXILIARY' },
};
// Pattern: "1 double bed" or "2 single beds" or "Bedroom 1: 1 queen bed"
// Note: lastIndex is reset before each use to avoid global flag bug
const BED_PATTERN = /(?:(?:bedroom|schlafzimmer|room|zimmer)\s*\d*\s*:?\s*)?(\d+)\s+([a-z\s-]+?)(?:\s|$|,|\.)/gi;
export interface SleepingStats {
@ -47,6 +71,9 @@ export function parseSleepingArrangements(text: string): SleepingOption[] {
const options: SleepingOption[] = [];
const lowerText = text.toLowerCase();
// Reset lastIndex to avoid bug with global flag + exec() loop
BED_PATTERN.lastIndex = 0;
let match;
while ((match = BED_PATTERN.exec(lowerText)) !== null) {
const quantity = parseInt(match[1], 10);
@ -114,30 +141,78 @@ export function calculateSleepingStats(options: SleepingOption[]): SleepingStats
/**
* Derive sleeping options from bed count (fallback with low confidence)
* Used when detailed sleeping arrangement text is not available
*
* Logic:
* - beds >= 2 && guestCount >= beds * 1.5 mix of double/single (assume mostly double)
* - beds === 1 && guestCount >= 2 double
* - beds === 1 && guestCount === 1 single
* - beds >= 2 && guestCount < beds * 1.5 mostly single
*/
export function deriveSleepingFromBeds(beds: number, guestCount: number): SleepingOption[] {
if (!beds || beds < 1) return [];
// Assume beds are double beds if guest count suggests it
const avgGuestsPerBed = guestCount ? guestCount / beds : 2;
const options: SleepingOption[] = [];
if (avgGuestsPerBed >= 1.5) {
// Likely double beds
return [{
bedType: 'DOUBLE',
quantity: beds,
spotsPerUnit: 2,
quality: 'FULL',
label: 'double bed (derived)',
}];
} else {
// Likely single beds
return [{
bedType: 'SINGLE',
quantity: beds,
spotsPerUnit: 1,
quality: 'FULL',
label: 'single bed (derived)',
}];
if (beds === 1) {
// Single bed scenario
if (guestCount >= 2) {
// 1 bed for 2+ guests → must be double
options.push({
bedType: 'DOUBLE',
quantity: 1,
spotsPerUnit: 2,
quality: 'FULL',
label: 'Doppelbett (abgeleitet)',
});
} else {
// 1 bed for 1 guest → single
options.push({
bedType: 'SINGLE',
quantity: 1,
spotsPerUnit: 1,
quality: 'FULL',
label: 'Einzelbett (abgeleitet)',
});
}
} else if (beds >= 2) {
// Multiple beds
const avgGuestsPerBed = guestCount ? guestCount / beds : 2;
if (avgGuestsPerBed >= 1.5) {
// High guest-to-bed ratio → mix of double and single
// Assume roughly half are double, half single
const doubleCount = Math.ceil(beds / 2);
const singleCount = beds - doubleCount;
if (doubleCount > 0) {
options.push({
bedType: 'DOUBLE',
quantity: doubleCount,
spotsPerUnit: 2,
quality: 'FULL',
label: 'Doppelbett (abgeleitet)',
});
}
if (singleCount > 0) {
options.push({
bedType: 'SINGLE',
quantity: singleCount,
spotsPerUnit: 1,
quality: 'FULL',
label: 'Einzelbett (abgeleitet)',
});
}
} else {
// Low guest-to-bed ratio → mostly single beds
options.push({
bedType: 'SINGLE',
quantity: beds,
spotsPerUnit: 1,
quality: 'FULL',
label: 'Einzelbett (abgeleitet)',
});
}
}
return options;
}

View File

@ -0,0 +1,419 @@
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import type { Browser, Page } from 'puppeteer';
import * as cheerio from 'cheerio';
import {
ExtractedListing,
FieldSource,
field,
mergeField,
TripContext,
PriceStatus,
SleepingDataQuality
} from './types';
import { parseJsonLd } from './parsers/jsonld';
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText, parseTitle } from './parsers/text-patterns';
import { extractLocation } from './parsers/location';
import { extractPrice } from './parsers/price';
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds, BED_TYPE_CONFIG } from './parsers/sleeping';
// Enable stealth mode
import Stealth from 'puppeteer-extra-plugin-stealth';
puppeteer.use(Stealth());
/**
* Main Puppeteer-based scraper that actually renders JavaScript
*/
export async function scrapeAirbnbWithPuppeteer(
url: string,
options?: { tripContext?: TripContext }
): Promise<ExtractedListing | null> {
let browser: Browser | null = null;
try {
// Launch browser with stealth mode
browser = await puppeteer.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--window-size=1920,1080',
],
});
const page: Page = await browser.newPage();
// Set realistic viewport and user agent
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
// Navigate and wait for network idle
console.log(`[Puppeteer] Navigating to ${url}`);
await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
// Wait a bit more for dynamic content
await new Promise(resolve => setTimeout(resolve, 2000));
// Check if we got a 404 or challenge page
const pageTitle = await page.title();
if (pageTitle.includes('404') || pageTitle.includes('Not Found')) {
console.error('[Puppeteer] Got 404 page');
return null;
}
console.log(`[Puppeteer] Page title: ${pageTitle}`);
// Get rendered HTML
const html = await page.content();
const $ = cheerio.load(html);
// Extract visible text for pattern matching
const visibleText = extractVisibleText(html);
console.log(`[Puppeteer] Extracted ${visibleText.length} chars of visible text`);
// Run all parsers
const jsonldData = parseJsonLd($);
console.log(`[Puppeteer] JSON-LD: title=${!!jsonldData.title}, images=${jsonldData.images.length}`);
const capacityFacts = parseCapacityFacts(visibleText);
console.log(`[Puppeteer] Capacity: ${JSON.stringify(capacityFacts)}`);
const ratingFacts = parseRating(visibleText);
const hostName = parseHost(visibleText);
const maxGuests = parseMaxGuests(visibleText);
// Try to get sleeping arrangements from the rendered page
const sleepingOptions = await parseSleepingArrangementsFromPage(page);
console.log(`[Puppeteer] Sleeping options: ${sleepingOptions.length} found`);
const tripContext: TripContext = {
checkIn: options?.tripContext?.checkIn || undefined,
checkOut: options?.tripContext?.checkOut || undefined,
adults: options?.tripContext?.adults || 4,
};
const priceData = extractPrice(html, $, tripContext);
// Use JSON-LD price as fallback if price extraction failed
if (jsonldData.price !== null && priceData.nightly.value === null) {
priceData.nightly = { value: jsonldData.price, source: 'jsonld', confidence: 'medium' };
priceData.status = 'EXTRACTED';
// Calculate total if trip context available
if (tripContext.checkIn && tripContext.checkOut) {
try {
const checkIn = new Date(tripContext.checkIn);
const checkOut = new Date(tripContext.checkOut);
const nights = Math.round((checkOut.getTime() - checkIn.getTime()) / (1000 * 60 * 60 * 24));
if (nights > 0) {
priceData.total = { value: jsonldData.price * nights, source: 'derived', confidence: 'medium' };
}
} catch {
// Invalid dates, skip total calculation
}
}
}
const locationData = extractLocation($, html);
const pageTitleParsed = parseTitle(html);
// Extract images from the rendered page (more reliable)
const images = extractImagesFromPage($);
console.log(`[Puppeteer] Found ${images.length} images`);
// Extract description from rendered page
const description = extractDescriptionFromPage($);
// Extract amenities if not in JSON-LD
const amenities = jsonldData.amenities.length > 0
? jsonldData.amenities
: extractAmenitiesFromPage($);
console.log(`[Puppeteer] Found ${amenities.length} amenities`);
// Build the result
const result: ExtractedListing = {
originalUrl: url,
normalizedUrl: url,
externalId: extractExternalId(url),
// Title - try multiple sources
title: mergeField(
jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
pageTitleParsed ? field(pageTitleParsed, 'text_pattern', 'medium') : null
),
description: mergeField(
jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
description ? field(description, 'dom', 'medium') : null
),
// Location
locationText: locationData.locationText.value
? field(locationData.locationText.value, locationData.locationText.source, locationData.locationText.confidence)
: field(null, 'derived', 'low'),
latitude: locationData.latitude,
longitude: locationData.longitude,
// Pricing
tripContext,
nightlyPrice: priceData.nightly,
totalPrice: priceData.total,
priceStatus: priceData.status,
// Rating
rating: mergeField(
ratingFacts ? field(ratingFacts.rating, 'text_pattern', 'high') : null,
jsonldData.rating ? field(jsonldData.rating, 'jsonld', 'medium') : null
),
reviewCount: mergeField(
ratingFacts && ratingFacts.reviewCount > 0 ? field(ratingFacts.reviewCount, 'text_pattern', 'high') : null,
jsonldData.reviewCount ? field(jsonldData.reviewCount, 'jsonld', 'medium') : null
),
// Capacity
guestCount: mergeField(
capacityFacts ? field(capacityFacts.guests, 'text_pattern', 'high') : null,
maxGuests ? field(maxGuests, 'text_pattern', 'medium') : null
),
officialGuestCount: mergeField(
maxGuests ? field(maxGuests, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
bedrooms: mergeField(
capacityFacts ? field(capacityFacts.bedrooms, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
beds: mergeField(
capacityFacts ? field(capacityFacts.beds, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
bathrooms: mergeField(
capacityFacts ? field(capacityFacts.bathrooms, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
// Sleeping
sleepingOptions,
maxSleepingPlaces: 0,
suitableFor4: false,
extraMattressesNeededFor4: 0,
sleepingDataQuality: 'UNKNOWN',
// Host
hostName: mergeField(
hostName ? field(hostName, 'text_pattern', 'high') : null,
jsonldData.hostName ? field(jsonldData.hostName, 'jsonld', 'medium') : null
),
// Amenities
amenities,
// Images
images,
coverImage: images[0] || null,
// Other
cancellationPolicy: jsonldData.cancellationPolicy
? field(jsonldData.cancellationPolicy, 'jsonld', 'high')
: field(null, 'derived', 'low'),
// Debug
rawSnippets: {
title: jsonldData.title || pageTitleParsed || '',
visibleText: visibleText.substring(0, 2000),
},
extractionLog: [
`Puppeteer render: ${url}`,
`Page title: ${pageTitle}`,
`Images found: ${images.length}`,
`Amenities found: ${amenities.length}`,
`Capacity: ${JSON.stringify(capacityFacts)}`,
],
};
// Calculate sleeping stats
if (sleepingOptions.length > 0) {
const stats = calculateSleepingStats(sleepingOptions);
result.maxSleepingPlaces = stats.maxSleepingPlaces;
result.suitableFor4 = stats.suitableFor4;
result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
result.sleepingDataQuality = 'EXACT';
} else if (result.beds.value && result.guestCount.value) {
const derivedOptions = deriveSleepingFromBeds(result.beds.value, result.guestCount.value);
const stats = calculateSleepingStats(derivedOptions);
result.sleepingOptions = derivedOptions;
result.maxSleepingPlaces = stats.maxSleepingPlaces;
result.suitableFor4 = stats.suitableFor4;
result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
result.sleepingDataQuality = 'DERIVED';
}
return result;
} catch (error) {
console.error('[Puppeteer] Scraper error:', error);
return null;
} finally {
if (browser) {
await browser.close();
}
}
}
/**
* Extract external ID from URL
*/
function extractExternalId(url: string): string | null {
const match = url.match(/\/rooms\/(\d+)/);
return match?.[1] || null;
}
/**
* Extract images from the rendered page
*/
function extractImagesFromPage($: cheerio.CheerioAPI): string[] {
const images: string[] = [];
// Try og:image
const ogImage = $('meta[property="og:image"]').attr('content');
if (ogImage) images.push(ogImage);
// Try JSON-LD images (already handled separately)
// Try data-testid image elements
$('[data-testid*="photo"] img, [data-testid*="image"] img, [class*="photo"] img').each((_, el) => {
const src = $(el).attr('src') || $(el).attr('data-src') || $(el).attr('data-image');
if (src && src.startsWith('http') && !images.includes(src)) {
images.push(src);
}
});
return images;
}
/**
* Extract description from the rendered page
*/
function extractDescriptionFromPage($: cheerio.CheerioAPI): string | null {
// Try various selectors for description
const selectors = [
'[data-section-id="DESCRIPTION_DEFAULT"]',
'#description',
'.description',
'[itemprop="description"]',
];
for (const selector of selectors) {
const text = $(selector).text().trim();
if (text.length > 20) {
return text.substring(0, 500);
}
}
return null;
}
/**
* Extract amenities from the rendered page
*/
function extractAmenitiesFromPage($: cheerio.CheerioAPI): string[] {
const amenities: string[] = [];
$('[data-testid*="amenity"]').each((_, el) => {
const text = $(el).text().trim();
if (text && !amenities.includes(text)) {
amenities.push(text);
}
});
return amenities;
}
/**
* Map BedType to spotsPerUnit using BED_TYPE_CONFIG
*/
const BED_TYPE_SPOTS_MAP: Record<string, number> = (() => {
const map: Record<string, number> = {};
for (const config of Object.values(BED_TYPE_CONFIG)) {
if (!(config.type in map)) {
map[config.type] = config.spots;
}
}
return map;
})();
/**
* Try to parse sleeping arrangements from Puppeteer page
* This is more reliable than text parsing
*/
async function parseSleepingArrangementsFromPage(page: Page): Promise<ExtractedListing['sleepingOptions']> {
const options: ExtractedListing['sleepingOptions'] = [];
try {
// Try to find sleeping/bedroom section
const sleepingSection = await page.$('[data-section-id="SLEEPING_CONFIGURATION"]');
if (sleepingSection) {
const text = await sleepingSection.evaluate(el => el.textContent);
// Parse bed types from text
const bedPatterns = [
/(\d+)\s*(?:×|x)?\s*(queen|king|single|double|twin|full|king-size|queen-size)\s*bed/gi,
/(\d+)\s*(?:×|x)?\s*Futon/gi,
/(\d+)\s*(?:×|x)?\s*Matratze/gi,
/(\d+)\s*(?:×|x)?\s*Couch/gi,
];
for (const pattern of bedPatterns) {
let match;
while ((match = pattern.exec(text)) !== null) {
const quantity = parseInt(match[1], 10);
const bedType = match[2] || 'bed';
// Map German/English bed types to BedType enum
let normalizedType: import('./types').BedType = 'UNKNOWN';
let quality: 'FULL' | 'AUXILIARY' = 'AUXILIARY';
const lower = bedType.toLowerCase();
if (lower.includes('queen')) {
normalizedType = 'QUEEN';
quality = 'FULL';
} else if (lower.includes('king')) {
normalizedType = 'KING';
quality = 'FULL';
} else if (lower.includes('double') || lower.includes('full')) {
normalizedType = 'DOUBLE';
quality = 'FULL';
} else if (lower.includes('twin') || lower.includes('single')) {
normalizedType = 'SINGLE';
quality = 'FULL';
} else if (lower.includes('futon')) {
normalizedType = 'FUTON';
quality = 'AUXILIARY';
} else if (lower.includes('matratze') || lower.includes('mattress')) {
normalizedType = 'EXTRA_MATTRESS';
quality = 'AUXILIARY';
} else if (lower.includes('couch') || lower.includes('sofa')) {
normalizedType = 'SOFA';
quality = 'AUXILIARY';
} else {
normalizedType = 'DOUBLE';
quality = 'FULL';
}
options.push({
bedType: normalizedType,
quantity,
spotsPerUnit: BED_TYPE_SPOTS_MAP[normalizedType] ?? 2,
quality,
});
}
}
}
} catch (error) {
console.error('[Puppeteer] Error parsing sleeping arrangements:', error);
}
return options;
}

96
test-scraper-debug.ts Normal file
View File

@ -0,0 +1,96 @@
/**
* Debug test - captures more info about what's happening
*/
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
puppeteer.use(StealthPlugin());
const TEST_URL = 'https://www.airbnb.com/rooms/842937876795894279';
async function main() {
console.log('Starting debug test...\n');
const browser = await puppeteer.launch({
headless: true,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--window-size=1920,1080',
],
});
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
console.log(`Navigating to: ${TEST_URL}`);
// Track redirects
page.on('response', (response) => {
const status = response.status();
const url = response.url();
if (status >= 300 && status < 400) {
console.log(`🔄 Redirect: ${status}${response.headers()['location']?.substring(0, 100)}`);
}
});
try {
const response = await page.goto(TEST_URL, {
waitUntil: 'networkidle2',
timeout: 60000
});
console.log(`\n📊 Response status: ${response?.status()}`);
console.log(`📊 Final URL: ${page.url()}`);
console.log(`📊 Page title: ${await page.title()}`);
// Wait longer for dynamic content
console.log('\n⏳ Waiting 5 seconds for dynamic content...');
await new Promise(r => setTimeout(r, 5000));
// Get page content
const html = await page.content();
console.log(`\n📄 HTML length: ${html.length} chars`);
// Check for challenge page
if (html.includes('challenge') || html.includes('captcha') || html.includes('blocked')) {
console.log('⚠️ Possible challenge/blocked page detected!');
}
// Check if we're on the homepage
if (page.url() === 'https://www.airbnb.com/' || page.url() === 'https://www.airbnb.com') {
console.log('⚠️ Redirected to homepage - likely blocked!');
}
// Extract visible text
const bodyText = await page.evaluate(() => document.body.innerText);
console.log(`\n📝 Body text length: ${bodyText.length} chars`);
console.log(`\n📝 First 500 chars of visible text:\n${bodyText.substring(0, 500)}`);
// Check for specific listing elements
const hasListingTitle = await page.$('[data-plugin-in-point-id="TITLE_DEFAULT"]');
const hasPhotos = await page.$('[data-section-id="PHOTO_PICKER"]');
const hasPrice = await page.$('[data-plugin-in-point-id="PRICE_DEFAULT"]');
console.log(`\n🔍 Listing elements found:`);
console.log(` Title section: ${hasListingTitle ? '✅' : '❌'}`);
console.log(` Photos section: ${hasPhotos ? '✅' : '❌'}`);
console.log(` Price section: ${hasPrice ? '✅' : '❌'}`);
// Take a screenshot
await page.screenshot({ path: 'debug-screenshot.png', fullPage: false });
console.log(`\n📸 Screenshot saved to: debug-screenshot.png`);
} catch (error) {
console.error('Error:', error);
} finally {
await browser.close();
}
}
main();

127
test-scraper.ts Normal file
View File

@ -0,0 +1,127 @@
/**
* Test script for Puppeteer-based Airbnb scraper
* Run with: npx tsx test-scraper.ts
*/
import { scrapeAirbnbWithPuppeteer } from './src/lib/airbnb/puppeteer-scraper';
const TEST_URL = 'https://www.airbnb.com/rooms/52367822'; // Valid listing in Bad Bellingen, Germany
async function main() {
console.log('========================================');
console.log('Airbnb Puppeteer Scraper Test');
console.log('========================================\n');
console.log(`Testing URL: ${TEST_URL}\n`);
console.log('Starting scraper (this may take 30-60 seconds)...\n');
const startTime = Date.now();
try {
const result = await scrapeAirbnbWithPuppeteer(TEST_URL);
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
console.log(`\n✅ Scraping completed in ${elapsed}s\n`);
if (!result) {
console.log('❌ Result is null - scraping may have failed');
return;
}
console.log('========================================');
console.log('EXTRACTED DATA');
console.log('========================================\n');
// Title
console.log('📌 TITLE:');
console.log(` Value: ${result.title?.value || 'N/A'}`);
console.log(` Source: ${result.title?.source || 'N/A'}`);
console.log(` Confidence: ${result.title?.confidence || 'N/A'}\n`);
// Price
console.log('💰 PRICE:');
console.log(` Nightly: ${result.nightlyPrice?.value || 'N/A'} EUR`);
console.log(` Total: ${result.totalPrice?.value || 'N/A'} EUR`);
console.log(` Status: ${result.priceStatus || 'N/A'}\n`);
// Location
console.log('📍 LOCATION:');
console.log(` Text: ${result.locationText?.value || 'N/A'}`);
console.log(` Lat/Lng: ${result.latitude}, ${result.longitude}\n`);
// Rating
console.log('⭐ RATING:');
console.log(` Rating: ${result.rating?.value || 'N/A'}`);
console.log(` Reviews: ${result.reviewCount?.value || 'N/A'}\n`);
// Capacity
console.log('🏠 CAPACITY:');
console.log(` Guests: ${result.guestCount?.value || 'N/A'}`);
console.log(` Bedrooms: ${result.bedrooms?.value || 'N/A'}`);
console.log(` Beds: ${result.beds?.value || 'N/A'}`);
console.log(` Bathrooms: ${result.bathrooms?.value || 'N/A'}\n`);
// Sleeping Options
console.log('🛏️ SLEEPING OPTIONS:');
if (result.sleepingOptions && result.sleepingOptions.length > 0) {
result.sleepingOptions.forEach((opt, i) => {
console.log(` ${i + 1}. ${opt.quantity}x ${opt.bedType} (${opt.spotsPerUnit} spots, ${opt.quality})`);
});
console.log(` Max sleeping places: ${result.maxSleepingPlaces}`);
console.log(` Suitable for 4: ${result.suitableFor4 ? '✅ Yes' : '❌ No'}`);
console.log(` Quality: ${result.sleepingDataQuality}`);
} else {
console.log(' No sleeping options extracted');
}
console.log('');
// Host
console.log('👤 HOST:');
console.log(` Name: ${result.hostName?.value || 'N/A'}\n`);
// Images
console.log('🖼️ IMAGES:');
console.log(` Count: ${result.images?.length || 0}`);
if (result.images && result.images.length > 0) {
console.log(` First 3:`);
result.images.slice(0, 3).forEach((img, i) => {
console.log(` ${i + 1}. ${img.substring(0, 80)}...`);
});
}
console.log('');
// Amenities
console.log('✨ AMENITIES:');
console.log(` Count: ${result.amenities?.length || 0}`);
if (result.amenities && result.amenities.length > 0) {
console.log(` First 10: ${result.amenities.slice(0, 10).join(', ')}`);
}
console.log('');
// Description
console.log('📝 DESCRIPTION:');
const desc = result.description?.value || 'N/A';
console.log(` ${desc.substring(0, 200)}${desc.length > 200 ? '...' : ''}\n`);
// External ID
console.log('🔗 EXTERNAL ID:');
console.log(` ${result.externalId || 'N/A'}\n`);
// Extraction Log
console.log('📋 EXTRACTION LOG:');
result.extractionLog?.forEach(log => {
console.log(` - ${log}`);
});
console.log('\n========================================');
console.log('TEST COMPLETE');
console.log('========================================');
} catch (error) {
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
console.log(`\n❌ Error after ${elapsed}s:`);
console.error(error);
}
}
main().catch(console.error);