diff --git a/prisma/dev.db b/prisma/dev.db new file mode 100644 index 0000000..e69de29 diff --git a/prisma/prisma/dev.db b/prisma/prisma/dev.db index e5b7a6f..ae87d77 100644 Binary files a/prisma/prisma/dev.db and b/prisma/prisma/dev.db differ diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 425fad9..dd081c1 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -31,6 +31,7 @@ model Listing { nightlyPrice Float? @map("nightly_price") totalPrice Float? @map("total_price") currency String? @default("EUR") + priceStatus String? @map("price_status") // EXTRACTED, REQUIRES_TRIP_CONTEXT, UNKNOWN, PARTIAL // Rating rating Float? diff --git a/src/actions/import-listing.ts b/src/actions/import-listing.ts index 76e8245..b94fee3 100644 --- a/src/actions/import-listing.ts +++ b/src/actions/import-listing.ts @@ -2,17 +2,29 @@ import { z } from "zod"; import { prisma } from "@/lib/prisma"; -import { scrapeAirbnbListing, extractAirbnbExternalId, normalizeAirbnbUrl } from "@/lib/airbnb-scraper"; +import { scrapeAirbnbListing } from "@/lib/airbnb"; +import { normalizeAirbnbUrl, extractAirbnbExternalId } from "@/lib/airbnb/url-normalizer"; import { slugify } from "@/lib/utils"; import { revalidatePath } from "next/cache"; const schema = z.object({ airbnbUrl: z.string().url("Ungültige URL"), + checkIn: z.string().optional(), + checkOut: z.string().optional(), + adults: z.number().optional(), }); export async function importListingAction(formData: FormData) { + const url = formData.get("airbnbUrl") as string; + const checkIn = formData.get("checkIn") as string | null; + const checkOut = formData.get("checkOut") as string | null; + const adultsStr = formData.get("adults") as string | null; + const parsed = schema.safeParse({ - airbnbUrl: formData.get("airbnbUrl"), + airbnbUrl: url, + checkIn: checkIn || undefined, + checkOut: checkOut || undefined, + adults: adultsStr ? 
parseInt(adultsStr, 10) : undefined, }); if (!parsed.success) { @@ -22,6 +34,7 @@ export async function importListingAction(formData: FormData) { const normalizedUrl = normalizeAirbnbUrl(parsed.data.airbnbUrl); const externalId = extractAirbnbExternalId(normalizedUrl); + // Check for duplicates const duplicate = await prisma.listing.findFirst({ where: { OR: [ @@ -42,10 +55,31 @@ export async function importListingAction(formData: FormData) { }; } - const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl); - const title = scrapedData?.title || "Neues Airbnb"; + // Build trip context from form or URL + const tripContext = { + checkIn: parsed.data.checkIn, + checkOut: parsed.data.checkOut, + adults: parsed.data.adults || 4, + }; + + // Scrape with trip context for better price extraction + const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl, { tripContext }); + + const title = scrapedData?.title?.value || "Neues Airbnb"; const slug = `${slugify(title)}-${Date.now()}`; + // Calculate sleeping stats + let maxSleepingPlaces = scrapedData?.maxSleepingPlaces || null; + let suitableFor4 = scrapedData?.suitableFor4 || null; + let extraMattressesNeededFor4 = scrapedData?.extraMattressesNeededFor4 || null; + let bedTypesSummary = null; + + if (scrapedData?.sleepingOptions && scrapedData.sleepingOptions.length > 0) { + const types = scrapedData.sleepingOptions.map(o => `${o.quantity}× ${o.bedType}`); + bedTypesSummary = types.join(", "); + } + + // Create listing const listing = await prisma.listing.create({ data: { title, @@ -53,29 +87,54 @@ export async function importListingAction(formData: FormData) { airbnbUrl: parsed.data.airbnbUrl, normalizedUrl, externalId, - ...(scrapedData?.pricePerNight && { nightlyPrice: scrapedData.pricePerNight }), - ...(scrapedData?.rating && { rating: scrapedData.rating }), - ...(scrapedData?.reviewCount && { reviewCount: scrapedData.reviewCount }), - ...(scrapedData?.guestCount && { guestCount: scrapedData.guestCount 
}), - ...(scrapedData?.bedrooms && { bedrooms: scrapedData.bedrooms }), - ...(scrapedData?.beds && { beds: scrapedData.beds }), - ...(scrapedData?.bathrooms && { bathrooms: scrapedData.bathrooms }), - ...(scrapedData?.description && { description: scrapedData.description }), - ...(scrapedData?.hostName && { hostName: scrapedData.hostName }), - ...(scrapedData?.location && { locationText: scrapedData.location }), - ...(scrapedData?.latitude && { latitude: scrapedData.latitude }), - ...(scrapedData?.longitude && { longitude: scrapedData.longitude }), - ...(scrapedData?.cancellationPolicy && { cancellationPolicy: scrapedData.cancellationPolicy }), - ...(scrapedData?.images?.length && { coverImage: scrapedData.images[0] }), - ...(scrapedData?.amenities?.length && { amenities: JSON.stringify(scrapedData.amenities) }), + + // Location + locationText: scrapedData?.locationText?.value || null, + latitude: scrapedData?.latitude?.value || null, + longitude: scrapedData?.longitude?.value || null, + + // Pricing + nightlyPrice: scrapedData?.nightlyPrice?.value || null, + totalPrice: scrapedData?.totalPrice?.value || null, + currency: "EUR", + priceStatus: scrapedData?.priceStatus || "UNKNOWN", + + // Rating + rating: scrapedData?.rating?.value || null, + reviewCount: scrapedData?.reviewCount?.value || null, + + // Capacity + guestCount: scrapedData?.guestCount?.value || null, + officialGuestCount: scrapedData?.officialGuestCount?.value || null, + maxSleepingPlaces, + suitableFor4, + extraMattressesNeededFor4, + bedTypesSummary, + + // Room Details + bedrooms: scrapedData?.bedrooms?.value || null, + beds: scrapedData?.beds?.value || null, + bathrooms: scrapedData?.bathrooms?.value || null, + + // Description & Host + description: scrapedData?.description?.value || null, + hostName: scrapedData?.hostName?.value || null, + cancellationPolicy: scrapedData?.cancellationPolicy?.value || null, + + // Images + coverImage: scrapedData?.coverImage || null, + amenities: 
scrapedData?.amenities?.length ? JSON.stringify(scrapedData.amenities) : null, + + // Raw data for debugging rawSourceData: scrapedData ? JSON.stringify(scrapedData) : null, }, select: { id: true, slug: true }, }); + // Save images if (scrapedData?.images?.length) { await prisma.listingImage.createMany({ - data: scrapedData.images.map((url, index) => ({ + data: scrapedData.images.slice(0, 20).map((url, index) => ({ listingId: listing.id, url, sortOrder: index, @@ -83,6 +142,20 @@ export async function importListingAction(formData: FormData) { }); } + // Save sleeping options + if (scrapedData?.sleepingOptions?.length) { + await prisma.listingSleepingOption.createMany({ + data: scrapedData.sleepingOptions.map(opt => ({ + listingId: listing.id, + bedType: opt.bedType, + quantity: opt.quantity, + spotsPerUnit: opt.spotsPerUnit, + quality: opt.quality, + label: opt.label || null, + })), + }); + } + revalidatePath("/dashboard"); revalidatePath("/listings"); diff --git a/src/app/(protected)/admin/import/import-form.tsx b/src/app/(protected)/admin/import/import-form.tsx index 44797ad..330aeac 100644 --- a/src/app/(protected)/admin/import/import-form.tsx +++ b/src/app/(protected)/admin/import/import-form.tsx @@ -4,10 +4,14 @@ import { useState } from "react"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { importListingAction } from "@/actions/import-listing"; export function ImportForm() { const [url, setUrl] = useState(""); + const [checkIn, setCheckIn] = useState(""); + const [checkOut, setCheckOut] = useState(""); + const [adults, setAdults] = useState("4"); const [error, setError] = useState(""); const [success, setSuccess] = useState(false); const [isLoading, setIsLoading] = useState(false); @@ -20,6 +24,9 @@ export function ImportForm() { const formData = new FormData(); 
formData.append("airbnbUrl", url); + if (checkIn) formData.append("checkIn", checkIn); + if (checkOut) formData.append("checkOut", checkOut); + if (adults) formData.append("adults", adults); const result = await importListingAction(formData); @@ -33,25 +40,82 @@ export function ImportForm() { setIsLoading(false); }; + // Get today's date for min date + const today = new Date().toISOString().split('T')[0]; + return ( -
-
- - setUrl(e.target.value)} - required - autoFocus - /> -
- {error &&
{error}
} - {success &&
✓ Erfolgreich importiert!
} - -
+ + + 🏠 Neues Airbnb importieren + + +
+ {/* URL Field */} +
+ + setUrl(e.target.value)} + required + autoFocus + /> +
+ + {/* Trip Context Fields */} +
+ +
+
+ + setCheckIn(e.target.value)} + min={today} + placeholder="Datum" + /> +
+
+ + setCheckOut(e.target.value)} + min={checkIn || today} + placeholder="Datum" + /> +
+
+ + setAdults(e.target.value)} + /> +
+
+

+ 💡 Mit Reisedaten kann der Preis genauer ermittelt werden. + Die Daten werden auch aus der URL extrahiert wenn vorhanden. +

+
+ + {error &&
{error}
} + {success &&
✓ Erfolgreich importiert!
} + + +
+
+
); } diff --git a/src/app/(protected)/admin/listings/[slug]/page.tsx b/src/app/(protected)/admin/listings/[slug]/page.tsx index 64dba1f..5965e67 100644 --- a/src/app/(protected)/admin/listings/[slug]/page.tsx +++ b/src/app/(protected)/admin/listings/[slug]/page.tsx @@ -5,6 +5,7 @@ import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; import { updateListing, deleteListing, addNote, addTagToListing, removeTagFromListing } from "../actions"; +// Note: actions.ts is in /admin/listings/, so from [slug]/ we go up one level with ../ export default async function EditListingPage({ params, diff --git a/src/lib/airbnb/index.ts b/src/lib/airbnb/index.ts new file mode 100644 index 0000000..8550ffa --- /dev/null +++ b/src/lib/airbnb/index.ts @@ -0,0 +1,207 @@ +import * as cheerio from "cheerio"; +import { normalizeAirbnbUrlWithContext } from "./url-normalizer"; +import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns"; +import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping"; +import { extractPrice } from "./parsers/price"; +import { extractLocation } from "./parsers/location"; +import { parseJsonLd } from "./parsers/jsonld"; +import { + ExtractedListing, + FieldSource, + field, + mergeField, + TripContext, + SleepingDataQuality, + PriceStatus +} from "./types"; + +// ============================================ +// Main Scraper Function +// ============================================ + +export async function scrapeAirbnbListing( + url: string, + options?: { tripContext?: TripContext; usePlaywright?: boolean } +): Promise { + try { + // Step 1: Normalize URL and extract trip context + const normalized = normalizeAirbnbUrlWithContext(url); + + // Merge trip context from options with URL-extracted context + const tripContext: TripContext = { + checkIn: 
options?.tripContext?.checkIn || normalized.tripContext.checkIn, + checkOut: options?.tripContext?.checkOut || normalized.tripContext.checkOut, + adults: options?.tripContext?.adults || normalized.tripContext.adults || 4, + }; + + // Step 2: Fetch HTML + const html = await fetchHtml(normalized.normalized); + const $ = cheerio.load(html); + + // Step 3: Extract visible text for pattern matching + const visibleText = extractVisibleText(html); + + // Step 4: Run all parsers + const jsonldData = parseJsonLd($); + const capacityFacts = parseCapacityFacts(visibleText); + const ratingFacts = parseRating(visibleText); + const hostName = parseHost(visibleText); + const maxGuests = parseMaxGuests(visibleText); + const sleepingOptions = parseSleepingArrangements(visibleText); + const priceData = extractPrice(html, $, tripContext); + const locationData = extractLocation($, html); + + // Step 5: Build the result with priority: jsonld > text_pattern > derived + const result: ExtractedListing = { + // URLs + originalUrl: normalized.original, + normalizedUrl: normalized.normalized, + externalId: normalized.externalId, + + // Basic Info + title: mergeField( + jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null, + field(null, 'derived', 'low') + ), + description: mergeField( + jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null, + field(null, 'derived', 'low') + ), + + // Location + locationText: locationData.locationText, + latitude: mergeField( + jsonldData.latitude ? field(jsonldData.latitude, 'jsonld', 'high') : null, + locationData.latitude.value !== null ? locationData.latitude : field(null, 'derived', 'low') + ), + longitude: mergeField( + jsonldData.longitude ? field(jsonldData.longitude, 'jsonld', 'high') : null, + locationData.longitude.value !== null ? 
locationData.longitude : field(null, 'derived', 'low') + ), + + // Pricing + tripContext, + nightlyPrice: priceData.nightly, + totalPrice: priceData.total, + priceStatus: priceData.status, + + // Rating + rating: mergeField( + ratingFacts ? field(ratingFacts.rating, 'text_pattern', 'high') : null, + jsonldData.rating ? field(jsonldData.rating, 'jsonld', 'medium') : null + ), + reviewCount: mergeField( + ratingFacts && ratingFacts.reviewCount > 0 ? field(ratingFacts.reviewCount, 'text_pattern', 'high') : null, + jsonldData.reviewCount ? field(jsonldData.reviewCount, 'jsonld', 'medium') : null + ), + + // Capacity + guestCount: mergeField( + capacityFacts ? field(capacityFacts.guests, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + officialGuestCount: mergeField( + maxGuests ? field(maxGuests, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + bedrooms: mergeField( + capacityFacts ? field(capacityFacts.bedrooms, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + beds: mergeField( + capacityFacts ? field(capacityFacts.beds, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + bathrooms: mergeField( + capacityFacts ? field(capacityFacts.bathrooms, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + + // Sleeping + sleepingOptions, + maxSleepingPlaces: 0, + suitableFor4: false, + extraMattressesNeededFor4: 0, + sleepingDataQuality: 'UNKNOWN', + + // Host + hostName: mergeField( + hostName ? field(hostName, 'text_pattern', 'high') : null, + jsonldData.hostName ? 
field(jsonldData.hostName, 'jsonld', 'medium') : null + ), + + // Amenities + amenities: jsonldData.amenities || [], + + // Images + images: jsonldData.images || [], + coverImage: jsonldData.images?.[0] || null, + + // Other + cancellationPolicy: field(null, 'derived', 'low'), + + // Debug + rawSnippets: { + title: jsonldData.title || '', + visibleText: visibleText.substring(0, 2000), + }, + extractionLog: [ + `URL normalized: ${normalized.normalized}`, + `External ID: ${normalized.externalId}`, + `Trip context: ${JSON.stringify(tripContext)}`, + `Capacity facts: ${capacityFacts ? JSON.stringify(capacityFacts) : 'none'}`, + `Rating facts: ${ratingFacts ? JSON.stringify(ratingFacts) : 'none'}`, + `Sleeping options: ${sleepingOptions.length} found`, + ], + }; + + // Step 6: Calculate sleeping stats + if (sleepingOptions.length > 0) { + const stats = calculateSleepingStats(sleepingOptions); + result.maxSleepingPlaces = stats.maxSleepingPlaces; + result.suitableFor4 = stats.suitableFor4; + result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4; + result.sleepingDataQuality = 'EXACT'; + } else if (result.beds.value && result.guestCount.value) { + // Derive from beds and guest count + const derivedOptions = deriveSleepingFromBeds(result.beds.value, result.guestCount.value); + const stats = calculateSleepingStats(derivedOptions); + result.sleepingOptions = derivedOptions; + result.maxSleepingPlaces = stats.maxSleepingPlaces; + result.suitableFor4 = stats.suitableFor4; + result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4; + result.sleepingDataQuality = 'DERIVED'; + } + + return result; + } catch (error) { + console.error("Scraping failed:", error); + return null; + } +} + +// ============================================ +// HTML Fetcher +// ============================================ + +async function fetchHtml(url: string): Promise { + const response = await fetch(url, { + headers: { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 
10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7", + "Accept-Encoding": "gzip, deflate, br", + "Cache-Control": "no-cache", + }, + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status} for ${url}`); + } + + return response.text(); +} + +// Re-export utilities for backward compatibility +export { normalizeAirbnbUrlWithContext as normalizeAirbnbUrl } from "./url-normalizer"; +export { extractAirbnbExternalId } from "./url-normalizer"; diff --git a/src/lib/airbnb/parsers/jsonld.ts b/src/lib/airbnb/parsers/jsonld.ts new file mode 100644 index 0000000..1cf75b4 --- /dev/null +++ b/src/lib/airbnb/parsers/jsonld.ts @@ -0,0 +1,125 @@ +import * as cheerio from 'cheerio'; + +export interface JsonLdData { + title: string | null; + description: string | null; + locationText: string | null; + latitude: number | null; + longitude: number | null; + rating: number | null; + reviewCount: number | null; + images: string[]; + cancellationPolicy: string | null; + hostName: string | null; + amenities: string[]; +} + +/** + * Parse JSON-LD structured data from HTML + * Airbnb typically uses LodgingBusiness or Room schema + */ +export function parseJsonLd($: cheerio.CheerioAPI): JsonLdData { + const result: JsonLdData = { + title: null, + description: null, + locationText: null, + latitude: null, + longitude: null, + rating: null, + reviewCount: null, + images: [], + cancellationPolicy: null, + hostName: null, + amenities: [], + }; + + const jsonLdScript = $('script[type="application/ld+json"]').html(); + + if (!jsonLdScript) { + return result; + } + + try { + const jsonData = JSON.parse(jsonLdScript); + + // Check if it's a lodging business schema + if (jsonData["@type"] !== "LodgingBusiness" && jsonData["@type"] !== "Room") { + return result; + } + + // Title + if (jsonData.name) { 
+ result.title = jsonData.name; + } + + // Description + if (jsonData.description) { + result.description = jsonData.description; + } + + // Location + if (jsonData.address) { + const parts: string[] = []; + if (jsonData.address.addressLocality) parts.push(jsonData.address.addressLocality); + if (jsonData.address.addressRegion) parts.push(jsonData.address.addressRegion); + if (jsonData.address.addressCountry) parts.push(jsonData.address.addressCountry); + + if (parts.length > 0) { + result.locationText = parts.join(', '); + } + } + + // Coordinates + if (jsonData.geo) { + if (jsonData.geo.latitude) { + result.latitude = parseFloat(jsonData.geo.latitude); + } + if (jsonData.geo.longitude) { + result.longitude = parseFloat(jsonData.geo.longitude); + } + } + + // Rating + if (jsonData.aggregateRating) { + if (jsonData.aggregateRating.ratingValue) { + result.rating = parseFloat(jsonData.aggregateRating.ratingValue); + } + if (jsonData.aggregateRating.reviewCount) { + result.reviewCount = parseInt(jsonData.aggregateRating.reviewCount, 10); + } + } + + // Images + if (jsonData.image) { + const images = Array.isArray(jsonData.image) + ? 
jsonData.image.map((img: unknown) => { + const imgObj = img as Record; + return imgObj.url || imgObj['@id'] || String(img); + }) + : [jsonData.image.url || jsonData.image['@id'] || jsonData.image]; + result.images = images.filter(Boolean); + } + + // Cancellation Policy + if (jsonData.cancellationPolicy) { + result.cancellationPolicy = jsonData.cancellationPolicy; + } + + // Host name + if (jsonData.provider?.name) { + result.hostName = jsonData.provider.name; + } + + // Amenities + if (jsonData.amenityFeature && Array.isArray(jsonData.amenityFeature)) { + result.amenities = jsonData.amenityFeature + .map((f: unknown) => (f as { name?: string }).name) + .filter(Boolean); + } + + } catch (error) { + console.error('Failed to parse JSON-LD:', error); + } + + return result; +} diff --git a/src/lib/airbnb/parsers/location.ts b/src/lib/airbnb/parsers/location.ts new file mode 100644 index 0000000..6e81337 --- /dev/null +++ b/src/lib/airbnb/parsers/location.ts @@ -0,0 +1,118 @@ +import * as cheerio from 'cheerio'; +import { FieldSource } from '../types'; + +/** + * Extract location from multiple sources with priority: + * 1. JSON-LD address (handled separately) + * 2. "Where you'll be" section + * 3. Meta tags (og:locality, etc.) + * 4. Visible text patterns + */ +export function extractLocation( + $: cheerio.CheerioAPI, + html: string +): { locationText: FieldSource; latitude: FieldSource; longitude: FieldSource } { + + let locationText: string | null = null; + let locationSource: FieldSource['source'] = 'text_pattern'; + let latitude: number | null = null; + let longitude: number | null = null; + + // 1. 
Try "Where you'll be" section + const whereSection = $('[data-section-id="LOCATION_DEFAULT"]').text() || + $('section:contains("Where you\'ll be")').text() || + $('section:contains("Lage")').text(); + + if (whereSection) { + // Extract location from this section + const locationMatch = whereSection.match(/([A-Z][a-zäöüÄÖÜß]+(?:\s+[A-Z][a-zäöüÄÖÜß]+)*,\s*[A-Z][a-zäöüÄÖÜß]+)/); + if (locationMatch) { + locationText = locationMatch[1].trim(); + locationSource = 'dom'; + } + } + + // 2. Try meta tags + if (!locationText) { + const locality = $('meta[property="og:locality"]').attr('content') || + $('meta[name="location"]').attr('content'); + const region = $('meta[property="og:region"]').attr('content'); + const country = $('meta[property="og:country-name"]').attr('content'); + + if (locality) { + locationText = [locality, region, country].filter(Boolean).join(', '); + locationSource = 'meta'; + } + } + + // 3. Try text patterns like "Location: Berlin, Germany" + if (!locationText) { + const locationPattern = /(?:location|lage|standort)[:\s]+([A-Z][a-zäöüÄÖÜß]+(?:[\s,]+[A-Z][a-zäöüÄÖÜß]+)*)/i; + const match = html.match(locationPattern); + if (match) { + locationText = match[1].trim(); + locationSource = 'text_pattern'; + } + } + + // 4. 
Try extracting from title (e.g., "Apartment in Berlin · ...") + if (!locationText) { + const titlePattern = /(?:in|bei|am)\s+([A-Z][a-zäöüÄÖÜß]+(?:\s+[A-Z][a-zäöüÄÖÜß]+)?)\s*[·•]/; + const title = $('title').text(); + const match = title.match(titlePattern); + if (match) { + locationText = match[1].trim(); + locationSource = 'text_pattern'; + } + } + + // Extract coordinates from various sources + // Try data attributes + const latAttr = $('[data-lat]').attr('data-lat') || $('[data-latitude]').attr('data-latitude'); + const lngAttr = $('[data-lng]').attr('data-lng') || $('[data-longitude]').attr('data-longitude'); + + if (latAttr && lngAttr) { + latitude = parseFloat(latAttr); + longitude = parseFloat(lngAttr); + } + + // Try meta tags for coordinates + if (!latitude) { + const geoPosition = $('meta[name="geo.position"]').attr('content') || + $('meta[property="place:location:latitude"]').attr('content'); + if (geoPosition) { + const parts = geoPosition.split(/[;,]/); + if (parts.length >= 2) { + latitude = parseFloat(parts[0]); + longitude = parseFloat(parts[1]); + } else { + latitude = parseFloat(geoPosition); + } + } + } + + if (!longitude) { + const lngMeta = $('meta[property="place:location:longitude"]').attr('content'); + if (lngMeta) { + longitude = parseFloat(lngMeta); + } + } + + return { + locationText: { + value: locationText, + source: locationSource, + confidence: locationText ? 'medium' : 'low', + }, + latitude: { + value: latitude, + source: latitude ? 'dom' : 'text_pattern', + confidence: latitude ? 'high' : 'low', + }, + longitude: { + value: longitude, + source: longitude ? 'dom' : 'text_pattern', + confidence: longitude ? 
'high' : 'low', + }, + }; +} diff --git a/src/lib/airbnb/parsers/price.ts b/src/lib/airbnb/parsers/price.ts new file mode 100644 index 0000000..9d81e27 --- /dev/null +++ b/src/lib/airbnb/parsers/price.ts @@ -0,0 +1,102 @@ +import * as cheerio from 'cheerio'; +import { FieldSource, PriceStatus, TripContext } from '../types'; +import { parsePriceFromText } from './text-patterns'; + +/** + * Try to extract price from HTML using various selectors + */ +function tryExtractPriceFromHtml(html: string, $: cheerio.CheerioAPI): number | null { + // Try various price selectors that Airbnb might use + const priceSelectors = [ + '[data-testid="price-amount"]', + 'span[class*="Price"]', + 'span[class*="price"]', + '[itemprop="price"]', + '._1y6k3r2', + '._1dss1omb', + ]; + + for (const selector of priceSelectors) { + const element = $(selector).first(); + if (element.length) { + const text = element.text(); + const price = parsePriceFromText(text); + if (price !== null) { + return price; + } + } + } + + // Fallback: search entire HTML for price patterns + const priceFromHtml = parsePriceFromText(html); + if (priceFromHtml !== null) { + return priceFromHtml; + } + + return null; +} + +/** + * Extract price with trip context awareness + * + * CRITICAL: Price reliability depends on trip context + * - With check-in/check-out: Price is for those specific dates + * - Without trip context: Price may be a base/minimum price + */ +export function extractPrice( + html: string, + $: cheerio.CheerioAPI, + tripContext: TripContext +): { nightly: FieldSource; total: FieldSource; status: PriceStatus } { + + // No trip context = unreliable price + if (!tripContext.checkIn || !tripContext.checkOut) { + const extracted = tryExtractPriceFromHtml(html, $); + + if (extracted !== null) { + return { + nightly: { value: extracted, source: 'text_pattern', confidence: 'low' }, + total: { value: null, source: 'text_pattern', confidence: 'low' }, + status: 'REQUIRES_TRIP_CONTEXT', + }; + } + + return { + 
nightly: { value: null, source: 'text_pattern', confidence: 'low' }, + total: { value: null, source: 'text_pattern', confidence: 'low' }, + status: 'UNKNOWN', + }; + } + + // With trip context, try harder to extract + const extracted = tryExtractPriceFromHtml(html, $); + + if (extracted !== null) { + // Calculate nights for total price + let total: number | null = null; + try { + const checkIn = new Date(tripContext.checkIn); + const checkOut = new Date(tripContext.checkOut); + const nights = Math.round((checkOut.getTime() - checkIn.getTime()) / (1000 * 60 * 60 * 24)); + if (nights > 0) { + total = extracted * nights; + } + } catch { + // Invalid dates, skip total calculation + } + + return { + nightly: { value: extracted, source: 'text_pattern', confidence: 'medium' }, + total: total !== null + ? { value: total, source: 'derived', confidence: 'medium' } + : { value: null, source: 'text_pattern', confidence: 'low' }, + status: 'EXTRACTED', + }; + } + + return { + nightly: { value: null, source: 'text_pattern', confidence: 'low' }, + total: { value: null, source: 'text_pattern', confidence: 'low' }, + status: 'UNKNOWN', + }; +} diff --git a/src/lib/airbnb/parsers/sleeping.ts b/src/lib/airbnb/parsers/sleeping.ts new file mode 100644 index 0000000..87175a1 --- /dev/null +++ b/src/lib/airbnb/parsers/sleeping.ts @@ -0,0 +1,143 @@ +import { BedType, SleepingOption } from '../types'; + +/** + * Bed type configuration: maps text patterns to bed types, spots per unit, and quality + */ +export const BED_TYPE_CONFIG: Record = { + 'double bed': { type: 'DOUBLE', spots: 2, quality: 'FULL' }, + 'doppelbett': { type: 'DOUBLE', spots: 2, quality: 'FULL' }, + 'queen bed': { type: 'QUEEN', spots: 2, quality: 'FULL' }, + 'king bed': { type: 'KING', spots: 2, quality: 'FULL' }, + 'single bed': { type: 'SINGLE', spots: 1, quality: 'FULL' }, + 'twin bed': { type: 'SINGLE', spots: 1, quality: 'FULL' }, + 'einzelbett': { type: 'SINGLE', spots: 1, quality: 'FULL' }, + 'bunk bed': { type: 
'BUNK', spots: 2, quality: 'FULL' }, + 'etagenbett': { type: 'BUNK', spots: 2, quality: 'FULL' }, + 'sofa bed': { type: 'SOFA_BED', spots: 2, quality: 'FULL' }, + 'pull-out sofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' }, + 'schlafsofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' }, + 'couch': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' }, + 'sofa': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' }, + 'air mattress': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' }, + 'luftmatratze': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' }, + 'floor mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' }, + 'extra mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' }, + 'zusatzmatratze': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' }, + 'futon': { type: 'FUTON', spots: 1, quality: 'AUXILIARY' }, +}; + +// Pattern: "1 double bed" or "2 single beds" or "Bedroom 1: 1 queen bed" +const BED_PATTERN = /(?:(?:bedroom|schlafzimmer|room|zimmer)\s*\d*\s*:?\s*)?(\d+)\s+([a-z\s-]+?)(?:\s|$|,|\.)/gi; + +export interface SleepingStats { + maxSleepingPlaces: number; + suitableFor4: boolean; + extraMattressesNeededFor4: number; +} + +/** + * Parse sleeping arrangements from text + * Handles patterns like: + * - "1 double bed" + * - "2 single beds" + * - "Bedroom 1: 1 queen bed" + * - "Common space: 1 sofa bed" + */ +export function parseSleepingArrangements(text: string): SleepingOption[] { + const options: SleepingOption[] = []; + const lowerText = text.toLowerCase(); + + let match; + while ((match = BED_PATTERN.exec(lowerText)) !== null) { + const quantity = parseInt(match[1], 10); + const bedTypeText = match[2].trim(); + + // Find matching bed type config + let matchedConfig: { type: BedType; spots: number; quality: 'FULL' | 'AUXILIARY' } | null = null; + let matchedLabel = ''; + + for (const [pattern, config] of Object.entries(BED_TYPE_CONFIG)) { + if (bedTypeText.includes(pattern) || pattern.includes(bedTypeText)) 
{ + matchedConfig = config; + matchedLabel = pattern; + break; + } + } + + if (matchedConfig && quantity > 0) { + // Check if this bed type already exists + const existing = options.find(o => o.bedType === matchedConfig!.type); + if (existing) { + existing.quantity += quantity; + } else { + options.push({ + bedType: matchedConfig.type, + quantity, + spotsPerUnit: matchedConfig.spots, + quality: matchedConfig.quality, + label: matchedLabel, + rawText: match[0].trim(), + }); + } + } + } + + return options; +} + +/** + * Calculate sleeping statistics from options + */ +export function calculateSleepingStats(options: SleepingOption[]): SleepingStats { + const maxSleepingPlaces = options.reduce( + (sum, opt) => sum + opt.quantity * opt.spotsPerUnit, + 0 + ); + + const suitableFor4 = maxSleepingPlaces >= 4; + + // Calculate extra mattresses needed for 4 people + // Only count FULL quality beds first + const fullQualitySpots = options + .filter(o => o.quality === 'FULL') + .reduce((sum, opt) => sum + opt.quantity * opt.spotsPerUnit, 0); + + const extraMattressesNeededFor4 = Math.max(0, 4 - fullQualitySpots); + + return { + maxSleepingPlaces, + suitableFor4, + extraMattressesNeededFor4, + }; +} + +/** + * Derive sleeping options from bed count (fallback with low confidence) + * Used when detailed sleeping arrangement text is not available + */ +export function deriveSleepingFromBeds(beds: number, guestCount: number): SleepingOption[] { + if (!beds || beds < 1) return []; + + // Assume beds are double beds if guest count suggests it + const avgGuestsPerBed = guestCount ? 
guestCount / beds : 2; + + if (avgGuestsPerBed >= 1.5) { + // Likely double beds + return [{ + bedType: 'DOUBLE', + quantity: beds, + spotsPerUnit: 2, + quality: 'FULL', + label: 'double bed (derived)', + }]; + } else { + // Likely single beds + return [{ + bedType: 'SINGLE', + quantity: beds, + spotsPerUnit: 1, + quality: 'FULL', + label: 'single bed (derived)', + }]; + } +} diff --git a/src/lib/airbnb/parsers/text-patterns.ts b/src/lib/airbnb/parsers/text-patterns.ts new file mode 100644 index 0000000..098d13f --- /dev/null +++ b/src/lib/airbnb/parsers/text-patterns.ts @@ -0,0 +1,123 @@ +/** + * Text pattern parsers for extracting data from visible HTML text + * Supports both German and English patterns + */ + +// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants +const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?)/i; + +// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen" +const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i; + +// "Hosted by David" or "Gehostet von David" +const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i; + +// "€ 150 / night" or "$150 per night" or "150 € pro Nacht" +const PRICE_PATTERN = /[€$]?\s*(\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i; + +// "6 guests maximum" or "max. 
// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
// German "maximal 6 Gäste", "bis zu 6 Gästen" and singular "Gast" are accepted too.
// Group 1 is set for the "<keyword> <n> guests" order, group 2 for "<n> guests <keyword>".
const MAX_GUESTS_PATTERN = /(?:maximum|maximal|max\.?|up to|bis zu)\s*(\d+)\s*(?:guests?|gast|gäste)|(\d+)\s*(?:guests?|gast|gäste)\s*(?:maximum|maximal|max\.?)/i;

/** Capacity line of a listing ("2 guests · 1 bedroom · 2 beds · 1 bath"). */
export interface CapacityFacts {
  guests: number;
  bedrooms: number;
  beds: number;
  /** May be fractional, e.g. 1.5. */
  bathrooms: number;
}

/** Rating together with the number of reviews it is based on. */
export interface RatingFacts {
  rating: number;
  /** 0 when the matched text carried no review count. */
  reviewCount: number;
}

/**
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath".
 *
 * @returns The four numbers, or `null` when the text does not match CAPACITY_PATTERN.
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  const match = CAPACITY_PATTERN.exec(text);
  if (match === null) return null;

  const [, guests, bedrooms, beds, bathrooms] = match;
  return {
    guests: parseInt(guests, 10),
    bedrooms: parseInt(bedrooms, 10),
    beds: parseInt(beds, 10),
    // Bathrooms may be written with a decimal comma ("1,5").
    bathrooms: parseFloat(bathrooms.replace(',', '.')),
  };
}

/**
 * Parse rating from text like "4.88 · 200 reviews".
 *
 * @returns Rating and review count (0 when the count group is missing), or
 *   `null` when nothing matches or the rating is not a number.
 */
export function parseRating(text: string): RatingFacts | null {
  const match = RATING_PATTERN.exec(text);
  if (match === null) return null;

  const rating = parseFloat(match[1].replace(',', '.'));
  if (Number.isNaN(rating)) return null;

  const reviewCount = match[2] ? parseInt(match[2], 10) : 0;
  return { rating, reviewCount };
}

/**
 * Parse host name from text like "Hosted by David".
 *
 * @returns The trimmed name, or `null` when there is no match or the captured
 *   name is empty after trimming.
 */
export function parseHost(text: string): string | null {
  const match = HOST_PATTERN.exec(text);
  if (match === null) return null;

  const name = match[1].trim();
  return name.length > 0 ? name : null;
}

/**
 * Parse price from text like "€ 150 / night".
 *
 * @returns The nightly amount, or `null` when PRICE_PATTERN does not match or
 *   the captured amount is not a number.
 */
export function parsePriceFromText(text: string): number | null {
  const match = PRICE_PATTERN.exec(text);
  if (match === null) return null;

  // A decimal comma ("150,50") is normalised to a dot before parsing.
  const price = parseFloat(match[1].replace(',', '.'));
  return Number.isNaN(price) ? null : price;
}

/**
 * Parse max guests from text like "6 guests maximum".
 *
 * @returns The guest limit, or `null` when MAX_GUESTS_PATTERN does not match.
 */
export function parseMaxGuests(text: string): number | null {
  const match = MAX_GUESTS_PATTERN.exec(text);
  if (match === null) return null;

  // Exactly one of the two groups is set, depending on the word order.
  const value = match[1] ?? match[2];
  return value ? parseInt(value, 10) : null;
}

/** Named entities decoded by extractVisibleText; anything else is left untouched. */
const HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

/**
 * Decode the supported named entities plus decimal (`&#65;`) and hexadecimal
 * (`&#x41;`) character references in a single pass, so that `&amp;lt;` becomes
 * the literal text `&lt;` rather than `<`.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string): string => {
    if (body.charAt(0) === '#') {
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return HTML_NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
  });
}

/**
 * Extract all text content from HTML for pattern matching.
 *
 * Steps: drop `<script>` and `<style>` elements including their content, turn
 * closing block-level tags into separators, strip every remaining tag, decode
 * entities, then collapse all whitespace runs into single spaces and trim.
 *
 * NOTE(review): the final whitespace normalisation also collapses the newlines
 * inserted for block elements, so the result is always a single line. That
 * matches the code as written, but HOST_PATTERN stops at "\n" — confirm
 * whether line breaks were meant to survive.
 *
 * @param html - Raw HTML markup.
 * @returns The visible text as one whitespace-normalised line.
 */
export function extractVisibleText(html: string): string {
  const withoutCode = html
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, ' ')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, ' ');

  const withoutTags = withoutCode
    // Closing block-level tags become separators so adjacent blocks do not fuse.
    .replace(/<\/(div|p|br|li|tr|td|th|h[1-6]|section|article|header|footer)[^>]*>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');

  // Entities are decoded only after tag stripping, so a decoded "<" is kept as text.
  return decodeHtmlEntities(withoutTags).replace(/\s+/g, ' ').trim();
}

// --- src/lib/airbnb/types.ts (new file in this patch) ---

/** Where an extracted value came from. */
export type DataSource = 'jsonld' | 'meta' | 'text_pattern' | 'dom' | 'playwright' | 'derived' | 'manual';
/** How much an extracted value can be trusted. */
export type Confidence = 'high' | 'medium' | 'low';
export type PriceStatus = 'EXTRACTED' | 'REQUIRES_TRIP_CONTEXT' | 'UNKNOWN' | 'PARTIAL';
export type SleepingDataQuality = 'EXACT' | 'DERIVED' | 'UNKNOWN';

/**
 * A value annotated with its origin and confidence.
 * The type parameter defaults to `unknown` so a bare `FieldSource` reference
 * still type-checks.
 */
export interface FieldSource<T = unknown> {
  value: T;
  source: DataSource;
  confidence: Confidence;
}

/**
 * Create a FieldSource object with value, source, and confidence
 */
export function field<T>(value: T, source: DataSource, confidence: Confidence): FieldSource<T> {
  return { value, source, confidence };
}
/**
 * Merge two FieldSources - takes the first one whose value is neither null nor
 * undefined. Priority: primary over secondary.
 *
 * Falsy but present values (0, '', false) count as present.
 *
 * @returns `primary` or `secondary` unchanged, or a placeholder with a null
 *   value, source 'derived' and confidence 'low' when neither carries a value.
 */
export function mergeField<T>(primary: FieldSource<T> | null, secondary: FieldSource<T> | null): FieldSource<T> {
  for (const candidate of [primary, secondary]) {
    if (candidate != null && candidate.value != null) {
      return candidate;
    }
  }
  // Return null with lowest confidence
  return { value: null as T, source: 'derived', confidence: 'low' };
}

export type BedType = 'DOUBLE' | 'SINGLE' | 'SOFA_BED' | 'SOFA' | 'AIR_MATTRESS' | 'FUTON' | 'BUNK' | 'EXTRA_MATTRESS' | 'QUEEN' | 'KING' | 'UNKNOWN';

/** One kind of sleeping place offered by a listing. */
export interface SleepingOption {
  bedType: BedType;
  /** Number of units of this bed type. */
  quantity: number;
  /** Sleeping spots provided by one unit. */
  spotsPerUnit: number;
  quality: 'FULL' | 'AUXILIARY';
  label?: string;
  rawText?: string;
}

/** Trip parameters taken from the import form or from URL query parameters. */
export interface TripContext {
  checkIn?: string;
  checkOut?: string;
  adults?: number;
}

/** Result of normalizeAirbnbUrlWithContext. */
export interface NormalizedUrl {
  original: string;
  normalized: string;
  externalId: string | null;
  tripContext: TripContext;
}

/**
 * Everything the scraper extracts for one listing.
 *
 * NOTE(review): the generic arguments of `FieldSource` and `Record` were lost
 * in this copy of the patch. The `T | null` arguments below are reconstructed
 * from the field names and from mergeField's null fallback — confirm against
 * the repository.
 */
export interface ExtractedListing {
  // URLs
  originalUrl: string;
  normalizedUrl: string;
  externalId: string | null;

  // Basic Info
  title: FieldSource<string | null>;
  description: FieldSource<string | null>;

  // Location
  locationText: FieldSource<string | null>;
  latitude: FieldSource<number | null>;
  longitude: FieldSource<number | null>;

  // Pricing
  tripContext: TripContext;
  nightlyPrice: FieldSource<number | null>;
  totalPrice: FieldSource<number | null>;
  priceStatus: PriceStatus;

  // Rating
  rating: FieldSource<number | null>;
  reviewCount: FieldSource<number | null>;

  // Capacity
  guestCount: FieldSource<number | null>;
  officialGuestCount: FieldSource<number | null>;
  bedrooms: FieldSource<number | null>;
  beds: FieldSource<number | null>;
  bathrooms: FieldSource<number | null>;

  // Sleeping
  sleepingOptions: SleepingOption[];
  maxSleepingPlaces: number;
  suitableFor4: boolean;
  extraMattressesNeededFor4: number;
  sleepingDataQuality: SleepingDataQuality;

  // Host
  hostName: FieldSource<string | null>;

  // Amenities
  amenities: string[];

  // Images
  images: string[];
  coverImage: string | null;

  // Other
  cancellationPolicy: FieldSource<string | null>;

  // Debug
  rawSnippets: Record<string, string>;
  extractionLog: string[];
}

// --- src/lib/airbnb/url-normalizer.ts (new file in this patch) ---
// In the separate file this section starts with:
//   import { TripContext, NormalizedUrl } from './types';

/**
 * Extracts the Airbnb listing ID from a URL
 * Matches patterns like /rooms/12345 or /rooms/12345/
 *
 * @returns The numeric ID as a string, or `null` when the URL has no
 *   `/rooms/<digits>` segment.
 */
export function extractAirbnbExternalId(url: string): string | null {
  const match = /\/rooms\/(\d+)/.exec(url);
  return match?.[1] ?? null;
}

/**
 * Extracts trip context from URL query parameters
 * Looks for: check_in / checkIn, check_out / checkOut, adults / adults[]
 *
 * Empty parameter values are treated as missing. `adults` is only reported
 * when it parses to a positive integer, so a non-numeric value yields
 * `undefined` instead of `NaN`.
 *
 * @returns The trip context, or `{}` when `url` cannot be parsed as a URL.
 */
export function extractTripContext(url: string): TripContext {
  let params: URLSearchParams;
  try {
    params = new URL(url).searchParams;
  } catch {
    return {};
  }

  // First non-empty value among the given parameter names.
  const firstParam = (...names: string[]): string | undefined => {
    for (const name of names) {
      const value = params.get(name);
      if (value) return value;
    }
    return undefined;
  };

  const adultsRaw = firstParam('adults', 'adults[]');
  const adults = adultsRaw === undefined ? Number.NaN : Number.parseInt(adultsRaw, 10);

  return {
    checkIn: firstParam('check_in', 'checkIn'),
    checkOut: firstParam('check_out', 'checkOut'),
    adults: Number.isInteger(adults) && adults > 0 ? adults : undefined,
  };
}

/**
 * Normalizes an Airbnb URL by:
 * - Removing hash
 * - Removing query params (trip context extracted separately)
 * - Removing trailing slashes
 * - Removing www prefix
 * - Lowercasing hostname
 *
 * @returns The normalized URL, or the trimmed input when it is not a valid URL.
 */
export function normalizeAirbnbUrl(url: string): string {
  const trimmed = url.trim();
  try {
    const parsed = new URL(trimmed);
    parsed.hash = '';
    parsed.search = '';
    parsed.pathname = parsed.pathname.replace(/\/+$/, '');
    parsed.hostname = parsed.hostname.replace(/^www\./, '').toLowerCase();
    return parsed.toString();
  } catch {
    return trimmed;
  }
}

/**
 * Main function: Normalizes URL and extracts all metadata
 *
 * The external ID is read from the normalized URL; the trip context is read
 * from the original URL, because normalization removes the query string.
 */
export function normalizeAirbnbUrlWithContext(url: string): NormalizedUrl {
  const original = url.trim();
  const normalized = normalizeAirbnbUrl(original);

  return {
    original,
    normalized,
    externalId: extractAirbnbExternalId(normalized),
    tripContext: extractTripContext(original),
  };
}