fix: add field/mergeField helpers, priceStatus, trip context support

- Add field() and mergeField() helper functions to types.ts
- Fix location parser to use correct html parameter
- Add priceStatus to import action
- Import form already has trip context fields (checkIn, checkOut, adults)
- Build now passes successfully
This commit is contained in:
AI 2026-03-11 15:55:45 +00:00
parent 4fd675431b
commit 13bbe9d147
14 changed files with 1180 additions and 39 deletions

0
prisma/dev.db Normal file
View File

Binary file not shown.

View File

@ -31,6 +31,7 @@ model Listing {
nightlyPrice Float? @map("nightly_price")
totalPrice Float? @map("total_price")
currency String? @default("EUR")
priceStatus String? @map("price_status") // EXTRACTED, REQUIRES_TRIP_CONTEXT, UNKNOWN, PARTIAL
// Rating
rating Float?

View File

@ -2,17 +2,29 @@
import { z } from "zod";
import { prisma } from "@/lib/prisma";
import { scrapeAirbnbListing, extractAirbnbExternalId, normalizeAirbnbUrl } from "@/lib/airbnb-scraper";
import { scrapeAirbnbListing } from "@/lib/airbnb";
import { normalizeAirbnbUrl, extractAirbnbExternalId } from "@/lib/airbnb/url-normalizer";
import { slugify } from "@/lib/utils";
import { revalidatePath } from "next/cache";
const schema = z.object({
airbnbUrl: z.string().url("Ungültige URL"),
checkIn: z.string().optional(),
checkOut: z.string().optional(),
adults: z.number().optional(),
});
export async function importListingAction(formData: FormData) {
const url = formData.get("airbnbUrl") as string;
const checkIn = formData.get("checkIn") as string | null;
const checkOut = formData.get("checkOut") as string | null;
const adultsStr = formData.get("adults") as string | null;
const parsed = schema.safeParse({
airbnbUrl: formData.get("airbnbUrl"),
airbnbUrl: url,
checkIn: checkIn || undefined,
checkOut: checkOut || undefined,
adults: adultsStr ? parseInt(adultsStr, 10) : undefined,
});
if (!parsed.success) {
@ -22,6 +34,7 @@ export async function importListingAction(formData: FormData) {
const normalizedUrl = normalizeAirbnbUrl(parsed.data.airbnbUrl);
const externalId = extractAirbnbExternalId(normalizedUrl);
// Check for duplicates
const duplicate = await prisma.listing.findFirst({
where: {
OR: [
@ -42,10 +55,31 @@ export async function importListingAction(formData: FormData) {
};
}
const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl);
const title = scrapedData?.title || "Neues Airbnb";
// Build trip context from form or URL
const tripContext = {
checkIn: parsed.data.checkIn,
checkOut: parsed.data.checkOut,
adults: parsed.data.adults || 4,
};
// Scrape with trip context for better price extraction
const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl, { tripContext });
const title = scrapedData?.title?.value || "Neues Airbnb";
const slug = `${slugify(title)}-${Date.now()}`;
// Calculate sleeping stats
let maxSleepingPlaces = scrapedData?.maxSleepingPlaces || null;
let suitableFor4 = scrapedData?.suitableFor4 || null;
let extraMattressesNeededFor4 = scrapedData?.extraMattressesNeededFor4 || null;
let bedTypesSummary = null;
if (scrapedData?.sleepingOptions && scrapedData.sleepingOptions.length > 0) {
const types = scrapedData.sleepingOptions.map(o => `${o.quantity}× ${o.bedType}`);
bedTypesSummary = types.join(", ");
}
// Create listing
const listing = await prisma.listing.create({
data: {
title,
@ -53,29 +87,54 @@ export async function importListingAction(formData: FormData) {
airbnbUrl: parsed.data.airbnbUrl,
normalizedUrl,
externalId,
...(scrapedData?.pricePerNight && { nightlyPrice: scrapedData.pricePerNight }),
...(scrapedData?.rating && { rating: scrapedData.rating }),
...(scrapedData?.reviewCount && { reviewCount: scrapedData.reviewCount }),
...(scrapedData?.guestCount && { guestCount: scrapedData.guestCount }),
...(scrapedData?.bedrooms && { bedrooms: scrapedData.bedrooms }),
...(scrapedData?.beds && { beds: scrapedData.beds }),
...(scrapedData?.bathrooms && { bathrooms: scrapedData.bathrooms }),
...(scrapedData?.description && { description: scrapedData.description }),
...(scrapedData?.hostName && { hostName: scrapedData.hostName }),
...(scrapedData?.location && { locationText: scrapedData.location }),
...(scrapedData?.latitude && { latitude: scrapedData.latitude }),
...(scrapedData?.longitude && { longitude: scrapedData.longitude }),
...(scrapedData?.cancellationPolicy && { cancellationPolicy: scrapedData.cancellationPolicy }),
...(scrapedData?.images?.length && { coverImage: scrapedData.images[0] }),
...(scrapedData?.amenities?.length && { amenities: JSON.stringify(scrapedData.amenities) }),
// Location
locationText: scrapedData?.locationText?.value || null,
latitude: scrapedData?.latitude?.value || null,
longitude: scrapedData?.longitude?.value || null,
// Pricing
nightlyPrice: scrapedData?.nightlyPrice?.value || null,
totalPrice: scrapedData?.totalPrice?.value || null,
currency: "EUR",
priceStatus: scrapedData?.priceStatus || "UNKNOWN",
// Rating
rating: scrapedData?.rating?.value || null,
reviewCount: scrapedData?.reviewCount?.value || null,
// Capacity
guestCount: scrapedData?.guestCount?.value || null,
officialGuestCount: scrapedData?.officialGuestCount?.value || null,
maxSleepingPlaces,
suitableFor4,
extraMattressesNeededFor4,
bedTypesSummary,
// Room Details
bedrooms: scrapedData?.bedrooms?.value || null,
beds: scrapedData?.beds?.value || null,
bathrooms: scrapedData?.bathrooms?.value || null,
// Description & Host
description: scrapedData?.description?.value || null,
hostName: scrapedData?.hostName?.value || null,
cancellationPolicy: scrapedData?.cancellationPolicy?.value || null,
// Images
coverImage: scrapedData?.coverImage || null,
amenities: scrapedData?.amenities?.length ? JSON.stringify(scrapedData.amenities) : null,
// Raw data for debugging
rawSourceData: scrapedData ? JSON.stringify(scrapedData) : null,
},
select: { id: true, slug: true },
});
// Save images
if (scrapedData?.images?.length) {
await prisma.listingImage.createMany({
data: scrapedData.images.map((url, index) => ({
data: scrapedData.images.slice(0, 20).map((url, index) => ({
listingId: listing.id,
url,
sortOrder: index,
@ -83,6 +142,20 @@ export async function importListingAction(formData: FormData) {
});
}
// Save sleeping options
if (scrapedData?.sleepingOptions?.length) {
await prisma.listingSleepingOption.createMany({
data: scrapedData.sleepingOptions.map(opt => ({
listingId: listing.id,
bedType: opt.bedType,
quantity: opt.quantity,
spotsPerUnit: opt.spotsPerUnit,
quality: opt.quality,
label: opt.label || null,
})),
});
}
revalidatePath("/dashboard");
revalidatePath("/listings");

View File

@ -4,10 +4,14 @@ import { useState } from "react";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
import { importListingAction } from "@/actions/import-listing";
export function ImportForm() {
const [url, setUrl] = useState("");
const [checkIn, setCheckIn] = useState("");
const [checkOut, setCheckOut] = useState("");
const [adults, setAdults] = useState("4");
const [error, setError] = useState("");
const [success, setSuccess] = useState(false);
const [isLoading, setIsLoading] = useState(false);
@ -20,6 +24,9 @@ export function ImportForm() {
const formData = new FormData();
formData.append("airbnbUrl", url);
if (checkIn) formData.append("checkIn", checkIn);
if (checkOut) formData.append("checkOut", checkOut);
if (adults) formData.append("adults", adults);
const result = await importListingAction(formData);
@ -33,25 +40,82 @@ export function ImportForm() {
setIsLoading(false);
};
// Get today's date for min date
const today = new Date().toISOString().split('T')[0];
return (
<form onSubmit={handleSubmit} className="space-y-4">
<div className="space-y-2">
<Label htmlFor="airbnb-url">Airbnb Link</Label>
<Input
id="airbnb-url"
type="url"
placeholder="https://www.airbnb.com/rooms/..."
value={url}
onChange={(e) => setUrl(e.target.value)}
required
autoFocus
/>
</div>
{error && <div className="text-red-500 text-sm">{error}</div>}
{success && <div className="text-green-500 text-sm"> Erfolgreich importiert!</div>}
<Button type="submit" className="w-full" disabled={isLoading || !url}>
{isLoading ? "Wird importiert..." : "Importieren"}
</Button>
</form>
<Card>
<CardHeader>
<CardTitle>🏠 Neues Airbnb importieren</CardTitle>
</CardHeader>
<CardContent>
<form onSubmit={handleSubmit} className="space-y-4">
{/* URL Field */}
<div className="space-y-2">
<Label htmlFor="airbnb-url">Airbnb Link</Label>
<Input
id="airbnb-url"
type="url"
placeholder="https://www.airbnb.com/rooms/..."
value={url}
onChange={(e) => setUrl(e.target.value)}
required
autoFocus
/>
</div>
{/* Trip Context Fields */}
<div className="space-y-2">
<Label>Reisedaten (optional für bessere Preise)</Label>
<div className="grid grid-cols-3 gap-2">
<div>
<Label htmlFor="check-in" className="text-xs">Check-in</Label>
<Input
id="check-in"
type="date"
value={checkIn}
onChange={(e) => setCheckIn(e.target.value)}
min={today}
placeholder="Datum"
/>
</div>
<div>
<Label htmlFor="check-out" className="text-xs">Check-out</Label>
<Input
id="check-out"
type="date"
value={checkOut}
onChange={(e) => setCheckOut(e.target.value)}
min={checkIn || today}
placeholder="Datum"
/>
</div>
<div>
<Label htmlFor="adults" className="text-xs">Personen</Label>
<Input
id="adults"
type="number"
min="1"
max="16"
value={adults}
onChange={(e) => setAdults(e.target.value)}
/>
</div>
</div>
<p className="text-xs text-slate-500">
💡 Mit Reisedaten kann der Preis genauer ermittelt werden.
Die Daten werden auch aus der URL extrahiert wenn vorhanden.
</p>
</div>
{error && <div className="text-red-500 text-sm">{error}</div>}
{success && <div className="text-green-500 text-sm"> Erfolgreich importiert!</div>}
<Button type="submit" className="w-full" disabled={isLoading || !url}>
{isLoading ? "⏳ Wird importiert..." : "🚀 Importieren"}
</Button>
</form>
</CardContent>
</Card>
);
}

View File

@ -5,6 +5,7 @@ import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { updateListing, deleteListing, addNote, addTagToListing, removeTagFromListing } from "../actions";
// Note: actions.ts is in /admin/listings/, so from [slug]/ we go up one level with ../
export default async function EditListingPage({
params,

207
src/lib/airbnb/index.ts Normal file
View File

@ -0,0 +1,207 @@
import * as cheerio from "cheerio";
import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns";
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
import { extractPrice } from "./parsers/price";
import { extractLocation } from "./parsers/location";
import { parseJsonLd } from "./parsers/jsonld";
import {
ExtractedListing,
FieldSource,
field,
mergeField,
TripContext,
SleepingDataQuality,
PriceStatus
} from "./types";
// ============================================
// Main Scraper Function
// ============================================
export async function scrapeAirbnbListing(
url: string,
options?: { tripContext?: TripContext; usePlaywright?: boolean }
): Promise<ExtractedListing | null> {
try {
// Step 1: Normalize URL and extract trip context
const normalized = normalizeAirbnbUrlWithContext(url);
// Merge trip context from options with URL-extracted context
const tripContext: TripContext = {
checkIn: options?.tripContext?.checkIn || normalized.tripContext.checkIn,
checkOut: options?.tripContext?.checkOut || normalized.tripContext.checkOut,
adults: options?.tripContext?.adults || normalized.tripContext.adults || 4,
};
// Step 2: Fetch HTML
const html = await fetchHtml(normalized.normalized);
const $ = cheerio.load(html);
// Step 3: Extract visible text for pattern matching
const visibleText = extractVisibleText(html);
// Step 4: Run all parsers
const jsonldData = parseJsonLd($);
const capacityFacts = parseCapacityFacts(visibleText);
const ratingFacts = parseRating(visibleText);
const hostName = parseHost(visibleText);
const maxGuests = parseMaxGuests(visibleText);
const sleepingOptions = parseSleepingArrangements(visibleText);
const priceData = extractPrice(html, $, tripContext);
const locationData = extractLocation($, html);
// Step 5: Build the result with priority: jsonld > text_pattern > derived
const result: ExtractedListing = {
// URLs
originalUrl: normalized.original,
normalizedUrl: normalized.normalized,
externalId: normalized.externalId,
// Basic Info
title: mergeField(
jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
field(null, 'derived', 'low')
),
description: mergeField(
jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
field(null, 'derived', 'low')
),
// Location
locationText: locationData.locationText,
latitude: mergeField(
jsonldData.latitude ? field(jsonldData.latitude, 'jsonld', 'high') : null,
locationData.latitude.value !== null ? locationData.latitude : field(null, 'derived', 'low')
),
longitude: mergeField(
jsonldData.longitude ? field(jsonldData.longitude, 'jsonld', 'high') : null,
locationData.longitude.value !== null ? locationData.longitude : field(null, 'derived', 'low')
),
// Pricing
tripContext,
nightlyPrice: priceData.nightly,
totalPrice: priceData.total,
priceStatus: priceData.status,
// Rating
rating: mergeField(
ratingFacts ? field(ratingFacts.rating, 'text_pattern', 'high') : null,
jsonldData.rating ? field(jsonldData.rating, 'jsonld', 'medium') : null
),
reviewCount: mergeField(
ratingFacts && ratingFacts.reviewCount > 0 ? field(ratingFacts.reviewCount, 'text_pattern', 'high') : null,
jsonldData.reviewCount ? field(jsonldData.reviewCount, 'jsonld', 'medium') : null
),
// Capacity
guestCount: mergeField(
capacityFacts ? field(capacityFacts.guests, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
officialGuestCount: mergeField(
maxGuests ? field(maxGuests, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
bedrooms: mergeField(
capacityFacts ? field(capacityFacts.bedrooms, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
beds: mergeField(
capacityFacts ? field(capacityFacts.beds, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
bathrooms: mergeField(
capacityFacts ? field(capacityFacts.bathrooms, 'text_pattern', 'high') : null,
field(null, 'derived', 'low')
),
// Sleeping
sleepingOptions,
maxSleepingPlaces: 0,
suitableFor4: false,
extraMattressesNeededFor4: 0,
sleepingDataQuality: 'UNKNOWN',
// Host
hostName: mergeField(
hostName ? field(hostName, 'text_pattern', 'high') : null,
jsonldData.hostName ? field(jsonldData.hostName, 'jsonld', 'medium') : null
),
// Amenities
amenities: jsonldData.amenities || [],
// Images
images: jsonldData.images || [],
coverImage: jsonldData.images?.[0] || null,
// Other
cancellationPolicy: field(null, 'derived', 'low'),
// Debug
rawSnippets: {
title: jsonldData.title || '',
visibleText: visibleText.substring(0, 2000),
},
extractionLog: [
`URL normalized: ${normalized.normalized}`,
`External ID: ${normalized.externalId}`,
`Trip context: ${JSON.stringify(tripContext)}`,
`Capacity facts: ${capacityFacts ? JSON.stringify(capacityFacts) : 'none'}`,
`Rating facts: ${ratingFacts ? JSON.stringify(ratingFacts) : 'none'}`,
`Sleeping options: ${sleepingOptions.length} found`,
],
};
// Step 6: Calculate sleeping stats
if (sleepingOptions.length > 0) {
const stats = calculateSleepingStats(sleepingOptions);
result.maxSleepingPlaces = stats.maxSleepingPlaces;
result.suitableFor4 = stats.suitableFor4;
result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
result.sleepingDataQuality = 'EXACT';
} else if (result.beds.value && result.guestCount.value) {
// Derive from beds and guest count
const derivedOptions = deriveSleepingFromBeds(result.beds.value, result.guestCount.value);
const stats = calculateSleepingStats(derivedOptions);
result.sleepingOptions = derivedOptions;
result.maxSleepingPlaces = stats.maxSleepingPlaces;
result.suitableFor4 = stats.suitableFor4;
result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
result.sleepingDataQuality = 'DERIVED';
}
return result;
} catch (error) {
console.error("Scraping failed:", error);
return null;
}
}
// ============================================
// HTML Fetcher
// ============================================
async function fetchHtml(url: string): Promise<string> {
const response = await fetch(url, {
headers: {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "no-cache",
},
});
if (!response.ok) {
throw new Error(`HTTP ${response.status} for ${url}`);
}
return response.text();
}
// Re-export utilities for backward compatibility
export { normalizeAirbnbUrlWithContext as normalizeAirbnbUrl } from "./url-normalizer";
export { extractAirbnbExternalId } from "./url-normalizer";

View File

@ -0,0 +1,125 @@
import * as cheerio from 'cheerio';
export interface JsonLdData {
title: string | null;
description: string | null;
locationText: string | null;
latitude: number | null;
longitude: number | null;
rating: number | null;
reviewCount: number | null;
images: string[];
cancellationPolicy: string | null;
hostName: string | null;
amenities: string[];
}
/**
* Parse JSON-LD structured data from HTML
* Airbnb typically uses LodgingBusiness or Room schema
*/
export function parseJsonLd($: cheerio.CheerioAPI): JsonLdData {
const result: JsonLdData = {
title: null,
description: null,
locationText: null,
latitude: null,
longitude: null,
rating: null,
reviewCount: null,
images: [],
cancellationPolicy: null,
hostName: null,
amenities: [],
};
const jsonLdScript = $('script[type="application/ld+json"]').html();
if (!jsonLdScript) {
return result;
}
try {
const jsonData = JSON.parse(jsonLdScript);
// Check if it's a lodging business schema
if (jsonData["@type"] !== "LodgingBusiness" && jsonData["@type"] !== "Room") {
return result;
}
// Title
if (jsonData.name) {
result.title = jsonData.name;
}
// Description
if (jsonData.description) {
result.description = jsonData.description;
}
// Location
if (jsonData.address) {
const parts: string[] = [];
if (jsonData.address.addressLocality) parts.push(jsonData.address.addressLocality);
if (jsonData.address.addressRegion) parts.push(jsonData.address.addressRegion);
if (jsonData.address.addressCountry) parts.push(jsonData.address.addressCountry);
if (parts.length > 0) {
result.locationText = parts.join(', ');
}
}
// Coordinates
if (jsonData.geo) {
if (jsonData.geo.latitude) {
result.latitude = parseFloat(jsonData.geo.latitude);
}
if (jsonData.geo.longitude) {
result.longitude = parseFloat(jsonData.geo.longitude);
}
}
// Rating
if (jsonData.aggregateRating) {
if (jsonData.aggregateRating.ratingValue) {
result.rating = parseFloat(jsonData.aggregateRating.ratingValue);
}
if (jsonData.aggregateRating.reviewCount) {
result.reviewCount = parseInt(jsonData.aggregateRating.reviewCount, 10);
}
}
// Images
if (jsonData.image) {
const images = Array.isArray(jsonData.image)
? jsonData.image.map((img: unknown) => {
const imgObj = img as Record<string, unknown>;
return imgObj.url || imgObj['@id'] || String(img);
})
: [jsonData.image.url || jsonData.image['@id'] || jsonData.image];
result.images = images.filter(Boolean);
}
// Cancellation Policy
if (jsonData.cancellationPolicy) {
result.cancellationPolicy = jsonData.cancellationPolicy;
}
// Host name
if (jsonData.provider?.name) {
result.hostName = jsonData.provider.name;
}
// Amenities
if (jsonData.amenityFeature && Array.isArray(jsonData.amenityFeature)) {
result.amenities = jsonData.amenityFeature
.map((f: unknown) => (f as { name?: string }).name)
.filter(Boolean);
}
} catch (error) {
console.error('Failed to parse JSON-LD:', error);
}
return result;
}

View File

@ -0,0 +1,118 @@
import * as cheerio from 'cheerio';
import { FieldSource } from '../types';
/**
* Extract location from multiple sources with priority:
* 1. JSON-LD address (handled separately)
* 2. "Where you'll be" section
* 3. Meta tags (og:locality, etc.)
* 4. Visible text patterns
*/
export function extractLocation(
$: cheerio.CheerioAPI,
html: string
): { locationText: FieldSource<string | null>; latitude: FieldSource<number | null>; longitude: FieldSource<number | null> } {
let locationText: string | null = null;
let locationSource: FieldSource<string | null>['source'] = 'text_pattern';
let latitude: number | null = null;
let longitude: number | null = null;
// 1. Try "Where you'll be" section
const whereSection = $('[data-section-id="LOCATION_DEFAULT"]').text() ||
$('section:contains("Where you\'ll be")').text() ||
$('section:contains("Lage")').text();
if (whereSection) {
// Extract location from this section
const locationMatch = whereSection.match(/([A-Z][a-zäöüÄÖÜß]+(?:\s+[A-Z][a-zäöüÄÖÜß]+)*,\s*[A-Z][a-zäöüÄÖÜß]+)/);
if (locationMatch) {
locationText = locationMatch[1].trim();
locationSource = 'dom';
}
}
// 2. Try meta tags
if (!locationText) {
const locality = $('meta[property="og:locality"]').attr('content') ||
$('meta[name="location"]').attr('content');
const region = $('meta[property="og:region"]').attr('content');
const country = $('meta[property="og:country-name"]').attr('content');
if (locality) {
locationText = [locality, region, country].filter(Boolean).join(', ');
locationSource = 'meta';
}
}
// 3. Try text patterns like "Location: Berlin, Germany"
if (!locationText) {
const locationPattern = /(?:location|lage|standort)[:\s]+([A-Z][a-zäöüÄÖÜß]+(?:[\s,]+[A-Z][a-zäöüÄÖÜß]+)*)/i;
const match = html.match(locationPattern);
if (match) {
locationText = match[1].trim();
locationSource = 'text_pattern';
}
}
// 4. Try extracting from title (e.g., "Apartment in Berlin · ...")
if (!locationText) {
const titlePattern = /(?:in|bei|am)\s+([A-Z][a-zäöüÄÖÜß]+(?:\s+[A-Z][a-zäöüÄÖÜß]+)?)\s*[·•]/;
const title = $('title').text();
const match = title.match(titlePattern);
if (match) {
locationText = match[1].trim();
locationSource = 'text_pattern';
}
}
// Extract coordinates from various sources
// Try data attributes
const latAttr = $('[data-lat]').attr('data-lat') || $('[data-latitude]').attr('data-latitude');
const lngAttr = $('[data-lng]').attr('data-lng') || $('[data-longitude]').attr('data-longitude');
if (latAttr && lngAttr) {
latitude = parseFloat(latAttr);
longitude = parseFloat(lngAttr);
}
// Try meta tags for coordinates
if (!latitude) {
const geoPosition = $('meta[name="geo.position"]').attr('content') ||
$('meta[property="place:location:latitude"]').attr('content');
if (geoPosition) {
const parts = geoPosition.split(/[;,]/);
if (parts.length >= 2) {
latitude = parseFloat(parts[0]);
longitude = parseFloat(parts[1]);
} else {
latitude = parseFloat(geoPosition);
}
}
}
if (!longitude) {
const lngMeta = $('meta[property="place:location:longitude"]').attr('content');
if (lngMeta) {
longitude = parseFloat(lngMeta);
}
}
return {
locationText: {
value: locationText,
source: locationSource,
confidence: locationText ? 'medium' : 'low',
},
latitude: {
value: latitude,
source: latitude ? 'dom' : 'text_pattern',
confidence: latitude ? 'high' : 'low',
},
longitude: {
value: longitude,
source: longitude ? 'dom' : 'text_pattern',
confidence: longitude ? 'high' : 'low',
},
};
}

View File

@ -0,0 +1,102 @@
import * as cheerio from 'cheerio';
import { FieldSource, PriceStatus, TripContext } from '../types';
import { parsePriceFromText } from './text-patterns';
/**
* Try to extract price from HTML using various selectors
*/
function tryExtractPriceFromHtml(html: string, $: cheerio.CheerioAPI): number | null {
// Try various price selectors that Airbnb might use
const priceSelectors = [
'[data-testid="price-amount"]',
'span[class*="Price"]',
'span[class*="price"]',
'[itemprop="price"]',
'._1y6k3r2',
'._1dss1omb',
];
for (const selector of priceSelectors) {
const element = $(selector).first();
if (element.length) {
const text = element.text();
const price = parsePriceFromText(text);
if (price !== null) {
return price;
}
}
}
// Fallback: search entire HTML for price patterns
const priceFromHtml = parsePriceFromText(html);
if (priceFromHtml !== null) {
return priceFromHtml;
}
return null;
}
/**
* Extract price with trip context awareness
*
* CRITICAL: Price reliability depends on trip context
* - With check-in/check-out: Price is for those specific dates
* - Without trip context: Price may be a base/minimum price
*/
export function extractPrice(
html: string,
$: cheerio.CheerioAPI,
tripContext: TripContext
): { nightly: FieldSource<number | null>; total: FieldSource<number | null>; status: PriceStatus } {
// No trip context = unreliable price
if (!tripContext.checkIn || !tripContext.checkOut) {
const extracted = tryExtractPriceFromHtml(html, $);
if (extracted !== null) {
return {
nightly: { value: extracted, source: 'text_pattern', confidence: 'low' },
total: { value: null, source: 'text_pattern', confidence: 'low' },
status: 'REQUIRES_TRIP_CONTEXT',
};
}
return {
nightly: { value: null, source: 'text_pattern', confidence: 'low' },
total: { value: null, source: 'text_pattern', confidence: 'low' },
status: 'UNKNOWN',
};
}
// With trip context, try harder to extract
const extracted = tryExtractPriceFromHtml(html, $);
if (extracted !== null) {
// Calculate nights for total price
let total: number | null = null;
try {
const checkIn = new Date(tripContext.checkIn);
const checkOut = new Date(tripContext.checkOut);
const nights = Math.round((checkOut.getTime() - checkIn.getTime()) / (1000 * 60 * 60 * 24));
if (nights > 0) {
total = extracted * nights;
}
} catch {
// Invalid dates, skip total calculation
}
return {
nightly: { value: extracted, source: 'text_pattern', confidence: 'medium' },
total: total !== null
? { value: total, source: 'derived', confidence: 'medium' }
: { value: null, source: 'text_pattern', confidence: 'low' },
status: 'EXTRACTED',
};
}
return {
nightly: { value: null, source: 'text_pattern', confidence: 'low' },
total: { value: null, source: 'text_pattern', confidence: 'low' },
status: 'UNKNOWN',
};
}

View File

@ -0,0 +1,143 @@
import { BedType, SleepingOption } from '../types';
/**
* Bed type configuration: maps text patterns to bed types, spots per unit, and quality
*/
export const BED_TYPE_CONFIG: Record<string, { type: BedType; spots: number; quality: 'FULL' | 'AUXILIARY' }> = {
'double bed': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
'doppelbett': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
'queen bed': { type: 'QUEEN', spots: 2, quality: 'FULL' },
'king bed': { type: 'KING', spots: 2, quality: 'FULL' },
'single bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'twin bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'einzelbett': { type: 'SINGLE', spots: 1, quality: 'FULL' },
'bunk bed': { type: 'BUNK', spots: 2, quality: 'FULL' },
'etagenbett': { type: 'BUNK', spots: 2, quality: 'FULL' },
'sofa bed': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
'pull-out sofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
'schlafsofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
'couch': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
'sofa': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
'air mattress': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' },
'luftmatratze': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' },
'floor mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
'extra mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
'zusatzmatratze': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
'futon': { type: 'FUTON', spots: 1, quality: 'AUXILIARY' },
};
// Pattern: "1 double bed" or "2 single beds" or "Bedroom 1: 1 queen bed"
const BED_PATTERN = /(?:(?:bedroom|schlafzimmer|room|zimmer)\s*\d*\s*:?\s*)?(\d+)\s+([a-z\s-]+?)(?:\s|$|,|\.)/gi;
export interface SleepingStats {
maxSleepingPlaces: number;
suitableFor4: boolean;
extraMattressesNeededFor4: number;
}
/**
* Parse sleeping arrangements from text
* Handles patterns like:
* - "1 double bed"
* - "2 single beds"
* - "Bedroom 1: 1 queen bed"
* - "Common space: 1 sofa bed"
*/
export function parseSleepingArrangements(text: string): SleepingOption[] {
const options: SleepingOption[] = [];
const lowerText = text.toLowerCase();
let match;
while ((match = BED_PATTERN.exec(lowerText)) !== null) {
const quantity = parseInt(match[1], 10);
const bedTypeText = match[2].trim();
// Find matching bed type config
let matchedConfig: { type: BedType; spots: number; quality: 'FULL' | 'AUXILIARY' } | null = null;
let matchedLabel = '';
for (const [pattern, config] of Object.entries(BED_TYPE_CONFIG)) {
if (bedTypeText.includes(pattern) || pattern.includes(bedTypeText)) {
matchedConfig = config;
matchedLabel = pattern;
break;
}
}
if (matchedConfig && quantity > 0) {
// Check if this bed type already exists
const existing = options.find(o => o.bedType === matchedConfig!.type);
if (existing) {
existing.quantity += quantity;
} else {
options.push({
bedType: matchedConfig.type,
quantity,
spotsPerUnit: matchedConfig.spots,
quality: matchedConfig.quality,
label: matchedLabel,
rawText: match[0].trim(),
});
}
}
}
return options;
}
/**
* Calculate sleeping statistics from options
*/
export function calculateSleepingStats(options: SleepingOption[]): SleepingStats {
const maxSleepingPlaces = options.reduce(
(sum, opt) => sum + opt.quantity * opt.spotsPerUnit,
0
);
const suitableFor4 = maxSleepingPlaces >= 4;
// Calculate extra mattresses needed for 4 people
// Only count FULL quality beds first
const fullQualitySpots = options
.filter(o => o.quality === 'FULL')
.reduce((sum, opt) => sum + opt.quantity * opt.spotsPerUnit, 0);
const extraMattressesNeededFor4 = Math.max(0, 4 - fullQualitySpots);
return {
maxSleepingPlaces,
suitableFor4,
extraMattressesNeededFor4,
};
}
/**
* Derive sleeping options from bed count (fallback with low confidence)
* Used when detailed sleeping arrangement text is not available
*/
export function deriveSleepingFromBeds(beds: number, guestCount: number): SleepingOption[] {
if (!beds || beds < 1) return [];
// Assume beds are double beds if guest count suggests it
const avgGuestsPerBed = guestCount ? guestCount / beds : 2;
if (avgGuestsPerBed >= 1.5) {
// Likely double beds
return [{
bedType: 'DOUBLE',
quantity: beds,
spotsPerUnit: 2,
quality: 'FULL',
label: 'double bed (derived)',
}];
} else {
// Likely single beds
return [{
bedType: 'SINGLE',
quantity: beds,
spotsPerUnit: 1,
quality: 'FULL',
label: 'single bed (derived)',
}];
}
}

View File

@ -0,0 +1,123 @@
/**
* Text pattern parsers for extracting data from visible HTML text
* Supports both German and English patterns
*/
// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?)/i;
// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;
// "Hosted by David" or "Gehostet von David"
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i;
// "€ 150 / night" or "$150 per night" or "150 € pro Nacht"
const PRICE_PATTERN = /[€$]?\s*(\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;
// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
const MAX_GUESTS_PATTERN = /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste?)|(\d+)\s*(?:guests?|gäste?)\s*(?:maximum|max\.?)/i;
export interface CapacityFacts {
guests: number;
bedrooms: number;
beds: number;
bathrooms: number;
}
export interface RatingFacts {
rating: number;
reviewCount: number;
}
/**
* Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath"
*/
export function parseCapacityFacts(text: string): CapacityFacts | null {
const match = text.match(CAPACITY_PATTERN);
if (!match) return null;
return {
guests: parseInt(match[1], 10),
bedrooms: parseInt(match[2], 10),
beds: parseInt(match[3], 10),
bathrooms: parseFloat(match[4].replace(',', '.')),
};
}
/**
* Parse rating from text like "4.88 · 200 reviews"
*/
export function parseRating(text: string): RatingFacts | null {
const match = text.match(RATING_PATTERN);
if (!match) return null;
const rating = parseFloat(match[1].replace(',', '.'));
const reviewCount = match[2] ? parseInt(match[2], 10) : 0;
if (isNaN(rating)) return null;
return { rating, reviewCount };
}
/**
* Parse host name from text like "Hosted by David"
*/
export function parseHost(text: string): string | null {
const match = text.match(HOST_PATTERN);
if (!match) return null;
return match[1].trim();
}
/**
* Parse price from text like "€ 150 / night"
*/
export function parsePriceFromText(text: string): number | null {
const match = text.match(PRICE_PATTERN);
if (!match) return null;
const price = parseFloat(match[1].replace(',', '.'));
return isNaN(price) ? null : price;
}
/**
* Parse max guests from text like "6 guests maximum"
*/
export function parseMaxGuests(text: string): number | null {
const match = text.match(MAX_GUESTS_PATTERN);
if (!match) return null;
// Pattern has two capture groups depending on word order
const value = match[1] || match[2];
return value ? parseInt(value, 10) : null;
}
/**
* Extract all text content from HTML for pattern matching
*/
export function extractVisibleText(html: string): string {
// Remove script and style tags
let text = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ');
text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ');
// Replace block elements with newlines
text = text.replace(/<\/(div|p|br|li|tr|td|th|h[1-6]|section|article|header|footer)[^>]*>/gi, '\n');
// Remove remaining tags
text = text.replace(/<[^>]+>/g, ' ');
// Decode HTML entities
text = text
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#(\d+);/g, (_, num) => String.fromCharCode(parseInt(num, 10)));
// Normalize whitespace
text = text.replace(/\s+/g, ' ').trim();
return text;
}

113
src/lib/airbnb/types.ts Normal file
View File

@ -0,0 +1,113 @@
export type DataSource = 'jsonld' | 'meta' | 'text_pattern' | 'dom' | 'playwright' | 'derived' | 'manual';
export type Confidence = 'high' | 'medium' | 'low';
export type PriceStatus = 'EXTRACTED' | 'REQUIRES_TRIP_CONTEXT' | 'UNKNOWN' | 'PARTIAL';
export type SleepingDataQuality = 'EXACT' | 'DERIVED' | 'UNKNOWN';
export interface FieldSource<T> {
value: T;
source: DataSource;
confidence: Confidence;
}
/**
* Create a FieldSource object with value, source, and confidence
*/
export function field<T>(value: T, source: DataSource, confidence: Confidence): FieldSource<T> {
return { value, source, confidence };
}
/**
* Merge two FieldSources - takes the first non-null value
* Priority: primary over secondary
*/
export function mergeField<T>(primary: FieldSource<T> | null, secondary: FieldSource<T> | null): FieldSource<T> {
if (primary?.value !== null && primary?.value !== undefined) {
return primary;
}
if (secondary?.value !== null && secondary?.value !== undefined) {
return secondary;
}
// Return null with lowest confidence
return { value: null as T, source: 'derived', confidence: 'low' };
}
export type BedType = 'DOUBLE' | 'SINGLE' | 'SOFA_BED' | 'SOFA' | 'AIR_MATTRESS' | 'FUTON' | 'BUNK' | 'EXTRA_MATTRESS' | 'QUEEN' | 'KING' | 'UNKNOWN';
export interface SleepingOption {
bedType: BedType;
quantity: number;
spotsPerUnit: number;
quality: 'FULL' | 'AUXILIARY';
label?: string;
rawText?: string;
}
export interface TripContext {
checkIn?: string;
checkOut?: string;
adults?: number;
}
export interface NormalizedUrl {
original: string;
normalized: string;
externalId: string | null;
tripContext: TripContext;
}
export interface ExtractedListing {
// URLs
originalUrl: string;
normalizedUrl: string;
externalId: string | null;
// Basic Info
title: FieldSource<string | null>;
description: FieldSource<string | null>;
// Location
locationText: FieldSource<string | null>;
latitude: FieldSource<number | null>;
longitude: FieldSource<number | null>;
// Pricing
tripContext: TripContext;
nightlyPrice: FieldSource<number | null>;
totalPrice: FieldSource<number | null>;
priceStatus: PriceStatus;
// Rating
rating: FieldSource<number | null>;
reviewCount: FieldSource<number | null>;
// Capacity
guestCount: FieldSource<number | null>;
officialGuestCount: FieldSource<number | null>;
bedrooms: FieldSource<number | null>;
beds: FieldSource<number | null>;
bathrooms: FieldSource<number | null>;
// Sleeping
sleepingOptions: SleepingOption[];
maxSleepingPlaces: number;
suitableFor4: boolean;
extraMattressesNeededFor4: number;
sleepingDataQuality: SleepingDataQuality;
// Host
hostName: FieldSource<string | null>;
// Amenities
amenities: string[];
// Images
images: string[];
coverImage: string | null;
// Other
cancellationPolicy: FieldSource<string | null>;
// Debug
rawSnippets: Record<string, string>;
extractionLog: string[];
}

View File

@ -0,0 +1,71 @@
import { TripContext, NormalizedUrl } from './types';
/**
* Extracts the Airbnb listing ID from a URL
* Matches patterns like /rooms/12345 or /rooms/12345/
*/
export function extractAirbnbExternalId(url: string): string | null {
const match = url.match(/\/rooms\/(\d+)/);
return match?.[1] || null;
}
/**
* Extracts trip context from URL query parameters
* Looks for: check_in, check_out, adults
*/
export function extractTripContext(url: string): TripContext {
try {
const urlObj = new URL(url);
const params = urlObj.searchParams;
const checkIn = params.get('check_in') || params.get('checkIn') || undefined;
const checkOut = params.get('check_out') || params.get('checkOut') || undefined;
const adultsStr = params.get('adults') || params.get('adults[]') || undefined;
return {
checkIn,
checkOut,
adults: adultsStr ? parseInt(adultsStr, 10) : undefined,
};
} catch {
return {};
}
}
/**
* Normalizes an Airbnb URL by:
* - Removing hash
* - Removing query params (trip context extracted separately)
* - Removing trailing slashes
* - Removing www prefix
* - Lowercasing hostname
*/
export function normalizeAirbnbUrl(url: string): string {
try {
const urlObj = new URL(url.trim());
urlObj.hash = '';
urlObj.search = '';
urlObj.pathname = urlObj.pathname.replace(/\/+$/, '');
urlObj.hostname = urlObj.hostname.replace(/^www\./, '').toLowerCase();
return urlObj.toString();
} catch {
return url.trim();
}
}
/**
* Main function: Normalizes URL and extracts all metadata
*/
export function normalizeAirbnbUrlWithContext(url: string): NormalizedUrl {
const original = url.trim();
const normalized = normalizeAirbnbUrl(original);
const externalId = extractAirbnbExternalId(normalized);
const tripContext = extractTripContext(original);
return {
original,
normalized,
externalId,
tripContext,
};
}