fix: add field/mergeField helpers, priceStatus, trip context support
- Add field() and mergeField() helper functions to types.ts
- Fix location parser to use correct html parameter
- Add priceStatus to import action
- Import form already has trip context fields (checkIn, checkOut, adults)
- Build now passes successfully
This commit is contained in:
parent
4fd675431b
commit
13bbe9d147
0
prisma/dev.db
Normal file
0
prisma/dev.db
Normal file
Binary file not shown.
@ -31,6 +31,7 @@ model Listing {
|
||||
nightlyPrice Float? @map("nightly_price")
|
||||
totalPrice Float? @map("total_price")
|
||||
currency String? @default("EUR")
|
||||
priceStatus String? @map("price_status") // EXTRACTED, REQUIRES_TRIP_CONTEXT, UNKNOWN, PARTIAL
|
||||
|
||||
// Rating
|
||||
rating Float?
|
||||
|
||||
@ -2,17 +2,29 @@
|
||||
|
||||
import { z } from "zod";
|
||||
import { prisma } from "@/lib/prisma";
|
||||
import { scrapeAirbnbListing, extractAirbnbExternalId, normalizeAirbnbUrl } from "@/lib/airbnb-scraper";
|
||||
import { scrapeAirbnbListing } from "@/lib/airbnb";
|
||||
import { normalizeAirbnbUrl, extractAirbnbExternalId } from "@/lib/airbnb/url-normalizer";
|
||||
import { slugify } from "@/lib/utils";
|
||||
import { revalidatePath } from "next/cache";
|
||||
|
||||
const schema = z.object({
|
||||
airbnbUrl: z.string().url("Ungültige URL"),
|
||||
checkIn: z.string().optional(),
|
||||
checkOut: z.string().optional(),
|
||||
adults: z.number().optional(),
|
||||
});
|
||||
|
||||
export async function importListingAction(formData: FormData) {
|
||||
const url = formData.get("airbnbUrl") as string;
|
||||
const checkIn = formData.get("checkIn") as string | null;
|
||||
const checkOut = formData.get("checkOut") as string | null;
|
||||
const adultsStr = formData.get("adults") as string | null;
|
||||
|
||||
const parsed = schema.safeParse({
|
||||
airbnbUrl: formData.get("airbnbUrl"),
|
||||
airbnbUrl: url,
|
||||
checkIn: checkIn || undefined,
|
||||
checkOut: checkOut || undefined,
|
||||
adults: adultsStr ? parseInt(adultsStr, 10) : undefined,
|
||||
});
|
||||
|
||||
if (!parsed.success) {
|
||||
@ -22,6 +34,7 @@ export async function importListingAction(formData: FormData) {
|
||||
const normalizedUrl = normalizeAirbnbUrl(parsed.data.airbnbUrl);
|
||||
const externalId = extractAirbnbExternalId(normalizedUrl);
|
||||
|
||||
// Check for duplicates
|
||||
const duplicate = await prisma.listing.findFirst({
|
||||
where: {
|
||||
OR: [
|
||||
@ -42,10 +55,31 @@ export async function importListingAction(formData: FormData) {
|
||||
};
|
||||
}
|
||||
|
||||
const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl);
|
||||
const title = scrapedData?.title || "Neues Airbnb";
|
||||
// Build trip context from form or URL
|
||||
const tripContext = {
|
||||
checkIn: parsed.data.checkIn,
|
||||
checkOut: parsed.data.checkOut,
|
||||
adults: parsed.data.adults || 4,
|
||||
};
|
||||
|
||||
// Scrape with trip context for better price extraction
|
||||
const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl, { tripContext });
|
||||
|
||||
const title = scrapedData?.title?.value || "Neues Airbnb";
|
||||
const slug = `${slugify(title)}-${Date.now()}`;
|
||||
|
||||
// Calculate sleeping stats
|
||||
let maxSleepingPlaces = scrapedData?.maxSleepingPlaces || null;
|
||||
let suitableFor4 = scrapedData?.suitableFor4 || null;
|
||||
let extraMattressesNeededFor4 = scrapedData?.extraMattressesNeededFor4 || null;
|
||||
let bedTypesSummary = null;
|
||||
|
||||
if (scrapedData?.sleepingOptions && scrapedData.sleepingOptions.length > 0) {
|
||||
const types = scrapedData.sleepingOptions.map(o => `${o.quantity}× ${o.bedType}`);
|
||||
bedTypesSummary = types.join(", ");
|
||||
}
|
||||
|
||||
// Create listing
|
||||
const listing = await prisma.listing.create({
|
||||
data: {
|
||||
title,
|
||||
@ -53,29 +87,54 @@ export async function importListingAction(formData: FormData) {
|
||||
airbnbUrl: parsed.data.airbnbUrl,
|
||||
normalizedUrl,
|
||||
externalId,
|
||||
...(scrapedData?.pricePerNight && { nightlyPrice: scrapedData.pricePerNight }),
|
||||
...(scrapedData?.rating && { rating: scrapedData.rating }),
|
||||
...(scrapedData?.reviewCount && { reviewCount: scrapedData.reviewCount }),
|
||||
...(scrapedData?.guestCount && { guestCount: scrapedData.guestCount }),
|
||||
...(scrapedData?.bedrooms && { bedrooms: scrapedData.bedrooms }),
|
||||
...(scrapedData?.beds && { beds: scrapedData.beds }),
|
||||
...(scrapedData?.bathrooms && { bathrooms: scrapedData.bathrooms }),
|
||||
...(scrapedData?.description && { description: scrapedData.description }),
|
||||
...(scrapedData?.hostName && { hostName: scrapedData.hostName }),
|
||||
...(scrapedData?.location && { locationText: scrapedData.location }),
|
||||
...(scrapedData?.latitude && { latitude: scrapedData.latitude }),
|
||||
...(scrapedData?.longitude && { longitude: scrapedData.longitude }),
|
||||
...(scrapedData?.cancellationPolicy && { cancellationPolicy: scrapedData.cancellationPolicy }),
|
||||
...(scrapedData?.images?.length && { coverImage: scrapedData.images[0] }),
|
||||
...(scrapedData?.amenities?.length && { amenities: JSON.stringify(scrapedData.amenities) }),
|
||||
|
||||
// Location
|
||||
locationText: scrapedData?.locationText?.value || null,
|
||||
latitude: scrapedData?.latitude?.value || null,
|
||||
longitude: scrapedData?.longitude?.value || null,
|
||||
|
||||
// Pricing
|
||||
nightlyPrice: scrapedData?.nightlyPrice?.value || null,
|
||||
totalPrice: scrapedData?.totalPrice?.value || null,
|
||||
currency: "EUR",
|
||||
priceStatus: scrapedData?.priceStatus || "UNKNOWN",
|
||||
|
||||
// Rating
|
||||
rating: scrapedData?.rating?.value || null,
|
||||
reviewCount: scrapedData?.reviewCount?.value || null,
|
||||
|
||||
// Capacity
|
||||
guestCount: scrapedData?.guestCount?.value || null,
|
||||
officialGuestCount: scrapedData?.officialGuestCount?.value || null,
|
||||
maxSleepingPlaces,
|
||||
suitableFor4,
|
||||
extraMattressesNeededFor4,
|
||||
bedTypesSummary,
|
||||
|
||||
// Room Details
|
||||
bedrooms: scrapedData?.bedrooms?.value || null,
|
||||
beds: scrapedData?.beds?.value || null,
|
||||
bathrooms: scrapedData?.bathrooms?.value || null,
|
||||
|
||||
// Description & Host
|
||||
description: scrapedData?.description?.value || null,
|
||||
hostName: scrapedData?.hostName?.value || null,
|
||||
cancellationPolicy: scrapedData?.cancellationPolicy?.value || null,
|
||||
|
||||
// Images
|
||||
coverImage: scrapedData?.coverImage || null,
|
||||
amenities: scrapedData?.amenities?.length ? JSON.stringify(scrapedData.amenities) : null,
|
||||
|
||||
// Raw data for debugging
|
||||
rawSourceData: scrapedData ? JSON.stringify(scrapedData) : null,
|
||||
},
|
||||
select: { id: true, slug: true },
|
||||
});
|
||||
|
||||
// Save images
|
||||
if (scrapedData?.images?.length) {
|
||||
await prisma.listingImage.createMany({
|
||||
data: scrapedData.images.map((url, index) => ({
|
||||
data: scrapedData.images.slice(0, 20).map((url, index) => ({
|
||||
listingId: listing.id,
|
||||
url,
|
||||
sortOrder: index,
|
||||
@ -83,6 +142,20 @@ export async function importListingAction(formData: FormData) {
|
||||
});
|
||||
}
|
||||
|
||||
// Save sleeping options
|
||||
if (scrapedData?.sleepingOptions?.length) {
|
||||
await prisma.listingSleepingOption.createMany({
|
||||
data: scrapedData.sleepingOptions.map(opt => ({
|
||||
listingId: listing.id,
|
||||
bedType: opt.bedType,
|
||||
quantity: opt.quantity,
|
||||
spotsPerUnit: opt.spotsPerUnit,
|
||||
quality: opt.quality,
|
||||
label: opt.label || null,
|
||||
})),
|
||||
});
|
||||
}
|
||||
|
||||
revalidatePath("/dashboard");
|
||||
revalidatePath("/listings");
|
||||
|
||||
|
||||
@ -4,10 +4,14 @@ import { useState } from "react";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
|
||||
import { importListingAction } from "@/actions/import-listing";
|
||||
|
||||
export function ImportForm() {
|
||||
const [url, setUrl] = useState("");
|
||||
const [checkIn, setCheckIn] = useState("");
|
||||
const [checkOut, setCheckOut] = useState("");
|
||||
const [adults, setAdults] = useState("4");
|
||||
const [error, setError] = useState("");
|
||||
const [success, setSuccess] = useState(false);
|
||||
const [isLoading, setIsLoading] = useState(false);
|
||||
@ -20,6 +24,9 @@ export function ImportForm() {
|
||||
|
||||
const formData = new FormData();
|
||||
formData.append("airbnbUrl", url);
|
||||
if (checkIn) formData.append("checkIn", checkIn);
|
||||
if (checkOut) formData.append("checkOut", checkOut);
|
||||
if (adults) formData.append("adults", adults);
|
||||
|
||||
const result = await importListingAction(formData);
|
||||
|
||||
@ -33,25 +40,82 @@ export function ImportForm() {
|
||||
setIsLoading(false);
|
||||
};
|
||||
|
||||
// Get today's date for min date
|
||||
const today = new Date().toISOString().split('T')[0];
|
||||
|
||||
return (
|
||||
<form onSubmit={handleSubmit} className="space-y-4">
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="airbnb-url">Airbnb Link</Label>
|
||||
<Input
|
||||
id="airbnb-url"
|
||||
type="url"
|
||||
placeholder="https://www.airbnb.com/rooms/..."
|
||||
value={url}
|
||||
onChange={(e) => setUrl(e.target.value)}
|
||||
required
|
||||
autoFocus
|
||||
/>
|
||||
</div>
|
||||
{error && <div className="text-red-500 text-sm">{error}</div>}
|
||||
{success && <div className="text-green-500 text-sm">✓ Erfolgreich importiert!</div>}
|
||||
<Button type="submit" className="w-full" disabled={isLoading || !url}>
|
||||
{isLoading ? "Wird importiert..." : "Importieren"}
|
||||
</Button>
|
||||
</form>
|
||||
<Card>
|
||||
<CardHeader>
|
||||
<CardTitle>🏠 Neues Airbnb importieren</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
<form onSubmit={handleSubmit} className="space-y-4">
|
||||
{/* URL Field */}
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="airbnb-url">Airbnb Link</Label>
|
||||
<Input
|
||||
id="airbnb-url"
|
||||
type="url"
|
||||
placeholder="https://www.airbnb.com/rooms/..."
|
||||
value={url}
|
||||
onChange={(e) => setUrl(e.target.value)}
|
||||
required
|
||||
autoFocus
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Trip Context Fields */}
|
||||
<div className="space-y-2">
|
||||
<Label>Reisedaten (optional für bessere Preise)</Label>
|
||||
<div className="grid grid-cols-3 gap-2">
|
||||
<div>
|
||||
<Label htmlFor="check-in" className="text-xs">Check-in</Label>
|
||||
<Input
|
||||
id="check-in"
|
||||
type="date"
|
||||
value={checkIn}
|
||||
onChange={(e) => setCheckIn(e.target.value)}
|
||||
min={today}
|
||||
placeholder="Datum"
|
||||
/>
|
||||
</div>
|
||||
<div>
|
||||
<Label htmlFor="check-out" className="text-xs">Check-out</Label>
|
||||
<Input
|
||||
id="check-out"
|
||||
type="date"
|
||||
value={checkOut}
|
||||
onChange={(e) => setCheckOut(e.target.value)}
|
||||
min={checkIn || today}
|
||||
placeholder="Datum"
|
||||
/>
|
||||
</div>
|
||||
<div>
|
||||
<Label htmlFor="adults" className="text-xs">Personen</Label>
|
||||
<Input
|
||||
id="adults"
|
||||
type="number"
|
||||
min="1"
|
||||
max="16"
|
||||
value={adults}
|
||||
onChange={(e) => setAdults(e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<p className="text-xs text-slate-500">
|
||||
💡 Mit Reisedaten kann der Preis genauer ermittelt werden.
|
||||
Die Daten werden auch aus der URL extrahiert wenn vorhanden.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{error && <div className="text-red-500 text-sm">{error}</div>}
|
||||
{success && <div className="text-green-500 text-sm">✓ Erfolgreich importiert!</div>}
|
||||
|
||||
<Button type="submit" className="w-full" disabled={isLoading || !url}>
|
||||
{isLoading ? "⏳ Wird importiert..." : "🚀 Importieren"}
|
||||
</Button>
|
||||
</form>
|
||||
</CardContent>
|
||||
</Card>
|
||||
);
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import { Button } from "@/components/ui/button";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Label } from "@/components/ui/label";
|
||||
import { updateListing, deleteListing, addNote, addTagToListing, removeTagFromListing } from "../actions";
|
||||
// Note: actions.ts is in /admin/listings/, so from [slug]/ we go up one level with ../
|
||||
|
||||
export default async function EditListingPage({
|
||||
params,
|
||||
|
||||
207
src/lib/airbnb/index.ts
Normal file
207
src/lib/airbnb/index.ts
Normal file
@ -0,0 +1,207 @@
|
||||
import * as cheerio from "cheerio";
|
||||
import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
|
||||
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns";
|
||||
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
|
||||
import { extractPrice } from "./parsers/price";
|
||||
import { extractLocation } from "./parsers/location";
|
||||
import { parseJsonLd } from "./parsers/jsonld";
|
||||
import {
|
||||
ExtractedListing,
|
||||
FieldSource,
|
||||
field,
|
||||
mergeField,
|
||||
TripContext,
|
||||
SleepingDataQuality,
|
||||
PriceStatus
|
||||
} from "./types";
|
||||
|
||||
// ============================================
|
||||
// Main Scraper Function
|
||||
// ============================================
|
||||
|
||||
/**
 * Scrape one Airbnb listing page and assemble an `ExtractedListing`.
 *
 * Pipeline: normalize the URL (which also yields any trip context encoded in
 * it), fetch the HTML, run the JSON-LD / text-pattern / price / location
 * parsers, merge their results field by field, then compute sleeping stats.
 *
 * @param url - Listing URL as supplied by the caller.
 * @param options - Optional overrides. `tripContext` values take precedence
 *   over values extracted from the URL, and `adults` falls back to 4.
 *   `usePlaywright` is accepted but not consulted by this function.
 * @returns The extracted listing, or `null` when any step throws (the error
 *   is logged via `console.error`).
 */
export async function scrapeAirbnbListing(
  url: string,
  options?: { tripContext?: TripContext; usePlaywright?: boolean }
): Promise<ExtractedListing | null> {
  try {
    // Step 1: Normalize URL and extract trip context
    const normalized = normalizeAirbnbUrlWithContext(url);

    // Caller-supplied trip context wins over what was found in the URL.
    const tripContext: TripContext = {
      checkIn: options?.tripContext?.checkIn || normalized.tripContext.checkIn,
      checkOut: options?.tripContext?.checkOut || normalized.tripContext.checkOut,
      adults: options?.tripContext?.adults || normalized.tripContext.adults || 4,
    };

    // Step 2: Fetch HTML
    const html = await fetchHtml(normalized.normalized);
    const $ = cheerio.load(html);

    // Step 3: Extract visible text for pattern matching
    const visibleText = extractVisibleText(html);

    // Step 4: Run all parsers
    const jsonldData = parseJsonLd($);
    const capacityFacts = parseCapacityFacts(visibleText);
    const ratingFacts = parseRating(visibleText);
    const hostName = parseHost(visibleText);
    const maxGuests = parseMaxGuests(visibleText);
    const sleepingOptions = parseSleepingArrangements(visibleText);
    const priceData = extractPrice(html, $, tripContext);
    const locationData = extractLocation($, html);

    // Coordinates are compared against null (not truthiness) so that a real
    // 0 survives, and non-finite values are discarded instead of being stored.
    const jsonLdLatitude = finiteOrNull(jsonldData.latitude);
    const jsonLdLongitude = finiteOrNull(jsonldData.longitude);
    const hasParsedLatitude = finiteOrNull(locationData.latitude.value) !== null;
    const hasParsedLongitude = finiteOrNull(locationData.longitude.value) !== null;

    // Step 5: Build the result. Candidates are handed to mergeField in
    // priority order (NOTE(review): presumably the first non-null candidate
    // wins — confirm in ./types). Title, description, location and
    // cancellation policy prefer JSON-LD; rating, review count and host name
    // prefer the visible text and use JSON-LD as the fallback.
    const result: ExtractedListing = {
      // URLs
      originalUrl: normalized.original,
      normalizedUrl: normalized.normalized,
      externalId: normalized.externalId,

      // Basic Info
      title: mergeField(
        jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
        field(null, 'derived', 'low')
      ),
      description: mergeField(
        jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
        field(null, 'derived', 'low')
      ),

      // Location: the JSON-LD address is the first-priority source; the
      // location parser only covers the DOM / meta / text fallbacks.
      locationText: mergeField(
        jsonldData.locationText ? field(jsonldData.locationText, 'jsonld', 'high') : null,
        locationData.locationText
      ),
      latitude: mergeField(
        jsonLdLatitude !== null ? field(jsonLdLatitude, 'jsonld', 'high') : null,
        hasParsedLatitude ? locationData.latitude : field(null, 'derived', 'low')
      ),
      longitude: mergeField(
        jsonLdLongitude !== null ? field(jsonLdLongitude, 'jsonld', 'high') : null,
        hasParsedLongitude ? locationData.longitude : field(null, 'derived', 'low')
      ),

      // Pricing
      tripContext,
      nightlyPrice: priceData.nightly,
      totalPrice: priceData.total,
      priceStatus: priceData.status,

      // Rating
      rating: mergeField(
        ratingFacts ? field(ratingFacts.rating, 'text_pattern', 'high') : null,
        jsonldData.rating ? field(jsonldData.rating, 'jsonld', 'medium') : null
      ),
      reviewCount: mergeField(
        ratingFacts && ratingFacts.reviewCount > 0 ? field(ratingFacts.reviewCount, 'text_pattern', 'high') : null,
        jsonldData.reviewCount ? field(jsonldData.reviewCount, 'jsonld', 'medium') : null
      ),

      // Capacity
      guestCount: mergeField(
        capacityFacts ? field(capacityFacts.guests, 'text_pattern', 'high') : null,
        field(null, 'derived', 'low')
      ),
      officialGuestCount: mergeField(
        maxGuests ? field(maxGuests, 'text_pattern', 'high') : null,
        field(null, 'derived', 'low')
      ),
      bedrooms: mergeField(
        capacityFacts ? field(capacityFacts.bedrooms, 'text_pattern', 'high') : null,
        field(null, 'derived', 'low')
      ),
      beds: mergeField(
        capacityFacts ? field(capacityFacts.beds, 'text_pattern', 'high') : null,
        field(null, 'derived', 'low')
      ),
      bathrooms: mergeField(
        capacityFacts ? field(capacityFacts.bathrooms, 'text_pattern', 'high') : null,
        field(null, 'derived', 'low')
      ),

      // Sleeping (placeholders, filled in by step 6)
      sleepingOptions,
      maxSleepingPlaces: 0,
      suitableFor4: false,
      extraMattressesNeededFor4: 0,
      sleepingDataQuality: 'UNKNOWN',

      // Host
      hostName: mergeField(
        hostName ? field(hostName, 'text_pattern', 'high') : null,
        jsonldData.hostName ? field(jsonldData.hostName, 'jsonld', 'medium') : null
      ),

      // Amenities
      amenities: jsonldData.amenities || [],

      // Images
      images: jsonldData.images || [],
      coverImage: jsonldData.images?.[0] || null,

      // Other: JSON-LD is the only parser that yields a cancellation policy.
      cancellationPolicy: mergeField(
        jsonldData.cancellationPolicy ? field(jsonldData.cancellationPolicy, 'jsonld', 'medium') : null,
        field(null, 'derived', 'low')
      ),

      // Debug
      rawSnippets: {
        title: jsonldData.title || '',
        visibleText: visibleText.substring(0, 2000),
      },
      extractionLog: [
        `URL normalized: ${normalized.normalized}`,
        `External ID: ${normalized.externalId}`,
        `Trip context: ${JSON.stringify(tripContext)}`,
        `Capacity facts: ${capacityFacts ? JSON.stringify(capacityFacts) : 'none'}`,
        `Rating facts: ${ratingFacts ? JSON.stringify(ratingFacts) : 'none'}`,
        `Sleeping options: ${sleepingOptions.length} found`,
      ],
    };

    // Step 6: Calculate sleeping stats
    if (sleepingOptions.length > 0) {
      const stats = calculateSleepingStats(sleepingOptions);
      result.maxSleepingPlaces = stats.maxSleepingPlaces;
      result.suitableFor4 = stats.suitableFor4;
      result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
      result.sleepingDataQuality = 'EXACT';
    } else if (result.beds.value && result.guestCount.value) {
      // No explicit sleeping arrangements found: derive them from the bed
      // and guest counts instead.
      const derivedOptions = deriveSleepingFromBeds(result.beds.value, result.guestCount.value);
      const stats = calculateSleepingStats(derivedOptions);
      result.sleepingOptions = derivedOptions;
      result.maxSleepingPlaces = stats.maxSleepingPlaces;
      result.suitableFor4 = stats.suitableFor4;
      result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4;
      result.sleepingDataQuality = 'DERIVED';
    }

    return result;
  } catch (error) {
    console.error("Scraping failed:", error);
    return null;
  }
}

/** Returns `value` when it is a finite number, otherwise `null` (covers `null` and `NaN`). */
function finiteOrNull(value: number | null): number | null {
  return value !== null && Number.isFinite(value) ? value : null;
}
|
||||
|
||||
// ============================================
|
||||
// HTML Fetcher
|
||||
// ============================================
|
||||
|
||||
/**
 * Download the HTML of `url` using browser-like request headers.
 *
 * @param url - Absolute URL to fetch.
 * @returns The response body as text.
 * @throws Error when the response status is not OK (`HTTP <status> for <url>`),
 *   and whatever `fetch` rejects with on network failure or when the request
 *   exceeds the timeout below.
 */
async function fetchHtml(url: string): Promise<string> {
  // Without an upper bound a stalled connection would keep the caller
  // waiting indefinitely; abort the request after 15 seconds instead.
  const timeoutMs = 15000;

  const response = await fetch(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
      "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
      "Accept-Encoding": "gzip, deflate, br",
      "Cache-Control": "no-cache",
    },
    signal: AbortSignal.timeout(timeoutMs),
  });

  if (!response.ok) {
    throw new Error(`HTTP ${response.status} for ${url}`);
  }

  return response.text();
}
|
||||
|
||||
// Re-export utilities for backward compatibility
|
||||
export { normalizeAirbnbUrlWithContext as normalizeAirbnbUrl } from "./url-normalizer";
|
||||
export { extractAirbnbExternalId } from "./url-normalizer";
|
||||
125
src/lib/airbnb/parsers/jsonld.ts
Normal file
125
src/lib/airbnb/parsers/jsonld.ts
Normal file
@ -0,0 +1,125 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
/**
 * Listing facts read from a page's JSON-LD structured data.
 * A `null` value (or an empty array) means the fact was not present.
 */
export interface JsonLdData {
  title: string | null;
  description: string | null;
  /** Address parts (locality, region, country) joined with ", ". */
  locationText: string | null;
  latitude: number | null;
  longitude: number | null;
  rating: number | null;
  reviewCount: number | null;
  /** Image URLs in document order. */
  images: string[];
  cancellationPolicy: string | null;
  hostName: string | null;
  /** Names taken from `amenityFeature` entries. */
  amenities: string[];
}

type JsonLdNode = Record<string, unknown>;

/**
 * schema.org `@type` values treated as "this node describes the listing".
 * `VacationRental` is a schema.org subtype of `LodgingBusiness`.
 */
const LISTING_SCHEMA_TYPES: readonly string[] = ['LodgingBusiness', 'Room', 'VacationRental'];

/** True for plain (non-array, non-null) objects. */
function isJsonLdNode(value: unknown): value is JsonLdNode {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Flattens a parsed JSON-LD document (object, array, or `@graph` container) into its nodes. */
function flattenJsonLd(parsed: unknown): JsonLdNode[] {
  if (Array.isArray(parsed)) {
    return parsed.flatMap((entry) => flattenJsonLd(entry));
  }
  if (!isJsonLdNode(parsed)) {
    return [];
  }
  const graph = parsed['@graph'];
  return Array.isArray(graph)
    ? [parsed, ...graph.flatMap((entry) => flattenJsonLd(entry))]
    : [parsed];
}

/** True when the node's `@type` (string or array of strings) names a listing type. */
function isListingNode(node: JsonLdNode): boolean {
  const declared = node['@type'];
  const types: unknown[] = Array.isArray(declared) ? declared : [declared];
  return types.some((type) => typeof type === 'string' && LISTING_SCHEMA_TYPES.includes(type));
}

/** Returns `value` when it is a non-blank string, otherwise `null`. */
function nonEmptyString(value: unknown): string | null {
  return typeof value === 'string' && value.trim() !== '' ? value : null;
}

/** Parses a number or numeric string with `parse`; non-finite results become `null`. */
function parseFiniteNumber(value: unknown, parse: (raw: string) => number): number | null {
  if (typeof value !== 'number' && typeof value !== 'string') {
    return null;
  }
  const parsed = parse(String(value));
  return Number.isFinite(parsed) ? parsed : null;
}

/** Reads a name that may be given as a plain string or as an object with a `name`. */
function namedValue(value: unknown): string | null {
  return isJsonLdNode(value) ? nonEmptyString(value['name']) : nonEmptyString(value);
}

/** Resolves one `image` entry (URL string or image object) to a URL string. */
function jsonLdImageUrl(image: unknown): string | null {
  if (isJsonLdNode(image)) {
    return (
      nonEmptyString(image['url']) ??
      nonEmptyString(image['contentUrl']) ??
      nonEmptyString(image['@id'])
    );
  }
  return nonEmptyString(image);
}

/**
 * Parse JSON-LD structured data from the page.
 *
 * Every `<script type="application/ld+json">` element is inspected; each may
 * hold a single object, an array of objects, or an `@graph` container. The
 * first node whose `@type` is a listing type supplies the values. A script
 * that is not valid JSON is logged and skipped without affecting the others.
 *
 * Numeric fields are set only when they parse to a finite number, so a
 * coordinate of `0` is kept while unparsable values stay `null`. Image entries
 * that do not resolve to a URL string are dropped.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @returns The extracted facts; all fields are `null` / empty when no listing
 *   node is found.
 */
export function parseJsonLd($: cheerio.CheerioAPI): JsonLdData {
  const result: JsonLdData = {
    title: null,
    description: null,
    locationText: null,
    latitude: null,
    longitude: null,
    rating: null,
    reviewCount: null,
    images: [],
    cancellationPolicy: null,
    hostName: null,
    amenities: [],
  };

  const nodes: JsonLdNode[] = [];
  for (const element of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(element).html();
    if (!raw) {
      continue;
    }
    try {
      nodes.push(...flattenJsonLd(JSON.parse(raw)));
    } catch (error) {
      console.error('Failed to parse JSON-LD:', error);
    }
  }

  const listing = nodes.find(isListingNode);
  if (!listing) {
    return result;
  }

  result.title = nonEmptyString(listing['name']);
  result.description = nonEmptyString(listing['description']);

  // Location: address is usually a PostalAddress object, occasionally plain text.
  const address = listing['address'];
  if (isJsonLdNode(address)) {
    const parts = [address['addressLocality'], address['addressRegion'], address['addressCountry']]
      .map(namedValue)
      .filter((part): part is string => part !== null);
    if (parts.length > 0) {
      result.locationText = parts.join(', ');
    }
  } else {
    result.locationText = nonEmptyString(address);
  }

  // Coordinates
  const geo = listing['geo'];
  if (isJsonLdNode(geo)) {
    result.latitude = parseFiniteNumber(geo['latitude'], parseFloat);
    result.longitude = parseFiniteNumber(geo['longitude'], parseFloat);
  }

  // Rating
  const aggregateRating = listing['aggregateRating'];
  if (isJsonLdNode(aggregateRating)) {
    result.rating = parseFiniteNumber(aggregateRating['ratingValue'], parseFloat);
    result.reviewCount = parseFiniteNumber(aggregateRating['reviewCount'], (raw) => parseInt(raw, 10));
  }

  // Images: a single entry or a list; each entry a URL string or an object.
  const image = listing['image'];
  const imageEntries: unknown[] = Array.isArray(image) ? image : image == null ? [] : [image];
  result.images = imageEntries
    .map(jsonLdImageUrl)
    .filter((url): url is string => url !== null);

  result.cancellationPolicy = nonEmptyString(listing['cancellationPolicy']);

  // Host name
  const provider = listing['provider'];
  if (isJsonLdNode(provider)) {
    result.hostName = nonEmptyString(provider['name']);
  }

  // Amenities
  const amenityFeature = listing['amenityFeature'];
  if (Array.isArray(amenityFeature)) {
    result.amenities = amenityFeature
      .map(namedValue)
      .filter((name): name is string => name !== null);
  }

  return result;
}
|
||||
118
src/lib/airbnb/parsers/location.ts
Normal file
118
src/lib/airbnb/parsers/location.ts
Normal file
@ -0,0 +1,118 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
import { FieldSource } from '../types';
|
||||
|
||||
/**
 * Extract the location text and coordinates from the page.
 *
 * Location text is taken from the first source that yields a value:
 *   1. the "Where you'll be" / "Lage" section,
 *   2. meta tags (`og:locality`, `location`, `og:region`, `og:country-name`),
 *   3. a labelled pattern in the raw HTML such as "Standort: Berlin, Germany",
 *   4. the `<title>` (e.g. "Apartment in Berlin · ...").
 * The JSON-LD address is not read here; the caller is expected to merge it.
 *
 * Coordinates come from `data-lat` / `data-lng` style attributes first, then
 * from `geo.position` / `place:location:*` meta tags. Values that do not parse
 * to a finite number are reported as `null`.
 *
 * @param $ - Cheerio handle for the loaded document.
 * @param html - Raw HTML of the same document, used for the labelled pattern.
 * @returns One `FieldSource` each for location text, latitude and longitude.
 */
export function extractLocation(
  $: cheerio.CheerioAPI,
  html: string
): { locationText: FieldSource<string | null>; latitude: FieldSource<number | null>; longitude: FieldSource<number | null> } {
  let locationText: string | null = null;
  let locationSource: FieldSource<string | null>['source'] = 'text_pattern';

  // 1. Try "Where you'll be" section
  const whereSection = $('[data-section-id="LOCATION_DEFAULT"]').text() ||
    $('section:contains("Where you\'ll be")').text() ||
    $('section:contains("Lage")').text();

  if (whereSection) {
    // "City[ Second Word], Country" — words may start with an umlaut (Österreich).
    const fromSection = whereSection.match(
      /([A-ZÄÖÜ][a-zäöüÄÖÜß]+(?:\s+[A-ZÄÖÜ][a-zäöüÄÖÜß]+)*,\s*[A-ZÄÖÜ][a-zäöüÄÖÜß]+)/
    )?.[1];
    if (fromSection) {
      locationText = fromSection.trim();
      locationSource = 'dom';
    }
  }

  // 2. Try meta tags
  if (!locationText) {
    const locality = $('meta[property="og:locality"]').attr('content') ||
      $('meta[name="location"]').attr('content');
    const region = $('meta[property="og:region"]').attr('content');
    const country = $('meta[property="og:country-name"]').attr('content');

    if (locality) {
      locationText = [locality, region, country].filter(Boolean).join(', ');
      locationSource = 'meta';
    }
  }

  // 3. Try text patterns like "Location: Berlin, Germany".
  // The label may be capitalized or not, but the captured words must start
  // with a capital letter. A blanket /i flag would cancel that requirement and
  // match CSS/markup such as "location: relative". \b keeps "Lage" from
  // matching inside longer words.
  if (!locationText) {
    const fromLabel = html.match(
      /\b(?:[Ll]ocation|[Ll]age|[Ss]tandort)[:\s]+([A-ZÄÖÜ][a-zäöüÄÖÜß]+(?:[\s,]+[A-ZÄÖÜ][a-zäöüÄÖÜß]+)*)/
    )?.[1];
    if (fromLabel) {
      locationText = fromLabel.trim();
      locationSource = 'text_pattern';
    }
  }

  // 4. Try extracting from title (e.g., "Apartment in Berlin · ...").
  // \b stops the preposition from matching the tail of a word ("Berl-in").
  if (!locationText) {
    const fromTitle = $('title').text().match(
      /\b(?:in|bei|am)\s+([A-ZÄÖÜ][a-zäöüÄÖÜß]+(?:\s+[A-ZÄÖÜ][a-zäöüÄÖÜß]+)?)\s*[·•]/
    )?.[1];
    if (fromTitle) {
      locationText = fromTitle.trim();
      locationSource = 'text_pattern';
    }
  }

  // Coordinates. Presence is tracked with null checks rather than truthiness,
  // so a coordinate of exactly 0 counts as found.
  let latitude: number | null = null;
  let longitude: number | null = null;

  // Try data attributes (only when both are present)
  const latAttr = $('[data-lat]').attr('data-lat') || $('[data-latitude]').attr('data-latitude');
  const lngAttr = $('[data-lng]').attr('data-lng') || $('[data-longitude]').attr('data-longitude');

  if (latAttr && lngAttr) {
    latitude = parseCoordinate(latAttr);
    longitude = parseCoordinate(lngAttr);
  }

  // Try meta tags for coordinates. geo.position holds "lat;lng" (or "lat,lng");
  // place:location:latitude holds the latitude only.
  if (latitude === null) {
    const geoPosition = $('meta[name="geo.position"]').attr('content') ||
      $('meta[property="place:location:latitude"]').attr('content');
    if (geoPosition) {
      const [latPart, lngPart] = geoPosition.split(/[;,]/);
      latitude = parseCoordinate(latPart);
      if (lngPart !== undefined) {
        longitude = parseCoordinate(lngPart) ?? longitude;
      }
    }
  }

  if (longitude === null) {
    longitude = parseCoordinate($('meta[property="place:location:longitude"]').attr('content'));
  }

  return {
    locationText: {
      value: locationText,
      source: locationSource,
      confidence: locationText ? 'medium' : 'low',
    },
    latitude: {
      value: latitude,
      source: latitude !== null ? 'dom' : 'text_pattern',
      confidence: latitude !== null ? 'high' : 'low',
    },
    longitude: {
      value: longitude,
      source: longitude !== null ? 'dom' : 'text_pattern',
      confidence: longitude !== null ? 'high' : 'low',
    },
  };
}

/** Parses a coordinate string; missing or non-numeric input yields `null`. */
function parseCoordinate(raw: string | undefined): number | null {
  if (raw === undefined) {
    return null;
  }
  const parsed = parseFloat(raw);
  return Number.isFinite(parsed) ? parsed : null;
}
|
||||
102
src/lib/airbnb/parsers/price.ts
Normal file
102
src/lib/airbnb/parsers/price.ts
Normal file
@ -0,0 +1,102 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
import { FieldSource, PriceStatus, TripContext } from '../types';
|
||||
import { parsePriceFromText } from './text-patterns';
|
||||
|
||||
/**
 * Look for a price in the page.
 *
 * Each candidate selector is tried in order; only the first element matched
 * by a selector is inspected. If none of them yields a price, the raw HTML
 * string itself is handed to `parsePriceFromText` as a last resort.
 *
 * @param html - Raw HTML of the page.
 * @param $ - Cheerio handle for the same document.
 * @returns The first price found, or `null`.
 */
function tryExtractPriceFromHtml(html: string, $: cheerio.CheerioAPI): number | null {
  // Selectors Airbnb might use for the price element.
  const candidateSelectors = [
    '[data-testid="price-amount"]',
    'span[class*="Price"]',
    'span[class*="price"]',
    '[itemprop="price"]',
    '._1y6k3r2',
    '._1dss1omb',
  ];

  for (const candidate of candidateSelectors) {
    const node = $(candidate).first();
    if (!node.length) {
      continue;
    }
    const parsed = parsePriceFromText(node.text());
    if (parsed !== null) {
      return parsed;
    }
  }

  // Fallback: search entire HTML for price patterns
  return parsePriceFromText(html);
}
|
||||
|
||||
/**
 * Extract the nightly price and, when trip dates are known, a derived total.
 *
 * Status and confidence depend on the trip context:
 * - no price found: both fields `null`, status `UNKNOWN`;
 * - price found but check-in or check-out missing: nightly price with low
 *   confidence, no total, status `REQUIRES_TRIP_CONTEXT`;
 * - price found and both dates given: nightly price with medium confidence,
 *   status `EXTRACTED`; the total is nightly × nights when the dates are valid
 *   and check-out is after check-in, otherwise `null`.
 *
 * @param html - Raw HTML of the page.
 * @param $ - Cheerio handle for the same document.
 * @param tripContext - Dates and guest count the price should relate to.
 * @returns Nightly and total price fields plus the overall price status.
 */
export function extractPrice(
  html: string,
  $: cheerio.CheerioAPI,
  tripContext: TripContext
): { nightly: FieldSource<number | null>; total: FieldSource<number | null>; status: PriceStatus } {
  const missing = (): FieldSource<number | null> => ({
    value: null,
    source: 'text_pattern',
    confidence: 'low',
  });

  const nightly = tryExtractPriceFromHtml(html, $);
  if (nightly === null) {
    return { nightly: missing(), total: missing(), status: 'UNKNOWN' };
  }

  // No trip context = unreliable price (may be a base/minimum price).
  const { checkIn, checkOut } = tripContext;
  if (!checkIn || !checkOut) {
    return {
      nightly: { value: nightly, source: 'text_pattern', confidence: 'low' },
      total: missing(),
      status: 'REQUIRES_TRIP_CONTEXT',
    };
  }

  const nights = countNights(checkIn, checkOut);
  return {
    nightly: { value: nightly, source: 'text_pattern', confidence: 'medium' },
    total: nights !== null
      ? { value: nightly * nights, source: 'derived', confidence: 'medium' }
      : missing(),
    status: 'EXTRACTED',
  };
}

/**
 * Number of nights between two dates, or `null` when either date is invalid
 * or the stay is not at least one night long.
 *
 * `new Date(...)` never throws on bad input — it yields an Invalid Date whose
 * timestamp is NaN — so validity is checked explicitly.
 */
function countNights(checkIn: string | number | Date, checkOut: string | number | Date): number | null {
  const msPerDay = 1000 * 60 * 60 * 24;
  const start = new Date(checkIn).getTime();
  const end = new Date(checkOut).getTime();
  if (!Number.isFinite(start) || !Number.isFinite(end)) {
    return null;
  }
  const nights = Math.round((end - start) / msPerDay);
  return nights > 0 ? nights : null;
}
|
||||
143
src/lib/airbnb/parsers/sleeping.ts
Normal file
143
src/lib/airbnb/parsers/sleeping.ts
Normal file
@ -0,0 +1,143 @@
|
||||
import { BedType, SleepingOption } from '../types';
|
||||
|
||||
/**
 * Bed type configuration: maps lower-case text labels (English and German)
 * to a bed type, the sleeping spots one unit provides, and its quality.
 */
export const BED_TYPE_CONFIG: Record<string, { type: BedType; spots: number; quality: 'FULL' | 'AUXILIARY' }> = {
  'double bed': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
  'doppelbett': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
  'queen bed': { type: 'QUEEN', spots: 2, quality: 'FULL' },
  'king bed': { type: 'KING', spots: 2, quality: 'FULL' },
  'single bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
  'twin bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
  'einzelbett': { type: 'SINGLE', spots: 1, quality: 'FULL' },
  'bunk bed': { type: 'BUNK', spots: 2, quality: 'FULL' },
  'etagenbett': { type: 'BUNK', spots: 2, quality: 'FULL' },
  'sofa bed': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
  'pull-out sofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
  'schlafsofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
  'couch': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
  'sofa': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
  'air mattress': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'luftmatratze': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'floor mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'extra mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'zusatzmatratze': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'futon': { type: 'FUTON', spots: 1, quality: 'AUXILIARY' },
};

// Alternation of every configured label, longest first so that "sofa bed"
// wins over "sofa". Spaces inside a label tolerate any run of whitespace.
const BED_LABEL_ALTERNATION = Object.keys(BED_TYPE_CONFIG)
  .sort((a, b) => b.length - a.length)
  .map(label => label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/ /g, '\\s+'))
  .join('|');

// Pattern: "<quantity> <configured label>[plural suffix]", e.g. "1 double bed",
// "2 single beds", "2 doppelbetten". Applied to lower-cased text.
// - The lookbehind stops the quantity from being the tail of "1.5" or "1,5".
// - The trailing lookahead requires the label to end at a word boundary.
const BED_PATTERN = new RegExp(
  `(?<![\\d.,])(\\d+)\\s+(${BED_LABEL_ALTERNATION})(?:e?s|e?n)?(?![a-zäöüß])`,
  'g'
);

/** Aggregate sleeping figures computed from a list of sleeping options. */
export interface SleepingStats {
  maxSleepingPlaces: number;
  suitableFor4: boolean;
  extraMattressesNeededFor4: number;
}

/**
 * Parse sleeping arrangements from free text.
 *
 * Handles patterns like:
 * - "1 double bed"
 * - "2 single beds"
 * - "Bedroom 1: 1 queen bed"
 * - "Common space: 1 sofa bed"
 *
 * Only a quantity followed by a complete label from `BED_TYPE_CONFIG` counts.
 * Fragments such as "3 in the afternoon" or "10 - 12" are ignored, and a plain
 * "sofa" is not confused with a "sofa bed".
 *
 * @param text - Visible listing text; matching is case-insensitive.
 * @returns One entry per bed type in order of first appearance; quantities of
 *   repeated bed types are summed. `label` is the matched config key and
 *   `rawText` the matched (lower-cased) text of the first occurrence.
 */
export function parseSleepingArrangements(text: string): SleepingOption[] {
  const byType = new Map<BedType, SleepingOption>();

  // matchAll works on a private copy of the regex, so the shared pattern's
  // lastIndex is never left in a dirty state between calls.
  for (const match of text.toLowerCase().matchAll(BED_PATTERN)) {
    const quantity = parseInt(match[1], 10);
    if (!(quantity > 0)) continue;

    const label = match[2].replace(/\s+/g, ' ');
    const config = BED_TYPE_CONFIG[label];
    if (!config) continue;

    const existing = byType.get(config.type);
    if (existing) {
      existing.quantity += quantity;
      continue;
    }

    byType.set(config.type, {
      bedType: config.type,
      quantity,
      spotsPerUnit: config.spots,
      quality: config.quality,
      label,
      rawText: match[0].trim(),
    });
  }

  return [...byType.values()];
}
|
||||
|
||||
/**
 * Calculate sleeping statistics from a list of sleeping options.
 *
 * @param options - Sleeping options; each contributes `quantity * spotsPerUnit` spots.
 * @returns
 * - `maxSleepingPlaces`: spots summed over all options, regardless of quality.
 * - `suitableFor4`: whether `maxSleepingPlaces` is at least 4.
 * - `extraMattressesNeededFor4`: how many spots are missing to sleep 4 people
 *   when only `FULL` quality options are counted (never negative).
 */
export function calculateSleepingStats(options: SleepingOption[]): SleepingStats {
  let maxSleepingPlaces = 0;
  let fullQualitySpots = 0;

  for (const { quantity, spotsPerUnit, quality } of options) {
    const spots = quantity * spotsPerUnit;
    maxSleepingPlaces += spots;
    // Auxiliary options (sofas, air mattresses, ...) do not reduce the number
    // of extra mattresses needed.
    if (quality === 'FULL') {
      fullQualitySpots += spots;
    }
  }

  return {
    maxSleepingPlaces,
    suitableFor4: maxSleepingPlaces >= 4,
    extraMattressesNeededFor4: Math.max(0, 4 - fullQualitySpots),
  };
}
|
||||
|
||||
/**
 * Derive sleeping options from a bare bed count (low-confidence fallback).
 * Used when detailed sleeping arrangement text is not available.
 *
 * All beds are assumed to be of the same kind: double beds when there are on
 * average at least 1.5 guests per bed, single beds otherwise. When
 * `guestCount` is falsy (0 or NaN), 2 guests per bed are assumed.
 *
 * @param beds - Number of beds; falsy values or values below 1 yield `[]`.
 * @param guestCount - Guest capacity used to guess the bed size.
 * @returns A single-element list describing the derived beds, or `[]`.
 */
export function deriveSleepingFromBeds(beds: number, guestCount: number): SleepingOption[] {
  if (!beds || beds < 1) return [];

  const avgGuestsPerBed = guestCount ? guestCount / beds : 2;
  const likelyDouble = avgGuestsPerBed >= 1.5;

  return [
    {
      bedType: likelyDouble ? 'DOUBLE' : 'SINGLE',
      quantity: beds,
      spotsPerUnit: likelyDouble ? 2 : 1,
      quality: 'FULL',
      label: likelyDouble ? 'double bed (derived)' : 'single bed (derived)',
    },
  ];
}
|
||||
123
src/lib/airbnb/parsers/text-patterns.ts
Normal file
123
src/lib/airbnb/parsers/text-patterns.ts
Normal file
@ -0,0 +1,123 @@
|
||||
/**
|
||||
* Text pattern parsers for extracting data from visible HTML text
|
||||
* Supports both German and English patterns
|
||||
*/
|
||||
|
||||
// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants, including the
// singular forms "1 Gast · 1 Schlafzimmer · 1 Bett · 1 Bad".
const CAPACITY_PATTERN =
  /(\d+)\s*(?:guests?|gäste|gast)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)\s*[·•]\s*(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:bath(?:room)?s?|badezimmer|bäder|bad)/i;

// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen".
// The review count may carry thousands separators ("1,234 reviews").
const RATING_PATTERN =
  /(\d+[.,]\d+)\s*[·•(]?\s*(\d{1,3}(?:[.,]\d{3})+|\d+)\s*(?:reviews?|bewertung(?:en)?)?\)?/gi;

// "Hosted by David" or "Gehostet von David". The name must start with a
// non-whitespace character so an empty name is never captured.
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\s·•][^\n·•]*)/i;

// "€ 150 / night" or "$150 per night" or "150 € pro Nacht", also with
// thousands separators ("€1,250 / night", "1.250,50 € pro Nacht").
// The lookbehind keeps the match from starting in the middle of a number.
const PRICE_PATTERN =
  /[€$]?\s*(?<![\d.,])(\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;

// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
const MAX_GUESTS_PATTERN =
  /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste|gast)|(\d+)\s*(?:guests?|gäste|gast)\s*(?:maximum|max\.?)/i;

/** Capacity figures parsed from a listing's summary line. */
export interface CapacityFacts {
  guests: number;
  bedrooms: number;
  beds: number;
  bathrooms: number;
}

/** Rating and review count parsed from a listing's rating line. */
export interface RatingFacts {
  rating: number;
  reviewCount: number;
}

/**
 * Convert a number written with `.`/`,` separators to a JS number.
 * A separator followed by exactly three digits is a thousands separator;
 * otherwise the last separator is the decimal point.
 */
function parseLocalizedNumber(raw: string): number {
  const lastSeparator = Math.max(raw.lastIndexOf('.'), raw.lastIndexOf(','));
  if (lastSeparator === -1) return Number(raw);

  const head = raw.slice(0, lastSeparator).replace(/[.,]/g, '');
  const tail = raw.slice(lastSeparator + 1);
  return tail.length === 3 ? Number(head + tail) : Number(`${head}.${tail || '0'}`);
}

/**
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath".
 *
 * @param text - Text to search; the first occurrence of the pattern is used.
 * @returns The four capacity figures (bathrooms may be fractional), or `null`
 *   when the pattern does not occur.
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  const match = text.match(CAPACITY_PATTERN);
  if (!match) return null;

  const [, guests, bedrooms, beds, bathrooms] = match;
  return {
    guests: parseInt(guests, 10),
    bedrooms: parseInt(bedrooms, 10),
    beds: parseInt(beds, 10),
    bathrooms: parseFloat(bathrooms.replace(',', '.')),
  };
}

/**
 * Parse rating from text like "4.88 · 200 reviews".
 *
 * Candidates whose rating is outside 0–5 (e.g. a price such as "99.50 2
 * nights") are skipped, and the first plausible one is returned.
 *
 * @param text - Text to search.
 * @returns Rating and review count, or `null` when no plausible rating is found.
 */
export function parseRating(text: string): RatingFacts | null {
  for (const match of text.matchAll(RATING_PATTERN)) {
    const rating = parseFloat(match[1].replace(',', '.'));
    if (Number.isNaN(rating) || rating < 0 || rating > 5) continue;

    // Strip thousands separators before parsing the count.
    const reviewCount = parseInt(match[2].replace(/[.,]/g, ''), 10);
    return { rating, reviewCount };
  }
  return null;
}

/**
 * Parse host name from text like "Hosted by David".
 *
 * @param text - Text to search.
 * @returns The trimmed host name, or `null` when absent or empty.
 */
export function parseHost(text: string): string | null {
  const name = text.match(HOST_PATTERN)?.[1].trim();
  return name ? name : null;
}

/**
 * Parse price from text like "€ 150 / night".
 *
 * Thousands separators are understood, so "€1,250 / night" yields 1250 and
 * "1.250,50 € pro Nacht" yields 1250.5.
 *
 * @param text - Text to search; the first per-night price is used.
 * @returns The nightly price, or `null` when no per-night price is found.
 */
export function parsePriceFromText(text: string): number | null {
  const match = text.match(PRICE_PATTERN);
  if (!match) return null;

  const price = parseLocalizedNumber(match[1]);
  return Number.isNaN(price) ? null : price;
}

/**
 * Parse max guests from text like "6 guests maximum".
 *
 * @param text - Text to search.
 * @returns The guest limit, or `null` when no such phrase is found.
 */
export function parseMaxGuests(text: string): number | null {
  const match = text.match(MAX_GUESTS_PATTERN);
  if (!match) return null;

  // The number sits in group 1 or group 2 depending on word order.
  const value = match[1] ?? match[2];
  return value ? parseInt(value, 10) : null;
}
|
||||
|
||||
// Named HTML entities decoded by extractVisibleText. A Map (rather than a
// plain object) keeps names like "constructor" from resolving to prototype
// members.
const NAMED_HTML_ENTITIES = new Map<string, string>([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
]);

/**
 * Extract all text content from HTML for pattern matching.
 *
 * Steps: drop `<script>`/`<style>` blocks, replace every remaining tag with a
 * space, decode entities, then collapse all whitespace runs to single spaces.
 * Because whitespace is collapsed at the end, block-level tags need no
 * separate newline handling; the result is always a single line.
 *
 * @param html - Raw HTML.
 * @returns The visible text on one line, trimmed.
 */
export function extractVisibleText(html: string): string {
  return (
    html
      // Remove script and style tags together with their contents
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ')
      // Remove remaining tags
      .replace(/<[^>]+>/g, ' ')
      // Decode entities in a single pass so "&amp;lt;" becomes "&lt;" and is
      // not decoded a second time. Unknown entities are left untouched.
      .replace(
        /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
        (entity: string, dec?: string, hex?: string, name?: string) => {
          if (name !== undefined) {
            return NAMED_HTML_ENTITIES.get(name) ?? entity;
          }
          const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
          // fromCodePoint throws above U+10FFFF; keep such entities verbatim.
          return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
        }
      )
      // Normalize whitespace
      .replace(/\s+/g, ' ')
      .trim()
  );
}
|
||||
113
src/lib/airbnb/types.ts
Normal file
113
src/lib/airbnb/types.ts
Normal file
@ -0,0 +1,113 @@
|
||||
/** Origin label recorded alongside an extracted value (see `FieldSource.source`). */
export type DataSource = 'jsonld' | 'meta' | 'text_pattern' | 'dom' | 'playwright' | 'derived' | 'manual';

/** How much an extracted value can be trusted. */
export type Confidence = 'high' | 'medium' | 'low';

/**
 * Outcome of price extraction.
 * NOTE(review): the exact meaning of `PARTIAL` is not visible in this file — confirm with its producer.
 */
export type PriceStatus = 'EXTRACTED' | 'REQUIRES_TRIP_CONTEXT' | 'UNKNOWN' | 'PARTIAL';

/** Provenance of the sleeping-arrangement data attached to a listing. */
export type SleepingDataQuality = 'EXACT' | 'DERIVED' | 'UNKNOWN';
|
||||
|
||||
/** An extracted value together with where it came from and how reliable it is. */
export interface FieldSource<T> {
  /** The extracted value. */
  value: T;
  /** Where the value was obtained. */
  source: DataSource;
  /** Reliability of the value. */
  confidence: Confidence;
}
|
||||
|
||||
/**
 * Create a FieldSource object from a value, its source, and its confidence.
 *
 * @param value - The extracted value (may be `null` when `T` allows it).
 * @param source - Where the value was obtained.
 * @param confidence - Reliability of the value.
 * @returns A new `FieldSource<T>` holding the three arguments.
 */
export function field<T>(value: T, source: DataSource, confidence: Confidence): FieldSource<T> {
  const result: FieldSource<T> = { value, source, confidence };
  return result;
}
|
||||
|
||||
/**
 * Merge two FieldSources by taking the first whose value is neither `null`
 * nor `undefined`. Priority: primary over secondary.
 *
 * Falsy but present values (`0`, `''`, `false`) count as values and are kept.
 *
 * @param primary - Preferred field; may be `null`.
 * @param secondary - Fallback field; may be `null`.
 * @returns The chosen input object itself (not a copy). When neither has a
 *   value, a fresh `{ value: null, source: 'derived', confidence: 'low' }`.
 */
export function mergeField<T>(primary: FieldSource<T> | null, secondary: FieldSource<T> | null): FieldSource<T> {
  for (const candidate of [primary, secondary]) {
    // `!= null` deliberately matches both null and undefined.
    if (candidate != null && candidate.value != null) {
      return candidate;
    }
  }

  // Nothing usable: return null with lowest confidence. The cast is needed
  // because the signature does not require T to include null.
  return { value: null as T, source: 'derived', confidence: 'low' };
}
|
||||
|
||||
/** Kinds of sleeping furniture a listing can offer. */
export type BedType =
  | 'DOUBLE'
  | 'SINGLE'
  | 'SOFA_BED'
  | 'SOFA'
  | 'AIR_MATTRESS'
  | 'FUTON'
  | 'BUNK'
  | 'EXTRA_MATTRESS'
  | 'QUEEN'
  | 'KING'
  | 'UNKNOWN';
|
||||
|
||||
/** One kind of bed in a listing, with how many there are and how many people each sleeps. */
export interface SleepingOption {
  /** Kind of bed. */
  bedType: BedType;
  /** Number of units of this bed type. */
  quantity: number;
  /** Sleeping spots provided by a single unit. */
  spotsPerUnit: number;
  /** `FULL` for proper beds, `AUXILIARY` for makeshift options. */
  quality: 'FULL' | 'AUXILIARY';
  /** Human-readable label for the bed type. */
  label?: string;
  /** Text the option was parsed from, when available. */
  rawText?: string;
}
|
||||
|
||||
/** Optional stay details that influence which price a listing shows. */
export interface TripContext {
  /** Check-in date string as supplied by the caller or URL. */
  checkIn?: string;
  /** Check-out date string as supplied by the caller or URL. */
  checkOut?: string;
  /** Number of adult guests. */
  adults?: number;
}
|
||||
|
||||
/** Result of normalizing a listing URL, including metadata extracted from it. */
export interface NormalizedUrl {
  /** The URL as provided. */
  original: string;
  /** The normalized form of the URL. */
  normalized: string;
  /** Listing ID found in the URL, or `null` when none was found. */
  externalId: string | null;
  /** Trip context associated with the URL. */
  tripContext: TripContext;
}
|
||||
|
||||
/**
 * Everything extracted from a single listing page.
 * Scalar facts are wrapped in `FieldSource` so each carries its own source
 * and confidence.
 */
export interface ExtractedListing {
  // URLs
  originalUrl: string;
  normalizedUrl: string;
  externalId: string | null;

  // Basic Info
  title: FieldSource<string | null>;
  description: FieldSource<string | null>;

  // Location
  locationText: FieldSource<string | null>;
  latitude: FieldSource<number | null>;
  longitude: FieldSource<number | null>;

  // Pricing
  tripContext: TripContext;
  nightlyPrice: FieldSource<number | null>;
  totalPrice: FieldSource<number | null>;
  priceStatus: PriceStatus;

  // Rating
  rating: FieldSource<number | null>;
  reviewCount: FieldSource<number | null>;

  // Capacity
  guestCount: FieldSource<number | null>;
  officialGuestCount: FieldSource<number | null>;
  bedrooms: FieldSource<number | null>;
  beds: FieldSource<number | null>;
  bathrooms: FieldSource<number | null>;

  // Sleeping
  sleepingOptions: SleepingOption[];
  maxSleepingPlaces: number;
  suitableFor4: boolean;
  extraMattressesNeededFor4: number;
  sleepingDataQuality: SleepingDataQuality;

  // Host
  hostName: FieldSource<string | null>;

  // Amenities
  amenities: string[];

  // Images
  images: string[];
  coverImage: string | null;

  // Other
  cancellationPolicy: FieldSource<string | null>;

  // Debug
  rawSnippets: Record<string, string>;
  extractionLog: string[];
}
|
||||
71
src/lib/airbnb/url-normalizer.ts
Normal file
71
src/lib/airbnb/url-normalizer.ts
Normal file
@ -0,0 +1,71 @@
|
||||
import { TripContext, NormalizedUrl } from './types';
|
||||
|
||||
/**
 * Extracts the Airbnb listing ID from a URL.
 * Matches patterns like /rooms/12345 or /rooms/12345/
 *
 * @param url - Any string; it does not need to be a valid URL.
 * @returns The digits following `/rooms/`, or `null` when the pattern is absent.
 */
export function extractAirbnbExternalId(url: string): string | null {
  return /\/rooms\/(\d+)/.exec(url)?.[1] ?? null;
}
|
||||
|
||||
/**
 * Extracts trip context from URL query parameters.
 * Looks for: check_in / checkIn, check_out / checkOut, adults / adults[]
 *
 * @param url - Absolute URL to inspect.
 * @returns The parameters that are present. A missing or non-numeric `adults`
 *   value is reported as `undefined`. Returns `{}` when `url` cannot be parsed.
 */
export function extractTripContext(url: string): TripContext {
  let params: URLSearchParams;
  try {
    params = new URL(url).searchParams;
  } catch {
    // Not an absolute URL: there is no query string to read.
    return {};
  }

  // `||` (not `??`) so that an empty parameter falls through to the
  // alternative spelling and finally to undefined.
  const firstOf = (...names: string[]): string | undefined => {
    for (const name of names) {
      const value = params.get(name);
      if (value) return value;
    }
    return undefined;
  };

  const adultsStr = firstOf('adults', 'adults[]');
  const adults = adultsStr ? parseInt(adultsStr, 10) : undefined;

  return {
    checkIn: firstOf('check_in', 'checkIn'),
    checkOut: firstOf('check_out', 'checkOut'),
    // parseInt yields NaN for values like "abc"; never surface that as a count.
    adults: adults === undefined || Number.isNaN(adults) ? undefined : adults,
  };
}
|
||||
|
||||
/**
 * Normalizes an Airbnb URL by:
 * - Removing hash
 * - Removing query params (trip context extracted separately)
 * - Removing trailing slashes
 * - Removing www prefix
 * - Lowercasing hostname
 *
 * @param url - URL to normalize; surrounding whitespace is ignored.
 * @returns The normalized URL, or the trimmed input when it cannot be parsed
 *   as an absolute URL.
 */
export function normalizeAirbnbUrl(url: string): string {
  const trimmed = url.trim();

  let parsed: URL;
  try {
    parsed = new URL(trimmed);
  } catch {
    return trimmed;
  }

  parsed.hash = '';
  parsed.search = '';
  parsed.pathname = parsed.pathname.replace(/\/+$/, '');
  parsed.hostname = parsed.hostname.replace(/^www\./, '').toLowerCase();
  return parsed.toString();
}
|
||||
|
||||
/**
 * Main function: normalizes a URL and extracts all metadata from it.
 *
 * @param url - Listing URL as entered; surrounding whitespace is ignored.
 * @returns The trimmed original, its normalized form, the listing ID taken
 *   from the normalized URL, and the trip context taken from the original
 *   (query parameters are removed by normalization).
 */
export function normalizeAirbnbUrlWithContext(url: string): NormalizedUrl {
  const original = url.trim();
  const normalized = normalizeAirbnbUrl(original);

  return {
    original,
    normalized,
    externalId: extractAirbnbExternalId(normalized),
    tripContext: extractTripContext(original),
  };
}
|
||||
Loading…
x
Reference in New Issue
Block a user