/**
 * Text pattern parsers for extracting data from visible HTML text.
 * Supports both German and English patterns.
 */

// Main capacity pattern - "2 guests · 1 bedroom · 2 beds · 1 bath"
// German singular and plural forms are spelled out explicitly
// (Gast/Gäste, Bett/Betten, Bad/Badezimmer/Bäder) instead of making the
// last letter optional, which never matched the singular forms.
const CAPACITY_PATTERN =
  /(\d+)\s*(?:guests?|gäste|gast)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)\s*[·•]\s*(\d+)\s*(?:beds?|betten|bett)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:bath|bäder|bad)/i;

// Fallback patterns for different orders
// "2 guests · 1 bedroom"
const CAPACITY_FALLBACK_1 =
  /(\d+)\s*(?:guests?|gäste|gast)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)/i;
// "2 beds · 1 bath"
const CAPACITY_FALLBACK_2 =
  /(\d+)\s*(?:beds?|betten|bett)\s*[·•]\s*(\d+)\s*(?:bath|bäder|bad)/i;

// Rating patterns - "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
// The (?!\d) lookahead stops the engine from backtracking inside the rating
// itself: without it a bare "4.88" matched as rating "4.8" + review count "8".
const RATING_PATTERN =
  /(\d+[.,]\d+)(?!\d)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;

// Host patterns - "Hosted by David" or "Gehostet von David"
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•,]{2,40})/i;

// Price patterns - more flexible
// Group 1 is filled for currency-first prices ("€ 150"),
// group 2 for amount-first prices ("150 €").
const PRICE_PATTERN =
  /(?:€|EUR|\$)\s*(\d+(?:[.,]\d{0,2})?)|(\d+(?:[.,]\d{0,2})?)\s*(?:€|EUR|\$)/i;
const PRICE_PER_NIGHT = /(\d+(?:[.,]\d+)?)\s*[\/·]\s*(?:night|nacht|per\s*night)/i;

// Max guests patterns - "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
// Group 1 is filled when the qualifier comes first, group 2 when it comes last.
const MAX_GUESTS_PATTERN =
  /(?:maximum|maximal|max\.?|up to|bis zu)\s*(\d+)\s*(?:guests?|gäste|gast\b)|(\d+)\s*(?:guests?|gäste|gast\b)\s*(?:maximum|maximal|max\.?)/i;

// Bedroom patterns
const BEDROOM_PATTERN = /(\d+)\s*(?:bedrooms?|schlafzimmer)/i;

// Bed patterns
// The trailing \b keeps "3 bedrooms" from being read as "3 bed".
const BED_PATTERN = /(\d+)\s*(?:beds?|betten|bett)\b/i;

// Bathroom patterns
const BATHROOM_PATTERN = /(\d+(?:[.,]\d+)?)\s*(?:bath|badezimmer|bäder)/i;

// Title in page - <title> tag (attributes on the tag are allowed)
const TITLE_PATTERN = /<title\b[^>]*>([^<]+)<\/title>/i;

// Location from address pattern - "City Name, Country"
// Deliberately case-sensitive: every word must start with a capital letter,
// which an /i flag would silently disable.
const LOCATION_PATTERN =
  /([A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+(?:\s+[A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+)*,\s*[A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+)/;

/**
 * Capacity figures of a listing as produced by `parseCapacityFacts`.
 * When only some figures are found in the text, the missing ones are 0.
 */
export interface CapacityFacts {
  /** Number of guests. */
  guests: number;
  /** Number of bedrooms. */
  bedrooms: number;
  /** Number of beds. */
  beds: number;
  /** Number of bathrooms; may be fractional (e.g. 1.5). */
  bathrooms: number;
}

/** Rating information of a listing as produced by `parseRating`. */
export interface RatingFacts {
  /** Rating value, parsed with a dot as decimal separator (e.g. 4.88). */
  rating: number;
  /** Number of reviews; 0 when no count was captured. */
  reviewCount: number;
}

// Stand-alone guest count ("4 guests", "1 Gast", "4 Gäste"), used only when
// the full capacity line is not present. Hoisted so it is compiled once.
const GUEST_COUNT_PATTERN = /(\d+)\s*(?:guests?|gäste|gast\b)/i;

/**
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath".
 *
 * The full capacity line is tried first. If it is not present, guests,
 * bedrooms, beds and bathrooms are looked up independently and any figure
 * that cannot be found is reported as 0.
 *
 * @param text Visible text to search.
 * @returns The parsed figures, or `null` when none of them could be found.
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  // Bathroom counts may use a decimal comma ("1,5"); normalise before parsing.
  const toDecimal = (raw: string): number => parseFloat(raw.replace(',', '.'));

  // Try main pattern first
  const full = text.match(CAPACITY_PATTERN);
  if (full) {
    return {
      guests: parseInt(full[1], 10),
      bedrooms: parseInt(full[2], 10),
      beds: parseInt(full[3], 10),
      bathrooms: toDecimal(full[4]),
    };
  }

  // Fallback: try to extract individual values
  const guestMatch = text.match(GUEST_COUNT_PATTERN);
  const bedroomMatch = text.match(BEDROOM_PATTERN);
  const bedMatch = text.match(BED_PATTERN);
  const bathroomMatch = text.match(BATHROOM_PATTERN);

  if (!guestMatch && !bedroomMatch && !bedMatch && !bathroomMatch) {
    return null;
  }

  return {
    guests: guestMatch ? parseInt(guestMatch[1], 10) : 0,
    bedrooms: bedroomMatch ? parseInt(bedroomMatch[1], 10) : 0,
    beds: bedMatch ? parseInt(bedMatch[1], 10) : 0,
    bathrooms: bathroomMatch ? toDecimal(bathroomMatch[1]) : 0,
  };
}

/**
 * Parse rating from text like "4.88 · 200 reviews".
 *
 * Both "." and "," are accepted as the decimal separator of the rating.
 *
 * @param text Visible text to search.
 * @returns The rating and review count (0 when no count was captured),
 *   or `null` when the text contains no rating.
 */
export function parseRating(text: string): RatingFacts | null {
  const match = text.match(RATING_PATTERN);
  if (!match) return null;

  const rating = parseFloat(match[1].replace(',', '.'));
  if (Number.isNaN(rating)) return null;

  const reviewCount = match[2] ? parseInt(match[2], 10) : 0;
  return { rating, reviewCount };
}

/**
 * Extract title from HTML.
 *
 * The `<title>` tag is tried first; a trailing "- Airbnb…", "| Airbnb…" or
 * "– Airbnb…" suffix is removed from it. If that yields nothing, the content
 * of the `og:title` meta tag is used as-is (trimmed).
 *
 * @param html Raw HTML of the page.
 * @returns The title, or `null` when neither source provides a non-empty one.
 */
export function parseTitle(html: string): string | null {
  // Try <title> tag first (the tag may carry attributes)
  const titleMatch = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    // Clean up title - usually "Title - Airbnb" format
    const title = titleMatch[1].replace(/\s*[-|–]\s*Airbnb.*$/i, '').trim();
    if (title) return title;
  }

  // Try og:title. Each <meta> tag is inspected on its own so that the
  // property/content attributes may appear in either order.
  const metaTags = html.match(/<meta\b[^>]*>/gi) ?? [];
  for (const tag of metaTags) {
    if (!/\bproperty\s*=\s*(["'])og:title\1/i.test(tag)) continue;

    // Match the value against its own quote character so an apostrophe inside
    // a double-quoted value (content="David's Place") does not cut it short.
    const content = tag.match(/\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
    const ogTitle = (content?.[1] ?? content?.[2] ?? '').trim();
    if (ogTitle) return ogTitle;
  }

  return null;
}

/**
 * Parse host name from text like "Hosted by David" or "Gehostet von David".
 *
 * @param text Visible text to search.
 * @returns The trimmed host name, or `null` when no host phrase is found or
 *   the captured name is blank.
 */
export function parseHost(text: string): string | null {
  const match = text.match(HOST_PATTERN);
  if (!match) return null;

  // The capture may consist of whitespace only; treat that as "no host"
  // rather than returning an empty name.
  const host = match[1].trim();
  return host.length > 0 ? host : null;
}

/**
 * Parse price from text like "€ 150 / night" or "150 €".
 *
 * Both "." and "," are accepted as the decimal separator.
 *
 * @param text Visible text to search.
 * @returns The first price found, or `null` when there is none.
 */
export function parsePriceFromText(text: string): number | null {
  const match = text.match(PRICE_PATTERN);
  if (!match) return null;

  // PRICE_PATTERN has two alternatives: group 1 holds the amount when the
  // currency comes first ("€ 150"), group 2 when it comes last ("150 €").
  // Reading only group 1 threw a TypeError for amount-first prices.
  const amount = match[1] ?? match[2];
  if (amount === undefined) return null;

  const price = parseFloat(amount.replace(',', '.'));
  return Number.isNaN(price) ? null : price;
}

/**
 * Parse max guests from text like "6 guests maximum", "max. 6 Gäste" or
 * "Up to 6 guests".
 *
 * @param text Visible text to search.
 * @returns The maximum number of guests, or `null` when no such phrase is found.
 */
export function parseMaxGuests(text: string): number | null {
  const match = text.match(MAX_GUESTS_PATTERN);
  if (!match) return null;

  // Pattern has two capture groups depending on word order; exactly one of
  // them is filled for any given match.
  const value = match[1] ?? match[2];
  return value ? parseInt(value, 10) : null;
}

// Named HTML entities decoded by extractVisibleText.
const VISIBLE_TEXT_ENTITIES: Readonly<Record<string, string>> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
};

/**
 * Extract all text content from HTML for pattern matching.
 *
 * Script and style elements are dropped together with their content, every
 * remaining tag is replaced by whitespace, a small set of named entities and
 * all numeric (decimal and hexadecimal) entities are decoded, and finally all
 * runs of whitespace are collapsed into single spaces.
 *
 * @param html Raw HTML.
 * @returns The visible text on a single line, without leading or trailing
 *   whitespace.
 */
export function extractVisibleText(html: string): string {
  // Remove script and style tags including their content
  let text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ');

  // Remove remaining tags. Every tag (block-level or inline) becomes a
  // separator; since all whitespace is collapsed below, it makes no difference
  // whether that separator is a newline or a space.
  text = text.replace(/<[^>]+>/g, ' ');

  // Decode HTML entities in a single pass so that already-decoded output is
  // never decoded a second time ("&amp;lt;" must become "&lt;", not "<").
  text = text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (entity: string, body: string) => {
    if (body.charAt(0) !== '#') {
      return Object.prototype.hasOwnProperty.call(VISIBLE_TEXT_ENTITIES, body)
        ? VISIBLE_TEXT_ENTITIES[body]
        : entity; // unknown named entity: leave untouched
    }

    const isHex = body.charAt(1) === 'x' || body.charAt(1) === 'X';
    const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws for values outside the Unicode range.
    if (Number.isNaN(codePoint) || codePoint > 0x10ffff) return entity;
    return String.fromCodePoint(codePoint);
  });

  // Normalize whitespace (this also folds non-breaking spaces into plain ones)
  return text.replace(/\s+/g, ' ').trim();
}