/**
 * Text pattern parsers for extracting listing data from visible HTML text.
 * Supports both German and English wording.
 */

// Main capacity pattern - "2 guests · 1 bedroom · 2 beds · 1 bath"
// German equivalent   - "2 Gäste · 1 Schlafzimmer · 1 Bett · 1 Badezimmer"
// Bare "Bad" is accepted here because the surrounding separators anchor the match.
const CAPACITY_PATTERN =
  /(\d+)\s*(?:guests?|gast|gästen?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)\s*[·•]\s*(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:bath(?:room)?s?|bad(?:ezimmer)?|bäder)/i;

// Partial capacity patterns for shortened fact lines.
// NOTE(review): not referenced by any parser visible in this module; kept in case
// other code relies on them.
const CAPACITY_FALLBACK_1 =
  /(\d+)\s*(?:guests?|gast|gästen?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)/i;
const CAPACITY_FALLBACK_2 =
  /(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+)\s*(?:bath(?:room)?s?|bad(?:ezimmer)?|bäder)/i;

// Rating patterns - "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen".
// The review-count part is optional so that a bare "4.88" is not split by
// backtracking into rating "4.8" and review count "8".
const RATING_PATTERN =
  /(\d+[.,]\d+)(?:\s*[·•(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)?/i;

// Host patterns - "Hosted by David" or "Gehostet von David"
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•,]{2,40})/i;

// Price patterns - currency before the amount (group 1) or after it (group 2).
// Thousands separators are not supported: "1,250" is read as "1,25".
const PRICE_PATTERN =
  /(?:€|EUR|\$)\s*(\d+(?:[.,]\d{0,2})?)|(\d+(?:[.,]\d{0,2})?)\s*(?:€|EUR|\$)/i;
// NOTE(review): not referenced by any parser visible in this module.
const PRICE_PER_NIGHT = /(\d+(?:[.,]\d+)?)\s*[\/·]\s*(?:night|nacht|per\s*night)/i;

// Max guests patterns - "6 guests maximum", "max. 6 Gäste", "Up to 6 guests",
// "maximal 6 Gäste", "bis zu 6 Gästen". Group 1 or group 2 holds the number
// depending on word order.
const MAX_GUESTS_PATTERN =
  /(?:maximum|maximal|max\.?|up to|bis zu)\s*(\d+)\s*(?:guests?|gast|gästen?)|(\d+)\s*(?:guests?|gast|gästen?)\s*(?:maximum|maximal|max\.?)/i;

// Standalone guest count - "4 guests", "1 Gast", "4 Gäste".
// The trailing \b keeps "Gastgeber" (host) from being read as a guest count.
const GUEST_PATTERN = /(\d+)\s*(?:guests?|gast|gästen?)\b/i;

// Bedroom patterns
const BEDROOM_PATTERN = /(\d+)\s*(?:bedrooms?|schlafzimmer)/i;

// Bed patterns. The trailing \b is essential: without it "3 bedrooms" would be
// read as "3 bed" and reported as the bed count.
const BED_PATTERN = /(\d+)\s*(?:beds?|bett(?:en)?)\b/i;

// Bathroom patterns. Bare German "Bad" is deliberately left out here because this
// pattern runs unanchored over free text, where English "3 bad ..." would match.
const BATHROOM_PATTERN = /(\d+(?:[.,]\d+)?)\s*(?:bath(?:room)?s?|badezimmer|bäder)\b/i;

// Title in page - <title> tag (attributes on the tag are tolerated)
const TITLE_PATTERN = /<title\b[^>]*>([^<]+)<\/title>/i;

// og:title meta tag, in either attribute order. The content value is captured per
// quote style (group 1: double-quoted, group 2: single-quoted) so an apostrophe
// inside a double-quoted value does not truncate the title.
const OG_TITLE_PATTERNS: readonly RegExp[] = [
  /<meta\b[^>]*\bproperty=["']og:title["'][^>]*\bcontent=(?:"([^"]+)"|'([^']+)')/i,
  /<meta\b[^>]*\bcontent=(?:"([^"]+)"|'([^']+)')[^>]*\bproperty=["']og:title["']/i,
];

// Location from address pattern.
// NOTE(review): not referenced by any parser visible in this module; the `i` flag
// makes the [A-Z] classes match lowercase letters as well.
const LOCATION_PATTERN =
  /([A-Z][a-zA-ZäöüÄÖÜß]+(?:\s+[A-Z][a-zA-ZäöüÄÖÜß]+)*,\s*[A-Z][a-zA-ZäöüÄÖÜß]+)/i;

// Named HTML entities decoded by extractVisibleText. A Map (rather than an object
// literal) avoids accidental hits on prototype keys such as "constructor".
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

// Decimal (&#8364;), hexadecimal (&#x20AC;) or named (&amp;) entity.
const ENTITY_PATTERN = /&(?:#(\d+)|#x([0-9a-f]+)|([a-z][a-z0-9]*));/gi;

/** Capacity facts of a listing; values that could not be found are reported as 0. */
export interface CapacityFacts {
  guests: number;
  bedrooms: number;
  beds: number;
  bathrooms: number;
}

/** Average rating plus number of reviews (0 when no review count was found). */
export interface RatingFacts {
  rating: number;
  reviewCount: number;
}

/** Parses a decimal that may use either "." or "," as the decimal separator. */
function parseDecimal(raw: string): number {
  return parseFloat(raw.replace(',', '.'));
}

/** Returns the first capture group of `match` as an integer, or 0 when absent. */
function firstGroupAsInt(match: RegExpMatchArray | null): number {
  const digits = match?.[1];
  return digits === undefined ? 0 : parseInt(digits, 10);
}

/** Replacer for ENTITY_PATTERN; unknown or invalid entities are left untouched. */
function decodeEntity(entity: string, dec?: string, hex?: string, name?: string): string {
  if (name !== undefined) {
    return NAMED_ENTITIES.get(name) ?? entity;
  }
  const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
  const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
  // fromCodePoint (unlike fromCharCode) handles code points above U+FFFF.
  return isValid ? String.fromCodePoint(codePoint) : entity;
}

/**
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath".
 *
 * The full separator-delimited line is tried first. If it is not present, each
 * value is looked up independently and missing values default to 0.
 *
 * @param text - Visible page text.
 * @returns The parsed facts, or `null` when no capacity value is found at all.
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  const full = text.match(CAPACITY_PATTERN);
  if (full) {
    const [, guests = '', bedrooms = '', beds = '', bathrooms = ''] = full;
    return {
      guests: parseInt(guests, 10),
      bedrooms: parseInt(bedrooms, 10),
      beds: parseInt(beds, 10),
      bathrooms: parseDecimal(bathrooms),
    };
  }

  // Fallback: extract whichever individual values are present.
  const guestMatch = text.match(GUEST_PATTERN);
  const bedroomMatch = text.match(BEDROOM_PATTERN);
  const bedMatch = text.match(BED_PATTERN);
  const bathroomMatch = text.match(BATHROOM_PATTERN);

  if (!guestMatch && !bedroomMatch && !bedMatch && !bathroomMatch) {
    return null;
  }

  const bathroomRaw = bathroomMatch?.[1];
  return {
    guests: firstGroupAsInt(guestMatch),
    bedrooms: firstGroupAsInt(bedroomMatch),
    beds: firstGroupAsInt(bedMatch),
    bathrooms: bathroomRaw === undefined ? 0 : parseDecimal(bathroomRaw),
  };
}

/**
 * Parse rating from text like "4.88 · 200 reviews", "4,88 (200)" or a bare "4.88".
 *
 * @param text - Visible page text.
 * @returns Rating and review count (0 when no count follows the rating), or
 *   `null` when no decimal rating is found.
 */
export function parseRating(text: string): RatingFacts | null {
  const match = text.match(RATING_PATTERN);
  const ratingRaw = match?.[1];
  if (!match || ratingRaw === undefined) return null;

  const rating = parseDecimal(ratingRaw);
  if (isNaN(rating)) return null;

  const countRaw = match[2];
  const reviewCount = countRaw === undefined ? 0 : parseInt(countRaw, 10);
  return { rating, reviewCount };
}

/**
 * Extract the listing title from HTML.
 *
 * Uses the `<title>` tag with a trailing "- Airbnb…" / "| Airbnb…" suffix removed;
 * if that yields nothing, falls back to the raw `og:title` meta content.
 *
 * @param html - Raw page HTML.
 * @returns The title, or `null` when neither source provides one.
 */
export function parseTitle(html: string): string | null {
  const rawTitle = html.match(TITLE_PATTERN)?.[1];
  if (rawTitle !== undefined) {
    // Clean up title - usually "Title - Airbnb" format
    const title = rawTitle.replace(/\s*[-|–]\s*Airbnb.*$/i, '').trim();
    if (title) return title;
  }

  for (const pattern of OG_TITLE_PATTERNS) {
    const og = html.match(pattern);
    const content = og?.[1] ?? og?.[2];
    if (content) return content;
  }

  return null;
}

/**
 * Parse host name from text like "Hosted by David" or "Gehostet von David".
 *
 * @param text - Visible page text.
 * @returns The trimmed host name, or `null` when absent or blank.
 */
export function parseHost(text: string): string | null {
  const name = text.match(HOST_PATTERN)?.[1]?.trim();
  return name ? name : null;
}

/**
 * Parse price from text like "€ 150 / night" or "150 €".
 *
 * @param text - Visible page text.
 * @returns The first price found, or `null` when there is none.
 */
export function parsePriceFromText(text: string): number | null {
  const match = text.match(PRICE_PATTERN);
  if (!match) return null;

  // Group 1 is filled for "€ 150", group 2 for "150 €".
  const raw = match[1] ?? match[2];
  if (raw === undefined) return null;

  const price = parseDecimal(raw);
  return isNaN(price) ? null : price;
}

/**
 * Parse max guests from text like "6 guests maximum", "max. 6 Gäste" or
 * "Up to 6 guests".
 *
 * @param text - Visible page text.
 * @returns The guest limit, or `null` when no such phrase is found.
 */
export function parseMaxGuests(text: string): number | null {
  const match = text.match(MAX_GUESTS_PATTERN);
  if (!match) return null;

  // Pattern has two capture groups depending on word order
  const value = match[1] ?? match[2];
  return value === undefined ? null : parseInt(value, 10);
}

/**
 * Extract all text content from HTML for pattern matching.
 *
 * Script and style elements are dropped, remaining tags are removed, HTML
 * entities are decoded in a single pass (so "&amp;lt;" becomes "&lt;", not "<"),
 * and all whitespace runs are collapsed to a single space.
 *
 * @param html - Raw page HTML.
 * @returns Single-line, trimmed visible text.
 */
export function extractVisibleText(html: string): string {
  return (
    html
      // Remove script and style elements including their content
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ')
      // Closing block-level tags become whitespace so adjacent blocks do not fuse
      .replace(/<\/(div|p|br|li|tr|td|th|h[1-6]|section|article|header|footer)[^>]*>/gi, '\n')
      // Remove remaining tags
      .replace(/<[^>]+>/g, ' ')
      // Decode HTML entities
      .replace(ENTITY_PATTERN, decodeEntity)
      // Normalize whitespace
      .replace(/\s+/g, ' ')
      .trim()
  );
}