// airbnb-finder/src/lib/airbnb/parsers/text-patterns.ts
// (repository viewer listing: 124 lines, 3.7 KiB, TypeScript)

/**
* Text pattern parsers for extracting data from visible HTML text
* Supports both German and English patterns
*/
// ---------------------------------------------------------------------------
// Patterns
//
// Every pattern is case-insensitive and unanchored: String.prototype.match
// returns the first occurrence anywhere in the text it is applied to.
// ---------------------------------------------------------------------------

// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants, including the
// singular forms ("1 Gast · 1 Schlafzimmer · 1 Bett · 1 Bad").
// Groups: 1 = guests, 2 = bedrooms, 3 = beds, 4 = bathrooms ("1.5" / "1,5" allowed).
const CAPACITY_PATTERN =
  /(\d+)\s*(?:guests?|gast|gäste)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)\s*[·•]\s*(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bad(?:ezimmer)?|bäder)/i;

// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 1.234 Bewertungen"
// Groups: 1 = rating (0.x – 5.0), 2 = review count (may contain thousands separators).
// - The leading (?:^|[^\d.,]) keeps the rating from starting inside a larger
//   number such as "99.50".
// - (?!\d) after the rating stops the engine from backtracking into the
//   fraction; without it "4.88 out of 5" matched as rating "4.8", count "8".
// - (?![.,]?\d) after the count rejects a count that is only the head of a
//   longer number ("2.5 km").
// A review count is required; a bare rating does not match.
const RATING_PATTERN =
  /(?:^|[^\d.,])([0-4][.,]\d+|5[.,]0+)(?!\d)\s*[·•(]?\s*(\d{1,3}(?:[.,]\d{3})+|\d+)(?![.,]?\d)\s*(?:reviews?|bewertungen)?\)?/i;

// "Hosted by David" or "Gehostet von David"
// Group 1 = everything up to the next newline or bullet separator.
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i;

// "€ 150 / night" or "$1,250 per night" or "1.250,50 € pro Nacht"
// Group 1 = amount, either with 3-digit thousands groups or plain, each with
// an optional 1–2 digit decimal part.
// The leading (?:^|[^\d.,]) prevents matching only the tail of a grouped
// number (previously "$1,250 per night" matched "250").
const PRICE_PATTERN =
  /(?:^|[^\d.,])[€$]?\s*(\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;

// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests" or "bis zu 6 Gäste"
// Group 1 is set when the keyword precedes the number, group 2 when it follows.
const MAX_GUESTS_PATTERN =
  /(?:max(?:\.|imum|imal)?|up to|bis zu)\s*(\d+)\s*(?:guests?|gast|gäste)|(\d+)\s*(?:guests?|gast|gäste)\s*max(?:\.|imum|imal)?/i;

/** Numbers extracted from a capacity summary line. */
export interface CapacityFacts {
  /** Number of guests. */
  guests: number;
  /** Number of bedrooms. */
  bedrooms: number;
  /** Number of beds. */
  beds: number;
  /** Number of bathrooms; may be fractional (e.g. 1.5). */
  bathrooms: number;
}

/** Rating and review count extracted from a rating summary. */
export interface RatingFacts {
  /** Average rating as a decimal number. */
  rating: number;
  /** Number of reviews the rating is based on. */
  reviewCount: number;
}

/**
 * Convert an amount captured by PRICE_PATTERN into a number.
 *
 * A trailing separator followed by one or two digits is treated as the decimal
 * part; every other "." or "," is a thousands separator and is dropped. This
 * handles both "1,250.50" and "1.250,50".
 */
function parseLocalizedNumber(raw: string): number {
  const decimal = raw.match(/[.,](\d{1,2})$/);
  const integerDigits = (decimal ? raw.slice(0, decimal.index) : raw).replace(/[.,]/g, '');
  return Number(decimal ? `${integerDigits}.${decimal[1]}` : integerDigits);
}

/**
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath".
 *
 * @param text - Text to search; the first capacity line found is used.
 * @returns The four counts, or `null` when no capacity line is present.
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  const match = text.match(CAPACITY_PATTERN);
  if (!match) return null;
  return {
    guests: parseInt(match[1], 10),
    bedrooms: parseInt(match[2], 10),
    beds: parseInt(match[3], 10),
    // Bathrooms may use a decimal comma ("1,5 Bäder").
    bathrooms: parseFloat(match[4].replace(',', '.')),
  };
}

/**
 * Parse rating from text like "4.88 · 200 reviews".
 *
 * @param text - Text to search; the first rating/count pair found is used.
 * @returns Rating and review count, or `null` when no such pair is present.
 */
export function parseRating(text: string): RatingFacts | null {
  const match = text.match(RATING_PATTERN);
  if (!match) return null;
  const rating = parseFloat(match[1].replace(',', '.'));
  // The count may be grouped ("1,234" / "1.234"); strip separators before parsing.
  const reviewCount = parseInt(match[2].replace(/[.,]/g, ''), 10);
  if (Number.isNaN(rating) || Number.isNaN(reviewCount)) return null;
  return { rating, reviewCount };
}

/**
 * Parse host name from text like "Hosted by David".
 *
 * @param text - Text to search.
 * @returns The trimmed host name, or `null` when the phrase is absent or is
 *   followed only by whitespace.
 */
export function parseHost(text: string): string | null {
  const match = text.match(HOST_PATTERN);
  if (!match) return null;
  const name = match[1].trim();
  return name.length > 0 ? name : null;
}

/**
 * Parse price from text like "€ 150 / night".
 *
 * @param text - Text to search; the first per-night price found is used.
 * @returns The amount as a number (thousands separators removed, decimal
 *   comma normalised), or `null` when no per-night price is present.
 */
export function parsePriceFromText(text: string): number | null {
  const match = text.match(PRICE_PATTERN);
  if (!match) return null;
  const price = parseLocalizedNumber(match[1]);
  return Number.isFinite(price) ? price : null;
}

/**
 * Parse max guests from text like "6 guests maximum".
 *
 * @param text - Text to search.
 * @returns The guest limit, or `null` when no limit phrase is present.
 */
export function parseMaxGuests(text: string): number | null {
  const match = text.match(MAX_GUESTS_PATTERN);
  if (!match) return null;
  // Exactly one of the two groups participates, depending on word order.
  const value = match[1] ?? match[2];
  return value !== undefined ? parseInt(value, 10) : null;
}
/** Named character references decoded by extractVisibleText. */
const NAMED_ENTITIES: Readonly<Record<string, string>> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  middot: '·',
  bull: '•',
  euro: '€',
};

// Decimal (&#183;), hexadecimal (&#xB7;) and the named references listed above.
// Groups: 1 = decimal digits, 2 = hex digits, 3 = entity name.
const ENTITY_PATTERN =
  /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(nbsp|amp|lt|gt|quot|apos|middot|bull|euro));/g;

/**
 * Replacement callback for ENTITY_PATTERN.
 * Returns the decoded character, or the entity text unchanged when the numeric
 * value is not a valid Unicode code point.
 */
function decodeEntity(entity: string, decimal?: string, hex?: string, name?: string): string {
  if (name !== undefined) return NAMED_ENTITIES[name] ?? entity;
  const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex ?? '', 16);
  // String.fromCodePoint throws a RangeError outside 0..0x10FFFF.
  if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return entity;
  return String.fromCodePoint(codePoint);
}

/**
 * Extract all text content from HTML for pattern matching.
 *
 * Script, style and comment contents are dropped, tags are removed, and the
 * character references handled by ENTITY_PATTERN are decoded in a single pass
 * (so "&amp;lt;" becomes "&lt;", not "<").
 *
 * Line structure: whitespace in the HTML source is collapsed to single spaces,
 * while the end of a block element and every `<br>` become a line break.
 * Consecutive line breaks are merged and the result is trimmed.
 *
 * @param html - Raw HTML markup.
 * @returns The visible text, one line per block-level run of content.
 */
export function extractVisibleText(html: string): string {
  const text = html
    // Drop non-visible content first so nothing inside it can match later.
    .replace(/<script\b[^>]*>[\s\S]*?<\/script[^>]*>/gi, ' ')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style[^>]*>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    // Source whitespace (including newlines) is not significant in HTML;
    // flatten it so that only block boundaries produce line breaks below.
    .replace(/\s+/g, ' ')
    // Closing block tags and <br> / <br/> mark line boundaries.
    .replace(
      /<\/(?:div|p|br|li|tr|td|th|h[1-6]|section|article|header|footer)\b[^>]*>|<br\b[^>]*>/gi,
      '\n',
    )
    // Remove remaining tags.
    .replace(/<[^>]+>/g, ' ')
    // Decode entities after tag removal so decoded "<" / ">" stay as text.
    .replace(ENTITY_PATTERN, decodeEntity);

  return text
    // Collapse horizontal whitespace (anything in \s except "\n").
    .replace(/[^\S\n]+/g, ' ')
    // Merge runs of line breaks and strip the spaces around them.
    .replace(/ ?\n[ \n]*/g, '\n')
    .trim();
}