airbnb-finder/src/lib/airbnb/parsers/text-patterns.ts
AI 5e5326dbcc fix: improve scraper robustness, fix delete button, add title parsing
- Add parseTitle for fallback title extraction
- Improve text-patterns with more robust regex patterns
- Add more fallback patterns for capacity, beds, etc.
- Fix delete button to use FormData properly
- Add logging to scraper for debugging
2026-03-11 16:15:20 +00:00

181 lines
5.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Text pattern parsers for extracting data from visible HTML text
* Supports both German and English patterns
*/
// ---------------------------------------------------------------------------
// Regular expressions shared by the parsers in this module.
// All of them are non-global, so they carry no `lastIndex` state between calls.
// ---------------------------------------------------------------------------

// Main capacity pattern - "2 guests · 1 bedroom · 2 beds · 1 bath".
// Groups: 1 = guests, 2 = bedrooms, 3 = beds, 4 = bathrooms (may be "1.5" / "1,5").
// German singular and long forms (Gast, Bett, Bad, Badezimmer) are accepted too.
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gast|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|badezimmer|bad|bäder?)/i;
// Fallback patterns for different orders
// Groups: 1 = guests, 2 = bedrooms.
const CAPACITY_FALLBACK_1 = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)/i;
// Groups: 1 = beds, 2 = bathrooms.
const CAPACITY_FALLBACK_2 = /(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+)\s*(?:baths?|bäder?)/i;
// Rating patterns - "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
// Groups: 1 = rating (decimal with "." or ","), 2 = review count.
const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;
// Host patterns - "Hosted by David" or "Gehostet von David"
// Group 1 = host name: 2-40 characters, stopping at a newline, separator dot or comma.
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•,]{2,40})/i;
// Price patterns - more flexible
// Two alternatives: group 1 is set for symbol-first text ("€ 150"),
// group 2 for amount-first text ("150 €"). Only one of them is defined per match.
const PRICE_PATTERN = /(?:€|EUR|\$)\s*(\d+(?:[.,]\d{0,2})?)|(\d+(?:[.,]\d{0,2})?)\s*(?:€|EUR|\$)/i;
// Group 1 = amount followed by "/ night", "· Nacht" or "/ per night".
const PRICE_PER_NIGHT = /(\d+(?:[.,]\d+)?)\s*[\/·]\s*(?:night|nacht|per\s*night)/i;
// Max guests patterns - "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
// Group 1 is set when the limit word comes first, group 2 when the number comes first.
const MAX_GUESTS_PATTERN = /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste?)|(\d+)\s*(?:guests?|gäste?)\s*(?:maximum|max\.?)/i;
// Bedroom patterns
// Group 1 = count. "bedroom" also matches the prefix of "bedrooms".
const BEDROOM_PATTERN = /(\d+)\s*(?:bedroom|schlafzimmer)/i;
// Bed patterns
// Group 1 = count. The trailing \b keeps "3 bedrooms" from being read as "3 bed".
const BED_PATTERN = /(\d+)\s*(?:beds?|bett(?:en)?)\b/i;
// Bathroom patterns
// Group 1 = count, possibly fractional. "bath" also matches the prefix of "bathroom(s)".
const BATHROOM_PATTERN = /(\d+(?:[.,]\d+)?)\s*(?:bath|baths|badezimmer|bäder)/i;
// Title in page - <title> tag
const TITLE_PATTERN = /<title>([^<]+)<\/title>/i;
// Location from address pattern - "City, Country" / "Some City, Region".
// Deliberately case-sensitive: every word has to start with a capital letter,
// which an `i` flag would silently disable.
const LOCATION_PATTERN = /([A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+(?:\s+[A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+)*,\s*[A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+)/;
/**
 * Capacity figures of a listing, as returned by `parseCapacityFacts`.
 */
export interface CapacityFacts {
  /** Number of guests. */
  guests: number;
  /** Number of bedrooms. */
  bedrooms: number;
  /** Number of beds. */
  beds: number;
  /** Number of bathrooms; parsed as a float, so values such as 1.5 are possible. */
  bathrooms: number;
}
/**
 * Rating figures of a listing, as returned by `parseRating`.
 */
export interface RatingFacts {
  /** Average rating as a decimal number (a decimal comma is normalised to a dot). */
  rating: number;
  /** Number of reviews the rating is based on. */
  reviewCount: number;
}
/**
 * Extract guest / bedroom / bed / bathroom counts from listing text.
 *
 * The combined "2 guests · 1 bedroom · 2 beds · 1 bath" line is tried first.
 * If it is absent, each figure is looked up on its own and any figure that
 * cannot be found is reported as 0.
 *
 * @param text Text to search.
 * @returns The parsed figures, or `null` when none of the four could be found.
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  // Whole numbers are read base-10; bathrooms accept a decimal comma.
  const toCount = (digits: string): number => parseInt(digits, 10);
  const toDecimal = (digits: string): number => parseFloat(digits.replace(',', '.'));

  const combined = text.match(CAPACITY_PATTERN);
  if (combined !== null) {
    const [, guests, bedrooms, beds, bathrooms] = combined;
    return {
      guests: toCount(guests),
      bedrooms: toCount(bedrooms),
      beds: toCount(beds),
      bathrooms: toDecimal(bathrooms),
    };
  }

  // No combined line: look for every figure independently.
  const found = {
    guests: text.match(/(\d+)\s*(?:guests?|gäste?)/i),
    bedrooms: text.match(BEDROOM_PATTERN),
    beds: text.match(BED_PATTERN),
    bathrooms: text.match(BATHROOM_PATTERN),
  };

  const nothingFound = !found.bedrooms && !found.beds && !found.bathrooms && !found.guests;
  if (nothingFound) {
    return null;
  }

  return {
    guests: found.guests ? toCount(found.guests[1]) : 0,
    bedrooms: found.bedrooms ? toCount(found.bedrooms[1]) : 0,
    beds: found.beds ? toCount(found.beds[1]) : 0,
    bathrooms: found.bathrooms ? toDecimal(found.bathrooms[1]) : 0,
  };
}
/**
 * Extract the rating and review count from text such as "4.88 · 200 reviews".
 *
 * @param text Text to search.
 * @returns The rating (decimal comma normalised to a dot) together with the
 *   review count, or `null` when the text contains no rating.
 */
export function parseRating(text: string): RatingFacts | null {
  const found = text.match(RATING_PATTERN);
  if (found === null) {
    return null;
  }

  const [, ratingText, countText] = found;
  const rating = parseFloat(ratingText.replace(',', '.'));
  if (Number.isNaN(rating)) {
    return null;
  }

  // A missing count group is reported as zero reviews.
  const reviewCount = countText ? parseInt(countText, 10) : 0;
  return { rating, reviewCount };
}
/**
 * Extract the listing title from raw HTML.
 *
 * The `<title>` element is preferred; a trailing "- Airbnb…" / "| Airbnb…"
 * suffix is stripped from it. If there is no usable `<title>`, the
 * `og:title` meta tag is used instead.
 *
 * @param html Raw HTML of the page.
 * @returns The trimmed title, or `null` when neither source yields a non-empty value.
 */
export function parseTitle(html: string): string | null {
  // Allow attributes on the tag, e.g. <title data-rh="true">.
  const titleMatch = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    // Clean up title - usually "Title - Airbnb" format
    const title = titleMatch[1].replace(/\s*[-|]\s*Airbnb.*$/i, '').trim();
    if (title) return title;
  }

  // Fall back to og:title. Every <meta> tag is inspected on its own so the
  // order of the `property` and `content` attributes does not matter.
  const metaTags = html.match(/<meta\b[^>]*>/gi) ?? [];
  for (const tag of metaTags) {
    if (!/\bproperty\s*=\s*(["'])og:title\1/i.test(tag)) continue;

    // Read up to the matching closing quote, so an apostrophe inside a
    // double-quoted value (content="David's Loft") is kept.
    const content = tag.match(/\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
    const value = (content?.[1] ?? content?.[2] ?? '').trim();
    if (value) return value;
  }

  return null;
}
/**
 * Extract the host name from text such as "Hosted by David" or
 * "Gehostet von David".
 *
 * @param text Text to search.
 * @returns The trimmed host name, or `null` when no host phrase is present.
 */
export function parseHost(text: string): string | null {
  const hostName = text.match(HOST_PATTERN)?.[1];
  return hostName === undefined ? null : hostName.trim();
}
/**
 * Parse a price from text like "€ 150 / night" or "150 €".
 *
 * NOTE(review): PRICE_PATTERN allows at most two digits after a "." or ",",
 * so an amount with a thousands separator ("1.250 €") is read only up to
 * its second decimal digit - confirm whether such amounts can occur.
 *
 * @param text Text to search.
 * @returns The first price found as a number (decimal comma normalised to a
 *   dot), or `null` when the text contains no price.
 */
export function parsePriceFromText(text: string): number | null {
  const match = text.match(PRICE_PATTERN);
  if (!match) return null;

  // PRICE_PATTERN has two alternatives: group 1 is filled for symbol-first
  // text ("€ 150"), group 2 for amount-first text ("150 €"). Reading only
  // group 1 would hit `undefined` for the second form.
  const amount = match[1] ?? match[2];
  if (amount === undefined) return null;

  const price = parseFloat(amount.replace(',', '.'));
  return Number.isNaN(price) ? null : price;
}
/**
 * Extract the maximum guest count from text such as "6 guests maximum",
 * "max. 6 Gäste" or "Up to 6 guests".
 *
 * @param text Text to search.
 * @returns The guest limit, or `null` when the text states none.
 */
export function parseMaxGuests(text: string): number | null {
  const found = text.match(MAX_GUESTS_PATTERN);
  if (found === null) {
    return null;
  }

  // Which group is filled depends on the word order: limit word first
  // ("max. 6 guests") or number first ("6 guests maximum").
  const [, limitWordFirst, numberFirst] = found;
  const digits = limitWordFirst || numberFirst;
  if (!digits) {
    return null;
  }
  return parseInt(digits, 10);
}
/**
 * Reduce HTML to its text content for pattern matching.
 *
 * Steps: drop `<script>` / `<style>` elements including their content, strip
 * all remaining tags, decode character references, then collapse every run
 * of whitespace into a single space.
 *
 * Character references are decoded in a single pass, so already-escaped
 * text such as `&amp;lt;` becomes `&lt;` and is not decoded a second time.
 * Supported: `&nbsp;` (as a plain space), `&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`, decimal (`&#233;`) and hexadecimal (`&#xE9;`) references.
 * Unknown or out-of-range references are left untouched.
 *
 * @param html Raw HTML.
 * @returns Trimmed single-line text.
 */
export function extractVisibleText(html: string): string {
  const namedEntities = new Map<string, string>([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  const withoutMarkup = html
    // Remove script and style elements together with their content.
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ')
    // Closing block-level tags become line breaks. The final whitespace
    // normalisation below turns these into single spaces again.
    .replace(/<\/(div|p|br|li|tr|td|th|h[1-6]|section|article|header|footer)[^>]*>/gi, '\n')
    // Remove remaining tags.
    .replace(/<[^>]+>/g, ' ');

  const decoded = withoutMarkup.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-z]+));/g,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        return namedEntities.get(name) ?? entity;
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
      // fromCodePoint handles characters above U+FFFF but throws on values
      // beyond the Unicode range, so those (and NaN) are left as written.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );

  // Normalize whitespace.
  return decoded.replace(/\s+/g, ' ').trim();
}