airbnb-finder/src/lib/airbnb/parsers/text-patterns.ts

/**
 * Text pattern parsers for extracting data from visible HTML text
 * Supports both German and English patterns
 */

// "2 guests · 1 bedroom · 2 beds · 1 bath" or the German equivalent
// "2 Gäste · 1 Schlafzimmer · 2 Betten · 1 Bad".
// The German singulars (Gast, Bett, Bad) are spelled out because they are not
// prefixes of their plural forms and would otherwise never match.
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gast|gäste)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)\s*[·•]\s*(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bad|bäder)/i;

// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
// - (?!\d) keeps the engine from splitting one number into rating + count
//   (without it "4.88" backtracks to rating "4.8" and count "8").
// - The count accepts thousands separators ("1,234 reviews" / "1.234 Bewertungen").
const RATING_PATTERN = /(\d+[.,]\d+)(?!\d)\s*(?:[·•(]?\s*(\d{1,3}(?:[.,]\d{3})+|\d+)\s*(?:reviews?|bewertungen)?\)?)/i;

// "Hosted by David" or "Gehostet von David"
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i;

// "€ 150 / night" or "$150 per night" or "150 € pro Nacht"
// - The leading (?:^|[^\d.,]) guard prevents a match from starting in the
//   middle of a number (previously "€1,250 / night" was read as 250).
// - The first alternative captures grouped amounts such as "1,250" or
//   "1.250,50"; the second captures plain amounts with up to two decimals.
const PRICE_PATTERN = /(?:^|[^\d.,])(\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;

// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests",
// plus the German phrasings "maximal 6 Gäste", "bis zu 6 Gästen", "höchstens 6 Gäste"
const MAX_GUESTS_PATTERN = /\b(?:max(?:imum|imal|\.)?|up to|bis zu|höchstens)\s*(\d+)\s*(?:guests?|gast|gäste)|(\d+)\s*(?:guests?|gast|gäste)\s*max(?:imum|imal|\.)?/i;

/** Capacity figures read from a listing's summary line. */
export interface CapacityFacts {
  /** Number of guests stated in the summary line. */
  guests: number;
  /** Number of bedrooms. */
  bedrooms: number;
  /** Number of beds. */
  beds: number;
  /** Number of bathrooms; may be fractional (e.g. 1.5). */
  bathrooms: number;
}

/** Rating and review count read from a listing's rating line. */
export interface RatingFacts {
  /** Rating value with the decimal separator normalised to a dot. */
  rating: number;
  /** Number of reviews, with thousands separators removed. */
  reviewCount: number;
}

/**
 * Convert a captured price such as "150", "99,99", "1,250" or "1.250,50" to a number.
 *
 * A trailing separator followed by at most two digits is treated as the
 * decimal part; every other separator is treated as a thousands separator.
 */
function parsePriceAmount(raw: string): number {
  const decimal = /[.,](\d{0,2})$/.exec(raw);
  const whole = decimal ? raw.slice(0, decimal.index) : raw;
  const fraction = decimal ? decimal[1] : '';
  const digits = whole.replace(/[.,]/g, '');
  return parseFloat(fraction ? `${digits}.${fraction}` : digits);
}

/**
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath".
 *
 * @param text Visible text to search (English or German).
 * @returns The four capacity figures of the first matching summary line, or
 *          null when no such line is present.
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  const match = text.match(CAPACITY_PATTERN);
  if (!match) return null;

  return {
    guests: parseInt(match[1], 10),
    bedrooms: parseInt(match[2], 10),
    beds: parseInt(match[3], 10),
    // German text uses a decimal comma ("1,5 Bäder").
    bathrooms: parseFloat(match[4].replace(',', '.')),
  };
}

/**
 * Parse rating from text like "4.88 · 200 reviews".
 *
 * @param text Visible text to search (English or German).
 * @returns Rating and review count of the first match, or null when no
 *          rating followed by a review count is found.
 */
export function parseRating(text: string): RatingFacts | null {
  const match = text.match(RATING_PATTERN);
  if (!match) return null;

  const rating = parseFloat(match[1].replace(',', '.'));
  if (Number.isNaN(rating)) return null;

  // Strip thousands separators before parsing ("1,234" -> 1234).
  const reviewCount = match[2] ? parseInt(match[2].replace(/[.,]/g, ''), 10) : 0;

  return { rating, reviewCount };
}

/**
 * Parse host name from text like "Hosted by David".
 *
 * @param text Visible text to search (English or German).
 * @returns The trimmed text following the "hosted by" marker up to the next
 *          newline or bullet separator, or null when the marker is missing
 *          or nothing but whitespace follows it.
 */
export function parseHost(text: string): string | null {
  const match = text.match(HOST_PATTERN);
  if (!match) return null;

  const name = match[1].trim();
  // The capture can consist of whitespace only; treat that as "no host".
  return name.length > 0 ? name : null;
}

/**
 * Parse price from text like "€ 150 / night".
 *
 * @param text Visible text to search (English or German).
 * @returns The nightly price as a number (thousands separators removed,
 *          decimal comma normalised), or null when no price is found.
 */
export function parsePriceFromText(text: string): number | null {
  const match = text.match(PRICE_PATTERN);
  if (!match) return null;

  const price = parsePriceAmount(match[1]);
  return Number.isNaN(price) ? null : price;
}

/**
 * Parse max guests from text like "6 guests maximum".
 *
 * @param text Visible text to search (English or German).
 * @returns The guest limit of the first match, or null when none is found.
 */
export function parseMaxGuests(text: string): number | null {
  const match = text.match(MAX_GUESTS_PATTERN);
  if (!match) return null;

  // Group 1 is filled for "max. 6 guests", group 2 for "6 guests max."
  const value = match[1] !== undefined ? match[1] : match[2];
  return value ? parseInt(value, 10) : null;
}

/** Replacement text for the named character references that are decoded. */
const NAMED_ENTITIES: Record<string, string> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
};

/** Matches decimal (&#39;), hexadecimal (&#x27;) and the supported named references. */
const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(nbsp|amp|lt|gt|quot|apos));/g;

/**
 * Extract all text content from HTML for pattern matching.
 *
 * Script and style elements are dropped together with their contents, all
 * other tags are replaced by whitespace, character references are decoded,
 * and every run of whitespace is collapsed to a single space.
 *
 * @param html Raw HTML markup.
 * @returns The text content as a single trimmed line.
 */
export function extractVisibleText(html: string): string {
  // Remove script and style elements including their contents
  let text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ');

  // Replace closing block-level tags with newlines. The final whitespace
  // normalisation collapses these to single spaces again.
  text = text.replace(/<\/(div|p|br|li|tr|td|th|h[1-6]|section|article|header|footer)[^>]*>/gi, '\n');

  // Remove remaining tags
  text = text.replace(/<[^>]+>/g, ' ');

  // Decode character references in a single pass so that already-decoded
  // output is never decoded again ("&amp;lt;" must become "&lt;", not "<").
  text = text.replace(
    ENTITY_PATTERN,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        const decoded = NAMED_ENTITIES[name];
        return decoded !== undefined ? decoded : entity;
      }

      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex !== undefined ? hex : '', 16);
      // fromCodePoint handles astral characters (e.g. emoji) that fromCharCode
      // would truncate; references outside the Unicode range are left as-is.
      if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
        return entity;
      }
      return String.fromCodePoint(codePoint);
    },
  );

  // Normalize whitespace
  return text.replace(/\s+/g, ' ').trim();
}
fix: add field/mergeField helpers, priceStatus, trip context support - Add field() and mergeField() helper functions to types.ts - Fix location parser to use correct html parameter - Add priceStatus to import action - Import form already has trip context fields (checkIn, checkOut, adults) - Build now passes successfully 2026-03-11 15:55:45 +00:00			`/**`
			`* Text pattern parsers for extracting data from visible HTML text`
			`* Supports both German and English patterns`
			`*/`

			`// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants`
			`const CAPACITY_PATTERN = /(\d+)\s*(?:guests?\|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?\|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?\|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?\|bäder?)/i;`

			`// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"`
			`const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?\|bewertungen)?\)?)/i;`

			`// "Hosted by David" or "Gehostet von David"`
			`const HOST_PATTERN = /(?:hosted by\|gehostet von)\s+([^\n·•]+)/i;`

			`// "€ 150 / night" or "$150 per night" or "150 € pro Nacht"`
			`const PRICE_PATTERN = /[€$]?\s*(\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/\|per\|pro)\s*(?:night\|nacht)/i;`

			`// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"`
			`const MAX_GUESTS_PATTERN = /(?:max\.?\|maximum\|up to)\s*(\d+)\s*(?:guests?\|gäste?)\|(\d+)\s*(?:guests?\|gäste?)\s*(?:maximum\|max\.?)/i;`

			`export interface CapacityFacts {`
			`guests: number;`
			`bedrooms: number;`
			`beds: number;`
			`bathrooms: number;`
			`}`

			`export interface RatingFacts {`
			`rating: number;`
			`reviewCount: number;`
			`}`

			`/**`
			`* Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath"`
			`*/`
			`export function parseCapacityFacts(text: string): CapacityFacts \| null {`
			`const match = text.match(CAPACITY_PATTERN);`
			`if (!match) return null;`

			`return {`
			`guests: parseInt(match[1], 10),`
			`bedrooms: parseInt(match[2], 10),`
			`beds: parseInt(match[3], 10),`
			`bathrooms: parseFloat(match[4].replace(',', '.')),`
			`};`
			`}`

			`/**`
			`* Parse rating from text like "4.88 · 200 reviews"`
			`*/`
			`export function parseRating(text: string): RatingFacts \| null {`
			`const match = text.match(RATING_PATTERN);`
			`if (!match) return null;`

			`const rating = parseFloat(match[1].replace(',', '.'));`
			`const reviewCount = match[2] ? parseInt(match[2], 10) : 0;`

			`if (isNaN(rating)) return null;`

			`return { rating, reviewCount };`
			`}`

			`/**`
			`* Parse host name from text like "Hosted by David"`
			`*/`
			`export function parseHost(text: string): string \| null {`
			`const match = text.match(HOST_PATTERN);`
			`if (!match) return null;`

			`return match[1].trim();`
			`}`

			`/**`
			`* Parse price from text like "€ 150 / night"`
			`*/`
			`export function parsePriceFromText(text: string): number \| null {`
			`const match = text.match(PRICE_PATTERN);`
			`if (!match) return null;`

			`const price = parseFloat(match[1].replace(',', '.'));`
			`return isNaN(price) ? null : price;`
			`}`

			`/**`
			`* Parse max guests from text like "6 guests maximum"`
			`*/`
			`export function parseMaxGuests(text: string): number \| null {`
			`const match = text.match(MAX_GUESTS_PATTERN);`
			`if (!match) return null;`

			`// Pattern has two capture groups depending on word order`
			`const value = match[1] \|\| match[2];`
			`return value ? parseInt(value, 10) : null;`
			`}`

			`/**`
			`* Extract all text content from HTML for pattern matching`
			`*/`
			`export function extractVisibleText(html: string): string {`
			`// Remove script and style tags`
			`let text = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ');`
			`text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ');`

			`// Replace block elements with newlines`
			`text = text.replace(/<\/(div\|p\|br\|li\|tr\|td\|th\|h[1-6]\|section\|article\|header\|footer)[^>]*>/gi, '\n');`

			`// Remove remaining tags`
			`text = text.replace(/<[^>]+>/g, ' ');`

			`// Decode HTML entities`
			`text = text`
			`.replace(/&nbsp;/g, ' ')`
			`.replace(/&amp;/g, '&')`
			`.replace(/&lt;/g, '<')`
			`.replace(/&gt;/g, '>')`
			`.replace(/&quot;/g, '"')`
			`.replace(/&#(\d+);/g, (_, num) => String.fromCharCode(parseInt(num, 10)));`

			`// Normalize whitespace`
			`text = text.replace(/\s+/g, ' ').trim();`

			`return text;`
			`}`