fix: improve scraper robustness, fix delete button, add title parsing

- Add parseTitle for fallback title extraction
- Improve text-patterns with more robust regex patterns
- Add more fallback patterns for capacity, beds, etc.
- Fix delete button to use FormData properly
- Add logging to scraper for debugging
This commit is contained in:
AI 2026-03-11 16:15:20 +00:00
parent 13bbe9d147
commit 5e5326dbcc
5 changed files with 102 additions and 25 deletions

Binary file not shown.

View File

@ -3,7 +3,20 @@
import { prisma } from "@/lib/prisma";
import { revalidatePath } from "next/cache";
export async function deleteListing(listingId: string) {
export async function deleteListing(idOrFormData: string | FormData) {
let listingId: string;
// Handle both string ID and FormData
if (typeof idOrFormData === 'string') {
listingId = idOrFormData;
} else {
listingId = idOrFormData.get("id") as string;
}
if (!listingId) {
throw new Error("Keine ID übergeben");
}
try {
// Delete related records first
await prisma.listingTag.deleteMany({

View File

@ -11,19 +11,17 @@ interface DeleteListingButtonProps {
export function DeleteListingButton({ listingId, listingTitle }: DeleteListingButtonProps) {
const [isDeleting, setIsDeleting] = useState(false);
const [showConfirm, setShowConfirm] = useState(false);
const handleDelete = async () => {
if (!confirm(`"${listingTitle}" wirklich löschen?`)) return;
setIsDeleting(true);
try {
await deleteListing(listingId);
// Page will refresh automatically due to revalidation
window.location.reload();
const formData = new FormData();
formData.append("id", listingId);
await deleteListing(formData);
} catch (error) {
alert("Fehler beim Löschen: " + (error as Error).message);
} finally {
setIsDeleting(false);
}
};

View File

@ -1,6 +1,6 @@
import * as cheerio from "cheerio";
import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns";
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText, parseTitle } from "./parsers/text-patterns";
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
import { extractPrice } from "./parsers/price";
import { extractLocation } from "./parsers/location";
@ -50,6 +50,7 @@ export async function scrapeAirbnbListing(
const sleepingOptions = parseSleepingArrangements(visibleText);
const priceData = extractPrice(html, $, tripContext);
const locationData = extractLocation($, html);
const pageTitle = parseTitle(html);
// Step 5: Build the result with priority: jsonld > text_pattern > derived
const result: ExtractedListing = {
@ -61,7 +62,7 @@ export async function scrapeAirbnbListing(
// Basic Info
title: mergeField(
jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
field(null, 'derived', 'low')
pageTitle ? field(pageTitle, 'text_pattern', 'medium') : field(null, 'derived', 'low')
),
description: mergeField(
jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
@ -181,7 +182,7 @@ export async function scrapeAirbnbListing(
}
// ============================================
// HTML Fetcher
// HTML Fetcher - with better error handling and logging
// ============================================
async function fetchHtml(url: string): Promise<string> {
@ -192,6 +193,7 @@ async function fetchHtml(url: string): Promise<string> {
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "no-cache",
"Upgrade-Insecure-Requests": "1",
},
});
@ -199,7 +201,14 @@ async function fetchHtml(url: string): Promise<string> {
throw new Error(`HTTP ${response.status} for ${url}`);
}
return response.text();
const html = await response.text();
// Debug output: report the size of the fetched document (not the length of
// the URL string) and two cheap markers that indicate whether the response
// is a real listing page rather than a block/consent page.
console.log(`[Scraper] Fetched ${html.length} chars from ${url}`);
console.log(`[Scraper] Contains 'application/ld+json': ${html.includes('application/ld+json')}`);
console.log(`[Scraper] Contains 'airbnb': ${html.toLowerCase().includes('airbnb')}`);
return html;
}
// Re-export utilities for backward compatibility

View File

@ -3,21 +3,41 @@
* Supports both German and English patterns
*/
// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?)/i;
// Main capacity pattern - "2 guests · 1 bedroom · 2 beds · 1 bath"
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?|bath)/i;
// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
// Fallback patterns for different orders
const CAPACITY_FALLBACK_1 = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)/i;
const CAPACITY_FALLBACK_2 = /(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+)\s*(?:baths?|bäder?)/i;
// Rating patterns - "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;
// "Hosted by David" or "Gehostet von David"
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i;
// Host patterns - "Hosted by David" or "Gehostet von David"
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•,]{2,40})/i;
// "€ 150 / night" or "$150 per night" or "150 € pro Nacht"
const PRICE_PATTERN = /[€$]?\s*(\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;
// Price patterns - more flexible
const PRICE_PATTERN = /(?:€|EUR|\$)\s*(\d+(?:[.,]\d{0,2})?)|(\d+(?:[.,]\d{0,2})?)\s*(?:€|EUR|\$)/i;
const PRICE_PER_NIGHT = /(\d+(?:[.,]\d+)?)\s*[\/·]\s*(?:night|nacht|per\s*night)/i;
// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
// Max guests patterns - "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
const MAX_GUESTS_PATTERN = /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste?)|(\d+)\s*(?:guests?|gäste?)\s*(?:maximum|max\.?)/i;
// Bedroom count - "2 bedrooms", "1 bedroom", "2 Schlafzimmer" (same form in singular and plural)
const BEDROOM_PATTERN = /(\d+)\s*(?:bedrooms?|schlafzimmer)/i;
// Bed count - "2 beds", "1 bed", "1 Bett", "4 Betten".
// The trailing \b keeps "3 bedrooms" from being read as "3 bed".
const BED_PATTERN = /(\d+)\s*(?:beds?|bett(?:en)?)\b/i;
// Bathroom count - "1 bath", "1.5 baths", "2 bathrooms", "1 Badezimmer", "1,5 Bäder"
const BATHROOM_PATTERN = /(\d+(?:[.,]\d+)?)\s*(?:baths?|badezimmer|bäder)/i;
// Title in page - <title> tag; attributes on the tag (e.g. data-* markers) are tolerated
const TITLE_PATTERN = /<title\b[^>]*>([^<]+)<\/title>/i;
// Location from address pattern - "City, Country" where every word starts with a capital letter.
// Deliberately case-sensitive: with the /i flag the [A-Z] classes would also accept
// lowercase letters and any "word, word" sequence would be reported as a location.
const LOCATION_PATTERN = /([A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+(?:\s+[A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+)*,\s*[A-ZÄÖÜ][a-zA-ZäöüÄÖÜß]+)/;
export interface CapacityFacts {
guests: number;
bedrooms: number;
@ -34,15 +54,33 @@ export interface RatingFacts {
* Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath"
*/
export function parseCapacityFacts(text: string): CapacityFacts | null {
// Try main pattern first
const match = text.match(CAPACITY_PATTERN);
if (!match) return null;
if (match) {
return {
guests: parseInt(match[1], 10),
bedrooms: parseInt(match[2], 10),
beds: parseInt(match[3], 10),
bathrooms: parseFloat(match[4].replace(',', '.')),
};
}
return {
guests: parseInt(match[1], 10),
bedrooms: parseInt(match[2], 10),
beds: parseInt(match[3], 10),
bathrooms: parseFloat(match[4].replace(',', '.')),
};
// Fallback: try to extract individual values
const bedroomMatch = text.match(BEDROOM_PATTERN);
const bedMatch = text.match(BED_PATTERN);
const bathroomMatch = text.match(BATHROOM_PATTERN);
const guestMatch = text.match(/(\d+)\s*(?:guests?|gäste?)/i);
if (bedroomMatch || bedMatch || bathroomMatch || guestMatch) {
return {
guests: guestMatch ? parseInt(guestMatch[1], 10) : 0,
bedrooms: bedroomMatch ? parseInt(bedroomMatch[1], 10) : 0,
beds: bedMatch ? parseInt(bedMatch[1], 10) : 0,
bathrooms: bathroomMatch ? parseFloat(bathroomMatch[1].replace(',', '.')) : 0,
};
}
return null;
}
/**
@ -60,6 +98,25 @@ export function parseRating(text: string): RatingFacts | null {
return { rating, reviewCount };
}
/**
 * Extract a human-readable listing title from raw page HTML.
 *
 * Sources are tried in order: the `<title>` element, then the `og:title`
 * meta tag. Each candidate is entity-decoded, has runs of whitespace
 * collapsed to a single space, and has a trailing " - Airbnb…" / " | Airbnb…"
 * suffix removed. A candidate that is empty after cleaning is skipped.
 *
 * @param html - Raw HTML of the page.
 * @returns The cleaned title, or `null` when no source yields non-empty text.
 */
export function parseTitle(html: string): string | null {
  // A Map (not an object literal) so that names like "constructor" cannot
  // resolve to inherited prototype members.
  const namedEntities = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', '\u00a0'],
  ]);

  // Single-pass decode so "&amp;lt;" becomes "&lt;" and is not decoded twice.
  const decodeEntities = (text: string): string =>
    text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole: string, body: string): string => {
      if (body.startsWith('#')) {
        const isHex = body.charAt(1).toLowerCase() === 'x';
        const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
        // String.fromCodePoint throws on out-of-range values; leave those untouched.
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      }
      return namedEntities.get(body.toLowerCase()) ?? whole;
    });

  // Whitespace is collapsed before the suffix strip: `.` does not cross line
  // breaks, so a title spread over several lines would otherwise keep its suffix.
  const clean = (raw: string): string =>
    decodeEntities(raw)
      .replace(/\s+/g, ' ')
      .replace(/\s*[-|]\s*Airbnb.*$/i, '')
      .trim();

  // Try <title> tag first (attributes on the tag are tolerated).
  const titleMatch = /<title\b[^>]*>([^<]*)<\/title>/i.exec(html);
  if (titleMatch) {
    const title = clean(titleMatch[1] ?? '');
    if (title) return title;
  }

  // Try og:title. Attribute order is not fixed, and the value is read up to
  // its own closing quote so an apostrophe inside "…" does not truncate it.
  for (const tag of html.match(/<meta\b[^>]*>/gi) ?? []) {
    if (!/\s(?:property|name)\s*=\s*(["'])og:title\1/i.test(tag)) continue;
    const content = /\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i.exec(tag);
    const title = clean(content?.[1] ?? content?.[2] ?? '');
    if (title) return title;
  }

  return null;
}
/**
* Parse host name from text like "Hosted by David"
*/