fix: improve scraper robustness, fix delete button, add title parsing

- Add parseTitle for fallback title extraction
- Improve text-patterns with more robust regex patterns
- Add more fallback patterns for capacity, beds, etc.
- Fix delete button to use FormData properly
- Add logging to scraper for debugging
2026-03-11 16:15:20 +00:00 · 2026-03-11 16:15:20 +00:00 · 5e5326dbcc
commit 5e5326dbcc
parent 13bbe9d147
5 changed files with 102 additions and 25 deletions
--- a/prisma/prisma/dev.db
+++ b/prisma/prisma/dev.db
--- a/src/app/(protected)/listings/actions.ts
+++ b/src/app/(protected)/listings/actions.ts
@ -3,7 +3,20 @@
 import { prisma } from "@/lib/prisma";
 import { revalidatePath } from "next/cache";

-export async function deleteListing(listingId: string) {
+export async function deleteListing(idOrFormData: string | FormData) {
+  let listingId: string;
+  
+  // Handle both string ID and FormData
+  if (typeof idOrFormData === 'string') {
+    listingId = idOrFormData;
+  } else {
+    listingId = idOrFormData.get("id") as string;
+  }
+  
+  if (!listingId) {
+    throw new Error("Keine ID übergeben");
+  }
+  
  try {
    // Delete related records first
    await prisma.listingTag.deleteMany({
--- a/src/app/(protected)/listings/delete-button.tsx
+++ b/src/app/(protected)/listings/delete-button.tsx
@ -11,19 +11,17 @@ interface DeleteListingButtonProps {

 export function DeleteListingButton({ listingId, listingTitle }: DeleteListingButtonProps) {
  const [isDeleting, setIsDeleting] = useState(false);
-  const [showConfirm, setShowConfirm] = useState(false);

  const handleDelete = async () => {
    if (!confirm(`"${listingTitle}" wirklich löschen?`)) return;
    
    setIsDeleting(true);
    try {
-      await deleteListing(listingId);
-      // Page will refresh automatically due to revalidation
-      window.location.reload();
+      const formData = new FormData(); // the id is passed to the action under the "id" form field
+      formData.append("id", listingId);
+      await deleteListing(formData); // NOTE(review): no reload and no finally anymore - isDeleting is reset only on error and stays true after success; presumably the action's revalidation removes this row - confirm
    } catch (error) {
      alert("Fehler beim Löschen: " + (error as Error).message);
-    } finally {
      setIsDeleting(false);
    }
  };
--- a/src/lib/airbnb/index.ts
+++ b/src/lib/airbnb/index.ts
@ -1,6 +1,6 @@
 import * as cheerio from "cheerio";
 import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
-import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns";
+import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText, parseTitle } from "./parsers/text-patterns";
 import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
 import { extractPrice } from "./parsers/price";
 import { extractLocation } from "./parsers/location";
@ -50,6 +50,7 @@ export async function scrapeAirbnbListing(
    const sleepingOptions = parseSleepingArrangements(visibleText);
    const priceData = extractPrice(html, $, tripContext);
    const locationData = extractLocation($, html);
+    const pageTitle = parseTitle(html);

    // Step 5: Build the result with priority: jsonld > text_pattern > derived
    const result: ExtractedListing = {
@ -61,7 +62,7 @@ export async function scrapeAirbnbListing(
      // Basic Info
      title: mergeField(
        jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
-        field(null, 'derived', 'low')
+        pageTitle ? field(pageTitle, 'text_pattern', 'medium') : field(null, 'derived', 'low')
      ),
      description: mergeField(
        jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
@ -181,7 +182,7 @@ export async function scrapeAirbnbListing(
 }

 // ============================================
-// HTML Fetcher
+// HTML Fetcher - with better error handling and logging
 // ============================================

 async function fetchHtml(url: string): Promise<string> {
@ -192,6 +193,7 @@ async function fetchHtml(url: string): Promise<string> {
      "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
      "Accept-Encoding": "gzip, deflate, br",
      "Cache-Control": "no-cache",
+      "Upgrade-Insecure-Requests": "1",
    },
  });

@ -199,7 +201,14 @@ async function fetchHtml(url: string): Promise<string> {
    throw new Error(`HTTP ${response.status} for ${url}`);
  }

-  return response.text();
+  const html = await response.text();
+  
+  // Log some debug info: a length figure plus whether the body contains the JSON-LD type string and the word airbnb
+  console.log(`[Scraper] Fetched ${url.length} chars`); // NOTE(review): this prints the length of the URL, not of the fetched HTML - html.length was probably intended
+  console.log(`[Scraper] Contains 'application/ld+json': ${html.includes('application/ld+json')}`);
+  console.log(`[Scraper] Contains 'airbnb': ${html.toLowerCase().includes('airbnb')}`);
+  
+  return html;
 }

 // Re-export utilities for backward compatibility
--- a/src/lib/airbnb/parsers/text-patterns.ts
+++ b/src/lib/airbnb/parsers/text-patterns.ts
@ -3,21 +3,41 @@
 * Supports both German and English patterns
 */

-// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants
-const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?)/i;
+// Main capacity pattern - "2 guests · 1 bedroom · 2 beds · 1 bath" (English or German unit words, "·" or "•" as separator)
+const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?|bath)/i;

-// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
+// Partial capacity patterns: "guests · bedrooms" only and "beds · baths" only - NOTE(review): neither is referenced in the visible code
+const CAPACITY_FALLBACK_1 = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)/i;
+const CAPACITY_FALLBACK_2 = /(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+)\s*(?:baths?|bäder?)/i;
+
+// Rating patterns - "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
 const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;

-// "Hosted by David" or "Gehostet von David"
-const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i;
+// Host patterns - "Hosted by David" or "Gehostet von David"; the name capture is 2-40 chars and stops at a newline, "·", "•" or comma
+const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•,]{2,40})/i;

-// "€ 150 / night" or "$150 per night" or "150 € pro Nacht"
-const PRICE_PATTERN = /[€$]?\s*(\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;
+// Price patterns - PRICE_PATTERN: amount with €, EUR or $ before or after it (no per-night suffix required); PRICE_PER_NIGHT: number, then "/" or "·", then night/nacht
+const PRICE_PATTERN = /(?:€|EUR|\$)\s*(\d+(?:[.,]\d{0,2})?)|(\d+(?:[.,]\d{0,2})?)\s*(?:€|EUR|\$)/i;
+const PRICE_PER_NIGHT = /(\d+(?:[.,]\d+)?)\s*[\/·]\s*(?:night|nacht|per\s*night)/i;

-// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
+// Max guests patterns - "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
 const MAX_GUESTS_PATTERN = /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste?)|(\d+)\s*(?:guests?|gäste?)\s*(?:maximum|max\.?)/i;

+// Bedroom count - "2 bedrooms" or "2 Schlafzimmer" (the alternation lists schlafzimmer twice, which is redundant but harmless)
+const BEDROOM_PATTERN = /(\d+)\s*(?:bedroom|schlafzimmer|bedrooms|schlafzimmer)/i;
+
+// Bed count - "3 beds" or "3 Betten" - NOTE(review): no word boundary, so the "bed" in "2 bedrooms" matches here as well
+const BED_PATTERN = /(\d+)\s*(?:bed|beds|bett|betten)/i;
+
+// Bathroom count - "1 bath" or "1,5 Bäder" (decimal comma or point accepted)
+const BATHROOM_PATTERN = /(\d+(?:[.,]\d+)?)\s*(?:bath|baths|badezimmer|bäder)/i;
+
+// Title in page - bare <title> tag without attributes - NOTE(review): parseTitle in the visible code inlines the same regex instead of using this constant
+const TITLE_PATTERN = /<title>([^<]+)<\/title>/i;
+
+// Location shaped like "City, Country" - NOTE(review): the /i flag makes [A-Z] match lowercase too, so capitalization is not actually enforced
+const LOCATION_PATTERN = /([A-Z][a-zA-ZäöüÄÖÜß]+(?:\s+[A-Z][a-zA-ZäöüÄÖÜß]+)*,\s*[A-Z][a-zA-ZäöüÄÖÜß]+)/i;
+
 export interface CapacityFacts {
  guests: number;
  bedrooms: number;
@ -34,15 +54,33 @@ export interface RatingFacts {
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath"
 */
 export function parseCapacityFacts(text: string): CapacityFacts | null {
+  // Try the full "guests · bedrooms · beds · baths" pattern first; all four values come from one match
  const match = text.match(CAPACITY_PATTERN);
-  if (!match) return null;
+  if (match) {
+    return {
+      guests: parseInt(match[1], 10),
+      bedrooms: parseInt(match[2], 10),
+      beds: parseInt(match[3], 10),
+      bathrooms: parseFloat(match[4].replace(',', '.')),
+    };
+  }
  
-  return {
-    guests: parseInt(match[1], 10),
-    bedrooms: parseInt(match[2], 10),
-    beds: parseInt(match[3], 10),
-    bathrooms: parseFloat(match[4].replace(',', '.')),
-  };
+  // Fallback: match each value independently; a value that is not found is reported as 0 - NOTE(review): callers cannot tell that apart from a real 0
+  const bedroomMatch = text.match(BEDROOM_PATTERN);
+  const bedMatch = text.match(BED_PATTERN); // NOTE(review): BED_PATTERN has no word boundary, so it can also hit the "bed" in "bedroom" - verify
+  const bathroomMatch = text.match(BATHROOM_PATTERN);
+  const guestMatch = text.match(/(\d+)\s*(?:guests?|gäste?)/i);
+  
+  if (bedroomMatch || bedMatch || bathroomMatch || guestMatch) {
+    return {
+      guests: guestMatch ? parseInt(guestMatch[1], 10) : 0,
+      bedrooms: bedroomMatch ? parseInt(bedroomMatch[1], 10) : 0,
+      beds: bedMatch ? parseInt(bedMatch[1], 10) : 0,
+      bathrooms: bathroomMatch ? parseFloat(bathroomMatch[1].replace(',', '.')) : 0,
+    };
+  }
+  
+  return null;
 }

 /**
@ -60,6 +98,25 @@ export function parseRating(text: string): RatingFacts | null {
  return { rating, reviewCount };
 }

+/**
+ * Extract the listing title from raw HTML: cleaned <title> text first, then the og:title meta tag; returns null when neither yields a title
+ */
+export function parseTitle(html: string): string | null {
+  // Try the <title> tag first; the pattern only matches a bare <title> without attributes
+  const titleMatch = html.match(/<title>([^<]+)<\/title>/i);
+  if (titleMatch) {
+    // Strip a trailing "- Airbnb...", "| Airbnb..." or "– Airbnb..." suffix; an empty remainder falls through to og:title
+    let title = titleMatch[1].replace(/\s*[-|–]\s*Airbnb.*$/i, '').trim();
+    if (title) return title;
+  }
+  
+  // Fall back to og:title (matched only when the property attribute precedes content); the value is returned as-is, without trimming or suffix removal
+  const ogTitle = html.match(/<meta[^>]*property=["']og:title["'][^>]*content=["']([^"']+)["']/i);
+  if (ogTitle) return ogTitle[1];
+  
+  return null;
+}
+
 /**
 * Parse host name from text like "Hosted by David"
 */