fix: improve scraper robustness, fix delete button, add title parsing
- Add parseTitle for fallback title extraction
- Improve text-patterns with more robust regex patterns
- Add more fallback patterns for capacity, beds, etc.
- Fix delete button to use FormData properly
- Add logging to scraper for debugging
This commit is contained in:
parent
13bbe9d147
commit
5e5326dbcc
Binary file not shown.
@ -3,7 +3,20 @@
|
||||
import { prisma } from "@/lib/prisma";
|
||||
import { revalidatePath } from "next/cache";
|
||||
|
||||
export async function deleteListing(listingId: string) {
|
||||
export async function deleteListing(idOrFormData: string | FormData) {
|
||||
let listingId: string;
|
||||
|
||||
// Handle both string ID and FormData
|
||||
if (typeof idOrFormData === 'string') {
|
||||
listingId = idOrFormData;
|
||||
} else {
|
||||
listingId = idOrFormData.get("id") as string;
|
||||
}
|
||||
|
||||
if (!listingId) {
|
||||
throw new Error("Keine ID übergeben");
|
||||
}
|
||||
|
||||
try {
|
||||
// Delete related records first
|
||||
await prisma.listingTag.deleteMany({
|
||||
|
||||
@ -11,19 +11,17 @@ interface DeleteListingButtonProps {
|
||||
|
||||
export function DeleteListingButton({ listingId, listingTitle }: DeleteListingButtonProps) {
|
||||
const [isDeleting, setIsDeleting] = useState(false);
|
||||
const [showConfirm, setShowConfirm] = useState(false);
|
||||
|
||||
// Asks the user for confirmation, then calls deleteListing exactly once with a
// FormData payload carrying the listing id. The earlier string-id call and the
// manual window.location.reload() are dropped so the listing is not deleted twice;
// the UI refresh is presumably driven by revalidation inside deleteListing —
// TODO confirm against the action's revalidatePath call.
const handleDelete = async () => {
  if (!confirm(`"${listingTitle}" wirklich löschen?`)) return;

  setIsDeleting(true);
  try {
    const formData = new FormData();
    formData.append("id", listingId);
    await deleteListing(formData);
  } catch (error) {
    // A caught value is not guaranteed to be an Error; narrow before reading .message.
    const message = error instanceof Error ? error.message : String(error);
    alert("Fehler beim Löschen: " + message);
  } finally {
    // Always re-enable the button, whether the delete succeeded or failed.
    setIsDeleting(false);
  }
};
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import * as cheerio from "cheerio";
|
||||
import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
|
||||
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns";
|
||||
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText, parseTitle } from "./parsers/text-patterns";
|
||||
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
|
||||
import { extractPrice } from "./parsers/price";
|
||||
import { extractLocation } from "./parsers/location";
|
||||
@ -50,6 +50,7 @@ export async function scrapeAirbnbListing(
|
||||
const sleepingOptions = parseSleepingArrangements(visibleText);
|
||||
const priceData = extractPrice(html, $, tripContext);
|
||||
const locationData = extractLocation($, html);
|
||||
const pageTitle = parseTitle(html);
|
||||
|
||||
// Step 5: Build the result with priority: jsonld > text_pattern > derived
|
||||
const result: ExtractedListing = {
|
||||
@ -61,7 +62,7 @@ export async function scrapeAirbnbListing(
|
||||
// Basic Info
|
||||
title: mergeField(
|
||||
jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
|
||||
field(null, 'derived', 'low')
|
||||
pageTitle ? field(pageTitle, 'text_pattern', 'medium') : field(null, 'derived', 'low')
|
||||
),
|
||||
description: mergeField(
|
||||
jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
|
||||
@ -181,7 +182,7 @@ export async function scrapeAirbnbListing(
|
||||
}
|
||||
|
||||
// ============================================
|
||||
// HTML Fetcher
|
||||
// HTML Fetcher - with better error handling and logging
|
||||
// ============================================
|
||||
|
||||
async function fetchHtml(url: string): Promise<string> {
|
||||
@ -192,6 +193,7 @@ async function fetchHtml(url: string): Promise<string> {
|
||||
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Cache-Control": "no-cache",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
},
|
||||
});
|
||||
|
||||
@ -199,7 +201,14 @@ async function fetchHtml(url: string): Promise<string> {
|
||||
throw new Error(`HTTP ${response.status} for ${url}`);
|
||||
}
|
||||
|
||||
return response.text();
|
||||
const html = await response.text();
|
||||
|
||||
// Log some debug info
|
||||
console.log(`[Scraper] Fetched ${url.length} chars`);
|
||||
console.log(`[Scraper] Contains 'application/ld+json': ${html.includes('application/ld+json')}`);
|
||||
console.log(`[Scraper] Contains 'airbnb': ${html.toLowerCase().includes('airbnb')}`);
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
// Re-export utilities for backward compatibility
|
||||
|
||||
@ -3,21 +3,41 @@
|
||||
* Supports both German and English patterns
|
||||
*/
|
||||
|
||||
// Main capacity pattern - "2 guests · 1 bedroom · 2 beds · 1 bath".
// Capture groups: 1 = guests, 2 = bedrooms, 3 = beds, 4 = bathrooms (may be fractional).
// German singular forms ("1 Gast", "1 Bett", "1 Bad") are accepted alongside the plurals.
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?|gast)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bad|bäder?)/i;

// Fallback patterns for different orders
const CAPACITY_FALLBACK_1 = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)/i;
const CAPACITY_FALLBACK_2 = /(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+)\s*(?:baths?|bäder?)/i;

// Rating patterns - "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;

// Host patterns - "Hosted by David" or "Gehostet von David"
// The name capture stops at a newline, separator dot, or comma and is limited to 2-40 characters.
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•,]{2,40})/i;

// Price patterns - currency before the amount (group 1) or after it (group 2)
const PRICE_PATTERN = /(?:€|EUR|\$)\s*(\d+(?:[.,]\d{0,2})?)|(\d+(?:[.,]\d{0,2})?)\s*(?:€|EUR|\$)/i;
const PRICE_PER_NIGHT = /(\d+(?:[.,]\d+)?)\s*[\/·]\s*(?:night|nacht|per\s*night)/i;

// Max guests patterns - "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
// The count is in group 1 when the qualifier comes first, otherwise in group 2.
const MAX_GUESTS_PATTERN = /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste?)|(\d+)\s*(?:guests?|gäste?)\s*(?:maximum|max\.?)/i;

// Bedroom patterns (no trailing anchor, so the plural "bedrooms" matches via its prefix)
const BEDROOM_PATTERN = /(\d+)\s*(?:bedroom|schlafzimmer)/i;

// Bed patterns - the negative lookahead keeps "3 bedrooms" from being read as 3 beds
const BED_PATTERN = /(\d+)\s*(?:beds?|bett(?:en)?)(?!room)/i;

// Bathroom patterns - includes the German singular "Bad" (word-bounded to avoid longer words)
const BATHROOM_PATTERN = /(\d+(?:[.,]\d+)?)\s*(?:baths?|badezimmer|bad\b|bäder)/i;

// Title in page - <title> tag
const TITLE_PATTERN = /<title>([^<]+)<\/title>/i;

// Location from address pattern
// NOTE(review): the `i` flag makes the [A-Z] "capitalised word" classes match lowercase
// letters too — confirm whether case-insensitive matching is intended here.
const LOCATION_PATTERN = /([A-Z][a-zA-ZäöüÄÖÜß]+(?:\s+[A-Z][a-zA-ZäöüÄÖÜß]+)*,\s*[A-Z][a-zA-ZäöüÄÖÜß]+)/i;
|
||||
|
||||
export interface CapacityFacts {
|
||||
guests: number;
|
||||
bedrooms: number;
|
||||
@ -34,15 +54,33 @@ export interface RatingFacts {
|
||||
* Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath"
|
||||
*/
|
||||
export function parseCapacityFacts(text: string): CapacityFacts | null {
  // Returns the four capacity numbers found in `text`, or null when none are present.
  // Preferred path: the complete "guests · bedrooms · beds · baths" line in a single match.
  const match = text.match(CAPACITY_PATTERN);
  if (match) {
    return {
      guests: parseInt(match[1], 10),
      bedrooms: parseInt(match[2], 10),
      beds: parseInt(match[3], 10),
      // Bathrooms may use a decimal comma ("1,5"); normalise before parsing.
      bathrooms: parseFloat(match[4].replace(',', '.')),
    };
  }

  // Fallback: the combined line was not found, so look for each fact on its own.
  // (No early `return null` above — it would make this fallback unreachable.)
  const guestMatch = text.match(/(\d+)\s*(?:guests?|gäste?)/i);
  const bedroomMatch = text.match(BEDROOM_PATTERN);
  const bedMatch = text.match(BED_PATTERN);
  const bathroomMatch = text.match(BATHROOM_PATTERN);

  if (!guestMatch && !bedroomMatch && !bedMatch && !bathroomMatch) {
    return null;
  }

  // Facts that were not found individually are reported as 0.
  return {
    guests: guestMatch ? parseInt(guestMatch[1], 10) : 0,
    bedrooms: bedroomMatch ? parseInt(bedroomMatch[1], 10) : 0,
    beds: bedMatch ? parseInt(bedMatch[1], 10) : 0,
    bathrooms: bathroomMatch ? parseFloat(bathroomMatch[1].replace(',', '.')) : 0,
  };
}
|
||||
|
||||
/**
|
||||
@ -60,6 +98,25 @@ export function parseRating(text: string): RatingFacts | null {
|
||||
return { rating, reviewCount };
|
||||
}
|
||||
|
||||
/**
 * Extract a listing title from raw HTML.
 *
 * Tries the `<title>` element first (with any attributes on the tag) and strips a
 * trailing "- Airbnb…" / "| Airbnb…" / "– Airbnb…" suffix. If that yields nothing,
 * falls back to the `og:title` meta tag, accepting `property` and `content` in
 * either order and either quote style.
 *
 * @param html - Raw HTML document text.
 * @returns The cleaned, entity-decoded title, or `null` when no non-empty title is found.
 */
export function parseTitle(html: string): string | null {
  // Decode the handful of entities that commonly appear in titles. `&amp;` is
  // handled last so that "&amp;lt;" decodes once to "&lt;" rather than to "<".
  const decodeEntities = (value: string): string =>
    value
      .replace(/&(?:quot|#34);/gi, '"')
      .replace(/&(?:apos|#39|#x27);/gi, "'")
      .replace(/&lt;/gi, '<')
      .replace(/&gt;/gi, '>')
      .replace(/&nbsp;/gi, ' ')
      .replace(/&amp;/gi, '&');

  // Try <title> tag first - usually "Title - Airbnb" format
  const titleMatch = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    const title = decodeEntities(titleMatch[1])
      .replace(/\s*[-|–]\s*Airbnb.*$/i, '')
      .trim();
    if (title) return title;
  }

  // Try og:title. Each <meta> tag is inspected on its own so attribute order does
  // not matter, and the value is read up to its matching quote so an apostrophe
  // inside a double-quoted value does not truncate it.
  const metaTags = html.match(/<meta\b[^>]*>/gi) ?? [];
  for (const tag of metaTags) {
    if (!/\sproperty\s*=\s*(["'])og:title\1/i.test(tag)) continue;
    const content = tag.match(/\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
    const ogTitle = decodeEntities(content?.[1] ?? content?.[2] ?? '').trim();
    if (ogTitle) return ogTitle;
  }

  return null;
}
|
||||
|
||||
/**
|
||||
* Parse host name from text like "Hosted by David"
|
||||
*/
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user