fix: improve scraper robustness, fix delete button, add title parsing
- Add parseTitle for fallback title extraction
- Improve text-patterns with more robust regex patterns
- Add more fallback patterns for capacity, beds, etc.
- Fix delete button to use FormData properly
- Add logging to scraper for debugging
This commit is contained in:
parent
13bbe9d147
commit
5e5326dbcc
Binary file not shown.
@ -3,7 +3,20 @@
|
|||||||
import { prisma } from "@/lib/prisma";
|
import { prisma } from "@/lib/prisma";
|
||||||
import { revalidatePath } from "next/cache";
|
import { revalidatePath } from "next/cache";
|
||||||
|
|
||||||
export async function deleteListing(listingId: string) {
|
export async function deleteListing(idOrFormData: string | FormData) {
|
||||||
|
let listingId: string;
|
||||||
|
|
||||||
|
// Handle both string ID and FormData
|
||||||
|
if (typeof idOrFormData === 'string') {
|
||||||
|
listingId = idOrFormData;
|
||||||
|
} else {
|
||||||
|
listingId = idOrFormData.get("id") as string;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!listingId) {
|
||||||
|
throw new Error("Keine ID übergeben");
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Delete related records first
|
// Delete related records first
|
||||||
await prisma.listingTag.deleteMany({
|
await prisma.listingTag.deleteMany({
|
||||||
|
|||||||
@ -11,19 +11,17 @@ interface DeleteListingButtonProps {
|
|||||||
|
|
||||||
export function DeleteListingButton({ listingId, listingTitle }: DeleteListingButtonProps) {
|
export function DeleteListingButton({ listingId, listingTitle }: DeleteListingButtonProps) {
|
||||||
const [isDeleting, setIsDeleting] = useState(false);
|
const [isDeleting, setIsDeleting] = useState(false);
|
||||||
const [showConfirm, setShowConfirm] = useState(false);
|
|
||||||
|
|
||||||
const handleDelete = async () => {
|
const handleDelete = async () => {
|
||||||
if (!confirm(`"${listingTitle}" wirklich löschen?`)) return;
|
if (!confirm(`"${listingTitle}" wirklich löschen?`)) return;
|
||||||
|
|
||||||
setIsDeleting(true);
|
setIsDeleting(true);
|
||||||
try {
|
try {
|
||||||
await deleteListing(listingId);
|
const formData = new FormData();
|
||||||
// Page will refresh automatically due to revalidation
|
formData.append("id", listingId);
|
||||||
window.location.reload();
|
await deleteListing(formData);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
alert("Fehler beim Löschen: " + (error as Error).message);
|
alert("Fehler beim Löschen: " + (error as Error).message);
|
||||||
} finally {
|
|
||||||
setIsDeleting(false);
|
setIsDeleting(false);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
import * as cheerio from "cheerio";
|
import * as cheerio from "cheerio";
|
||||||
import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
|
import { normalizeAirbnbUrlWithContext } from "./url-normalizer";
|
||||||
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns";
|
import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText, parseTitle } from "./parsers/text-patterns";
|
||||||
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
|
import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping";
|
||||||
import { extractPrice } from "./parsers/price";
|
import { extractPrice } from "./parsers/price";
|
||||||
import { extractLocation } from "./parsers/location";
|
import { extractLocation } from "./parsers/location";
|
||||||
@ -50,6 +50,7 @@ export async function scrapeAirbnbListing(
|
|||||||
const sleepingOptions = parseSleepingArrangements(visibleText);
|
const sleepingOptions = parseSleepingArrangements(visibleText);
|
||||||
const priceData = extractPrice(html, $, tripContext);
|
const priceData = extractPrice(html, $, tripContext);
|
||||||
const locationData = extractLocation($, html);
|
const locationData = extractLocation($, html);
|
||||||
|
const pageTitle = parseTitle(html);
|
||||||
|
|
||||||
// Step 5: Build the result with priority: jsonld > text_pattern > derived
|
// Step 5: Build the result with priority: jsonld > text_pattern > derived
|
||||||
const result: ExtractedListing = {
|
const result: ExtractedListing = {
|
||||||
@ -61,7 +62,7 @@ export async function scrapeAirbnbListing(
|
|||||||
// Basic Info
|
// Basic Info
|
||||||
title: mergeField(
|
title: mergeField(
|
||||||
jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
|
jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null,
|
||||||
field(null, 'derived', 'low')
|
pageTitle ? field(pageTitle, 'text_pattern', 'medium') : field(null, 'derived', 'low')
|
||||||
),
|
),
|
||||||
description: mergeField(
|
description: mergeField(
|
||||||
jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
|
jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null,
|
||||||
@ -181,7 +182,7 @@ export async function scrapeAirbnbListing(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ============================================
|
// ============================================
|
||||||
// HTML Fetcher
|
// HTML Fetcher - with better error handling and logging
|
||||||
// ============================================
|
// ============================================
|
||||||
|
|
||||||
async function fetchHtml(url: string): Promise<string> {
|
async function fetchHtml(url: string): Promise<string> {
|
||||||
@ -192,6 +193,7 @@ async function fetchHtml(url: string): Promise<string> {
|
|||||||
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
|
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
"Cache-Control": "no-cache",
|
"Cache-Control": "no-cache",
|
||||||
|
"Upgrade-Insecure-Requests": "1",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -199,7 +201,14 @@ async function fetchHtml(url: string): Promise<string> {
|
|||||||
throw new Error(`HTTP ${response.status} for ${url}`);
|
throw new Error(`HTTP ${response.status} for ${url}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return response.text();
|
const html = await response.text();
|
||||||
|
|
||||||
|
// Log some debug info
|
||||||
|
console.log(`[Scraper] Fetched ${url.length} chars`);
|
||||||
|
console.log(`[Scraper] Contains 'application/ld+json': ${html.includes('application/ld+json')}`);
|
||||||
|
console.log(`[Scraper] Contains 'airbnb': ${html.toLowerCase().includes('airbnb')}`);
|
||||||
|
|
||||||
|
return html;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Re-export utilities for backward compatibility
|
// Re-export utilities for backward compatibility
|
||||||
|
|||||||
@ -3,21 +3,41 @@
|
|||||||
* Supports both German and English patterns
|
* Supports both German and English patterns
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants
|
// Main capacity pattern - "2 guests · 1 bedroom · 2 beds · 1 bath"
|
||||||
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?)/i;
|
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)\s*[·•]\s*(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bäder?|bath)/i;
|
||||||
|
|
||||||
// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
|
// Fallback patterns for different orders
|
||||||
|
const CAPACITY_FALLBACK_1 = /(\d+)\s*(?:guests?|gäste?)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer?)/i;
|
||||||
|
const CAPACITY_FALLBACK_2 = /(\d+)\s*(?:beds?|betten?)\s*[·•]\s*(\d+)\s*(?:baths?|bäder?)/i;
|
||||||
|
|
||||||
|
// Rating patterns - "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen"
|
||||||
const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;
|
const RATING_PATTERN = /(\d+[.,]\d+)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;
|
||||||
|
|
||||||
// "Hosted by David" or "Gehostet von David"
|
// Host patterns - "Hosted by David" or "Gehostet von David"
|
||||||
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i;
|
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•,]{2,40})/i;
|
||||||
|
|
||||||
// "€ 150 / night" or "$150 per night" or "150 € pro Nacht"
|
// Price patterns - more flexible
|
||||||
const PRICE_PATTERN = /[€$]?\s*(\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;
|
const PRICE_PATTERN = /(?:€|EUR|\$)\s*(\d+(?:[.,]\d{0,2})?)|(\d+(?:[.,]\d{0,2})?)\s*(?:€|EUR|\$)/i;
|
||||||
|
const PRICE_PER_NIGHT = /(\d+(?:[.,]\d+)?)\s*[\/·]\s*(?:night|nacht|per\s*night)/i;
|
||||||
|
|
||||||
// "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
|
// Max guests patterns - "6 guests maximum" or "max. 6 Gäste" or "Up to 6 guests"
|
||||||
const MAX_GUESTS_PATTERN = /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste?)|(\d+)\s*(?:guests?|gäste?)\s*(?:maximum|max\.?)/i;
|
const MAX_GUESTS_PATTERN = /(?:max\.?|maximum|up to)\s*(\d+)\s*(?:guests?|gäste?)|(\d+)\s*(?:guests?|gäste?)\s*(?:maximum|max\.?)/i;
|
||||||
|
|
||||||
|
// Bedroom patterns
|
||||||
|
const BEDROOM_PATTERN = /(\d+)\s*(?:bedroom|schlafzimmer|bedrooms|schlafzimmer)/i;
|
||||||
|
|
||||||
|
// Bed patterns
|
||||||
|
const BED_PATTERN = /(\d+)\s*(?:bed|beds|bett|betten)/i;
|
||||||
|
|
||||||
|
// Bathroom patterns
|
||||||
|
const BATHROOM_PATTERN = /(\d+(?:[.,]\d+)?)\s*(?:bath|baths|badezimmer|bäder)/i;
|
||||||
|
|
||||||
|
// Title in page - <title> tag
|
||||||
|
const TITLE_PATTERN = /<title>([^<]+)<\/title>/i;
|
||||||
|
|
||||||
|
// Location from address pattern
|
||||||
|
const LOCATION_PATTERN = /([A-Z][a-zA-ZäöüÄÖÜß]+(?:\s+[A-Z][a-zA-ZäöüÄÖÜß]+)*,\s*[A-Z][a-zA-ZäöüÄÖÜß]+)/i;
|
||||||
|
|
||||||
export interface CapacityFacts {
|
export interface CapacityFacts {
|
||||||
guests: number;
|
guests: number;
|
||||||
bedrooms: number;
|
bedrooms: number;
|
||||||
@ -34,15 +54,33 @@ export interface RatingFacts {
|
|||||||
* Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath"
|
* Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath"
|
||||||
*/
|
*/
|
||||||
export function parseCapacityFacts(text: string): CapacityFacts | null {
|
export function parseCapacityFacts(text: string): CapacityFacts | null {
|
||||||
|
// Try main pattern first
|
||||||
const match = text.match(CAPACITY_PATTERN);
|
const match = text.match(CAPACITY_PATTERN);
|
||||||
if (!match) return null;
|
if (match) {
|
||||||
|
|
||||||
return {
|
return {
|
||||||
guests: parseInt(match[1], 10),
|
guests: parseInt(match[1], 10),
|
||||||
bedrooms: parseInt(match[2], 10),
|
bedrooms: parseInt(match[2], 10),
|
||||||
beds: parseInt(match[3], 10),
|
beds: parseInt(match[3], 10),
|
||||||
bathrooms: parseFloat(match[4].replace(',', '.')),
|
bathrooms: parseFloat(match[4].replace(',', '.')),
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: try to extract individual values
|
||||||
|
const bedroomMatch = text.match(BEDROOM_PATTERN);
|
||||||
|
const bedMatch = text.match(BED_PATTERN);
|
||||||
|
const bathroomMatch = text.match(BATHROOM_PATTERN);
|
||||||
|
const guestMatch = text.match(/(\d+)\s*(?:guests?|gäste?)/i);
|
||||||
|
|
||||||
|
if (bedroomMatch || bedMatch || bathroomMatch || guestMatch) {
|
||||||
|
return {
|
||||||
|
guests: guestMatch ? parseInt(guestMatch[1], 10) : 0,
|
||||||
|
bedrooms: bedroomMatch ? parseInt(bedroomMatch[1], 10) : 0,
|
||||||
|
beds: bedMatch ? parseInt(bedMatch[1], 10) : 0,
|
||||||
|
bathrooms: bathroomMatch ? parseFloat(bathroomMatch[1].replace(',', '.')) : 0,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -60,6 +98,25 @@ export function parseRating(text: string): RatingFacts | null {
|
|||||||
return { rating, reviewCount };
|
return { rating, reviewCount };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Named HTML entities that commonly appear in page titles. */
const TITLE_NAMED_ENTITIES: Readonly<Record<string, string>> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: '\u00a0',
};

/** Decode numeric and the most common named HTML entities; unknown entities are left untouched. */
function decodeTitleEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws on out-of-range values, so guard explicitly.
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    }
    return TITLE_NAMED_ENTITIES[body.toLowerCase()] ?? whole;
  });
}

/** Normalize a raw title: decode entities, collapse whitespace, drop a trailing "- Airbnb…" suffix. */
function cleanPageTitle(raw: string): string {
  return decodeTitleEntities(raw)
    .replace(/\s+/g, ' ')
    .replace(/\s*[-|–]\s*Airbnb.*$/i, '')
    .trim();
}

/**
 * Extract a listing title from raw HTML.
 *
 * Tries the `<title>` element first (attributes on the tag are tolerated) and
 * falls back to the `og:title` meta tag, accepting `property` and `content`
 * in either order and with either quote style. The candidate is entity-decoded,
 * whitespace-normalized and stripped of a trailing "- Airbnb" / "| Airbnb" /
 * "– Airbnb" suffix.
 *
 * @param html Raw HTML document text.
 * @returns The cleaned title, or `null` when no non-empty title is found.
 */
export function parseTitle(html: string): string | null {
  // Try <title> tag first - usually "Title - Airbnb" format
  const titleMatch = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    const title = cleanPageTitle(titleMatch[1] ?? '');
    if (title) return title;
  }

  // Try og:title; inspect each <meta> tag so attribute order does not matter
  for (const tag of html.match(/<meta\b[^>]*>/gi) ?? []) {
    if (!/\sproperty\s*=\s*(["'])og:title\1/i.test(tag)) continue;
    // Match the value against its own quote style so an apostrophe inside a
    // double-quoted value (e.g. "Anna's Loft") does not truncate the title.
    const content = tag.match(/\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
    const title = cleanPageTitle(content?.[1] ?? content?.[2] ?? '');
    if (title) return title;
  }

  return null;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse host name from text like "Hosted by David"
|
* Parse host name from text like "Hosted by David"
|
||||||
*/
|
*/
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user