From 13bbe9d147d055bd019b22520dab4575900b71e8 Mon Sep 17 00:00:00 2001 From: AI Date: Wed, 11 Mar 2026 15:55:45 +0000 Subject: [PATCH] fix: add field/mergeField helpers, priceStatus, trip context support - Add field() and mergeField() helper functions to types.ts - Fix location parser to use correct html parameter - Add priceStatus to import action - Import form already has trip context fields (checkIn, checkOut, adults) - Build now passes successfully --- prisma/dev.db | 0 prisma/prisma/dev.db | Bin 126976 -> 118784 bytes prisma/schema.prisma | 1 + src/actions/import-listing.ts | 113 ++++++++-- .../(protected)/admin/import/import-form.tsx | 102 +++++++-- .../admin/listings/[slug]/page.tsx | 1 + src/lib/airbnb/index.ts | 207 ++++++++++++++++++ src/lib/airbnb/parsers/jsonld.ts | 125 +++++++++++ src/lib/airbnb/parsers/location.ts | 118 ++++++++++ src/lib/airbnb/parsers/price.ts | 102 +++++++++ src/lib/airbnb/parsers/sleeping.ts | 143 ++++++++++++ src/lib/airbnb/parsers/text-patterns.ts | 123 +++++++++++ src/lib/airbnb/types.ts | 113 ++++++++++ src/lib/airbnb/url-normalizer.ts | 71 ++++++ 14 files changed, 1180 insertions(+), 39 deletions(-) create mode 100644 prisma/dev.db create mode 100644 src/lib/airbnb/index.ts create mode 100644 src/lib/airbnb/parsers/jsonld.ts create mode 100644 src/lib/airbnb/parsers/location.ts create mode 100644 src/lib/airbnb/parsers/price.ts create mode 100644 src/lib/airbnb/parsers/sleeping.ts create mode 100644 src/lib/airbnb/parsers/text-patterns.ts create mode 100644 src/lib/airbnb/types.ts create mode 100644 src/lib/airbnb/url-normalizer.ts diff --git a/prisma/dev.db b/prisma/dev.db new file mode 100644 index 0000000..e69de29 diff --git a/prisma/prisma/dev.db b/prisma/prisma/dev.db index e5b7a6fd32f3ebb5b545b8de6abb0aebfeae1b7a..ae87d77fb21db8a72730b9e6e588bb1e5e037488 100644 GIT binary patch literal 118784 zcmeI4TWlNId4Ne#A|=@}z40o{UbH)^dWs}7cC0pDT|;9fFGY)D`%vs-18gry9{Qi* 
z3^~Ig<;%(&$NC9K8lKDi=Y0SFpUa$owEotzViBgH8!gdd_{i}{G#Yu0VIq-86@F&m z$NNmf#i;iM{ziS*!!E0lv$sANqlJ^dp=G9%zd52Mem=1o|2*;0*jW50BR?Abc;vO{ zFQT=`FNfXa0n>MmCsNg!=-x?1lgW;%n3ke7O@ZuK#Lz@lP-Ni-+4a(&SzN0vY}A;| zm8G{fYs}Khh1xYH?dA5@N;9h~z1;NawC}Fz%w9B^NG*cs`JiZxQdO1PL>6pA^-Dj# zzw~g$Py>QR(i!_$BK0aX;pJcxM8&9U^?vcm{ly0>g+%wV(-~xEb+YNv^Ail$q{^rtMYuB>mYEkHYmlM{M0ga-Innz^{TR$F>wg#u$vdtL<| zn9eY3wTrd2+R9>Wy?a9tc$!krP%pbsTdsj$E-tJuE?lV5l+7y_sHeIqlbO|{@sw1J zMjVqfZ$fX0AljDheDAiU-#P@&Yh{1V)ZP3!arWk_ub!O5rDxZ}Fp`BsUZoN!LO@ov|o6+q&jCGG)5%+0Ir^N%C;dLfp3$*0w=@Efk;+O>^Sz9{LtH*IY^B~m53+A(#*5_Cf*V9rBlwX9oM>8;Ds%dUoG(bYL0(n%QqTuo6;-44kSiiF9ozSWr1&ZYrc(whZ-P z9lz{aiK=VOaH^&>w=8v6Xd8+YEN|%+Yz)Hbl5H47le|%^J22Na*Sw{Z0rItGK)OM0 zDP&uabX)5!)ZCUeZDLx3v-CzokrZghK4n_sPH(;1Rz-;}(p)cdC78D2*o6RlIU(Qm zDmNGK8KMAtJj)=aNlZZ_M23;rEgZa+w|F%L({8my!{y-K3UUoyZ-p8F>ESfd+Hwo~ z)FcyAG8AXg9?!ZKne#eBHPT*W)MdT))>G8&vB((i7OO}Tb9)}&&|(oZ*QON zES1j6WZs$i<~b0PFP18WLatD#LRc1SBx@~n6bA?%F zO<-1Yv!ZOP7QHa5#nLQ13%Y3&!Q2uJ0?!kw4Y^LlT!Ux%I?1#7T$X3cWs+s3dOcez zR0?udE_-c!fc6b&@`8tk`RU&?>kb{~$^k^LIOcU=?U-oRvHYE{woR##}ywaL)E0Sfy!ezc~=yZ|5 zifv{7kKqEAVC;a#&cnRlRN$fSd|fUzN;v?WtMEBCU*HOCxgnO=a+a?Y%7n}1Idb+} zZPGliw1n*{6rJ5xWNXX0av!BtOW9fae5txk>g{i*JImE`=|?1zK9lyZf6k>Jh!522 zYy;Bg-cJ91kV=2^JJBEf%Wqf5I=|n)IWl*C?}d9WT%OrG6>?8{*Z}Uicu(xOr>@-) z+u)r!j=yQ`@P=4c?Cp?y=1Vy?$C}cXDmGlV^;~PzDcNU7?Qgj<0^TJLaZX`KNUAhpi!q~yhHejg^`wdt;x`N@BSuor} zJ)314g(6||m3*D8=Zkr^P!^?nB?|*WUh+l{Zs6GBVF--}E&|T|JH*aE{fl?=XG%Hh z@HuAPrvjS7JA(ny1`G%5mag6=T7#%2Q{QElG&r8w(p%(J2#p29;2ezmKYmOdpa1LJJK*#1`~8=rj?br- zo*Urv>yFRwiT9>mpKnQp9XcgbpKtB*&6Zpq;PXY8ErULv+sEg*N-@usDxR?q@_9NV z7jr$2e`N3G3Y8*w_f~FK+p)GUEiY$pvu{339F~s;I(K1e1LF-5)%l<8yo|Th8VR^RS+&^L#l64;wp^JUH^fP2Y|mKWs-se*6I4 zerT6I7yum){qg!Vcqsgy1Rsng|0c^|&h^OnL^nRMOk*Kvd?L4dt9{EZ=7(+pyC?fkbnKiH z2sX9%rz5ndi(XG**DBqHoLvY*+`yfIV_gAAj(R6Z7)Jy;q{py*;tp zy_QrgP({-2Rjh(^^Tuv{=zjb`x_2;v0M7aUOZ5DIB>6Ku|35SX4R#a>AOR$R1dsp{ zKmter2_OL^fCQdk0^SKfeQP!JRLFn+AAXYJKmQLuh@j{H@K68ng9MNO5<~+|3@=G 
z0!RP}AOR$R1dsp{Kmter2_OL^@bnYF^Z%z`$Jj$8fCP{L5a$zfCP{L5iJTR z&9SDmrHT!=8t>eSXoGH+%jP&fpB=dbH=78xC0{H-rNGT{``nDH6!ToE;_UyYz8;zS z`qY0O`}}aL7<-QdkN^@u0!RP}AOR$R1dsp{7)D?(GkRw;x@B2y^W6OW_V)H!Q8DUT z{j5yp4P9@U^BkAW!oI9hDC7!-O1{E#{CR1MNH+vUtHO4!$Yx>FR+;Uj>9*BP%oS#v zHZd);nwu46TXinXYOyqHYsyWV2T^OP5L2AWd5p4WW}ZYwETk zL5o^#gP5kGYqW7~UZ5Lcu9VI3CE*HxSuL@-YmL0!_(s#(orR*e6x}w3w%CL&5y_(U zG_gghK#rSj5SbXFMPye9y=6AfkwQJ2WgCSeVe^%Iovr7KdA3j%rFta`IP#LmjvGjJ zwhE1r!Ur_6D0T z%MG^NXjIsuT;{n#BVQ+SqrcxlEp;Dr^xw4eh@-Rr|5YUU ztK?_N|2~kO;7yPK540CdoC~$ zdv3zN|NktK{J-RXB|m%Ijl=sR0VIF~kN^@u0!RP}AOR$R1dsp{Kmw1IKq7V`x>CCu zbN`OSV<#ry`_z#iN8+*XM`GVk{^P`-AJGy&pV*9lp7>~NEdG;`AB}!I@>=v4(OTq} zk>(d_UHzFvil2$@MHNjZJE~$@iqF@3kyuV@_s$cr?{iTO1h8hqolFr!25~)|A2`>km(24r!7oXf; ze6UhTbT2!dk){%F@8c{RMi(L!xL#I&<&n5m0S4`u;1HSXYUwLcvrC zX@eaJdYisCWDZL|>I&ajczwCnHJwoPG;?~AflpeI)67Qg+6J?-3O}36%gmLvrOOLz z*O^PT>t{M;JXxW#Za%z?BUrm_(sw5}_ogjsmSXLu!Ok~oZ`9U$qAt`fE^ID;a5kDq z&CEoVbnc*iW~0KK%~N?=!uDW&Hl(6mahiy;g@nQFTdl-v6?o za)+B+NriOyPb*Ll28U%YQP-N`R847aS?aFPHWVpX-qI}?1j6Z(Z5Tw8Ko^eAJN!48 zx!UI1oGS`mK?5@jNT)BZk!?ZJZ4KJ$YSAt3ytp?wOK&t3Nr86kQ>G>E^lY=OiV^|E z1|^ub;@E}I&1~5a-BCB`x^l1p+?n3Ys)R{Qqt3r8idS zh|Zk$mKm@_NM~S-xmW|eEY{Y$a|In)>9mlcYmW=H-NB+;rK{SkTNymon<4h`Sj(_Ihy6m9 z-nNB{{S0VIF~ zkN^@u0!RP}AOR%s#1I&bjYnhj*GMcjJ~0LDdUkX^lKhh-J9TTSdgScnzfSHFqR+vcPNM-B!C1CD}lYU&nHrESEGAx`Qtr$$JK5mhW}Wl z7u*vnHshXR1%rJ;#rp0R2oebuhJc{%xd|NDtvwe{%}qzQo#t91jX82dQ|Ol$-3A1M zfC9xj(r6M5HFCmH*1pz%Px>EgX}Ak5_QLRcdc^|wq=^th^inF3x_UCYx9YUicP!}d zeeV>E#!UGNJs%|L%Nbt1(+XeFAUZf_?umm2{?S(xsc&YYdv6Cc?;MPqej&YcKwn&! 
zdu;h!u$f&&Imed6ZxYmaM`Wn6J-O*jeR^QHStpLe*DUjE0~=%ty^;ah)CSyOkj(b8 ziPQo_KF#=fK&*v70L3pVHAGaXLP%nuL^`v5VxZta_>5ohSBD4=RR{?Vlt^c8ex(y& zHy!M;FZ#kS_Sqp~`-+Ez`tm`M7&nM9M4GNR5vy?QjBlB)MP6${NE z-BYqq&eY)xMBHFFH)O+X>lXZ@b_j-o|3Kf1c?-vQcoiVX5h7puBZ0c1AafTXveJvP zQpGxP<4jy}PVACDs%lUWM}GvUe+u~$;(SVz@zkYL_p>i&IpLkW3iR8NyZMPk>eQ*| zd%=KKn&-`l^m@`ALi++&X;wGJ$16W95ETl;s2mF6aRP*VVXeU+D46@*dYO(o2EnJz z_)>x2rzxEl4~*~fIunTPqV0dUxURYJcaP;@Y-drM)nx2l{fv*s-Ow;Jk zcu=1-sjJ73js^rBJ4?l2%%bO+9ttX3ovDo0|CNp^ce^|G2*l#3& z1dsp{Kmter2_OL^fCP{L5_q}^jK(7L{69j^|05H)|9`r*jlDwxNB{{S0VIF~kN^@u z0!RP}Ac4b50QdigRrAV;k>sbzPm(`8 zOdPPUNB{{S0VIF~kN^@u0!RP}AOR$R1fDDcFO8lbk4mkUYM8vtW#O|`*ewyWP*Cb} zF4oP~wHsm^a&sJi)7s$;v8>qJBQsEYg4WhsQelVIrrE7szS)w?+cf?^a{S5aFSZ5= zAOR$R1dsp{Kmter2_OL^fCP{L5;#}_xc@&`o_IGTfCP{L5(*Vtfw7A}v zG--jRK>KF7yV})~wsm)Q;>%lDCb{!w-kaZhGdr_0ORl}JD4T?DYI;XB`C{aW2*X6q z^L!){c|ZIehd=i?3U7wo5Ae$P-Vb{F{>bSYpNvsq?w3^N2=~j8uP5|mE%uGX*`e2B zFGXJ;`f}vc(Jw;U#}ADpQs>SwpLos`>k3h1!<5yQU?_xi=^L$U%9?5nI)8X}wK`L) z^0k=@i&Z{7NG#2t9OmIj%Z)T&tA3=$FR#Gg`r;ygd39lFX7wt6se1KPPsS}IB|_R8863PV)=Pwd%#{>aMD}>io?5VvWx`qDEIU4WUbP!BS=O0Wv_@ zid}TD3U0aM^H8C2w-#cZC_NLMGpc41!+&OWWqGZ(Ismc>!>?B7tE<)J+3K2CASgUZsb~0=Wqz)@SOsUz&aBPO%vI@`^~-bAea@NT z%*x?-N;=0xWVJ!IjO$QaB8Zl$*`GaI_C|-0cdgu;!rhuUm`I&C!Q2kUmnpWqn4<3w z^^YT(9V-{h=U3uS!uvc-MK(`esa7aD&R!wWaqP?>87CR4Fcgj@T!(`gk?h@ zTPD#}5lX+Xva(p6S@smbC7^c=G^?oMuSR+iHdv^8uvgM*W=y=(D5yd zgp>gtr=m$B9oq>eWDbOz0_RPu0r_BXx9F6lXlg5bR+U?AQ`r`}x-13dP0bXQKIbJ% z*NG~*y_g%1P;)G6S;R2=Cuz;5EXiQUJ!CrK z*6wh(tB4XEq&cn3(O_7z9Tx(O*o6tNRXM4EPZtFk$(uSc3}Of>Aq{YrBOENs9lRQX zVRbs9?r?C5f>d48Iw1q#eE6JbwjE&~s0J}4UA7190iU)tV6=M`b@WN1DiMYH8{(j= zDYEMqCmpW3Ffx{d!`3wPFir7>ro-USV-fCQnxbO3T?~u?ObcodxNv!W(fD%pitBqx zy8+{IyIBV8*Tqf2&@5dd0$i!YK$_g?LI?S9u6@IqGehyz?C5~;EquWH{pDdfxWKA6im4TJ%rflI_ku3SCK)VVUn)lb{xFh=K1yIHJD#j=fVZ%hBLF# zc0*%r#0f1uk7bp4+l{z2sM!l6Hh7d(&v5v56CnK?#)afWiliX)yXfda1JqOI(* ztBe;r{ER}jM0oU3hvxvyn-^^Dr}!n=G|Bl7UwHYqPyXW0`49dhynOp*<~ev`;rx;B 
z(Br$2FU8}9JAca0Y27YS>clip@p<^XCbbo@X=qK6WusJ|F5mekJR-pFYVFAI1aLX` z?Z4z_EW?ncw#nal1?tWBw4Vl1y-7AqzIo>rov#x^qxKPgMpf>76&^P@y{2{BGFf|m zWwo}ruvT*!^`2}wZK(#-#c120x^2zWviWi@SDGr9CJL2&Dcgjl%Lyr8Do%-0)0^ii zxg4xfet!IqdjH>gIyyGJ1P-0aCK3!dlmUkxNG>FqqzwK%Ft&5h=g%VevrPS2$<@hb zX=|gSfvqaZZfb4SQd`-Z09&^@*~%oSFBbByv3(u7S19CASHOK;>S^Pz^tADEhyQ3# z;UbZ{H|5qTeod6x22sy1{KfBncx>wL&!7Ce_g|vuDJcZPkPL(_SC9MNbfS9LiZLK33^eK>4c;s$j`J!9tdw7U0 zMNd=;xrr>v6`J|HRBYr(!RMZj+3xw+tqJDY=^1d(8E_A6Z`3_@drR&V;nDG6d$V-# zPm%hkEN#`>N~No4O*0EtXQ5_wc=}@_n=ce6O6AFlYh+)~>=g=m#uad17eNnC=~Nv( zbMUB4gQz}-+%|UZzV?gr&u#xf{hQzT{m72(k+0kxKG*%~m7$@29@%YN(dZS6whBXP z-}$Br&CAnwfmVtattQ`)3Dr*aG^op>2#tvb4!@xg8Ju}hx1bTLr)N}E+)t}2!D|0x z0apAA<#IMJ*9m@1Yl6`$Ma`v2$0O}hTia$k#ZKjB`57d(&v5jn_c7KG2_OL^fCP{L5!)CK{t&pqK2bMIqvVLK)8np|m4$*>= zNuNm~Xs*zLN=orYvvY&E1%)Q*YYWR47pv5sdDbq*Zx1bIk`~kInAY}Wesf?kjy<%T z=c9WGSzAy}iI$^p>WW%uwVZOobN>LLd1kLgc1aVoq~dn*=4R!(I%1df)%|g$Vbr+!wgN9b7d`K>|ns2_OL^fCP{L z5d-ug$>j53l%8_B4NTn1>@RH`08q`jHyHyaIo) zIfK8vy0A2}dX>Lay?UxA=Qi zYO6C~=U{TeE)AgxUQhG$E34Ioi^~)kf711{0I)LrYIVN4T3w#4u6YH5!jqJGhPr>Q zx>$usnVngiotdlBGwYYI(Ft>y8Wr{5?rs(@a{o{xxd9mZ_7!(&EOfdxG>(sCvRK6$+LOaE$oqU;| zoBBD0n&DMmZajQP+{cuj#eL0iD-(zXYX5%YAo@?2PTVgJY%ZSf>8FvM@nj-(^eFR( zdAmQM=jh*X(7*JQ=hu+W`t%8^*cs5??QU$Z&+WcO(iBa1!oh2zW94GG+{};DP-osa zdvs70sf7a6p=BJrHJnJK#>SYJUbL$uHafB@*p1ou_CTzLU0YbHhP%Gm^;PH~Y64ANtIaH3c5++Y#^BuHX!KR@k+zHBOzA*8 zwJ*f$;p~ctcdc ziPXu?XG7CXLDON(agU2s&JmUkfoz#DJQ1Pv3o9#&)tO~a0bBxl*FYGq5GTW1ikyDk zWsl<{4h$F#o?FQb3XGr^cdu7HQxl&&#wJpuqs(Vw{_t=E!x-ut68r1H2KQ51L zFfbZaP4B?C<0drjfikYgy$hh@TNnu`13FGclSDeU6HLe)2sZ`Jn^ptz!N_vaDM``P zR`{$cx7x6sSm^4q6qGkLQ&jq#mn>Z;s^s=!Zaj`x*H_)4lMedTRzSN>ZpdU)kTgr( z9jG~$!H;VhX8$Cu*_0(2?6`+aN8H*SuXYttqJuPN?C5APEZL3=p{eP@gx9K^RKTZ; zLPs=Bofrl&1eK5mILi?Z7Ud3JVH|IDI->4yaEgLdUDG-t1K@o4oM^TkVV|2C#E^8^ z9<&F1+SY*4?p4&$CyA;=6zXq?gRTaXl%PFu)rFC<92~Z$p@(sP1`S+3u!m`iis5!K zFa|I!s6pVu<@H75%hfBc?aa;FO&mJQEFeFkykfb%%~_ z_8c}?{~*_$U%8HM6KMk?E*;Dl@P{MXlvUYiJ2ZF2-7$qD<<-UsDo3zifr>^4d!d7h 
zI)&+)qJYA>C~er|7e{uFN(ds)%f8ZFsqi(f*VUMUDR-cU5IbE*o!IRvGUzi*(ly+U zgLm6JOr+OfepQ_d7nmE)%tqs>EBQgu5O$o951(Uu>;JbR++FS#_w|Qx9|Ma-0!RP} zAOR$R1dsp{Kmter2_OL^fCMOk410xXn`YPe(3vxvo13S5H(^gpTIY;?Q|*km-hW?7 zZ_)oGdnI|#GQIVGFd!8@9pUaAE|2`*$oj~sgVw>{N@frI^MTC+k;LyMqVYeAKNtI} z*woPXhEB4dV2jZogE$_)kpy;5Kb=Uuc#he5!M_n?_nx)0MZ>?|A~)RqA|bKk&aKAa zK9P{vp3(xlM?%7|o5WjSfvpCsPsLN?N109ATzeta-LpbJJmVM;+^-T4>uIB>ynQOG z@7r4@y_rq-!BDZ$!DYH)fimqI;hCLhjwMo8#+aQI+fv_pp}+RK%VV&(vgH$cI!Mx& zGB|tB3ZG~Y9qdNfRhQ1}eDJ-A)bC`NofiY153jH7E)Y^1Jzv;MmQMxE3?}+54Hgpg zcu!@>*xfAYOg)uI6=Ay)Sbmf?bBY;NmTJ zgKacdNU5(>Xj9CS3A&!Mb0(-dxXS9+{_cBg4`&P+5EMygtS1twvtYvUpb5JR!G87c zy0`jZrjY90aWooToe(d{r{G5LYFg!Yz9 z`|a|vM{MvP?RER@OzkD@*_BE&?MMA^W_&oFdZ2B3kI&A-H@k-o_aV|{e}63c2GahK zY`<{y#=URPM(MNhcIdgZBa4Q_9-|@GQIH?Ewgt<*xtt=Knv8a6jjM$o=$DR~Pjm0VIF~ zkN^@u0!RP}AOR$R1dsp{Kmu=#z#(>&@tz!@AG{j_2iehNaQ^>2`ul(X$$jswX~f(} z00|%gB!C2v01`j~NB{{S0VIF~kia7(Fv1>Z)}CKktt~FB)mV-lW9T=3uEN(WFDze- z9%7Ftz0}_N|IZ@ae{=uK{p=Cih^mkP5)nd^|00|%gB!C2v01`j~NB{{S0VIF~kic6b@FY9V zTp+q4tEc!S*)$3JZZ^v-61jU*Zk^)SM5%2MHF|{2CgZ`?tI-%6O-6RS@BjNL z{RY5a0~^rJ?MjzJ4udbk%F|d#1+xsYS?Jrg`u~mv&E)_9 diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 425fad9..dd081c1 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -31,6 +31,7 @@ model Listing { nightlyPrice Float? @map("nightly_price") totalPrice Float? @map("total_price") currency String? @default("EUR") + priceStatus String? @map("price_status") // EXTRACTED, REQUIRES_TRIP_CONTEXT, UNKNOWN, PARTIAL // Rating rating Float? 
diff --git a/src/actions/import-listing.ts b/src/actions/import-listing.ts index 76e8245..b94fee3 100644 --- a/src/actions/import-listing.ts +++ b/src/actions/import-listing.ts @@ -2,17 +2,29 @@ import { z } from "zod"; import { prisma } from "@/lib/prisma"; -import { scrapeAirbnbListing, extractAirbnbExternalId, normalizeAirbnbUrl } from "@/lib/airbnb-scraper"; +import { scrapeAirbnbListing } from "@/lib/airbnb"; +import { normalizeAirbnbUrl, extractAirbnbExternalId } from "@/lib/airbnb/url-normalizer"; import { slugify } from "@/lib/utils"; import { revalidatePath } from "next/cache"; const schema = z.object({ airbnbUrl: z.string().url("Ungültige URL"), + checkIn: z.string().optional(), + checkOut: z.string().optional(), + adults: z.number().optional(), }); export async function importListingAction(formData: FormData) { + const url = formData.get("airbnbUrl") as string; + const checkIn = formData.get("checkIn") as string | null; + const checkOut = formData.get("checkOut") as string | null; + const adultsStr = formData.get("adults") as string | null; + const parsed = schema.safeParse({ - airbnbUrl: formData.get("airbnbUrl"), + airbnbUrl: url, + checkIn: checkIn || undefined, + checkOut: checkOut || undefined, + adults: adultsStr ? 
parseInt(adultsStr, 10) : undefined, }); if (!parsed.success) { @@ -22,6 +34,7 @@ export async function importListingAction(formData: FormData) { const normalizedUrl = normalizeAirbnbUrl(parsed.data.airbnbUrl); const externalId = extractAirbnbExternalId(normalizedUrl); + // Check for duplicates const duplicate = await prisma.listing.findFirst({ where: { OR: [ @@ -42,10 +55,31 @@ export async function importListingAction(formData: FormData) { }; } - const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl); - const title = scrapedData?.title || "Neues Airbnb"; + // Build trip context from form or URL + const tripContext = { + checkIn: parsed.data.checkIn, + checkOut: parsed.data.checkOut, + adults: parsed.data.adults || 4, + }; + + // Scrape with trip context for better price extraction + const scrapedData = await scrapeAirbnbListing(parsed.data.airbnbUrl, { tripContext }); + + const title = scrapedData?.title?.value || "Neues Airbnb"; const slug = `${slugify(title)}-${Date.now()}`; + // Calculate sleeping stats + let maxSleepingPlaces = scrapedData?.maxSleepingPlaces || null; + let suitableFor4 = scrapedData?.suitableFor4 || null; + let extraMattressesNeededFor4 = scrapedData?.extraMattressesNeededFor4 || null; + let bedTypesSummary = null; + + if (scrapedData?.sleepingOptions && scrapedData.sleepingOptions.length > 0) { + const types = scrapedData.sleepingOptions.map(o => `${o.quantity}× ${o.bedType}`); + bedTypesSummary = types.join(", "); + } + + // Create listing const listing = await prisma.listing.create({ data: { title, @@ -53,29 +87,54 @@ export async function importListingAction(formData: FormData) { airbnbUrl: parsed.data.airbnbUrl, normalizedUrl, externalId, - ...(scrapedData?.pricePerNight && { nightlyPrice: scrapedData.pricePerNight }), - ...(scrapedData?.rating && { rating: scrapedData.rating }), - ...(scrapedData?.reviewCount && { reviewCount: scrapedData.reviewCount }), - ...(scrapedData?.guestCount && { guestCount: scrapedData.guestCount 
}), - ...(scrapedData?.bedrooms && { bedrooms: scrapedData.bedrooms }), - ...(scrapedData?.beds && { beds: scrapedData.beds }), - ...(scrapedData?.bathrooms && { bathrooms: scrapedData.bathrooms }), - ...(scrapedData?.description && { description: scrapedData.description }), - ...(scrapedData?.hostName && { hostName: scrapedData.hostName }), - ...(scrapedData?.location && { locationText: scrapedData.location }), - ...(scrapedData?.latitude && { latitude: scrapedData.latitude }), - ...(scrapedData?.longitude && { longitude: scrapedData.longitude }), - ...(scrapedData?.cancellationPolicy && { cancellationPolicy: scrapedData.cancellationPolicy }), - ...(scrapedData?.images?.length && { coverImage: scrapedData.images[0] }), - ...(scrapedData?.amenities?.length && { amenities: JSON.stringify(scrapedData.amenities) }), + + // Location + locationText: scrapedData?.locationText?.value || null, + latitude: scrapedData?.latitude?.value || null, + longitude: scrapedData?.longitude?.value || null, + + // Pricing + nightlyPrice: scrapedData?.nightlyPrice?.value || null, + totalPrice: scrapedData?.totalPrice?.value || null, + currency: "EUR", + priceStatus: scrapedData?.priceStatus || "UNKNOWN", + + // Rating + rating: scrapedData?.rating?.value || null, + reviewCount: scrapedData?.reviewCount?.value || null, + + // Capacity + guestCount: scrapedData?.guestCount?.value || null, + officialGuestCount: scrapedData?.officialGuestCount?.value || null, + maxSleepingPlaces, + suitableFor4, + extraMattressesNeededFor4, + bedTypesSummary, + + // Room Details + bedrooms: scrapedData?.bedrooms?.value || null, + beds: scrapedData?.beds?.value || null, + bathrooms: scrapedData?.bathrooms?.value || null, + + // Description & Host + description: scrapedData?.description?.value || null, + hostName: scrapedData?.hostName?.value || null, + cancellationPolicy: scrapedData?.cancellationPolicy?.value || null, + + // Images + coverImage: scrapedData?.coverImage || null, + amenities: 
scrapedData?.amenities?.length ? JSON.stringify(scrapedData.amenities) : null, + + // Raw data for debugging rawSourceData: scrapedData ? JSON.stringify(scrapedData) : null, }, select: { id: true, slug: true }, }); + // Save images if (scrapedData?.images?.length) { await prisma.listingImage.createMany({ - data: scrapedData.images.map((url, index) => ({ + data: scrapedData.images.slice(0, 20).map((url, index) => ({ listingId: listing.id, url, sortOrder: index, @@ -83,6 +142,20 @@ export async function importListingAction(formData: FormData) { }); } + // Save sleeping options + if (scrapedData?.sleepingOptions?.length) { + await prisma.listingSleepingOption.createMany({ + data: scrapedData.sleepingOptions.map(opt => ({ + listingId: listing.id, + bedType: opt.bedType, + quantity: opt.quantity, + spotsPerUnit: opt.spotsPerUnit, + quality: opt.quality, + label: opt.label || null, + })), + }); + } + revalidatePath("/dashboard"); revalidatePath("/listings"); diff --git a/src/app/(protected)/admin/import/import-form.tsx b/src/app/(protected)/admin/import/import-form.tsx index 44797ad..330aeac 100644 --- a/src/app/(protected)/admin/import/import-form.tsx +++ b/src/app/(protected)/admin/import/import-form.tsx @@ -4,10 +4,14 @@ import { useState } from "react"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { importListingAction } from "@/actions/import-listing"; export function ImportForm() { const [url, setUrl] = useState(""); + const [checkIn, setCheckIn] = useState(""); + const [checkOut, setCheckOut] = useState(""); + const [adults, setAdults] = useState("4"); const [error, setError] = useState(""); const [success, setSuccess] = useState(false); const [isLoading, setIsLoading] = useState(false); @@ -20,6 +24,9 @@ export function ImportForm() { const formData = new FormData(); 
formData.append("airbnbUrl", url); + if (checkIn) formData.append("checkIn", checkIn); + if (checkOut) formData.append("checkOut", checkOut); + if (adults) formData.append("adults", adults); const result = await importListingAction(formData); @@ -33,25 +40,82 @@ export function ImportForm() { setIsLoading(false); }; + // Get today's date for min date + const today = new Date().toISOString().split('T')[0]; + return ( -
-
- - setUrl(e.target.value)} - required - autoFocus - /> -
- {error &&
{error}
} - {success &&
✓ Erfolgreich importiert!
} - -
+ + + 🏠 Neues Airbnb importieren + + +
+ {/* URL Field */} +
+ + setUrl(e.target.value)} + required + autoFocus + /> +
+ + {/* Trip Context Fields */} +
+ +
+
+ + setCheckIn(e.target.value)} + min={today} + placeholder="Datum" + /> +
+
+ + setCheckOut(e.target.value)} + min={checkIn || today} + placeholder="Datum" + /> +
+
+ + setAdults(e.target.value)} + /> +
+
+

+ 💡 Mit Reisedaten kann der Preis genauer ermittelt werden. + Die Daten werden auch aus der URL extrahiert wenn vorhanden. +

+
+ + {error &&
{error}
} + {success &&
✓ Erfolgreich importiert!
} + + +
+
+
); } diff --git a/src/app/(protected)/admin/listings/[slug]/page.tsx b/src/app/(protected)/admin/listings/[slug]/page.tsx index 64dba1f..5965e67 100644 --- a/src/app/(protected)/admin/listings/[slug]/page.tsx +++ b/src/app/(protected)/admin/listings/[slug]/page.tsx @@ -5,6 +5,7 @@ import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; import { updateListing, deleteListing, addNote, addTagToListing, removeTagFromListing } from "../actions"; +// Note: actions.ts is in /admin/listings/, so from [slug]/ we go up one level with ../ export default async function EditListingPage({ params, diff --git a/src/lib/airbnb/index.ts b/src/lib/airbnb/index.ts new file mode 100644 index 0000000..8550ffa --- /dev/null +++ b/src/lib/airbnb/index.ts @@ -0,0 +1,207 @@ +import * as cheerio from "cheerio"; +import { normalizeAirbnbUrlWithContext } from "./url-normalizer"; +import { parseCapacityFacts, parseRating, parseHost, parseMaxGuests, extractVisibleText } from "./parsers/text-patterns"; +import { parseSleepingArrangements, calculateSleepingStats, deriveSleepingFromBeds } from "./parsers/sleeping"; +import { extractPrice } from "./parsers/price"; +import { extractLocation } from "./parsers/location"; +import { parseJsonLd } from "./parsers/jsonld"; +import { + ExtractedListing, + FieldSource, + field, + mergeField, + TripContext, + SleepingDataQuality, + PriceStatus +} from "./types"; + +// ============================================ +// Main Scraper Function +// ============================================ + +export async function scrapeAirbnbListing( + url: string, + options?: { tripContext?: TripContext; usePlaywright?: boolean } +): Promise { + try { + // Step 1: Normalize URL and extract trip context + const normalized = normalizeAirbnbUrlWithContext(url); + + // Merge trip context from options with URL-extracted context + const tripContext: TripContext = { + checkIn: 
options?.tripContext?.checkIn || normalized.tripContext.checkIn, + checkOut: options?.tripContext?.checkOut || normalized.tripContext.checkOut, + adults: options?.tripContext?.adults || normalized.tripContext.adults || 4, + }; + + // Step 2: Fetch HTML + const html = await fetchHtml(normalized.normalized); + const $ = cheerio.load(html); + + // Step 3: Extract visible text for pattern matching + const visibleText = extractVisibleText(html); + + // Step 4: Run all parsers + const jsonldData = parseJsonLd($); + const capacityFacts = parseCapacityFacts(visibleText); + const ratingFacts = parseRating(visibleText); + const hostName = parseHost(visibleText); + const maxGuests = parseMaxGuests(visibleText); + const sleepingOptions = parseSleepingArrangements(visibleText); + const priceData = extractPrice(html, $, tripContext); + const locationData = extractLocation($, html); + + // Step 5: Build the result with priority: jsonld > text_pattern > derived + const result: ExtractedListing = { + // URLs + originalUrl: normalized.original, + normalizedUrl: normalized.normalized, + externalId: normalized.externalId, + + // Basic Info + title: mergeField( + jsonldData.title ? field(jsonldData.title, 'jsonld', 'high') : null, + field(null, 'derived', 'low') + ), + description: mergeField( + jsonldData.description ? field(jsonldData.description, 'jsonld', 'high') : null, + field(null, 'derived', 'low') + ), + + // Location + locationText: locationData.locationText, + latitude: mergeField( + jsonldData.latitude ? field(jsonldData.latitude, 'jsonld', 'high') : null, + locationData.latitude.value !== null ? locationData.latitude : field(null, 'derived', 'low') + ), + longitude: mergeField( + jsonldData.longitude ? field(jsonldData.longitude, 'jsonld', 'high') : null, + locationData.longitude.value !== null ? 
locationData.longitude : field(null, 'derived', 'low') + ), + + // Pricing + tripContext, + nightlyPrice: priceData.nightly, + totalPrice: priceData.total, + priceStatus: priceData.status, + + // Rating + rating: mergeField( + ratingFacts ? field(ratingFacts.rating, 'text_pattern', 'high') : null, + jsonldData.rating ? field(jsonldData.rating, 'jsonld', 'medium') : null + ), + reviewCount: mergeField( + ratingFacts && ratingFacts.reviewCount > 0 ? field(ratingFacts.reviewCount, 'text_pattern', 'high') : null, + jsonldData.reviewCount ? field(jsonldData.reviewCount, 'jsonld', 'medium') : null + ), + + // Capacity + guestCount: mergeField( + capacityFacts ? field(capacityFacts.guests, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + officialGuestCount: mergeField( + maxGuests ? field(maxGuests, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + bedrooms: mergeField( + capacityFacts ? field(capacityFacts.bedrooms, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + beds: mergeField( + capacityFacts ? field(capacityFacts.beds, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + bathrooms: mergeField( + capacityFacts ? field(capacityFacts.bathrooms, 'text_pattern', 'high') : null, + field(null, 'derived', 'low') + ), + + // Sleeping + sleepingOptions, + maxSleepingPlaces: 0, + suitableFor4: false, + extraMattressesNeededFor4: 0, + sleepingDataQuality: 'UNKNOWN', + + // Host + hostName: mergeField( + hostName ? field(hostName, 'text_pattern', 'high') : null, + jsonldData.hostName ? 
field(jsonldData.hostName, 'jsonld', 'medium') : null + ), + + // Amenities + amenities: jsonldData.amenities || [], + + // Images + images: jsonldData.images || [], + coverImage: jsonldData.images?.[0] || null, + + // Other + cancellationPolicy: field(null, 'derived', 'low'), + + // Debug + rawSnippets: { + title: jsonldData.title || '', + visibleText: visibleText.substring(0, 2000), + }, + extractionLog: [ + `URL normalized: ${normalized.normalized}`, + `External ID: ${normalized.externalId}`, + `Trip context: ${JSON.stringify(tripContext)}`, + `Capacity facts: ${capacityFacts ? JSON.stringify(capacityFacts) : 'none'}`, + `Rating facts: ${ratingFacts ? JSON.stringify(ratingFacts) : 'none'}`, + `Sleeping options: ${sleepingOptions.length} found`, + ], + }; + + // Step 6: Calculate sleeping stats + if (sleepingOptions.length > 0) { + const stats = calculateSleepingStats(sleepingOptions); + result.maxSleepingPlaces = stats.maxSleepingPlaces; + result.suitableFor4 = stats.suitableFor4; + result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4; + result.sleepingDataQuality = 'EXACT'; + } else if (result.beds.value && result.guestCount.value) { + // Derive from beds and guest count + const derivedOptions = deriveSleepingFromBeds(result.beds.value, result.guestCount.value); + const stats = calculateSleepingStats(derivedOptions); + result.sleepingOptions = derivedOptions; + result.maxSleepingPlaces = stats.maxSleepingPlaces; + result.suitableFor4 = stats.suitableFor4; + result.extraMattressesNeededFor4 = stats.extraMattressesNeededFor4; + result.sleepingDataQuality = 'DERIVED'; + } + + return result; + } catch (error) { + console.error("Scraping failed:", error); + return null; + } +} + +// ============================================ +// HTML Fetcher +// ============================================ + +async function fetchHtml(url: string): Promise { + const response = await fetch(url, { + headers: { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 
10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7", + "Accept-Encoding": "gzip, deflate, br", + "Cache-Control": "no-cache", + }, + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status} for ${url}`); + } + + return response.text(); +} + +// Re-export utilities for backward compatibility +export { normalizeAirbnbUrlWithContext as normalizeAirbnbUrl } from "./url-normalizer"; +export { extractAirbnbExternalId } from "./url-normalizer"; diff --git a/src/lib/airbnb/parsers/jsonld.ts b/src/lib/airbnb/parsers/jsonld.ts new file mode 100644 index 0000000..1cf75b4 --- /dev/null +++ b/src/lib/airbnb/parsers/jsonld.ts @@ -0,0 +1,125 @@ +import * as cheerio from 'cheerio'; + +export interface JsonLdData { + title: string | null; + description: string | null; + locationText: string | null; + latitude: number | null; + longitude: number | null; + rating: number | null; + reviewCount: number | null; + images: string[]; + cancellationPolicy: string | null; + hostName: string | null; + amenities: string[]; +} + +/** + * Parse JSON-LD structured data from HTML + * Airbnb typically uses LodgingBusiness or Room schema + */ +export function parseJsonLd($: cheerio.CheerioAPI): JsonLdData { + const result: JsonLdData = { + title: null, + description: null, + locationText: null, + latitude: null, + longitude: null, + rating: null, + reviewCount: null, + images: [], + cancellationPolicy: null, + hostName: null, + amenities: [], + }; + + const jsonLdScript = $('script[type="application/ld+json"]').html(); + + if (!jsonLdScript) { + return result; + } + + try { + const jsonData = JSON.parse(jsonLdScript); + + // Check if it's a lodging business schema + if (jsonData["@type"] !== "LodgingBusiness" && jsonData["@type"] !== "Room") { + return result; + } + + // Title + if (jsonData.name) { 
+ result.title = jsonData.name; + } + + // Description + if (jsonData.description) { + result.description = jsonData.description; + } + + // Location + if (jsonData.address) { + const parts: string[] = []; + if (jsonData.address.addressLocality) parts.push(jsonData.address.addressLocality); + if (jsonData.address.addressRegion) parts.push(jsonData.address.addressRegion); + if (jsonData.address.addressCountry) parts.push(jsonData.address.addressCountry); + + if (parts.length > 0) { + result.locationText = parts.join(', '); + } + } + + // Coordinates + if (jsonData.geo) { + if (jsonData.geo.latitude) { + result.latitude = parseFloat(jsonData.geo.latitude); + } + if (jsonData.geo.longitude) { + result.longitude = parseFloat(jsonData.geo.longitude); + } + } + + // Rating + if (jsonData.aggregateRating) { + if (jsonData.aggregateRating.ratingValue) { + result.rating = parseFloat(jsonData.aggregateRating.ratingValue); + } + if (jsonData.aggregateRating.reviewCount) { + result.reviewCount = parseInt(jsonData.aggregateRating.reviewCount, 10); + } + } + + // Images + if (jsonData.image) { + const images = Array.isArray(jsonData.image) + ? 
/*
 * NOTE: this section opens with the tail of a function whose beginning lies
 * before the visible range (src/lib/airbnb/parsers/jsonld.ts). It is carried
 * over verbatim below and has NOT been reviewed or changed:
 *
 *         jsonData.image.map((img: unknown) => {
 *             const imgObj = img as Record;
 *             return imgObj.url || imgObj['@id'] || String(img);
 *           })
 *         : [jsonData.image.url || jsonData.image['@id'] || jsonData.image];
 *       result.images = images.filter(Boolean);
 *     }
 *
 *     // Cancellation Policy
 *     if (jsonData.cancellationPolicy) {
 *       result.cancellationPolicy = jsonData.cancellationPolicy;
 *     }
 *
 *     // Host name
 *     if (jsonData.provider?.name) {
 *       result.hostName = jsonData.provider.name;
 *     }
 *
 *     // Amenities
 *     if (jsonData.amenityFeature && Array.isArray(jsonData.amenityFeature)) {
 *       result.amenities = jsonData.amenityFeature
 *         .map((f: unknown) => (f as { name?: string }).name)
 *         .filter(Boolean);
 *     }
 *
 *   } catch (error) {
 *     console.error('Failed to parse JSON-LD:', error);
 *   }
 *
 *   return result;
 * }
 */

// ---------------------------------------------------------------------------
// src/lib/airbnb/parsers/location.ts
// Original imports: `* as cheerio` from 'cheerio'; `FieldSource` from '../types'.
// ---------------------------------------------------------------------------

/** One capitalised place-name word; German umlauts and "ß" are allowed. */
const PLACE_WORD = '[A-ZÄÖÜ][a-zäöüÄÖÜß]+';

/** "City, Country" (the city may consist of several capitalised words). */
const WHERE_SECTION_PATTERN = new RegExp(
  `(${PLACE_WORD}(?:\\s+${PLACE_WORD})*,\\s*${PLACE_WORD})`
);

/**
 * "Location: Berlin, Germany" and the German labels "Lage" / "Standort".
 * The label accepts the usual capitalisations explicitly instead of using the
 * `i` flag: with `i`, `[A-Z]` would also match lower-case letters and any text
 * following the word "location" would be captured as a place name.
 */
const LABELLED_LOCATION_PATTERN = new RegExp(
  '(?:[Ll]ocation|LOCATION|[Ll]age|LAGE|[Ss]tandort|STANDORT)[:\\s]+' +
    `(${PLACE_WORD}(?:[\\s,]+${PLACE_WORD})*)`
);

/**
 * "Apartment in Berlin · ..." — the preposition must be a whole word so that
 * words merely ending in "in"/"am" (e.g. "Cabin") do not trigger a match.
 */
const TITLE_LOCATION_PATTERN = new RegExp(
  `\\b(?:in|bei|am)\\s+(${PLACE_WORD}(?:\\s+${PLACE_WORD})?)\\s*[·•]`
);

/**
 * Parse a coordinate string.
 *
 * @param raw    attribute/meta content, possibly undefined
 * @param maxAbs largest valid absolute value (90 for latitude, 180 for longitude)
 * @returns the parsed number, or null when the input is missing, not numeric,
 *          or outside the valid range
 */
function parseCoordinate(raw: string | undefined, maxAbs: number): number | null {
  if (raw === undefined) return null;
  const value = parseFloat(raw);
  return Number.isFinite(value) && Math.abs(value) <= maxAbs ? value : null;
}

/**
 * Extract location from multiple sources with priority:
 * 1. JSON-LD address (handled separately)
 * 2. "Where you'll be" section
 * 3. Meta tags (og:locality, etc.)
 * 4. Visible text patterns ("Location: ...", then the page title)
 *
 * Coordinates are read from `data-lat`/`data-lng` style attributes first and
 * from `geo.position` / `place:location:*` meta tags second. A coordinate of
 * exactly 0 is treated as a valid value, and unparsable input yields null
 * instead of NaN.
 *
 * @param $    cheerio handle for the parsed document
 * @param html raw HTML, used for the "Location: ..." text pattern
 * @returns location text and coordinates, each with source and confidence
 */
export function extractLocation(
  $: cheerio.CheerioAPI,
  html: string
): {
  locationText: FieldSource<string | null>;
  latitude: FieldSource<number | null>;
  longitude: FieldSource<number | null>;
} {
  let locationText: string | null = null;
  let locationSource: FieldSource<unknown>['source'] = 'text_pattern';

  // 1. "Where you'll be" section
  const whereSection =
    $('[data-section-id="LOCATION_DEFAULT"]').text() ||
    $('section:contains("Where you\'ll be")').text() ||
    $('section:contains("Lage")').text();

  if (whereSection) {
    const sectionMatch = whereSection.match(WHERE_SECTION_PATTERN);
    if (sectionMatch) {
      locationText = sectionMatch[1].trim();
      locationSource = 'dom';
    }
  }

  // 2. Meta tags
  if (!locationText) {
    const locality =
      $('meta[property="og:locality"]').attr('content') ||
      $('meta[name="location"]').attr('content');
    const region = $('meta[property="og:region"]').attr('content');
    const country = $('meta[property="og:country-name"]').attr('content');

    if (locality) {
      locationText = [locality, region, country].filter(Boolean).join(', ');
      locationSource = 'meta';
    }
  }

  // 3. Text patterns like "Location: Berlin, Germany"
  if (!locationText) {
    const labelMatch = html.match(LABELLED_LOCATION_PATTERN);
    if (labelMatch) {
      locationText = labelMatch[1].trim();
      locationSource = 'text_pattern';
    }
  }

  // 4. Page title (e.g., "Apartment in Berlin · ...")
  if (!locationText) {
    const titleMatch = $('title').text().match(TITLE_LOCATION_PATTERN);
    if (titleMatch) {
      locationText = titleMatch[1].trim();
      locationSource = 'text_pattern';
    }
  }

  let latitude: number | null = null;
  let longitude: number | null = null;
  let latitudeSource: FieldSource<unknown>['source'] = 'text_pattern';
  let longitudeSource: FieldSource<unknown>['source'] = 'text_pattern';

  // Data attributes are only used when both coordinates are present and valid.
  const domLatitude = parseCoordinate(
    $('[data-lat]').attr('data-lat') || $('[data-latitude]').attr('data-latitude'),
    90
  );
  const domLongitude = parseCoordinate(
    $('[data-lng]').attr('data-lng') || $('[data-longitude]').attr('data-longitude'),
    180
  );
  if (domLatitude !== null && domLongitude !== null) {
    latitude = domLatitude;
    longitude = domLongitude;
    latitudeSource = 'dom';
    longitudeSource = 'dom';
  }

  // Meta tags: "geo.position" holds "lat;lng", the Open Graph tag holds only the latitude.
  if (latitude === null) {
    const geoPosition =
      $('meta[name="geo.position"]').attr('content') ||
      $('meta[property="place:location:latitude"]').attr('content');
    if (geoPosition) {
      const parts = geoPosition.split(/[;,]/);
      const metaLatitude = parseCoordinate(parts[0], 90);
      if (metaLatitude !== null) {
        latitude = metaLatitude;
        latitudeSource = 'meta';
        const metaLongitude = parts.length >= 2 ? parseCoordinate(parts[1], 180) : null;
        if (metaLongitude !== null) {
          longitude = metaLongitude;
          longitudeSource = 'meta';
        }
      }
    }
  }

  if (longitude === null) {
    const metaLongitude = parseCoordinate(
      $('meta[property="place:location:longitude"]').attr('content'),
      180
    );
    if (metaLongitude !== null) {
      longitude = metaLongitude;
      longitudeSource = 'meta';
    }
  }

  return {
    locationText: {
      value: locationText,
      source: locationSource,
      confidence: locationText ? 'medium' : 'low',
    },
    latitude: {
      value: latitude,
      source: latitude !== null ? latitudeSource : 'text_pattern',
      confidence: latitude !== null ? 'high' : 'low',
    },
    longitude: {
      value: longitude,
      source: longitude !== null ? longitudeSource : 'text_pattern',
      confidence: longitude !== null ? 'high' : 'low',
    },
  };
}

// ---------------------------------------------------------------------------
// src/lib/airbnb/parsers/price.ts
// Original imports: `* as cheerio` from 'cheerio';
// `FieldSource`, `PriceStatus`, `TripContext` from '../types';
// `parsePriceFromText` from './text-patterns'.
// ---------------------------------------------------------------------------

/** Selectors that may hold the nightly price; tried in order. */
const PRICE_SELECTORS: readonly string[] = [
  '[data-testid="price-amount"]',
  'span[class*="Price"]',
  'span[class*="price"]',
  '[itemprop="price"]',
  '._1y6k3r2',
  '._1dss1omb',
];

const MS_PER_DAY = 24 * 60 * 60 * 1000;

/**
 * Try to extract a nightly price: first from the text of the first element
 * matched by each known selector, then from the whole HTML string.
 */
function tryExtractPriceFromHtml(html: string, $: cheerio.CheerioAPI): number | null {
  for (const selector of PRICE_SELECTORS) {
    const element = $(selector).first();
    if (element.length) {
      const price = parsePriceFromText(element.text());
      if (price !== null) {
        return price;
      }
    }
  }

  // Fallback: search the entire HTML for a price pattern.
  return parsePriceFromText(html);
}

/**
 * Number of nights between two date strings, or null when either date cannot
 * be parsed or the stay is not at least one night long. `new Date()` never
 * throws on bad input (it yields an invalid date), hence the explicit check.
 */
function countNights(checkIn: string, checkOut: string): number | null {
  const start = new Date(checkIn).getTime();
  const end = new Date(checkOut).getTime();
  if (!Number.isFinite(start) || !Number.isFinite(end)) return null;
  const nights = Math.round((end - start) / MS_PER_DAY);
  return nights > 0 ? nights : null;
}

/** Fresh "nothing found" price field (a new object on every call). */
function unknownPriceField(): FieldSource<number | null> {
  return { value: null, source: 'text_pattern', confidence: 'low' };
}

/**
 * Extract price with trip context awareness.
 *
 * - No price found: both fields are null, status `UNKNOWN`.
 * - Price found but check-in or check-out missing: nightly price with low
 *   confidence, status `REQUIRES_TRIP_CONTEXT` (it may be a base/minimum price).
 * - Price found with both dates: nightly price with medium confidence, status
 *   `EXTRACTED`; the total is nightly × nights (rounded to cents) when the
 *   dates describe a stay of at least one night, otherwise null.
 */
export function extractPrice(
  html: string,
  $: cheerio.CheerioAPI,
  tripContext: TripContext
): {
  nightly: FieldSource<number | null>;
  total: FieldSource<number | null>;
  status: PriceStatus;
} {
  const nightly = tryExtractPriceFromHtml(html, $);

  if (nightly === null) {
    return { nightly: unknownPriceField(), total: unknownPriceField(), status: 'UNKNOWN' };
  }

  const { checkIn, checkOut } = tripContext;
  if (!checkIn || !checkOut) {
    return {
      nightly: { value: nightly, source: 'text_pattern', confidence: 'low' },
      total: unknownPriceField(),
      status: 'REQUIRES_TRIP_CONTEXT',
    };
  }

  const nights = countNights(checkIn, checkOut);
  return {
    nightly: { value: nightly, source: 'text_pattern', confidence: 'medium' },
    total:
      nights !== null
        ? // Round to cents so e.g. 99.99 × 3 does not become 299.96999999999997.
          { value: Math.round(nightly * nights * 100) / 100, source: 'derived', confidence: 'medium' }
        : unknownPriceField(),
    status: 'EXTRACTED',
  };
}

// ---------------------------------------------------------------------------
// src/lib/airbnb/parsers/sleeping.ts
// Original imports: `BedType`, `SleepingOption` from '../types'.
// ---------------------------------------------------------------------------

/**
 * Bed type configuration: maps lower-case text labels (English and German) to
 * a bed type, the sleeping spots per unit, and the quality of the spot.
 */
export const BED_TYPE_CONFIG: Record<
  string,
  { type: BedType; spots: number; quality: 'FULL' | 'AUXILIARY' }
> = {
  'double bed': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
  'doppelbett': { type: 'DOUBLE', spots: 2, quality: 'FULL' },
  'queen bed': { type: 'QUEEN', spots: 2, quality: 'FULL' },
  'king bed': { type: 'KING', spots: 2, quality: 'FULL' },
  'single bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
  'twin bed': { type: 'SINGLE', spots: 1, quality: 'FULL' },
  'einzelbett': { type: 'SINGLE', spots: 1, quality: 'FULL' },
  'bunk bed': { type: 'BUNK', spots: 2, quality: 'FULL' },
  'etagenbett': { type: 'BUNK', spots: 2, quality: 'FULL' },
  'sofa bed': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
  'pull-out sofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
  'schlafsofa': { type: 'SOFA_BED', spots: 2, quality: 'FULL' },
  'couch': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
  'sofa': { type: 'SOFA', spots: 1, quality: 'AUXILIARY' },
  'air mattress': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'luftmatratze': { type: 'AIR_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'floor mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'extra mattress': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'zusatzmatratze': { type: 'EXTRA_MATTRESS', spots: 1, quality: 'AUXILIARY' },
  'futon': { type: 'FUTON', spots: 1, quality: 'AUXILIARY' },
};

/** Escape a literal string for use inside a regular expression. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// Longest labels first so that "sofa bed" wins over "sofa" at the same position.
const BED_LABELS_LONGEST_FIRST = Object.keys(BED_TYPE_CONFIG).sort(
  (a, b) => b.length - a.length
);

// Pattern: "1 double bed" or "2 single beds" or "Bedroom 1: 1 queen bed".
// Group 1 is the quantity, group 2 is exactly one BED_TYPE_CONFIG key. A short
// plural suffix may follow the label, but no further letter, so "1 couchtisch"
// is not read as a couch. Input is expected to be lower-cased.
const BED_PATTERN = new RegExp(
  '(?:(?:bedroom|schlafzimmer|room|zimmer)\\s*\\d*\\s*:?\\s*)?' +
    '(\\d+)\\s+' +
    `(${BED_LABELS_LONGEST_FIRST.map(escapeRegExp).join('|')})` +
    '(?:e?s|e?n|e)?(?![a-zäöüß])',
  'g'
);

export interface SleepingStats {
  maxSleepingPlaces: number;
  suitableFor4: boolean;
  extraMattressesNeededFor4: number;
}

/**
 * Parse sleeping arrangements from text.
 * Handles patterns like:
 * - "1 double bed"
 * - "2 single beds"
 * - "Bedroom 1: 1 queen bed"
 * - "Common space: 1 sofa bed"
 *
 * Only a number directly followed by a known bed label counts; arbitrary
 * "<number> <word>" fragments (e.g. "5 in the morning") are ignored. Repeated
 * mentions of the same bed type are merged by adding up their quantities.
 *
 * @param text free text; matching is case-insensitive
 * @returns one entry per distinct bed type, in order of first appearance
 */
export function parseSleepingArrangements(text: string): SleepingOption[] {
  const options: SleepingOption[] = [];
  const lowerText = text.toLowerCase();

  // The pattern is global and shared, so always start scanning from the beginning.
  BED_PATTERN.lastIndex = 0;

  let match: RegExpExecArray | null;
  while ((match = BED_PATTERN.exec(lowerText)) !== null) {
    const quantity = parseInt(match[1], 10);
    const label = match[2];
    const config = BED_TYPE_CONFIG[label];
    if (!config || quantity <= 0) continue;

    const existing = options.find(option => option.bedType === config.type);
    if (existing) {
      existing.quantity += quantity;
    } else {
      options.push({
        bedType: config.type,
        quantity,
        spotsPerUnit: config.spots,
        quality: config.quality,
        label,
        rawText: match[0].trim(),
      });
    }
  }

  return options;
}

/**
 * Calculate sleeping statistics from options.
 *
 * `maxSleepingPlaces` counts every option; `extraMattressesNeededFor4` counts
 * how many of four people are not covered by FULL quality spots.
 */
export function calculateSleepingStats(options: SleepingOption[]): SleepingStats {
  let maxSleepingPlaces = 0;
  let fullQualitySpots = 0;

  for (const option of options) {
    const spots = option.quantity * option.spotsPerUnit;
    maxSleepingPlaces += spots;
    if (option.quality === 'FULL') {
      fullQualitySpots += spots;
    }
  }

  return {
    maxSleepingPlaces,
    suitableFor4: maxSleepingPlaces >= 4,
    extraMattressesNeededFor4: Math.max(0, 4 - fullQualitySpots),
  };
}

/**
 * Derive sleeping options from bed count (fallback with low confidence).
 * Used when detailed sleeping arrangement text is not available.
 *
 * Beds are assumed to be double beds when there are at least 1.5 guests per
 * bed (or when the guest count is unknown/0), otherwise single beds.
 */
export function deriveSleepingFromBeds(beds: number, guestCount: number): SleepingOption[] {
  if (!beds || beds < 1) return [];

  const avgGuestsPerBed = guestCount ? guestCount / beds : 2;
  const likelyDouble = avgGuestsPerBed >= 1.5;

  return [
    {
      bedType: likelyDouble ? 'DOUBLE' : 'SINGLE',
      quantity: beds,
      spotsPerUnit: likelyDouble ? 2 : 1,
      quality: 'FULL',
      label: likelyDouble ? 'double bed (derived)' : 'single bed (derived)',
    },
  ];
}

// ---------------------------------------------------------------------------
// src/lib/airbnb/parsers/text-patterns.ts
// ---------------------------------------------------------------------------

/**
 * Text pattern parsers for extracting data from visible HTML text
 * Supports both German and English patterns
 */

// "2 guests · 1 bedroom · 2 beds · 1 bath" or German variants, including the
// singular forms ("1 Gast · 1 Schlafzimmer · 1 Bett · 1 Bad").
const CAPACITY_PATTERN = /(\d+)\s*(?:guests?|gast|gäste)\s*[·•]\s*(\d+)\s*(?:bedrooms?|schlafzimmer)\s*[·•]\s*(\d+)\s*(?:beds?|bett(?:en)?)\s*[·•]\s*(\d+(?:[.,]\d+)?)\s*(?:baths?|bad|bäder)/i;

// "4.88 · 200 reviews" or "4,88 (200)" or "4,88 · 200 Bewertungen".
// The (?!\d) guard stops backtracking from splitting a lone "4.88" into
// rating "4.8" and review count "8".
const RATING_PATTERN = /(\d+[.,]\d+)(?!\d)\s*(?:[·•\(]?\s*(\d+)\s*(?:reviews?|bewertungen)?\)?)/i;

// "Hosted by David" or "Gehostet von David"
const HOST_PATTERN = /(?:hosted by|gehostet von)\s+([^\n·•]+)/i;

// "€ 150 / night" or "$150 per night" or "150 € pro Nacht".
// The lookbehind prevents a match from starting in the middle of a number, so
// "€1,250 / night" yields no match instead of the truncated price 250.
const PRICE_PATTERN = /[€$]?\s*(?<![\d.,])(\d+(?:[.,]\d{0,2})?)\s*[€$]?\s*(?:\/|per|pro)\s*(?:night|nacht)/i;

// "6 guests maximum" or "max.
// Matches "6 guests maximum", "max. 6 Gäste", "Up to 6 guests", "bis zu 6 Gäste"
// and the singular "Gast". Group 1 holds the number when the limit word comes
// first, group 2 when it comes last.
const MAX_GUESTS_PATTERN = /(?:maximum|maximal|max\.?|up to|bis zu)\s*(\d+)\s*(?:guests?|gast|gäste)|(\d+)\s*(?:guests?|gast|gäste)\s*(?:maximum|maximal|max\.?)/i;

export interface CapacityFacts {
  guests: number;
  bedrooms: number;
  beds: number;
  bathrooms: number;
}

export interface RatingFacts {
  rating: number;
  reviewCount: number;
}

/**
 * Parse capacity facts from text like "2 guests · 1 bedroom · 2 beds · 1 bath".
 *
 * @returns the four counts (bathrooms may be fractional, "1,5" is accepted),
 *          or null when the text does not contain the full sequence
 */
export function parseCapacityFacts(text: string): CapacityFacts | null {
  const match = text.match(CAPACITY_PATTERN);
  if (!match) return null;

  const [, guests, bedrooms, beds, bathrooms] = match;
  return {
    guests: parseInt(guests, 10),
    bedrooms: parseInt(bedrooms, 10),
    beds: parseInt(beds, 10),
    bathrooms: parseFloat(bathrooms.replace(',', '.')),
  };
}

/**
 * Parse rating from text like "4.88 · 200 reviews".
 *
 * @returns rating and review count (0 when no count was captured), or null
 *          when nothing matches or the number is not a plausible rating
 *          (not finite or greater than 5)
 */
export function parseRating(text: string): RatingFacts | null {
  const match = text.match(RATING_PATTERN);
  if (!match) return null;

  const rating = parseFloat(match[1].replace(',', '.'));
  // A decimal above 5 is some other number (e.g. a price), not a star rating.
  if (!Number.isFinite(rating) || rating > 5) return null;

  const reviewCount = match[2] ? parseInt(match[2], 10) : 0;
  return { rating, reviewCount };
}

/**
 * Parse host name from text like "Hosted by David".
 *
 * @returns the trimmed text following the phrase, or null when the phrase is absent
 */
export function parseHost(text: string): string | null {
  const match = text.match(HOST_PATTERN);
  return match ? match[1].trim() : null;
}

/**
 * Parse price from text like "€ 150 / night".
 *
 * @returns the price as a number (a decimal comma is accepted), or null
 */
export function parsePriceFromText(text: string): number | null {
  const match = text.match(PRICE_PATTERN);
  if (!match) return null;

  const price = parseFloat(match[1].replace(',', '.'));
  return Number.isNaN(price) ? null : price;
}

/**
 * Parse max guests from text like "6 guests maximum".
 *
 * @returns the guest limit, or null when no such phrase is found
 */
export function parseMaxGuests(text: string): number | null {
  const match = text.match(MAX_GUESTS_PATTERN);
  if (!match) return null;

  // Exactly one of the two groups is set, depending on word order.
  const value = match[1] || match[2];
  return value ? parseInt(value, 10) : null;
}

/** Named HTML entities that are decoded; anything else is left untouched. */
const NAMED_ENTITIES: Record<string, string> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
};

/**
 * Decode HTML entities in a single pass, so already-decoded output is never
 * decoded again ("&amp;lt;" becomes "&lt;", not "<"). Handles the names in
 * NAMED_ENTITIES plus decimal (&#39;) and hexadecimal (&#x27;) references.
 */
function decodeEntities(text: string): string {
  return text.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
    (entity: string, dec?: string, hex?: string, name?: string) => {
      if (name !== undefined) {
        const decoded = NAMED_ENTITIES[name.toLowerCase()];
        return decoded !== undefined ? decoded : entity;
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex as string, 16);
      // String.fromCodePoint throws on values beyond the Unicode range.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
  );
}

/**
 * Extract all text content from HTML for pattern matching.
 *
 * Script and style elements are dropped, `<br>` and the end of block-level
 * elements become line breaks, all other tags become spaces, and entities are
 * decoded. Runs of whitespace are collapsed, but line breaks are preserved
 * (one "\n" per break) so that line-bound patterns such as the host-name
 * pattern stop at the end of the element they matched in.
 */
export function extractVisibleText(html: string): string {
  // Remove script and style elements including their content
  let text = html
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ');

  // Line breaks: <br>, <br/> and closing tags of block elements
  text = text.replace(/<br\s*\/?>/gi, '\n');
  text = text.replace(
    /<\/(div|p|br|li|tr|td|th|h[1-6]|section|article|header|footer)[^>]*>/gi,
    '\n'
  );

  // Remove remaining tags
  text = text.replace(/<[^>]+>/g, ' ');

  text = decodeEntities(text);

  // Collapse horizontal whitespace, then reduce each run containing a line
  // break to a single "\n".
  return text
    .replace(/[^\S\n]+/g, ' ')
    .replace(/ ?\n[ \n]*/g, '\n')
    .trim();
}

// ---------------------------------------------------------------------------
// src/lib/airbnb/types.ts
// ---------------------------------------------------------------------------

export type DataSource = 'jsonld' | 'meta' | 'text_pattern' | 'dom' | 'playwright' | 'derived' | 'manual';
export type Confidence = 'high' | 'medium' | 'low';
export type PriceStatus = 'EXTRACTED' | 'REQUIRES_TRIP_CONTEXT' | 'UNKNOWN' | 'PARTIAL';
export type SleepingDataQuality = 'EXACT' | 'DERIVED' | 'UNKNOWN';

/** A value together with where it came from and how much it can be trusted. */
export interface FieldSource<T> {
  value: T;
  source: DataSource;
  confidence: Confidence;
}

/**
 * Create a FieldSource object with value, source, and confidence
 */
export function field<T>(value: T, source: DataSource, confidence: Confidence): FieldSource<T> {
  return { value, source, confidence };
}
/**
 * Merge two FieldSources - takes the first one whose value is neither null
 * nor undefined. Priority: primary over secondary. Falsy values such as 0 or
 * '' count as present.
 *
 * @returns `primary` or `secondary` unchanged, or - when neither has a value -
 *          a new field with value null, source 'derived' and confidence 'low'
 */
export function mergeField<T>(
  primary: FieldSource<T> | null,
  secondary: FieldSource<T> | null
): FieldSource<T> {
  for (const candidate of [primary, secondary]) {
    if (candidate != null && candidate.value !== null && candidate.value !== undefined) {
      return candidate;
    }
  }
  // Nothing usable: null value with the lowest confidence.
  return { value: null as T, source: 'derived', confidence: 'low' };
}

export type BedType =
  | 'DOUBLE'
  | 'SINGLE'
  | 'SOFA_BED'
  | 'SOFA'
  | 'AIR_MATTRESS'
  | 'FUTON'
  | 'BUNK'
  | 'EXTRA_MATTRESS'
  | 'QUEEN'
  | 'KING'
  | 'UNKNOWN';

/** One kind of bed/sleeping surface found in a listing. */
export interface SleepingOption {
  bedType: BedType;
  quantity: number;
  spotsPerUnit: number;
  quality: 'FULL' | 'AUXILIARY';
  label?: string;
  rawText?: string;
}

/** Trip parameters taken from the listing URL's query string. */
export interface TripContext {
  checkIn?: string;
  checkOut?: string;
  adults?: number;
}

export interface NormalizedUrl {
  original: string;
  normalized: string;
  externalId: string | null;
  tripContext: TripContext;
}

// NOTE(review): the type arguments of the FieldSource / Record members below
// were reconstructed from how the parsers fill them — confirm against the
// original types.ts.
export interface ExtractedListing {
  // URLs
  originalUrl: string;
  normalizedUrl: string;
  externalId: string | null;

  // Basic Info
  title: FieldSource<string | null>;
  description: FieldSource<string | null>;

  // Location
  locationText: FieldSource<string | null>;
  latitude: FieldSource<number | null>;
  longitude: FieldSource<number | null>;

  // Pricing
  tripContext: TripContext;
  nightlyPrice: FieldSource<number | null>;
  totalPrice: FieldSource<number | null>;
  priceStatus: PriceStatus;

  // Rating
  rating: FieldSource<number | null>;
  reviewCount: FieldSource<number | null>;

  // Capacity
  guestCount: FieldSource<number | null>;
  officialGuestCount: FieldSource<number | null>;
  bedrooms: FieldSource<number | null>;
  beds: FieldSource<number | null>;
  bathrooms: FieldSource<number | null>;

  // Sleeping
  sleepingOptions: SleepingOption[];
  maxSleepingPlaces: number;
  suitableFor4: boolean;
  extraMattressesNeededFor4: number;
  sleepingDataQuality: SleepingDataQuality;

  // Host
  hostName: FieldSource<string | null>;

  // Amenities
  amenities: string[];

  // Images
  images: string[];
  coverImage: string | null;

  // Other
  cancellationPolicy: FieldSource<string | null>;

  // Debug
  rawSnippets: Record<string, string>;
  extractionLog: string[];
}

// ---------------------------------------------------------------------------
// src/lib/airbnb/url-normalizer.ts
// Original imports: `TripContext`, `NormalizedUrl` from './types'.
// ---------------------------------------------------------------------------

/**
 * Extracts the Airbnb listing ID from a URL.
 * Matches patterns like /rooms/12345, /rooms/12345/ and /rooms/plus/12345.
 *
 * @returns the numeric ID as a string, or null when the URL has no such path
 */
export function extractAirbnbExternalId(url: string): string | null {
  const match = url.match(/\/rooms\/(?:plus\/)?(\d+)/);
  return match?.[1] || null;
}

/** First non-empty value among the given query parameter names. */
function firstParam(params: URLSearchParams, ...names: string[]): string | undefined {
  for (const name of names) {
    const value = params.get(name);
    if (value) return value;
  }
  return undefined;
}

/**
 * Extracts trip context from URL query parameters.
 * Looks for: check_in / checkIn, check_out / checkOut, adults / adults[].
 *
 * @returns the values found; `adults` is omitted (undefined) when the
 *          parameter is missing or not a number. An unparsable URL yields {}.
 */
export function extractTripContext(url: string): TripContext {
  let params: URLSearchParams;
  try {
    params = new URL(url).searchParams;
  } catch {
    // Not an absolute URL: there is no trip context to extract.
    return {};
  }

  const adultsStr = firstParam(params, 'adults', 'adults[]');
  const adults = adultsStr !== undefined ? parseInt(adultsStr, 10) : NaN;

  return {
    checkIn: firstParam(params, 'check_in', 'checkIn'),
    checkOut: firstParam(params, 'check_out', 'checkOut'),
    // parseInt gives NaN for values like "abc"; never hand NaN to callers.
    adults: Number.isNaN(adults) ? undefined : adults,
  };
}

/**
 * Normalizes an Airbnb URL by:
 * - Removing hash
 * - Removing query params (trip context extracted separately)
 * - Removing trailing slashes
 * - Removing www prefix
 * - Lowercasing hostname
 *
 * @returns the normalized URL, or the trimmed input when it cannot be parsed
 */
export function normalizeAirbnbUrl(url: string): string {
  const trimmed = url.trim();

  let parsed: URL;
  try {
    parsed = new URL(trimmed);
  } catch {
    return trimmed;
  }

  parsed.hash = '';
  parsed.search = '';
  parsed.pathname = parsed.pathname.replace(/\/+$/, '');
  parsed.hostname = parsed.hostname.replace(/^www\./, '').toLowerCase();
  return parsed.toString();
}

/**
 * Main function: Normalizes URL and extracts all metadata.
 * The listing ID is read from the normalized URL, the trip context from the
 * original one (normalization removes the query string).
 */
export function normalizeAirbnbUrlWithContext(url: string): NormalizedUrl {
  const original = url.trim();
  const normalized = normalizeAirbnbUrl(original);

  return {
    original,
    normalized,
    externalId: extractAirbnbExternalId(normalized),
    tripContext: extractTripContext(original),
  };
}