From 9b4d15961547a7331fa891e9087590c43f6de70c Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 26 Nov 2023 15:48:28 +0100 Subject: [PATCH] more data --- crawler/crawler.php | 29 ++++++++++++++++------------- crawler/data.db | Bin 17375232 -> 17403904 bytes 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/crawler/crawler.php b/crawler/crawler.php index eb426ad..0c31462 100644 --- a/crawler/crawler.php +++ b/crawler/crawler.php @@ -7,7 +7,7 @@ $GLOBALS['db'] = new SQLite3('data.db'); if(!$GLOBALS['db']) exit("Error loading database"); //389-7625.pdf.csv -// analyzeResultCSV('tmp/csv/389-7625.pdf.csv',7625,389); +// analyzeResultCSV('tmp/csv/389-7637.pdf.csv',7637,389); // exit(); $html = file_get_contents('tmp/ergebnisse.html'); @@ -34,20 +34,21 @@ foreach ($nodes as $node) { echo "[i] Found $last_page pages\n"; //create an array with all pages -$pages = []; -for($i = 1; $i <= $last_page; $i++){ - $pages[] = 'https://www.dognow.at/ergebnisse/?page=' . $i; -} +$pages = range(1,65); -//loop through all pages -foreach($pages as $key=> $page){ - if(file_exists('tmp/pages/' . ($key+1) . '.html')){ + +scanPage(1,false); + +function scanPage($key,$usecache=true) +{ + $page = 'https://www.dognow.at/ergebnisse/?page=' . $key; + if(file_exists('tmp/pages/' . ($key) . '.html' && $usecache===true)){ $html = file_get_contents($page); } else { $html = file_get_contents($page); - file_put_contents('tmp/pages/' . ($key+1) . '.html', $html); + file_put_contents('tmp/pages/' . ($key) . '.html', $html); } $dom = new DOMDocument; $dom->loadHTML($html); @@ -72,20 +73,20 @@ foreach($pages as $key=> $page){ if($res->fetchArray() == false) $GLOBALS['db']->exec("INSERT INTO events (id, name, organizer, date) VALUES ('$id', '$name', '$organizer', '$db_date')"); - crawlRuns($id); + crawlRuns($id,$usecache); echo " [E] $id - $name - $organizer - $date\n"; } - //exit(); } + var_dump($GLOBALS['pdfs']); -function crawlRuns($eventid) +function crawlRuns($eventid,$usecache=true) { - if(file_exists('tmp/events/' . $eventid . '.html')) + if(file_exists('tmp/events/' . $eventid . '.html') && $usecache===true) $data = file_get_contents('tmp/events/' . $eventid . '.html'); else { @@ -94,6 +95,8 @@ function crawlRuns($eventid) file_put_contents('tmp/events/' . $eventid . '.html', $data); } + + //get first table using DOMDocument $dom = new DOMDocument; $dom->loadHTML($data); diff --git a/crawler/data.db b/crawler/data.db index 420075f197edd409c74aa94fc391f0b6a328948d..853f14d37be0bfdac25deb420d0a7d88098d8a9e 100644 GIT binary patch delta 16324 zcmaJ|349bq*6;2)dy>f{nG7K$frKQ49Ft@S_YpcFFa#1pm}C%42!SLV2~dHAaHt$E zH8b|N0_#z(`?+}Fr@JbOuFkr6OM-Ii-^vYqH#WxvtB z&0cF?WuI)%vVCQD*xt4M!FHSNVcQsD)Ur zu^zD=ux_?CIx-!i{IUEe`62nB+$S%SXW6oBX6vWcKg*--KiJRM2cmaAvh?TXU8#F7 z^jiff$L+FTE}xj?&8qa)w)J(^E-bHwM_UIUmFpkXz@xswM-%jq>fllD;3J>@Q2-uo z8GKa69_`|TO`wBqjy;<&O{X@24%RdHXsZ4Z9SjXVnxcOc7`6ApPWmVfUTnhT!AJGo z?dv+W)WXvp^yygkc+#-PHSls>)>|#mA7f%E3UZh+eJHXz4AWoyyeQ=*2B*-zMiXa9r4Dt{+`DSrxXeU*KYz1lwAUS=O>&#>QM-)-M&@0Q<_Uz1;u zpOK%CAC&KwZ;a(SG=kUu_klTlrO= z5bfhvy+X8?Uu_YhTlm#xA-b7g^$5`(euac6;#WI`=uUpMLx}F+SKEc?c7C-@h;HLo z;ax&B%r8SiG{mp45XJmzpAg;0uR4WjC%;-FMAz`E4k6mXuiAxZJHJ{XL|5>uRw3HT zuUdp?3%_a>qRsp&C`5zws!E71h5z`?CL!9yuNsADBfnZCL>KX^dLdfRuj+(o9lxp- zqP3xB*$mAB%?iy1%??e5=71IlEgqT^S^_i|v_xo0(2}9Kp`}3cKpO!q71~H>Y0ySN zONW*LEfd;knEsf%JEiA7#b^vhSr}zwG!~;AjB+u`!)P2v`52AIr~sovjEXQS#>k6N z2}Y$Dm0{$=XaYv%7)``z5=N6Tnu5_(jHY2U9itf-&BSOHMim&%#z?`a5~C`N=3rEf z(Oit?VdTeXK1MYdEx>3YMzt8#VYCRNdW;rh)PT_vi~<-nV$_6D5Tm6SU4_vyjFw|` zHAc-CwP3UYqgISoV$_DwDvVZR)Q(XHMx7X~!DuZ;>o8i6Q5Qzt7;V64BSxDrLKyX6 zv>Br<81-V*htXDy`Z3yu(RPe>V6+pXYcRSNqw6r*h0$(|_F%LZqw6vH5k~tk!We}x z3S$((=mv~bjQ$IwA7gYQM*A^3fYCvWeuB|W7~PD~Eg0R35y9v-jDCvI?HK(GqyNU} z4vc<|(IJfP#ON-J?#AdIj1FUTFGl}^(R~=*kI^qM`XxpWVDunH4`Fl!qlYni1f!!E zJ&Ms|82t*P$1!>WqbNqd#^_0meuL3(G5Q@wPhs>lM#nIE2BQItp2g@mM$cjNdyM{o z(H}8-9-}{D^a4gNV)SQ>Uc%^Qj9$U$1V$$@`U^&{V)PnDr!aaQqc@&FZx2uRj(^^xZ^pSLv zv|P#%-xT{qukaV4dll>iOwuGu-n3F#w84+dDgRT5j$C9GejHf6c0*rpZ^!12%~_Qj z*S7`Nb!;SUGuEb%{ny_hPgs)n}lPV+pKBD}e1OJbca zG6@ICrqgd0`b>(l{ipDAyGu!;t1$kXfWKM|^>eix_ZYjVQxD;r$Lr=YQlbkKKgc2iE=MnV26_QXZrI}z^|yGKC0uCvOP1L z?8nV!ST7tUZFhW{H>SEKFkKDxa-saRJfqOV#8+}|3>BKBhPH5_ywov9p$Et_ZC{O| zLUn3rGZ!lG)ER|-NiHRRIe`jQsi7V&RFvj33f)g0`NE0xz%i#eR8 zB%bvjN!fAMA5cSGTs1%4r#FFm3mFJ}H{K^H^UhIsIS&TFtu5fcS`Dq|N(Je&_2yP@ z79J$oPoFI-oz7j%p^N}W8~y7g0slHRw2mwJQ|HReMzu+rghvhyEVGJ=H_ZIgq9kyo zV51sZtG$?ehF$++lJF2&>AP6yt;}PlW>J#Z3(XHMR6}d14p^r<76mj*yPI6$YBX%r3^mljgv9*RB7@L%q~Uj;W>Xoz z8fxbV+&c5lL&ut}Vqt03QMT|d#l<%INWdRdLo1n*l;g>hWg2>Tl&CWa zyNS>BWyY)|Y7<9Nl!aps#6;w+Wb39}8`&hRlu%oTJZF^R*V{_Z0b*ncWzttbi@^9+iqRhXA zMJbD7=W_}|CoQxSCv-APzb>`w=WSlMx+^QN2_hzWVeP35x{Tmk&0O4Tj1#qM$jMb- zji$>8Z*%qF+vu0E*3wBTzWO$+R8r=9*gRZ{oi53<`vd+aHME>-m8MP6&KYK5wPrG~ z?9)+CJp`H7m|&xUAEkK?JZ%GuHxd8(#j3uQrDq9 zuUf>?!WOoyN+L7-F+qqwOR0t|=B4HtG}e;J3Ezz_s@Tgn)7{B_f)MwDpddM=qEyYm zoE~;g9w0rXQa*WRyU8+24K;D00*_HhB`4FRJSH?!4K@CsLWgjYEasEl6SmTh0-8*k zmv5WX%|`n%Ia!gGOZRG;8d}0-@>BB-GXF)oo*gkJ17zl@p$0BfnyR@D@8?jNTNX>` zc5YTfi>ZhxRi;g|v$HkHMTc^jZX^vauAN$usM+^CZApsR`RPy`bZ46j0{+R(YN(#~ zQIwWv=T5@!cMWn=wB=I;%EdpuVj_5P8A0=c62!4p8`R35$}U;IS*T` z54-6G(w0BAu!zSui;~RFaUnRhUJcc8m4Y;{79E+2?a;+!Ssb@ZshCbZbEiddbFT_c zgF~*C>*S@n>~;nv7~Zj(ZkoTTyF1I@*Rc_>M0H5#hCx)9pmI`+YuKQg}624BjE1}$Z+IgKbKK|ACt?^lL&&2h{l{mg~-0qkoe=ZZb z%>GBa3aPnZ`<1QNHr@J-^+7AL<^sRgX>pqe%*#ywGTj4^E?0V6+5>>@X|YlG7P{v@ zWL=iEJR<;L8>R_S86uQEoHX0klVfX76?!W9N%-(RpxXZx;)1$j>rhA;T*0qEZQ|oUF6{PAC(Yi=!7j-N#gzqu`WE8 z%3l%dnt80Nm{H3Br;?(b?ur`DrV38U>oy$9L2>Eg<4<&VZ)n@NF{^RyrVVXdmH{w( zEZt2b$rLr5rKu$uFjr7)B6}VB#RlDS$9kD?Nyl<&*tJI?XgQc3J4XKT(D%$ofiLjyaNTJ2R* zc!*P++z^9c@M*LUinV6xG2K#e1M$6hw$MALiVK+!y~v*QJMiuHHnMJ(HGwd$>?RngN_Q ziFJqKo;8as@tCZJQ@9i$BjcoM$;s|p(`i>T)v#Nelo10qi3`Y{vBz>V;N6nBgg4cI zP@2RVGVtYnWuss^Nm?JCNrr*QfbYG0I?Nn3oXACr(hTsg2@?K2N7ol+!0NcTM2;~{ zY7(nSc@_$t=z#m!5roTAAXH;lm^zm;-*W2q)L_dK&cD zuE``;l7V|Zn+#c`pYJ1Nk+Jx-L=DGtwF1u+J-BO<#457q(Q^gGd<4MgJX!`pC621d zfJ7_xkflkSLmGZ~sjzs#-8@jaG(QGQb5ISZG{$2<>5bwH(iVAm1`Q=Lm&h^V_(pLm zIeYuV6&bM0c1^;#%Z=g`QkniZ+vO@XY~w=1lEOxDG7&%DSV(u-$|Xv33?|hmP9oWd zA7d8WtcERGH|{R|MmLJ(B&oY8h1~qY>9P!12s0Nen5zLFo)$NX6Ug}wFHXq>i+}^d z#D%6AeYugWeDJbDvN8_Qh#+y99Jj%O8^sbbaA0d@=0-IvQVCJ=c~%}yO9 z_IDgdBmw_oHFT{u-&CL8v>HvKmkj*-lktTbAW7EF6rhc+;Tm~qFe;;s*s;|pk&B-_ zm_oADFGm;8t!Ia@;?`iy5;e4wYC-5TCIOA&MABCM^AvL8yU#}y&$ZE)o3FSjz8wDz z&1iQ}HK$aWTB)~-MZ%3l-1X0y?HsqNe$EN0EC8lcYDmc*Q;Tg3((U%)i3HcqTL3r1;_Uzm{i?wNFzc9;3|C{?>N2YO5Ni_#Zlf5UM8#+N}^n+w6DGoSDiq7iI2e3^^*L z_p`)FiQ^D8;NPK!X+i*fK(NxI*>($yRwilotK|LVuceN2#ZRhVilz@@dZ|e$;kUg}i+b&v0 zg{!&LuxPMCiw2p1spn|(HJ-^UOhC3i^59n!;Y^;+gNsW^(VfXvT#YijI;v*{Wp>Xz z|J}%jWG$Jf=BujY^sxvvlvKdbm9!6_(}AgGCdH%iivPeI+XDm6TXszM~7jP6h_*lJ~rt|BRePQ0EQ zZOn-WS&#*4crx#!$ZcTxS}g)6rs4daUBb=;Ao;oZ1^m5gc#@_zrb|yIx0>}lH0V}v zD}l40`%AzNNNA#_3yh|Y9&R-WjE&DPRVvofkqUq|(U>fNV&z;3sBwMdXqAM!$UyPG zM|&&yEI~&*Zx*WI2~_6_ez;ZO{BZH~r+Car;qL_uao9&Su3(5;G=`XOE(2Vt-W*&0BUcJq;8k9(HH~Dxn+yS5|=0nVTjEVt3&`Y3KvivniCkHM2kjFlQZ?DBfH{whznAfR3bzK z*J0P9gnuSH1vR^|&exr{I+r-@@qdp0QG9OPi*cLd?2gABb@IQVX4fId*^k)k;ghNt zY^!Z1>qFM1mhUY0SnAAwH%H8)O>daCn%vTFq-EmQ;!Waw;R}K6$;uv=xeDs3>|LaS z)WNLCei=T0u!>iclXqR11SE0Qb?iiiatE(MP?9*pkUc${rp4@;saeP?U2hkcDsy=N z2FuirGrt;PI3G%bNrTqRRLtU1;`8szBwf$t$$1`5xCB?I5r+9;2zNsaxH5T@xSX8K z{(2Hwoc@VD&!hFfQjKUC0OWTDc}cvARDS-%)S{aiSW`x5r&x;`Vc;KpWtaxA91Rl> zOe(Hoas9k9f;)7uNewgP@07|@2YClNCk#pQzza`IrwL7~8fG{j@Kg36=3zoa`E_4S z1~?qU@Bo}r2a_k>Nj2G1^-2{D+mI(S*bYc-D6{86Rix*$hVe8Qu2I7br30c#TOBh* zCgdPV;Rzmc;^q$u$rq(1a$X7r>i#1^;L;dEXCTO+A_JyP%Kc(y@tjy3^04?hD+sx# zR@BW)8_Z|fFm`hG`*%x96#pM-_w$t`j-O}-9IM!-&d~1#wgG$5(C|iq*MAk0RgyI3 z1eUg+8zS%>1{Eu$=ald$j@i_pNkR6uHDqt+-2UU5MMHF3}w+3gb;T|dqbYJRV z8cap^0p#}GcOf^UQVk<60hlosUb%#i^wfM-Fba6OOTEu z-LW|un5|Cgc3=NclA(sXr~r!;dVZ%<0)Eox|M8gFkFg-3EA}r^!|VApfC-IFgMN6> z`Jb53lw##P4IM8j8buTYc5)q`*f4f-jmX(a>X<-1uH{0*sK+&;Mm>Uyu@bE)0Tap& zB9SA@HD;hfFa7vP)%dCbwj>TSjig~#pk+j*{s3BI0;2NZA3iQB;aJn5By#U!Ivsq0 zC^5+nAZ-LmV$64#PW2a$Pb{9rp@fp6Aqha2 ztNDu#GIbtC9CUyg`_?~EOqse>Tw)lH*v!+6XNJ&y_I->Qvg zq$oRd6y*voF^r<@&{32vS|3J=vSW~?HnPxj?Yqw=sPg}9oXZf4uCl;^yWAn}CO4IAWw&Y6hOM9hC z@vL~Gm`Kop+Qnq|30rbTEqtLx-Beq^ zcgew*zZ*w)HK;~7I+VPbr7;}J#xek`yP|dSP(38W}@1 zU@1Lby*=$94?lUK5SDT@?=XL`O3X}u2f6g*h0)%{zh|DW#J;mwszx&TqzgQi`blq> zB>09d{#kF)LJmMIij%310o)eJh)vo6kG7MnxUb5J8|G?b@Q?uqb0nQlxzsa7Kjm%G za*}j5SV%D^?9wPck)rg$OqQ<1HiyrHBd{N>h6a^F{ac4cM0%dSP@J(yjcD~VI6I9J8+bhuG6}7C zHVo2-;S5iD`oi2%3)P5LJexMe)v)=JlYIw9k~8hU97SDclNw3l%L~y?WYiW%gW))_ zhn$?yJ8`yo_FuUrD+`uWNVQX$_BeI#Zxj~?t$wi9L~W1UUcD!U9e_XvzMni6Xtk{S|pOdwE#86jwJSGc5y2a ztveQ!_~WS?y#ywqK|q+Glk4QCl^S$x;yNOxlDv}YG#cb?w(odb*U3pY97$n|*h{kOKg;#b*UqFQX0<{P5@H8e z@foc(Y!>^-rJPr%d8?mfQ&Zg9+ki{RTm|5mJ~)I;;!a*DuDXk3;1QZ@u2mzfSSLz( z>8lMYlDM7p3}h=sOW8szBiJkLgtQ{UDt9uRPll8#EN&y023{>MQmR}K=bqiKjL?D> z1UjvhS7gjqL*(e)pG_s-kIA%MRV2J9xNdPxN;s9UDIv-Es1rF0;?Dp$sEqq0ZhPD` z$2$(<7%%@tUN5KE58LZ(AKEtBa;<-|?geP@u|>5Mn*V6N#_Ta2F?E=dq~A(G@w|8& zKw2m@R+uQjBP{btSAOO$!1wSCj_+D$x9cTRgxfe0zzHO45*WAwQv$-GSE5m-l8wG6Id|5 z5VSTPVdW)C=RjG5fAh@L%*<2(vCOkFRuT{k1tIkNY9Mr2x#w9Ja=2nmPQ)QATJfI2?TS zR09f1L&b9XI(@)#oPFZ`ic#~`$RaLLFjS~y5|lV7rHLNIjyf&^$Cf_o_t7sEzIc05 z#vC6nrz^DjEf+_CjqUX$EB4H^|QvkONomrO0vt7u0?2!Dr0L4|??W zsF4~f1iO^ITE9zu_FEsDKM!`cCXHD*kk%ah2mD~;S`mqIH@fqr&&If$irJ^B3e=qI z9%C>-@o0ZL`EU3NCZg4pC}*Ovoos4W$WnlcDj16I8k!Aus}WXIf|?g_8hRaz$VrWo zY96crno=a(a2}YJ2&*YUNHj7neP)d_BE@Z=CuYJa#i~jW9l%z#8W)?7iIQ+F(HY+P ztH445@x>4Ybgh+{hS5HK5~qC%iv9dtQTx)!hHnUy{qHde{E5)nKzn{KFfszE?2R=cOAgdHzLDKX>C6SUeE( z{jOOFrxRKeGMq0tcRHuXe;NOa_@!~A?UCEJM-^0@ZI7a}u58mNfZ-a^H4KLL>tHxQ zW5#vStl1C(`zb1UG!!mhQT3&XZ#iYD8XO} z{TiaBAsc3+QC7o%dOy%&^us|4!!^ok7!2PwU&H%FeWdQg|abHg^;Z_I*Ck_?^ zet0iNYXP`Phgvl+I8iI?w`(xAq~>WYW7Ym2hu{V^!iX+9xh{i_jprm)$%;QlC?sn~ G?f(K0toG3W delta 1051 zcmWmDd3+WE9LMqJ*|WCic~qMciVh{Yja;!J_mMjiVhzdJ$FbbH_>4kZVXi4m&M4AD zj+znWXyn+OgKYd%yH06pscvdXhJsvEOKRwVTwz7gs7&?fOkwB{ib8Qnhy0!m z8-$LbQ|KH@LTT7AbO~L?ixn0dk-mBnQhOa;O|8hs!WILXMQ9*(aI2AL|;g%2FAeg5%#jD>A$eFHkw@h*nJbUW6EaWc%aihyERd&Vq2%Nlc~%z5bMm~r zATP>dSt5<{k~GQ7@`@~#SLHQXCd=h@c|+cmw`7Gh%iHpftdw`P;g9fV_$&M!{t5qv|H8Vkep-8Z xa(c+8AfHq|h52;IrzoG|u