<?php

/**
 * Crawler for the dognow.at result listings.
 *
 * Flow: read the pagination of the result overview to find the number of
 * listing pages, walk every listing page, and for each event found there
 * fetch its run table (crawlRuns). Agility runs additionally have their
 * result PDF downloaded and converted to CSV (getResults).
 *
 * Everything fetched is cached below tmp/ (pages, events, results, csv).
 */

// NOTE(review): the opening lines of this script were not recoverable from
// the source text. The first-page URL and the initial value of $last_page
// below are reconstructed from the per-page URL pattern used further down
// -- confirm against the original file.
$last_page = 0;
$html = file_get_contents('https://www.dognow.at/ergebnisse/?page=1');
if ($html === false || $html === '') {
    echo "[!] Could not load the result overview\n";
    exit(1);
}

$dom = new DOMDocument;
$dom->loadHTML($html);

$xpath = new DOMXPath($dom);
$query = '//ul[@class="pagination"]/child::*';
$nodes = $xpath->query($query);

// Number of getResults() calls made during this run (dumped at the end).
$GLOBALS['pdfs'] = 0;

// The highest number shown in the pagination list is the last page.
foreach ($nodes as $node) {
    $number = intval($node->nodeValue);
    if ($number > $last_page) {
        $last_page = $number;
    }
}

echo "[i] Found $last_page pages\n";

// Walk all listing pages. A page that is already cached in tmp/pages is read
// from disk instead of being downloaded again, so tmp/pages has to be
// cleared for newly published events to show up.
for ($pageNo = 1; $pageNo <= $last_page; $pageNo++) {
    $html = crawlerFetchCached(
        'https://www.dognow.at/ergebnisse/?page=' . $pageNo,
        'tmp/pages/' . $pageNo . '.html'
    );
    if ($html === false || $html === '') {
        echo " [!] Could not load page $pageNo\n";
        continue;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($html);

    // Every event is rendered as one "resultboard" div.
    $xpath = new DOMXPath($dom);
    $query = '//div[@class="resultboard info-board info-board-default2"]';
    $nodes = $xpath->query($query);

    foreach ($nodes as $node) {
        $div = $node->getElementsByTagName('div')->item(0);
        if (!$div) {
            continue; // board without the expected inner structure
        }

        $id = $div->getAttribute('data-event');

        // Inner divs by position: 1 = name, 2 = organizer, 3 = date.
        $inner = $div->getElementsByTagName('div');
        $name = $inner->item(1) ? trim($inner->item(1)->nodeValue) : '';
        $organizer = $inner->item(2) ? trim($inner->item(2)->nodeValue) : '';
        $date = $inner->item(3) ? trim($inner->item(3)->nodeValue) : '';

        crawlRuns($id);

        echo " [E] $id - $name - $organizer - $date\n";
    }
}

var_dump($GLOBALS['pdfs']);

/**
 * Tells whether an id taken from the remote HTML may be used in a file name.
 *
 * Event and run ids are read from attributes of downloaded pages and end up
 * in cache file paths (and, via the PDF path, in a shell command), so only
 * letters, digits, "_" and "-" are accepted.
 * NOTE(review): presumably the real ids are numeric -- widen if not.
 *
 * @param mixed $id Id as read from the page.
 * @return bool True when the id consists only of the allowed characters.
 */
function crawlerIsSafeId($id)
{
    return preg_match('/\A[A-Za-z0-9_-]+\z/', (string) $id) === 1;
}

/**
 * Makes sure $file exists and is non-empty, downloading $url if necessary.
 *
 * A failed or empty download is not written to disk, so it is retried on the
 * next run. The target directory is created when it is missing.
 *
 * @param string $url  Address to download from on a cache miss.
 * @param string $file Path of the cache file.
 * @return bool True when $file is available afterwards.
 */
function crawlerDownloadTo($url, $file)
{
    if (is_file($file) && filesize($file) > 0) {
        return true;
    }

    $body = file_get_contents($url);
    if ($body === false || $body === '') {
        return false;
    }

    $dir = dirname($file);
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        return false;
    }

    return file_put_contents($file, $body) !== false;
}

/**
 * Returns the content of $url, using $cacheFile as an on-disk cache.
 *
 * @param string $url       Address to download from on a cache miss.
 * @param string $cacheFile Path of the cache file.
 * @return string|false The content, or false when it could not be obtained.
 */
function crawlerFetchCached($url, $cacheFile)
{
    if (!crawlerDownloadTo($url, $cacheFile)) {
        return false;
    }

    return file_get_contents($cacheFile);
}

/**
 * Fetches the run table of one event and prints one line per run.
 *
 * Rows with three cells are treated as rally obedience runs (name, class,
 * PDF link); rows with four cells as agility runs (name, class, size class,
 * PDF link). For agility rows the result PDF is fetched via getResults().
 *
 * @param string $eventid Event id as found in the listing (data-event).
 * @return void
 */
function crawlRuns($eventid): void
{
    if (!crawlerIsSafeId($eventid)) {
        return;
    }

    $data = crawlerFetchCached(
        'https://www.dognow.at/ergebnisse/src/data.php?event=' . $eventid . '&lauf=0',
        'tmp/events/' . $eventid . '.html'
    );
    if ($data === false || $data === '') {
        return;
    }

    // NOTE(review): the original literal between "Einzelwertung" and
    // "Derzeit sind keine Ergebnisse" was not recoverable from the source
    // text; this pattern accepts any whitespace or markup between the two.
    if (preg_match('/Einzelwertung(?:\s|<[^>]*>)*Derzeit sind keine Ergebnisse/', $data) === 1) {
        echo " [i] Keine Einzelwertungen\n";
        return;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($data);

    // When a cup ranking is present, the second table is used instead of the first.
    $tableIndex = 0;
    if (strpos($data, 'Cup-Wertung') !== false) {
        echo " [i] Found Cup-Wertung, skipping first table\n";
        $tableIndex = 1;
    }

    $table = $dom->getElementsByTagName('table')->item($tableIndex);
    if (!$table) {
        return;
    }

    foreach ($table->getElementsByTagName('tr') as $row) {
        // The run id is the part after the first "_" of the row's id attribute.
        $rid = '';
        $rowId = $row->getAttribute('id');
        if ($rowId) {
            $parts = explode('_', $rowId);
            $rid = $parts[1] ?? '';
        }

        $tds = $row->getElementsByTagName('td');
        $cellCount = $tds->length;
        if ($cellCount !== 3 && $cellCount !== 4) {
            continue; // header rows and anything else that is not a run
        }

        $isAgility = ($cellCount === 4);

        // All per-row values are assigned here, so nothing carries over
        // from a previous row.
        $runname = trim($tds->item(0)->nodeValue);
        $lk = trim($tds->item(1)->nodeValue);
        $gk = $isAgility ? trim($tds->item(2)->nodeValue) : '';

        // The PDF link sits in the last cell.
        $link = $tds->item($cellCount - 1)->getElementsByTagName('a')->item(0);
        $pdf = $link ? $link->getAttribute('href') : '';

        if ($isAgility && $link) {
            getResults($rid, $eventid);
        }

        if (!$runname || !$lk || !$pdf) {
            continue;
        }

        echo " [R-$rid] $runname - $lk - $gk - $pdf\n";
    }
}

/**
 * Downloads the result PDF of one run (unless cached) and converts it to CSV.
 *
 * Increments $GLOBALS['pdfs'] on every call, including calls that return
 * early because an id is missing.
 *
 * @param string $run   Run id ("lauf" in the PDF URL).
 * @param string $event Event id.
 * @return void
 */
function getResults($run, $event): void
{
    $GLOBALS['pdfs']++;

    if (!$run || !$event) {
        return;
    }
    if (!crawlerIsSafeId($run) || !crawlerIsSafeId($event)) {
        return;
    }

    $url = "https://www.dognow.at/ergebnisse/pdf.php?lauf=$run&event=$event";
    $pdfPath = 'tmp/results/' . $event . '-' . $run . '.pdf';

    if (!crawlerDownloadTo($url, $pdfPath)) {
        echo " [!] Could not download $url\n";
        return;
    }

    convertPDFtoCSV($pdfPath, 'tmp/csv/' . $event . '-' . $run . '.pdf.csv');
}

/**
 * Converts a PDF to CSV and stores the result, unless the target exists.
 *
 * Nothing is written when the conversion yields no output, so a failed
 * conversion is retried on the next run.
 *
 * @param string $pdf        Path of the PDF to convert.
 * @param string $targetname Path of the CSV file to create.
 * @return void
 */
function convertPDFtoCSV($pdf, $targetname): void
{
    if (file_exists($targetname)) {
        return;
    }

    $csv = analyze($pdf);
    if (!is_string($csv) || $csv === '') {
        echo " [!] No CSV output for $pdf\n";
        return;
    }

    $dir = dirname($targetname);
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        return;
    }

    file_put_contents($targetname, $csv);
}

/**
 * Runs the tabula jar on a PDF and returns what it prints.
 *
 * The jar is referenced by a relative path, so it is looked up relative to
 * the current working directory.
 *
 * @param string $pdf Path of the PDF to analyse.
 * @return string|false|null Output of the command as returned by shell_exec().
 */
function analyze($pdf)
{
    // The path contains ids taken from remote HTML, so it is escaped.
    $cmd = 'java -jar tabula-1.0.5-jar-with-dependencies.jar -f CSV ' . escapeshellarg($pdf);

    return shell_exec($cmd);
}