loadHTML($html);
// NOTE(review): the call above is the tail of a statement that begins before
// this section; $html and $dom are set up there (presumably $last_page too).

$xpath = new DOMXPath($dom);
$nodes = $xpath->query('//ul[@class="pagination"]/child::*');

// Global counter of result PDFs seen; incremented by getResults().
$GLOBALS['pdfs'] = 0;

// Fall back to 0 when the earlier part of the script did not initialise it.
$last_page = $last_page ?? 0;

// The largest number shown in the pagination list is taken as the page count.
// Items without a numeric label evaluate to 0 and are therefore ignored.
foreach ($nodes as $node) {
    $number = intval($node->nodeValue);
    if ($number > $last_page) {
        $last_page = $number;
    }
}

echo "[i] Found $last_page pages\n";

// Walk every result page, using the on-disk copy when one exists.
for ($pageNo = 1; $pageNo <= $last_page; $pageNo++) {
    $html = fetchCached(
        'tmp/pages/' . $pageNo . '.html',
        'https://www.dognow.at/ergebnisse/?page=' . $pageNo
    );
    if ($html === null) {
        echo " [!] Could not load page $pageNo, skipping\n";
        continue;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($html);

    // Every event on the page is rendered as one "resultboard" div.
    $xpath = new DOMXPath($dom);
    $boards = $xpath->query('//div[@class="resultboard info-board info-board-default2"]');

    foreach ($boards as $board) {
        $div = $board->getElementsByTagName('div')->item(0);
        if ($div === null) {
            continue;
        }

        // Nested divs 1..3 of the first inner div are read as name, organizer and date.
        $cells = $div->getElementsByTagName('div');
        $id = $div->getAttribute('data-event');
        $name = nodeText($cells->item(1));
        $organizer = nodeText($cells->item(2));
        $date = nodeText($cells->item(3));

        crawlRuns($id);
        echo " [E] $id - $name - $organizer - $date\n";
    }
}

var_dump($GLOBALS['pdfs']);

/**
 * Returns the trimmed text content of a DOM node, or '' when the node is missing.
 *
 * @param DOMNode|null $node
 * @return string
 */
function nodeText($node)
{
    return $node === null ? '' : trim((string) $node->nodeValue);
}

/**
 * Tells whether a scraped identifier is safe to embed in file paths and URLs.
 *
 * Identifiers come from remote HTML, so anything other than letters, digits,
 * '_' and '-' is rejected to rule out path traversal and query manipulation.
 *
 * @param mixed $id
 * @return bool
 */
function isSafeId($id)
{
    return is_scalar($id) && preg_match('/^[A-Za-z0-9_-]+$/', (string) $id) === 1;
}

/**
 * Returns the body of $url, served from $cacheFile when a non-empty copy exists.
 *
 * A successful download is written to $cacheFile. Failed or empty downloads
 * are not cached, so a later run can retry them.
 *
 * @param string $cacheFile Path of the cached copy.
 * @param string $url       Address to download on a cache miss.
 * @return string|null The document body, or null when it could not be obtained.
 */
function fetchCached($cacheFile, $url)
{
    if (is_file($cacheFile)) {
        $cached = file_get_contents($cacheFile);
        // An empty cache file is treated as a miss: DOMDocument::loadHTML()
        // cannot parse empty input.
        if ($cached !== false && $cached !== '') {
            return $cached;
        }
    }

    $body = file_get_contents($url);
    if ($body === false || $body === '') {
        return null;
    }

    file_put_contents($cacheFile, $body);

    return $body;
}

/**
 * Lists the runs of one event and triggers the result download for 4-column rows.
 *
 * The event's run table is fetched (or read from tmp/events/) and one line is
 * printed per run that has a name, a class and a PDF link.
 *
 * @param string $eventid Event identifier taken from the result page.
 * @return void
 */
function crawlRuns($eventid)
{
    if (!isSafeId($eventid)) {
        return;
    }

    //sleep(1);
    $data = fetchCached(
        'tmp/events/' . $eventid . '.html',
        'https://www.dognow.at/ergebnisse/src/data.php?event=' . $eventid . '&lauf=0'
    );
    if ($data === null) {
        return;
    }

    // The needle deliberately contains a line break, exactly as in the original source.
    if (strpos($data, "Einzelwertung
Derzeit sind keine Ergebnisse") !== false) {
        echo " [i] Keine Einzelwertungen\n";
        return;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($data);
    $tables = $dom->getElementsByTagName('table');

    // When a "Cup-Wertung" section is present, the run table is the second table.
    if (strpos($data, 'Cup-Wertung') !== false) {
        echo " [i] Found Cup-Wertung, skipping first table\n";
        $table = $tables->item(1);
    } else {
        $table = $tables->item(0);
    }

    if ($table === null) {
        return;
    }

    foreach ($table->getElementsByTagName('tr') as $row) {
        // Row ids are split on '_' and the second part is used as the run id.
        $rid = $row->getAttribute('id');
        if ($rid) {
            $rid = explode('_', $rid)[1] ?? '';
        }

        $tds = $row->getElementsByTagName('td');
        $columns = $tds->length;

        // Only 3-column (rally obedience) and 4-column (agility) rows describe
        // a run; anything else, such as header rows, is skipped so that values
        // of a previous row are never reported again.
        if ($columns !== 3 && $columns !== 4) {
            continue;
        }

        $runname = nodeText($tds->item(0));
        $lk = nodeText($tds->item(1));
        $gk = $columns === 4 ? nodeText($tds->item(2)) : '';

        // The PDF link sits in the last cell; it may be absent.
        $link = $tds->item($columns - 1)->getElementsByTagName('a')->item(0);
        $pdf = $link === null ? '' : $link->getAttribute('href');

        if ($columns === 4) {
            getResults($rid, $eventid);
        }

        if (!$runname || !$lk || !$pdf) {
            continue;
        }

        echo " [R-$rid] $runname - $lk - $gk - $pdf\n";
    }
}

/**
 * Downloads the result PDF of one run (unless already present) and converts it to CSV.
 *
 * Increments the global PDF counter on every call, including calls that return
 * early because an identifier is missing.
 *
 * @param string $run   Run identifier.
 * @param string $event Event identifier.
 * @return void
 */
function getResults($run, $event)
{
    $GLOBALS['pdfs']++;

    if (!$run || !$event) {
        return;
    }
    // Both ids end up in file names and in a shell command line further down.
    if (!isSafeId($run) || !isSafeId($event)) {
        return;
    }

    $pdfFile = 'tmp/results/' . $event . '-' . $run . '.pdf';

    if (!file_exists($pdfFile)) {
        $pdfData = file_get_contents("https://www.dognow.at/ergebnisse/pdf.php?lauf=$run&event=$event");
        if ($pdfData === false || $pdfData === '') {
            // Do not store a failed download as an empty PDF.
            return;
        }
        file_put_contents($pdfFile, $pdfData);
    }

    convertPDFtoCSV($pdfFile, 'tmp/csv/' . $event . '-' . $run . '.pdf.csv');
}

/**
 * Converts a PDF to CSV and stores the result, unless the target file already exists.
 *
 * @param string $pdf        Path of the PDF to convert.
 * @param string $targetname Path of the CSV file to write.
 * @return void
 */
function convertPDFtoCSV($pdf, $targetname)
{
    if (file_exists($targetname)) {
        return;
    }

    file_put_contents($targetname, analyze($pdf));
}

/**
 * Runs the tabula JAR on a PDF and returns whatever it prints.
 *
 * @param string $pdf Path of the PDF to analyse.
 * @return string|false|null Output of the command as returned by shell_exec().
 */
function analyze($pdf)
{
    // The path is escaped because it is assembled from scraped identifiers.
    $cmd = 'java -jar tabula-1.0.5-jar-with-dependencies.jar -f CSV ' . escapeshellarg($pdf);

    return shell_exec($cmd);
}
?>