loadHTML($html);
// NOTE(review): the statement above is the tail of a line that begins outside
// this excerpt; it is reproduced untouched.

// Disabled pagination discovery, kept for reference:
// $xpath = new DOMXPath($dom);
// $query = '//ul[@class="pagination"]/child::*';
// $nodes = $xpath->query($query);
// $GLOBALS['pdfs'] = 0;
//
// // Loop through the selected nodes
// foreach ($nodes as $node) {
//     // Do something with each node, for example, echo its content
//     $url = $node->getElementsByTagName('a')[0]->getAttribute('href');
//     $number = intval($node->nodeValue);
//     if($number > $last_page){
//         $last_page = $number;
//     }
// }
// echo "[i] Found $last_page pages\n";
// //create an array with all pages
// $pages = range(1,65);
// foreach($pages as $page)
// {
//     echo "[i] Crawling page $page\n";
//     scanPage($page);
// }

// The only initialisation of the counter visible here is inside the disabled
// block above, so make sure it exists before getResults() increments it.
$GLOBALS['pdfs'] = $GLOBALS['pdfs'] ?? 0;

scanPage(1, false);

var_dump($GLOBALS['pdfs']);

/**
 * Crawls one page of the result overview and stores every event found on it.
 *
 * For each result board on the page the event is inserted into the `events`
 * table (unless a row with that id already exists) and its runs are crawled
 * via crawlRuns().
 *
 * @param int|string $key      Page number appended to the overview URL.
 * @param bool       $usecache When true, an existing copy in tmp/pages/ is
 *                             used instead of downloading the page again.
 */
function scanPage($key, $usecache = true)
{
    $page = 'https://www.dognow.at/ergebnisse/?page=' . $key;

    $html = crawlerFetchCached($page, 'tmp/pages/' . $key . '.html', $usecache);
    if ($html === false || $html === '') {
        // DOMDocument::loadHTML() rejects an empty document on PHP 8.
        echo " [!] Could not load page $key\n";
        return;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($html);

    // search for all divs with class "resultboard"
    $xpath = new DOMXPath($dom);
    $nodes = $xpath->query('//div[@class="resultboard info-board info-board-default2"]');

    foreach ($nodes as $node) {
        $div = $node->getElementsByTagName('div')->item(0);
        if ($div === null) {
            continue; // board without the expected wrapper div
        }

        $inner = $div->getElementsByTagName('div');
        $id = $div->getAttribute('data-event');
        $name = crawlerNodeText($inner->item(1));
        $organizer = crawlerNodeText($inner->item(2));
        $date = crawlerNodeText($inner->item(3));
        $db_date = date(DATE_RFC3339, strtotime($date));

        // add to db if it does not exist yet
        $existing = crawlerDbRun('SELECT * FROM events WHERE id = :id', [':id' => $id]);
        if ($existing !== false && $existing->fetchArray() === false) {
            crawlerDbRun(
                'INSERT INTO events (id, name, organizer, date) VALUES (:id, :name, :organizer, :date)',
                [':id' => $id, ':name' => $name, ':organizer' => $organizer, ':date' => $db_date]
            );
        }

        crawlRuns($id, $usecache);
        echo " [E] $id - $name - $organizer - $date\n";
    }
}

/**
 * Crawls the run table of one event and stores the runs found in it.
 *
 * Rows with three cells (rally obedience) are only reported; rows with four
 * cells (agility) are additionally inserted into the `runs` table and their
 * result PDF is processed via getResults().
 *
 * @param string $eventid  Event id as read from the overview page.
 * @param bool   $usecache When true, an existing copy in tmp/events/ is used
 *                         instead of downloading the run table again.
 */
function crawlRuns($eventid, $usecache = true)
{
    $data = crawlerFetchCached(
        'https://www.dognow.at/ergebnisse/src/data.php?event=' . $eventid . '&lauf=0',
        'tmp/events/' . $eventid . '.html',
        $usecache
    );
    if ($data === false || $data === '') {
        echo " [!] Could not load runs of event $eventid\n";
        return;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($data);

    // The needle below deliberately contains a line break.
    if (strpos($data, "Einzelwertung
Derzeit sind keine Ergebnisse") !== false) {
        echo " [i] Keine Einzelwertungen\n";
        return;
    }

    $tables = $dom->getElementsByTagName('table');
    if (strpos($data, 'Cup-Wertung') !== false) {
        echo " [i] Found Cup-Wertung, skipping first table\n";
        $table = $tables->item(1);
    } else {
        $table = $tables->item(0);
    }
    if (!$table) {
        return;
    }

    foreach ($table->getElementsByTagName('tr') as $row) {
        // Row ids look like "<prefix>_<run id>"; keep only the run id.
        $rid = $row->getAttribute('id');
        if ($rid) {
            $rid = explode('_', $rid)[1] ?? '';
        }

        $tds = $row->getElementsByTagName('td');
        $cells = $tds->length;
        if ($cells !== 3 && $cells !== 4) {
            // Neither layout applies; skipping here keeps values of the
            // previous row from being reported a second time.
            continue;
        }

        $runname = crawlerNodeText($tds->item(0));
        $lk = crawlerNodeText($tds->item(1));
        $gk = $cells === 4 ? crawlerNodeText($tds->item(2)) : ''; // only agility rows have this cell
        $pdf = crawlerFirstHref($tds->item($cells - 1));

        if ($cells === 4) { // agility
            // add run to db if it does not exist yet; a row without an id
            // cannot be stored meaningfully (getResults() refuses it as well)
            if ($rid !== '') {
                $existing = crawlerDbRun('SELECT * FROM runs WHERE id = :id', [':id' => $rid]);
                if ($existing !== false && $existing->fetchArray() === false) {
                    crawlerDbRun(
                        'INSERT INTO runs (id, name, event, lk, gk) VALUES (:id, :name, :event, :lk, :gk)',
                        [':id' => $rid, ':name' => $runname, ':event' => $eventid, ':lk' => $lk, ':gk' => $gk]
                    );
                }
            }
            getResults($rid, $eventid);
        }

        if (!$runname || !$lk || !$pdf) {
            continue;
        }
        echo " [R-$rid] $runname - $lk - $gk - $pdf\n";
    }
}

/**
 * Downloads the result PDF of a run (once), converts it to CSV and imports it.
 *
 * Every call increments the global `pdfs` counter, including calls that are
 * rejected because of an empty run or event id.
 *
 * @param string $run   Run id.
 * @param string $event Event id.
 */
function getResults($run, $event)
{
    $GLOBALS['pdfs'] = ($GLOBALS['pdfs'] ?? 0) + 1;
    if (!$run || !$event) {
        return;
    }

    $url = "https://www.dognow.at/ergebnisse/pdf.php?lauf=$run&event=$event";
    $pdfFile = 'tmp/results/' . $event . '-' . $run . '.pdf';
    $csvFile = 'tmp/csv/' . $event . '-' . $run . '.pdf.csv';

    if (!file_exists($pdfFile)) {
        $pdf = file_get_contents($url);
        if ($pdf === false || $pdf === '') {
            // Do not leave an empty file behind: it would be mistaken for a
            // finished download on the next crawl.
            echo " [!] Could not download $url\n";
            return;
        }
        file_put_contents($pdfFile, $pdf);
    }

    convertPDFtoCSV($pdfFile, $csvFile);
    if (!file_exists($csvFile)) {
        // Conversion produced nothing; analyzeResultCSV() would abort the
        // whole crawl on a missing file.
        echo " [!] No CSV for run $run in event $event\n";
        return;
    }
    analyzeResultCSV($csvFile, $run, $event);
}

/**
 * Imports the rows of a converted result CSV into the `results` table.
 *
 * The first CSV record is the header; its titles are lower-cased and stripped
 * of everything but letters and digits to obtain the column keys. A result is
 * inserted only when no row with the same stnr/run/event exists yet. Columns
 * missing from the CSV are stored as empty strings; records whose cell count
 * differs from the header are skipped.
 *
 * Terminates the script when the file is missing or when the database layer
 * throws.
 *
 * @param string $csvfile Path of the CSV file.
 * @param string $run     Run id the results belong to.
 * @param string $event   Event id the results belong to.
 */
function analyzeResultCSV($csvfile, $run, $event)
{
    if (!file_exists($csvfile)) {
        die(" ERR: File $csvfile not found");
    }

    $handle = fopen($csvfile, 'r');
    if ($handle === false) {
        die(" ERR: File $csvfile could not be opened");
    }
    // fgetcsv() keeps quoted cells that contain line breaks in one record,
    // which splitting the file into lines first cannot do.
    $records = [];
    while (($fields = fgetcsv($handle, 0, ',', '"', '\\')) !== false) {
        $records[] = $fields;
    }
    fclose($handle);

    if ($records === []) {
        echo " [!] $csvfile is empty\n";
        return;
    }

    // prepare header for database
    $header = array_map(function ($title) {
        return preg_replace('/[^A-Za-z0-9]/', '', strtolower((string) $title));
    }, array_shift($records));

    $columns = ['stnr', 'rang', 'teilnehmer', 'hund', 'verein', 'f', 'vw', 'zf', 'zeit', 'gf', 'msek', 'bew', 'punkte'];

    foreach ($records as $fields) {
        if (count($fields) !== count($header)) {
            continue; // blank or ragged record, cannot be mapped onto the header
        }
        $row = array_combine($header, $fields);

        $values = [':run' => $run, ':event' => $event];
        foreach ($columns as $column) {
            $values[':' . $column] = $row[$column] ?? '';
        }

        // add result to db if it does not exist yet
        try {
            $existing = crawlerDbRun(
                'SELECT * FROM results WHERE stnr = :stnr AND run = :run AND event = :event',
                [':stnr' => $values[':stnr'], ':run' => $run, ':event' => $event]
            );
            if ($existing === false) {
                echo " [!] " . $GLOBALS['db']->lastErrorMsg() . "\n";
                continue;
            }
            if ($existing->fetchArray() === false) {
                $inserted = crawlerDbRun(
                    'INSERT INTO results (stnr, rang, run, event, teilnehmer, hund, verein, f, vw, zf, zeit, gf, msek, bew, punkte)'
                    . ' VALUES (:stnr, :rang, :run, :event, :teilnehmer, :hund, :verein, :f, :vw, :zf, :zeit, :gf, :msek, :bew, :punkte)',
                    $values
                );
                if ($inserted === false) {
                    echo " [!] " . $GLOBALS['db']->lastErrorMsg() . "\n";
                }
            }
        } catch (Exception $ex) {
            exit($GLOBALS['db']->lastErrorMsg());
        }
    }
}

/**
 * Converts a PDF to CSV unless the target file already exists.
 *
 * Nothing is written when the conversion yields no output, so a failed run is
 * retried on the next crawl instead of being cached as an empty file.
 *
 * @param string $pdf        Path of the PDF file.
 * @param string $targetname Path of the CSV file to create.
 */
function convertPDFtoCSV($pdf, $targetname)
{
    if (file_exists($targetname)) {
        return;
    }

    $csv = analyze($pdf);
    if (!is_string($csv) || trim($csv) === '') {
        echo " [!] No table data extracted from $pdf\n";
        return;
    }
    file_put_contents($targetname, $csv);
}

/**
 * Runs tabula on a PDF and returns whatever it prints.
 *
 * @param string $pdf Path of the PDF file.
 *
 * @return string|false|null Output of the command as returned by shell_exec().
 */
function analyze($pdf)
{
    echo " [i] Analyzing $pdf\n";
    // The path is built from scraped ids, so it must not reach the shell unquoted.
    $cmd = 'java -jar tabula-1.0.5-jar-with-dependencies.jar -f CSV ' . escapeshellarg($pdf);

    return shell_exec($cmd);
}

/**
 * Returns the body of $url, preferring the local copy in $cacheFile.
 *
 * A fresh download is stored in $cacheFile; a failed download is not, so it
 * cannot be mistaken for a valid cached copy later.
 *
 * @param string $url       Address to download.
 * @param string $cacheFile Path of the local copy.
 * @param bool   $usecache  Whether an existing local copy may be used.
 *
 * @return string|false Document body, or false when it could not be read.
 */
function crawlerFetchCached($url, $cacheFile, $usecache)
{
    if ($usecache === true && file_exists($cacheFile)) {
        return file_get_contents($cacheFile);
    }

    $body = file_get_contents($url);
    if ($body !== false) {
        file_put_contents($cacheFile, $body);
    }

    return $body;
}

/**
 * Executes a prepared statement on the global database handle.
 *
 * NOTE(review): assumes $GLOBALS['db'] is an SQLite3 instance (the code uses
 * fetchArray() and lastErrorMsg()) - confirm against the setup code.
 *
 * @param string               $sql    Statement with named placeholders.
 * @param array<string, mixed> $params Placeholder => value; values are bound
 *                                     as text, matching the quoted literals
 *                                     the statements used before.
 *
 * @return SQLite3Result|false Result of the execution, false on failure.
 */
function crawlerDbRun($sql, array $params)
{
    $stmt = $GLOBALS['db']->prepare($sql);
    if ($stmt === false) {
        return false;
    }
    foreach ($params as $placeholder => $value) {
        $stmt->bindValue($placeholder, (string) $value, SQLITE3_TEXT);
    }

    return $stmt->execute();
}

/**
 * Returns the trimmed text of a DOM node, or '' when the node is missing.
 *
 * @param DOMNode|null $node
 *
 * @return string
 */
function crawlerNodeText($node)
{
    return $node === null ? '' : trim((string) $node->nodeValue);
}

/**
 * Returns the href of the first link inside a DOM element, or '' when the
 * element is missing or contains no link.
 *
 * @param DOMElement|null $node
 *
 * @return string
 */
function crawlerFirstHref($node)
{
    if ($node === null) {
        return '';
    }
    $anchor = $node->getElementsByTagName('a')->item(0);

    return $anchor === null ? '' : $anchor->getAttribute('href');
}
?>