2023-11-25 20:12:33 +01:00
|
|
|
<?php
|
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-11-26 10:30:00 +01:00
|
|
|
// Open (or create) the SQLite database shared by all crawler functions.
// The SQLite3 constructor reports failure by throwing, not by returning a
// falsy value, so the failure has to be caught to reach the error message.
try
{
    $GLOBALS['db'] = new SQLite3('data.db');
}
catch(Exception $ex)
{
    exit("Error loading database");
}
if(!$GLOBALS['db']) exit("Error loading database");

// Counter incremented by getResults() and dumped once the crawl has finished;
// start it at 0 so it is defined even when no run was processed.
$GLOBALS['pdfs'] = 0;
|
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// if(!file_exists('tmp/ergebnisse.html'))
|
|
|
|
// {
|
|
|
|
// $ergebnisse = file_get_contents('https://www.dognow.at/ergebnisse/');
|
|
|
|
// file_put_contents('tmp/ergebnisse.html', $ergebnisse);
|
|
|
|
// }
|
2023-11-26 10:30:00 +01:00
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// $html = file_get_contents('tmp/ergebnisse.html');
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// $dom = new DOMDocument;
|
|
|
|
// $dom->loadHTML($html);
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// $xpath = new DOMXPath($dom);
|
|
|
|
// $query = '//ul[@class="pagination"]/child::*';
|
|
|
|
// $nodes = $xpath->query($query);
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// $GLOBALS['pdfs'] = 0;
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// // Loop through the selected nodes
|
|
|
|
// foreach ($nodes as $node) {
|
|
|
|
// // Do something with each node, for example, echo its content
|
|
|
|
// $url = $node->getElementsByTagName('a')[0]->getAttribute('href');
|
|
|
|
// $number = intval($node->nodeValue);
|
|
|
|
// if($number > $last_page){
|
|
|
|
// $last_page = $number;
|
|
|
|
// }
|
|
|
|
// }
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// echo "[i] Found $last_page pages\n";
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-12-23 19:53:52 +01:00
|
|
|
// //create an array with all pages
|
|
|
|
// $pages = range(1,65);
|
2023-11-26 15:48:28 +01:00
|
|
|
|
2023-11-29 14:57:34 +01:00
|
|
|
// foreach($pages as $page)
|
|
|
|
// {
|
|
|
|
// echo "[i] Crawling page $page\n";
|
|
|
|
// scanPage($page);
|
|
|
|
// }
|
|
|
|
|
2023-12-02 21:49:45 +01:00
|
|
|
// Entry point of the crawl: process listing page 1 with $usecache = false.
scanPage(1,false);
|
2023-11-25 20:12:33 +01:00
|
|
|
|
2023-11-26 15:48:28 +01:00
|
|
|
/**
 * Crawl one page of the dognow.at result listing.
 *
 * Every event board found on the page is inserted into the `events` table
 * (unless a row with the same id exists) and handed to crawlRuns().
 *
 * @param int|string $key      Page number; appended to the listing URL and used as cache file name.
 * @param bool       $usecache When true, an existing copy in tmp/pages/ is parsed instead of downloading.
 * @return void
 */
function scanPage($key,$usecache=true)
{
    $page = 'https://www.dognow.at/ergebnisse/?page=' . $key;
    $cachefile = 'tmp/pages/' . $key . '.html';

    // The cache test must look at the file path only (the && used to sit inside
    // the file_exists() call), and a cache hit must read the local copy rather
    // than downloading the page again.
    if($usecache===true && file_exists($cachefile))
    {
        $html = file_get_contents($cachefile);
    }
    else
    {
        $html = file_get_contents($page);
        // never cache a failed download, otherwise it would be reused forever
        if($html !== false)
            file_put_contents($cachefile, $html);
    }

    // DOMDocument::loadHTML() cannot work with an empty document
    if($html === false || $html === '')
    {
        echo " [!] Could not load page $key\n";
        return;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($html);

    // search for all divs with class "resultboard"
    $xpath = new DOMXPath($dom);
    $query = '//div[@class="resultboard info-board info-board-default2"]';
    $nodes = $xpath->query($query);

    // Loop through the selected nodes
    foreach ($nodes as $node) {
        // CUPs
        $div = $node->getElementsByTagName('div')[0];
        if(!$div) continue; // board without the expected inner structure

        $id = $div->getAttribute('data-event');
        $fields = $div->getElementsByTagName('div');
        // a missing inner div yields an empty string instead of a null dereference
        $text = function($index) use ($fields) {
            return $fields[$index] ? trim($fields[$index]->nodeValue) : '';
        };
        $name = $text(1);
        $organizer = $text(2);
        $date = $text(3);
        $db_date = date(DATE_RFC3339, strtotime($date));

        //if not exists, add to db
        // Scraped text is bound as parameters so quotes in names cannot break the SQL.
        $check = $GLOBALS['db']->prepare('SELECT 1 FROM events WHERE id = :id');
        if($check === false) exit($GLOBALS['db']->lastErrorMsg());
        $check->bindValue(':id', (string)$id, SQLITE3_TEXT);
        $res = $check->execute();
        if($res === false) exit($GLOBALS['db']->lastErrorMsg());
        $known = $res->fetchArray() != false;
        $check->close();

        if(!$known)
        {
            $insert = $GLOBALS['db']->prepare('INSERT INTO events (id, name, organizer, date) VALUES (:id, :name, :organizer, :date)');
            if($insert === false) exit($GLOBALS['db']->lastErrorMsg());
            $insert->bindValue(':id', (string)$id, SQLITE3_TEXT);
            $insert->bindValue(':name', $name, SQLITE3_TEXT);
            $insert->bindValue(':organizer', $organizer, SQLITE3_TEXT);
            $insert->bindValue(':date', $db_date, SQLITE3_TEXT);
            $insert->execute();
            $insert->close();
        }

        crawlRuns($id,$usecache);

        echo " [E] $id - $name - $organizer - $date\n";
    }
}
|
|
|
|
|
|
|
|
|
2023-11-26 15:48:28 +01:00
|
|
|
|
2023-11-25 20:12:33 +01:00
|
|
|
// Dump the global 'pdfs' counter, which getResults() increments on every call.
var_dump($GLOBALS['pdfs']);
|
|
|
|
|
2023-11-26 15:48:28 +01:00
|
|
|
/**
 * Crawl the run overview of one event.
 *
 * Downloads (or reads from tmp/events/) the event's run table and walks its rows.
 * Rows with three cells (rally obedience) are only printed; rows with four
 * cells (agility) are additionally stored in the `runs` table and passed to
 * getResults().
 *
 * @param int|string $eventid  Event id as found in the listing's data-event attribute.
 * @param bool       $usecache When true, an existing copy in tmp/events/ is used instead of downloading.
 * @return void
 */
function crawlRuns($eventid,$usecache=true)
{
    $cachefile = 'tmp/events/' . $eventid . '.html';

    if($usecache===true && file_exists($cachefile))
        $data = file_get_contents($cachefile);
    else
    {
        //sleep(1);
        $data = file_get_contents('https://www.dognow.at/ergebnisse/src/data.php?event='. $eventid .'&lauf=0');
        // never cache a failed download, otherwise it would be reused forever
        if($data !== false)
            file_put_contents($cachefile, $data);
    }

    // DOMDocument::loadHTML() cannot work with an empty document
    if($data === false || trim($data) === '')
    {
        echo " [!] No data for event $eventid\n";
        return;
    }

    // "no results yet" marker; the whitespace between the two lines of the
    // marker is matched loosely so re-indented markup is still recognised
    if(preg_match('/<b>Einzelwertung<\/b><br>\s*Derzeit sind keine Ergebnisse/', $data) === 1)
    {
        echo " [i] Keine Einzelwertungen\n";
        return;
    }

    //get first table using DOMDocument
    $dom = new DOMDocument;
    $dom->loadHTML($data);
    $tables = $dom->getElementsByTagName('table');

    // when the string "Cup-Wertung" is found
    if(strpos($data, 'Cup-Wertung') !== false){
        echo " [i] Found Cup-Wertung, skipping first table\n";
        $table = $tables[1];
    }
    else
        $table = $tables[0];

    if(!$table) return;

    foreach($table->getElementsByTagName('tr') as $row){
        if(!$row) continue;

        // row ids have the form "<prefix>_<run id>"; keep the part after the underscore
        $rid = $row->getAttribute('id');
        if($rid)
        {
            $parts = explode('_', $rid);
            $rid = isset($parts[1]) ? $parts[1] : '';
        }

        $tds = $row->getElementsByTagName('td');
        $columns = count($tds);

        // header rows and unknown layouts carry no run; skipping them here also
        // keeps values of a previous row from being reported again
        if($columns != 3 && $columns != 4) continue;

        // the PDF link always sits in the last cell
        $link = $tds[$columns - 1]->getElementsByTagName('a')[0];
        if(!$link) continue;

        $runname = trim($tds[0]->nodeValue);
        $lk = trim($tds[1]->nodeValue);
        $gk = ''; // rally obedience rows (3 cells) have no size class
        $pdf = $link->getAttribute('href');

        if($columns == 4) // agility
        {
            $gk = trim($tds[2]->nodeValue);

            if($rid)
            {
                //add run to db if not exists
                // Scraped text is bound as parameters so quotes cannot break the SQL.
                $check = $GLOBALS['db']->prepare('SELECT 1 FROM runs WHERE id = :id');
                if($check === false) exit($GLOBALS['db']->lastErrorMsg());
                $check->bindValue(':id', (string)$rid, SQLITE3_TEXT);
                $res = $check->execute();
                if($res === false) exit($GLOBALS['db']->lastErrorMsg());
                $known = $res->fetchArray() != false;
                $check->close();

                if(!$known)
                {
                    $insert = $GLOBALS['db']->prepare('INSERT INTO runs (id, name, event, lk, gk) VALUES (:id, :name, :event, :lk, :gk)');
                    if($insert === false) exit($GLOBALS['db']->lastErrorMsg());
                    $insert->bindValue(':id', (string)$rid, SQLITE3_TEXT);
                    $insert->bindValue(':name', $runname, SQLITE3_TEXT);
                    $insert->bindValue(':event', (string)$eventid, SQLITE3_TEXT);
                    $insert->bindValue(':lk', $lk, SQLITE3_TEXT);
                    $insert->bindValue(':gk', $gk, SQLITE3_TEXT);
                    $insert->execute();
                    $insert->close();
                }

                getResults($rid,$eventid);
            }
        }

        if(!$runname || !$lk || !$pdf) continue;

        echo " [R-$rid] $runname - $lk - $gk - $pdf\n";
    }

    //exit("Crawling $eventid");
}
|
|
|
|
|
|
|
|
/**
 * Fetch and import the result list of a single run.
 *
 * Downloads the result PDF into tmp/results/ (once), converts it to CSV via
 * convertPDFtoCSV() and imports the CSV via analyzeResultCSV().
 * Every call is counted in $GLOBALS['pdfs'], including calls that return early.
 *
 * @param int|string $run   Run id.
 * @param int|string $event Event id.
 * @return void
 */
function getResults($run,$event)
{
    // the counter may not have been initialised by the caller yet
    $GLOBALS['pdfs'] = (isset($GLOBALS['pdfs']) ? $GLOBALS['pdfs'] : 0) + 1;
    //return;
    if(!$run || !$event) return;

    $url = "https://www.dognow.at/ergebnisse/pdf.php?lauf=$run&event=$event";
    $pdffile = 'tmp/results/' . $event . '-' . $run . '.pdf';
    $csvfile = 'tmp/csv/' . $event . '-' . $run . '.pdf.csv';

    if(!file_exists($pdffile))
    {
        $pdfdata = file_get_contents($url);
        // A failed download must not be stored: an empty .pdf would never be
        // fetched again and would later be converted into an empty CSV.
        if($pdfdata === false || $pdfdata === '')
        {
            echo " [!] Could not download $url\n";
            return;
        }
        file_put_contents($pdffile, $pdfdata);
    }

    /*if($GLOBALS['db']->query("SELECT * FROM runs WHERE id = '$run' AND event = '$event'")->fetchArray() != false)
    {
        echo " [i] Skipping run $run in event $event\n";
        return;
    }*/

    convertPDFtoCSV($pdffile,$csvfile);
    analyzeResultCSV($csvfile,$run,$event);
}
|
|
|
|
|
2023-11-26 14:33:47 +01:00
|
|
|
/**
 * Import one result CSV into the `results` table.
 *
 * The first CSV line is taken as header; header names are lower-cased and
 * stripped of everything but letters and digits (e.g. "St.Nr." becomes "stnr").
 * A row is inserted only when no row with the same stnr, run and event exists.
 * Terminates the script when the file is missing or the database reports an error.
 *
 * @param string     $csvfile Path of the CSV file to import.
 * @param int|string $run     Run id stored with every row.
 * @param int|string $event   Event id stored with every row.
 * @return void
 */
function analyzeResultCSV($csvfile,$run,$event)
{
    if(!file_exists($csvfile)) die(" ERR: File $csvfile not found");

    // blank lines would otherwise become one-cell rows that cannot be mapped to the header
    $lines = file($csvfile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if(!$lines) return; // empty or unreadable file: nothing to import

    $csv = array_map('str_getcsv', $lines);

    //prepare header for database
    $header = [];
    foreach(array_shift($csv) as $value){
        $header[] = preg_replace('/[^A-Za-z0-9]/', '', strtolower((string)$value));
    }

    // columns of the results table, in insert order
    $columns = ['stnr', 'rang', 'run', 'event', 'teilnehmer', 'hund', 'verein', 'f', 'vw', 'zf', 'zeit', 'gf', 'msek', 'bew', 'punkte'];

    $db = $GLOBALS['db'];

    try
    {
        // Values are bound as parameters: names containing an apostrophe
        // used to break the string-built INSERT.
        $select = $db->prepare('SELECT 1 FROM results WHERE stnr = :stnr AND run = :run AND event = :event');
        $insert = $db->prepare('INSERT INTO results (' . implode(', ', $columns) . ') VALUES (:' . implode(', :', $columns) . ')');
        if($select === false || $insert === false) exit($db->lastErrorMsg());

        foreach($csv as $fields)
        {
            // array_combine() needs exactly one value per header column
            if(count($fields) != count($header))
            {
                echo " [!] Skipping malformed row in $csvfile\n";
                continue;
            }

            $row = array_combine($header, $fields);
            // run and event always come from the caller, never from the CSV
            $row['run'] = $run;
            $row['event'] = $event;

            //add result to db if not exists
            foreach(['stnr', 'run', 'event'] as $column)
                $select->bindValue(':' . $column, (string)(isset($row[$column]) ? $row[$column] : ''), SQLITE3_TEXT);
            $res = $select->execute();
            if($res === false) exit($db->lastErrorMsg());
            $known = $res->fetchArray() != false;
            $select->reset();

            if($known) continue;
            //else echo " [i] Skipping $teilnehmer in run $run in event $event\n";

            // columns missing from this CSV (not every list has e.g. "punkte") are stored as ''
            foreach($columns as $column)
                $insert->bindValue(':' . $column, (string)(isset($row[$column]) ? $row[$column] : ''), SQLITE3_TEXT);
            if($insert->execute() === false) exit($db->lastErrorMsg());
            $insert->reset();
        }
    }
    catch(Exception $ex) {
        //die( $ex->getMessage() );
        exit($db->lastErrorMsg());
    }
}
|
|
|
|
|
|
|
|
/**
 * Convert a result PDF to CSV unless the CSV already exists.
 *
 * @param string $pdf        Path of the PDF handed to analyze().
 * @param string $targetname Path of the CSV file to create.
 * @return void
 */
function convertPDFtoCSV($pdf,$targetname)
{
    // an existing target counts as already converted and is left untouched
    if(!file_exists($targetname))
    {
        file_put_contents($targetname, analyze($pdf));
    }
}
|
|
|
|
|
|
|
|
/**
 * Run tabula on a PDF and return what it printed.
 *
 * @param string $pdf Path of the PDF file.
 * @return string|false|null Output of the tabula command as returned by shell_exec().
 */
function analyze($pdf) {
    echo " [i] Analyzing $pdf\n";

    // The path is built from scraped ids; quote it so it reaches tabula as a
    // single argument and can never be interpreted by the shell.
    $cmd = "java -jar tabula-1.0.5-jar-with-dependencies.jar -f CSV " . escapeshellarg($pdf);
    $output = shell_exec($cmd);
    //var_dump($output);

    return $output;
}
|
|
|
|
|
|
|
|
?>
|