crawler: move the per-page scan into scanPage() with an optional cache bypass

Turns the body of the former per-page foreach loop into
scanPage($key, $usecache = true) and passes $usecache on to crawlRuns(),
so that both the page cache (tmp/pages/<key>.html) and the event cache
(tmp/events/<id>.html) can be skipped by passing false.

Review fixes folded into this patch:
  * scanPage(): "&& $usecache===true" sat inside the file_exists() argument,
    so a boolean instead of a path was tested. It now follows the call, the
    same way crawlRuns() writes it.
  * scanPage(): the cache-hit branch fetched $page (the remote URL) again;
    it now reads the cached tmp/pages/<key>.html file.

NOTE(review): indentation and blank context lines were reconstructed from a
flattened copy of this patch - verify against the working tree before applying.

diff --git a/crawler/crawler.php b/crawler/crawler.php
index eb426ad..0c31462 100644
--- a/crawler/crawler.php
+++ b/crawler/crawler.php
@@ -7,7 +7,7 @@ $GLOBALS['db'] = new SQLite3('data.db');
 if(!$GLOBALS['db']) exit("Error loading database");
 
 //389-7625.pdf.csv
-// analyzeResultCSV('tmp/csv/389-7625.pdf.csv',7625,389);
+// analyzeResultCSV('tmp/csv/389-7637.pdf.csv',7637,389);
 // exit();
 
 $html = file_get_contents('tmp/ergebnisse.html');
@@ -34,20 +34,21 @@ foreach ($nodes as $node) {
 echo "[i] Found $last_page pages\n";
 
 //create an array with all pages
-$pages = [];
-for($i = 1; $i <= $last_page; $i++){
-    $pages[] = 'https://www.dognow.at/ergebnisse/?page=' . $i;
-}
+$pages = range(1,65);
 
-//loop through all pages
-foreach($pages as $key=> $page){
-    if(file_exists('tmp/pages/' . ($key+1) . '.html')){
-        $html = file_get_contents($page);
+
+scanPage(1,false);
+
+function scanPage($key,$usecache=true)
+{
+    $page = 'https://www.dognow.at/ergebnisse/?page=' . $key;
+    if(file_exists('tmp/pages/' . ($key) . '.html') && $usecache===true){
+        $html = file_get_contents('tmp/pages/' . ($key) . '.html');
     }
     else
     {
         $html = file_get_contents($page);
-        file_put_contents('tmp/pages/' . ($key+1) . '.html', $html);
+        file_put_contents('tmp/pages/' . ($key) . '.html', $html);
     }
     $dom = new DOMDocument;
     $dom->loadHTML($html);
@@ -72,20 +73,20 @@ foreach($pages as $key=> $page){
             if($res->fetchArray() == false)
                 $GLOBALS['db']->exec("INSERT INTO events (id, name, organizer, date) VALUES ('$id', '$name', '$organizer', '$db_date')");
 
-            crawlRuns($id);
+            crawlRuns($id,$usecache);
 
             echo " [E] $id - $name - $organizer - $date\n";
         }
 
-    //exit();
     }
 
+
 
 var_dump($GLOBALS['pdfs']);
 
-function crawlRuns($eventid)
+function crawlRuns($eventid,$usecache=true)
 {
-    if(file_exists('tmp/events/' . $eventid . '.html'))
+    if(file_exists('tmp/events/' . $eventid . '.html') && $usecache===true)
         $data = file_get_contents('tmp/events/' . $eventid . '.html');
     else
     {
@@ -94,6 +95,8 @@ function crawlRuns($eventid)
         file_put_contents('tmp/events/' . $eventid . '.html', $data);
     }
 
+
+
     //get first table using DOMDocument
     $dom = new DOMDocument;
     $dom->loadHTML($data);
diff --git a/crawler/data.db b/crawler/data.db
index 420075f..853f14d 100644
Binary files a/crawler/data.db and b/crawler/data.db differ