progress with the crawler
All checks were successful
Build and push / Pulling repo on server (push) Successful in 2s
All checks were successful
Build and push / Pulling repo on server (push) Successful in 2s
This commit is contained in:
parent
9571c10db3
commit
3d9a948beb
164
crawler/crawler.php
Normal file
164
crawler/crawler.php
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
//$ergebnisse = file_get_contents('https://www.dognow.at/ergebnisse/');
|
||||||
|
//file_put_contents('tmp/ergebnisse.html', $ergebnisse);
|
||||||
|
|
||||||
|
// Load the locally stored copy of the results overview page.
$html = file_get_contents('tmp/ergebnisse.html');
if ($html === false || trim($html) === '') {
    // DOMDocument::loadHTML() cannot parse an empty document, so stop here
    // with a readable message instead of failing inside the parser.
    exit("[!] Could not read tmp/ergebnisse.html\n");
}

$dom = new DOMDocument;
$dom->loadHTML($html);

// Select every direct child of the pagination list (one entry per pager item).
$xpath = new DOMXPath($dom);
$query = '//ul[@class="pagination"]/child::*';
$nodes = $xpath->query($query);

// Counter incremented by getResults(); dumped at the end of the script.
$GLOBALS['pdfs'] = 0;

// The highest number shown in the pager is the number of result pages.
// Start at 0 so the variable is always defined, even when no pager is found.
$last_page = 0;
foreach ($nodes as $node) {
    // Non-numeric pager items (arrows, ellipsis) evaluate to 0 and are ignored.
    $number = intval($node->nodeValue);
    if ($number > $last_page) {
        $last_page = $number;
    }
}

echo "[i] Found $last_page pages\n";
|
||||||
|
|
||||||
|
// Collect the URL of every overview page, numbered 1 through $last_page.
$pages = [];
$i = 1;
while ($i <= $last_page) {
    $pages[] = 'https://www.dognow.at/ergebnisse/?page=' . $i;
    $i++;
}
|
||||||
|
|
||||||
|
// Walk every overview page, preferring the local copy in tmp/pages/.
foreach ($pages as $key => $page) {
    // Cache files are numbered from 1, matching the ?page= parameter.
    $cacheFile = 'tmp/pages/' . ($key + 1) . '.html';

    if (file_exists($cacheFile)) {
        // Cache hit: read the stored copy instead of downloading it again.
        $html = file_get_contents($cacheFile);
    } else {
        $html = file_get_contents($page);
        if ($html !== false) {
            // Only successful downloads are stored, so a failed request is
            // retried on the next run instead of being cached as an empty file.
            file_put_contents($cacheFile, $html);
        }
    }

    if ($html === false || trim($html) === '') {
        echo "[!] Could not load $page\n";
        continue;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($html);

    // Search for all event boxes (divs carrying exactly this class string).
    $xpath = new DOMXPath($dom);
    $query = '//div[@class="resultboard info-board info-board-default2"]';
    $nodes = $xpath->query($query);

    foreach ($nodes as $node) {
        // The first nested div carries the event id in its data-event attribute.
        $div = $node->getElementsByTagName('div')[0];
        if (!$div) {
            continue;
        }

        $id = $div->getAttribute('data-event');

        // Name, organizer and date are read from the 2nd, 3rd and 4th nested
        // div; a missing one yields an empty string instead of a warning.
        $cells = $div->getElementsByTagName('div');
        $name = trim($cells[1]->nodeValue ?? '');
        $organizer = trim($cells[2]->nodeValue ?? '');
        $date = trim($cells[3]->nodeValue ?? '');

        crawlRuns($id);

        echo " [E] $id - $name - $organizer - $date\n";
    }

    //exit();
}

// Number of getResults() calls made during this crawl.
var_dump($GLOBALS['pdfs']);
|
||||||
|
|
||||||
|
/**
 * Loads the run list of one event and prints every run found in it.
 *
 * The event markup is read from tmp/events/<eventid>.html when that file
 * exists; otherwise it is downloaded and stored there. Rows with three cells
 * are treated as rally obedience runs, rows with four cells as agility runs;
 * for agility rows that contain a link, getResults() is called as well.
 *
 * @param mixed $eventid Event id as found in the data-event attribute.
 * @return void
 */
function crawlRuns($eventid)
{
    $cacheFile = 'tmp/events/' . $eventid . '.html';

    if (file_exists($cacheFile)) {
        $data = file_get_contents($cacheFile);
    } else {
        //sleep(1);
        $data = file_get_contents('https://www.dognow.at/ergebnisse/src/data.php?event=' . $eventid . '&lauf=0');
        if ($data !== false) {
            // Do not cache failed downloads; they would never be retried.
            file_put_contents($cacheFile, $data);
        }
    }

    if ($data === false || trim($data) === '') {
        // loadHTML() cannot parse an empty document.
        echo " [!] No data for event $eventid\n";
        return;
    }

    // The "no results" notice spans a line break in the markup, so allow any
    // whitespace between its two parts instead of an exact multi-line match.
    if (preg_match('~<b>Einzelwertung</b><br>\s*Derzeit sind keine Ergebnisse~', $data) === 1) {
        echo " [i] Keine Einzelwertungen\n";
        return;
    }

    $dom = new DOMDocument;
    $dom->loadHTML($data);

    $tables = $dom->getElementsByTagName('table');

    // When the string "Cup-Wertung" is present, the second table is used.
    if (strpos($data, 'Cup-Wertung') !== false) {
        echo " [i] Found Cup-Wertung, skipping first table\n";
        $table = $tables[1];
    } else {
        $table = $tables[0];
    }

    if (!$table) return;

    foreach ($table->getElementsByTagName('tr') as $row) {
        // Row ids have the form "<prefix>_<run id>"; keep only the run id.
        $rid = $row->getAttribute('id');
        if ($rid) {
            $parts = explode('_', $rid);
            $rid = $parts[1] ?? '';
        }

        $tds = $row->getElementsByTagName('td');
        $cellCount = count($tds);

        // Only 3-cell (rally obedience) and 4-cell (agility) rows describe a
        // run. Skipping everything else keeps values from a previous row from
        // being printed again for header or filler rows.
        if ($cellCount !== 3 && $cellCount !== 4) {
            continue;
        }

        $isAgility = ($cellCount === 4);

        $runname = trim($tds[0]->nodeValue);
        $lk = trim($tds[1]->nodeValue);
        // Only agility rows have the extra third column.
        $gk = $isAgility ? trim($tds[2]->nodeValue) : '';

        // The PDF link sits in the last cell; it may be missing.
        $link = $tds[$cellCount - 1]->getElementsByTagName('a')[0];
        $pdf = $link ? $link->getAttribute('href') : '';

        if ($isAgility && $link) {
            getResults($rid, $eventid);
        }

        if (!$runname || !$lk || !$pdf) continue;

        echo " [R-$rid] $runname - $lk - $gk - $pdf\n";
    }

    //exit("Crawling $eventid");
}
|
||||||
|
|
||||||
|
/**
 * Downloads the result PDF of one run (unless already stored) and converts it.
 *
 * Every call increments the global 'pdfs' counter, including calls that
 * return early because $run or $event is empty.
 *
 * @param mixed $run   Run id, used as the "lauf" request parameter.
 * @param mixed $event Event id, used as the "event" request parameter.
 * @return void
 */
function getResults($run,$event)
{
    $GLOBALS['pdfs']++;
    //return;
    if(!$run || !$event) return;

    $url = "https://www.dognow.at/ergebnisse/pdf.php?lauf=$run&event=$event";
    $pdfFile = 'tmp/results/' . $event . '-' . $run . '.pdf';

    if (!file_exists($pdfFile)) {
        $pdfData = file_get_contents($url);
        if ($pdfData === false || $pdfData === '') {
            // Storing an empty file would make the failure permanent, because
            // the file_exists() check above skips the download on later runs.
            echo " [!] Download failed: $url\n";
            return;
        }
        file_put_contents($pdfFile, $pdfData);
    }

    convertPDFtoCSV($pdfFile, 'tmp/csv/' . $event . '-' . $run . '.pdf.csv');
}
|
||||||
|
|
||||||
|
/**
 * Writes the output of analyze() for a PDF to $targetname.
 *
 * An existing target file is left untouched and analyze() is not called.
 *
 * @param string $pdf        Path of the PDF handed to analyze().
 * @param string $targetname Path of the CSV file to create.
 * @return void
 */
function convertPDFtoCSV($pdf,$targetname)
{
    if (!file_exists($targetname)) {
        file_put_contents($targetname, analyze($pdf));
    }
}
|
||||||
|
|
||||||
|
/**
 * Runs the tabula jar on a PDF and returns what the command printed.
 *
 * @param string $pdf Path of the PDF file to convert.
 * @return string|false|null Output of shell_exec() for the tabula command.
 */
function analyze($pdf) {
    // The path is built from ids scraped off a remote page, so it is quoted
    // to keep shell metacharacters in it from being executed.
    $cmd = 'java -jar tabula-1.0.5-jar-with-dependencies.jar -f CSV ' . escapeshellarg($pdf);
    return shell_exec($cmd);
}
|
||||||
|
|
||||||
|
?>
|
19
crawler/parse.php
Normal file
19
crawler/parse.php
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once('../web/inc/helpers.php');
|
||||||
|
|
||||||
|
// Convert every PDF in tmp/results that has no CSV counterpart in tmp/csv yet.
$files = scandir('tmp/results');
foreach ($files as $file) {
    $isDotEntry = ($file == '.' || $file == '..');
    if (!$isDotEntry && endsWith($file, '.pdf')) {
        $target = 'tmp/csv/' . $file . '.csv';
        if (!file_exists($target)) {
            $csv = analyze('tmp/results/' . $file);
            file_put_contents($target, $csv);
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Runs the tabula jar on a PDF and returns what the command printed.
 *
 * @param string $pdf Path of the PDF file to convert.
 * @return string|false|null Output of shell_exec() for the tabula command.
 */
function analyze($pdf) {
    // File names come from directory listings of downloaded content, so the
    // path is quoted to keep shell metacharacters from being executed.
    $cmd = 'java -jar tabula-1.0.5-jar-with-dependencies.jar -f CSV ' . escapeshellarg($pdf);
    return shell_exec($cmd);
}
|
2
crawler/tmp/.gitignore
vendored
Normal file
2
crawler/tmp/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
*
|
||||||
|
!.gitignore
|
Reference in New Issue
Block a user