projectDir = $projectDir;
        $this->curlService = $curlService;
        $this->databaseService = $databaseService;
        $this->mailerService = $mailerService;
        $this->spreadsheetWriter = $spreadsheetWriter;
    }

    /**
     * Reports whether $haystack contains at least one of the given needles.
     *
     * The comparison is case-sensitive because it delegates to str_contains().
     *
     * @param string   $haystack Text to search in.
     * @param string[] $needles  Substrings to look for.
     *
     * @return bool True if any needle occurs in the haystack; false otherwise
     *              (including when $needles is empty).
     */
    private function str_contains_any(string $haystack, array $needles): bool
    {
        foreach ($needles as $needle) {
            if (str_contains($haystack, $needle)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Requests the home page so that a session is established, and keeps the
     * response in $this->content for the subsequent login check.
     *
     * @return bool True when the home page answered with HTTP 200.
     */
    private function initSession() : bool
    {
        //generate session file name to be used throughout execution
        $this->session = "session-courtserve";

        //go to homepage and grab a cookie
        $ch = $this->curlService->initCurl(
            $this->homeUrl,
            false,
            $this->session
        );
        $this->content = curl_exec($ch);

        return curl_getinfo($ch, CURLINFO_HTTP_CODE) === 200;
    }

    /**
     * Makes sure the scraper is signed in.
     *
     * If the page currently held in $this->content already shows the
     * "Signed in as" marker for the configured user nothing is sent; otherwise
     * the credentials from $_SERVER are posted to the login URL and the
     * response replaces $this->content.
     *
     * @return bool True when already signed in, or when the login request
     *              answered with HTTP 200.
     */
    private function login() : bool
    {
        $username = $_SERVER['COURTSERVE_USERNAME'];

        // The username is data, not a pattern: quote it so characters such as
        // '.', '+' or '/' cannot change (or break) the expression.
        $regexp = '(Signed in as: ' . preg_quote((string) $username, '/') . '<\/strong>)';
        if (preg_match('/' . $regexp . '/iU', (string) $this->content)) {
            //already logged in
            return true;
        }

        $ch = $this->curlService->initCurlPost(
            $this->loginUrl,
            [
                'username' => $username,
                'password' => $_SERVER['COURTSERVE_PASSWORD']
            ],
            $this->session
        );
        $this->content = curl_exec($ch);

        return curl_getinfo($ch, CURLINFO_HTTP_CODE) === 200;
    }

    /**
     * Stores one court case, creating the court first when it is not known yet.
     *
     * @param string $courtName       Name used to look the court up.
     * @param string $courtPostcode   Postcode stored when the court has to be created.
     * @param string $sessionDate     Session date in 'Y-m-d H:i:s' format.
     * @param string $caseReference   Case reference; cases already stored under it are skipped.
     * @param string $caseDetails     Plain-text case details.
     * @param bool   $motoringRelated Whether the case was flagged as motoring related.
     *
     * @return CourtSession|null The newly stored session, or null when nothing
     *                           was stored (court unavailable, case already
     *                           known, or unparsable date).
     */
    private function saveCourtSession(
        string $courtName,
        string $courtPostcode,
        string $sessionDate,
        string $caseReference,
        string $caseDetails,
        bool $motoringRelated
    ) : ?CourtSession {
        $court = $this->databaseService->findCourtByName($courtName);
        if ($court == null) {
            $this->databaseService->addCourt(
                $courtName,
                $courtPostcode
            );
            // Look the court up again: without this the first case of every
            // newly added court was dropped because $court was still null.
            // NOTE(review): assumes addCourt() makes the court visible to
            // findCourtByName() straight away - confirm.
            $court = $this->databaseService->findCourtByName($courtName);
        }
        if ($court == null) {
            return null;
        }

        if ($this->databaseService->findCourtSessionByCaseReference($caseReference) != null) {
            //case already stored
            return null;
        }

        $date = \DateTime::createFromFormat('Y-m-d H:i:s', $sessionDate);
        if ($date === false) {
            //createFromFormat() signals an unparsable date with false
            return null;
        }

        return $this->databaseService->addCourtSession(
            $date,
            $caseReference,
            $caseDetails,
            $motoringRelated,
            $court
        );
    }

    /**
     * Scrapes the listing page, downloads every court session that has 'CP'
     * in its link title, extracts the cases and stores them.
     *
     * Progress for every extracted case is echoed to standard output.
     *
     * @return array Nested result keyed by court name, then session link.
     *               Sessions whose data page could be located carry
     *               'content' (raw download) and 'cases', the latter keyed by
     *               case reference with the entries 'sessionDate',
     *               'caseReference', 'caseDetails', 'motoringRelated', 'saved'.
     */
    public function scrape() : array
    {
        $scrapeResult = [];
        $sessionDates = [];

        //grab content
        $ch = $this->curlService->initCurl(
            $this->scrapeUrl,
            false,
            $this->session
        );
        $this->content = curl_exec($ch);
        $content = (string) $this->content;

        //Grab court names, and create array using court name as index
        $matches = [];
        $regexp = '(<\/a>)(.*)(<\/strong>)'; //'(<\/a>)(\w*)(<\/strong>)';
        if (preg_match_all('/' . $regexp . '/iU', $content, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                $scrapeResult[$match[2]] = [];
            }
        }

        //Grab links to court sessions by court name
        foreach (array_keys($scrapeResult) as $courtName) {
            // Court names are page data: quote them so brackets, dots etc.
            // are matched literally instead of being read as regex syntax.
            $regexp = '(<\/a>' . preg_quote((string) $courtName, '/') . '<\/strong>)(.*)(<\/?table)';
            $matches = [];
            if (!preg_match_all('/' . $regexp . '/siU', $content, $matches, PREG_SET_ORDER)) {
                continue;
            }

            foreach ($matches as $match) {
                $segment = $match[2];

                //find links to court sessions in content segment
                $linkMatches = [];
                if (!preg_match_all('/(href=")(.*)(".*<\/a>)/iU', $segment, $linkMatches, PREG_SET_ORDER)) {
                    continue;
                }

                //courtserve updates randomly through the day so we need to extract the date of session
                //can't rely on actual date of the scrape itself... this really skews data
                //(the date is looked up in the whole segment, so it is the same for every link in it)
                $sessionDate = null;
                $dateMatch = [];
                if (preg_match('/(>)(\d{2}\/\d{2}\/\d{2})(<\/td>)/iU', $segment, $dateMatch)) {
                    [$day, $month, $year] = explode('/', $dateMatch[2]);
                    $sessionDate = '20' . $year . '-' . $month . '-' . $day . ' 12:00:00';
                }

                foreach ($linkMatches as $linkMatch) {
                    //exclude court sessions that don't have 'CP' in their title (no data in them)
                    if (!str_contains($linkMatch[3], ' CP')) {
                        continue;
                    }

                    $sessionLink = $linkMatch[2];
                    $scrapeResult[$courtName][$sessionLink] = [];
                    if ($sessionDate !== null) {
                        $sessionDates[$sessionLink] = $sessionDate;
                    }
                }
            }
        }

        //grab content from each court session
        foreach ($scrapeResult as $courtName => $sessionLinks) {
            foreach (array_keys($sessionLinks) as $sessionLink) {
                $ch = $this->curlService->initCurl(
                    $this->homeUrl . $sessionLink,
                    false,
                    $this->session
                );
                $page = (string) curl_exec($ch);

                //this page contains a link to a data object, get link and grab its content
                // NOTE(review): the pattern below defines a single (empty)
                // capture group although group 2 is read - it looks truncated.
                // Restore the real pattern; until then the isset() guard keeps
                // the scraper from requesting a bogus URL.
                $matches = [];
                $matchGroup = 2;
                $regexp = '()';
                if (preg_match('/' . $regexp . '/iU', $page, $matches) && isset($matches[$matchGroup])) {
                    $ch = $this->curlService->initCurl(
                        $this->homeUrl . $matches[$matchGroup],
                        false,
                        $this->session
                    );
                    $scrapeResult[$courtName][$sessionLink]['content'] = curl_exec($ch);
                    $scrapeResult[$courtName][$sessionLink]['cases'] = [];
                }
            }
        }

        //extract data from content
        // Runs once after all downloads; nesting this pass inside the download
        // loop made it rescan every already processed court again and again.
        foreach ($scrapeResult as $courtName => $sessionLinks) {
            foreach ($sessionLinks as $sessionLink => $courtSessionData) {
                if (!isset($courtSessionData['content'])) {
                    continue;
                }
                $content = (string) $courtSessionData['content'];

                //get court postcode from address data at top of page
                $courtPostcode = '';
                $matches = [];
                if (preg_match('/[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}/sU', $content, $matches)) {
                    $courtPostcode = $matches[0];
                }

                //get court sessions
                //isolate english language content (welsh courts duplicate in welsh and english)
                $matches = [];
                if (!preg_match('/(>Time<)(.*)(\/table>)/siU', $content, $matches)) {
                    continue;
                }

                //segment by court reference and end of row marker (content is extremely poor quality html)
                $caseMatches = [];
                if (!preg_match_all(
                    '/(\d{2}\w{2}\d{7})(.*)(border-bottom|\/table)/siU',
                    $matches[2],
                    $caseMatches,
                    PREG_SET_ORDER
                )) {
                    continue;
                }

                // Without a session date the case cannot be stored (the date
                // is a mandatory string further down), so skip the session.
                if (!isset($sessionDates[$sessionLink])) {
                    continue;
                }
                $sessionDate = $sessionDates[$sessionLink];

                foreach ($caseMatches as $caseMatch) {
                    $caseReference = $caseMatch[1];
                    if (isset($scrapeResult[$courtName][$sessionLink]['cases'][$caseReference])) {
                        //same reference listed twice within the session
                        continue;
                    }

                    $rawDetails = $caseMatch[2];
                    $caseDetails = substr(
                        (string) preg_replace('/\s+/', ' ', strip_tags($rawDetails)),
                        0,
                        1024
                    );
                    $motoringRelated = $this->str_contains_any(
                        $rawDetails,
                        ['motor', 'Motor', 'MOTOR', 'vehicle', 'Vehicle', 'VEHICLE']
                    );

                    $storedSession = $this->saveCourtSession(
                        $courtName,
                        $courtPostcode,
                        $sessionDate,
                        $caseReference,
                        $caseDetails,
                        $motoringRelated
                    );
                    $saved = $storedSession != null;

                    $scrapeResult[$courtName][$sessionLink]['cases'][$caseReference] = [
                        'sessionDate' => $sessionDate,
                        'caseReference' => $caseReference,
                        'caseDetails' => $caseDetails,
                        'motoringRelated' => $motoringRelated,
                        'saved' => $saved
                    ];

                    echo '****' . PHP_EOL;
                    echo $courtName . PHP_EOL;
                    echo $sessionLink . PHP_EOL;
                    echo $sessionDate . PHP_EOL;
                    echo $caseReference . PHP_EOL;
                    echo $caseDetails . PHP_EOL;
                    echo 'Motoring Related: ' . $motoringRelated . PHP_EOL;
                    echo 'Saved: ' . $saved . PHP_EOL;
                    echo '****' . PHP_EOL;
                }
            }
        }

        return $scrapeResult;
    }

    /**
     * Entry point: opens the session, logs in, scrapes, and prepares a summary
     * e-mail with the results attached as a spreadsheet.
     *
     * Failure to open the session or to log in is reported in the message
     * body instead of aborting.
     */
    public function run()
    {
        $body = '

Court Serve Scraper Results

';
        $spreadsheet = null;
        $spreadsheetName = 'CourtServeScrapeResults.xlsx';

        $isInitiated = $this->initSession();
        if (!$isInitiated) {
            $body .= '

Could not initiate session

' . PHP_EOL;
        }

        $isLoggedIn = $this->login();
        if (!$isLoggedIn) {
            $body .= '

Could not login

' . PHP_EOL;
        }

        if ($isInitiated && $isLoggedIn) {
            $scrapeResult = $this->scrape();

            $spreadsheet = $this->spreadsheetWriter->createEmptySpreadsheet();
            $columns = [
                'COURT NAME',
                'SESSION DATE',
                'CASE REF',
                'CASE DETAILS',
                'MOTORING RELATED',
                'SAVED'
            ];
            $rows = [];

            $noOfCourts = 0;
            $noOfCases = 0;
            $noOfMotoringRelated = 0;
            $noOfSaved = 0;

            foreach ($scrapeResult as $courtName => $courtSessions) {
                $noOfCourts++;
                foreach ($courtSessions as $courtSessionData) {
                    // Sessions whose data page could not be located have no
                    // 'cases' entry at all, hence the fallback.
                    foreach ($courtSessionData['cases'] ?? [] as $case) {
                        $noOfCases++;

                        $motoringRelated = 'FALSE';
                        if ($case['motoringRelated']) {
                            $motoringRelated = 'TRUE';
                            $noOfMotoringRelated++;
                        }

                        $saved = 'FALSE';
                        if ($case['saved']) {
                            $saved = 'TRUE';
                            $noOfSaved++;
                        }

                        $rows[] = [
                            $courtName,
                            $case['sessionDate'],
                            $case['caseReference'],
                            $case['caseDetails'],
                            $motoringRelated,
                            $saved
                        ];
                    }
                }
            }

            $this->spreadsheetWriter->createWorksheet($spreadsheet, $spreadsheetName, $columns, $rows);

            $body .= '

Number of Courts: ' . $noOfCourts . '

' . PHP_EOL;
            $body .= '

Number of Cases: ' . $noOfCases . '

' . PHP_EOL;
            $body .= '

Number of Motoring Related: ' . $noOfMotoringRelated . '

' . PHP_EOL;
            $body .= '

Number of Saved to DB: ' . $noOfSaved . '

' . PHP_EOL;
        }

        //prepare email
        $fromAddress = new Address(
            $this->mailerService->noReplyAddress,
            $this->mailerService->noReplyName
        );
        $email = (new Email())
            ->from($fromAddress)
            ->to($this->mailerService->webmasterAddress)
            ->subject('Court Serve Scrape Results')
            ->text(strip_tags($body))
            ->html($body);

        if ($spreadsheet != null) {
            if ($this->spreadsheetWriter->writeSpreadsheet(
                $spreadsheet,
                $this->projectDir . $this->tmpPath,
                $spreadsheetName
            )) {
                $filename = $this->projectDir . $this->tmpPath . $spreadsheetName;
                $email->addPart(new DataPart(new File($filename)));
            }
        }

        // NOTE(review): sending is switched off here, so the message built
        // above is never delivered - confirm whether that is intentional.
        //$this->mailerService->sendEmail($email);
    }
}