From cdf958d29333d448f4521f4d2faa2592b58e9b27 Mon Sep 17 00:00:00 2001
From: lolcat <will@lolcat.ca>
Date: Sun, 10 Aug 2025 21:55:15 -0400
Subject: fix wikipedia crash

---
 scraper/greppr.php | 435 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 435 insertions(+)
 create mode 100644 scraper/greppr.php

(limited to 'scraper/greppr.php')

diff --git a/scraper/greppr.php b/scraper/greppr.php
new file mode 100644
index 0000000..fc8511c
--- /dev/null
+++ b/scraper/greppr.php
@@ -0,0 +1,435 @@
+<?php
+
+class greppr{
+	
+	/** @var backend helper from lib/backend.php (proxy assignment, next page token storage) */
+	public $backend;
+	
+	/** @var fuckhtml HTML parser from lib/fuckhtml.php */
+	public $fuckhtml;
+	
+	/**
+	 * Load the helper libraries and create the backend and HTML parser.
+	 *
+	 * Both properties are declared explicitly because creating them
+	 * dynamically is deprecated as of PHP 8.2. They are public, which is
+	 * the visibility the dynamically created properties had.
+	 * include_once prevents a fatal class redeclaration when a library
+	 * file was already loaded earlier in the same request.
+	 */
+	public function __construct(){
+		
+		include_once "lib/backend.php";
+		$this->backend = new backend("greppr");
+		
+		include_once "lib/fuckhtml.php";
+		$this->fuckhtml = new fuckhtml();
+	}
+	
+	/**
+	 * List the search filters available for a page type.
+	 *
+	 * @param mixed $page page identifier; not consulted by this scraper
+	 * @return array always an empty array
+	 */
+	public function getfilters($page){
+		
+		$filters = [];
+		
+		return $filters;
+	}
+	
+	/**
+	 * Send one HTTP request to greppr through the given proxy.
+	 *
+	 * @param mixed $proxy passed as-is to backend::assign_proxy()
+	 * @param string $url request URL without a query string
+	 * @param array $get request parameters: appended to the URL as the
+	 *                   query string for a GET, sent as the urlencoded
+	 *                   body for a POST
+	 * @param string|false $cookie PHPSESSID value; false on a GET selects
+	 *                             the header set without a Cookie header
+	 * @param mixed $post strictly false sends a GET, anything else a POST
+	 * @return array "headers" (response headers keyed by lowercased name,
+	 *               a repeated header keeps its last value) and "data"
+	 *               (the response body)
+	 * @throws Exception with the curl error message if the transfer fails
+	 */
+	private function get($proxy, $url, $get = [], $cookie = false, $post = false){
+		
+		$curlproc = curl_init();
+		
+		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+		
+		if($post === false){
+			
+			if($get !== []){
+				
+				$url .= "?" . http_build_query($get);
+			}
+			
+			if($cookie === false){
+				
+				// first contact: no session cookie to send yet
+				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+					["User-Agent: " . config::USER_AGENT,
+					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+					"Accept-Language: en-US,en;q=0.5",
+					"Accept-Encoding: gzip",
+					"DNT: 1",
+					"Connection: keep-alive",
+					"Upgrade-Insecure-Requests: 1",
+					"Sec-Fetch-Dest: document",
+					"Sec-Fetch-Mode: navigate",
+					"Sec-Fetch-Site: none",
+					"Sec-Fetch-User: ?1"]
+				);
+			}else{
+				
+				// follow-up GET: carries the session cookie
+				curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+					["User-Agent: " . config::USER_AGENT,
+					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+					"Accept-Language: en-US,en;q=0.5",
+					"Accept-Encoding: gzip, deflate, br, zstd",
+					"DNT: 1",
+					"Sec-GPC: 1",
+					"Connection: keep-alive",
+					"Referer: https://greppr.org/search",
+					"Cookie: PHPSESSID=$cookie",
+					"Upgrade-Insecure-Requests: 1",
+					"Sec-Fetch-Dest: document",
+					"Sec-Fetch-Mode: navigate",
+					"Sec-Fetch-Site: same-origin",
+					"Sec-Fetch-User: ?1",
+					"Priority: u=0, i"]
+				);
+			}
+		}else{
+			
+			// form submission: parameters travel in the request body
+			$get = http_build_query($get);
+			
+			curl_setopt($curlproc, CURLOPT_POST, true);
+			curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
+			
+			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+				["User-Agent: " . config::USER_AGENT,
+				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+				"Accept-Language: en-US,en;q=0.5",
+				"Accept-Encoding: gzip, deflate, br, zstd",
+				"Content-Type: application/x-www-form-urlencoded",
+				"Content-Length: " . strlen($get),
+				"Origin: https://greppr.org",
+				"DNT: 1",
+				"Sec-GPC: 1",
+				"Connection: keep-alive",
+				"Referer: https://greppr.org/",
+				"Cookie: PHPSESSID=$cookie",
+				"Upgrade-Insecure-Requests: 1",
+				"Sec-Fetch-Dest: document",
+				"Sec-Fetch-Mode: navigate",
+				"Sec-Fetch-Site: same-origin",
+				"Sec-Fetch-User: ?1",
+				"Priority: u=0, i"]
+			);
+		}
+		
+		// the URL is only final here, after the query string was appended
+		curl_setopt($curlproc, CURLOPT_URL, $url);
+		
+		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+		
+		$this->backend->assign_proxy($curlproc, $proxy);
+		
+		$headers = [];
+		
+		// collect response headers, one callback invocation per header line
+		curl_setopt(
+			$curlproc,
+			CURLOPT_HEADERFUNCTION,
+			function($curlproc, $header) use (&$headers){
+				
+				// curl expects the number of bytes consumed back
+				$len = strlen($header);
+				$header = explode(':', $header, 2);
+				
+				if(count($header) < 2){
+					
+					// ignore invalid headers
+					return $len;
+				}
+				
+				$headers[strtolower(trim($header[0]))] = trim($header[1]);
+
+				return $len;
+			}
+		);
+		
+		try{
+			
+			$data = curl_exec($curlproc);
+			
+			if(curl_errno($curlproc)){
+				
+				throw new Exception(curl_error($curlproc));
+			}
+		}finally{
+			
+			// release the handle on the error path too
+			curl_close($curlproc);
+		}
+		
+		return [
+			"headers" => $headers,
+			"data" => $data
+		];
+	}
+	
+	/**
+	 * Run a greppr web search, or fetch the next page of an earlier one.
+	 *
+	 * A fresh search takes three steps: load the front page to read the
+	 * hidden form inputs and the PHPSESSID cookie, POST the search form,
+	 * then parse the result page. A paginated request restores the stored
+	 * link and cookie from the next page token and fetches that link.
+	 *
+	 * @param array $get request parameters; "npt" (next page token, falsy
+	 *                   for a fresh search) and "s" (search term) are read
+	 * @param bool $first_attempt not used by this method
+	 * @return array with the keys status, spelling, npt, answer, web,
+	 *               image, video, news and related; only "web" and "npt"
+	 *               are filled in here
+	 * @throws Exception on an empty search term, a failed request, an
+	 *                   unusable next page token, or missing form
+	 *                   tokens / session cookie
+	 */
+	public function web($get, $first_attempt = true){
+		
+		if($get["npt"]){
+			
+			[$q, $proxy] = $this->backend->get($get["npt"], "web");
+			
+			$tokens = json_decode($q, true);
+			
+			if(
+				!is_array($tokens) ||
+				!isset($tokens["get"], $tokens["cookie"])
+			){
+				
+				// without both fields the request below cannot be built
+				throw new Exception("Failed to decode next page token");
+			}
+			
+			//
+			// Get paginated page
+			//
+			try{
+				
+				$html = $this->get(
+					$proxy,
+					"https://greppr.org" . $tokens["get"],
+					[],
+					$tokens["cookie"],
+					false
+				);
+			}catch(Exception $error){
+				
+				throw new Exception("Failed to fetch search page", 0, $error);
+			}
+			
+		}else{
+			
+			$search = $get["s"];
+			if(strlen($search) === 0){
+				
+				throw new Exception("Search term is empty!");
+			}
+			
+			$proxy = $this->backend->get_ip();
+			
+			//
+			// get token
+			//
+			try{
+				
+				$html =
+					$this->get(
+						$proxy,
+						"https://greppr.org",
+						[],
+						false,
+						false
+					);
+			}catch(Exception $error){
+				
+				throw new Exception("Failed to fetch search tokens", 0, $error);
+			}
+			
+			//
+			// Parse token
+			//
+			$this->fuckhtml->load($html["data"]);
+			
+			$tokens = [];
+			
+			$inputs =
+				$this->fuckhtml
+				->getElementsByTagName(
+					"input"
+				);
+			
+			foreach($inputs as $input){
+				
+				if(!isset($input["attributes"]["name"])){
+					
+					continue;
+				}
+				
+				$name = $input["attributes"]["name"];
+				
+				switch($name){
+					
+					case "var1":
+					case "var2":
+					case "n":
+						// a token input without a value is treated as absent
+						if(isset($input["attributes"]["value"])){
+							
+							$tokens[$name] =
+								$this->fuckhtml
+								->getTextContent(
+									$input["attributes"]["value"]
+								);
+						}
+						break;
+					
+					default:
+						// any other named input supplies the name of the
+						// query field; the last one found wins
+						$tokens["req"] =
+							$this->fuckhtml
+							->getTextContent(
+								$name
+							);
+						break;
+				}
+			}
+			
+			// get cookie
+			if(
+				!isset($html["headers"]["set-cookie"]) ||
+				preg_match(
+					'/PHPSESSID=([^;]+)/',
+					$html["headers"]["set-cookie"],
+					$cookie
+				) !== 1
+			){
+				
+				// server sent no cookie, or an unexpected one
+				throw new Exception("Got malformed cookie");
+			}
+			
+			// every field of the search form below must have been found
+			foreach(["var1", "var2", "n", "req"] as $required){
+				
+				if(!isset($tokens[$required])){
+					
+					throw new Exception("Failed to grep search tokens");
+				}
+			}
+			
+			$tokens["cookie"] = $cookie[1];
+			
+			//
+			// Get initial search page
+			//
+			try{
+				
+				$html = $this->get(
+					$proxy,
+					"https://greppr.org/search",
+					[
+						"var1" => $tokens["var1"],
+						"var2" => $tokens["var2"],
+						$tokens["req"] => $search,
+						"n" => $tokens["n"]
+					],
+					$tokens["cookie"],
+					true
+				);
+			}catch(Exception $error){
+				
+				throw new Exception("Failed to fetch search page", 0, $error);
+			}
+		}
+		
+		$this->fuckhtml->load($html["data"]);
+		
+		$out = [
+			"status" => "ok",
+			"spelling" => [
+				"type" => "no_correction",
+				"using" => null,
+				"correction" => null
+			],
+			"npt" => null,
+			"answer" => [],
+			"web" => [],
+			"image" => [],
+			"video" => [],
+			"news" => [],
+			"related" => []
+		];
+		
+		// get results for later: the parser is re-loaded with the
+		// pagination fragment right below
+		$results =
+			$this->fuckhtml
+			->getElementsByClassName(
+				"result",
+				"div"
+			);
+		
+		// check for next page
+		$next_elem =
+			$this->fuckhtml
+			->getElementsByClassName(
+				"pagination",
+				"ul"
+			);
+		
+		if(count($next_elem) !== 0){
+			
+			$this->fuckhtml->load($next_elem[0]);
+			
+			$as =
+				$this->fuckhtml
+				->getElementsByClassName(
+					"page-link",
+					"a"
+				);
+			
+			// the link with href "#" acts as the marker; the link right
+			// after it is taken as the next page
+			$break = false;
+			foreach($as as $a){
+				
+				$has_href = isset($a["attributes"]["href"]);
+				
+				if($break === true){
+					
+					if($has_href){
+						
+						$out["npt"] =
+							$this->backend->store(
+								json_encode([
+									"get" =>
+										$this->fuckhtml
+										->getTextContent(
+											$a["attributes"]["href"]
+										),
+									"cookie" => $tokens["cookie"]
+								]),
+								"web",
+								$proxy
+							);
+					}
+					break;
+				}
+				
+				if(
+					$has_href &&
+					$a["attributes"]["href"] === "#"
+				){
+					
+					$break = true;
+				}
+			}
+		}
+		
+		// scrape results
+		foreach($results as $result){
+			
+			$this->fuckhtml->load($result);
+			
+			$anchors =
+				$this->fuckhtml
+				->getElementsByTagName(
+					"a"
+				);
+			
+			if(!isset($anchors[0]["attributes"]["href"])){
+				
+				// no usable link in this block: nothing to report
+				continue;
+			}
+			
+			$a = $anchors[0];
+			
+			$description =
+				$this->fuckhtml
+				->getElementsByClassName(
+					"highlightedDesc",
+					"p"
+				);
+			
+			if(count($description) === 0){
+				
+				$description = null;
+			}else{
+				
+				$description =
+					$this->limitstrlen(
+						$this->fuckhtml
+						->getTextContent(
+							$description[0]
+						)
+					);
+			}
+			
+			// the date is read from the last paragraph, from the text
+			// after its first colon; stays null when it cannot be parsed
+			$date = null;
+			
+			$paragraphs =
+				$this->fuckhtml
+				->getElementsByTagName(
+					"p"
+				);
+			
+			if(count($paragraphs) !== 0){
+				
+				$parts =
+					explode(
+						":",
+						$this->fuckhtml
+						->getTextContent(
+							$paragraphs[count($paragraphs) - 1]["innerHTML"]
+						)
+					);
+				
+				if(isset($parts[1])){
+					
+					$timestamp = strtotime($parts[1]);
+					
+					if($timestamp !== false){
+						
+						$date = $timestamp;
+					}
+				}
+			}
+			
+			$out["web"][] = [
+				"title" =>
+					$this->fuckhtml
+					->getTextContent(
+						$a["innerHTML"]
+					),
+				"description" => $description,
+				"url" =>
+					$this->fuckhtml
+					->getTextContent(
+						$a["attributes"]["href"]
+					),
+				"date" => $date,
+				"type" => "web",
+				"thumb" => [
+					"url" => null,
+					"ratio" => null
+				],
+				"sublink" => [],
+				"table" => []
+			];
+		}
+		
+		return $out;
+	}
+	
+	/**
+	 * Shorten text to its first wrapped line.
+	 *
+	 * The text is word-wrapped at 300 characters and everything from the
+	 * first line break onward is dropped, so a newline already present in
+	 * the input also ends the result.
+	 *
+	 * @param string $text text to shorten
+	 * @return string the first line of the wrapped text
+	 */
+	private function limitstrlen($text){
+		
+		$wrapped = wordwrap($text, 300, "\n");
+		$cut = strpos($wrapped, "\n");
+		
+		if($cut === false){
+			
+			// single line: nothing to drop
+			return $wrapped;
+		}
+		
+		return substr($wrapped, 0, $cut);
+	}
+}
-- 
cgit v1.2.3