From cdf958d29333d448f4521f4d2faa2592b58e9b27 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sun, 10 Aug 2025 21:55:15 -0400 Subject: fix wikipedia crash --- lib/anubis.php | 100 ++++ lib/backend.php | 178 ++++++ lib/bingcache-todo-fix.php | 144 +++++ lib/bot_protection.php | 281 +++++++++ lib/curlproxy.php | 660 +++++++++++++++++++++ lib/favicon404.png | Bin 0 -> 744 bytes lib/frontend.php | 1356 ++++++++++++++++++++++++++++++++++++++++++++ lib/fuckhtml.php | 622 ++++++++++++++++++++ lib/img404.png | Bin 0 -> 216 bytes lib/type-todo.php | 132 +++++ 10 files changed, 3473 insertions(+) create mode 100644 lib/anubis.php create mode 100644 lib/backend.php create mode 100644 lib/bingcache-todo-fix.php create mode 100644 lib/bot_protection.php create mode 100644 lib/curlproxy.php create mode 100644 lib/favicon404.png create mode 100644 lib/frontend.php create mode 100644 lib/fuckhtml.php create mode 100644 lib/img404.png create mode 100644 lib/type-todo.php (limited to 'lib') diff --git a/lib/anubis.php b/lib/anubis.php new file mode 100644 index 0000000..2bd6d90 --- /dev/null +++ b/lib/anubis.php @@ -0,0 +1,100 @@ +fuckhtml = new fuckhtml(); + } + + public function scrape($html){ + + $this->fuckhtml->load($html); + + $script = + $this->fuckhtml + ->getElementById( + "anubis_challenge", + "script" + ); + + if($script === false){ + + throw new Exception("Failed to scrape anubis challenge data"); + } + + $script = + json_decode( + $this->fuckhtml + ->getTextContent( + $script + ), + true + ); + + if($script === null){ + + throw new Exception("Failed to decode anubis challenge data"); + } + + if( + !isset($script["challenge"]) || + !isset($script["rules"]["difficulty"]) || + !is_int($script["rules"]["difficulty"]) || + !is_string($script["challenge"]) + ){ + + throw new Exception("Found invalid challenge data"); + } + + return $this->rape($script["challenge"], $script["rules"]["difficulty"]); + } + + private function is_valid_hash($hash, $difficulty){ + + for ($i=0; $i<$difficulty; $i++) { + + $index = (int)floor($i / 2); + $nibble = $i % 2; + + $byte = ord($hash[$index]); + $nibble = ($byte >> ($nibble === 0 ? 4 : 0)) & 0x0f; + + if($nibble !== 0){ + return false; + } + } + + return true; + } + + public function rape($data, $difficulty = 5){ + + $nonce = 0; + + while(true){ + + $hash_binary = hash("sha256", $data . $nonce, true); + + if($this->is_valid_hash($hash_binary, $difficulty)){ + + $hash_hex = bin2hex($hash_binary); + + return [ + "response" => $hash_hex, + //"data" => $data, + //"difficulty" => $difficulty, + "nonce" => $nonce + ]; + } + + $nonce++; + } + } +} diff --git a/lib/backend.php b/lib/backend.php new file mode 100644 index 0000000..66e78a1 --- /dev/null +++ b/lib/backend.php @@ -0,0 +1,178 @@ +scraper = $scraper; + } + + /* + Proxy stuff + */ + public function get_ip(){ + + $pool = constant("config::PROXY_" . strtoupper($this->scraper)); + if($pool === false){ + + // we don't want a proxy, fuck off! + return 'raw_ip::::'; + } + + // indent + $proxy_index_raw = apcu_inc("p." . $this->scraper); + + $proxylist = file_get_contents("data/proxies/" . $pool . ".txt"); + $proxylist = explode("\n", $proxylist); + + // ignore empty or commented lines + $proxylist = array_filter($proxylist, function($entry){ + $entry = ltrim($entry); + return strlen($entry) > 0 && substr($entry, 0, 1) != "#"; + }); + + $proxylist = array_values($proxylist); + + return $proxylist[$proxy_index_raw % count($proxylist)]; + } + + // this function is also called directly on nextpage + public function assign_proxy(&$curlproc, string $ip){ + + // parse proxy line + [ + $type, + $address, + $port, + $username, + $password + ] = explode(":", $ip, 5); + + switch($type){ + + case "raw_ip": + return; + break; + + case "http": + case "https": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); + curl_setopt($curlproc, CURLOPT_PROXY, $type . "://" . $address . ":" . $port); + break; + + case "socks4": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + + case "socks5": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + + case "socks4a": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + + case "socks5_hostname": + case "socks5h": + case "socks5a": + curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME); + curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); + break; + } + + if($username != ""){ + + curl_setopt($curlproc, CURLOPT_PROXYUSERPWD, $username . ":" . $password); + } + } + + + + /* + Next page stuff + */ + public function store(string $payload, string $page, string $proxy){ + + $key = sodium_crypto_secretbox_keygen(); + $nonce = random_bytes(SODIUM_CRYPTO_SECRETBOX_NONCEBYTES); + + $requestid = apcu_inc("requestid"); + + apcu_store( + $page[0] . "." . // first letter of page name + $this->scraper . // scraper name + $requestid, + [ + $nonce, + $proxy, + // compress and encrypt + sodium_crypto_secretbox( + gzdeflate($payload), + $nonce, + $key + ) + ], + 900 // cache information for 15 minutes + ); + + return + $this->scraper . $requestid . "." . + rtrim(strtr(base64_encode($key), '+/', '-_'), '='); + } + + public function get(string $npt, string $page){ + + $page = $page[0]; + $explode = explode(".", $npt, 2); + + if(count($explode) !== 2){ + + throw new Exception("Malformed nextPageToken!"); + } + + $apcu = $page . "." . $explode[0]; + $key = $explode[1]; + + $payload = apcu_fetch($apcu); + + if($payload === false){ + + throw new Exception("The next page token is invalid or has expired!"); + } + + $key = + base64_decode( + str_pad( + strtr($key, '-_', '+/'), + strlen($key) % 4, + '=', + STR_PAD_RIGHT + ) + ); + + // decrypt and decompress data + $payload[2] = + gzinflate( + sodium_crypto_secretbox_open( + $payload[2], // data + $payload[0], // nonce + $key + ) + ); + + if($payload[2] === false){ + + throw new Exception("The next page token is invalid or has expired!"); + } + + // remove the key after using successfully + apcu_delete($apcu); + + return [ + $payload[2], // data + $payload[1] // proxy + ]; + } +} diff --git a/lib/bingcache-todo-fix.php b/lib/bingcache-todo-fix.php new file mode 100644 index 0000000..a4acb5b --- /dev/null +++ b/lib/bingcache-todo-fix.php @@ -0,0 +1,144 @@ + + +new bingcache(); + +class bingcache{ + + public function __construct(){ + + if( + !isset($_GET["s"]) || + $this->validate_url($_GET["s"]) === false + ){ + + var_dump($this->validate_url($_GET["s"])); + $this->do404("Please provide a valid URL."); + } + + $url = $_GET["s"]; + + $curlproc = curl_init(); + + curl_setopt( + $curlproc, + CURLOPT_URL, + "https://www.bing.com/search?q=url%3A" . + urlencode($url) + ); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt( + $curlproc, + CURLOPT_HTTPHEADER, + ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0", + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 5); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + $this->do404("Failed to connect to bing servers. Please try again later."); + } + + curl_close($curlproc); + + preg_match( + '/
/', + $data, + $keys + ); + + print_r($keys); + + if(count($keys) === 0){ + + $this->do404("Bing has not archived this URL."); + } + + $keys = explode("|", $keys[1]); + $count = count($keys); + + //header("Location: https://cc.bingj.com/cache.aspx?d=" . $keys[$count - 2] . "&w=" . $keys[$count - 1]); + echo("Location: https://cc.bingj.com/cache.aspx?d=" . $keys[$count - 2] . "&w=" . $keys[$count - 1]); + } + + public function do404($text){ + + include "lib/frontend.php"; + $frontend = new frontend(); + + echo + $frontend->load( + "error.html", + [ + "title" => "Shit", + "text" => $text + ] + ); + + die(); + } + + public function validate_url($url){ + + $url_parts = parse_url($url); + + // check if required parts are there + if( + !isset($url_parts["scheme"]) || + !( + $url_parts["scheme"] == "http" || + $url_parts["scheme"] == "https" + ) || + !isset($url_parts["host"]) + ){ + return false; + } + + if( + // if its not an RFC-valid URL + !filter_var($url, FILTER_VALIDATE_URL) + ){ + return false; + } + + $ip = + str_replace( + ["[", "]"], // handle ipv6 + "", + $url_parts["host"] + ); + + // if its not an IP + if(!filter_var($ip, FILTER_VALIDATE_IP)){ + + // resolve domain's IP + $ip = gethostbyname($url_parts["host"] . "."); + } + + // check if its localhost + return filter_var( + $ip, + FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE + ); + } +} diff --git a/lib/bot_protection.php b/lib/bot_protection.php new file mode 100644 index 0000000..e3d51a8 --- /dev/null +++ b/lib/bot_protection.php @@ -0,0 +1,281 @@ +loadheader( + $get, + $filters, + $page + ); + } + return; + } + + /* + Validate cookie, if it exists + */ + if(isset($_COOKIE["pass"])){ + + if( + // check if key is not malformed + preg_match( + '/^k[0-9]+\.[A-Za-z0-9_]{20}$/', + $_COOKIE["pass"] + ) && + // does key exist + apcu_exists($_COOKIE["pass"]) + ){ + + // exists, increment counter + $inc = apcu_inc($_COOKIE["pass"]); + + // we start counting from 1 + // when it has been incremented to 102, it has reached + // 100 reqs + if($inc >= config::MAX_SEARCHES + 2){ + + // reached limit, delete and give captcha + apcu_delete($_COOKIE["pass"]); + }else{ + + // the cookie is OK! dont die() and give results + apcu_inc("real_requests"); + + if($output === true){ + $frontend->loadheader( + $get, + $filters, + $page + ); + } + return; + } + } + } + + if($output === false){ + + http_response_code(401); // forbidden + echo json_encode([ + "status" => "The \"pass\" token in your cookies is missing or has expired!!" + ]); + die(); + } + + /* + Validate form data + */ + $lines = + explode( + "\r\n", + file_get_contents("php://input") + ); + + $invalid = false; + $answers = []; + $key = false; + $error = ""; + + foreach($lines as $line){ + + $line = explode("=", $line, 2); + + if(count($line) !== 2){ + + $invalid = true; + break; + } + + preg_match( + '/^c\[([0-9]+)\]$/', + $line[0], + $regex + ); + + if( + $line[1] != "on" || + !isset($regex[0][1]) + ){ + + // check if its the v key + if( + $line[0] == "v" && + preg_match( + '/^c[0-9]+\.[A-Za-z0-9_]{20}$/', + $line[1] + ) + ){ + + $key = apcu_fetch($line[1]); + apcu_delete($line[1]); + } + break; + } + + $regex = (int)$regex[1]; + + if( + $regex >= 16 || + $regex <= -1 + ){ + + $invalid = true; + break; + } + + $answers[] = $regex; + } + + if( + !$invalid && + $key !== false // has captcha been gen'd? + ){ + $check = count($key); + + // validate answer + for($i=0; $irandomchars(); + + apcu_inc($key, 1, $stupid, 86400); + + apcu_inc("real_requests"); + + setcookie( + "pass", + $key, + [ + "expires" => time() + 86400, // expires in 24 hours + "samesite" => "Lax", + "path" => "/" + ] + ); + + $frontend->loadheader( + $get, + $filters, + $page + ); + return; + + }else{ + + $error = "
You were kicked out of Mensa. Please try again.
"; + } + } + + $key = "c" . apcu_inc("captcha_gen", 1) . "." . $this->randomchars(); + + $payload = [ + "timetaken" => microtime(true), + "class" => "", + "right-left" => "", + "right-right" => "", + "left" => + '
' . + '

IQ test

' . + 'IQ test has been enabled due to bot abuse on the network.
' . + 'Solving this IQ test will let you make 100 searches today. I will add an invite system to bypass this soon...' . + $error . + '
' . + '
' . + '
' . + 'Captcha image' . + '
' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '' . + '
' . + '
' . + '
' . + '' . + '' . + '
' . + '
' + ]; + + $frontend->loadheader( + $get, + $filters, + $page + ); + + echo $frontend->load("search.html", $payload); + die(); + } + + private function randomchars(){ + + $chars = + array_merge( + range("A", "Z"), + range("a", "z"), + range(0, 9) + ); + + $chars[] = "_"; + + $c = count($chars) - 1; + + $key = ""; + + for($i=0; $i<20; $i++){ + + $key .= $chars[random_int(0, $c)]; + } + + return $key; + } +} diff --git a/lib/curlproxy.php b/lib/curlproxy.php new file mode 100644 index 0000000..313ab01 --- /dev/null +++ b/lib/curlproxy.php @@ -0,0 +1,660 @@ +cache = $cache; + } + + public function do404(){ + + http_response_code(404); + header("Content-Type: image/png"); + + $handle = fopen("lib/img404.png", "r"); + echo fread($handle, filesize("lib/img404.png")); + fclose($handle); + + die(); + return; + } + + public function getabsoluteurl($path, $relative){ + + if($this->validateurl($path)){ + + return $path; + } + + if(substr($path, 0, 2) == "//"){ + + return "https:" . $path; + } + + $url = null; + + $relative = parse_url($relative); + $url = $relative["scheme"] . "://"; + + if( + isset($relative["user"]) && + isset($relative["pass"]) + ){ + + $url .= $relative["user"] . ":" . $relative["pass"] . "@"; + } + + $url .= $relative["host"]; + + if(isset($relative["path"])){ + + $relative["path"] = explode( + "/", + $relative["path"] + ); + + unset($relative["path"][count($relative["path"]) - 1]); + $relative["path"] = implode("/", $relative["path"]); + + $url .= $relative["path"]; + } + + if( + strlen($path) !== 0 && + $path[0] !== "/" + ){ + + $url .= "/"; + } + + $url .= $path; + + return $url; + } + + public function validateurl($url){ + + $url_parts = parse_url($url); + + // check if required parts are there + if( + !isset($url_parts["scheme"]) || + !( + $url_parts["scheme"] == "http" || + $url_parts["scheme"] == "https" + ) || + !isset($url_parts["host"]) + ){ + return false; + } + + $ip = + str_replace( + ["[", "]"], // handle ipv6 + "", + $url_parts["host"] + ); + + // if its not an IP + if(!filter_var($ip, FILTER_VALIDATE_IP)){ + + // resolve domain's IP + $ip = gethostbyname($url_parts["host"] . "."); + } + + // check if its localhost + if( + filter_var( + $ip, + FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE + ) === false + ){ + + return false; + } + + return true; + } + + public function get($url, $reqtype = self::req_web, $acceptallcodes = false, $referer = null, $redirectcount = 0){ + + if($redirectcount === 5){ + + throw new Exception("Too many redirects"); + } + + if($url == "https://i.imgur.com/removed.png"){ + + throw new Exception("Encountered imgur 404"); + } + + // sanitize URL + if($this->validateurl($url) === false){ + + throw new Exception("Invalid URL"); + } + + $this->clientcache(); + + $curl = curl_init(); + + curl_setopt($curl, CURLOPT_URL, $url); + curl_setopt($curl, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curl, CURLOPT_HEADER, 1); + + switch($reqtype){ + case self::req_web: + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1" + ] + ); + break; + + case self::req_image: + + if($referer === null){ + $referer = explode("/", $url, 4); + array_pop($referer); + + $referer = implode("/", $referer); + } + + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: " . config::USER_AGENT, + "Accept: image/avif,image/webp,*/*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate", + "DNT: 1", + "Connection: keep-alive", + "Referer: {$referer}" + ] + ); + break; + } + + curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curl, CURLOPT_TIMEOUT, 30); + + // limit size of payloads + curl_setopt($curl, CURLOPT_BUFFERSIZE, 1024); + curl_setopt($curl, CURLOPT_NOPROGRESS, false); + curl_setopt( + $curl, + CURLOPT_PROGRESSFUNCTION, + function($downloadsize, $downloaded, $uploadsize, $uploaded + ){ + + // if $downloaded exceeds 100MB, fuck off + return ($downloaded > 100000000) ? 1 : 0; + }); + + $body = curl_exec($curl); + + if(curl_errno($curl)){ + + throw new Exception(curl_error($curl)); + } + + curl_close($curl); + + $headers = []; + $http = null; + + while(true){ + + $header = explode("\n", $body, 2); + $body = $header[1]; + + if($http === null){ + + // http/1.1 200 ok + $header = explode("/", $header[0], 2); + $header = explode(" ", $header[1], 3); + + $http = [ + "version" => (float)$header[0], + "code" => (int)$header[1] + ]; + + continue; + } + + if(trim($header[0]) == ""){ + + // reached end of headers + break; + } + + $header = explode(":", $header[0], 2); + + // malformed headers + if(count($header) !== 2){ continue; } + + $headers[strtolower(trim($header[0]))] = trim($header[1]); + } + + // check http code + if( + $http["code"] >= 300 && + $http["code"] <= 309 + ){ + + // redirect + if(!isset($headers["location"])){ + + throw new Exception("Broken redirect"); + } + + $redirectcount++; + + return $this->get($this->getabsoluteurl($headers["location"], $url), $reqtype, $acceptallcodes, $referer, $redirectcount); + }else{ + if( + $acceptallcodes === false && + $http["code"] > 300 + ){ + + throw new Exception("Remote server returned an error code! ({$http["code"]})"); + } + } + + // check if data is okay + switch($reqtype){ + + case self::req_image: + + $format = false; + + if(isset($headers["content-type"])){ + + if(stripos($headers["content-type"], "text/html") !== false){ + + throw new Exception("Server returned html"); + } + + if( + preg_match( + '/image\/([^ ]+)/i', + $headers["content-type"], + $match + ) + ){ + + $format = strtolower($match[1]); + + if(substr(strtolower($format), 0, 2) == "x-"){ + + $format = substr($format, 2); + } + } + } + + return [ + "http" => $http, + "format" => $format, + "headers" => $headers, + "body" => $body + ]; + break; + + default: + + return [ + "http" => $http, + "headers" => $headers, + "body" => $body + ]; + break; + } + + return; + } + + public function stream_linear_image($url, $referer = null){ + + $this->stream($url, $referer, "image"); + } + + public function stream_linear_audio($url, $referer = null){ + + $this->stream($url, $referer, "audio"); + } + + private function stream($url, $referer, $format){ + + $this->clientcache(); + + $this->url = $url; + $this->format = $format; + + // sanitize URL + if($this->validateurl($url) === false){ + + throw new Exception("Invalid URL"); + } + + $curl = curl_init(); + + // set headers + if($referer === null){ + $referer = explode("/", $url, 4); + array_pop($referer); + + $referer = implode("/", $referer); + } + + switch($format){ + + case "image": + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: " . config::USER_AGENT, + "Accept: image/avif,image/webp,*/*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br", + "DNT: 1", + "Connection: keep-alive", + "Referer: {$referer}" + ] + ); + break; + + case "audio": + curl_setopt( + $curl, + CURLOPT_HTTPHEADER, + [ + "User-Agent: " . config::USER_AGENT, + "Accept: audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br", + "DNT: 1", + "Connection: keep-alive", + "Referer: {$referer}" + ] + ); + break; + } + + // follow redirects + curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($curl, CURLOPT_MAXREDIRS, 5); + curl_setopt($curl, CURLOPT_AUTOREFERER, 5); + + // set url + curl_setopt($curl, CURLOPT_URL, $url); + curl_setopt($curl, CURLOPT_ENCODING, ""); // default encoding + + // timeout + disable ssl + curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10); + curl_setopt($curl, CURLOPT_TIMEOUT, 30); + + curl_setopt( + $curl, + CURLOPT_WRITEFUNCTION, + function($c, $data){ + + if(curl_getinfo($c, CURLINFO_HTTP_CODE) !== 200){ + + throw new Exception("Serber returned a non-200 code"); + } + + echo $data; + return strlen($data); + } + ); + + $this->empty_header = false; + $this->cont = false; + $this->headers_tmp = []; + $this->headers = []; + curl_setopt( + $curl, + CURLOPT_HEADERFUNCTION, + function($c, $header){ + + $head = trim($header); + $len = strlen($head); + + if($len === 0){ + + $this->empty_header = true; + $this->headers_tmp = []; + }else{ + + $this->empty_header = false; + $this->headers_tmp[] = $head; + } + + foreach($this->headers_tmp as $h){ + + // parse headers + $h = explode(":", $h, 2); + + if(count($h) !== 2){ + + if(curl_getinfo($c, CURLINFO_HTTP_CODE) !== 200){ + + // not HTTP 200, probably a redirect + $this->cont = false; + }else{ + + $this->cont = true; + } + + // is HTTP 200, just ignore that line + continue; + } + + $this->headers[strtolower(trim($h[0]))] = trim($h[1]); + } + + if( + $this->cont && + $this->empty_header + ){ + + // get content type + if(isset($this->headers["content-type"])){ + + $octet_check = stripos($this->headers["content-type"], "octet-stream"); + + if( + stripos($this->headers["content-type"], $this->format) === false && + $octet_check === false + ){ + + throw new Exception("Resource reported invalid Content-Type"); + } + + }else{ + + throw new Exception("Resource is not an {$this->format} (no Content-Type)"); + } + + $filetype = explode("/", $this->headers["content-type"]); + + if(!isset($filetype[1])){ + + throw new Exception("Malformed Content-Type header"); + } + + if($octet_check !== false){ + + $filetype[1] = "jpeg"; + } + + header("Content-Type: {$this->format}/{$filetype[1]}"); + + // give payload size + if(isset($this->headers["content-length"])){ + + header("Content-Length: {$this->headers["content-length"]}"); + } + + // give filename + $this->getfilenameheader($this->headers, $this->url, $filetype[1]); + } + + return strlen($header); + } + ); + + curl_exec($curl); + + if(curl_errno($curl)){ + + throw new Exception(curl_error($curl)); + } + + curl_close($curl); + } + + public function getfilenameheader($headers, $url, $filetype = "jpg"){ + + // get filename from content-disposition header + if(isset($headers["content-disposition"])){ + + preg_match( + '/filename=([^;]+)/', + $headers["content-disposition"], + $filename + ); + + if(isset($filename[1])){ + + header("Content-Disposition: filename=\"" . trim($filename[1], "\"'") . "." . $filetype . "\""); + return; + } + } + + // get filename from URL + $filename = parse_url($url, PHP_URL_PATH); + + if($filename === null){ + + // everything failed! rename file to domain name + header("Content-Disposition: filename=\"" . parse_url($url, PHP_URL_HOST) . "." . $filetype . "\""); + return; + } + + // remove extension from filename + $filename = + explode( + ".", + basename($filename) + ); + + if(count($filename) > 1){ + array_pop($filename); + } + + $filename = implode(".", $filename); + + header("Content-Disposition: inline; filename=\"" . $filename . "." . $filetype . "\""); + return; + } + + public function getimageformat($payload, &$imagick){ + + $finfo = new finfo(FILEINFO_MIME_TYPE); + $format = $finfo->buffer($payload["body"]); + + if($format === false){ + + if($payload["format"] === false){ + + header("X-Error: Could not parse format"); + $this->favicon404(); + } + + $format = $payload["format"]; + }else{ + + $format_tmp = explode("/", $format, 2); + + if($format_tmp[0] == "image"){ + + $format_tmp = strtolower($format_tmp[1]); + + if(substr($format_tmp, 0, 2) == "x-"){ + + $format_tmp = substr($format_tmp, 2); + } + + $format = $format_tmp; + } + } + + switch($format){ + + case "tiff": $format = "gif"; break; + case "vnd.microsoft.icon": $format = "ico"; break; + case "icon": $format = "ico"; break; + case "svg+xml": $format = "svg"; break; + } + + $imagick = new Imagick(); + + if( + !in_array( + $format, + array_map("strtolower", $imagick->queryFormats()) + ) + ){ + + // format could not be found, but imagemagick can + // sometimes detect it? shit's fucked + $format = false; + } + + return $format; + } + + public function clientcache(){ + + if($this->cache === false){ + + return; + } + + header("Last-Modified: Thu, 01 Oct 1970 00:00:00 GMT"); + $headers = getallheaders(); + + if( + isset($headers["If-Modified-Since"]) || + isset($headers["If-Unmodified-Since"]) + ){ + + http_response_code(304); // 304: Not Modified + die(); + } + } +} diff --git a/lib/favicon404.png b/lib/favicon404.png new file mode 100644 index 0000000..fa8f4d1 Binary files /dev/null and b/lib/favicon404.png differ diff --git a/lib/frontend.php b/lib/frontend.php new file mode 100644 index 0000000..9f819ba --- /dev/null +++ b/lib/frontend.php @@ -0,0 +1,1356 @@ +'; + }else{ + + $replacements["style"] = ""; + } + + if(isset($_COOKIE["scraper_ac"])){ + + $replacements["ac"] = '?ac=' . htmlspecialchars($_COOKIE["scraper_ac"]); + }else{ + + $replacements["ac"] = ''; + } + + if( + isset($replacements["timetaken"]) && + $replacements["timetaken"] !== null + ){ + + $replacements["timetaken"] = '
Took ' . number_format(microtime(true) - $replacements["timetaken"], 2) . 's
'; + } + + $handle = fopen("template/{$template}", "r"); + $data = fread($handle, filesize("template/{$template}")); + fclose($handle); + + $data = explode("\n", $data); + $html = ""; + + for($i=0; $i $value){ + + $html = + str_replace( + "{%{$key}%}", + $value, + $html + ); + } + + return trim($html); + } + + public function loadheader(array $get, array $filters, string $page){ + + echo + $this->load("header.html", [ + "title" => trim(htmlspecialchars($get["s"]) . " ({$page})"), + "description" => ucfirst($page) . ' search results for "' . htmlspecialchars($get["s"]) . '"', + "index" => "no", + "search" => htmlspecialchars($get["s"]), + "tabs" => $this->generatehtmltabs($page, $get["s"]), + "filters" => $this->generatehtmlfilters($filters, $get) + ]); + + $headers_raw = getallheaders(); + $header_keys = []; + $user_agent = ""; + $bad_header = false; + + // block bots that present X-Forwarded-For, Via, etc + foreach($headers_raw as $headerkey => $headervalue){ + + $headerkey = strtolower($headerkey); + if($headerkey == "user-agent"){ + + $user_agent = $headervalue; + continue; + } + + // check header key + if(in_array($headerkey, config::FILTERED_HEADER_KEYS)){ + + $bad_header = true; + break; + } + } + + // SSL check + $bad_ssl = false; + if( + isset($_SERVER["https"]) && + $_SERVER["https"] == "on" && + isset($_SERVER["SSL_CIPHER"]) && + in_array($_SERVER["SSL_CIPHER"], config::FILTERED_HEADER_KEYS) + ){ + + $bad_ssl = true; + } + + if( + $bad_header === true || + $bad_ssl === true || + $user_agent == "" || + // user agent check + preg_match( + config::HEADER_REGEX, + $user_agent + ) + ){ + + // bot detected !! + apcu_inc("captcha_gen"); + + $this->drawerror( + "Tshh, blocked!", + 'Your browser, IP or IP range has been blocked from this 4get instance. If this is an error, please contact the administrator.' + ); + die(); + } + } + + public function drawerror($title, $error, $timetaken = null){ + + if($timetaken === null){ + + $timetaken = microtime(true); + } + + echo + $this->load("search.html", [ + "timetaken" => $timetaken, + "class" => "", + "right-left" => "", + "right-right" => "", + "left" => + '
' . + '

' . htmlspecialchars($title) . '

' . + $error . + '
' + ]); + die(); + } + + public function drawscrapererror($error, $get, $target, $timetaken = null){ + + if($timetaken === null){ + + $timetaken = microtime(true); + } + + $this->drawerror( + "Shit", + 'This scraper returned an error:' . + '
' . htmlspecialchars($error) . '
' . + 'Things you can try:' . + '
' . + 'If the error persists, please contact the administrator.', + $timetaken + ); + } + + public function drawtextresult($site, $greentext = null, $duration = null, $keywords, $tabindex = true, $customhtml = null){ + + $payload = + '
'; + + // add favicon, link and archive links + $payload .= $this->drawlink($site["url"]); + + /* + Draw title + description + filetype + */ + $payload .= + '' . + 'thumb'; + + if($duration !== null){ + + $payload .= + '
' . + htmlspecialchars($duration) . + '
'; + } + + $payload .= + '
'; + } + + $payload .= + '
'; + + if( + isset($site["type"]) && + $site["type"] != "web" + ){ + + $payload .= '
' . strtoupper($site["type"]) . '
'; + } + + $payload .= + $this->highlighttext($keywords, $site["title"]) . + '
'; + + if($greentext !== null){ + + $payload .= + '
' . + htmlspecialchars($greentext) . + '
'; + } + + if($site["description"] !== null){ + + $payload .= + '
' . + $this->highlighttext($keywords, $site["description"]) . + '
'; + } + + $payload .= $customhtml; + + $payload .= '
'; + + /* + Sublinks + */ + if( + isset($site["sublink"]) && + !empty($site["sublink"]) + ){ + + usort($site["sublink"], function($a, $b){ + + return strlen($a["description"]) > strlen($b["description"]); + }); + + $payload .= + ''; + } + + if( + isset($site["table"]) && + !empty($site["table"]) + ){ + + $payload .= ''; + + foreach($site["table"] as $title => $value){ + + $payload .= + '' . + '' . + '' . + ''; + } + + $payload .= '
' . htmlspecialchars($title) . '' . htmlspecialchars($value) . '
'; + } + + return $payload . '
'; + } + + public function highlighttext($keywords, $text){ + + $text = htmlspecialchars($text); + + $keywords = explode(" ", $keywords); + $regex = []; + + foreach($keywords as $word){ + + $regex[] = "\b" . preg_quote($word, "/") . "\b"; + } + + $regex = "/" . implode("|", $regex) . "/i"; + + return + preg_replace( + $regex, + '${0}', + $text + ); + } + + function highlightcode($text){ + + // https://www.php.net/highlight_string + ini_set("highlight.comment", "c-comment"); + ini_set("highlight.default", "c-default"); + ini_set("highlight.html", "c-default"); + ini_set("highlight.keyword", "c-keyword"); + ini_set("highlight.string", "c-string"); + + $text = + trim( + preg_replace( + '/]+>/', + "", + str_replace( + [ + "
", + " ", + "
",
+							"
", + "
" + ], + [ + "\n", + " ", + "", + "", + "" + ], + explode( + "<?php", + highlight_string("', '', $text); + } + + return $text; + } + + public function drawlink($link){ + + /* + Add favicon + */ + $host = parse_url($link); + $esc = + explode( + ".", + $host["host"], + 2 + ); + + if( + count($esc) === 2 && + $esc[0] == "www" + ){ + + $esc = $esc[1]; + }else{ + + $esc = $esc[0]; + } + + $esc = substr($esc, 0, 2); + + $urlencode = urlencode($link); + + $payload = + '
' . + '' . + '
'; + + /* + Add archive links + */ + if( + $host["host"] == "boards.4chan.org" || + $host["host"] == "boards.4channel.org" + ){ + + $archives = []; + $path = explode("/", $host["path"]); + $count = count($path); + // /pol/thread/417568063/post-shitty-memes-if-you-want-to + + if($count !== 0){ + + $isboard = true; + + switch($path[1]){ + + case "con": + break; + + case "q": + $archives[] = "desuarchive.org"; + break; + + case "qa": + $archives[] = "desuarchive.org"; + break; + + case "qb": + $archives[] = "arch.b4k.co"; + break; + + case "trash": + $archives[] = "desuarchive.org"; + break; + + case "a": + $archives[] = "desuarchive.org"; + break; + + case "c": + $archives[] = "desuarchive.org"; + break; + + case "w": + break; + + case "m": + $archives[] = "desuarchive.org"; + break; + + case "cgl": + $archives[] = "desuarchive.org"; + $archives[] = "warosu.org"; + break; + + case "f": + $archives[] = "archive.4plebs.org"; + break; + + case "n": + break; + + case "jp": + $archives[] = "warosu.org"; + break; + + case "vt": + $archives[] = "warosu.org"; + break; + + case "v": + $archives[] = "arch.b4k.co"; + break; + + case "vg": + $archives[] = "arch.b4k.co"; + break; + + case "vm": + $archives[] = "arch.b4k.co"; + break; + + case "vmg": + $archives[] = "arch.b4k.co"; + break; + + case "vp": + $archives[] = "arch.b4k.co"; + break; + + case "vr": + $archives[] = "desuarchive.org"; + $archives[] = "warosu.org"; + break; + + case "vrpg": + $archives[] = "arch.b4k.co"; + break; + + case "vst": + $archives[] = "arch.b4k.co"; + break; + + case "co": + $archives[] = "desuarchive.org"; + break; + + case "g": + $archives[] = "desuarchive.org"; + $archives[] = "arch.b4k.co"; + break; + + case "tv": + $archives[] = "archive.4plebs.org"; + break; + + case "k": + $archives[] = "desuarchive.org"; + break; + + case "o": + $archives[] = "archive.4plebs.org"; + break; + + case "an": + $archives[] = "desuarchive.org"; + break; + + case "tg": + $archives[] = "desuarchive.org"; + $archives[] = "archive.4plebs.org"; + break; + + case "sp": + $archives[] = "archive.4plebs.org"; + break; + + case "xs": + $archives[] = "eientei.xyz"; + break; + + case "pw": + break; + + case "sci": + $archives[] = "warosu.org"; + $archives[] = "eientei.xyz"; + break; + + case "his": + $archives[] = "desuarchive.org"; + break; + + case "int": + $archives[] = "desuarchive.org"; + break; + + case "out": + break; + + case "toy": + break; + + case "i": + $archives[] = "archiveofsins.com"; + $archives[] = "eientei.xyz"; + break; + + case "po": + break; + + case "p": + break; + + case "ck": + $archives[] = "warosu.org"; + break; + + case "ic": + $archives[] = "warosu.org"; + break; + + case "wg": + break; + + case "lit": + $archives[] = "warosu.org"; + break; + + case "mu": + $archives[] = "desuarchive.org"; + break; + + case "fa": + $archives[] = "warosu.org"; + break; + + case "3": + $archives[] = "warosu.org"; + $archives[] = "eientei.xyz"; + break; + + case "gd": + break; + + case "diy": + $archives[] = "warosu.org"; + break; + + case "wsg": + $archives[] = "desuarchive.org"; + break; + + case "qst": + break; + + case "biz": + $archives[] = "warosu.org"; + break; + + case "trv": + $archives[] = "archive.4plebs.org"; + break; + + case "fit": + $archives[] = "desuarchive.org"; + break; + + case "x": + $archives[] = "archive.4plebs.org"; + break; + + case "adv": + $archives[] = "archive.4plebs.org"; + break; + + case "lgbt": + $archives[] = "archiveofsins.com"; + break; + + case "mlp": + $archives[] = "desuarchive.org"; + $archives[] = "arch.b4k.co"; + break; + + case "news": + break; + + case "wsr": + break; + + case "vip": + break; + + case "b": + $archives[] = "thebarchive.com"; + break; + + case "r9k": + $archives[] = "desuarchive.org"; + break; + + case "pol": + $archives[] = "archive.4plebs.org"; + break; + + case "bant": + $archives[] = "thebarchive.com"; + break; + + case "soc": + $archives[] = "archiveofsins.com"; + break; + + case "s4s": + $archives[] = "archive.4plebs.org"; + break; + + case "s": + $archives[] = "archiveofsins.com"; + break; + + case "hc": + $archives[] = "archiveofsins.com"; + break; + + case "hm": + $archives[] = "archiveofsins.com"; + break; + + case "h": + $archives[] = "archiveofsins.com"; + break; + + case "e": + break; + + case "u": + $archives[] = "archiveofsins.com"; + break; + + case "d": + $archives[] = "desuarchive.org"; + break; + + case "t": + $archives[] = "archiveofsins.com"; + break; + + case "hr": + $archives[] = "archive.4plebs.org"; + break; + + case "gif": + break; + + case "aco": + $archives[] = "desuarchive.org"; + break; + + case "r": + $archives[] = "archiveofsins.com"; + break; + + default: + $isboard = false; + break; + } + + if($isboard === true){ + + $archives[] = "archived.moe"; + } + + $trail = ""; + + if( + isset($path[2]) && + isset($path[3]) && + $path[2] == "thread" + ){ + + $trail .= "/" . $path[1] . "/thread/" . $path[3]; + }elseif($isboard){ + + $trail = "/" . $path[1] . "/"; + } + + for($i=0; $i' . + '' . $archives[$i][0] . $archives[$i][1] . '' . + $archives[$i] . + ''; + } + } + } + + $payload .= + 'arArchive.org' . + 'arArchive.is' . + 'ghGhostarchive' . + 'arArquivo.pt' . + 'biBing cache' . + 'meMegalodon' . + '
'; + + /* + Draw link + */ + $parts = explode("/", $link); + $clickurl = ""; + + // remove trailing / + $c = count($parts) - 1; + if($parts[$c] == ""){ + + $parts[$c - 1] = $parts[$c - 1] . "/"; + unset($parts[$c]); + } + + // merge https://site together + $parts = [ + $parts[0] . $parts[1] . '//' . $parts[2], + ...array_slice($parts, 3, count($parts) - 1) + ]; + + $c = count($parts); + for($i=0; $i<$c; $i++){ + + if($i !== 0){ $clickurl .= "/"; } + + $clickurl .= $parts[$i]; + + if($i === $c - 1){ + + $parts[$i] = rtrim($parts[$i], "/"); + } + + $payload .= + '' . + htmlspecialchars(urldecode($parts[$i])) . + ''; + + if($i !== $c - 1){ + + $payload .= ''; + } + } + + return $payload . '
'; + } + + public function getscraperfilters($page){ + + $get_scraper = isset($_COOKIE["scraper_$page"]) ? $_COOKIE["scraper_$page"] : null; + + if( + isset($_GET["scraper"]) && + is_string($_GET["scraper"]) + ){ + + $get_scraper = $_GET["scraper"]; + }else{ + + if( + isset($_GET["npt"]) && + is_string($_GET["npt"]) + ){ + + $get_scraper = explode(".", $_GET["npt"], 2)[0]; + + $get_scraper = + preg_replace( + '/[0-9]+$/', + "", + $get_scraper + ); + } + } + + // add search field + $filters = + [ + "s" => [ + "option" => "_SEARCH" + ] + ]; + + // define default scrapers + switch($page){ + + case "web": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "brave" => "Brave", + "yandex" => "Yandex", + "google" => "Google", + //"google_api" => "Google API", + "google_cse" => "Google CSE", + "startpage" => "Startpage", + "qwant" => "Qwant", + "ghostery" => "Ghostery", + "yep" => "Yep", + "greppr" => "Greppr", + "crowdview" => "Crowdview", + "mwmbl" => "Mwmbl", + "mojeek" => "Mojeek", + "baidu" => "Baidu", + "coccoc" => "Cốc Cốc", + //"solofield" => "Solofield", + "marginalia" => "Marginalia", + "wiby" => "wiby", + "curlie" => "Curlie" + ] + ]; + break; + + case "images": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "yandex" => "Yandex", + "brave" => "Brave", + "google" => "Google", + "google_cse" => "Google CSE", + "startpage" => "Startpage", + "qwant" => "Qwant", + "yep" => "Yep", + "baidu" => "Baidu", + //"solofield" => "Solofield", + "pinterest" => "Pinterest", + "cara" => "Cara", + "flickr" => "Flickr", + "fivehpx" => "500px", + "vsco" => "VSCO", + "imgur" => "Imgur", + "ftm" => "FindThatMeme", + //"sankakucomplex" => "SankakuComplex" + ] + ]; + break; + + case "videos": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "yt" => "YouTube", + "vimeo" => "Vimeo", + //"odysee" => "Odysee", + "sepiasearch" => "Sepia Search", + //"fb" => "Facebook videos", + "ddg" => "DuckDuckGo", + "brave" => "Brave", + "yandex" => "Yandex", + "google" => "Google", + "startpage" => "Startpage", + "qwant" => "Qwant", + "baidu" => "Baidu", + "coccoc" => "Cốc Cốc" + //"solofield" => "Solofield" + ] + ]; + break; + + case "news": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "ddg" => "DuckDuckGo", + "brave" => "Brave", + "google" => "Google", + "startpage" => "Startpage", + "qwant" => "Qwant", + "yep" => "Yep", + "mojeek" => "Mojeek", + "baidu" => "Baidu" + ] + ]; + break; + + case "music": + $filters["scraper"] = [ + "display" => "Scraper", + "option" => [ + "sc" => "SoundCloud" + //"spotify" => "Spotify" + ] + ]; + break; + } + + // get scraper name from user input, or default out to preferred scraper + $scraper_out = null; + $first = true; + + foreach($filters["scraper"]["option"] as $scraper_name => $scraper_pretty){ + + if($first === true){ + + $first = $scraper_name; + } + + if($scraper_name == $get_scraper){ + + $scraper_out = $scraper_name; + } + } + + if($scraper_out === null){ + + $scraper_out = $first; + } + + include "scraper/$scraper_out.php"; + $lib = new $scraper_out(); + + // set scraper on $_GET + $_GET["scraper"] = $scraper_out; + + // set nsfw on $_GET + if( + isset($_COOKIE["nsfw"]) && + !isset($_GET["nsfw"]) + ){ + + $_GET["nsfw"] = $_COOKIE["nsfw"]; + } + + return + [ + $lib, + array_merge_recursive( + $filters, + $lib->getfilters($page) + ) + ]; + } + + public function parsegetfilters($parameters, $whitelist){ + + $sanitized = []; + + // add npt token + if( + isset($parameters["npt"]) && + is_string($parameters["npt"]) + ){ + + $sanitized["npt"] = $parameters["npt"]; + }else{ + + $sanitized["npt"] = false; + } + + // we're iterating over $whitelist, so + // you can't polluate $sanitized with useless + // parameters + foreach($whitelist as $parameter => $value){ + + if(isset($parameters[$parameter])){ + + if(!is_string($parameters[$parameter])){ + + $sanitized[$parameter] = null; + continue; + } + + // parameter is already set, use that value + $sanitized[$parameter] = $parameters[$parameter]; + }else{ + + // parameter is not set, add it + if(is_string($value["option"])){ + + // special field: set default value manually + switch($value["option"]){ + + case "_DATE": + // no date set + $sanitized[$parameter] = false; + break; + + case "_SEARCH": + // no search set + $sanitized[$parameter] = ""; + break; + } + + }else{ + + // set a default value + $sanitized[$parameter] = array_keys($value["option"])[0]; + } + } + + // sanitize input + if(is_array($value["option"])){ + if( + !in_array( + $sanitized[$parameter], + $keys = array_keys($value["option"]) + ) + ){ + + $sanitized[$parameter] = $keys[0]; + } + }else{ + + // sanitize search & string + switch($value["option"]){ + + case "_DATE": + if($sanitized[$parameter] !== false){ + + $sanitized[$parameter] = strtotime($sanitized[$parameter]); + if($sanitized[$parameter] <= 0){ + + $sanitized[$parameter] = false; + } + } + break; + + case "_SEARCH": + // get search string + $sanitized["s"] = trim($sanitized[$parameter]); + } + } + } + + // invert dates if needed + if( + isset($sanitized["older"]) && + isset($sanitized["newer"]) && + $sanitized["newer"] !== false && + $sanitized["older"] !== false && + $sanitized["newer"] > $sanitized["older"] + ){ + + // invert + [ + $sanitized["older"], + $sanitized["newer"] + ] = [ + $sanitized["newer"], + $sanitized["older"] + ]; + } + + return $sanitized; + } + + public function s_to_timestamp($seconds){ + + if(is_string($seconds)){ + + return "LIVE"; + } + + return ($seconds >= 60) ? ltrim(gmdate("H:i:s", $seconds), ":0") : gmdate("0:s", $seconds); + } + + public function generatehtmltabs($page, $query){ + + $html = null; + + foreach(["web", "images", "videos", "news", "music"] as $type){ + + $html .= '' . ucfirst($type) . ''; + } + + return $html; + } + + public function generatehtmlfilters($filters, $params){ + + $html = null; + + foreach($filters as $filter_name => $filter_values){ + + if(!isset($filter_values["display"])){ + + continue; + } + + $output = true; + $tmp = + '
' . + '
' . htmlspecialchars($filter_values["display"]) . '
'; + + if(is_array($filter_values["option"])){ + + $tmp .= ''; + }else{ + + switch($filter_values["option"]){ + + case "_DATE": + $tmp .= ' $value){ + + if( + $value == null || + $value == false || + $key == "npt" || + $key == "extendedsearch" || + $value == "any" || + $value == "all" || + $key == "spellcheck" || + ( + $ommit === true && + $key == "s" + ) + ){ + + continue; + } + + if( + $key == "older" || + $key == "newer" + ){ + + $value = date("Y-m-d", (int)$value); + } + + $out[$key] = $value; + } + + return http_build_query($out); + } + + public function htmlimage($image, $format){ + + if( + preg_match( + '/^data:/', + $image + ) + ){ + + return htmlspecialchars($image); + } + + return "/proxy?i=" . urlencode($image) . "&s=" . $format; + } + + public function htmlnextpage($gets, $npt, $page){ + + $query = $this->buildquery($gets); + + return $page . "?" . $query . "&npt=" . $npt; + } +} diff --git a/lib/fuckhtml.php b/lib/fuckhtml.php new file mode 100644 index 0000000..3ea256f --- /dev/null +++ b/lib/fuckhtml.php @@ -0,0 +1,622 @@ +load($html, $isfile); + } + } + + public function load($html, $isfile = false){ + + if(is_array($html)){ + + if(!isset($html["innerHTML"])){ + + throw new Exception("(load) Supplied array doesn't contain an innerHTML index"); + } + $html = $html["innerHTML"]; + } + + if($isfile){ + + $handle = fopen($html, "r"); + $fetch = fread($handle, filesize($html)); + fclose($handle); + + $this->html = $fetch; + }else{ + + $this->html = $html; + } + + $this->strlen = strlen($this->html); + } + + public function getloadedhtml(){ + + return $this->html; + } + + public function getElementsByTagName(string $tagname){ + + $out = []; + + /* + Scrape start of the tag. Example +
... + */ + + if($tagname == "*"){ + + $tagname = '[A-Za-z0-9._-]+'; + }else{ + + $tagname = preg_quote(strtolower($tagname)); + } + + preg_match_all( + '/<\s*(' . $tagname . ')(\s(?:[^>\'"]*|"[^"]*"|\'[^\']*\')+)?\s*>/i', + /* '/<\s*(' . $tagname . ')(\s[\S\s]*?)?>/i', */ + $this->html, + $starting_tags, + PREG_OFFSET_CAPTURE + ); + + for($i=0; $i strtolower($starting_tags[1][$i][0]), + "startPos" => $starting_tags[0][$i][1], + "endPos" => 0, + "startTag" => $starting_tags[0][$i][0], + "attributes" => $attributes, + "innerHTML" => null + ]; + } + + /* + Get innerHTML + */ + // get closing tag positions + preg_match_all( + '/<\s*\/\s*(' . $tagname . ')\s*>/i', + $this->html, + $regex_closing_tags, + PREG_OFFSET_CAPTURE + ); + + // merge opening and closing tags together + for($i=0; $i strtolower($regex_closing_tags[1][$i][0]), + "endTag" => $regex_closing_tags[0][$i][0], + "startPos" => $regex_closing_tags[0][$i][1] + ]; + } + + usort( + $out, + function($a, $b){ + + return $a["startPos"] > $b["startPos"]; + } + ); + + // compute the indent level for each element + $level = []; + $count = count($out); + + for($i=0; $i<$count; $i++){ + + if(!isset($level[$out[$i]["tagName"]])){ + + $level[$out[$i]["tagName"]] = 0; + } + + if(isset($out[$i]["startTag"])){ + + // encountered starting tag + $level[$out[$i]["tagName"]]++; + $out[$i]["level"] = $level[$out[$i]["tagName"]]; + }else{ + + // encountered closing tag + $out[$i]["level"] = $level[$out[$i]["tagName"]]; + $level[$out[$i]["tagName"]]--; + } + } + + // if the indent level is the same for a div, + // we encountered _THE_ closing tag + for($i=0; $i<$count; $i++){ + + if(!isset($out[$i]["startTag"])){ + + continue; + } + + for($k=$i; $k<$count; $k++){ + + if( + isset($out[$k]["endTag"]) && + $out[$i]["tagName"] == $out[$k]["tagName"] && + $out[$i]["level"] + === $out[$k]["level"] + ){ + + $startlen = strlen($out[$i]["startTag"]); + $endlen = strlen($out[$k]["endTag"]); + + $out[$i]["endPos"] = $out[$k]["startPos"] + $endlen; + + $out[$i]["innerHTML"] = + substr( + $this->html, + $out[$i]["startPos"] + $startlen, + $out[$k]["startPos"] - ($out[$i]["startPos"] + $startlen) + ); + + $out[$i]["outerHTML"] = + substr( + $this->html, + $out[$i]["startPos"], + $out[$k]["startPos"] - $out[$i]["startPos"] + $endlen + ); + + break; + } + } + } + + // filter out ending divs + for($i=0; $i<$count; $i++){ + + if(isset($out[$i]["endTag"])){ + + unset($out[$i]); + } + + unset($out[$i]["startTag"]); + } + + return array_values($out); + } + + public function getElementsByAttributeName(string $name, $collection = null){ + + if($collection === null){ + + $collection = $this->getElementsByTagName("*"); + }elseif(is_string($collection)){ + + $collection = $this->getElementsByTagName($collection); + } + + $return = []; + foreach($collection as $elem){ + + foreach($elem["attributes"] as $attrib_name => $attrib_value){ + + if($attrib_name == $name){ + + $return[] = $elem; + continue 2; + } + } + } + + return $return; + } + + public function getElementsByFuzzyAttributeValue(string $name, string $value, $collection = null){ + + $elems = $this->getElementsByAttributeName($name, $collection); + + $value = + explode( + " ", + trim( + preg_replace( + '/\s+/', + " ", + $value + ) + ) + ); + + $return = []; + + foreach($elems as $elem){ + + foreach($elem["attributes"] as $attrib_name => $attrib_value){ + + $attrib_value = + explode( + " ", + trim( + preg_replace( + '/\s+/', + " ", + $attrib_value + ) + ) + ); + + $ac = count($attrib_value); + $nc = count($value); + $cr = 0; + + for($i=0; $i<$nc; $i++){ + + for($k=0; $k<$ac; $k++){ + + if($value[$i] == $attrib_value[$k]){ + + $cr++; + } + } + } + + if($cr === $nc){ + + $return[] = $elem; + continue 2; + } + } + } + + return $return; + } + + public function getElementsByAttributeValue(string $name, string $value, $collection = null){ + + $elems = $this->getElementsByAttributeName($name, $collection); + + $return = []; + + foreach($elems as $elem){ + + foreach($elem["attributes"] as $attrib_name => $attrib_value){ + + if($attrib_value == $value){ + + $return[] = $elem; + continue 2; + } + } + } + + return $return; + } + + public function getElementById(string $idname, $collection = null){ + + $id = $this->getElementsByAttributeValue("id", $idname, $collection); + + if(count($id) !== 0){ + + return $id[0]; + } + + return false; + } + + public function getElementsByClassName(string $classname, $collection = null){ + + return $this->getElementsByFuzzyAttributeValue("class", $classname, $collection); + } + + public function getTextContent($html, $whitespace = false, $trim = true){ + + if(is_array($html)){ + + if(!isset($html["innerHTML"])){ + + throw new Exception("(getTextContent) Supplied array doesn't contain an innerHTML index"); + } + + $html = $html["innerHTML"]; + } + + $html = preg_split('/\n|<\/?br>/i', $html); + + $out = ""; + for($i=0; $i "!" . $row["bang"], + "n" => $row["name"] + ]; + } + + return $results; + }else{ + + // everything is empty + // lets just return a bang list + return [ + [ + "s" => "!w", + "n" => "Wikipedia", + "u" => "https://en.wikipedia.org/wiki/Special:Search?search={%q%}" + ], + [ + "s" => "!4ch", + "n" => "4chan Board", + "u" => "https://find.4chan.org/?q={%q%}" + ], + [ + "s" => "!a", + "n" => "Amazon", + "u" => "https://www.amazon.com/s?k={%q%}" + ], + [ + "s" => "!e", + "n" => "eBay", + "u" => "https://www.ebay.com/sch/items/?_nkw={%q%}" + ], + [ + "s" => "!so", + "n" => "Stack Overflow", + "u" => "http://stackoverflow.com/search?q={%q%}" + ], + [ + "s" => "!gh", + "n" => "GitHub", + "u" => "https://github.com/search?utf8=%E2%9C%93&q={%q%}" + ], + [ + "s" => "!tw", + "n" => "Twitter", + "u" => "https://twitter.com/search?q={%q%}" + ], + [ + "s" => "!r", + "n" => "Reddit", + "u" => "https://www.reddit.com/search?q={%q%}" + ], + ]; + } + } + + // now we know search isnt empty + if(!empty($bang)){ + + // check if the bang exists + $conn = pg_connect("host=localhost dbname=4get user=postgres password=postgres"); + + pg_prepare($conn, "bang_get_single", "SELECT bang,name FROM bangs WHERE bang = $1 LIMIT 1"); + $q = pg_execute($conn, "bang_get_single", [$bang]); + + $row = pg_fetch_array($q, null, PGSQL_ASSOC); + + if(isset($row["bang"])){ + + $bang = "!$bang "; + }else{ + + $bang = ""; + } + } + + try{ + $res = $this->get( + "https://duckduckgo.com/ac/", + [ + "q" => strtolower($search) + ], + ddg::req_xhr + ); + + $res = json_decode($res, true); + + }catch(Exception $e){ + + throw new Exception("Failed to get /ac/"); + } + + $arr = []; + for($i=0; $i $res[$i]["phrase"] + ]; + }else{ + + $arr[] = [ + "s" => $bang . $res[$i]["phrase"], + "n" => $row["name"] + ]; + } + } + + return $arr; + } -- cgit v1.2.3