aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/wiby.php
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2025-08-11 01:55:15 +0000
committerlolcat <will@lolcat.ca>2025-08-11 01:55:15 +0000
commitcdf958d29333d448f4521f4d2faa2592b58e9b27 (patch)
tree528f2a0ffa789a6f4279d9f54a4a2aaf391f390f /scraper/wiby.php
downloadshittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.gz
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.bz2
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.zip
fix wikipedia crashgrafted
Diffstat (limited to 'scraper/wiby.php')
-rw-r--r--scraper/wiby.php246
1 files changed, 246 insertions, 0 deletions
diff --git a/scraper/wiby.php b/scraper/wiby.php
new file mode 100644
index 0000000..59f723c
--- /dev/null
+++ b/scraper/wiby.php
@@ -0,0 +1,246 @@
+<?php
+
+class wiby{
+
+ public function __construct(){
+
+ include "lib/backend.php";
+ $this->backend = new backend("wiby");
+ }
+
+ public function getfilters($page){
+
+ if($page != "web"){
+
+ return [];
+ }
+
+ return [
+ "nsfw" => [
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes",
+ "no" => "No"
+ ]
+ ],
+ "date" => [
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "day" => "Past day",
+ "week" => "Past week",
+ "month" => "Past month",
+ "year" => "Past year",
+ ]
+ ]
+ ];
+ }
+
+ private function get($proxy, $url, $get = [], $nsfw){
+
+ $curlproc = curl_init();
+
+ if($get !== []){
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
+
+ curl_setopt($curlproc, CURLOPT_URL, $url);
+
+ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "Cookie: ws={$nsfw}",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Upgrade-Insecure-Requests: 1",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: none",
+ "Sec-Fetch-User: ?1"]
+ );
+
+ curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+ curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+ $this->backend->assign_proxy($curlproc, $proxy);
+
+ $data = curl_exec($curlproc);
+
+ if(curl_errno($curlproc)){
+
+ throw new Exception(curl_error($curlproc));
+ }
+
+ curl_close($curlproc);
+ return $data;
+ }
+
+ public function web($get){
+
+ if($get["npt"]){
+
+ [$q, $proxy] = $this->backend->get($get["npt"], "web");
+ $q = json_decode($q, true);
+
+ $nsfw = $q["nsfw"];
+ unset($q["nsfw"]);
+ }else{
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ $proxy = $this->backend->get_ip();
+ $date = $get["date"];
+ $nsfw = $get["nsfw"] == "yes" ? "0" : "1";
+
+ $search =
+ str_replace(
+ [
+ "!g",
+ "!gi",
+ "!gv",
+ "!gm",
+ "!b",
+ "!bi",
+ "!bv",
+ "!bm",
+ "!td",
+ "!tw",
+ "!tm",
+ "!ty",
+ "&g",
+ "&gi",
+ "&gv",
+ "&gm",
+ "&b",
+ "&bi",
+ "&bv",
+ "&bm",
+ "&td",
+ "&tw",
+ "&tm",
+ "&ty",
+ ],
+ "",
+ $search
+ );
+
+ switch($date){
+
+ case "day": $search = "!td " . $search; break;
+ case "week": $search = "!tw " . $search; break;
+ case "month": $search = "!tm " . $search; break;
+ case "year": $search = "!ty " . $search; break;
+ }
+
+ $q = [
+ "q" => $search
+ ];
+ }
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://wiby.me/",
+ $q,
+ $nsfw
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+
+ preg_match(
+ '/<p class="pin"><blockquote>(?:<\/p>)?<br><a class="more" href="\/\?q=[^"]+&p=([0-9]+)">Find more\.\.\.<\/a><\/blockquote>/',
+ $html,
+ $nextpage
+ );
+
+ if(count($nextpage) === 0){
+
+ $nextpage = null;
+ }else{
+
+ $nextpage =
+ $this->backend->store(
+ json_encode([
+ "q" => $q["q"],
+ "p" => (int)$nextpage[1],
+ "nsfw" => $nsfw
+ ]),
+ "web",
+ $proxy
+ );
+ }
+
+ $out = [
+ "status" => "ok",
+ "spelling" => [
+ "type" => "no_correction",
+ "using" => null,
+ "correction" => null
+ ],
+ "npt" => $nextpage,
+ "answer" => [],
+ "web" => [],
+ "image" => [],
+ "video" => [],
+ "news" => [],
+ "related" => []
+ ];
+
+ preg_match_all(
+ '/<blockquote>[\s]*<a .* href="(.*)">(.*)<\/a>.*<p>(.*)<\/p>[\s]*<\/blockquote>/Ui',
+ $html,
+ $links
+ );
+
+ for($i=0; $i<count($links[0]); $i++){
+
+ $out["web"][] = [
+ "title" => $this->unescapehtml(trim($links[2][$i])),
+ "description" => $this->unescapehtml(trim(strip_tags($links[3][$i]), ".\n\r ")),
+ "url" => trim($links[1][$i]),
+ "date" => null,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => [],
+ "table" => []
+ ];
+ }
+
+ return $out;
+ }
+
+ private function unescapehtml($str){
+
+ return html_entity_decode(
+ str_replace(
+ [
+ "<br>",
+ "<br/>",
+ "</br>",
+ "<BR>",
+ "<BR/>",
+ "</BR>",
+ ],
+ "\n",
+ $str
+ ),
+ ENT_QUOTES | ENT_XML1, 'UTF-8'
+ );
+ }
+}