diff options
Diffstat (limited to 'scraper')
32 files changed, 27492 insertions, 0 deletions
/**
 * Proxy picker and next-page-token store loaded from lib/backend.php.
 * Used in this class through get_ip(), get(), store() and assign_proxy().
 */
public $backend;

/** HTML parser loaded from lib/fuckhtml.php; shared by every parse routine. */
public $fuckhtml;

/** Pending redirect-probe cURL handles, keyed [category][item index][sublink index]. */
public $handles;

/** cURL multi handle; null until resolve_urls() creates one. */
public $proc;

/** Result category ("web", "video", ...) currently being queued by resolve_urls(). */
public $handle_category;

/** Index of the result currently being queued inside its category. */
public $handle_increment;

/** 0 for a result's own URL, 1..n for its sublinks. */
public $sublink_increment;

/** Cookie header value ("a=1; b=2") carried between requests, or null if none yet. */
public $cookie;

/**
 * Load the helper libraries and reset all per-request state.
 *
 * The members are declared explicitly (instead of being created on first
 * assignment) because dynamic properties are deprecated as of PHP 8.2.
 * They stay public, which is the visibility dynamic properties had.
 */
public function __construct(){

	// include_once: other scrapers load the same libraries, and a second
	// plain include would redeclare the classes and abort with a fatal error
	include_once "lib/backend.php";
	$this->backend = new backend("baidu");

	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();

	$this->handles = [];
	$this->proc = null;
	$this->handle_category = null;
	$this->handle_increment = 0;
	$this->sublink_increment = 0;

	$this->cookie = null;
}
/**
 * Perform one blocking GET request and return the response body.
 *
 * Cookies received through Set-Cookie are merged into $this->cookie so that
 * later requests (and stored next-page tokens) carry them along.
 *
 * @param mixed        $proxy   Proxy descriptor handed to backend::assign_proxy()
 * @param string       $url     URL without query string
 * @param array        $get     Query parameters; [] sends the URL untouched
 * @param string|false $referer false => browser-navigation headers,
 *                              string => XHR-style headers with this Referer
 * @return string Response body
 * @throws Exception With the cURL error message when the transfer fails
 */
private function get($proxy, $url, $get = [], $referer = false){

	$curlproc = curl_init();

	if($get !== []){

		$url .= "?" . http_build_query($get);
	}

	// collect Set-Cookie name/value pairs while the response headers stream in
	$cookies_tmp = [];
	curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){

		// cURL expects the callback to return the number of bytes it consumed
		$length = strlen($header);

		$header = explode(":", $header, 2);

		if(
			count($header) === 2 &&
			trim(strtolower($header[0])) == "set-cookie"
		){

			$cookie_tmp = explode("=", trim($header[1]), 2);

			// keep the value only, drop attributes (Path, Expires, ...)
			$cookies_tmp[trim($cookie_tmp[0])] =
				explode(";", $cookie_tmp[1] ?? "", 2)[0];
		}

		return $length;
	});

	curl_setopt($curlproc, CURLOPT_URL, $url);

	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

	//
	// Build the header list once; the two request flavours only differ
	// in Accept, Referer and the Sec-Fetch-* / Priority tail
	//
	$is_document = $referer === false;

	$headers = ["User-Agent: " . config::USER_AGENT];

	$headers[] =
		$is_document ?
		"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" :
		"Accept: application/json, text/plain, */*";

	$headers[] = "Accept-Language: en-US,en;q=0.5";
	$headers[] = "Accept-Encoding: gzip, deflate, br, zstd";

	if(!$is_document){

		$headers[] = "Referer: {$referer}";
	}

	$headers[] = "DNT: 1";
	$headers[] = "Sec-GPC: 1";
	$headers[] = "Connection: keep-alive";

	// an empty cookie string carries no information, so skip the header
	if(
		$this->cookie !== null &&
		$this->cookie !== ""
	){

		$headers[] = "Cookie: {$this->cookie}";
	}

	$headers[] = "Upgrade-Insecure-Requests: 1";

	if($is_document){

		$headers[] = "Sec-Fetch-Dest: document";
		$headers[] = "Sec-Fetch-Mode: navigate";
		$headers[] = "Sec-Fetch-Site: cross-site";
		$headers[] = "Priority: u=0, i";
	}else{

		$headers[] = "Sec-Fetch-Dest: empty";
		$headers[] = "Sec-Fetch-Mode: cors";
		$headers[] = "Sec-Fetch-Site: same-origin";
	}

	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

	$this->backend->assign_proxy($curlproc, $proxy);

	$data = curl_exec($curlproc);

	if(curl_errno($curlproc)){

		// read the message before releasing the handle
		$error = curl_error($curlproc);
		curl_close($curlproc);

		throw new Exception($error);
	}

	curl_close($curlproc);

	//
	// Store cookies: merge by name so a cookie that is set again replaces
	// its old value instead of being appended a second time
	//
	$jar = [];

	foreach(explode(";", (string)$this->cookie) as $pair){

		$pair = explode("=", trim($pair), 2);

		if($pair[0] === ""){

			continue;
		}

		$jar[$pair[0]] = $pair[1] ?? "";
	}

	foreach($cookies_tmp as $cookie_name => $cookie_value){

		$jar[$cookie_name] = $cookie_value;
	}

	$pairs = [];
	foreach($jar as $cookie_name => $cookie_value){

		$pairs[] = $cookie_name . "=" . $cookie_value;
	}

	$this->cookie = implode("; ", $pairs);

	return $data;
}
$this->sublink_increment++; + $this->redirect_add_url($proxy, $sublink["url"]); + } + } + + $this->handle_increment++; + } + } + + do{ + $status = curl_multi_exec($this->proc, $active); + + }while($active && $status == CURLM_OK); + + // + // if we reach this, we're done downloading garbage + // + + foreach($this->handles as $category => $v){ + + foreach($v as $index => $data){ + + foreach($this->handles[$category][$index] as $sublinkindex => $handle){ + + preg_match( + '/location: ?(.*)$/im', + curl_multi_getcontent($handle), + $location + ); + + if(isset($location[1])){ + + if($sublinkindex === 0){ + + $collection[$category][$index]["url"] = trim($location[1]); + }else{ + + $collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]); + } + } + + curl_multi_remove_handle($this->proc, $handle); + curl_close($handle); + } + } + } + + curl_multi_close($this->proc); + } + + private function resolve_images($proxy, &$data){ + + // get the image viewer that contains all of the images direct URLs + // for some reason, getting the second image's url in the set + // doesnt trigger the captcha + + if( + !isset($data["image"][1]["url"]) || + preg_match( + '/^https:\/\/image\.baidu\.com\/search\/detail/', + $data["image"][1]["url"] + ) === 0 + ){ + + // we have an already resolved image link, do nothing + return; + } + + try{ + + $html = + $this->get( + $proxy, + $data["image"][1]["url"], + [] + ); + }catch(Exception $error){ + + // fallback to the limited dataset we have + return; + } + + $this->fuckhtml->load($html); + + $script = + $this->fuckhtml + ->getElementById( + "image-detail-data", + "script" + ); + + if($script){ + + $json = + json_decode( + $script["innerHTML"], + true + ); + + if( + !isset($json["data"]["images"]) || + count($json["data"]["images"]) === 0 + ){ + + // do nothing + return; + } + + // + // Discard all previously scraped images and use data + // from the newly downloaded image carousel + // the imageset !!should!! 
/**
 * Run a Baidu web search, or fetch the next page of a previous one.
 *
 * @param array $get Request parameters: "npt" (next-page token or falsy),
 *                   "s" (query), "older"/"newer" (timestamps or false)
 * @return array Parsed result set as produced by parse_search()
 * @throws Exception When the page cannot be fetched or the token is corrupt
 */
public function web($get){

	if($get["npt"]){

		//
		// Next page: restore the request and cookies saved by parse_search()
		//
		[$json, $proxy] = $this->backend->get($get["npt"], "web");

		$json = json_decode($json, true);

		if(
			!isset($json["req"]) ||
			!is_array($json["req"])
		){

			throw new Exception("Failed to decode next page token");
		}

		$this->cookie = $json["cookie"] ?? null;
		$npt_data = $json["req"];

		// results are requested 20 at a time
		$npt_data["pn"] = $npt_data["pn"] + 20;

	}else{

		//
		// First page
		//
		$proxy = $this->backend->get_ip();

		// @TODO fetching https://www.baidu.com first would populate
		// $this->cookie, but requests get blocked either way

		$npt_data = [
			"wd" => $get["s"],
			"rn" => 20
		];

		// &gpc=stf%3D0%2C1752638400|stftype%3D2
		// stf is "<range start>,<range end>": results NEWER than a date
		// start there, results OLDER than a date end there
		if(
			$get["older"] !== false ||
			$get["newer"] !== false
		){

			$range_start =
				$get["newer"] === false ?
				0 : (int)$get["newer"];

			$range_end =
				$get["older"] === false ?
				time() : (int)$get["older"];

			$npt_data["gpc"] = "stf={$range_start},{$range_end}|stftype=2";
		}
	}

	try{

		$html = $this->get(
			$proxy,
			"https://www.baidu.com/s",
			$npt_data
		);
	}catch(Exception $error){

		throw new Exception("Failed to fetch search page");
	}

	// the first page is requested without &pn; record offset 0 afterwards
	// so the stored token can be incremented for page two
	if(!isset($npt_data["pn"])){

		$npt_data["pn"] = 0;
	}

	return $this->parse_search($proxy, "web", $npt_data, $html);
}
foreach($datafields as $datafield){ + + if( + !isset($datafield["attributes"]["id"]) || + preg_match( + '/^[0-9]+$/', + $datafield["attributes"]["id"] + ) === 0 + ){ + + // not a search result + continue; + } + + $this->fuckhtml->load($datafield); + $div = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + // + // Don't parse as a search result if it's a card + // + $card = + $this->fuckhtml + ->getElementsByClassName( + "cosc-card", + $div + ); + + if(count($card) !== 0){ + + // + // Parse chinese youtube shorts + // + $ytshorts_probe = + $this->fuckhtml + ->getElementsByClassName( + "tts-b-item", + $div + ); + + if(count($ytshorts_probe) !== 0){ + + $videos = + $this->fuckhtml + ->getElementsByAttributeValue( + "data-show", + "list", + $div + ); + + foreach($videos as $video){ + + $this->fuckhtml->load($video); + + $title = + $this->fuckhtml + ->getElementsByClassName( + "cosc-title-slot", + "span" + ); + + if(count($title) === 0){ + + continue; + } + + $url = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($url) === 0){ + + continue; + } + + $image = + $this->fuckhtml + ->getElementsByClassName( + "cos-image-body", + "img" + ); + + if(count($image) === 0){ + + $image = [ + "ratio" => null, + "url" => null + ]; + }else{ + + $image = [ + "ratio" => "1:1", + "url" => + $this->fuckhtml + ->getTextContent( + $image[0]["attributes"]["src"] + ) + ]; + } + + // get duration + $divs = + $this->fuckhtml + ->getElementsByAttributeName( + "class", + "div" + ); + + $duration = null; + foreach($divs as $probe){ + + if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){ + + $duration = + $this->hms2int( + $this->fuckhtml + ->getTextContent( + $probe + ) + ); + break; + } + } + + $out["video"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $title[0] + ), + "description" => null, + "date" => null, + "duration" => $duration, + "views" => null, + "thumb" => $image, + "url" => + $this->fuckhtml + ->getTextContent( + 
$url[0]["attributes"]["href"] + ) + ]; + } + } + + // + // Parse image carousel + // + $is_image_carousel = false; + foreach($div as $d){ + + if( + isset($d["attributes"]["class"]) && + strpos($d["attributes"]["class"], "image-container") !== false + ){ + + $is_image_carousel = true; + break; + } + } + + if($is_image_carousel){ + + preg_match( + '/<!--s-data:([\S\s]*)-->/U', + $datafield["innerHTML"], + $matches + ); + + if(isset($matches[1])){ + + // weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong + $json = + $this->fuckhtml + ->parseJsObject( + str_replace( + "-\-", + "--", + $matches[1] + ) + ); + + if( + $json !== null && + isset($json["imageList"][0]["images"]) + ){ + + // parse image carousel + foreach($json["imageList"][0]["images"] as $image){ + + parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size); + + $out["image"][] = [ + "title" => "image", + "source" => [ + [ + "url" => $image["objurl"], + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ // thumbnail + "url" => $image["thumburl"], + "width" => (int)$thumb_size["w"], + "height" => (int)$thumb_size["h"] + ] + ], + "url" => $image["jumpUrl"] + ]; + } + } + } + } + continue; + } + + if(!isset($datafield["attributes"]["mu"])){ + + // dont scrape if we dont have the direct link + continue; + } + + // class:FYB_RD -> News garbage, IGNORE + + $result = + $this->fuckhtml + ->getElementsByClassName( + "result", + [$datafield] + ); + + if(count($result) !== 0){ + + // + // Parse normal search result + // + + $title = + $this->fuckhtml + ->getElementsByClassName( + "sc-link", + "a" + ); + + if(count($title) === 0){ + + // should not happen + continue; + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + + $description = + $this->fuckhtml + ->getElementsByClassName( + "c-color", + $div + ); + + if(count($description) !== 0){ + + $this->fuckhtml->load($description[0]); + + 
$description = + $this->fuckhtml + ->getElementsByAttributeName( + "class", + "span" + ); + + $found_desc = false; + foreach($description as $desc){ + + if(stripos($desc["attributes"]["class"], "summary-text") !== false){ + + $found_desc = true; + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $desc + ) + ); + break; + } + } + + if($found_desc === false){ + + $description = null; + } + + $this->fuckhtml->load($datafield); + }else{ + + $description = null; + } + + // parse date + $date_probe = + $this->fuckhtml + ->getElementsByClassName( + "cos-color-text-minor", + "span" + ); + + if(count($date_probe) !== 0){ + + $date = + $this->parse_time( + $this->fuckhtml + ->getTextContent( + $date_probe[0] + ) + ); + }else{ + + $date = null; + } + + // parse image + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if(count($img) !== 0){ + + $image = [ + "ratio" => "16:9", + "url" => + $this->unfuckthumb( + $this->fuckhtml + ->getTextContent( + $img[0]["attributes"]["src"] + ) + ) + ]; + }else{ + + $image = [ + "ratio" => null, + "url" => null + ]; + } + + // get page type + $pagetype_probe = + $this->fuckhtml + ->getElementsByTagName( + "b" + ); + + $pagetype = "web"; + foreach($pagetype_probe as $probe){ + + $pagetype = + strtolower( + trim( + $this->fuckhtml + ->getTextContent( + $probe + ), + " 【】" + ) + ); + } + + // get extra links + $sublinks = []; + + foreach($div as $d){ + + if( + isset($d["attributes"]["class"]) && + strpos($d["attributes"]["class"], "exta-link") !== false + ){ + + $this->fuckhtml->load($d); + + $links = + $this->fuckhtml + ->getElementsByClassName( + "cos-space-mt-xs", + "div" + ); + + foreach($links as $link){ + + $this->fuckhtml->load($link); + $s_title = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($s_title) === 0){ + + // should not happen + continue; + } + + $data2 = + json_decode( + $this->fuckhtml + ->getTextContent( + $s_title[0]["attributes"]["data-click"] + ), + true + 
); + + if(!isset($data2["clk_info"])){ + + // wtf + continue; + } + + $data2 = + json_decode( + $data2["clk_info"], + true + ); + + if(!isset($data2["url"])){ + + // no link, fuck off + continue; + } + + $url = + rawurldecode( + $data2["url"] + ); + + $data = + $this->fuckhtml + ->getElementsByTagName( + "p" + ); + + $s_description = null; + + if(count($data) !== 0){ + + $data = + json_decode( + $this->fuckhtml + ->getTextContent( + $data[0]["attributes"]["sub-show-log"] + ), + true + ); + + if(isset($data["ext"]["content"])){ + + $s_description = $data["ext"]["content"]; + } + } + + $sublinks[] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $s_title[0] + ), + "description" => $s_description, + "url" => $url, + "date" => null + ]; + } + break; + } + } + + $out["web"][] = [ + "title" => $title, + "description" => $description, + "url" => + $this->fuckhtml + ->getTextContent( + $datafield["attributes"]["mu"] + ), + "date" => $date, + "type" => $pagetype, + "thumb" => $image, + "sublink" => $sublinks, + "table" => [] + ]; + + continue; + } + + // parse special result + $result = + $this->fuckhtml + ->getElementsByClassName( + "result-op", + [$datafield] + ); + + if(count($result) !== 0){ + + // + // Parse video carousel + // + if( + isset($datafield["attributes"]["tpl"]) && + stripos($datafield["attributes"]["tpl"], "video") !== false + ){ + + preg_match( + '/<!--s-data:([\S\s]*)-->/U', + $datafield["innerHTML"], + $matches + ); + + if(isset($matches[1])){ + + $json = + json_decode( + $matches[1], + true + ); + + if($json !== null){ + + foreach($json["videoList"] as $video){ + + $out["video"][] = [ + "title" => $video["title"], + "description" => + $this->titledots( + $video["desc"] + ), + "date" => + $this->parse_time( + $video["pubTime"] + ), + "duration" => + $this->hms2int( + $video["duration"] + ), + "views" => + $this->parse_viewcount( + $video["playCount"] + ), + "thumb" => [ + "ratio" => "16:9", + "url" => $video["poster"] + ], + "url" => 
$video["bindProps"]["link"] + ]; + } + } + } + continue; + } + + // + // Special result div (wiki entries, rich divs) + // + $title = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($title) === 0){ + + // should have a title somewhere + continue; + } + + $title = + explode( + ">", + $this->fuckhtml + ->getTextContent( + $title[0] + ), + 2 + ); + + if(count($title) === 2){ + + $title = $title[1]; + }else{ + + $title = $title[0]; + } + + // probe for wiki-like entry + $description = + $this->fuckhtml + ->getElementsByClassName( + "sc-paragraph", + "p" + ); + + if(count($description) === 0){ + + // try and get grey description + $description = + $this->fuckhtml + ->getElementsByClassName( + "c-color-gray2", + "p" + ); + + if(count($description) === 0){ + + // probe for special social media description + $description = + $this->fuckhtml + ->getElementsByClassName( + "c-color-text", + "div" + ); + + if(isset($description[0]["attributes"]["aria-label"])){ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ["attributes"] + ["aria-label"] + ); + }else{ + + // check for news tab description + $span = + $this->fuckhtml + ->getElementsByClassName( + "c-font-normal", + "span" + ); + + $description = null; + + foreach($span as $s){ + + if(isset($s["attributes"]["aria-label"])){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $span[count($span) - 1] + ) + ); + + break; + } + } + } + }else{ + + $description = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + } + + }else{ + + preg_match( + '/<!--s-text-->([\S\s]*)<!--\/s-text-->/U', + $description[count($description) - 1]["innerHTML"], + $matches + ); + + if(isset($matches[1])){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $matches[1] + ) + ); + }else{ + + $description = null; + } + } + + // get thumbnail + $thumb = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if(count($thumb) !== 0){ + + 
$thumb = [ + "ratio" => "1:1", + "url" => + $this->unfuckthumb( + $this->fuckhtml + ->getTextContent( + $thumb[0]["attributes"]["src"] + ) + ) + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + // get sublinks + preg_match( + '/<!--s-data:([\S\s]*)-->/U', + $datafield["innerHTML"], + $matches + ); + + $sublinks = []; + + if(isset($matches[1])){ + + $json = + json_decode( + $matches[1], + true + ); + + if($json !== null){ + + if(isset($json["buttons"])){ + + foreach($json["buttons"] as $button){ + + $sublinks[] = [ + "title" => $button["text"], + "description" => null, + "date" => null, + "url" => $button["url"] + ]; + } + }elseif(isset($json["mthreadList"])){ + + foreach($json["mthreadList"] as $thread){ + + $sublinks[] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $thread["title"] + ), + "description" => null, + "date" => null, + "url" => $thread["ttsInfo"]["titleUrl"] + ]; + } + } + } + } + + // get URL + // handle http://fakeurl.baidu.com bullshit + $url = + $this->fuckhtml + ->getTextContent( + $datafield["attributes"]["mu"] + ); + + if( + preg_match( + '/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/', + $url + ) + ){ + + // we got some bullshit, get jumpUrl instead + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) !== 0){ + + $url = + $this->fuckhtml + ->getTextContent( + $as[0]["attributes"]["href"] + ); + } + } + + // get xueshu sublinks + // get list + $xueshu_list = + $this->fuckhtml + ->getElementsByClassName( + "op-xueshu-links-d20-list", + $div + ); + + if(count($xueshu_list) !== 0){ + + $this->fuckhtml->load($xueshu_list[0]); + + $rows = + $this->fuckhtml + ->getElementsByClassName( + "c-row", + "div" + ); + + // remove "read more" bullshit + foreach($rows as $row){ + + if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){ + + $xueshu_list[0]["innerHTML"] = + str_replace( + $row["outerHTML"], + "", + $xueshu_list[0]["innerHTML"] + ); + } + } + + 
/**
 * Run a Baidu image search, or fetch the next page of a previous one.
 *
 * Example endpoint:
 * https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1
 *
 * @param array $get Request parameters: "npt", "s", "sort", "size",
 *                   "ratio", "format", "color", "type"
 * @return array ["status" => "ok", "npt" => string|null, "image" => array]
 * @throws Exception On transport failure, captcha, undecodable JSON or an
 *                   error message returned by Baidu
 */
public function image($get){

	if($get["npt"]){

		[$params, $proxy] = $this->backend->get($get["npt"], "images");
		$params = json_decode($params, true);

		$params["pn"] = $params["pn"] + 60;

	}else{

		$proxy = $this->backend->get_ip();
		$params = [
			"word" => $get["s"],
			"rn" => 60, // results/page
			"pn" => 0, // item increment (0 * 60)
			"newReq" => 1 // without it the JSON comes back malformed
		];

		switch($get["sort"]){

			case "latest": $params["latest"] = 1; break;
			case "hot": $params["hot"] = 1; break;
		}

		if($get["size"] != "any"){

			$params["z"] = $get["size"];
		}

		if($get["ratio"] != "any"){

			$params["imgratio"] = $get["ratio"];
		}

		if($get["format"] != "any"){

			$params["imgformat"] = $get["format"];
		}

		if($get["color"] != "any"){

			$params["ic"] = $get["color"];
		}

		switch($get["type"]){

			case "hd": $params["hd"] = 1; break;
			case "isImgSet": $params["isImgSet"] = 1; break;
			case "copyright": $params["copyright"] = 1; break;
		}
	}

	try{

		// the referer is built from the stored query so that next-page
		// requests advertise the same search as the first page did
		$response =
			$this->get(
				$proxy,
				"https://image.baidu.com/search/acjson",
				$params,
				"https://image.baidu.com/search/index?tn=baiduimage&word=" . urlencode($params["word"])
			);
	}catch(Exception $error){

		throw new Exception("Failed to fetch JSON");
	}

	$json = json_decode($response, true);

	if($json === null){

		// detect captcha first: probe the RAW response, the decoded
		// value is null at this point and has nothing to inspect
		$this->fuckhtml->load($response);
		$this->detect_ass();

		// fallback to json decode error
		throw new Exception("Failed to decode JSON");
	}

	if(
		isset($json["message"]) &&
		$json["message"] != "success"
	){

		throw new Exception("Baidu returned an error: {$json["message"]}");
	}

	if(!isset($json["data"]["images"])){

		throw new Exception("Baidu did not return an image object");
	}

	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];

	foreach($json["data"]["images"] as $image){

		// thumbnail dimensions are carried in the thumbnail URL (&w=..&h=..)
		parse_str((string)parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);

		$out["image"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["titleShow"]
				),
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)$image["width"],
					"height" => (int)$image["height"]
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["fromUrl"]
		];
	}

	//
	// Detect if there's a next page
	//
	if((int)($json["data"]["totalNum"] ?? 0) >= $params["pn"] + 60){

		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"images",
				$proxy
			);
	}

	return $out;
}
json_decode($params, true); + + $params["pn"] = $params["pn"] + 10; + }else{ + + $proxy = $this->backend->get_ip(); + $params = [ + "pd" => "video", + "tn" => "vsearch", + "wd" => $get["s"], + "async" => 1, + "pn" => 0 + ]; + } + + try{ + $html = + $this->get( + $proxy, + "https://www.baidu.com/sf/vsearch", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get search page"); + } + + $html = + str_replace( + ["\r", "\n"], + "", + $html + ); + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + $html = explode("<script>", $html); + + foreach($html as $result){ + + $result = trim($result); + + $this->fuckhtml->load($result); + + // get URL + preg_match( + '/<!-- *([^ ]*) *-->/', + $result, + $matches + ); + + if(!isset($matches[1])){ + + // no link, give up + continue; + } + + $link = $matches[1]; + + // get title + $title = + $this->fuckhtml + ->getElementsByClassName( + "video-title", + "a" + ); + + if(count($title) === 0){ + + // should not happen + continue; + } + + $title = + $this->fuckhtml + ->getTextContent( + $title[0] + ); + + // get thumbnail + $img = + $this->fuckhtml + ->getElementsByClassName( + "border-radius", + "img" + ); + + if(count($img) !== 0){ + + $thumb = [ + "url" => + $this->unfuckthumb( + $this->fuckhtml + ->getTextContent( + $img[0]["attributes"]["src"] + ) + ), + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $span = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + // get duration + $duration = + $this->fuckhtml + ->getElementsByClassName( + "video_play_timer", + $span + ); + + if(count($duration) !== 0){ + + $duration = + $this->hms2int( + $this->fuckhtml + ->getTextContent( + $duration[0] + ) + ); + }else{ + + $duration = null; + } + + // get author + // 来源:哔哩哔哩 + $author = + $this->fuckhtml + ->getElementsByClassName( + "wetSource", + $span + ); + + 
if(count($author) !== 0){ + + $author = + explode( + ":", + $this->fuckhtml + ->getTextContent( + $author[0] + ), + 2 + )[1]; + }else{ + + $author = null; + } + + // get date posted + //发布时间:2024-05-06 + + // AND get description + // 简介:Our first look + $infospans = + array_merge( + $this->fuckhtml + ->getElementsByClassName( + "c-font-normal", + $span + ), + $this->fuckhtml + ->getElementsByClassName( + "c-font-normal", + "div" + ) + ); + + $date = null; + $description = null; + + foreach($infospans as $infospan){ + + $infospan = + explode( + ":", + $this->fuckhtml + ->getTextContent( + $infospan + ), + 2 + ); + + if(count($infospan) !== 2){ + + // should not happen + continue; + } + + $infospan[1] = + $this->fuckhtml + ->getTextContent( + $infospan[1] + ); + + switch($infospan[0]){ + + case "发布时间": // date posted + $date = $this->parse_time($infospan[1]); + break; + + case "简介": // description + $description = $infospan[1]; + break; + } + } + + $out["video"][] = [ + "title" => $this->titledots($title), + "description" => $this->titledots($description), + "author" => [ + "name" => $author, + "url" => null, + "avatar" => null + ], + "date" => $date, + "duration" => $duration, + "views" => null, + "thumb" => $thumb, + "url" => $link + ]; + } + + if(count($out["video"]) === 10){ + + // assume there's another page after this + $out["npt"] = + $this->backend->store( + json_encode($params), + "videos", + $proxy + ); + } + + return $out; + } + + public function news($get){ + + //$proxy = $this->backend->get_ip(); + //$html = file_get_contents("scraper/baidu.html"); + //$npt_data = []; + + if($get["npt"]){ + + [$json, $proxy] = $this->backend->get($get["npt"], "news"); + + $json = json_decode($json, true); + $this->cookie = $json["cookie"]; + $npt_data = $json["req"]; + + $npt_data["pn"] = $npt_data["pn"] + 20; + + try{ + + $html = $this->get( + $proxy, + "https://www.baidu.com/s", + $npt_data + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch 
/**
 * Strip leading and trailing whitespace, dots, NUL bytes and "…" from a title.
 *
 * @param string|null $title Text to clean; null is treated as an empty string.
 * @return string The trimmed text.
 */
private function titledots($title){

	$title = (string)$title;

	// trim() works on bytes: listing the 3-byte "…" in its mask strips the
	// single bytes 0xE2, 0x80 and 0xA6, which cuts other multibyte characters
	// in half (eg. a trailing "一", encoded E4 B8 80). Match whole code points
	// with a UTF-8 aware pattern instead.
	$trimmed =
		preg_replace(
			'/^[ .\t\n\r\0\x0B…]+|[ .\t\n\r\0\x0B…]+\z/u',
			"",
			$title
		);

	if($trimmed === null){

		// preg_replace() failed (eg. input is not valid UTF-8):
		// fall back to trimming the single-byte characters only
		return trim($title, " .\t\n\r\0\x0B");
	}

	return $trimmed;
}
/**
 * Convert a view counter string into an integer.
 *
 * Recognised forms:
 *   "3万次"   => 30000 (万 = 10 000)
 *   "1.5万次" => 15000
 *   "123次"   => 123
 *
 * @param string|null $views Raw counter text.
 * @return int|null View count, or null when no counter is found.
 */
private function parse_viewcount($views){

	$views = (string)$views;

	if(
		// tens of thousands; the fraction must be captured too, otherwise
		// "1.5万次" is read as "5万次" and returns 50000
		preg_match(
			'/([0-9]+(?:\.[0-9]+)?)万次/',
			$views,
			$matches
		)
	){

		return (int)round((float)$matches[1] * 10000);
	}

	if(
		// plain units
		preg_match(
			'/([0-9]+)次/',
			$views,
			$matches
		)
	){

		return (int)$matches[1];
	}

	return null;
}
/**
 * Describe the filters available for a search type.
 *
 * @param string $page One of "web", "images", "videos" or "news".
 * @return array|null Filter definitions keyed by filter name, or null for
 *                    any other page.
 */
public function getfilters($page){

	// regions offered on every page; the "all" entry is prepended below
	// because its label is capitalised differently on the web page
	$regions = [
		"ar" => "Argentina",
		"au" => "Australia",
		"at" => "Austria",
		"be" => "Belgium",
		"br" => "Brazil",
		"ca" => "Canada",
		"cl" => "Chile",
		"cn" => "China",
		"dk" => "Denmark",
		"fi" => "Finland",
		"fr" => "France",
		"de" => "Germany",
		"hk" => "Hong Kong",
		"in" => "India",
		"id" => "Indonesia",
		"it" => "Italy",
		"jp" => "Japan",
		"kr" => "Korea",
		"my" => "Malaysia",
		"mx" => "Mexico",
		"nl" => "Netherlands",
		"nz" => "New Zealand",
		"no" => "Norway",
		"pl" => "Poland",
		"pt" => "Portugal",
		"ph" => "Philippines",
		"ru" => "Russia",
		"sa" => "Saudi Arabia",
		"za" => "South Africa",
		"es" => "Spain",
		"se" => "Sweden",
		"ch" => "Switzerland",
		"tw" => "Taiwan",
		"tr" => "Turkey",
		"gb" => "United Kingdom",
		"us" => "United States"
	];

	$nsfw_filter = [
		"display" => "NSFW",
		"option" => [
			"yes" => "Yes",
			"maybe" => "Maybe",
			"no" => "No"
		]
	];

	$spellcheck_filter = [
		"display" => "Spellcheck",
		"option" => [
			"yes" => "Yes",
			"no" => "No"
		]
	];

	if($page == "web"){

		// web search additionally supports a date range
		return [
			"country" => [
				"display" => "Country",
				"option" => ["all" => "All Regions"] + $regions
			],
			"nsfw" => $nsfw_filter,
			"newer" => [
				"display" => "Newer than",
				"option" => "_DATE"
			],
			"older" => [
				"display" => "Older than",
				"option" => "_DATE"
			],
			"spellcheck" => $spellcheck_filter
		];
	}

	if(in_array($page, ["images", "videos", "news"])){

		return [
			"country" => [
				"display" => "Country",
				"option" => ["all" => "All regions"] + $regions
			],
			"nsfw" => $nsfw_filter,
			"spellcheck" => $spellcheck_filter
		];
	}

	return null;
}
/**
 * Scrape a Brave web search page (first page or a stored next page).
 *
 * @param array $get Request parameters. "npt" selects a stored next-page
 *                   request; otherwise "s", "nsfw", "country", "older",
 *                   "newer" and "spellcheck" are read. "older"/"newer" are
 *                   compared against false and passed to date().
 * @return array Result array with the keys status, spelling, npt, answer,
 *               web, image, video, news and related.
 * @throws Exception On an empty or oversized query, when the page cannot be
 *                   fetched, on a PoW captcha, or when no result object is
 *                   present in the page.
 */
public function web($get){

	if($get["npt"]){

		// next page: restore the stored request and its proxy
		[$q, $proxy] = $this->backend->get($get["npt"], "web");

		$q = json_decode($q, true);

		$search = $q["q"];
		$q["spellcheck"] = "0";

		// nsfw/country are handed to get() separately, not as query params
		$nsfw = $q["nsfw"];
		unset($q["nsfw"]);

		$country = $q["country"];
		unset($q["country"]);

	}else{

		// first page: build the request from the _GET data
		$search = $get["s"];
		if(strlen($search) === 0){

			throw new Exception("Search term is empty!");
		}

		if(strlen($search) > 2048){

			throw new Exception("Search term is too long!");
		}

		$proxy = $this->backend->get_ip();
		$nsfw = $get["nsfw"];
		$country = $get["country"];
		$older = $get["older"];
		$newer = $get["newer"];
		$spellcheck = $get["spellcheck"];

		$q = [
			"q" => $search
		];

		/*
			Pass older/newer filters to brave
		*/
		if($newer !== false){

			$newer = date("Y-m-d", $newer);

			if($older === false){

				// open ended range: stop at today
				$older = date("Y-m-d", time());
			}
		}

		if(
			is_string($older) === false &&
			$older !== false
		){

			$older = date("Y-m-d", $older);

			if($newer === false){

				// open ended range: start at the earliest date used here
				$newer = "1970-01-02";
			}
		}

		if($older !== false){

			$q["tf"] = "{$newer}to{$older}";
		}

		// spellcheck
		if($spellcheck == "no"){

			$q["spellcheck"] = "0";
		}
	}

	try{
		$html =
			$this->get(
				$proxy,
				"https://search.brave.com/search",
				$q,
				$nsfw,
				$country
			);

	}catch(Exception $error){

		throw new Exception("Could not fetch search page");
	}

	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];

	// load html
	$this->fuckhtml->load($html);

	/*
		Get next page "token"
	*/
	$pagination =
		$this->fuckhtml
		->getElementById(
			"pagination",
			"div"
		);

	if($pagination){

		$this->fuckhtml->load($pagination);

		$buttons =
			$this->fuckhtml
			->getElementsByClassName("button", "a");

		if(count($buttons) !== 0){

			// the "next" link is the last button
			$last_button = $buttons[count($buttons) - 1];

			if(
				strtolower(
					$this->fuckhtml
					->getTextContent(
						$last_button
					)
				) == "next" &&
				isset($last_button["attributes"]["href"]) &&
				// only store a token when the offset can really be read
				preg_match(
					'/offset=([0-9]+)/',
					$this->fuckhtml->getTextContent($last_button["attributes"]["href"]),
					$offset_match
				)
			){

				$q["offset"] = (int)$offset_match[1];
				$q["nsfw"] = $nsfw;
				$q["country"] = $country;

				$out["npt"] =
					$this->backend->store(
						json_encode($q),
						"web",
						$proxy
					);
			}
		}
	}

	// reload the full page and extract the embedded javascript object
	$this->fuckhtml->load($html);
	$data = $this->get_js();

	if(
		isset($data[2]["data"]["title"]) &&
		stripos($data[2]["data"]["title"], "PoW Captcha") !== false
	){

		throw new Exception("Brave returned a PoW captcha");
	}

	if(!isset($data[1]["data"]["body"]["response"])){

		throw new Exception("Brave did not return a result object");
	}

	$data = $data[1]["data"]["body"]["response"];

	/*
		Get web results
	*/
	if(!isset($data["web"]["results"])){

		return $out;
	}

	foreach($data["web"]["results"] as $result){

		if(
			isset($result["thumbnail"]) &&
			is_array($result["thumbnail"])
		){

			$thumb = [
				"ratio" => $result["thumbnail"]["logo"] == "false" ? "16:9" : "1:1",
				"url" => $result["thumbnail"]["original"]
			];
		}else{

			$thumb = [
				"ratio" => null,
				"url" => null
			];
		}

		// get sublinks
		$sublink = [];
		if(
			isset($result["cluster"]) &&
			is_array($result["cluster"])
		){

			foreach($result["cluster"] as $cluster){

				$sublink[] = [
					"title" => $this->titledots($cluster["title"]),
					"description" =>
						$this->titledots(
							$this->fuckhtml
							->getTextContent(
								$cluster["description"]
							)
						),
					"url" => $cluster["url"],
					"date" => null
				];
			}
		}

		// more sublinks
		if(
			isset($result["deep_results"]["buttons"]) &&
			is_array($result["deep_results"]["buttons"])
		){

			foreach($result["deep_results"]["buttons"] as $button){

				$sublink[] = [
					"title" => $this->titledots($button["title"]),
					"description" => null,
					"url" => $button["url"],
					"date" => null
				];
			}
		}

		// parse table elements
		$table = [];

		// product / creative_work share the same offers + rating layout
		$ref = $result["product"] ?? $result["creative_work"] ?? null;

		if($ref !== null){

			if(isset($ref["offers"])){

				foreach($ref["offers"] as $offer){

					$price = null;

					if(isset($offer["price"])){

						if((float)$offer["price"] == 0){

							$price = "Free";
						}else{

							$price = $offer["price"];
						}
					}

					// a currency without an amount is not a price
					if(
						$price !== null &&
						$price !== "Free" &&
						isset($offer["priceCurrency"])
					){

						$price .= " " . $offer["priceCurrency"];
					}

					if($price !== null){

						$table["Price"] = trim($price);
					}
				}
			}

			if(isset($ref["rating"])){

				$rating = null;
				if(isset($ref["rating"]["ratingValue"])){

					$rating = $ref["rating"]["ratingValue"];

					if(isset($ref["rating"]["bestRating"])){

						$rating .= "/" . $ref["rating"]["bestRating"];
					}
				}

				if(isset($ref["rating"]["reviewCount"])){

					// wrap the hit count in parentheses only when it
					// follows a rating value
					$has_value = $rating !== null;

					if($has_value){

						$rating .= " (";
					}

					$rating .= number_format($ref["rating"]["reviewCount"]) . " hits";

					if($has_value){

						$rating .= ")";
					}
				}

				if($rating !== null){

					$table["Rating"] = $rating;
				}
			}
		}

		// review
		if(
			isset($result["review"]) &&
			is_array($result["review"])
		){

			if(isset($result["review"]["rating"]["ratingValue"])){

				$table["Rating"] =
					$result["review"]["rating"]["ratingValue"] . "/" .
					$result["review"]["rating"]["bestRating"];
			}
		}

		// software
		if(
			isset($result["software"]) &&
			is_array($result["software"])
		){

			if(isset($result["software"]["author"])){
				$table["Author"] = $result["software"]["author"];
			}

			if(isset($result["software"]["stars"])){
				$table["Stars"] = number_format($result["software"]["stars"]);
			}

			if(isset($result["software"]["forks"])){
				$table["Forks"] = number_format($result["software"]["forks"]);
			}

			if(
				isset($result["software"]["programmingLanguage"]) &&
				$result["software"]["programmingLanguage"] != ""
			){
				$table["Programming languages"] = $result["software"]["programmingLanguage"];
			}
		}

		// location
		if(
			isset($result["location"]) &&
			is_array($result["location"])
		){

			if(isset($result["location"]["postal_address"]["displayAddress"])){

				$table["Address"] = $result["location"]["postal_address"]["displayAddress"];
			}

			if(
				isset($result["location"]["rating"]) &&
				$result["location"]["rating"] != "void 0"
			){

				$table["Rating"] =
					$result["location"]["rating"]["ratingValue"] . "/" .
					$result["location"]["rating"]["bestRating"] . " (" .
					number_format($result["location"]["rating"]["reviewCount"]) . " votes)";
			}

			if(
				isset($result["location"]["contact"]["telephone"]) &&
				$result["location"]["contact"]["telephone"] != "void 0"
			){

				$table["Phone number"] =
					$result["location"]["contact"]["telephone"];
			}

			if(
				isset($result["location"]["price_range"]) &&
				$result["location"]["price_range"] != "void 0"
			){

				$table["Price"] =
					$result["location"]["price_range"];
			}
		}

		// video: copy every string field as-is
		if(
			isset($result["video"]) &&
			is_array($result["video"])
		){

			foreach($result["video"] as $key => $value){

				if(is_string($value) === false){

					continue;
				}

				$table[ucfirst($key)] = $value;
			}
		}

		// movie (guard on the "movie" key itself, not on "video")
		if(
			isset($result["movie"]) &&
			is_array($result["movie"])
		){

			$movie = $result["movie"];

			if(isset($movie["release"])){

				$table["Release date"] = $movie["release"];
			}

			if(
				isset($movie["directors"]) &&
				is_array($movie["directors"])
			){

				$directors = [];

				foreach($movie["directors"] as $director){

					$directors[] = $director["name"];
				}

				if(count($directors) !== 0){

					$table["Directors"] = implode(", ", $directors);
				}
			}

			if(
				isset($movie["actors"]) &&
				is_array($movie["actors"])
			){

				$actors = [];

				foreach($movie["actors"] as $actor){

					$actors[] = $actor["name"];
				}

				if(count($actors) !== 0){

					$table["Actors"] = implode(", ", $actors);
				}
			}

			if(
				isset($movie["rating"]) &&
				is_array($movie["rating"])
			){

				$table["Rating"] =
					$movie["rating"]["ratingValue"] . "/" .
					$movie["rating"]["bestRating"] . " (" .
					number_format($movie["rating"]["reviewCount"]) . " votes)";
			}

			if(isset($movie["duration"])){

				$table["Duration"] = $movie["duration"];
			}

			if(
				isset($movie["genre"]) &&
				is_array($movie["genre"]) &&
				count($movie["genre"]) !== 0
			){

				$table["Genre"] = implode(", ", $movie["genre"]);
			}
		}

		if(
			isset($result["age"]) &&
			$result["age"] != "void 0" &&
			$result["age"] != ""
		){

			$date = strtotime($result["age"]);
		}else{

			$date = null;
		}

		$out["web"][] = [
			"title" =>
				$this->titledots(
					$result["title"]
				),
			"description" =>
				isset($result["review"]["description"]) ?
				$this->limitstrlen(
					strip_tags(
						$result["review"]["description"]
					)
				) :
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$result["description"]
					)
				),
			"url" => $result["url"],
			"date" => $date,
			"type" => "web",
			"thumb" => $thumb,
			"sublink" => $sublink,
			"table" => $table
		];
	}

	/*
		Get spelling autocorrect
	*/
	if(
		isset($data["query"]["bo_altered_diff"][0][0]) &&
		$data["query"]["bo_altered_diff"][0][0] == "true"
	){
		$using = [];

		foreach($data["query"]["bo_altered_diff"] as $diff){

			$using[] = $diff[1];
		}

		$out["spelling"] = [
			"type" => "including",
			"using" => implode(" ", $using),
			// $search is set on both the first-page and next-page paths
			"correction" => $search
		];
	}

	/*
		Get wikipedia heads
		Only the first infobox is used, we get garbage most of the time
	*/
	if(isset($data["infobox"]["results"][0])){

		$info = $data["infobox"]["results"][0];

		if($info["subtype"] == "code"){

			$description =
				$this->stackoverflow_parse($info["data"]["answer"]["text"]);

			if(isset($info["data"]["answer"]["author"])){

				$description[] = [
					"type" => "quote",
					"value" => "Answer from " . $info["data"]["answer"]["author"]
				];
			}
		}else{

			$description = [];

			if(
				isset($info["description"]) &&
				$info["description"] != ""
			){
				$description[] = [
					"type" => "quote",
					"value" => $info["description"]
				];
			}

			if(
				isset($info["long_desc"]) &&
				$info["long_desc"] != ""
			){
				$description[] = [
					"type" => "text",
					"value" => $this->titledots($info["long_desc"])
				];
			}

			// parse ratings
			if(
				isset($info["ratings"]) &&
				is_array($info["ratings"]) &&
				count($info["ratings"]) !== 0
			){

				$description[] = [
					"type" => "title",
					"value" => "Ratings"
				];

				foreach($info["ratings"] as $rating){

					$description[] = [
						"type" => "link",
						"url" => $rating["profile"]["url"],
						"value" => $rating["profile"]["name"]
					];

					$description[] = [
						"type" => "text",
						"value" => ": " . $rating["ratingValue"] . "/" . $rating["bestRating"] . "\n"
					];
				}
			}
		}

		$table = [];
		if(
			isset($info["attributes"]) &&
			is_array($info["attributes"])
		){

			foreach($info["attributes"] as $row){

				if($row[1] == "null"){

					if(count($table) !== 0){

						// a "null" row after real rows ends the table
						break;
					}

					// leading "null" rows are skipped
					continue;
				}

				$table[
					$this->fuckhtml->getTextContent($row[0])
				] =
					$this->fuckhtml->getTextContent($row[1]);
			}
		}

		$sublink = [];
		if(
			isset($info["profiles"]) &&
			is_array($info["profiles"])
		){

			foreach($info["profiles"] as $row){

				$name = $this->fuckhtml->getTextContent($row["name"]);

				if(strtolower($name) == "steampowered"){

					$name = "Steam";
				}

				$sublink[
					$this->fuckhtml->getTextContent($name)
				] =
					$this->fuckhtml->getTextContent($row["url"]);
			}
		}

		$out["answer"][] = [
			"title" => $this->fuckhtml->getTextContent($info["title"]),
			"description" => $description,
			"url" => $info["url"],
			"thumb" => isset($info["images"][0]["original"]) ? $info["images"][0]["original"] : null,
			"table" => $table,
			"sublink" => $sublink
		];
	}

	/*
		Get videos
	*/
	if(isset($data["videos"]["results"])){

		foreach($data["videos"]["results"] as $video){

			$out["video"][] = [
				"title" => $this->titledots($video["title"]),
				"description" => $this->titledots($video["description"]),
				"date" => isset($video["age"]) && $video["age"] != "void 0" ? strtotime($video["age"]) : null,
				"duration" => isset($video["video"]["duration"]) && $video["video"]["duration"] != "void 0" ? $this->hms2int($video["video"]["duration"]) : null,
				"views" => isset($video["video"]["views"]) && $video["video"]["views"] != "void 0" ? (int)$video["video"]["views"] : null,
				"thumb" =>
					isset($video["thumbnail"]["src"]) ?
					[
						"ratio" => "16:9",
						"url" => $this->unshiturl($video["thumbnail"]["src"])
					] :
					[
						"ratio" => null,
						"url" => null
					],
				"url" => $video["url"]
			];
		}
	}

	/*
		Get news
	*/
	if(isset($data["news"]["results"])){

		foreach($data["news"]["results"] as $news){

			$out["news"][] = [
				"title" => $this->titledots($news["title"]),
				"description" => $this->titledots($news["description"]),
				"date" => isset($news["age"]) ? strtotime($news["age"]) : null,
				// read the thumbnail of this article, not the leftover
				// $video variable of the loop above
				"thumb" =>
					isset($news["thumbnail"]["src"]) ?
					[
						"ratio" => "16:9",
						"url" => $this->unshiturl($news["thumbnail"]["src"])
					] :
					[
						"ratio" => null,
						"url" => null
					],
				"url" => $news["url"]
			];
		}
	}

	/*
		Get discussions
	*/
	$disc_out = [];

	if(isset($data["discussions"]["results"])){

		foreach($data["discussions"]["results"] as $disc){

			$table = [];

			if(isset($disc["data"]["num_votes"])){

				$table["Votes"] = number_format($disc["data"]["num_votes"]);
			}

			if(isset($disc["data"]["num_answers"])){

				$table["Comments"] = number_format($disc["data"]["num_answers"]);
			}

			$disc_out[] = [
				"title" =>
					$this->titledots(
						$disc["title"]
					),
				"description" =>
					$this->limitstrlen(
						$this->titledots(
							$this->fuckhtml
							->getTextContent(
								$disc["description"]
							)
						)
					),
				"url" => $disc["url"],
				"date" => isset($disc["age"]) ? strtotime($disc["age"]) : null,
				"type" => "web",
				"thumb" => [
					"ratio" => null,
					"url" => null
				],
				"sublink" => [],
				"table" => $table
			];
		}
	}

	// insert discussions right after the first web result
	array_splice($out["web"], 1, 0, $disc_out);

	return $out;
}
/**
 * Scrape a Brave image search page.
 *
 * @param array $get Request parameters; reads "s", "country", "nsfw" and
 *                   "spellcheck".
 * @return array Result array with the keys status, npt (always null, there
 *               is no next page handling here) and image.
 * @throws Exception On an empty or oversized query, or when the page cannot
 *                   be fetched.
 */
public function image($get){

	$search = $get["s"];
	if(strlen($search) === 0){

		throw new Exception("Search term is empty!");
	}

	if(strlen($search) > 2048){

		throw new Exception("Search term is too long!");
	}

	$country = $get["country"];
	$nsfw = $get["nsfw"];
	$spellcheck = $get["spellcheck"] == "yes" ? "1" : "0";

	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];

	try{
		$html =
			$this->get(
				$this->backend->get_ip(), // no nextpage right now, pass proxy directly
				"https://search.brave.com/images",
				[
					"q" => $search,
					"spellcheck" => $spellcheck
				],
				$nsfw,
				$country
			);

	}catch(Exception $error){

		throw new Exception("Could not fetch search page");
	}

	$this->fuckhtml->load($html);
	$json = $this->get_js();

	// a page without a result list yields an empty result set instead of
	// running foreach over a missing value
	if(
		!isset($json[1]["data"]["body"]["response"]["results"]) ||
		!is_array($json[1]["data"]["body"]["response"]["results"])
	){

		return $out;
	}

	foreach(
		$json[1]["data"]["body"]["response"]["results"]
		as $result
	){

		$out["image"][] = [
			"title" => $result["title"],
			"source" => [
				// first entry is built from properties.url, second from
				// thumbnail.src; dimensions are not filled in here
				[
					"url" => $result["properties"]["url"],
					"width" => null,
					"height" => null
				],
				[
					"url" => $result["thumbnail"]["src"],
					"width" => null,
					"height" => null
				]
			],
			"url" => $result["url"]
		];
	}

	return $out;
}
"1" : "0"; + + $proxy = $this->backend->get_ip(); + + try{ + $html = + $this->get( + $proxy, + "https://search.brave.com/videos", + [ + "q" => $search, + "spellcheck" => $spellcheck + ], + $nsfw, + $country + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch search page"); + } + } + + $this->fuckhtml->load($html); + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + // get npt + $out["npt"] = + $this->generatenextpagetoken( + $search, + $nsfw, + $country, + $spellcheck, + "videos", + $proxy + ); + + /* + $handle = fopen("scraper/brave-video.html", "r"); + $html = fread($handle, filesize("scraper/brave-video.html")); + fclose($handle);*/ + + $this->fuckhtml->load($html); + $json = $this->get_js(); + + foreach( + $json + [1] + ["data"] + ["body"] + ["response"] + ["results"] + as $result + ){ + + if($result["video"]["author"] != "null"){ + + $author = [ + "name" => $result["video"]["author"]["name"] == "null" ? null : $result["video"]["author"]["name"], + "url" => $result["video"]["author"]["url"] == "null" ? null : $result["video"]["author"]["url"], + "avatar" => $result["video"]["author"]["img"] == "null" ? null : $result["video"]["author"]["img"] + ]; + }else{ + + $author = [ + "name" => null, + "url" => null, + "avatar" => null + ]; + } + + if($result["thumbnail"] != "null"){ + + $thumb = [ + "url" => $result["thumbnail"]["original"], + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $out["video"][] = [ + "title" => $result["title"], + "description" => $result["description"] == "null" ? null : $this->titledots($result["description"]), + "author" => $author, + "date" => ($result["age"] == "null" || $result["age"] == "void 0") ? null : strtotime($result["age"]), + "duration" => $result["video"]["duration"] == "null" ? 
/**
 * Convert an answer's HTML into a flat list of description parts.
 *
 * Handles <p> (split into text and inline code/italic/quote parts), <img>,
 * <pre> and <ol>; every other top level tag is ignored.
 *
 * @param string $html HTML of the answer.
 * @return array List of ["type" => ..., "value" => ...] entries
 *               (image entries carry "url" instead of "value").
 */
private function stackoverflow_parse($html){

	// $i mirrors count($answer); appendtext() takes both by reference
	$i = 0;
	$answer = [];

	$this->fuckhtml->load($html);

	foreach(
		$this->fuckhtml->getElementsByTagName("*")
		as $snippet
	){

		switch($snippet["tagName"]){

			case "p":
				$this->fuckhtml->load($snippet["innerHTML"]);

				$inline_tags =
					$this->fuckhtml
					->getElementsByTagName("*");

				// part of the paragraph that has not been emitted yet
				$remaining = $snippet["innerHTML"];

				foreach($inline_tags as $tag){

					if(
						!isset($tag["outerHTML"]) ||
						$tag["outerHTML"] === ""
					){

						continue;
					}

					$parts =
						explode(
							$tag["outerHTML"],
							$remaining,
							2
						);

					if(count($parts) !== 2){

						// tag is not in the remaining text (presumably
						// nested in a tag that was already emitted): skip
						// it, otherwise the remaining text is added twice
						continue;
					}

					// text in front of the tag
					$this->appendtext(
						$this->fuckhtml->getTextContent($parts[0], false, false),
						$answer,
						$i
					);

					switch($tag["tagName"]){

						case "code": $type = "inline_code"; break;
						case "em": $type = "italic"; break;
						case "blockquote": $type = "quote"; break;
						default: $type = "text";
					}

					$value = $this->fuckhtml->getTextContent($tag, false, true);

					if(trim($value) != ""){

						$answer[] = [
							"type" => $type,
							"value" => $value
						];
						$i++;
					}

					$remaining = $parts[1];
				}

				if(strlen($remaining) !== 0){

					// text behind the last tag
					$this->appendtext(
						$this->fuckhtml->getTextContent($remaining, false, false),
						$answer,
						$i
					);
				}
				break;

			case "img":
				// the image is the current element, not a leftover $tag
				// from a previous paragraph
				if(!isset($snippet["attributes"]["src"])){

					break;
				}

				$answer[] = [
					"type" => "image",
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$snippet["attributes"]["src"]
						)
				];
				$i++;
				break;

			case "pre":

				// strip trailing whitespace of the text in front of the
				// code block, if there is any
				if(
					$i !== 0 &&
					(
						$answer[$i - 1]["type"] == "text" ||
						$answer[$i - 1]["type"] == "italic"
					)
				){

					$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
				}

				$answer[] =
					[
						"type" => "code",
						"value" =>
							rtrim(
								$this->fuckhtml
								->getTextContent(
									$snippet,
									true,
									false
								)
							)
					];
				$i++;

				break;

			case "ol":
				$position = 0;

				$this->fuckhtml->load($snippet);
				$items =
					$this->fuckhtml
					->getElementsByTagName("li");

				foreach($items as $item){
					$position++;

					$this->appendtext(
						$position . ". " .
						$this->fuckhtml
						->getTextContent(
							$item
						),
						$answer,
						$i
					);
				}
				break;
		}
	}

	if(
		$i !== 0 &&
		$answer[$i - 1]["type"] == "text"
	){

		$answer[$i - 1]["value"] = rtrim($answer[$i - 1]["value"]);
	}

	return $answer;
}
/**
 * Turn a collection of "Label: value" elements into table rows.
 *
 * Each element's text is split on the first ":"; the left side becomes the
 * key and the right side the value of $data["table"]. Text without a colon
 * is stored under the key "Rating".
 *
 * @param array $html_collection Elements carrying an "innerHTML" entry.
 * @param array $data            Result row, modified in place.
 * @return void
 */
private function tablesublink($html_collection, &$data){
	
	foreach($html_collection as $html){
		
		// drop inline stylesheets before extracting text. The match is lazy
		// so that text sitting between two <style> blocks survives, and the
		// opening tag may carry attributes.
		$html["innerHTML"] = preg_replace(
			'/<style\b[^>]*>[\S\s]*?<\/style>/i',
			"",
			$html["innerHTML"]
		);
		
		$pair =
			explode(
				":",
				$this->fuckhtml->getTextContent($html),
				2
			);
		
		if(count($pair) === 1){
			
			// no label present
			$pair = ["Rating", $pair[0]];
		}
		
		$data["table"][trim($pair[0])] = trim($pair[1]);
	}
}
/**
 * Recover the original image URL from a proxied image link.
 *
 * The proxied link carries the source URL as URL-safe base64, split into
 * chunks by "/" and followed by a file extension, e.g.
 * https://imgs.search.brave.com/XFnbR8Sl7ge82MBDEH7ju0UHImRovMVmQ2qnDvgNTuA/rs:fit:844:225:1/g:ce/aHR0cHM6Ly90c2U0/Lm1tLmJpbmcubmV0/L3RoP2lkPU9JUC54/UWotQXU5N2ozVndT/RDJnNG9BNVhnSGFF/SyZwaWQ9QXBp.jpeg
 *
 * @param string $url Proxied link.
 * @return string Decoded source URL, or $url untouched when it carries no
 *                encoded "htt..." payload.
 */
private function unshiturl($url){
	
	// "aHR0" is base64 for "htt", the start of every encoded http(s) URL.
	// Split once only: the marker may legitimately recur inside the payload.
	$tmp = explode("aHR0", $url, 2);
	
	if(count($tmp) !== 2){
		
		// nothing to do
		return $url;
	}
	
	// cut the file extension off
	$payload = explode(".", $tmp[1], 2)[0];
	
	// remove the chunk separators first, then map the URL-safe alphabet
	// back to the standard one ("_" => "/", "-" => "+"). Order matters:
	// the "/" produced from "_" must not be stripped again.
	$payload =
		str_replace(
			["/", "_", "-"],
			["", "/", "+"],
			$payload
		);
	
	return base64_decode("aHR0" . $payload);
}
"Automotive" => "Automotive", + "BoardGameArt" => "Board Game Art", + "BookIllustration" => "Book Illustration", + "CardGameArt" => "Card Game Art", + "CeramicsPottery" => "Ceramics/Pottery", + "CharacterAnimation" => "Character Animation", + "CharacterDesign" => "Character Design", + "CharacterModeling" => "Character Modeling", + "ChildrensArt" => "Children's Illustration", + "Collectibles" => "Collectibles", + "ColoringPage" => "Coloring Page", + "ComicArt" => "Comic Art", + "ConceptArt" => "Concept Art", + "Cosplay" => "Cosplay", + "CostumeDesign" => "Costume Design", + "CoverArt" => "Cover Art", + "Creatures" => "Creatures", + "Diorama" => "Diorama", + "EditorialIllustration" => "Editorial Illustration", + "EmbroiderySewing" => "Embroidery/Sewing", + "EnvironmentalConceptArt" => "Environmental Concept Art", + "EnvironmentalConceptDesign" => "Environmental Concept Design", + "FanArt" => "Fan Art", + "Fantasy" => "Fantasy", + "Fashion" => "Fashion", + "FashionStyling" => "Fashion Styling", + "FiberArts" => "Fiber Arts", + "Furry" => "Furry", + "GameArt" => "Game Art", + "GameplayDesign" => "Gameplay Design", + "GamesEnvironmentArt" => "Games Environment Art", + "Gem" => "Gem", + "GraphicDesign" => "Graphic Design", + "Handicraft" => "Handicraft", + "HairStyling" => "Hair Styling", + "HardSurface" => "Hard Surface", + "Horror" => "Horror", + "Illustration" => "Illustration", + "IllustrationVisualization" => "Illustration Visualization", + "IndustrialDesign" => "Industrial Design", + "Jewelry" => "Jewelry", + "KnittingCrochet" => "Knitting/Crochet", + "Landscape" => "Landscape", + "LevelDesign" => "Level Design", + "Lighting" => "Lighting", + "Makeup" => "Makeup", + "Manga" => "Manga", + "MapsCartography" => "Maps/Cartography", + "MattePainting" => "Matte Painting", + "Materials" => "Materials", + "MechanicalDesign" => "Mechanical Design", + "Medical" => "Medical", + "Mecha" => "Mecha", + "MiniatureArt" => "Miniature Art", + "MotionGraphics" => "Motion Graphics", + 
"FrescoMurals" => "Fresco/Murals", + "Natural" => "Natural", + "Original Character" => "Original Character", + "Overlay" => "Overlay", + "PleinAir" => "Plein Air", + "Photogrammetry" => "Photogrammetry", + "PixelArt" => "Pixel Art", + "Portraits" => "Portraits", + "Props" => "Props", + "ProductDesign" => "Product Design", + "PublicDomain" => "Public Domain or Royalty Free", + "Real-Time3DEnvironmentArt" => "Real-Time 3D Environment Art", + "Realism" => "Realism", + "ScienceFiction" => "Science Fiction", + "ScientificVisualization" => "Scientific Visualization", + "Scripts" => "Scripts", + "StillLife" => "Still Life", + "Storyboards" => "Storyboards", + "Stylized" => "Stylized", + "Surreal" => "Surreal", + "TechnicalArt" => "Technical Art", + "Textures" => "Textures", + "Tools" => "Tools", + "Toys" => "Toys", + "ToyPackaging" => "Toy Packaging", + "Tutorials" => "Tutorials", + "UIArt" => "User Interface (UI) Art", + "UrbanSketch" => "Urban Sketch", + "VFXforAnimation" => "VFX for Animation", + "VFXforFilm" => "VFX for Film", + "VFXforGames" => "VFX for Games", + "VFXforRealTime" => "VFX for Real-Time", + "VFXforTV" => "VFX for TV", + "Vehicles" => "Vehicles", + "VirtualReality" => "Virtual Reality", + "VisualDevelopment" => "Visual Development", + "VoxelArt" => "Voxel Art", + "Vtubers" => "Vtubers", + "WIP" => "WIP (Work in Progress)", + "Web" => "Web", + "Weapons" => "Weapons", + "Wildlife" => "Wildlife", + "Woodcutting" => "Woodcutting" + ] + ], + "software" => [ + "display" => "Software", + "option" => [ + "any" => "Any software", + "123D" => "123D", + "123DCatch" => "123D Catch", + "3DBee" => "3DBee", + "3DCoat" => "3DCoat", + "3DCoatPrint" => "3DCoatPrint", + "3DCoatTextura" => "3DCoatTextura", + "3DEqualizer" => "3DEqualizer", + "3DFZephyr" => "3DF Zephyr", + "3Delight" => "3Delight", + "3dpeople" => "3dpeople", + "3dsMax" => "3ds Max", + "3DSPaint" => "3DS Paint", + "ACDSeeCanvas" => "ACDSee Canvas", + "AbletonLive" => "Ableton Live", + "Acrobat" => 
"Acrobat", + "AdobeDraw" => "Adobe Draw", + "AdobeFlash" => "Adobe Flash", + "AdobeFresco" => "Adobe Fresco", + "AdobeSubstance3Dassets" => "Adobe Substance 3D assets", + "AdobeXD" => "Adobe XD", + "AffinityDesigner" => "Affinity Designer", + "AffinityPhoto" => "Affinity Photo", + "AfterEffects" => "After Effects", + "Akeytsu" => "Akeytsu", + "Alchemy" => "Alchemy", + "AliasDesign" => "Alias Design", + "AlightMotion" => "Alight Motion", + "Amadine" => "Amadine", + "Amberlight" => "Amberlight", + "Animate" => "Animate", + "AnimationMaster" => "Animation:Master", + "AnimeStudio" => "Anime Studio", + "Apophysis" => "Apophysis", + "ArchiCAD" => "ArchiCAD", + "Arion" => "Arion", + "ArionFX" => "ArionFX", + "Arnold" => "Arnold", + "ArtEngine" => "ArtEngine", + "ArtFlow" => "ArtFlow", + "ArtRage" => "ArtRage", + "ArtstudioPro" => "Artstudio Pro", + "Artweaver" => "Artweaver", + "Aseprite" => "Aseprite", + "Audition" => "Audition", + "AutoCAD" => "AutoCAD", + "AutodeskSketchBook" => "Autodesk SketchBook", + "AvidMediaComposer" => "Avid Media Composer", + "AzPainter" => "AzPainter", + "babylonjs" => "babylon.js", + "BalsamiqMockup" => "Balsamiq Mockup", + "Bforartists" => "Bforartists", + "BlackInk" => "Black Ink", + "BlackmagicDesignFusion" => "Blackmagic Design Fusion", + "Blender" => "Blender", + "Blender DeepPaint" => "Blender DeepPaint", + "BlenderGreasePencil" => "Blender Grease Pencil", + "Blockbench" => "Blockbench", + "BodyPaint" => "BodyPaint", + "Boxcutter" => "Boxcutter", + "BraidMaker" => "Braid Maker", + "BrickLinkStudio" => "BrickLink Studio", + "Bridge" => "Bridge", + "Brushifyio" => "Brushify.io", + "C" => "C", + "C#" => "C#", + "C++" => "C++", + "CACANi" => "CACANi", + "CLIPSTUDIOPAINT" => "CLIP STUDIO PAINT", + "CLO" => "CLO", + "CRYENGINE" => "CRYENGINE", + "Callipeg" => "Callipeg", + "Canva" => "Canva", + "CaptureOne" => "Capture One", + "CartoonAnimator" => "Cartoon Animator", + "Carveco" => "Carveco", + "Cavalry" => "Cavalry", + "Chaotica" => 
"Chaotica", + "CharacterAnimator" => "Character Animator", + "CharacterCreator" => "Character Creator", + "Cinema4D" => "Cinema 4D", + "ClarisseiFX" => "Clarisse iFX", + "Coiffure" => "Coiffure", + "ColorsLive" => "Colors Live", + "Combustion" => "Combustion", + "Construct2" => "Construct 2", + "Core" => "Core", + "CorelPainter" => "Corel Painter", + "CorelDRAWGraphicsSuite" => "CorelDRAW Graphics Suite", + "CoronaRenderer" => "Corona Renderer", + "ProMotionNG" => "Cosmigo Pro Motion NG", + "CrazyBump" => "CrazyBump", + "Crocotile3D" => "Crocotile 3D", + "Curvy3D" => "Curvy 3D", + "Cycles4D" => "Cycles 4D", + "Darkroom" => "Darkroom", + "DAZStudio" => "DAZ Studio", + "DDO" => "DDO", + "DECIMA" => "DECIMA", + "Darktable" => "Darktable", + "DaVinciResolve" => "DaVinci Resolve", + "Dimension" => "Dimension", + "DragonBones" => "DragonBones", + "Dragonframe" => "Dragonframe", + "Drawpile" => "Drawpile", + "Dreams" => "Dreams", + "Dreamweaver" => "Dreamweaver", + "DxOPhotoLab" => "DxO PhotoLab", + "ECycles" => "E-Cycles", + "EmberGen" => "EmberGen", + "Encore" => "Encore", + "Expresii" => "Expresii", + "FStorm" => "FStorm", + "FadeIn" => "FadeIn", + "Feather3D" => "Feather 3D", + "FiberShop" => "FiberShop", + "Figma" => "Figma", + "FilmoraWondershare" => "Filmora Wondershare", + "FilterForge" => "Filter Forge", + "FinalCutPro" => "Final Cut Pro", + "FinalDraft" => "Final Draft", + "finalRender" => "finalRender", + "FireAlpaca" => "FireAlpaca", + "Fireworks" => "Fireworks", + "FlamePainter" => "Flame Painter", + "Flash" => "Flash", + "FlipaClip" => "FlipaClip", + "FlipnoteStudio" => "Flipnote Studio", + "Fluent" => "Fluent", + "ForestPack" => "Forest Pack", + "FormZ" => "Form-Z", + "Fractorium" => "Fractorium", + "FreeCAD" => "FreeCAD", + "FreeHand" => "FreeHand", + "Forger" => "Forger", + "FrostbiteEngine" => "Frostbite Engine", + "fSpy" => "fSpy", + "FumeFX" => "FumeFX", + "Fusion360" => "Fusion 360", + "GIMP" => "GIMP", + "GSCurveTools" => "GS CurveTools", + 
"GSToolbox" => "GS Toolbox", + "Gaea" => "Gaea", + "GameTextures" => "Game Textures", + "GameMakerStudio" => "GameMaker: Studio", + "GarageFarmNET" => "GarageFarm.NET", + "GeoGlyph" => "GeoGlyph", + "GigapixelAl" => "Gigapixel Al", + "Glaxnimate" => "Glaxnimate", + "GnomePaint" => "Gnome Paint", + "Godot" => "Godot", + "Goxel" => "Goxel", + "Graphite" => "Graphite", + "Graswald" => "Graswald", + "GravitySketch" => "Gravity Sketch", + "GuerillaRender" => "GuerillaRender", + "HDRLightStudio" => "HDR Light Studio", + "HairStrandDesigner" => "Hair Strand Designer", + "HairTGHairFur" => "HairTG - Hair & Fur", + "HairTGSurfaceFeatherEdition" => "HairTG - Surface, Feather Edition", + "HairTGSurfaceHairEdition" => "HairTG - Surface, Hair Edition", + "Handplane" => "Handplane", + "Hansoft" => "Hansoft", + "HardOps" => "Hard Ops", + "HardMesh" => "HardMesh", + "Harmony" => "Harmony", + "HeavypaintWebbypaint" => "Heavypaint/Webbypaint", + "HelloPaint" => "HelloPaint", + "HeliconFocus" => "Helicon Focus", + "Hexels" => "Hexels", + "HiPaint" => "HiPaint", + "Houdini" => "Houdini", + "HydraRenderer" => "Hydra Renderer", + "iArtbook" => "iArtbook", + "IbisPaint" => "ibisPaint", + "Ideas" => "Ideas", + "IllustStudio" => "Illust Studio", + "Illustrator" => "Illustrator", + "IllustratorDraw" => "Illustrator Draw", + "InDesign" => "InDesign", + "Inochi2D" => "Inochi2D", + "InVision" => "InVision", + "InVisionCraft" => "InVision Craft", + "InfinitePainter" => "Infinite Painter", + "Inkscape" => "Inkscape", + "Inspirit" => "Inspirit", + "InstaLOD" => "InstaLOD", + "InstaMAT" => "InstaMAT", + "InstantLightRealtimePBR" => "Instant Light Realtime PBR", + "InstantMeshes" => "Instant Meshes", + "InstantTerra" => "Instant Terra", + "Inventor" => "Inventor", + "Iray" => "Iray", + "JWildfire" => "JWildfire", + "Java" => "Java", + "Jira" => "Jira", + "JumpPaint" => "Jump Paint by MediBang", + "JSPaint" => "JS Paint", + "Katana" => "Katana", + "Keyshot" => "Keyshot", + "KidPix" => "Kid Pix", + 
"KitBash3D" => "KitBash3D", + "Knald" => "Knald", + "Kodon" => "Kodon", + "KolourPaint" => "KolourPaint", + "Krakatoa" => "Krakatoa", + "KRESKA" => "KRESKA", + "Krita" => "Krita", + "LensStudio" => "Lens Studio", + "LibreSprite" => "LibreSprite", + "LightWave3D" => "LightWave 3D", + "Lightroom" => "Lightroom", + "Linearity" => "Linearity", + "LiquiGen" => "LiquiGen", + "Live2DCubism" => "Live2D Cubism", + "LookatmyHair" => "Look at my Hair", + "Lotpixel" => "Lotpixel", + "Lumion" => "Lumion", + "LuxRender" => "LuxRender", + "MacPaint" => "MacPaint", + "MagicaCSG" => "MagicaCSG", + "MagicaVoxel" => "MagicaVoxel", + "Magma" => "Magma", + "MakeHuman" => "MakeHuman", + "Malmal" => "Malmal", + "Mandelbulb3D" => "Mandelbulb 3D", + "Mandelbulber" => "Mandelbulber", + "MangaStudio" => "Manga Studio", + "Mari" => "Mari", + "MarmosetToolbag" => "Marmoset Toolbag", + "MarvelousDesigner" => "Marvelous Designer", + "MasterpieceStudioPro" => "Masterpiece Studio Pro", + "MasterpieceVR" => "MasterpieceVR", + "Maverick" => "Maverick", + "MaxwellRender" => "Maxwell Render", + "Maya" => "Maya", + "MediBangPaint" => "MediBang Paint", + "MediumbyAdobe" => "Medium by Adobe", + "Megascans" => "Megascans", + "mentalray" => "mental ray", + "MeshLab" => "MeshLab", + "Meshroom" => "Meshroom", + "MetaHumanCreator" => "MetaHuman Creator", + "Metashape" => "Metashape", + "MightyBake" => "MightyBake", + "MikuMikuDance" => "MikuMikuDance", + "Minecraft" => "Minecraft", + "Mischief" => "Mischief", + "Mixamo" => "Mixamo", + "Mixer" => "Mixer", + "MoI3D" => "MoI3D", + "Mocha" => "Mocha", + "Modo" => "Modo", + "Moho" => "Moho", + "MotionBuilder" => "MotionBuilder", + "Mudbox" => "Mudbox", + "Muse" => "Muse", + "MSPaint" => "MS Paint", + "MyPaint" => "MyPaint", + "NDO" => "NDO", + "NX" => "NX", + "NdotCAD" => "NdotCAD", + "NintendoNotes" => "Nintendo Notes", + "NomadSculpt" => "Nomad Sculpt", + "Notability" => "Notability", + "Nuke" => "Nuke", + "Nvil" => "Nvil", + "OctaneRender" => "Octane Render", + 
"Omniverse" => "Omniverse", + "OmniverseCreate" => "Omniverse Create", + "ON1PhotoRAW" => "ON1 Photo RAW", + "Open3DEngine" => "Open 3D Engine", + "OpenCanvas" => "OpenCanvas", + "OpenGL" => "OpenGL", + "OpenToonz" => "OpenToonz", + "Ornatrix" => "Ornatrix", + "OsciRender" => "Osci-Render", + "OurPaint" => "Our Paint", + "PBRMAX" => "PBRMAX", + "PFTrack" => "PFTrack", + "PTGui" => "PTGui", + "Paintbrush" => "Paintbrush", + "PaintNET" => "Paint.NET", + "PaintShopPro" => "PaintShop Pro", + "PaintToolSAI" => "Paint Tool SAI", + "PaintstormStudio" => "Paintstorm Studio", + "Paper" => "Paper", + "Pencil2D" => "Pencil2D", + "Penpot" => "Penpot", + "PhoenixFD" => "Phoenix FD", + "Phonto" => "Phonto", + "PhotoLab2" => "PhotoLab 2", + "Photopea" => "Photopea", + "Photoscan" => "Photoscan", + "Photoshop" => "Photoshop", + "PhotoshopElements" => "Photoshop Elements", + "PicoCAD" => "picoCAD", + "PicoCAD2" => "picoCAD 2", + "Pinta" => "Pinta", + "Piskel" => "Piskel", + "Pixilart" => "Pixilart", + "Pixelitor" => "Pixelitor", + "Pixelmator" => "Pixelmator", + "Pixelorama" => "Pixelorama", + "PixivSketch" => "pixiv Sketch", + "Pixquare" => "Pixquare", + "PlantCatalog" => "PlantCatalog", + "PlantFactory" => "PlantFactory", + "Plasticity" => "Plasticity", + "PNGtuberPlus" => "PNGtuber Plus", + "Poliigon" => "Poliigon", + "Polybrush" => "Polybrush", + "PopcornFx" => "PopcornFx", + "Poser" => "Poser", + "Premiere" => "Premiere", + "PremiereElements" => "Premiere Elements", + "PresagisCreator" => "Presagis Creator", + "ProTools" => "Pro Tools", + "Procreate" => "Procreate", + "ProcreateDreams" => "Procreate Dreams", + "Producer" => "Producer", + "PrometheanAI" => "Promethean AI", + "PureRef" => "PureRef", + "Python" => "Python", + "PyxelEdit" => "PyxelEdit", + "QuadRemesher" => "Quad Remesher", + "QuarkXPress" => "QuarkXPress", + "Qubicle" => "Qubicle", + "Quill" => "Quill", + "QuixelBridge" => "Quixel Bridge", + "QuixelMegascans" => "Quixel Megascans", + "QuixelMixer" => "Quixel 
Mixer", + "QuixelSuite" => "Quixel Suite", + "R3DSWrap" => "R3DS Wrap", + "R3DSZWRAP" => "R3DS ZWRAP", + "RDTextures" => "RD-Textures", + "RailClone" => "RailClone", + "RealFlow" => "RealFlow", + "RealisticPaintStudio" => "Realistic Paint Studio", + "RealityCapture" => "RealityCapture", + "RealityScan" => "RealityScan", + "RealtimeBoard" => "Realtime Board", + "Rebelle" => "Rebelle", + "Redshift" => "Redshift", + "RenderMan" => "RenderMan", + "RenderNetwork" => "Render Network", + "Revit" => "Revit", + "Rhino" => "Rhino", + "Rhinoceros" => "Rhinoceros", + "RizomUV" => "RizomUV", + "RoughAnimator" => "Rough Animator", + "SamsungNotes" => "Samsung Notes", + "SamsungPENUP" => "Samsung PENUP", + "ScansLibrary" => "ScansLibrary", + "Scrivener" => "Scrivener", + "Sculpt+" => "Sculpt+", + "Sculptris" => "Sculptris", + "ShaveandaHaircut" => "Shave and a Haircut", + "ShiVa3D" => "ShiVa3D", + "Shotgun" => "Shotgun", + "Silo" => "Silo", + "Silugen" => "Silugen", + "Sketch" => "Sketch", + "SketchApp" => "Sketch App", + "SketchBookPro" => "SketchBook Pro", + "SketchClub" => "SketchClub", + "SketchUp" => "SketchUp", + "Sketchable" => "Sketchable", + "Sketchfab" => "Sketchfab", + "Skyshop" => "Skyshop", + "Snapseed" => "Snapseed", + "Snowdrop" => "Snowdrop", + "Softimage" => "Softimage", + "SolidWorks" => "SolidWorks", + "SonySketch" => "Sony Sketch", + "Soundbooth" => "Soundbooth", + "Source2" => "Source 2", + "SourceControl" => "Source Control", + "SourceFilmmaker" => "Source Filmmaker", + "SpeedTree" => "SpeedTree", + "Speedgrade" => "Speedgrade", + "SpeedyPainter" => "SpeedyPainter", + "Spine2D" => "Spine 2D", + "Spriter" => "Spriter", + "Stingray" => "Stingray", + "Storyboarder" => "Storyboarder", + "StoryboardPro" => "Storyboard Pro", + "SublimeText" => "Sublime Text", + "Substance3DDesigner" => "Substance 3D Designer", + "Substance3DModeler" => "Substance 3D Modeler", + "Substance3DPainter" => "Substance 3D Painter", + "Substance3DSampler" => "Substance 3D Sampler", + 
"Substance3DStager" => "Substance 3D Stager", + "SubstanceB2M" => "Substance B2M", + "SweetHome3D" => "Sweet Home 3D", + "SynthEyes" => "SynthEyes", + "TTools" => "TTools", + "TVPaint" => "TVPaint", + "TVPaintAnimation" => "TVPaint Animation", + "TayasuiSketches" => "Tayasui Sketches", + "TayasuiSketchesMobileApp" => "Tayasui Sketches Mobile App", + "TayasuiSketchesPro" => "Tayasui Sketches Pro", + "Terragen" => "Terragen", + "Texturescom" => "Textures.com", + "Texturingxyz" => "Texturingxyz", + "TeyaConceptor" => "Teya Conceptor", + "TheGrove3D" => "The Grove 3D", + "TheaRender" => "Thea Render", + "Threejs" => "Three.js", + "Tiled" => "Tiled", + "TiltBrush" => "Tilt Brush", + "Tooll3" => "Tooll3", + "ToonBoomHarmony" => "Toon Boom Harmony", + "ToonBoomStudio" => "Toon Boom Studio", + "ToonSquid" => "ToonSquid", + "TopoGun" => "TopoGun", + "TuxPaint" => "Tux Paint", + "Tvori" => "Tvori", + "Twinmotion" => "Twinmotion", + "UNIGINEEngine" => "UNIGINE Engine", + "UVLayout" => "UVLayout", + "UltraFractal" => "Ultra Fractal", + "uMake" => "uMake", + "Unfold3D" => "Unfold 3D", + "Unity" => "Unity", + "UnrealEngine" => "Unreal Engine", + "Vengi" => "vengi", + "VRay" => "V-Ray", + "VRED" => "VRED", + "VTubeStudio" => "VTube Studio", + "Vectary" => "Vectary", + "VectorayGen" => "VectorayGen", + "Vectorworks" => "Vectorworks", + "VegasPro" => "Vegas Pro", + "VisualDesigner3D" => "Visual Designer 3D", + "VisualStudio" => "Visual Studio", + "VRoidStudio" => "VRoid Studio", + "Vue" => "Vue", + "Vuforia" => "Vuforia", + "WebGL" => "WebGL", + "WhiteboardFox" => "Whiteboard Fox", + "WickEditor" => "Wick Editor", + "Wings3D" => "Wings 3D", + "Word" => "Word", + "WorldCreator" => "World Creator", + "WorldMachine" => "World Machine", + "XParticles" => "X-Particles", + "Xfrog" => "Xfrog", + "Xgen" => "Xgen", + "xNormal" => "xNormal", + "xTex" => "xTex", + "XoliulShader" => "Xoliul Shader", + "Yafaray" => "Yafaray", + "Yeti" => "Yeti", + "ZBrush" => "ZBrush", + "ZBrushCore" => 
"ZBrushCore", + "ZenBrush" => "Zen Brush" + ] + ] + ]; + } + + private function get($proxy, $url, $get = [], $search){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: application/json, text/plain, */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + //"sentry-trace: 72b0318a7141fe18cbacbd905572eddf-a60de161b66b1e6f-1 + //"baggage: sentry-environment=vercel-production,sentry-release=251ff5179b4de94974f36d9b8659a487bbb8a819,sentry-public_key=2b87af2b44c84643a011838ad097735f,sentry-trace_id=72b0318a7141fe18cbacbd905572eddf,sentry-transaction=GET%20%2Fsearch,sentry-sampled=true,sentry-sample_rand=0.09967130764937493,sentry-sample_rate=0.5", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + //"Referer: https://cara.app/search?q=jak+and+daxter&type=&sortBy=Top&filters=%7B%7D", + "Referer: https://cara.app/search?q=" . 
/**
 * Run an image search against cara.app.
 *
 * @param array $get Request parameters: "s" (query), "sort", "type",
 *                   "fields", "category", "software" and "npt"
 *                   (next-page token, falsy for the first page).
 * @return array ["status" => "ok", "npt" => token|null, "image" => list]
 * @throws Exception When the query is empty, the request fails or the
 *                   response is not valid JSON.
 */
public function image($get){
	
	if($get["npt"]){
		
		// follow-up page: the token stores the query array and its proxy
		[$npt, $proxy] =
			$this->backend->get(
				$get["npt"],
				"images"
			);
		
		$npt = json_decode($npt, true);
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		
		$npt = [
			"q" => $search,
			"sortBy" => $get["sort"],
			"take" => 24,
			"skip" => 0,
			"filters" => []
		];
		
		// parse filters: request parameter => key inside the filters object
		$filtermap = [
			"type" => "posts",
			"fields" => "fields",
			"category" => "categories",
			"software" => "softwares"
		];
		
		foreach($filtermap as $param => $key){
			
			if($get[$param] != "any"){
				
				$npt["filters"][$key] = [$get[$param]];
			}
		}
		
		// an empty PHP array would encode as "[]", the API expects an object
		if($npt["filters"] == []){
			
			$npt["filters"] = "{}";
		}else{
			
			$npt["filters"] = json_encode($npt["filters"]);
		}
	}
	
	// single source of truth for the page size: it is sent to the API and
	// decides below whether another page is offered
	$pagesize = (int)$npt["take"];
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	// https://cara.app/api/search/portfolio-posts?q=jak+and+daxter&sortBy=Top&take=24&skip=0&filters=%7B%7D
	try{
		$json =
			$this->get(
				$proxy,
				"https://cara.app/api/search/posts",
				$npt,
				$npt["q"]
			);
		
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch JSON");
	}
	
	$json = json_decode($json, true);
	
	if($json === null){
		
		throw new Exception("Failed to decode JSON");
	}
	
	$imagecount = 0;
	foreach($json as $image){
		
		if(!is_array($image)){
			
			// not a post object, nothing to read from it
			continue;
		}
		
		// the post still counts towards the page even without pictures
		$imagecount++;
		
		if(
			empty($image["images"]) ||
			!is_array($image["images"])
		){
			
			// sometimes the api returns no images for an object
			continue;
		}
		
		$cover = null;
		$sources = [];
		
		foreach($image["images"] as $source){
			
			$entry_url = "https://images.cara.app/" . $this->fix_url($source["src"]);
			
			if(!empty($source["isCoverImg"])){
				
				$cover = [
					"url" => $entry_url,
					"width" => 500,
					"height" => 500
				];
			}else{
				
				$sources[] = [
					"url" => $entry_url,
					"width" => null,
					"height" => null
				];
			}
		}
		
		// the cover always goes last in the source list
		if($cover !== null){
			
			$sources[] = $cover;
		}
		
		$out["image"][] = [
			"title" => str_replace("\n", " ", (string)($image["content"] ?? "")),
			"source" => $sources,
			"url" => "https://cara.app/post/" . $image["id"]
		];
	}
	
	// a full page means there may be more results
	if($imagecount === $pagesize){
		
		$npt["skip"] += $pagesize;
		
		$out["npt"] =
			$this->backend->store(
				json_encode($npt),
				"images",
				$proxy
			);
	}
	
	return $out;
}
/**
 * Describe the search filters offered for Coc Coc.
 *
 * The same filter set is returned for every page type.
 *
 * @param string $pagetype Requested page type (not used).
 * @return array Filter name => ["display" => label, "option" => value => caption]
 */
public function getfilters($pagetype){
	
	// nsfw by default???? "no" maps to &safe=1
	$nsfw_options = [
		"yes" => "Yes",
		"no" => "No"
	];
	
	$time_options = [
		"any" => "Any time",
		"1w" => "1 week ago",
		"2w" => "2 weeks ago",
		"1m" => "1 month ago",
		"3m" => "3 months ago",
		"6m" => "6 months ago",
		"1Y" => "1 year ago"
	];
	
	// "yes" maps to &filter=0
	$duplicate_options = [
		"no" => "No",
		"yes" => "Yes"
	];
	
	$filters = [];
	
	$filters["nsfw"] = [
		"display" => "NSFW",
		"option" => $nsfw_options
	];
	
	$filters["time"] = [
		"display" => "Time posted",
		"option" => $time_options
	];
	
	$filters["filter"] = [
		"display" => "Remove duplicates",
		"option" => $duplicate_options
	];
	
	return $filters;
}
=> $html["search"]["query"], + "correction" => $element["spellChecker"][0]["query"] + ]; + } + } + + foreach($html["search"]["search_results"] as $result){ + + if(isset($result["type"])){ + + switch($result["type"]){ + + // + // Related searches + // + case "related_queries": + $out["related"] = $result["queries"]; + continue 2; + + // + // Videos + // + case "video_hits": + foreach($result["results"] as $video){ + + if( + isset($video["image_url"]) && + !empty($video["image_url"]) + ){ + + $thumb = [ + "ratio" => "16:9", + "url" => $video["image_url"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["video"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $video["title"] + ) + ), + "description" => null, + "author" => [ + "name" => $video["uploader"], + "url" => null, + "avatar" => null + ], + "date" => (int)$video["date"], + "duration" => (int)$video["duration"], + "views" => null, + "thumb" => $thumb, + "url" => $video["url"] + ]; + } + continue 2; + } + } + + if( + !isset($result["title"]) || + !isset($result["url"]) + ){ + + // should not happen + continue; + } + + if(isset($result["rich"]["data"]["image_url"])){ + + $thumb = [ + "url" => $result["rich"]["data"]["image_url"], + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $sublinks = []; + + if(isset($result["rich"]["data"]["linked_docs"])){ + + foreach($result["rich"]["data"]["linked_docs"] as $sub){ + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $sub["title"] + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $sub["content"] + ) + ), + "date" => null, + "url" => $sub["url"] + ]; + } + } + + // get date + if(isset($result["date"])){ + + $date = (int)$result["date"]; + }else{ + + $date = null; + } + + // probe for metadata + $table = []; + + if(isset($result["rich"]["data"]["rating"])){ + + $table["Rating"] = 
$result["rich"]["data"]["rating"]; + + if(isset($result["rich"]["data"]["num_rating"])){ + + $table["Rating"] .= " (" . number_format($result["rich"]["data"]["num_rating"]) . " ratings)"; + } + } + + if(isset($result["rich"]["data"]["views"])){ + + $table["Views"] = number_format($result["rich"]["data"]["views"]); + } + + if(isset($result["rich"]["data"]["duration"])){ + + $table["Duration"] = $this->int2hms($result["rich"]["data"]["duration"]); + } + + if(isset($result["rich"]["data"]["channel_name"])){ + + $table["Author"] = $result["rich"]["data"]["channel_name"]; + } + + if(isset($result["rich"]["data"]["video_quality"])){ + + $table["Quality"] = $result["rich"]["data"]["video_quality"]; + } + + if(isset($result["rich"]["data"]["category"])){ + + $table["Category"] = $result["rich"]["data"]["category"]; + } + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $result["title"] + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $result["content"] + ) + ), + "url" => $result["url"], + "date" => $date, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => $table + ]; + } + + // + // Get wikipedia head + // + if(isset($html["right"])){ + + foreach($html["right"] as $wiki){ + + $description = []; + + if(isset($wiki["short_intro"])){ + + $description[] = + [ + "type" => "quote", + "value" => $wiki["short_intro"], + ]; + } + + if(isset($wiki["intro"])){ + + $description[] = + [ + "type" => "text", + "value" => $wiki["intro"], + ]; + } + + // get table elements + $table = []; + + if(isset($wiki["fields"])){ + + foreach($wiki["fields"] as $element){ + + $table[$element["title"]] = implode(", ", $element["value"]); + } + } + + // get sublinks + $sublinks = []; + + if(isset($wiki["website"])){ + + if( + preg_match( + '/^http/', + $wiki["website"] + ) === 0 + ){ + + $sublinks["Website"] = "https://" . 
$wiki["website"]; + }else{ + + $sublinks["Website"] = $wiki["website"]; + } + } + + foreach($wiki["profiles"] as $sitename => $url){ + + $sitename = explode("_", $sitename); + $sitename = ucfirst($sitename[count($sitename) - 1]); + + $sublinks[$sitename] = $url; + } + + $out["answer"][] = [ + "title" => + $this->titledots( + $wiki["title"] + ), + "description" => $description, + "url" => null, + "thumb" => isset($wiki["image"]["contentUrl"]) ? $wiki["image"]["contentUrl"] : null, + "table" => $table, + "sublink" => $sublinks + ]; + } + } + + // get next page + if((int)$html["search"]["page"] < (int)$html["search"]["max_page"]){ + + // https://coccoc.com/composer?_=1754021153532&p=0&q=zbabduiqwhduwqhdnwq&reqid=bwcAs00q&s=direct&apiV=1 + // ^json endpoint, but we can just do &page=2 lol + + if(!isset($query["page"])){ + + $query["page"] = 2; + }else{ + + $query["page"]++; + } + + $out["npt"] = + $this->backend + ->store( + json_encode($query), + "web", + $proxy + ); + } + + return $out; + } + + public function video($get){ + + //$html = file_get_contents("scraper/coccoc.html"); + if($get["npt"]){ + + [$query, $proxy] = + $this->backend->get( + $get["npt"], + "videos" + ); + + $query = json_decode($query, true); + }else{ + + $proxy = $this->backend->get_ip(); + + $query = [ + "query" => $get["s"], + "tbm" => "vid" + ]; + + // add filters + if($get["nsfw"] == "no"){ + + $query["safe"] = 1; + } + + if($get["time"] != "any"){ + + $query["tbs"] = $get["time"]; + } + + if($get["filter"] == "yes"){ + + $query["filter"] = 0; + } + } + + try{ + + $html = + $this->get( + $proxy, + "https://coccoc.com/search", + $query + ); + }catch(Exception $error){ + + throw new Exception("Failed to get search page"); + } + + $html = explode("window.composerResponse", $html, 2); + + if(count($html) !== 2){ + + throw new Exception("Failed to grep window.composerResponse"); + } + + $html = + json_decode( + $this->fuckhtml + ->extract_json( + ltrim($html[1], " =") + ), + true + ); + + if($html 
=== null){ + + throw new Exception("Failed to decode JSON"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + if(!isset($html["search_video"]["search_results"])){ + + if(isset($html["search_video"]["error"]["title"])){ + + if($html["search_video"]["error"]["title"] == "Không tìm thấy kết quả nào"){ + + return $out; + } + + throw new Exception("Coc Coc returned an error: " . $html["search_video"]["error"]["title"]); + } + + throw new Exception("Coc Coc did not supply a search_results object"); + } + + foreach($html["search_video"]["search_results"] as $video){ + + if(isset($video["rich"]["data"]["image_url"])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $video["rich"]["data"]["image_url"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["video"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $video["title"] + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $video["content"] + ) + ), + "author" => [ + "name" => + isset($video["rich"]["data"]["channel_name"]) ? + $video["rich"]["data"]["channel_name"] : null, + "url" => null, + "avatar" => null + ], + "date" => + isset($video["date"]) ? + $video["date"] : null, + "duration" => + isset($video["rich"]["data"]["duration"]) ? 
/*
	Strip leading/trailing spaces, dots, control whitespace, NUL
	bytes and ellipsis characters from a title or snippet.

	trim() treats its character list as single bytes, so listing
	the 3-byte "…" there also removes lone E2/80/A6 bytes and
	cuts other multibyte characters in half (a leading "“", a
	trailing "À"), leaving invalid UTF-8. A /u regex works on
	whole code points instead.
*/
private function titledots($title){
	
	$title = (string)$title;
	
	$trimmed =
		preg_replace(
			'/^[ .\t\n\r\0\x0B…]+|[ .\t\n\r\0\x0B…]+$/uD',
			"",
			$title
		);
	
	if($trimmed === null){
		
		// input is not valid UTF-8: only strip the single-byte characters
		return trim($title, " .\t\n\r\0\x0B");
	}
	
	return $trimmed;
}

/*
	Format a duration in seconds as HH:MM:SS.

	The input is cast to int first so float and numeric-string
	durations do not reach the % operator; negative values are
	clamped to zero.
*/
private function int2hms($seconds){
	
	$seconds = max(0, (int)$seconds);
	
	return
		sprintf(
			"%02d:%02d:%02d",
			intdiv($seconds, 3600),
			intdiv($seconds % 3600, 60),
			$seconds % 60
		);
}
/*
	Query the CrowdView JSON endpoint and map each hit onto the
	common web result array.

	$get: request parameters; only "s" (search terms) is read.

	Throws Exception when the search term is empty, the request
	fails or the response is not valid JSON.
*/
public function web($get){
	
	$search = $get["s"];
	if(strlen($search) === 0){
		
		throw new Exception("Search term is empty!");
	}
	
	$proxy = $this->backend->get_ip();
	
	try{
		$json = $this->get(
			$proxy,
			"https://crowdview-next-js.onrender.com/api/search-v3",
			[
				"query" => $search
			]
		);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch JSON");
	}
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	$json = json_decode($json, true);
	
	if($json === NULL){
		
		throw new Exception("Failed to decode JSON");
	}
	
	if(
		!isset($json["results"]) ||
		!is_array($json["results"])
	){
		
		// no results object: report an empty result set
		return $out;
	}
	
	foreach($json["results"] as $item){
		
		if(
			!isset($item["title"]) ||
			!isset($item["link"])
		){
			
			continue;
		}
		
		$snippet =
			isset($item["snippet"]) ?
			(string)$item["snippet"] : "";
		
		// the text in front of the first <b> is tried as a date;
		// when there is no <b>, or that text does not parse as one,
		// the whole snippet is kept as the description
		$parts = explode("<b>", $snippet, 2);
		$date = false;
		
		if(count($parts) === 2){
			
			$date = strtotime($parts[0]);
		}
		
		if($date === false){
			
			$date = null;
			$description = $snippet;
		}else{
			
			$description = $parts[1];
		}
		
		$out["web"][] = [
			"title" => $this->sanitize($item["title"]),
			"description" => $this->sanitize($description),
			"url" => $item["link"],
			"date" => $date,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => [],
			"table" => []
		];
	}
	
	return $out;
}

/*
	Decode HTML entities, reduce the markup to text through
	fuckhtml->getTextContent() and trim dots and spaces off
	both ends.
*/
private function sanitize($html){
	
	return
		trim(
			$this->fuckhtml
			->getTextContent(
				html_entity_decode(
					(string)$html
				)
			),
			". "
		);
}
"Persian", + "ar" => "Arabic", + "ku" => "Kurdish", + "az" => "Azerbaijani", + "hy" => "Armenian", + "af" => "Afrikaans", + "sw" => "Kiswahili", + "uz" => "Uzbek", + "kk" => "Kazakh", + "ky" => "Kyrgyz", + "tg" => "Tajik", + "tk" => "Turkmen", + "ug" => "Uyghurche", + "hi" => "Hindi", + "si" => "Sinhalese", + "gu" => "Gujarati", + "ur" => "Urdu", + "mr" => "Marathi", + "pa" => "Punjabi", + "bn" => "Bengali", + "ta" => "Tamil", + "te" => "Telugu", + "kn" => "Kannada", + "zh_CN" => "Chinese Simplified", + "zh_TW" => "Chinese Traditional", + "ko" => "Korean", + "cfr" => "Taiwanese", + "th" => "Thai", + "vi" => "Vietnamese", + "in" => "Indonesian", + "ms" => "Malay", + "tl" => "Tagalog", + "eo" => "Esperanto", + "ia" => "Interlingua", + "la" => "Latin" + ] + ] + ]; + } + + private function get($proxy, $url, $get = []){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . 
/*
	Search the Curlie directory and map every "site-item" element
	onto the common web result array.

	$get: request parameters. Reads "npt" (next page token), "s"
	(search terms) and "lang".

	The href of the "next-page" link is stored as the next page
	token and requested relative to https://curlie.org/ later on.

	Throws Exception when the search page cannot be fetched.
*/
public function web($get){
	
	if($get["npt"]){
		
		[$query, $proxy] = $this->backend->get($get["npt"], "web");
		
		try{
			$html = $this->get(
				$proxy,
				"https://curlie.org/" . $query,
				[]
			);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch search page");
		}
		
	}else{
		$proxy = $this->backend->get_ip();
		
		$query = [
			"q" => $get["s"],
			"start" => 0,
			"stime" => 92452189 // NOTE(review): meaning unknown
		];
		
		if($get["lang"] !== "any"){
			
			$query["lang"] = $get["lang"];
		}
		
		try{
			$html = $this->get(
				$proxy,
				"https://curlie.org/search",
				$query
			);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch search page");
		}
	}
	
	$this->fuckhtml->load($html);
	
	$nextpage =
		$this->fuckhtml
		->getElementsByClassName(
			"next-page",
			"a"
		);
	
	if(
		count($nextpage) !== 0 &&
		isset($nextpage[0]["attributes"]["href"])
	){
		
		$nextpage =
			$this->backend->store(
				$nextpage[0]["attributes"]["href"],
				"web",
				$proxy
			);
	}else{
		
		$nextpage = null;
	}
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => $nextpage,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	$items =
		$this->fuckhtml
		->getElementsByClassName(
			"site-item",
			"div"
		);
	
	foreach($items as $item){
		
		$this->fuckhtml->load($item);
		
		$anchors =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"target",
				"_blank",
				"a"
			);
		
		if(
			count($anchors) === 0 ||
			!isset($anchors[0]["attributes"]["href"])
		){
			
			// entry without an outbound link: nothing to show
			continue;
		}
		
		$a = $anchors[0];
		
		$description =
			$this->fuckhtml
			->getElementsByClassName("site-descr");
		
		if(count($description) !== 0){
			
			$description =
				$this->fuckhtml
				->getTextContent(
					$description[0]
				);
		}else{
			
			$description = null;
		}
		
		$out["web"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$a
				),
			"description" => $description,
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$a["attributes"]["href"]
				),
			"date" => null,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => [],
			"table" => []
		];
	}
	
	return $out;
}
/*
	Describe the filters offered for a DuckDuckGo page type.

	$pagetype: "web", "images", "videos" or "news".

	Returns an array keyed by filter name. Every page type gets
	the "country" filter; "web" additionally gets an "any" region
	prepended to it. Unknown page types return an empty array.
*/
public function getfilters($pagetype){
	
	$base = [
		"country" => [
			"display" => "Country",
			"option" => [
				"us-en" => "US (English)",
				"ar-es" => "Argentina",
				"au-en" => "Australia",
				"at-de" => "Austria",
				"be-fr" => "Belgium (fr)",
				"be-nl" => "Belgium (nl)",
				"br-pt" => "Brazil",
				"bg-bg" => "Bulgaria",
				"ca-en" => "Canada (en)",
				"ca-fr" => "Canada (fr)",
				"ct-ca" => "Catalonia",
				"cl-es" => "Chile",
				"cn-zh" => "China",
				"co-es" => "Colombia",
				"hr-hr" => "Croatia",
				"cz-cs" => "Czech Republic",
				"dk-da" => "Denmark",
				"ee-et" => "Estonia",
				"fi-fi" => "Finland",
				"fr-fr" => "France",
				"de-de" => "Germany",
				"gr-el" => "Greece",
				"hk-tzh" => "Hong Kong",
				"hu-hu" => "Hungary",
				"in-en" => "India (en)",
				"id-en" => "Indonesia (en)",
				"ie-en" => "Ireland",
				"il-en" => "Israel (en)",
				"it-it" => "Italy",
				"jp-jp" => "Japan",
				"kr-kr" => "Korea",
				"lv-lv" => "Latvia",
				"lt-lt" => "Lithuania",
				"my-en" => "Malaysia (en)",
				"mx-es" => "Mexico",
				"nl-nl" => "Netherlands",
				"nz-en" => "New Zealand",
				"no-no" => "Norway",
				"pk-en" => "Pakistan (en)",
				"pe-es" => "Peru",
				"ph-en" => "Philippines (en)",
				"pl-pl" => "Poland",
				"pt-pt" => "Portugal",
				"ro-ro" => "Romania",
				"ru-ru" => "Russia",
				"xa-ar" => "Saudi Arabia",
				"sg-en" => "Singapore",
				"sk-sk" => "Slovakia",
				"sl-sl" => "Slovenia",
				"za-en" => "South Africa",
				"es-ca" => "Spain (ca)",
				"es-es" => "Spain (es)",
				"se-sv" => "Sweden",
				"ch-de" => "Switzerland (de)",
				"ch-fr" => "Switzerland (fr)",
				"tw-tzh" => "Taiwan",
				"th-en" => "Thailand (en)",
				"tr-tr" => "Turkey",
				"us-es" => "US (Spanish)",
				"ua-uk" => "Ukraine",
				"uk-en" => "United Kingdom",
				"vn-en" => "Vietnam (en)"
			]
		]
	];
	
	// three-state NSFW filter shared by web, videos and news
	// (images only offers yes/no)
	$nsfw = [
		"display" => "NSFW",
		"option" => [
			"yes" => "Yes",
			"maybe" => "Maybe",
			"no" => "No"
		]
	];
	
	switch($pagetype){
		
		case "web":
			// web search can also run without a region
			$base["country"]["option"] =
				array_merge(["any" => "All Regions"], $base["country"]["option"]);
			
			return array_merge($base,
				[
					"nsfw" => $nsfw,
					"newer" => [
						"display" => "Newer than",
						"option" => "_DATE"
					],
					"older" => [
						"display" => "Older than",
						"option" => "_DATE"
					],
					"extendedsearch" => [
						// deliberately defined without a "display" label
						"option" => [
							"yes" => "Yes",
							"no" => "No",
						]
					]
				]
			);
		
		case "images":
			return array_merge($base,
				[
					"nsfw" => [
						"display" => "NSFW",
						"option" => [
							"yes" => "Yes",
							"no" => "No"
						]
					],
					"date" => [
						"display" => "Time posted",
						"option" => [
							"any" => "Any time",
							"Day" => "Past day",
							"Week" => "Past week",
							"Month" => "Past month"
						]
					],
					"size" => [
						"display" => "Size",
						"option" => [
							"any" => "Any size",
							"Small" => "Small",
							"Medium" => "Medium",
							"Large" => "Large",
							"Wallpaper" => "Wallpaper"
						]
					],
					"color" => [
						"display" => "Colors",
						"option" => [
							"any" => "All colors",
							"Monochrome" => "Black and white",
							"Red" => "Red",
							"Orange" => "Orange",
							"Yellow" => "Yellow",
							"Green" => "Green",
							"Blue" => "Blue",
							"Purple" => "Purple",
							"Pink" => "Pink",
							"Brown" => "Brown",
							"Black" => "Black",
							"Gray" => "Gray",
							"Teal" => "Teal",
							"White" => "White"
						]
					],
					"type" => [
						"display" => "Type",
						"option" => [
							"any" => "All types",
							"photo" => "Photograph",
							"clipart" => "Clipart",
							"gif" => "Animated GIF",
							"transparent" => "Transparent"
						]
					],
					"layout" => [
						"display" => "Layout",
						"option" => [
							"any" => "All layouts",
							"Square" => "Square",
							"Tall" => "Tall",
							"Wide" => "Wide"
						]
					],
					"license" => [
						"display" => "License",
						"option" => [
							"any" => "All licenses",
							"Any" => "All Creative Commons",
							"Public" => "Public domain",
							"Share" => "Free to Share and Use",
							"ShareCommercially" => "Free to Share and Use Commercially",
							"Modify" => "Free to Modify, Share, and Use",
							"ModifyCommercially" => "Free to Modify, Share, and Use Commercially"
						]
					]
				]
			);
		
		case "videos":
			return array_merge($base,
				[
					"nsfw" => $nsfw,
					"date" => [
						"display" => "Time fetched",
						"option" => [
							"any" => "Any time",
							"d" => "Past day",
							"w" => "Past week",
							"m" => "Past month"
						]
					],
					"resolution" => [ //videoDefinition
						"display" => "Resolution",
						"option" => [
							"any" => "Any resolution",
							"high" => "High definition",
							"standard" => "Standard definition"
						]
					],
					"duration" => [ // videoDuration
						"display" => "Duration",
						"option" => [
							"any" => "Any duration",
							// short = under 5 minutes, long = over 20 minutes
							"short" => "Short (<5min)",
							"medium" => "Medium (5-20min)",
							"long" => "Long (>20min)"
						]
					],
					"license" => [
						"display" => "License",
						"option" => [
							"any" => "Any license",
							"creativeCommon" => "Creative Commons",
							"youtube" => "YouTube Standard"
						]
					]
				]
			);
		
		case "news":
			return array_merge($base,
				[
					"nsfw" => $nsfw,
					"date" => [
						"display" => "Time posted",
						"option" => [
							"any" => "Any time",
							"d" => "Past day",
							"w" => "Past week",
							"m" => "Past month"
						]
					]
				]
			);
		
		default:
			// unknown page type: no filters instead of an implicit null
			return [];
	}
}
$js_link; + + $html = ""; + $get["extendedsearch"] = "no"; + + }else{ + if(strlen($get["s"]) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + // generate filters + $get_filters = [ + "q" => $get["s"] + ]; + + if($get["country"] == "any"){ + + $get_filters["kl"] = "wt-wt"; + }else{ + + $get_filters["kl"] = $get["country"]; + } + + switch($get["nsfw"]){ + + case "yes": $get_filters["kp"] = "-2"; break; + case "maybe": $get_filters["kp"] = "-1"; break; + case "no": $get_filters["kp"] = "1"; break; + } + + $df = true; + + if($get["newer"] === false){ + + if($get["older"] !== false){ + + $start = 36000; + $end = $get["older"]; + }else{ + + $df = false; + } + }else{ + + $start = $get["newer"]; + + if($get["older"] !== false){ + + $end = $get["older"]; + }else{ + + $end = time(); + } + } + + if($df === true){ + $get_filters["df"] = date("Y-m-d", $start) . ".." . date("Y-m-d", $end); + } + + // + // Get HTML + // + try{ + $html = $this->get( + $proxy, + "https://duckduckgo.com/", + $get_filters + ); + }catch(Exception $e){ + + throw new Exception("Failed to fetch search page"); + } + + $this->fuckhtml->load($html); + + $script = + $this->fuckhtml + ->getElementById( + "deep_preload_link", + "link" + ); + + if( + $script === null || + !isset($script["attributes"]["href"]) + ){ + + throw new Exception("Failed to grep d.js"); + } + + $js_link = + $this->fuckhtml + ->getTextContent( + $script["attributes"]["href"] + ); + } + + // + // Get d.js + // + try{ + $js = $this->get( + $proxy, + $js_link, + [], + ddg::req_xhr + ); + + }catch(Exception $e){ + + throw new Exception("Failed to fetch d.js"); + } + + //echo htmlspecialchars($js); + + $js_tmp = + preg_split( + '/DDG\.pageLayout\.load\(\s*\'d\'\s*,\s*/', + $js, + 2 + ); + + if(count($js_tmp) <= 1){ + + throw new Exception("Failed to grep pageLayout(d)"); + } + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $js_tmp[1] + ), + true + ); + + if($json === 
null){ + + throw new Exception("Failed to decode JSON"); + } + + // + // Get search results + NPT token + // + foreach($json as $item){ + + if(isset($item["c"])){ + + if( + !isset($item["s"]) && + isset($item["t"]) && + $item["t"] == "DEEP_ERROR_NO_RESULTS" + ){ + + return $out; + } + + $table = []; + + // get youtube video information + if(isset($item["video"]["thumbnail_url_template"])){ + + $thumb = + [ + "ratio" => "16:9", + "url" => $this->bingimg($item["video"]["thumbnail_url_template"]) + ]; + }else{ + + $thumb = + [ + "ratio" => null, + "url" => null + ]; + } + + // get table items + if(isset($item["rf"])){ + + foreach($item["rf"] as $hint){ + + if( + !isset($hint["label"]["text"]) || + !isset($hint["items"][0]["text"]) + ){ + + continue; + } + + $text = []; + + foreach($hint["items"] as $text_part){ + + $text[] = $text_part["text"]; + } + + $text = implode(", ", $text); + + if(is_numeric($text)){ + + $text = number_format((string)$text); + } + + $table[$hint["label"]["text"]] = $text; + } + } + + // get ratings + if(isset($item["ar"])){ + + foreach($item["ar"] as $rating){ + + if( + isset($rating["aggregateRating"]["bestRating"]) && + isset($rating["aggregateRating"]["ratingValue"]) + ){ + + $text = $rating["aggregateRating"]["ratingValue"] . "/" . $rating["aggregateRating"]["bestRating"]; + + if(isset($rating["aggregateRating"]["reviewCount"])){ + + $text .= " (" . number_format($rating["aggregateRating"]["reviewCount"]) . 
" votes)"; + } + + $table["Rating"] = $text; + } + } + } + + // get sublinks + $sublinks = []; + + if(isset($item["l"])){ + + foreach($item["l"] as $sublink){ + + $sublinks[] = [ + "title" => $this->titledots($sublink["text"]), + "description" => $this->titledots($sublink["snippet"]), + "url" => $sublink["targetUrl"], + "date" => null + ]; + } + } + + $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $item["t"] + ) + ); + + if( + $title == "EOF" && + strpos( + $item["c"], + "google" + ) + ){ + + continue; + } + + // parse search result + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $item["t"] + ) + ), + "description" => + isset($item["a"]) ? + $this->titledots( + $this->fuckhtml + ->getTextContent( + $item["a"] + ) + ) : null, + "url" => $this->unshiturl($item["c"]), + "date" => + isset($item["e"]) ? + strtotime($item["e"]) : null, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => $table + ]; + continue; + } + + if(isset($item["n"])){ + + // get NPT + $out["npt"] = + $this->backend->store( + $item["n"], + "web", + $proxy + ); + continue; + } + } + + // + // Get spelling + // + $js_tmp = + preg_split( + '/DDG\.page\.showMessage\(\s*\'spelling\'\s*,\s*/', + $js, + 2 + ); + + if(count($js_tmp) > 1){ + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $js_tmp[1] + ), + true + ); + + if($json !== null){ + + // parse spelling + // qc=2: including + + switch((int)$json["qc"]){ + + case 2: + $type = "including"; + break; + + default: + $type = "not_many"; + break; + } + + $out["spelling"] = [ + "type" => $type, + "using" => + $this->fuckhtml + ->getTextContent( + $json["suggestion"] + ), + "correction" => html_entity_decode($json["recourseText"]) + ]; + } + } + + // + // Get images + // + $js_tmp = + preg_split( + '/DDG\.duckbar\.load\(\s*\'images\'\s*,\s*/', + $js, + 2 + ); + + if(count($js_tmp) > 1){ + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + 
$js_tmp[1] + ), + true + ); + + if($json !== null){ + + foreach($json["results"] as $image){ + + $ratio = $this->bingratio((int)$image["width"], (int)$image["height"]); + + $out["image"][] = [ + "title" => $image["title"], + "source" => [ + [ + "url" => $image["image"], + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ + "url" => $this->bingimg($image["thumbnail"]), + "width" => $ratio[0], + "height" => $ratio[1] + ] + ], + "url" => $this->unshiturl($image["url"]) + ]; + } + } + } + + // + // Get videos + // + $js_tmp = + preg_split( + '/DDG\.duckbar\.load\(\s*\'videos\'\s*,\s*/', + $js, + 2 + ); + + if(count($js_tmp) > 1){ + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $js_tmp[1] + ), + true + ); + + if($json !== null){ + + foreach($json["results"] as $video){ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + + foreach(["large", "medium", "small"] as $contender){ + + if(isset($video["images"][$contender])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $this->bingimg($video["images"][$contender]) + ]; + break; + } + } + + $out["video"][] = [ + "title" => $this->titledots($video["title"]), + "description" => + $video["description"] != "" ? + $this->titledots($video["description"]) : null, + "date" => + isset($video["published"]) ? + strtotime($video["published"]) : null, + "duration" => + $video["duration"] != "" ? + $this->hms2int($video["duration"]) : null, + "views" => + isset($video["statistics"]["viewCount"]) ? 
+ (int)$video["statistics"]["viewCount"] : null, + "thumb" => $thumb, + "url" => $this->unshiturl($video["content"]) + ]; + } + } + } + + // + // Get news + // + $js_tmp = + preg_split( + '/DDG\.duckbar\.load\(\s*\'news\'\s*,\s*/', + $js, + 2 + ); + + if(count($js_tmp) > 1){ + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $js_tmp[1] + ), + true + ); + + if($json !== null){ + + foreach($json["results"] as $news){ + + if(isset($news["image"])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $news["image"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["news"][] = [ + "title" => $news["title"], + "description" => + $this->fuckhtml + ->getTextContent( + $news["excerpt"] + ), + "date" => (int)$news["date"], + "thumb" => $thumb, + "url" => $news["url"] + ]; + } + } + } + + // + // Get related searches + // + $js_tmp = + preg_split( + '/DDG\.duckbar\.loadModule\(\s*\'related_searches\'\s*,\s*/', + $js, + 2 + ); + + if(count($js_tmp) > 1){ + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $js_tmp[1] + ), + true + ); + + if($json !== null){ + + foreach($json["results"] as $related){ + + $out["related"][] = $related["text"]; + } + } + } + + // + // Get instant answers + // + $js_tmp = + preg_split( + '/DDG\.duckbar\.add\(\s*/', + $html . 
$js, + 2 + ); + + if(count($js_tmp) > 1){ + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $js_tmp[1] + ), + true + ); + + if($json !== null){ + + $json = $json["data"]; + $table = []; + $sublinks = []; + $description = []; + + // get official website + if( + isset($json["OfficialWebsite"]) && + $json["OfficialWebsite"] !== null + ){ + + $sublinks["Website"] = $json["OfficialWebsite"]; + } + + // get sublinks & table elements + if(isset($json["Infobox"]["content"])){ + foreach($json["Infobox"]["content"] as $info){ + + if($info["data_type"] == "string"){ + + // add table element + $table[$info["label"]] = $info["value"]; + continue; + } + + if($info["data_type"] == "wd_description"){ + + $description[] = [ + "type" => "quote", + "value" => $info["value"] + ]; + continue; + } + + // add sublink + switch($info["data_type"]){ + + case "official_site": + case "official_website": + $type = "Website"; + break; + + case "wikipedia": $type = "Wikipedia"; break; + case "itunes": $type = "iTunes"; break; + case "amazon": $type = "Amazon"; break; + + case "imdb_title_id": + case "imdb_id": + case "imdb_name_id": + $type = "IMDb"; + $delim = substr($info["value"], 0, 2); + + if($delim == "nm"){ + + $prefix = "https://www.imdb.com/name/"; + }elseif($delim == "tt"){ + + $prefix = "https://www.imdb.com/title/"; + }elseif($delim == "co"){ + + $prefix = "https://www.imdb.com/search/title/?companies="; + }else{ + + $prefix = "https://www.imdb.com/title/"; + } + break; + + case "imdb_name_id": $prefix = "https://www.imdb.com/name/"; $type = "IMDb"; break; + case "twitter_profile": $prefix = "https://twitter.com/"; $type = "Twitter"; break; + case "instagram_profile": $prefix = "https://instagram.com/"; $type = "Instagram"; break; + case "facebook_profile": $prefix = "https://facebook.com/"; $type = "Facebook"; break; + case "spotify_artist_id": $prefix = "https://open.spotify.com/artist/"; $type = "Spotify"; break; + case "itunes_artist_id": $prefix = 
"https://music.apple.com/us/artist/"; $type = "iTunes"; break; + case "rotten_tomatoes": $prefix = "https://rottentomatoes.com/"; $type = "Rotten Tomatoes"; break; + case "youtube_channel": $prefix = "https://youtube.com/channel/"; $type = "YouTube"; break; + case "soundcloud_id": $prefix = "https://soundcloud.com/"; $type = "SoundCloud"; break; + + default: + $prefix = null; + $type = false; + } + + if($type !== false){ + + if($prefix === null){ + + $sublinks[$type] = $info["value"]; + }else{ + + $sublinks[$type] = $prefix . $info["value"]; + } + } + } + } + + if(isset($json["Abstract"])){ + + $description = $this->parse_rich_text($json["Abstract"]); + } + + if( + !isset($json["Image"]) || + $json["Image"] == "" || + $json["Image"] === null || + $json["Image"] == "https://duckduckgo.com/i/" + ){ + + $image = null; + }else{ + + if( + preg_match( + '/^https?:\/\//', + $json["Image"] + ) + ){ + + $image = $json["Image"]; + }else{ + + $image = "https://duckduckgo.com" . $json["Image"]; + } + } + + $out["answer"][] = [ + "title" => $json["Heading"], + "description" => $description, + "url" => $json["AbstractURL"], + "thumb" => $image, + "table" => $table, + "sublink" => $sublinks + ]; + } + } + + if($get["extendedsearch"] == "no"){ + + return $out; + } + + // + // Parse additional data endpoints + // + //nrj('/js/spice/dictionary/definition/create', null, null, null, null, 'dictionary_definition'); + + preg_match_all( + '/nrj\(\s*\'([^\']+)\'/', + $js, + $nrj + ); + + if(isset($nrj[1])){ + + foreach($nrj[1] as $potential_endpoint){ + + // + // Probe for wordnik definition + // + preg_match( + '/\/js\/spice\/dictionary\/definition\/([^\/]+)/', + $potential_endpoint, + $word + ); + + if(isset($word[1])){ + + $word = $word[1]; + + // found wordnik definition & word + try{ + $nik = + $this->get( + $proxy, + "https://duckduckgo.com/js/spice/dictionary/definition/" . 
$word, + [], + ddg::req_xhr + ); + + }catch(Exception $e){ + + // fail gracefully + return $out; + } + + // remove javascript + $js_tmp = + preg_split( + '/ddg_spice_dictionary_definition\(\s*/', + $nik, + 2 + ); + + if(count($js_tmp) > 1){ + + $nik = + json_decode( + $this->fuckhtml + ->extract_json( + $js_tmp[1] + ), + true + ); + } + + if($nik === null){ + + return $out; + } + + $answer_cat = []; + $answer = []; + + foreach($nik as $snippet){ + + if(!isset($snippet["partOfSpeech"])){ continue; } + + $push = []; + + // add text snippet + if(isset($snippet["text"])){ + + $push[] = [ + "type" => "text", + "value" => + $this->fuckhtml + ->getTextContent( + $snippet["text"] + ) + ]; + } + + // add example uses + if(isset($snippet["exampleUses"])){ + + foreach($snippet["exampleUses"] as $example){ + + $push[] = [ + "type" => "quote", + "value" => "\"" . + $this->fuckhtml + ->getTextContent( + $example["text"] + ) . "\"" + ]; + } + } + + // add citations + if(isset($snippet["citations"])){ + + foreach($snippet["citations"] as $citation){ + + if(!isset($citation["cite"])){ continue; } + + $text = + $this->fuckhtml + ->getTextContent( + $citation["cite"] + ); + + if(isset($citation["source"])){ + + $text .= + " - " . + $this->fuckhtml + ->getTextContent( + $citation["source"] + ); + } + + $push[] = [ + "type" => "quote", + "value" => $text + ]; + } + } + + // add related words + if(isset($snippet["relatedWords"])){ + + $relations = []; + + foreach($snippet["relatedWords"] as $related){ + + $words = []; + foreach($related["words"] as $wrd){ + + $words[] = + $this->fuckhtml + ->getTextContent( + $wrd + ); + } + + if( + count($words) !== 0 && + isset($related["relationshipType"]) + ){ + + $relations[ucfirst($related["relationshipType"]) . "s"] = + implode(", ", $words); + } + } + + foreach($relations as $relation_title => $relation_content){ + + $push[] = [ + "type" => "quote", + "value" => $relation_title . ": " . 
$relation_content + ]; + } + } + + + if(count($push) !== 0){ + + // push data to answer_cat + if(!isset($answer_cat[$snippet["partOfSpeech"]])){ + + $answer_cat[$snippet["partOfSpeech"]] = []; + } + + $answer_cat[$snippet["partOfSpeech"]] = + array_merge( + $answer_cat[$snippet["partOfSpeech"]], + $push + ); + } + } + + foreach($answer_cat as $answer_title => $answer_content){ + + $i = 0; + $answer[] = [ + "type" => "title", + "value" => $answer_title + ]; + + $old_type = $answer[count($answer) - 1]["type"]; + + foreach($answer_content as $ans){ + + if( + $ans["type"] == "text" && + $old_type == "text" + ){ + + $i++; + $c = count($answer) - 1; + + // append text to existing textfield + $answer[$c] = [ + "type" => "text", + "value" => $answer[$c]["value"] . "\n" . $i . ". " . $ans["value"] + ]; + + }elseif($ans["type"] == "text"){ + + $i++; + $answer[] = [ + "type" => "text", + "value" => $i . ". " . $ans["value"] + ]; + }else{ + + // append normally + $answer[] = $ans; + } + + $old_type = $ans["type"]; + } + } + + // yeah.. sometimes duckduckgo doesnt give us a definition back + if(count($answer) !== 0){ + + $out["answer"][] = [ + "title" => ucfirst($word), + "description" => $answer, + "url" => "https://www.wordnik.com/words/" . $word, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + } + + // + // Parse stackoverflow answer + // + if( + preg_match( + '/^\/a\.js.*src_id=stack_overflow/', + $potential_endpoint + ) + ){ + + // found stackoverflow answer + try{ + $json = + $this->get( + $proxy, + "https://duckduckgo.com" . 
$potential_endpoint, + [], + ddg::req_xhr + ); + + }catch(Exception $e){ + + // fail gracefully + return $out; + } + + $json = explode("DDG.duckbar.add_array(", $json, 2); + + if(count($json) === 2){ + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $json[1] + ), + true + ); + + if( + $json !== null && + isset($json[0]["data"]) + ){ + + $json = $json[0]["data"]; + + foreach($json as $answer){ + + if(isset($answer["Heading"])){ + + $title = $answer["Heading"]; + }elseif(isset($answer["title"])){ + + $title = $answer["title"]; + }else{ + + $title = null; + } + + if( + $title !== null && + isset($answer["Abstract"]) + ){ + + $description = $this->parse_rich_text($answer["Abstract"]); + + $out["answer"][] = [ + "title" => $title, + "description" => $description, + "url" => $answer["AbstractURL"], + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + } + } + } + } + } + } + } + + return $out; + } + + public function image($get){ + + if($get["npt"]){ + + [$js_link, $proxy] = $this->backend->get($get["npt"], "images"); + + }else{ + if(strlen($get["s"]) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + $filters = []; + + if($get["date"] != "any"){ $filters[] = "time:{$get["date"]}"; } + if($get["size"] != "any"){ $filters[] = "size:{$get["size"]}"; } + if($get["color"] != "any"){ $filters[] = "color:{$get["color"]}"; } + if($get["type"] != "any"){ $filters[] = "type:{$get["type"]}"; } + if($get["layout"] != "any"){ $filters[] = "layout:{$get["layout"]}"; } + if($get["license"] != "any"){ $filters[] = "license:{$get["license"]}"; } + + $filters = implode(",", $filters); + + $get_filters = [ + "q" => $get["s"], + "iax" => "images", + "ia" => "images" + ]; + + if($filters != ""){ + + $get_filters["iaf"] = $filters; + } + + $nsfw = $get["nsfw"] == "yes" ? 
"-1" : "1"; + $get_filters["kp"] = $nsfw; + + try{ + + $html = $this->get( + $proxy, + "https://duckduckgo.com", + $get_filters, + ddg::req_web + ); + }catch(Exception $err){ + + throw new Exception("Failed to fetch search page"); + } + + preg_match( + '/vqd="([0-9-]+)"/', + $html, + $vqd + ); + + if(!isset($vqd[1])){ + + throw new Exception("Failed to grep VQD token"); + } + + $js_link = + "i.js?" . + http_build_query([ + "l" => $get["country"], + "o" => "json", + "q" => $get["s"], + "vqd" => $vqd[1], + "f" => $filters, + "p" => $nsfw + ]); + } + + try{ + + $json = + $this->get( + $proxy, + "https://duckduckgo.com/" . $js_link, + [], + ddg::req_xhr + ); + }catch(Exception $error){ + + throw new Exception("Failed to get i.js"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + if(!isset($json["results"])){ + + return $out; + } + + // get npt + if( + isset($json["next"]) && + $json["next"] !== null + ){ + + $vqd = null; + + if(isset($vqd[1])){ + + $vqd = $vqd[1]; + }else{ + + $vqd = array_values($json["vqd"]); + + if(count($vqd) > 0){ + + $vqd = $vqd[0]; + } + } + + if($vqd !== null){ + + $out["npt"] = + $this->backend->store( + $json["next"] . "&vqd=" . 
/**
 * Fetch one page of DuckDuckGo video results.
 *
 * First page: requests the HTML search page to grep the "vqd" token, then
 * queries v.js with the selected filters. Following pages: $get["npt"] is
 * resolved through the backend into the stored v.js link and the proxy
 * that was stored alongside it.
 *
 * @param array $get Request parameters. Reads "npt", "s", "nsfw", "date",
 *                   "resolution", "duration", "license" and "country".
 * @return array Result set with "status", "npt" and "video" filled in. The
 *               "author", "livestream", "playlist" and "reel" lists are
 *               always returned empty by this method.
 * @throws Exception When the search term is empty, a request fails, the
 *                   VQD token cannot be found or the response is not JSON.
 */
public function video($get){

    if($get["npt"]){

        [$js_link, $proxy] = $this->backend->get($get["npt"], "videos");

    }else{
        if(strlen($get["s"]) === 0){

            throw new Exception("Search term is empty!");
        }

        $proxy = $this->backend->get_ip();

        $get_filters = [
            "q" => $get["s"],
            "iax" => "videos",
            "ia" => "videos"
        ];

        // Safe-search level. Stays null for a value we do not know, in
        // which case http_build_query() leaves the "p" parameter out.
        $nsfw = null;

        switch($get["nsfw"]){

            case "yes": $nsfw = "-2"; break;
            case "maybe": $nsfw = "-1"; break;
            case "no": $nsfw = "1"; break;
        }

        // Map our filter names to the names v.js expects. The values come
        // straight from the request; the previous code interpolated
        // variables that were never defined, so filters were sent empty.
        $filters = [];

        foreach(
            [
                "date" => "publishedAfter",
                "resolution" => "videoDefinition",
                "duration" => "videoDuration",
                "license" => "videoLicense"
            ] as $param => $ddg_name
        ){

            if($get[$param] != "any"){

                $filters[] = $ddg_name . ":" . $get[$param];
            }
        }

        $filters = implode(",", $filters);

        if($filters != ""){

            $get_filters["iaf"] = $filters;
        }

        try{

            $html =
                $this->get(
                    $proxy,
                    "https://duckduckgo.com/",
                    $get_filters,
                    ddg::req_web
                );
        }catch(Exception $error){

            throw new Exception("Failed to fetch search page");
        }

        preg_match(
            '/vqd="([0-9-]+)"/',
            $html,
            $vqd
        );

        if(!isset($vqd[1])){

            throw new Exception("Failed to grep VQD token");
        }

        $js_link =
            "v.js?" .
            http_build_query([
                "l" => $get["country"],
                "o" => "json",
                "sr" => "1",
                "q" => $get["s"],
                "vqd" => $vqd[1],
                "f" => $filters,
                "p" => $nsfw
            ]);
    }

    try{

        $json =
            $this->get(
                $proxy,
                "https://duckduckgo.com/" . $js_link,
                [],
                ddg::req_xhr
            );
    }catch(Exception $error){

        throw new Exception("Failed to fetch JSON");
    }

    $json = json_decode($json, true);

    if($json === null){

        throw new Exception("Failed to decode JSON");
    }

    $out = [
        "status" => "ok",
        "npt" => null,
        "video" => [],
        "author" => [],
        "livestream" => [],
        "playlist" => [],
        "reel" => []
    ];

    if(!isset($json["results"])){

        return $out;
    }

    // get NPT: keep the "next" link together with the proxy in use
    if(
        isset($json["next"]) &&
        $json["next"] !== null
    ){

        $out["npt"] =
            $this->backend->store(
                $json["next"],
                "videos",
                $proxy
            );
    }

    foreach($json["results"] as $video){

        $thumb = [
            "ratio" => null,
            "url" => null
        ];

        // use the first thumbnail size that is present, largest first
        foreach(["large", "medium", "small"] as $contender){

            if(isset($video["images"][$contender])){

                $thumb = [
                    "ratio" => "16:9",
                    "url" => $this->bingimg($video["images"][$contender])
                ];
                break;
            }
        }

        $out["video"][] = [
            "title" => $this->titledots($video["title"]),
            "description" => $this->titledots($video["description"]),
            "author" => [
                "name" =>
                    (
                        isset($video["uploader"]) &&
                        $video["uploader"] != ""
                    ) ?
                    $video["uploader"] : null,
                "url" => null,
                "avatar" => null
            ],
            "date" =>
                (
                    isset($video["published"]) &&
                    $video["published"] != ""
                ) ?
                strtotime($video["published"]) : null,
            "duration" =>
                (
                    isset($video["duration"]) &&
                    $video["duration"] != ""
                ) ?
                $this->hms2int($video["duration"]) : null,
            "views" =>
                isset($video["statistics"]["viewCount"]) ?
                (int)$video["statistics"]["viewCount"] : null,
            "thumb" => $thumb,
            "url" => $this->unshiturl($video["content"])
        ];
    }

    return $out;
}
$js_link, + [], + ddg::req_xhr + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + if(!isset($json["results"])){ + + return $out; + } + + // get NPT + if( + isset($json["next"]) && + $json["next"] !== null + ){ + + $out["npt"] = + $this->backend->store( + $json["next"], + "news", + $proxy + ); + } + + foreach($json["results"] as $news){ + + if( + isset($news["image"]) && + $news["image"] != "" + ){ + + $thumb = [ + "ratio" => "16:9", + "url" => $news["image"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["news"][] = [ + "title" => $news["title"], + "author" => + ( + isset($news["source"]) && + $news["source"] != "" + ) ? + $news["source"] : null, + "description" => + ( + isset($news["excerpt"]) && + $news["excerpt"] != "" + ) ? + $this->fuckhtml + ->getTextContent( + $news["excerpt"] + ) : null, + "date" => + isset($news["date"]) ? 
+ (int)$news["date"] : null, + "thumb" => $thumb, + "url" => $this->unshiturl($news["url"]) + ]; + } + + return $out; + } + + private function parse_rich_text($html){ + + $description = []; + + // pre-process the html, remove useless elements + $html = + strip_tags( + $html, + [ + "h1", "h2", "h3", "h4", "h5", "h6", "h7", + "pre", "code" + ] + ); + + $html = + preg_replace( + '/<(\/?)pre *[^>]*>\s*<\/?code *[^>]*>/i', + '<$1pre>', + $html + ); + + $this->fuckhtml->load($html); + + $tags = + $this->fuckhtml + ->getElementsByTagName( + "*" + ); + + if(count($tags) === 0){ + + $description[] = [ + "type" => "text", + "value" => + trim( + $this->fuckhtml + ->getTextContent( + $html, + true, + false + ) + ) + ]; + }else{ + + $start = 0; + $was_code_block = true; + foreach($tags as $tag){ + + $text = + $this->fuckhtml + ->getTextContent( + substr( + $html, + $start, + $tag["startPos"] - $start + ), + true, + false + ); + + if($was_code_block){ + + $text = ltrim($text); + $was_code_block = false; + } + + $description[] = [ + "type" => "text", + "value" => $text + ]; + + switch($tag["tagName"]){ + + case "pre": + $append = "code"; + $was_code_block = true; + $c = count($description) - 1; + $description[$c]["value"] = + rtrim($description[$c]["value"]); + break; + + case "code": + $append = "inline_code"; + $c = count($description) - 1; + $description[$c]["value"] = + rtrim($description[$c]["value"]) . 
/**
 * Remove eBay affiliate/tracking query parameters from a URL.
 *
 * URLs whose host (ignoring one leading subdomain of up to 3 characters,
 * ex: "www.") is not a known eBay domain are returned untouched.
 *
 * @param string $url URL to clean.
 * @return string The URL without the listed parameters. The remaining
 *                query is rebuilt with http_build_query(), so its encoding
 *                may be normalized. A "#fragment" is preserved.
 */
private function unshiturl($url){

    $host = parse_url($url, PHP_URL_HOST);

    if(!is_string($host)){

        // relative or malformed URL: no host to match against
        return $url;
    }

    // check for domains w/out first short subdomain (ex: www.)
    $domain =
        preg_replace(
            '/^[A-Za-z0-9]{1,3}\./',
            "",
            strtolower($host)
        );

    if(
        !in_array(
            $domain,
            [
                "ebay.com.au",
                "ebay.at",
                "ebay.ca",
                "ebay.fr",
                "ebay.de",
                "ebay.com.hk",
                "ebay.ie",
                "ebay.it",
                "ebay.com.my",
                "ebay.nl",
                "ebay.ph",
                "ebay.pl",
                "ebay.com.sg",
                "ebay.es",
                "ebay.ch",
                "ebay.co.uk",
                "cafr.ebay.ca",
                "ebay.com",
                "community.ebay.com",
                "pages.ebay.com"
            ],
            true
        )
    ){

        return $url;
    }

    $old_query = parse_url($url, PHP_URL_QUERY);

    if(
        !is_string($old_query) ||
        $old_query === ""
    ){

        // nothing to filter; only drop a dangling "?" like before
        return preg_replace('/\?$/', "", $url);
    }

    parse_str($old_query, $params);

    // remove ebay tracking elements
    foreach(
        [
            "mkevt", "mkcid", "mkrid", "campid", "customid",
            "toolid", "_sop", "_dcat", "epid", "oid"
        ] as $tracker
    ){

        unset($params[$tracker]);
    }

    $new_query = http_build_query($params);

    // The query starts at the first "?" of the URL. Splice the rebuilt
    // query in at that position so a trailing "#fragment" is kept.
    $pos = strpos($url, "?" . $old_query);

    if($pos === false){

        return $url;
    }

    return
        substr($url, 0, $pos) .
        ($new_query === "" ? "" : "?" . $new_query) .
        substr($url, $pos + 1 + strlen($old_query));
}
/**
 * Scale image dimensions so they fit a 474x474 box, keeping the aspect
 * ratio. Smaller images are scaled up to the box as well.
 *
 * @param int $width  Original width in pixels.
 * @param int $height Original height in pixels.
 * @return array [width, height] as floats (the original used floor(),
 *               which returns float). [0.0, 0.0] when either dimension
 *               is not positive, instead of dividing by zero.
 */
private function bingratio($width, $height){

    if(
        $width <= 0 ||
        $height <= 0
    ){

        // unknown/invalid size: a division here would throw on PHP 8
        return [0.0, 0.0];
    }

    // The larger side is pinned to exactly 474. The other side is computed
    // multiply-first so float round-off cannot floor an exact value down.
    if($width > $height){

        return [
            474.0,
            floor($height * 474 / $width)
        ];
    }

    return [
        floor($width * 474 / $height),
        474.0
    ];
}
$get; + }else{ + + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + $headers = [ + "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0", + "Accept: */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br", + "Content-Type: application/x-www-form-urlencoded", + "X-FB-Friendly-Name: SearchCometResultsPaginatedResultsQuery", + //"X-FB-LSD: AVptQC4a16c", + //"X-ASBD-ID: 129477", + "Content-Length: " . strlen($get), + "Origin: https://www.facebook.com", + "DNT: 1", + "Connection: keep-alive", + "Referer: https://www.facebook.com/watch/", + "Cookie: datr=__GMZCgwVF5BbyvAtfJojQwg; oo=v1%7C3%3A1691641171; wd=955x995", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin", + "TE: trailers" + ]; + + curl_setopt($curlproc, CURLOPT_POST, true); + curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get); + } + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->proxy->assign_proxy($curlproc); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function video($get){ + + $search = $get["s"]; + $npt = $get["npt"]; + + $this->out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + if($get["npt"]){ + + $nextpage = + json_decode( + $this->nextpage->get( + $npt, + "videos" + ), + true + ); + + // parse next page + $this->video_nextpage($nextpage); + + return $this->out; 
+ } + + // generate filter data + // { + // "rp_creation_time:0":"{\"name\":\"creation_time\",\"args\":\"{\\\"start_year\\\":\\\"2023\\\",\\\"start_month\\\":\\\"2023-08\\\",\\\"end_year\\\":\\\"2023\\\",\\\"end_month\\\":\\\"2023-08\\\",\\\"start_day\\\":\\\"2023-08-10\\\",\\\"end_day\\\":\\\"2023-08-10\\\"}\"}", + // "videos_sort_by:0":"{\"name\":\"videos_sort_by\",\"args\":\"Most Recent\"}", + // "videos_live:0":"{\"name\":\"videos_live\",\"args\":\"\"}" + // } + $filter = []; + $sort = $get["sort"]; + $live = $get["live"]; + $older = $get["older"]; + $newer = $get["newer"]; + + if( + $older !== false || + $newer !== false + ){ + + if($older === false){ + + $older = time(); + } + + if($newer === false){ + + $newer = 0; + } + + $filter["rp_creation_time:0"] = + json_encode( + [ + "name" => "creation_time", + "args" => + json_encode( + [ + "start_year" => date("Y", $newer), + "start_month" => date("Y-m", $newer), + "end_year" => date("Y", $older), + "end_month" => date("Y-m", $older), + "start_day" => date("Y-m-d", $newer), + "end_day" => date("Y-m-d", $older) + ] + ) + ] + ); + } + + if($sort != "relevance"){ + + $filter["videos_sort_by:0"] = + json_encode( + [ + "name" => "videos_sort_by", + "args" => "Most Recent" + ] + ); + } + + if($live != "no"){ + + $filter["videos_live:0"] = json_encode( + [ + "name" => "videos_live", + "args" => "" + ] + ); + } + + $req = [ + "q" => $search + ]; + + if(count($filter) !== 0){ + + $req["filters"] = + base64_encode( + json_encode( + $filter + ) + ); + } + /* + $html = + $this->get( + "https://www.facebook.com/watch/search/", + $req + );*/ + + $handle = fopen("scraper/facebook.html", "r"); + $html = fread($handle, filesize("scraper/facebook.html")); + fclose($handle); + + preg_match_all( + '/({"__bbox":.*,"sequence_number":0}})\]\]/', + $html, + $json + ); + + if(!isset($json[1][1])){ + + throw new Exception("Could not grep JSON body"); + } + + $json = json_decode($json[1][1], true); + + foreach( + $json + ["__bbox"] + 
["result"] + ["data"] + ["serpResponse"] + ["results"] + ["edges"] + as $result + ){ + + $this->parse_edge($result); + } + + // get nextpage data + if( + $json + ["__bbox"] + ["result"] + ["data"] + ["serpResponse"] + ["results"] + ["page_info"] + ["has_next_page"] + == 1 + ){ + + preg_match( + '/handleWithCustomApplyEach\(ScheduledApplyEach,({.*})\);}\);}\);<\/script>/', + $html, + $nextpagedata + ); + + // [POST] https://www.facebook.com/api/graphql/ + // FORM data, not JSON! + + $nextpage = [ + "av" => "0", + "__user" => null, + "__a" => null, + "__req" => "2", + "__hs" => null, + "dpr" => "1", + "__ccg" => null, + "__rev" => null, + // another client side token + "__s" => $this->randomstring(6) . ":" . $this->randomstring(6) . ":" . $this->randomstring(6), + "__hsi" => null, + // tracking fingerprint (probably generated using webgl) + "__dyn" => "7xeUmwlE7ibwKBWo2vwAxu13w8CewSwMwNw9G2S0im3y4o0B-q1ew65xO2O1Vw8G1Qw5Mx61vw9m1YwBgao6C0Mo5W3S7Udo5q4U2zxe2Gew9O222SUbEaU2eU5O0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w", + "__csr" => $this->randomstring(null), + "__comet_req" => null, + "lsd" => null, + "jazoest" => null, + "__spin_r" => null, + "__spin_b" => null, + "__spin_t" => null, + "fb_api_caller_class" => "RelayModern", + "fb_api_req_friendly_name" => "SearchCometResultsPaginatedResultsQuery", + "variables" => [ // this is json + "UFI2CommentsProvider_commentsKey" => "SearchCometResultsInitialResultsQuery", + "allow_streaming" => false, + "args" => [ + "callsite" => "comet:watch_search", + "config" => [ + "exact_match" => false, + "high_confidence_config" => null, + "intercept_config" => null, + "sts_disambiguation" => null, + "watch_config" => null + ], + "context" => [ + "bsid" => null, + "tsid" => null + ], + "experience" => [ + "encoded_server_defined_params" => null, + "fbid" => null, + "type" => "WATCH_TAB_GLOBAL" + ], + "filters" => [], + "text" => $search + ], + "count" => 5, + "cursor" => + $json + ["__bbox"] + ["result"] + ["data"] + ["serpResponse"] 
+ ["results"] + ["page_info"] + ["end_cursor"], + "displayCommentsContextEnableComment" => false, + "displayCommentsContextIsAdPreview" => false, + "displayCommentsContextIsAggregatedShare" => false, + "displayCommentsContextIsStorySet" => false, + "displayCommentsFeedbackContext" => null, + "feedLocation" => "SEARCH", + "feedbackSource" => 23, + "fetch_filters" => true, + "focusCommentID" => null, + "locale" => null, + "privacySelectorRenderLocation" => "COMET_STREAM", + "renderLocation" => "search_results_page", + "scale" => 1, + "stream_initial_count" => 0, + "useDefaultActor" => false, + "__relay_internal__pv__IsWorkUserrelayprovider" => false, + "__relay_internal__pv__IsMergQAPollsrelayprovider" => false, + "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider" => false, + "__relay_internal__pv__StoriesRingrelayprovider" => false + ], + "server_timestamps" => "true", + "doc_id" => "6761275837251607" // is actually dynamic + ]; + + // append filters to nextpage + foreach($filter as $key => $value){ + + $nextpage["variables"]["args"]["filters"][] = + $value; + } + + $nextpagedata = json_decode($nextpagedata[1], true); + + // get bsid + foreach($nextpagedata["require"] as $key){ + + foreach($key as $innerkey){ + + if(is_array($innerkey)){ + foreach($innerkey as $inner_innerkey){ + + if(is_array($inner_innerkey)){ + foreach($inner_innerkey as $inner_inner_innerkey){ + + if( + isset( + $inner_inner_innerkey + ["variables"] + ["args"] + ["context"] + ["bsid"] + ) + ){ + + $nextpage + ["variables"] + ["args"] + ["context"] + ["bsid"] = + $inner_inner_innerkey + ["variables"] + ["args"] + ["context"] + ["bsid"]; + } + } + } + } + } + } + } + + foreach($nextpagedata["define"] as $key){ + + if(isset($key[2]["haste_session"])){ + + $nextpage["__hs"] = $key[2]["haste_session"]; + } + + if(isset($key[2]["connectionClass"])){ + + $nextpage["__ccg"] = $key[2]["connectionClass"]; + } + + if(isset($key[2]["__spin_r"])){ + + $nextpage["__spin_r"] = 
/**
 * Request one page of video results from Facebook's GraphQL endpoint,
 * feed every returned edge to parse_edge() and, when another page is
 * announced, store the updated request as the next page token in
 * $this->out["npt"].
 *
 * @param array $nextpage  Form fields for the POST request; "variables"
 *                         is expected to be a JSON-encoded string.
 * @param bool  $getcursor Not used by this method.
 * @throws Exception When the response cannot be decoded as JSON.
 */
private function video_nextpage($nextpage, $getcursor = false){

    $response =
        json_decode(
            $this->get(
                "https://www.facebook.com/api/graphql/",
                $nextpage,
                self::post
            ),
            true
        );

    if($response === null){

        throw new Exception("Failed to decode next page JSON");
    }

    $results = $response["data"]["serpResponse"]["results"];

    foreach($results["edges"] as $edge){

        $this->parse_edge($edge);
    }

    if($results["page_info"]["has_next_page"] != 1){

        // last page: no token to store
        return;
    }

    // "variables" travels as a JSON string: decode, move the cursor
    // forward, encode again
    $variables = json_decode($nextpage["variables"], true);
    $variables["cursor"] = $results["page_info"]["end_cursor"];
    $nextpage["variables"] = json_encode($variables);

    // this value changes for the second call and is static afterwards
    // TODO: csr also updates to a longer string
    // TODO: "__s" should change on the third and 6th call
    $nextpage["__dyn"] = "7xeUmwlEnwn8K2WnFw9-2i5U4e0yoW3q322aew9G2S0zU20xi3y4o0B-q1ew65xOfxO1Vw8G11xmfz81s8hwGwQw9m1YwBgao6C2O0B85W3S7Udo5qfK0EUjwGzE2swwwJK2W2K0zK5o4q0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w";

    $this->out["npt"] =
        $this->nextpage->store(
            json_encode($nextpage),
            "videos"
        );
}
["video_owner_profile"] + ["uri_token"] + ) + ){ + + $profileurl = + "https://www.facebook.com/watch/" . + $edge + ["video_metadata_model"] + ["video_owner_profile"] + ["uri_token"]; + }else{ + + $profileurl = + $edge + ["video_metadata_model"] + ["video_owner_profile"] + ["url"]; + } + + $this->out[$append][] = [ + "title" => + $this->limitstrlen( + str_replace( + "\n", + " ", + $edge + ["video_metadata_model"] + ["title"] + ), + 100 + ), + "description" => + empty( + $edge + ["video_metadata_model"] + ["save_description"] + ) ? + null : + str_replace( + "\n", + " ", + $this->limitstrlen( + $edge + ["video_metadata_model"] + ["save_description"] + ) + ), + "author" => [ + "name" => + $edge + ["video_metadata_model"] + ["video_owner_profile"] + ["name"], + "url" => $profileurl, + "avatar" => null + ], + "date" => $timetext, + "duration" => $duration, + "views" => $views, + "thumb" => + [ + "url" => + $edge + ["video_thumbnail_model"] + ["thumbnail_image"] + ["uri"], + "ratio" => "16:9" + ], + "url" => + $url_prefix . 
/**
 * Build a random string.
 *
 * @param int|null $len Number of characters. When null, a length between
 *                      141 and 145 is picked and the larger alphabet
 *                      (lower case, upper case, digits 1-9 and "-") is used;
 *                      otherwise only lower case letters and digits 1-9.
 * @return string|null  The generated string, or null when $len is 0.
 */
private function randomstring($len){
	
	if($len === null){
		
		$str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789-";
		$len = rand(141, 145);
	}else{
		
		$str = "abcdefghijklmnopqrstuvwxyz123456789";
	}
	
	// derive the highest index from the alphabet itself so the two can
	// never drift apart
	$max = strlen($str) - 1;
	
	$out = null;
	for($i=0; $i<$len; $i++){
		
		$out .= $str[rand(0, $max)];
	}
	
	return $out;
}

/**
 * Shorten $text to roughly $len bytes without cutting a word in half.
 *
 * The text is word-wrapped at $len and only the first resulting line is
 * returned, so a newline already present in $text also ends the output.
 *
 * @param string $text
 * @param int    $len  Wrap width handed to wordwrap().
 * @return string
 */
private function limitstrlen($text, $len = 300){
	
	$lines = explode("\n", wordwrap($text, $len, "\n"));
	
	return $lines[0];
}

/**
 * Convert a "h:m:s", "m:s" or "s" duration string to seconds.
 *
 * @param string $time
 * @return int Number of seconds; non-numeric parts count as 0.
 */
private function hms2int($time){
	
	$seconds = 0;
	
	// every additional part shifts what was read so far up by one unit
	// (seconds -> minutes -> hours)
	foreach(explode(":", (string)$time, 3) as $part){
		
		$seconds = ($seconds * 60) + (int)$part;
	}
	
	return $seconds;
}

/**
 * Convert a truncated counter such as "1.5K views", "2.3M" or "123 views"
 * to an integer.
 *
 * Only the first space-separated token is read. "," is treated as a
 * thousands separator and "." as the decimal point. The suffixes k, m and b
 * (any case) multiply by a thousand, a million and a billion.
 *
 * @param string $number
 * @return int 0 when no number can be read.
 */
private function truncatedcount2int($number){
	
	// keep the counter, drop the trailing label ("1.5K views" => "1.5K")
	$number = explode(" ", trim((string)$number), 2)[0];
	$number = str_replace(",", "", $number);
	
	if($number === ""){
		
		return 0;
	}
	
	switch(strtolower(substr($number, -1))){
		
		case "k":
			$multiplier = 1000;
			break;
		
		case "m":
			$multiplier = 1000000;
			break;
		
		case "b":
			$multiplier = 1000000000;
			break;
		
		default:
			$multiplier = 1;
			break;
	}
	
	// the float cast reads the leading numeric part ("1.5K" => 1.5);
	// round() absorbs the binary representation error before truncating
	return (int)round((float)$number * $multiplier);
}
"display" => "Sort", + "option" => [ + "relevance" => "Relevance", + "pulse" => "Pulse", + "newest" => "Newest" + ] + ] + ]; + } + + private function get($proxy, $url, $get = [], $post_data = null){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($post_data === null){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: same-origin", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://500px.com/", + "content-type: application/json", + //"x-csrf-token: undefined", + "x-500px-source: Search", + "Content-Length: " . 
/**
 * Search 500px for photos.
 *
 * @param array $get Request parameters. Reads "npt" (next-page token),
 *                   "s" (search term) and "sort".
 * @return array     ["status" => "ok", "npt" => token|null, "image" => [...]]
 * @throws Exception On an empty search term, a failed request, undecodable
 *                   JSON, an API-reported error or a response without edges.
 */
public function image($get){
	
	if($get["npt"]){
		
		// continue a previous search: the token carries the GraphQL
		// variables, including the cursor
		[$pagination, $proxy] =
			$this->backend->get(
				$get["npt"], "images"
			);
		
		$pagination = json_decode($pagination, true);
		$search = $pagination["search"];
		
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		$pagination = [
			"sort" => strtoupper($get["sort"]),
			"search" => $search,
			"filters" => [],
			"nlp" => false,
		];
	}
	
	// the cursor variable may only be declared when it is actually sent
	$has_cursor = isset($pagination["cursor"]);
	
	try{
		
		$json =
			$this->get(
				$proxy,
				"https://api.500px.com/graphql",
				[],
				json_encode([
					"operationName" => "PhotoSearchPaginationContainerQuery",
					"variables" => $pagination,
					"query" =>
						'query PhotoSearchPaginationContainerQuery(' .
						($has_cursor ? '$cursor: String, ' : "") .
						'$sort: PhotoSort, $search: String!, $filters: [PhotoSearchFilter!], $nlp: Boolean) { ...PhotoSearchPaginationContainer_query_1vzAZD} fragment PhotoSearchPaginationContainer_query_1vzAZD on Query { photoSearch(sort: $sort, first: 100, ' .
						($has_cursor ? 'after: $cursor, ' : "") .
						'search: $search, filters: $filters, nlp: $nlp) { edges { node { id legacyId canonicalPath name description width height images(sizes: [33, 36]) { size url id } } } totalCount pageInfo { endCursor hasNextPage } }}'
				])
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch graphQL object");
	}
	
	$json = json_decode($json, true);
	
	if($json === null){
		
		throw new Exception("Failed to decode graphQL object");
	}
	
	if(isset($json["errors"][0]["message"])){
		
		throw new Exception("500px returned an API error: " . $json["errors"][0]["message"]);
	}
	
	if(!isset($json["data"]["photoSearch"]["edges"])){
		
		throw new Exception("No edges returned by API");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	foreach($json["data"]["photoSearch"]["edges"] as $image){
		
		if(!isset($image["node"])){
			
			continue;
		}
		
		$image = $image["node"];
		
		// both requested renditions are needed to build the source list;
		// skip the entry instead of emitting broken URLs
		// NOTE(review): assumes images[0] is the small and images[1] the
		// large rendition, in the order requested by the query -- confirm
		if(
			!isset(
				$image["images"][0]["url"],
				$image["images"][1]["url"]
			)
		){
			
			continue;
		}
		
		$title =
			trim(
				$this->fuckhtml
				->getTextContent(
					$image["name"]
				) . ": " .
				$this->fuckhtml
				->getTextContent(
					$image["description"]
				)
				, " :"
			);
		
		$small = $this->image_ratio(600, $image["width"], $image["height"]);
		$large = $this->image_ratio(2048, $image["width"], $image["height"]);
		
		$out["image"][] = [
			"title" => $title,
			"source" => [
				[
					"url" => $image["images"][1]["url"],
					"width" => $large[0],
					"height" => $large[1]
				],
				[
					"url" => $image["images"][0]["url"],
					"width" => $small[0],
					"height" => $small[1]
				]
			],
			"url" => "https://500px.com" . $image["canonicalPath"]
		];
	}
	
	// get NPT token
	if(($json["data"]["photoSearch"]["pageInfo"]["hasNextPage"] ?? false) === true){
		
		$out["npt"] =
			$this->backend->store(
				json_encode([
					"cursor" => $json["data"]["photoSearch"]["pageInfo"]["endCursor"],
					"search" => $search,
					"sort" => $pagination["sort"],
					"filters" => [],
					"nlp" => false
				]),
				"images",
				$proxy
			);
	}
	
	return $out;
}

/**
 * Scale $width x $height so that the longest edge equals $longest_edge,
 * keeping the aspect ratio.
 *
 * @param int|float $longest_edge Target size of the longest edge.
 * @param int|float $width        Original width.
 * @param int|float $height       Original height.
 * @return array [width, height] as integers, or [null, null] when either
 *               original dimension is missing or not positive.
 */
private function image_ratio($longest_edge, $width, $height){
	
	// a missing or zero dimension would otherwise be a division by zero
	if(
		!is_numeric($width) ||
		!is_numeric($height) ||
		$width <= 0 ||
		$height <= 0
	){
		
		return [null, null];
	}
	
	// the smaller factor belongs to the longer edge
	$ratio =
		min(
			$longest_edge / $width,
			$longest_edge / $height
		);
	
	return [
		(int)floor($width * $ratio),
		(int)floor($height * $ratio)
	];
}
& white", + ] + ], + "style" => [ // styles= + "display" => "Style", + "option" => [ + "any" => "Any style", + "depthoffield" => "Depth of field", + "minimalism" => "Minimalism", + "pattern" => "Patterns" + ] + ], + "license" => [ + "display" => "License", + "option" => [ + "any" => "Any license", + "1,2,3,4,5,6,9,11,12,13,14,15,16" => "All creative commons", + "4,5,6,9,10,11,12,13" => "Commercial use allowed", + "1,2,4,5,9,10,11,12,14,15" => "Modifications allowed", + "4,5,9,10,11,12" => "Commercial use & mods allowed", + "7,9,10" => "No known copyright restrictions", + "8" => "U.S Government works" + ] + ] + ]; + } + + private function get($proxy, $url, $get = [], $reqtype){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($reqtype === flickr::req_web){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: same-origin", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . 
/**
 * Search flickr for images by scraping the HTML search page and reading the
 * "modelExport" JSON object embedded in it.
 *
 * @param array $get Request parameters. Reads "npt", "s", "sort", "style",
 *                   "color", "license" and "nsfw".
 * @return array     ["status" => "ok", "npt" => token|null, "image" => [...]]
 * @throws Exception On an empty search term, when the embedded JSON cannot
 *                   be located or decoded, or when it lacks the photo list.
 */
public function image($get){
	
	if($get["npt"]){
		
		[$filters, $proxy] =
			$this->backend->get(
				$get["npt"], "images"
			);
		
		$filters = json_decode($filters, true);
		
		// Workaround for the future, if flickr deprecates &page argument on html page
		/*
		try{
			
			$json =
				$this->get(
					$proxy,
					"https://api.flickr.com/services/rest",
					[
						"sort" => $data["sort"],
						"parse_tags" => 1,
						// url_s,url_n,url_w,url_m,url_z,url_c,url_l,url_h,url_k,url_3k,url_4k,url_5k,url_6k,url_o
						"extras" => "can_comment,can_print,count_comments,count_faves,description,isfavorite,license,media,needs_interstitial,owner_name,path_alias,realname,rotation,url_sq,url_q,url_t,url_s,url_n,url_w,url_m,url_z,url_c,url_l",
						"per_page" => 100,
						"page" => $data["page"],
						"lang" => "en-US",
						"text" => $data["search"],
						"viewerNSID" => "",
						"method" => "flickr.photos.search",
						"csrf" => "",
						"api_key" => $data["api_key"],
						"format" => "json",
						"hermes" => 1,
						"hermesClient" => 1,
						"reqId" => $data["reqId"],
						"nojsoncallback" => 1
					]
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch JSON");
		}*/
		
	}else{
		
		if(strlen($get["s"]) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		
		// compute filters
		// the search text is stored with the filters so that follow-up
		// pages do not depend on "s" being sent again with the token
		$filters = [
			"page" => 1,
			"sort" => $get["sort"],
			"text" => $get["s"]
		];
		
		if($get["style"] != "any"){
			
			$filters["styles"] = $get["style"];
		}
		
		if($get["color"] != "any"){
			
			if($get["color"] != "blackandwhite"){
				
				$filters["color_codes"] = $get["color"];
			}else{
				
				// black & white is a style on flickr's side, not a color code
				$filters["styles"] = "blackandwhite";
			}
		}
		
		if($get["license"] != "any"){
			
			$filters["license"] = $get["license"];
		}
		
		switch($get["nsfw"]){
			
			case "yes": $filters["safe_search"] = 0; break;
			case "maybe": $filters["safe_search"] = 2; break;
			case "no": $filters["safe_search"] = 1; break;
		}
	}
	
	// "text" falls back to the request for tokens created before the
	// search text was stored; array_merge lets the stored value win
	$get_params = [
		"text" => $get["s"],
		"per_page" => 50,
		// scrape highest resolution
		"extras" => "url_s,url_n,url_w,url_m,url_z,url_c,url_l,url_h,url_k,url_3k,url_4k,url_5k,url_6k,url_o",
		"view_all" => 1
	];
	
	$get_params = array_merge($get_params, $filters);
	
	$html =
		$this->get(
			$proxy,
			"https://www.flickr.com/search/",
			$get_params,
			flickr::req_web
		);
	
	// @TODO
	// get api_key and reqId, if flickr deprecates &page
	
	$this->fuckhtml->load($html);
	
	//
	// get response JSON
	//
	$scripts =
		$this->fuckhtml
		->getElementsByClassName(
			"modelExport",
			"script"
		);
	
	$json = null;
	foreach($scripts as $script){
		
		$parts =
			preg_split(
				'/modelExport: ?/',
				$script["innerHTML"],
				2
			);
		
		// preg_split always returns at least one element; only a second
		// element proves the marker was present in this script
		if(
			$parts !== false &&
			count($parts) === 2
		){
			
			$json = $parts[1];
			break;
		}
	}
	
	if($json === null){
		
		throw new Exception("Failed to grep JSON");
	}
	
	$json =
		json_decode(
			$this->fuckhtml
			->extract_json(
				$json
			),
			true
		);
	
	if($json === null){
		
		throw new Exception("Failed to decode JSON");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	if(!isset($json["main"]["search-photos-lite-models"][0]["data"]["photos"]["data"]["_data"])){
		
		throw new Exception("Failed to access data object");
	}
	
	$photos = $json["main"]["search-photos-lite-models"][0]["data"]["photos"]["data"];
	
	foreach($photos["_data"] as $image){
		
		if(!isset($image["data"])){
			
			// flickr likes to gives us empty array objects
			continue;
		}
		
		$image = $image["data"];
		
		$sizes = $image["sizes"]["data"] ?? [];
		
		if(
			!is_array($sizes) ||
			count($sizes) === 0
		){
			
			// nothing to link to
			continue;
		}
		
		$title = [];
		
		if(isset($image["title"])){
			
			$title[] =
				$this->fuckhtml
				->getTextContent(
					$image["title"]
				);
		}
		
		if(isset($image["description"])){
			
			$title[] =
				$this->fuckhtml
				->getTextContent(
					str_replace(
						"\n",
						" ",
						$image["description"]
					)
				);
		}
		
		$title = implode(": ", $title);
		
		$sources = array_values($sizes);
		
		// thumbnail: first preferred size that exists, else the first
		// size in the list
		$thumb = $sources[0]["data"];
		foreach(["n", "m", "w", "s"] as $testing_size){
			
			if(isset($sizes[$testing_size])){
				
				$thumb = $sizes[$testing_size]["data"];
				break;
			}
		}
		
		// full size: the last size in the list
		$og = $sources[count($sources) - 1]["data"];
		
		$out["image"][] = [
			"title" => $title,
			"source" => [
				[
					"url" => "https:" . $og["displayUrl"],
					"width" => (int)$og["width"],
					"height" => (int)$og["height"]
				],
				[
					"url" => "https:" . $thumb["displayUrl"],
					"width" => (int)$thumb["width"],
					"height" => (int)$thumb["height"]
				]
			],
			"url" => "https://www.flickr.com/photos/" . $image["ownerNsid"] . "/" . $image["id"] . "/"
		];
	}
	
	$total_items = (int)($photos["totalItems"] ?? 0);
	
	// 50 results per page, see per_page above
	if(($filters["page"]) * 50 < $total_items){
		
		$filters["page"]++;
		
		$out["npt"] =
			$this->backend->store(
				json_encode($filters),
				"images",
				$proxy
			);
	}
	
	return $out;
}
/**
 * Search findthatmeme.com.
 *
 * Pagination is offset based: the next-page token stores the search term
 * and the number of items already returned.
 *
 * @param array $get Request parameters. Reads "npt" and "s".
 * @return array     ["status" => "ok", "npt" => token|null, "image" => [...]]
 * @throws Exception On an empty search term, a failed request or a response
 *                   that does not decode to a JSON array.
 */
public function image($get){
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	if($get["npt"]){
		
		[$data, $proxy] = $this->backend->get($get["npt"], "images");
		$data = json_decode($data, true);
		
		$count = $data["count"];
		$search = $data["search"];
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$count = 0;
		$proxy = $this->backend->get_ip();
	}
	
	try{
		$json =
			json_decode(
				$this->get(
					$proxy,
					"https://findthatmeme.com/api/v1/search",
					$search,
					$count
				),
				true
			);
		
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch JSON");
	}
	
	// also rejects scalar payloads, which cannot be iterated below
	if(!is_array($json)){
		
		throw new Exception("Failed to decode JSON");
	}
	
	$received = 0;
	
	foreach($json as $item){
		
		if(!is_array($item)){
			
			continue;
		}
		
		$received++;
		
		if($item["type"] == "VIDEO"){
			
			$thumb = "thumb/" . $item["thumbnail"];
		}else{
			
			$thumb = $item["image_path"];
		}
		
		$out["image"][] = [
			"title" => date("jS \of F Y @ g:ia", strtotime($item["created_at"])),
			"source" => [
				[
					"url" =>
						"https://s3.thehackerblog.com/findthatmeme/" .
						$thumb,
					"width" => null,
					"height" => null
				]
			],
			"url" => $item["source_page_url"]
		];
	}
	
	// an empty page means the end of the results: offering another page
	// would only repeat the same empty request forever
	if($received !== 0){
		
		$out["npt"] =
			$this->backend->store(
				json_encode([
					"count" => $count + $received,
					"search" => $search
				]),
				"images",
				$proxy
			);
	}
	
	return $out;
}
/**
 * Search ghosterysearch.com and parse the HTML result page.
 *
 * @param array $get Request parameters. Reads "npt", "s" and "country".
 * @return array     Result set with "web" entries and an "npt" token when
 *                   the page has a pagination link.
 * @throws Exception When the page cannot be fetched or has no result
 *                   section.
 */
public function web($get){
	
	if($get["npt"]){
		
		[$query, $proxy] = $this->backend->get($get["npt"], "web");
		
		parse_str($query, $query);
		
		// country travels inside the token as "c" and is not part of the
		// upstream query string
		$country = $query["c"];
		unset($query["c"]);
		
		$query = http_build_query($query);
		
		try{
			
			$html =
				$this->get(
					$proxy,
					"https://ghosterysearch.com/search?" . $query,
					[],
					$country
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch search page");
		}
	}else{
		
		$proxy = $this->backend->get_ip();
		$country = $get["country"];
		
		try{
			
			$html =
				$this->get(
					$proxy,
					"https://ghosterysearch.com/search",
					[
						"q" => $get["s"]
					],
					$country
				);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch search page");
		}
	}
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	$this->fuckhtml->load($html);
	
	$results_wrapper =
		$this->fuckhtml
		->getElementsByClassName(
			"results",
			"section"
		);
	
	if(count($results_wrapper) === 0){
		
		throw new Exception("Failed to grep result section");
	}
	
	$this->fuckhtml->load($results_wrapper[0]);
	
	// get search results
	$results =
		$this->fuckhtml
		->getElementsByClassName(
			"result",
			"li"
		);
	
	if(count($results) === 0){
		
		return $out;
	}
	
	foreach($results as $result){
		
		$this->fuckhtml->load($result);
		
		$a =
			$this->fuckhtml
			->getElementsByClassName(
				"url",
				"a"
			);
		
		$h2 =
			$this->fuckhtml
			->getElementsByTagName(
				"h2"
			);
		
		// a result without link or heading cannot be displayed
		if(
			count($a) === 0 ||
			count($h2) === 0
		){
			
			continue;
		}
		
		$p =
			$this->fuckhtml
			->getElementsByTagName(
				"p"
			);
		
		$out["web"][] = [
			"title" =>
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$h2[0]
					)
				),
			"description" =>
				count($p) === 0 ?
				null :
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$p[0]
					)
				),
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$a[0]
					["attributes"]
					["href"]
				),
			"date" => null,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => [],
			"table" => []
		];
	}
	
	$this->fuckhtml->load($html);
	
	// get pagination token
	$pagination_wrapper =
		$this->fuckhtml
		->getElementsByClassName(
			"pagination",
			"div"
		);
	
	if(count($pagination_wrapper) !== 0){
		
		// found next page!
		$this->fuckhtml->load($pagination_wrapper[0]);
		
		$a =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		
		if(count($a) !== 0){
			
			// NOTE(review): the last link is taken to be the "next" link
			// -- confirm it is not "previous" on the final page
			$q =
				parse_url(
					$this->fuckhtml
					->getTextContent(
						$a[count($a) - 1]
						["attributes"]
						["href"]
					),
					PHP_URL_QUERY
				);
			
			if(
				is_string($q) &&
				$q !== ""
			){
				
				// carry over the country this page was fetched with, not
				// the request default, so it survives past the second page
				$out["npt"] =
					$this->backend
					->store(
						$q . "&c=" . $country,
						"web",
						$proxy
					);
			}
		}
	}
	
	return $out;
}

/**
 * Strip leading and trailing whitespace, NUL bytes, dots and ellipsis
 * characters from a title or description.
 *
 * trim() works on single bytes, so listing the three-byte "…" there also
 * strips those bytes off unrelated UTF-8 characters at the string edges.
 * The characters are matched as whole code points instead.
 *
 * @param string $title
 * @return string
 */
private function titledots($title){
	
	$title = (string)$title;
	
	$clean =
		preg_replace(
			'/^[ .\t\n\r\x00\x0B\x{2026}]+|[ .\t\n\r\x00\x0B\x{2026}]+\z/u',
			"",
			$title
		);
	
	if($clean === null){
		
		// not valid UTF-8: fall back to the single-byte characters only
		return trim($title, " .\t\n\r\0\x0B");
	}
	
	return $clean;
}
"Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Cote D'ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czech Republic", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran, Islamic Republic", + "iq" => "Iraq", + "ie" => "Ireland", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => 
"Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", + "mc" => "Monaco", + "mn" => "Mongolia", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Reunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "cs" => "Serbia and Montenegro", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South 
Africa", + "gs" => "South Georgia and the South Sandwich Islands", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "uk" => "United Kingdom", + "us" => "United States", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // safe=active + "no" => "No" // safe=off + ] + ] + ]; + + switch($page){ + + case "web": + return array_merge( + $base, + [ + "lang" => [ // lr=<lang> (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" 
=> "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ], + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "spellcheck" => [ + "display" => "Spellcheck", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ] + ); + break; + + case "images": + return array_merge( + $base, + [ + "time" => [ // tbs=qdr:<time> + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year" + ] + ], + "size" => [ // imgsz + "display" => "Size", + "option" => [ + "any" => "Any size", + "l" => "Large", + "m" => "Medium", + "i" => "Icon", + "qsvga" => "Larger than 400x300", + "vga" => "Larger than 640x480", + "svga" => "Larger than 800x600", + "xga" => "Larger than 1024x768", + "2mp" => "Larger than 2MP", + "4mp" => "Larger than 4MP", + "6mp" => "Larger than 6MP", + "8mp" => "Larger than 8MP", + "10mp" => "Larger than 10MP", + "12mp" => "Larger than 12MP", + "15mp" => "Larger than 15MP", + "20mp" => "Larger than 20MP", + "40mp" => "Larger than 40MP", + "70mp" => "Larger than 70MP" + ] + ], + "ratio" => [ // imgar + "display" => "Aspect ratio", + "option" => [ + "any" => "Any ratio", + "t|xt" => "Tall", + "s" => "Square", + "w" => "Wide", + "xw" => "Panoramic" + ] + ], + "color" => [ // imgc + "display" => "Color", + "option" => [ + "any" => "Any color", + "color" => "Full color", + "bnw" => "Black & white", + "trans" => "Transparent", + // from here, imgcolor + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "green" => "Green", + "teal" => "Teal", + "blue" => "Blue", + "purple" => "Purple", + "pink" => "Pink", + "white" => "White", + "gray" => "Gray", + "black" => "Black", + "brown" 
=> "Brown" + ] + ], + "type" => [ // tbs=itp:<type> + "display" => "Type", + "option" => [ + "any" => "Any type", + "clipart" => "Clip Art", + "lineart" => "Line Drawing", + "animated" => "Animated" + ] + ], + "format" => [ // as_filetype + "display" => "Format", + "option" => [ + "any" => "Any format", + "jpg" => "JPG", + "gif" => "GIF", + "png" => "PNG", + "bmp" => "BMP", + "svg" => "SVG", + "webp" => "WEBP", + "ico" => "ICO", + "craw" => "RAW" + ] + ], + "rights" => [ // tbs=sur:<rights> + "display" => "Usage rights", + "option" => [ + "any" => "Any license", + "cl" => "Creative Commons licenses", + "ol" => "Commercial & other licenses" + ] + ] + ] + ); + break; + + case "videos": + return array_merge( + $base, + [ + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "duration" => [ + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "s" => "Short (0-4min)", // tbs=dur:s + "m" => "Medium (4-20min)", // tbs=dur:m + "l" => "Long (20+ min)" // tbs=dur:l + ] + ], + "quality" => [ + "display" => "Quality", + "option" => [ + "any" => "Any quality", + "h" => "High quality" // tbs=hq:h + ] + ], + "captions" => [ + "display" => "Captions", + "option" => [ + "any" => "No preference", + "yes" => "Closed captioned" // tbs=cc:1 + ] + ] + ] + ); + break; + + case "news": + return array_merge( + $base, + [ + "newer" => [ // tbs + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ], + "sort" => [ + "display" => "Sort", + "option" => [ + "relevance" => "Relevance", + "date" => "Date" // sbd:1 + ] + ] + ] + ); + break; + } + } + + private function get($proxy, $url, $get = [], $use_lynx = false){ + + $curlproc = curl_init(); + + if($use_lynx === false){ + + $headers = [ + "User-Agent: " . 
config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				//"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1",
				"Priority: u=1",
				"TE: trailers"
			];

			// use http2
			curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		}else{

			// header set that identifies the client as the Lynx text browser
			$headers = [
				"Accept: text/html, text/plain, text/sgml, */*;q=0.01",
				"Accept-Encoding: gzip, compress, bzip2",
				"Accept-Language: en",
				"User-Agent: Lynx/2.9.0dev.12 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/3.7.8"
			];
		}

		// append the query string only when parameters were supplied, so a
		// caller can pass a URL that already carries its own query
		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}

		curl_setopt($curlproc, CURLOPT_URL, $url);

		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

		// follow redirects
		curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true);

		$this->backend->assign_proxy($curlproc, $proxy);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){

			// read the message first, then release the handle: the original
			// threw straight away and never reached curl_close() on failure
			$error = curl_error($curlproc);
			curl_close($curlproc);

			throw new Exception($error);
		}

		curl_close($curlproc);

		if($use_lynx){

			// the Lynx-style response is treated as ISO-8859-1 and re-encoded
			return mb_convert_encoding($data, "UTF-8", "ISO-8859-1");
		}

		return $data;
	}


	private function scrape_dimg($html){

		// get images loaded through javascript
		$this->dimg = [];

		preg_match_all(
			'/function\(\){google\.ldi=({.*?});/',
			$html,
			$dimg
		);

		if(isset($dimg[1])){

			foreach($dimg[1] as $i){

				$tmp =
json_decode($i, true);

				// the capture is a JavaScript object literal, which is not
				// guaranteed to be valid JSON; json_decode() then yields null
				// and must not be handed to foreach
				if(!is_array($tmp)){

					continue;
				}

				foreach($tmp as $key => $value){

					// unshit_thumb() parses its argument as a URL string
					if(!is_string($value)){

						continue;
					}

					$this->dimg[$key] =
						$this->unshit_thumb(
							$value
						);
				}
			}
		}

		// get additional javascript base64 images
		preg_match_all(
			'/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/',
			$html,
			$dimg
		);

		if(isset($dimg[1])){

			foreach($dimg[1] as $i => $data_uri){

				$string =
					$this->fuckhtml
					->parseJsString(
						$data_uri
					);

				// one data URI is shared by every quoted id listed next to it
				foreach(explode(",", $dimg[2][$i]) as $delim){

					$this->dimg[trim($delim, "'")] = $string;
				}
			}
		}
	}


	/**
	 * Fill $this->image_arr from the image tuples embedded in the page source.
	 *
	 * Each entry is keyed by the first captured string and holds a list of
	 * sources (url, width, height): the original image followed by its
	 * thumbnail, or a single entry when the original is an "x-raw-image" URL.
	 * In each captured pair the first number is used as the height and the
	 * second as the width.
	 */
	private function scrape_imagearr($html){

		// get image links arrays
		preg_match_all(
			'/\[[0-9]+,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/',
			$html,
			$image_arr
		);

		$this->image_arr = [];

		if(!isset($image_arr[1])){

			return;
		}

		foreach($image_arr[1] as $i => $id){

			$original =
				$this->fuckhtml
				->parseJsString(
					$image_arr[5][$i]
				);

			$thumb_url =
				$this->unshit_thumb(
					$this->fuckhtml
					->parseJsString(
						$image_arr[2][$i]
					)
				);

			if(preg_match('/^x-raw-image/', $original)){

				// only add thumbnail, google doesnt have OG resolution
				$this->image_arr[$id] = [
					[
						"url" => $thumb_url,
						"width" => (int)$image_arr[7][$i], // pass the OG image width & height
						"height" => (int)$image_arr[6][$i]
					]
				];

				continue;
			}

			$this->image_arr[$id] = [
				[
					"url" => $original,
					"width" => (int)$image_arr[7][$i],
					"height" => (int)$image_arr[6][$i]
				],
				[
					"url" => $thumb_url,
					"width" => (int)$image_arr[4][$i],
					"height" => (int)$image_arr[3][$i]
				]
			];
		}
	}


	private function getdimg($dimg){

		return isset($this->dimg[$dimg]) ?
$this->dimg[$dimg] : null;
	}


	/**
	 * Reduce a gstatic thumbnail URL to its host plus the "q" parameter.
	 *
	 * URLs whose host does not match the tbn*.gstatic.com pattern, or that
	 * carry no usable "q" parameter, are returned unchanged.
	 *
	 * @param string $url Thumbnail URL
	 * @return string
	 */
	private function unshit_thumb($url){
		// https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj
		// https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA

		$parts = parse_url($url);

		// "query" is absent from parse_url()'s result when the URL has no
		// query string; the original read it unconditionally
		if(
			isset($parts["host"], $parts["query"]) &&
			preg_match(
				'/tbn.*\.gstatic\.com/',
				$parts["host"]
			)
		){

			parse_str($parts["query"], $params);

			// parse_str() produces an array for "q[]=..." style input
			if(
				isset($params["q"]) &&
				is_string($params["q"])
			){

				return "https://" . $parts["host"] . "/images?q=" . $params["q"];
			}
		}

		return $url;
	}


	/**
	 * Parse every <style> element of the loaded document.
	 *
	 * Populates $this->styles as selector => [property => lowercased value],
	 * with the number of declarations stored under "_c", and
	 * $this->css_colors as value => lowercased property name taken from the
	 * ":root" rule.
	 */
	private function parsestyles(){

		$raw_styles = "";

		foreach(
			$this->fuckhtml->getElementsByTagName("style")
			as $style
		){

			$raw_styles .= $style["innerHTML"];
		}

		// filter out media/keyframe queries
		$raw_styles =
			preg_replace(
				'/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/',
				"",
				$raw_styles
			);

		// get styles
		preg_match_all(
			'/(.+?){([\S\s]*?)}/',
			$raw_styles,
			$matches
		);

		$styles = [];

		foreach($matches[1] as $i => $selectors){

			// get style values
			preg_match_all(
				'/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/',
				$matches[2][$i],
				$values_regex
			);

			$values = [];
			foreach($values_regex[1] as $k => $property){

				$values[trim($property)] =
					strtolower(trim($values_regex[2][$k]));
			}

			// h1,h2,h3 will each get their own array index
			foreach(explode(",", $selectors) as $name){

				$name = trim($name, "}\t\n\r\0\x0B");

				// merged one by one so a selector that appears in several
				// rules accumulates all of their declarations
				foreach($values as $key => $value){

					$styles[$name][$key] = $value;
				}
			}
		}

		foreach($styles as $key => $values){

			$styles[$key]["_c"] = count($values);
		}

		$this->styles = $styles;

		// get CSS colors
		$this->css_colors = [];

		if(isset($this->styles[":root"])){

			foreach($this->styles[":root"] as $key => $value){

				$this->css_colors[$value] = strtolower($key);
			}
		}
	}



	private function
getstyle($styles){

		// Resolve a set of CSS declarations to the class/id name of the first
		// parsed rule that declares exactly that set; false when none does.
		//
		// parsestyles() stores each rule's declaration count under "_c", so
		// adding the same key here turns the intersection into an exact match
		$wanted = count($styles);
		$styles["_c"] = $wanted;

		foreach($this->styles as $selector => $declarations){

			if(count(array_intersect_assoc($declarations, $styles)) !== $wanted + 1){

				continue;
			}

			// keep only the last space-separated part of the selector
			$segments = explode(" ", $selector);
			$last = $segments[count($segments) - 1];

			return
				ltrim(
					str_replace(
						[".", "#"],
						" ",
						$last
					)
				);
		}

		return false;
	}



	/**
	 * Look up the name recorded for a CSS value in the ":root" rule.
	 *
	 * @return string|null null when the value is unknown
	 */
	private function getcolorvar($color){

		return $this->css_colors[$color] ?? null;
	}



	public function web($get){

		if($get["npt"]){

			[$get, $proxy] = $this->backend->get($get["npt"], "web");

			try{
				$html =
					$this->get(
						$proxy,
						"https://www.google.com" . $get,
						[],
						true
					);
			}catch(Exception $error){

				throw new Exception("Failed to get HTML");
			}

		}else{
			$search = $get["s"];
			$country = $get["country"];
			$nsfw = $get["nsfw"];
			$lang = $get["lang"];
			$older = $get["older"];
			$newer = $get["newer"];
			$spellcheck = $get["spellcheck"];
			$proxy = $this->backend->get_ip();

			$offset = 0;

			$params = [
				"q" => $search,
				"hl" => "en",
				"num" => 20
			];

			// country
			if($country != "any"){

				$params["gl"] = $country;
			}

			// nsfw
			$params["safe"] = $nsfw == "yes" ? "off" : "active";

			// language
			if($lang != "any"){

				$params["lr"] = "lang_" . $lang;
			}

			// generate tbs
			$tbs = [];

			// get date
			$older = $older === false ? null : date("m/d/Y", $older);
			$newer = $newer === false ? null : date("m/d/Y", $newer);

			if(
				$older !== null ||
				$newer !== null
			){

				$tbs["cdr"] = "1";
				$tbs["cd_min"] = $newer;
				$tbs["cd_max"] = $older;
			}

			// spellcheck filter
			if($spellcheck == "no"){

				$params["nfpr"] = "1";
			}

			if(count($tbs) !== 0){

				$params["tbs"] = "";

				foreach($tbs as $key => $value){

					$params["tbs"] .= $key . ":" . $value .
","; + } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); + } + + //$html = file_get_contents("scraper/google.html"); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + $this->detect_sorry(); + + $this->parsestyles(); + + $boxes = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "border" => "thin solid #dadce0", + "padding" => "12px 16px 12px 16px", + "margin-bottom" => "10px", + "font-family" => "sans-serif" + ]), + "div" + ); + + $skip_next = false; + + // get next page token + $npt = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "border" => "thin solid #dadce0", + "color" => "#70757a", + "font-size" => "14px", + "text-align" => "center", + "table-layout" => "fixed", + "width" => "100%" + ]), + "table" + ); + + if(count($npt) !== 0){ + + $this->fuckhtml->load($npt[0]); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + foreach($as as $a){ + + $text = + $this->fuckhtml + ->getTextContent( + $a + ); + + if( + $text == "Next >" || + $text == ">" + ){ + + $out["npt"] = + $this->backend->store( + $this->fuckhtml + ->getTextContent( + $a["attributes"]["href"] + ), + "web", + $proxy + ); + } + } + + $this->fuckhtml->load($html); + } + + $first_box = true; + foreach($boxes as $box){ + + $this->fuckhtml->load($box); + + if($first_box){ + + // + // Probe for word correction + // + $first_box = false; + + $txt = + $this->fuckhtml + ->getTextContent($box); + + if( + preg_match( + '/^Showing results for /', + $txt + ) + ){ + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) 
=== 2){ + + $out["spelling"] = [ + "type" => "including", + "using" => + $this->fuckhtml + ->getTextContent( + $as[0] + ), + "correction" => + $this->fuckhtml + ->getTextContent( + $as[1] + ) + ]; + } + continue; + } + } + + // probe for custom container + $container_title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "font-weight" => "bold" + ]) + ); + + if(count($container_title) !== 0){ + + $container_title = + strtolower( + $this->fuckhtml + ->getTextContent( + $container_title[0] + ) + ); + + if($container_title == "images"){ + + // + // Parse image carousel + // + $images = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "display" => "inline-block", + "padding" => "2px", + "padding-bottom" => "4px" + ]), + "a" + ); + + foreach($images as $image){ + + $this->fuckhtml->load($image); + + $image_data = + $this->unshiturl( + $image["attributes"]["href"], + true + ); + + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + )[0]; + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $img["attributes"]["alt"] + ) + ), + "source" => [ + [ + "url" => $image_data["url"], + "width" => $image_data["image_width"], + "height" => $image_data["image_height"] + ], + [ + "url" => + $this->fuckhtml + ->getTextContent( + $img["attributes"]["src"] + ), + "width" => $image_data["thumb_width"], + "height" => $image_data["thumb_height"] + ] + ], + "url" => $image_data["ref"] + ]; + } + + continue; + } + + if( + $container_title == "related searches" || + $container_title == "people also search for" + ){ + + $as = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "13px", + "line-height" => "20px" + ]), + "span" + ); + + foreach($as as $a){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent( + $a + ); + } + continue; + } + } + + // probe for website link + $link = + $this->fuckhtml + ->getElementsByClassName( + 
$this->getstyle([ + "color" => "#1967d2", + "font-size" => "18px", + "line-height" => "24px" + ]), + "a" + ); + + if(count($link) !== 0){ + + // + // Parse search result + // + + $this->fuckhtml->load($link[0]); + + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2", + "font-size" => "18px", + "line-height" => "24px" + ]), + "span" + ); + + if(count($title) === 0){ + + continue; + } + + $this->fuckhtml->load($box); + + $sublinks = []; + $table = []; + + $categories = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "13px", + "line-height" => "20px" + ]), + "span" + ); + + $i = 0; + foreach($categories as $category){ + + $this->fuckhtml->load($category); + + // probe for sublinks + $subs = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2" + ]), + "a" + ); + + if(count($subs) !== 0){ + + foreach($subs as $sub){ + + $url = + $this->unshiturl( + $this->fuckhtml + ->getTextContent( + $sub["attributes"]["href"] + ) + ); + + if( + preg_match( + '/^https?:\/\//', + $url + ) + ){ + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $sub + ) + ), + "description" => null, + "url" => + $this->unshiturl( + $this->fuckhtml + ->getTextContent( + $sub["attributes"]["href"] + ) + ), + "date" => null + ]; + } + } + + unset($categories[$i]); + } + + $i++; + } + + // get description & date + $date = null; + + $categories = array_values($categories); + + //print_r($categories); + + $c = count($categories) - 1; + + $description = + $this->fuckhtml + ->getTextContent( + $categories[$c] + ); + + // remove last category since we're done with it + unset($categories[$c]); + + // probe for date + $description_tmp = explode("·", $description, 2); + $date_tmp = strtotime(trim($description_tmp[0])); + + if( + count($description_tmp) === 2 && + strlen($description_tmp[0]) <= 20 && + $date_tmp !== false + ){ + + 
$description = + ltrim( + $this->titledots( + $description_tmp[1] + ) + ); + $date = $date_tmp; + }else{ + + $description = + $this->titledots( + $description + ); + } + + // remaining categories should all be greytext + if(count($categories) !== 0){ + + $texts = + explode( + "·", + preg_replace( + '/\s+/', + " ", + $this->fuckhtml + ->getTextContent( + $categories[0] + ) + ) + ); + + foreach($texts as $text){ + + $text = trim($text); + + if( + preg_match( + '/^Rating ([0-9.]+)(?: \(([0-9,]+)\))?/', + $text, + $rating + ) + ){ + + $table["Rating"] = $rating[1]; + if(isset($rating[2])){ + + $table["Rating"] .= " (" . $rating[2] . " votes)"; + } + + continue; + } + + if(stripos($text, "stock") !== false){ + + $table["Stock"] = $text; + continue; + } + } + } + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => $description, + "url" => + $this->unshiturl( + $link[0]["attributes"]["href"] + ), + "date" => $date, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublinks, + "table" => $table + ]; + + continue; + } + + // parse wikipedia heads + $wiki_title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "18px", + "line-height" => "24px" + ]), + "span" + ); + + if(count($wiki_title) !== 0){ + + $wiki_title = + $this->fuckhtml + ->getTextContent( + $wiki_title[0] + ); + + if($wiki_title == "See results about"){ + + // ignore + continue; + } + + if($wiki_title == "Top stories"){ + + // + // Parse news + // + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + foreach($tds as $td){ + + $this->fuckhtml->load($td); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($a) === 0){ + + continue; + } + + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2" + ]), + "span" + ); + + if(count($title) === 0){ + + continue; + 
} + + $date = null; + + $meta_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#70757a", + "font-size" => "13px", + "line-height" => "20px" + ]), + "span" + ); + + $meta_div = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $meta_div[count($meta_div) - 1] + ), + 2 + ); + + if(count($meta_div) === 2){ + + $date = strtotime($meta_div[count($meta_div) - 1]); + + if($date === false){ + + $date = null; + } + } + + $out["news"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => null, + "date" => $date, + "thumb" => [ + "url" => null, + "ratio" => null + ], + "url" => + $this->unshiturl( + $a[0]["attributes"]["href"] + ) + ]; + } + continue; + } + + // + // Parse wikipedia heads + // + + $table_div = + $this->fuckhtml + ->getElementsByTagName( + "table" + ); + + if(count($table_div) === 0){ + + continue; + } + + $this->fuckhtml->load($table_div[0]); + + // remove table from box + $box["innerHTML"] = + str_replace( + $table_div[0]["outerHTML"], + "", + $box["innerHTML"] + ); + + // find wiki image + $thumb = null; + + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if(count($img) !== 0){ + + $thumb = + $this->fuckhtml + ->getTextContent( + $img[0]["attributes"]["src"] + ); + } + + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + $description = []; + + foreach($tds as $td){ + + // probe for subtitle + $this->fuckhtml->load($td); + + $subtext = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#70757a", + "font-size" => "13px", + "line-height" => "20px" + ]) + ); + + if(count($subtext) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $subtext[0] + ) + ]; + break; + } + } + + $this->fuckhtml->load($box); + + // probe for word definition + $lists = + $this->fuckhtml + ->getElementsByTagName( + "ol" + ); + + if(count($lists) !== 0){ + + 
$description = []; + + foreach($lists as $list){ + + $box["innerHTML"] = + explode( + $list["outerHTML"], + $box["innerHTML"], + 2 + ); + + if( + count($box["innerHTML"]) === 1 || + trim($box["innerHTML"][0]) == "" + ){ + + break; + } + + $description[] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $box["innerHTML"][0] + ) + ]; + + $this->fuckhtml->load($list); + + $lis = + $this->fuckhtml + ->getElementsByTagName( + "li" + ); + + $increment = 1; + + foreach($lis as $li){ + + $this->fuckhtml->load($li); + + $list_items = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "13px", + "line-height" => "20px" + ]) + ); + + $first_item = true; + foreach($list_items as $it){ + + if($first_item){ + + $first_item = false; + $c = count($description); + + if( + $c !== 0 && + $description[$c - 1]["type"] == "text" + ){ + + $description[$c - 1]["value"] .= + "\n\n" . + $increment . ". " . $this->fuckhtml + ->getTextContent( + $it + ); + }else{ + + $description[] = [ + "type" => "text", + "value" => + $increment . ". " . 
$this->fuckhtml + ->getTextContent( + $it + ) + ]; + } + }else{ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $it + ) + ]; + } + + $increment++; + } + } + + $box["innerHTML"] = $box["innerHTML"][1]; + } + + $out["answer"][] = [ + "title" => $wiki_title, + "description" => $description, + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + continue; + } + + // get separator between description and facts + $separator = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "height" => "4px" + ]), + "div" + ); + + $box_html = []; + $table = []; + + if(count($separator) !== 0){ + + $box_html = + explode( + $separator[0]["outerHTML"], + $box["innerHTML"], + 2 + ); + + if(count($box_html) === 2){ + + $box["innerHTML"] = $box_html[0]; + } + + $this->fuckhtml->load($box_html[1]); + + // get all facts + $facts = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + foreach($facts as $fact){ + + if($fact["level"] !== 1){ continue; } + + $fact = + explode( + ":", + $this->fuckhtml + ->getTextContent( + $fact + ) + ); + + $table[trim(preg_replace('/\s+/', " ", $fact[0]))] = + trim(preg_replace('/\s+/', " ", $fact[1])); + } + + $this->fuckhtml->load($box); + } + + // remove wikipedia link + $wiki_link = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2" + ]), + "a" + ); + + $url = null; + if(count($wiki_link) !== 0){ + + foreach($wiki_link as $link){ + + if( + strtolower( + $this->fuckhtml + ->getTextContent( + $link + ) + ) == "wikipedia" + ){ + + $box["innerHTML"] = + str_replace( + $link["outerHTML"], + "", + $box["innerHTML"] + ); + + $url = + $this->unshiturl( + $link["attributes"]["href"] + ); + + $this->fuckhtml->load($box); + break; + } + } + } + + // remains of box should be description + $description[] = [ + "type" => "text", + "value" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $box + ) + ) + ]; + + $out["answer"][] 
= [ + "title" => $wiki_title, + "description" => $description, + "url" => $url, + "thumb" => $thumb, + "table" => $table, + "sublink" => [] + ]; + } + } + + return $out; + } + + + + public function video($get){ + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "video"); + $params = json_decode($params, true); + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $older = $get["older"]; + $newer = $get["newer"]; + $duration = $get["duration"]; + $quality = $get["quality"]; + $captions = $get["captions"]; + $proxy = $this->backend->get_ip(); + + $params = [ + "q" => $search, + "udm" => "7", + "hl" => "en", + "num" => 20 + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); + + if( + $older !== null || + $newer !== null + ){ + + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // duration + if($duration != "any"){ + + $tbs[] = "dur:" . $duration; + } + + // quality + if($quality != "any"){ + + $tbs[] = "hq:" . $quality; + } + + // captions + if($captions != "any"){ + + $tbs[] = "cc:" . 
($captions == "yes" ? "1" : $captions); // the filter table documents tbs=cc:1
			}

			// append tbs
			if(count($tbs) !== 0){

				// the date range is stored as key => value while the other
				// filters are pushed as ready-made "key:value" strings; a
				// plain implode() would drop the cdr/cd_min/cd_max keys
				$pairs = [];

				foreach($tbs as $key => $value){

					$pairs[] =
						is_string($key) ?
						$key . ":" . $value :
						$value;
				}

				$params["tbs"] = implode(",", $pairs);
			}
		}

		try{
			$html =
				$this->get(
					$proxy,
					"https://www.google.com/search",
					$params
				);
		}catch(Exception $error){

			throw new Exception("Failed to get HTML");
		}

		// advance the offset so the stored parameters describe the next page
		if(!isset($params["start"])){

			$params["start"] = 0;
		}
		$params["start"] += 20;

		$this->fuckhtml->load($html);

		//
		// Parse web video page
		//
		$this->detect_sorry();

		// parse all <style> tags
		$this->parsestyles();

		// get javascript images
		$this->scrape_dimg($html);

		$this->scrape_imagearr($html);

		// NOTE(review): the token is stored under "videos" while the top of
		// this method reads it back with "video" - confirm the backend treats
		// the two page names alike
		$out = [
			"status" => "ok",
			"npt" =>
				$this->backend->store(
					json_encode($params),
					"videos",
					$proxy
				),
			"video" => [],
			"author" => [],
			"livestream" => [],
			"playlist" => [],
			"reel" => []
		];

		$search_div =
			$this->fuckhtml
			->getElementById(
				"center_col"
			);

		if($search_div === false){

			throw new Exception("Failed to grep search div");
		}

		$this->fuckhtml->load($search_div);

		$results =
			$this->fuckhtml
			->getElementsByClassName(
				$this->getstyle([
					"margin" => "0px 0px 30px"
				]),
				"div"
			);

		foreach($results as $result){

			$this->fuckhtml->load($result);

			$url =
				$this->fuckhtml
				->getElementsByTagName(
					"a"
				);

			if(count($url) === 0){

				// no url, weird, continue
				continue;
			}

			$title =
				$this->fuckhtml
				->getElementsByTagName(
					"h3"
				);

			if(count($title) === 0){

				// no title, weird, continue
				continue;
			}

			// get description
			$description =
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle([
						"-webkit-box-orient" => "vertical",
						"display" => "-webkit-box",
						"-webkit-line-clamp" => "2",
						"overflow" => "hidden",
						"word-break" => "break-word"
					]),
					"div"
				);

			if(count($description) === 0){

				$description = null;
			}else{

				$description =
					html_entity_decode(
						$this->titledots(
							$this->fuckhtml
							->getTextContent(
								$description[0]
							)
						)
); + } + + // get author + date posted + $metadiv = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "margin-top" => "12px" + ]), + "div" + ); + + $author = null; + $date = null; + + if(count($metadiv) !== 0){ + + $metadiv = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $metadiv[0] + ) + ); + + if(count($metadiv) === 3){ + + $author = trim($metadiv[1]); + $date = strtotime(trim($metadiv[2])); + }elseif(count($metadiv) === 2){ + + $author = trim($metadiv[0]); + $date = strtotime(trim($metadiv[1])); + } + } + + $thumb = [ + "url" => null, + "ratio" => null + ]; + + $image = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + $duration = null; + + if( + count($image) !== 0 && + isset($image[0]["attributes"]["id"]) + ){ + + $thumb = [ + "url" => $this->getdimg($image[0]["attributes"]["id"]), + "ratio" => "16:9" + ]; + + // get duration + $duration = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "background-color" => "rgba(0,0,0,0.6)", + "color" => "#fff", + "fill" => "#fff" + ]) + ); + + if(count($duration) !== 0){ + + $duration = + $this->hms2int( + $this->fuckhtml + ->getTextContent( + $duration[0] + )); + }else{ + + $duration = null; + } + } + + $out["video"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => $description, + "author" => [ + "name" => $author, + "url" => null, + "avatar" => null + ], + "date" => $date, + "duration" => $duration, + "views" => null, + "thumb" => $thumb, + "url" => + $this->fuckhtml + ->getTextContent( + $url[0]["attributes"]["href"] + ) + ]; + } + + return $out; + } + + + + public function news($get){ + + if($get["npt"]){ + + [$req, $proxy] = $this->backend->get($get["npt"], "news"); + /*parse_str( + parse_url($req, PHP_URL_QUERY), + $search + );*/ + + try{ + + $html = + $this->get( + $proxy, + "https://www.google.com" . 
$req,
						[]
					);
			}catch(Exception $error){

				throw new Exception("Failed to get HTML");
			}

		}else{
			$search = $get["s"];
			$country = $get["country"];
			$nsfw = $get["nsfw"];
			$older = $get["older"];
			$newer = $get["newer"];
			$sort = $get["sort"];
			$proxy = $this->backend->get_ip();

			$params = [
				"q" => $search,
				"tbm" => "nws",
				"hl" => "en",
				"num" => "20"
			];

			// country
			if($country != "any"){

				$params["gl"] = $country;
			}

			// nsfw
			$params["safe"] = $nsfw == "yes" ? "off" : "active";

			$tbs = [];

			// get date
			$older = $older === false ? null : date("m/d/Y", $older);
			$newer = $newer === false ? null : date("m/d/Y", $newer);

			if(
				$older !== null ||
				$newer !== null
			){

				$tbs["cdr"] = "1";
				$tbs["cd_min"] = $newer;
				$tbs["cd_max"] = $older;
			}

			// relevance
			if($sort == "date"){

				$tbs["sbd"] = "1";
			}

			// append tbs
			if(count($tbs) !== 0){

				$params["tbs"] = "";

				foreach($tbs as $key => $value){

					$params["tbs"] .= $key . ":" . $value . ",";
				}

				$params["tbs"] = rtrim($params["tbs"], ",");
			}

			//$html = file_get_contents("scraper/google-news.html");

			// wrapped like every other fetch in this class so callers get the
			// same "Failed to get HTML" error on the first page as on later ones
			try{
				$html =
					$this->get(
						$proxy,
						"https://www.google.com/search",
						$params
					);
			}catch(Exception $error){

				throw new Exception("Failed to get HTML");
			}
		}

		$out = [
			"status" => "ok",
			"npt" => null,
			"news" => []
		];

		$this->fuckhtml->load($html);

		$this->detect_sorry();

		// get images
		$this->scrape_dimg($html);

		// parse styles
		$this->parsestyles();

		$center_col =
			$this->fuckhtml
			->getElementById(
				"center_col",
				"div"
			);

		// the other getElementById() call sites in this class test for false;
		// the original null-only comparison let a missing container through
		if(
			$center_col === false ||
			$center_col === null
		){

			throw new Exception("Could not grep result div");
		}

		$this->fuckhtml->load($center_col);

		// get next page
		$npt =
			$this->fuckhtml
			->getElementById(
				"pnnext",
				"a"
			);

		if($npt !== false){

			$out["npt"] =
				$this->backend->store(
					$this->fuckhtml
					->getTextContent(
						$npt["attributes"]
						["href"]
					),
					"news",
					$proxy
				);
		}

		$as =
			$this->fuckhtml
			->getElementsByAttributeName(
				"jsname",
				"a"
			);

		foreach($as as $a){

			$this->fuckhtml->load($a);

			// get title
			$title =
				$this->fuckhtml
				->getElementsByAttributeValue(
					"role",
					"heading",
					"div"
				);

			if(count($title) === 0){

				continue;
			}

			$title =
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$title[0]
					)
				);

			// get thumbnail
			$image =
				$this->fuckhtml
				->getElementsByAttributeName(
					"id",
					"img"
				);

			// check for padded title node, if found, we're inside a carousel
			$probe =
				$this->fuckhtml
				->getElementsByClassName(
					$this->getstyle(
						[
							"padding" => "16px 16px 40px 16px"
						]
					),
					"div"
				);

			$probe = count($probe) !== 0;

			if(
				count($image) !== 0 &&
				!isset($image[0]["attributes"]["width"])
			){

				$thumb = [
					"url" =>
						$this->getdimg(
							$image[0]["attributes"]["id"]
						),
					"ratio" => $probe === true ?
"16:9" : "1:1" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $description = null; + + if($probe === false){ + + $desc_divs = + $this->fuckhtml + ->getElementsByAttributeName( + "style", + "div" + ); + + foreach($desc_divs as $desc){ + + if( + strpos( + $desc["attributes"]["style"], + "margin-top:" + ) !== false + ){ + + $description = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $desc + ) + ); + break; + } + } + } + + // get author + $author = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "overflow" => "hidden", + "text-align" => "left", + "text-overflow" => "ellipsis", + "white-space" => "nowrap", + "margin-bottom" => "8px" + ] + ), + "div" + ); + + if(count($author) !== 0){ + + $author = + $this->fuckhtml + ->getTextContent( + $author[0] + ); + }else{ + + $author = null; + } + + // get date + $date = null; + + $date_div = + $this->fuckhtml + ->getElementsByAttributeName( + "style", + "div" + ); + + foreach($date_div as $d){ + + $this->fuckhtml->load($d); + + $span = + $this->fuckhtml + ->getElementsByTagName( + "span" + ); + + if( + strpos( + $d["attributes"]["style"], + "bottom:" + ) !== false + ){ + + $date = + strtotime( + $this->fuckhtml + ->getTextContent( + $span[count($span) - 1] + ) + ); + break; + } + } + + $out["news"][] = [ + "title" => $title, + "author" => $author, + "description" => $description, + "date" => $date, + "thumb" => $thumb, + "url" => + $this->unshiturl( + $a["attributes"] + ["href"] + ) + ]; + } + + return $out; + } + + + + + public function image($get){ + + // generate parameters + if($get["npt"]){ + + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "images" + ); + + $params = json_decode($params, true); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $time = $get["time"]; + $size = 
$get["size"]; + $ratio = $get["ratio"]; + $color = $get["color"]; + $type = $get["type"]; + $format = $get["format"]; + $rights = $get["rights"]; + + $params = [ + "q" => $search, + "udm" => "2" // get images + ]; + + // country (image search uses cr instead of gl) + if($country != "any"){ + + $params["cr"] = "country" . strtoupper($country); + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // generate tbs + $tbs = []; + + // time + if($time != "any"){ + + $tbs["qdr"] = $time; + } + + // size + if($size != "any"){ + + $params["imgsz"] = $size; + } + + // ratio + if($ratio != "any"){ + + $params["imgar"] = $ratio; + } + + // color + if($color != "any"){ + + if( + $color == "color" || + $color == "trans" + ){ + + $params["imgc"] = $color; + }elseif($color == "bnw"){ + + $params["imgc"] = "gray"; + }else{ + + $tbs["ic"] = "specific"; + $tbs["isc"] = $color; + } + } + + // type + if($type != "any"){ + + $tbs["itp"] = $type; + } + + // format + if($format != "any"){ + + $params["as_filetype"] = $format; + } + + // rights (tbs) + if($rights != "any"){ + + $tbs["sur"] = $rights; + } + + // append tbs + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ + + $params["tbs"] .= $key . ":" . $value . 
","; + } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + } + /* + $handle = fopen("scraper/page.html", "r"); + $html = fread($handle, filesize("scraper/page.html")); + fclose($handle);*/ + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get search page"); + } + + $this->fuckhtml->load($html); + + $this->detect_sorry(); + + // get javascript images + $this->scrape_imagearr($html); + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + $images = + $this->fuckhtml + ->getElementsByClassName( + "ivg-i", + "div" + ); + + foreach($images as $div){ + + $this->fuckhtml->load($div); + + $image = + $this->fuckhtml + ->getElementsByTagName("img")[0]; + + // make sure we dont attempt to show an image we dont have data for + if( + isset($div["attributes"]["data-docid"]) && + isset($this->image_arr[$div["attributes"]["data-docid"]]) + ){ + + $source = + $this->image_arr[ + $div["attributes"]["data-docid"] + ]; + }else{ + + continue; + } + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image["attributes"]["alt"] + ) + ), + "source" => $source, + "url" => + $this->fuckhtml + ->getTextContent( + $div["attributes"]["data-lpage"] + ) + ]; + } + + // as usual, no way to check if there is a next page reliably + if(count($out["image"]) > 50){ + + if(!isset($params["start"])){ + + $params["start"] = 10; + }else{ + + $params["start"] += 10; + } + + $out["npt"] = + $this->backend + ->store( + json_encode($params), + "image", + $proxy + ); + } + + return $out; + } + + private function unshiturl($url, $return_size = false){ + + // decode + $url = + $this->fuckhtml + ->getTextContent( + $url + ); + + $url_parts = parse_url($url); + + if(isset($url_parts["query"])){ + + parse_str($url_parts["query"], $query); + }else{ + + $query = []; + } + + if( + !isset( + $url_parts["host"] + ) || + stripos($url_parts["host"], 
"google.") !== false + ){ + + // no host, we have a tracking url + if(isset($query["imgurl"])){ + + $url = $query["imgurl"]; + } + elseif(isset($query["q"])){ + + $url = $query["q"]; + } + } + + // rewrite URLs to remove extra tracking parameters + $domain = parse_url($url, PHP_URL_HOST); + + if( + preg_match( + '/wikipedia.org$/', + $domain + ) + ){ + + // rewrite wikipedia mobile URLs to desktop + $url = + $this->replacedomain( + $url, + preg_replace( + '/([a-z0-9]+)(\.m\.)/', + '$1.', + $domain + ) + ); + } + + elseif( + preg_match( + '/imdb\.com$|youtube\.[^.]+$/', + $domain + ) + ){ + + // rewrite imdb and youtube mobile URLs too + $url = + $this->replacedomain( + $url, + preg_replace( + '/^m\./', + "", + $domain + ) + ); + + } + + elseif( + preg_match( + '/play\.google\.[^.]+$/', + $domain + ) + ){ + + // remove referrers from play.google.com + $u_query = parse_url($url, PHP_URL_QUERY); + if($u_query !== null){ + + parse_str($u_query, $u_query); + if(isset($u_query["referrer"])){ unset($u_query["referrer"]); } + if(isset($u_query["hl"])){ unset($u_query["hl"]); } + if(isset($u_query["gl"])){ unset($u_query["gl"]); } + + $query = http_build_query($query); + + $url = + str_replace( + $u_query, + $u_query, + $url + ); + } + } + + elseif( + preg_match( + '/twitter\.com$/', + $domain + ) + ){ + // remove more referrers from twitter.com + $u_query = parse_url($url, PHP_URL_QUERY); + if($u_query !== null){ + + parse_str($u_query, $u_query); + if(isset($u_query["ref_src"])){ unset($u_query["ref_src"]); } + + $u_query = http_build_query($u_query); + + $url = + str_replace( + $oldquery, + $u_query, + $url + ); + } + } + + elseif( + preg_match( + '/maps\.google\.[^.]+/', + $domain + ) + ){ + + if(stripos($url, "maps?") !== false){ + + $u_query = parse_url($url, PHP_URL_QUERY); + + if($u_query !== null){ + + parse_str($u_query, $u_query); + + if(isset($u_query["daddr"])){ + + $url = + "https://maps.google.com/maps?daddr=" . 
+ urlencode($u_query["daddr"]); + } + } + } + } + + if($return_size){ + + return [ + "url" => $url, + "ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null, + "thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null, + "thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null, + "image_width" => isset($query["w"]) ? (int)$query["w"] : null, + "image_height" => isset($query["h"]) ? (int)$query["h"] : null + ]; + } + + return $url; + } + + private function replacedomain($url, $domain){ + + return + preg_replace( + '/(https?:\/\/)([^\/]+)/', + '$1' . $domain, + $url + ); + } + + private function titledots($title){ + + return trim($title, " .\t\n\r\0\x0B…"); + } + + private function hms2int($time){ + + $parts = explode(":", $time, 3); + $time = 0; + + if(count($parts) === 3){ + + // hours + $time = $time + ((int)$parts[0] * 3600); + array_shift($parts); + } + + if(count($parts) === 2){ + + // minutes + $time = $time + ((int)$parts[0] * 60); + array_shift($parts); + } + + // seconds + $time = $time + (int)$parts[0]; + + return $time; + } + + private function detect_sorry(){ + + $captcha_form = + $this->fuckhtml + ->getElementById( + "captcha-form", + "form" + ); + + if($captcha_form !== false){ + + throw new Exception("Google returned a captcha"); + } + } +} diff --git a/scraper/google_cse.php b/scraper/google_cse.php new file mode 100644 index 0000000..02ab462 --- /dev/null +++ b/scraper/google_cse.php @@ -0,0 +1,1054 @@ +<?php + +class google_cse{ + + public const req_html = 0; + public const req_js = 1; + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("google_cse"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + $base = [ + "country" => [ // gl=<country> (image: cr=countryAF) + "display" => "Country", + "option" => [ + "any" => "Any country", + "af" => "Afghanistan", + "al" => "Albania", + "dz" => "Algeria", + "as" => 
"American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Cote D'ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czech Republic", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => 
"Guam", + "gt" => "Guatemala", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran, Islamic Republic", + "iq" => "Iraq", + "ie" => "Ireland", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", + "mc" => "Monaco", + "mn" => "Mongolia", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" 
=> "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Reunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "cs" => "Serbia and Montenegro", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "gs" => "South Georgia and the South Sandwich Islands", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "uk" => "United Kingdom", + "us" => "United States", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // safe=active + "no" => "No" // 
safe=off + ] + ], + "spellcheck" => [ + // display undefined + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ]; + + switch($page){ + + case "web": + return array_merge( + $base, + [ + "lang" => [ // lr=<lang> (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ], + "sort" => [ + "display" => "Sort by", + "option" => [ + "relevance" => "Relevance", + "date" => "Date" + ] + ], + "redundant" => [ + "display" => "Remove redundant", + "option" => [ + "yes" => "Yes", + "no" => "No", + ] + ] + ] + ); + break; + + case "images": + return array_merge( + $base, + [ + "size" => [ // imgsz + "display" => "Size", + "option" => [ + "any" => "Any size", + "l" => "Large", + "m" => "Medium", + "i" => "Icon", + "qsvga" => "Larger than 400x300", + "vga" => "Larger than 640x480", + "svga" => "Larger than 800x600", + "xga" => "Larger than 1024x768", + "2mp" => "Larger than 2MP", + "4mp" => "Larger than 4MP", + "6mp" => "Larger than 6MP", + "8mp" => "Larger than 8MP", + "10mp" => "Larger than 10MP", + "12mp" => "Larger than 12MP", + "15mp" => "Larger than 15MP", + "20mp" => "Larger than 20MP", + "40mp" => "Larger than 40MP", + "70mp" => "Larger than 70MP" + ] + ], 
+ "color" => [ // imgc + "display" => "Color", + "option" => [ + "any" => "Any color", + "color" => "Full color", + "bnw" => "Black & white", + "trans" => "Transparent", + // from here, imgcolor + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "green" => "Green", + "teal" => "Teal", + "blue" => "Blue", + "purple" => "Purple", + "pink" => "Pink", + "white" => "White", + "gray" => "Gray", + "black" => "Black", + "brown" => "Brown" + ] + ], + "format" => [ // as_filetype + "display" => "Format", + "option" => [ + "any" => "Any format", + "jpg" => "JPG", + "gif" => "GIF", + "png" => "PNG", + "bmp" => "BMP", + "svg" => "SVG", + "webp" => "WEBP", + "ico" => "ICO", + "craw" => "RAW" + ] + ] + ] + ); + break; + } + } + + private function get($proxy, $url, $get = [], $reqtype = self::req_js){ + + $curlproc = curl_init(); + + if($get !== []){ + + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + // http2 bypass + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($reqtype === self::req_js){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Sec-GPC: 1", + "Alt-Used: cse.google.com", + "Connection: keep-alive", + "Referer: https://cse.google.com/cse?cx=" . config::GOOGLE_CX_ENDPOINT, + "Sec-Fetch-Dest: script", + "Sec-Fetch-Mode: no-cors", + "Sec-Fetch-Site: same-origin", + "TE: trailers"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . 
config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=0, i"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + // page 1 + // https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc&cselibv=8fa85d58e016b414&cx=d4e68b99b876541f0&q=asmr&safe=active&cse_tok=AB-tC_6RPUTmB4XK0lE9e1AFFC5r%3A1729563832926&lr=&cr=&gl=&filter=0&sort=&as_oq=&as_sitesearch=&exp=cc%2Capo&oq=asmr&gs_l=partner-web.3..0i512i433j0i512i433i131l2j0i512i433j0i512i433i131j0i512i433j0i512i433i131l2j0i512l2.10902.266627.5.267157.11.10.0.0.0.0.188.1108.2j7.9.0.csems%2Cnrl%3D10...0....1.34.partner-web..42.14.1500.WJQvMvfXkx4&cseclient=hosted-page-client&callback=google.search.cse.api8223&rurl=https%3A%2F%2Fcse.google.com%2Fcse%3Fcx%3Dd4e68b99b876541f0%23gsc.tab%3D0%26gsc.q%3Dtest%26gsc.sort%3D + + // page 2 + // 
https://cse.google.com/cse/element/v1?rsz=filtered_cse&num=10&hl=en&source=gcsc&start=10&cselibv=8fa85d58e016b414&cx=d4e68b99b876541f0&q=asmr&safe=active&cse_tok=AB-tC_6RPUTmB4XK0lE9e1AFFC5r%3A1729563832926&lr=&cr=&gl=&filter=0&sort=&as_oq=&as_sitesearch=&exp=cc%2Capo&callback=google.search.cse.api3595&rurl=https%3A%2F%2Fcse.google.com%2Fcse%3Fcx%3Dd4e68b99b876541f0%23gsc.tab%3D0%26gsc.q%3Dtest%26gsc.sort%3D + + if($get["npt"]){ + + [$req_params, $proxy] = + $this->backend->get( + $get["npt"], + "web" + ); + + $req_params = + json_decode( + $req_params, + true + ); + + $json = + $this->get( + $proxy, + "https://cse.google.com/cse/element/v1", + $req_params, + self::req_js + ); + + }else{ + + $proxy = $this->backend->get_ip(); + $params = $this->generate_token($proxy); + + //$json = file_get_contents("scraper/google_cse.txt"); + $req_params = [ + "rsz" => "filtered_cse", + "num" => 20, + "hl" => "en", + "source" => "gcsc", + "cselibv" => $params["lib"], + "cx" => config::GOOGLE_CX_ENDPOINT, + "q" => $get["s"], + "safe" => $get["nsfw"] == "yes" ? "off" : "active", + "cse_tok" => $params["token"], + "lr" => $get["lang"] == "any" ? "" : "lang_" . $get["lang"], + "cr" => $get["country"] == "any" ? "" : "country" . strtoupper($get["country"]), + "gl" => "", + "filter" => $get["redundant"] == "yes" ? "1" : "0", + "sort" => $get["sort"] == "relevance" ? "" : "date", + "as_oq" => "", + "as_sitesearch" => "", + "exp" => "cc,apo", + "oq" => $get["s"], + "gs_l" => "partner-web.3...33294.34225.3.34597.26.11.0.0.0.0.201.1132.6j4j1.11.0.csems,nrl=10...0....1.34.partner-web..34.19.1897.FKEeG5yh2iw", + "cseclient" => "hosted-page-client", + "callback" => "google.search.cse.api" . random_int(4000, 99999), + "rurl" => "https://cse.google.com/cse?cx=" . config::GOOGLE_CX_ENDPOINT . "#gsc.tab=0&gsc.q=" . $get["s"] . 
"&gsc.sort=" + ]; + + if($get["spellcheck"] == "no"){ + + $req_params["nfpr"] = "1"; + } + + $json = + $this->get( + $proxy, + "https://cse.google.com/cse/element/v1", + $req_params, + self::req_js + ); + + unset($req_params["gs_l"]); + $req_params["start"] = 0; + } + + $req_params["start"] += 20; + + if( + !preg_match( + '/google\.search\.cse\.[A-Za-z0-9]+\(([\S\s]*)\);/i', + $json, + $json + ) + ){ + + throw new Exception("Failed to grep JSON"); + } + + $json = json_decode($json[1], true); + + if(isset($json["error"])){ + + if(isset($json["error"]["errors"][0]["message"])){ + + throw new Exception("Google returned an error: " . $json["error"]["errors"][0]["message"]); + } + + if(isset($json["error"]["message"])){ + + throw new Exception("Google returned an error: " . $json["error"]["message"]); + } + + throw new Exception("Google returned an error object"); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // detect word correction + if(isset($json["spelling"]["type"])){ + + switch($json["spelling"]["type"]){ + + case "DYM": // did you mean? 
@TODO fix wording + $type = "including"; + break; + + case "SPELL_CORRECTED_RESULTS": // not many results for + $type = "not_many"; + break; + + default: + $type = "not_many"; + } + + if(isset($json["spelling"]["originalQuery"])){ + + $using = $json["spelling"]["originalQuery"]; + } + elseif(isset($json["spelling"]["anchor"])){ + + $using = html_entity_decode(strip_tags($json["spelling"]["anchor"])); + }elseif(isset($json["spelling"]["originalAnchor"])){ + + $using = html_entity_decode(strip_tags($json["spelling"]["originalAnchor"])); + } + + $out["spelling"] = [ + "type" => $type, + "using" => $using, + "correction" => $json["spelling"]["correctedQuery"] + ]; + } + + if(!isset($json["results"])){ + + return $out; + } + + foreach($json["results"] as $result){ + + // get date from description + $description = + explode( + "...", + trim($result["contentNoFormatting"], " ."), + 2 + ); + + if(count($description) === 2){ + + if($date = strtotime($description[0])){ + + $description = ltrim($description[1]); + }else{ + + $date = null; + $description = implode("...", $description); + } + }else{ + + $description = implode("...", $description); + $date = null; + } + + $description = trim($description, " ."); + + // get thumbnails + if(isset($result["richSnippet"]["cseThumbnail"]["src"])){ + + $thumb = [ + "url" => $this->unshit_thumb($result["richSnippet"]["cseThumbnail"]["src"]), + "ratio" => "1:1" + ]; + } + elseif(isset($result["richSnippet"]["cseImage"]["src"])){ + + $thumb = [ + "url" => $result["richSnippet"]["cseImage"]["src"], + "ratio" => "1:1" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + if($thumb["url"] !== null){ + + $found_size = false; + + // find correct ratio + + if( + isset($result["richSnippet"]["cseThumbnail"]["width"]) && + isset($result["richSnippet"]["cseThumbnail"]["height"]) + ){ + $found_size = true; + $width = (int)$result["richSnippet"]["cseThumbnail"]["width"]; + $height = 
(int)$result["richSnippet"]["cseThumbnail"]["height"]; + } + elseif( + isset($result["richSnippet"]["metatags"]["ogImageWidth"]) && + isset($result["richSnippet"]["metatags"]["ogImageHeight"]) + ){ + $found_size = true; + $width = (int)$result["richSnippet"]["metatags"]["ogImageWidth"]; + $height = (int)$result["richSnippet"]["metatags"]["ogImageHeight"]; + } + + // calculate rounded ratio + if($found_size){ + + $aspect_ratio = $width / $height; + + if($aspect_ratio >= 1.5){ + + $thumb["ratio"] = "16:9"; + } + elseif($aspect_ratio >= 0.8){ + + $thumb["ratio"] = "1:1"; + }else{ + + $thumb["ratio"] = "9:16"; + } + } + } + + $out["web"][] = [ + "title" => rtrim($result["titleNoFormatting"], " ."), + "description" => $description, + "url" => $result["unescapedUrl"], + "date" => $date, + "type" => "web", + "thumb" => $thumb, + "sublink" => [], + "table" => [] + ]; + } + + // detect next page + if( + isset($json["cursor"]["isExactTotalResults"]) || // detects last page + !isset($json["cursor"]["pages"]) // detects no results on page + ){ + + return $out; + } + + // get next page + $out["npt"] = + $this->backend->store( + json_encode( + $req_params + ), + "web", + $proxy + ); + + return $out; + } + + public function image($get){ + + if($get["npt"]){ + + [$req_params, $proxy] = + $this->backend->get( + $get["npt"], + "images" + ); + + $req_params = + json_decode( + $req_params, + true + ); + + $json = + $this->get( + $proxy, + "https://cse.google.com/cse/element/v1", + $req_params, + self::req_js + ); + + }else{ + + $proxy = $this->backend->get_ip(); + $params = $this->generate_token($proxy); + + //$json = file_get_contents("scraper/google_cse.txt"); + $req_params = [ + "rsz" => "filtered_cse", + "num" => 20, + "hl" => "en", + "source" => "gcsc", + "cselibv" => $params["lib"], + "searchtype" => "image", + "cx" => config::GOOGLE_CX_ENDPOINT, + "q" => $get["s"], + "safe" => $get["nsfw"] == "yes" ? 
"off" : "active", + "cse_tok" => $params["token"], + "exp" => "cc,apo", + "cseclient" => "hosted-page-client", + "callback" => "google.search.cse.api" . random_int(4000, 99999), + "rurl" => "https://cse.google.com/cse?cx=" . config::GOOGLE_CX_ENDPOINT . "#gsc.tab=1&gsc.q=" . $get["s"] . "&gsc.sort=" + ]; + + // add additional hidden filters + + // country (image search uses cr instead of gl) + if($get["country"] != "any"){ + + $req_params["cr"] = "country" . strtoupper($get["country"]); + } + + // nsfw + $req_params["safe"] = $get["nsfw"] == "yes" ? "off" : "active"; + + // size + if($get["size"] != "any"){ + + $req_params["imgsz"] = $get["size"]; + } + + // format + if($get["format"] != "any"){ + + $req_params["as_filetype"] = $get["format"]; + } + + // color + if($get["color"] != "any"){ + + if( + $get["color"] == "color" || + $get["color"] == "trans" + ){ + + $req_params["imgc"] = $get["color"]; + }elseif($get["color"] == "bnw"){ + + $req_params["imgc"] = "gray"; + }else{ + + $req_params["imgcolor"] = $get["color"]; + } + } + + $json = + $this->get( + $proxy, + "https://cse.google.com/cse/element/v1", + $req_params, + self::req_js + ); + + $req_params["start"] = 0; + } + + $req_params["start"] += 20; + + if( + !preg_match( + '/google\.search\.cse\.[A-Za-z0-9]+\(([\S\s]*)\);/i', + $json, + $json + ) + ){ + + throw new Exception("Failed to grep JSON"); + } + + $json = json_decode($json[1], true); + + if(isset($json["error"])){ + + if(isset($json["error"]["errors"][0]["message"])){ + + throw new Exception("Google returned an error: " . $json["error"]["errors"][0]["message"]); + } + + if(isset($json["error"]["message"])){ + + throw new Exception("Google returned an error: " . 
$json["error"]["message"]); + } + + throw new Exception("Google returned an error object"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + // detect next page + if( + isset($json["cursor"]["isExactTotalResults"]) || // detects last page + !isset($json["cursor"]["pages"]) // detects no results on page + ){ + + return $out; + } + + foreach($json["results"] as $result){ + + $out["image"][] = [ + "title" => rtrim($result["titleNoFormatting"], " ."), + "source" => [ + [ + "url" => $result["unescapedUrl"], + "width" => (int)$result["width"], + "height" => (int)$result["height"] + ], + [ + "url" => $result["tbLargeUrl"], + "width" => (int)$result["tbLargeWidth"], + "height" => (int)$result["tbLargeHeight"] + ] + ], + "url" => $result["originalContextUrl"] + ]; + } + + // get next page + $out["npt"] = + $this->backend->store( + json_encode( + $req_params + ), + "images", + $proxy + ); + + return $out; + } + + private function generate_token($proxy){ + + $html = + $this->get( + $proxy, + "https://cse.google.com/cse", + [ + "cx" => config::GOOGLE_CX_ENDPOINT + ], + self::req_html + ); + + // detect captcha + $this->fuckhtml->load($html); + + $title = + $this->fuckhtml + ->getElementsByTagName( + "title" + ); + + if( + count($title) !== 0 && + $title[0]["innerHTML"] == "302 Moved" + ){ + + throw new Exception("Google returned a captcha"); + } + + // get token + preg_match( + '/relativeUrl=\'([^\']+)\';/i', + $html, + $js_uri + ); + + if(!isset($js_uri[1])){ + + throw new Exception("Failed to grep search token"); + } + + $js_uri = + $this->fuckhtml + ->parseJsString( + $js_uri[1] + ); + + // get parameters + $js = + $this->get( + $proxy, + "https://cse.google.com" . 
/**
 * Send one HTTP request to greppr and return the response headers and body.
 *
 * @param mixed        $proxy  Proxy descriptor forwarded to backend::assign_proxy().
 * @param string       $url    Target URL.
 * @param array        $get    Parameters: appended as the query string for GET
 *                             requests, sent as the urlencoded body for POST requests.
 * @param string|false $cookie PHPSESSID value to send. For GET requests, false
 *                             selects the cookie-less header set.
 * @param mixed        $post   Strictly false sends a GET; any other value sends a POST.
 *
 * @return array ["headers" => array (lowercased names, last occurrence wins),
 *                "data" => response body as returned by curl_exec()]
 *
 * @throws Exception with curl's error message when the transfer fails.
 */
private function get($proxy, $url, $get = [], $cookie = false, $post = false){

	$curlproc = curl_init();

	if($post === false){

		// the query string has to be part of $url *before* CURLOPT_URL is set,
		// otherwise the GET parameters never reach the server
		if($get !== []){

			$url .= "?" . http_build_query($get);
		}

		if($cookie === false){

			$request_headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"
			];
		}else{

			$request_headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip, deflate, br, zstd",
				"DNT: 1",
				"Sec-GPC: 1",
				"Connection: keep-alive",
				"Referer: https://greppr.org/search",
				"Cookie: PHPSESSID=$cookie",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: same-origin",
				"Sec-Fetch-User: ?1",
				"Priority: u=0, i"
			];
		}
	}else{

		// POST: parameters travel in the request body
		$get = http_build_query($get);

		curl_setopt($curlproc, CURLOPT_POST, true);
		curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);

		$request_headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip, deflate, br, zstd",
			"Content-Type: application/x-www-form-urlencoded",
			"Content-Length: " . strlen($get),
			"Origin: https://greppr.org",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Referer: https://greppr.org/",
			"Cookie: PHPSESSID=$cookie",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: same-origin",
			"Sec-Fetch-User: ?1",
			"Priority: u=0, i"
		];
	}

	curl_setopt($curlproc, CURLOPT_URL, $url);

	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $request_headers);

	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

	$this->backend->assign_proxy($curlproc, $proxy);

	// collect response headers, keyed by lowercased header name
	$headers = [];

	curl_setopt(
		$curlproc,
		CURLOPT_HEADERFUNCTION,
		function($curlproc, $header) use (&$headers){

			$len = strlen($header);
			$header = explode(':', $header, 2);

			if(count($header) < 2){

				// ignore invalid headers (status line, blank separator)
				return $len;
			}

			$headers[strtolower(trim($header[0]))] = trim($header[1]);

			return $len;
		}
	);

	$data = curl_exec($curlproc);

	if(curl_errno($curlproc)){

		// read the message before releasing the handle
		$error = curl_error($curlproc);
		curl_close($curlproc);

		throw new Exception($error);
	}

	curl_close($curlproc);

	return [
		"headers" => $headers,
		"data" => $data
	];
}
$tokens["get"], + [], + $tokens["cookie"], + false + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + // + // get token + // + try{ + + $html = + $this->get( + $proxy, + "https://greppr.org", + [], + false, + false + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search tokens"); + } + + // + // Parse token + // + $this->fuckhtml->load($html["data"]); + + $tokens = []; + + $inputs = + $this->fuckhtml + ->getElementsByTagName( + "input" + ); + + foreach($inputs as $input){ + + if(!isset($input["attributes"]["name"])){ + + continue; + } + + switch($input["attributes"]["name"]){ + + case "var1": + case "var2": + case "n": + $tokens[$input["attributes"]["name"]] = + $this->fuckhtml + ->getTextContent( + $input["attributes"]["value"] + ); + break; + + default: + $tokens["req"] = + $this->fuckhtml + ->getTextContent( + $input["attributes"]["name"] + ); + break; + } + } + + // get cookie + preg_match( + '/PHPSESSID=([^;]+)/', + $html["headers"]["set-cookie"], + $cookie + ); + + if(!isset($cookie[1])){ + + // server sent an unexpected cookie + throw new Exception("Got malformed cookie"); + } + + $tokens["cookie"] = $cookie[1]; + + if($tokens === false){ + + throw new Exception("Failed to grep search tokens"); + } + + // + // Get initial search page + // + try{ + + $html = $this->get( + $proxy, + "https://greppr.org/search", + [ + "var1" => $tokens["var1"], + "var2" => $tokens["var2"], + $tokens["req"] => $search, + "n" => $tokens["n"] + ], + $tokens["cookie"], + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + } + + //$html = file_get_contents("scraper/greppr.html"); + //$this->fuckhtml->load($html); + $this->fuckhtml->load($html["data"]); + + $out = [ + "status" => "ok", + "spelling" => [ + "type" 
/**
 * Shorten a text to its first word-wrapped line.
 *
 * The text is wrapped at 300 characters on word boundaries and only the
 * part before the first line break is returned.
 *
 * @param string $text Text to shorten.
 * @return string First line of the wrapped text.
 */
private function limitstrlen($text){

	$wrapped = wordwrap($text, 300, "\n");

	// only the leading segment is needed, so stop splitting after one cut
	[$first_line] = explode("\n", $wrapped, 2);

	return $first_line;
}
/**
 * Search imgur for images and return one page of results.
 *
 * @param array $get Request parameters. Reads "npt" (next page token) and,
 *                   on a first-page request, "s", "sort", "time", "format"
 *                   and "size".
 *
 * @return array ["status" => "ok", "npt" => next page token or null,
 *                "image" => list of results]
 *
 * @throws Exception on an empty search term, a failed request, or when
 *                   imgur answers with JSON instead of the HTML fragment.
 */
public function image($get){

	if($get["npt"]){

		// resume a previous search: the stored token carries the imgur
		// query parameters plus our own bookkeeping keys
		[$filter, $proxy] =
			$this->backend->get(
				$get["npt"],
				"images"
			);

		$filter = json_decode($filter, true);

		// pull our bookkeeping keys out so only imgur parameters remain
		$search = $filter["s"];
		unset($filter["s"]);

		$sort = $filter["sort"];
		unset($filter["sort"]);

		$time = $filter["time"];
		unset($filter["time"]);

		$format = $filter["format"];
		unset($filter["format"]);

		$size = $filter["size"];
		unset($filter["size"]);

		$page = $filter["page"];
		unset($filter["page"]);
	}else{

		$search = $get["s"];
		if(strlen($search) === 0){

			throw new Exception("Search term is empty!");
		}

		$proxy = $this->backend->get_ip();
		$sort = $get["sort"];
		$time = $get["time"];
		$format = $get["format"];
		$size = $get["size"];
		$page = 0;

		$filter = [
			"q" => $search
		];

		if($format != "any"){

			$filter["q_type"] = $format;
		}

		if($size != "any"){

			$filter["q_size_px"] = $size;
			$filter["q_size_is_mpx"] = "off";
		}
	}

	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];

	try{
		$html =
			$this->get(
				$proxy,
				"https://imgur.com/search/$sort/$time/page/$page",
				$filter
			);

	}catch(Exception $error){

		throw new Exception("Failed to fetch HTML");
	}

	// a successful response is HTML; anything that decodes as JSON is
	// treated as an error payload
	$json = json_decode($html, true);

	if($json){

		// {"data":{"error":"Imgur is temporarily over capacity. Please try again later."},"success":false,"status":403}

		if(isset($json["data"]["error"])){

			// stripos() returns 0 for a match at the very start of the
			// string, so compare against false explicitly
			if(stripos($json["data"]["error"], "capacity") !== false){

				throw new Exception("Imgur IP blocked this 4get instance or request proxy. Try again");
			}
		}

		throw new Exception("Imgur returned an unknown error (IP ban?)");
	}

	$this->fuckhtml->load($html);

	$posts =
		$this->fuckhtml
		->getElementsByClassName(
			"post",
			"div"
		);

	foreach($posts as $post){

		$this->fuckhtml->load($post);

		$image =
			$this->fuckhtml
			->getElementsByTagName("img");

		$link =
			$this->fuckhtml
			->getElementsByClassName(
				"image-list-link",
				"a"
			);

		// skip posts that lack the thumbnail or the link we need
		if(
			count($image) === 0 ||
			count($link) === 0 ||
			!isset($image[0]["attributes"]["src"]) ||
			!isset($link[0]["attributes"]["href"])
		){

			continue;
		}

		$image = $image[0];

		// drop the trailing 5 characters of the thumbnail src; the
		// extensions ".jpg" / "m.jpg" are appended again below
		$image_url = "https:" . substr($this->fuckhtml->getTextContent($image["attributes"]["src"]), 0, -5);

		$out["image"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["attributes"]["alt"]
				),
			"source" => [
				[
					"url" => $image_url . ".jpg",
					"width" => null,
					"height" => null
				],
				[
					"url" => $image_url . "m.jpg",
					"width" => null,
					"height" => null
				]
			],
			"url" =>
				"https://imgur.com" .
				$this->fuckhtml
				->getTextContent(
					$link[0]["attributes"]["href"]
				)
		];
	}

	if(isset($out["image"][0])){

		// store nextpage: only offered when this page produced results
		$filter["s"] = $search;
		$filter["sort"] = $sort;
		$filter["time"] = $time;
		$filter["format"] = $format;
		$filter["size"] = $size;
		$filter["page"] = $page + 1;

		$out["npt"] =
			$this->backend->store(
				json_encode($filter),
				"images",
				$proxy
			);
	}

	return $out;
}
"require" => "Require JS" + ] + ], + "trackers" => [ + "display" => "Trackers", + "option" => [ + "any" => "Allow trackers", + "deny" => "Deny trackers", + "require" => "Require trackers" + ] + ], + "cookies" => [ + "display" => "Cookies", + "option" => [ + "any" => "Allow cookies", + "deny" => "Deny cookies", + "require" => "Require cookies" + ] + ], + "affiliate" => [ + "display" => "Affiliate links in body", + "option" => [ + "any" => "Allow affiliate links", + "deny" => "Deny affiliate links", + "require" => "Require affiliate links" + ] + ] + ] + ); + } + + private function get($proxy, $url, $get = [], $get_cookies = 1){ + + $curlproc = curl_init(); + + switch($get_cookies){ + + case 0: + $cookies = ""; + $cookies_tmp = []; + curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ + + $length = strlen($header); + + $header = explode(":", $header, 2); + + if(trim(strtolower($header[0])) == "set-cookie"){ + + $cookie_tmp = explode("=", trim($header[1]), 2); + + $cookies_tmp[trim($cookie_tmp[0])] = + explode(";", $cookie_tmp[1], 2)[0]; + } + + return $length; + }); + break; + + case 1: + $cookies = ""; + break; + + default: + $cookies = "Cookie: " . $get_cookies; + } + + $headers = [ + "User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + $cookies, + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1" + ]; + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . 
/**
 * Search Marginalia and return one page of web results.
 *
 * Uses the JSON API when config::MARGINALIA_API_KEY is set, otherwise
 * scrapes the old-search HTML frontend (which also supports pagination).
 *
 * @param array $get Request parameters. Reads "s", "npt", "format", "file",
 *                   "javascript", "trackers", "cookies", "affiliate" and,
 *                   on the HTML path, "adtech", "recent" and "intitle".
 *
 * @return array Result set with "web" entries and an optional "npt" token.
 *
 * @throws Exception on an empty search term, a failed request, a rate
 *                   limited API key or an undecodable API response.
 */
public function web($get){

	$search = [$get["s"]];
	if(strlen($get["s"]) === 0){

		throw new Exception("Search term is empty!");
	}

	$format = $get["format"];
	$file = $get["file"];

	// translate the allow/deny/require filters into query operators
	foreach(
		[
			"javascript" => $get["javascript"],
			"trackers" => $get["trackers"],
			"cookies" => $get["cookies"],
			"affiliate" => $get["affiliate"]
		]
		as $key => $value
	){

		if($value == "any"){ continue; }

		switch($key){

			case "javascript": $str = "js:true"; break;
			case "trackers": $str = "special:tracking"; break;
			case "cookies": $str = "special:cookies"; break;
			case "affiliate": $str = "special:affiliate"; break;
		}

		if($value == "deny"){
			$str = "-" . $str;
		}

		$search[] = $str;
	}

	if($format != "any"){

		$search[] = "format:$format";
	}

	switch($file){

		case "any": break;
		case "nomedia": $search[] = "-special:media"; break;
		case "media": $search[] = "special:media"; break;

		default:
			$search[] = "file:$file";
	}

	$search = implode(" ", $search);

	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];

	// API scraper
	if(config::MARGINALIA_API_KEY !== null){

		try{
			$json =
				$this->get(
					$this->backend->get_ip(), // no nextpage
					"https://api.marginalia-search.com/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
					[
						"count" => 20
					]
				);
		}catch(Exception $error){

			throw new Exception("Failed to get JSON");
		}

		if($json == "Slow down"){

			throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
		}

		$json = json_decode($json, true);

		// json_decode() yields null on malformed input; fail loudly
		// instead of iterating over a non-array
		if(!is_array($json)){

			throw new Exception("Failed to decode JSON");
		}

		foreach(($json["results"] ?? []) as $result){

			$out["web"][] = [
				"title" => $result["title"],
				"description" => str_replace("\n", " ", $result["description"]),
				"url" => $result["url"],
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}

		return $out;
	}

	// HTML parser
	$proxy = $this->backend->get_ip();

	//
	// Bypass anubis check
	//
	/*
	if(($anubis_key = apcu_fetch("marginalia_cookie")) === false){

		try{
			$html =
				$this->get(
					$proxy,
					"https://old-search.marginalia.nu/search",
					[
						"query" => $search
					]
				);

		}catch(Exception $error){

			throw new Exception("Failed to get anubis challenge");
		}

		try{

			$anubis_data = $this->anubis->scrape($html);
		}catch(Exception $error){

			throw new Exception($error);
		}

		// send anubis response & get cookies
		// https://old-search.marginalia.nu/.within.website/x/cmd/anubis/api/pass-challenge?response=0000018966b086834f738bacba6031028adb5aa875974ead197a8b75778baf3a&nonce=39947&redir=https%3A%2F%2Fold-search.marginalia.nu%2F&elapsedTime=1164

		try{

			$anubis_key =
				$this->get(
					$proxy,
					"https://old-search.marginalia.nu/.within.website/x/cmd/anubis/api/pass-challenge",
					[
						"response" => $anubis_data["response"],
						"nonce" => $anubis_data["nonce"],
						"redir" => "https://old-search.marginalia.nu/",
						"elapsedTime" => random_int(1000, 2000)
					],
					0
				);
		}catch(Exception $error){

			throw new Exception("Failed to submit anubis challenge");
		}

		apcu_store("marginalia_cookie", $anubis_key);
	}*/

	if($get["npt"]){

		// the stored token is the raw query string of the next page link
		[$params, $proxy] =
			$this->backend->get(
				$get["npt"],
				"web"
			);

		try{
			$html =
				$this->get(
					$proxy,
					"https://old-search.marginalia.nu/search?" . $params,
					[],
					//$anubis_key
				);
		}catch(Exception $error){

			throw new Exception("Failed to get HTML");
		}

	}else{
		$params = [
			"query" => $search
		];

		// yes/no filter name => [request parameter, value sent when enabled]
		$toggles = [
			"adtech" => ["adtech", "reduce"],
			"recent" => ["recent", "recent"],
			"intitle" => ["searchTitle", "title"]
		];

		foreach($toggles as $filter_name => [$param_name, $param_value]){

			if($get[$filter_name] == "yes"){

				$params[$param_name] = $param_value;
			}
		}

		try{
			$html =
				$this->get(
					$proxy,
					"https://old-search.marginalia.nu/search",
					$params,
					//$anubis_key
				);
		}catch(Exception $error){

			throw new Exception("Failed to get HTML");
		}
	}

	$this->fuckhtml->load($html);

	$sections =
		$this->fuckhtml
		->getElementsByClassName(
			"card search-result",
			"section"
		);

	foreach($sections as $section){

		$this->fuckhtml->load($section);

		$title =
			$this->fuckhtml
			->getElementsByClassName(
				"title",
				"a"
			);

		if(count($title) === 0){

			// result card without a title link, nothing to report
			continue;
		}

		$title = $title[0];

		$description =
			$this->fuckhtml
			->getElementsByClassName(
				"description",
				"p"
			);

		if(count($description) !== 0){

			$description =
				$this->fuckhtml
				->getTextContent(
					$description[0]
				);
		}else{

			$description = null;
		}

		$sublinks = [];
		$sublink_html =
			$this->fuckhtml
			->getElementsByClassName("additional-results");

		if(count($sublink_html) !== 0){

			$this->fuckhtml->load($sublink_html[0]);

			$links =
				$this->fuckhtml
				->getElementsByTagName("a");

			foreach($links as $link){

				$sublinks[] = [
					"title" =>
						$this->fuckhtml
						->getTextContent(
							$link
						),
					"date" => null,
					"description" => null,
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$link["attributes"]["href"]
						)
				];
			}
		}

		$out["web"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$title
				),
			"description" => $description,
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$title["attributes"]["href"]
				),
			"date" => null,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => $sublinks,
			"table" => []
		];
	}

	// get next page
	$this->fuckhtml->load($html);

	$pagination =
		$this->fuckhtml
		->getElementsByAttributeValue(
			"aria-label",
			"pagination",
			"nav"
		);

	if(count($pagination) === 0){

		// no pagination
		return $out;
	}

	$this->fuckhtml->load($pagination[0]);

	$pages =
		$this->fuckhtml
		->getElementsByClassName(
			"page-link",
			"a"
		);

	$found_current_page = false;

	foreach($pages as $page){

		if(
			stripos(
				$page["attributes"]["class"],
				"active"
			) !== false
		){

			$found_current_page = true;
			continue;
		}

		if($found_current_page){

			// we found current page index, and we iterated over
			// the next page <a>

			$out["npt"] =
				$this->backend->store(
					parse_url(
						$page["attributes"]["href"],
						PHP_URL_QUERY
					),
					"web",
					$proxy
				);
			break;
		}
	}

	return $out;
}
"gl" => "Galician", + "de" => "German", + "ht" => "Haitian", + "io" => "Ido", + "id" => "Indonesian", + "ia" => "Interlingua", + "ie" => "Interlingue", + "ga" => "Irish", + "it" => "Italian", + "rw" => "Kinyarwanda", + "la" => "Latin", + "li" => "Limburgish", + "lb" => "Luxembourgish", + "no" => "Norwegian", + "nb" => "Norwegian Bokmål", + "nn" => "Norwegian Nynorsk", + "oc" => "Occitan (post 1500)", + "pl" => "Polish", + "pt" => "Portuguese", + "rm" => "Romansh", + "rn" => "Rundi", + "sg" => "Sango", + "so" => "Somali", + "es" => "Spanish", + "sw" => "Swahili", + "ss" => "Swati", + "sv" => "Swedish", + "ty" => "Tahitian", + "to" => "Tonga (Tonga Islands)", + "ts" => "Tsonga", + "vo" => "Volapük", + "wa" => "Walloon", + "cy" => "Welsh", + "xh" => "Xhosa", + "zu" => "Zulu" + ] + ], + "country" => [ + "display" => "Country", + "option" => [ + "any" => "No location bias", + "af" => "Afghanistan", + "ax" => "Åland Islands", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia (Plurinational State of)", + "bq" => "Bonaire, Sint Eustatius and Saba", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "cv" => "Cabo Verde", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + 
"cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo (Democratic Republic of the)", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Côte d'Ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cw" => "Curaçao", + "cy" => "Cyprus", + "cz" => "Czechia", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gg" => "Guernsey", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and McDonald Islands", + "va" => "Holy See", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran (Islamic Republic of)", + "iq" => "Iraq", + "ie" => "Ireland", + "im" => "Isle of Man", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "je" => "Jersey", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea (Democratic People's Republic of)", + "kr" => "Korea (Republic of)", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libya", + "li" => 
"Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia (the former Yugoslav Republic of)", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia (Federated States of)", + "md" => "Moldova (Republic of)", + "mc" => "Monaco", + "mn" => "Mongolia", + "me" => "Montenegro", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestine, State of", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Réunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "bl" => "Saint Barthélemy", + "sh" => "Saint Helena, Ascension and Tristan da Cunha", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "mf" => "Saint Martin (French part)", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "rs" => "Serbia", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sx" => "Sint Maarten (Dutch part)", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South 
Africa", + "gs" => "South Georgia and South Sandwich Islands", + "ss" => "South Sudan", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic of", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "gb" => "United Kingdom", + "us" => "United States of America", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela (Bolivarian Republic of)", + "vn" => "Viet Nam", + "vg" => "Virgin Islands (British)", + "vi" => "Virgin Islands (U.S.)", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "region" => [ + "display" => "Region", + "option" => [ + "any" => "Any region", + "eu" => "European Union", + "de" => "Germany", + "fr" => "France", + "uk" => "United Kingdom" + ] + ], + "domain" => [ + "display" => "Results per domain", + "option" => [ + "1" => "1 result", + "2" => "2 results", + "3" => "3 results", + "4" => "4 results", + "5" => "5 results", + "10" => "10 results", + "0" => "Unlimited", + ] + ] + ]; + break; + + case "news": + return []; + } + } + + private function get($proxy, $url, $get = []){ + + $headers = [ + "User-Agent: " . 
/**
 * Fetch one page of Mojeek web results and convert it into the scraper's
 * result array.
 *
 * Reads these keys from $get:
 *   - "npt": next-page token; when truthy, the stored relative URL and proxy
 *     are fetched from the backend and every other key is ignored
 *   - "s": search term (must not be empty)
 *   - "lang", "country", "region", "domain", "focus": filter values
 *
 * @param array $get Request parameters as described above.
 * @return array Result array with the keys "status", "spelling", "npt",
 *               "answer", "web", "image", "video", "news" and "related".
 * @throws Exception When the search term is empty, the page cannot be
 *                   fetched, or detect_block() reports a block page.
 */
public function web($get){

    if($get["npt"]){

        // follow-up page: the stored token is appended to the site root
        [$token, $proxy] = $this->backend->get($get["npt"], "web");

        try{
            $html =
                $this->get(
                    $proxy,
                    "https://www.mojeek.com" . $token,
                    []
                );
        }catch(Exception $error){

            throw new Exception("Failed to get HTML");
        }

    }else{

        $search = $get["s"];
        if(strlen($search) === 0){

            throw new Exception("Search term is empty!");
        }

        $proxy = $this->backend->get_ip();
        $lang = $get["lang"];
        $country = $get["country"];
        $region = $get["region"];
        $domain = $get["domain"];
        $focus = $get["focus"];

        $params = [
            "q" => $search,
            "t" => 20, // number of results/page
            "tn" => 7, // number of news results/page
            "date" => 1, // show date
            "tlen" => 128, // max length of title
            //"dlen" => 511, // max length of description
            "arc" => ($country == "any" ? "none" : $country) // location. don't use autodetect!
        ];

        switch($focus){

            case "any": break;

            case "blogs":
                $params["fmt"] = "sst";
                $params["sst"] = "1";
                break;

            default:
                $params["foc_t"] = $focus;
                break;
        }

        if($lang != "any"){

            $params["lb"] = $lang;
        }

        if($region != "any"){

            $params["reg"] = $region;
        }

        if($domain != "1"){

            $params["si"] = $domain;
        }

        try{
            $html =
                $this->get(
                    $proxy,
                    "https://www.mojeek.com/search",
                    $params
                );
        }catch(Exception $error){

            throw new Exception("Failed to get HTML");
        }
    }

    $out = [
        "status" => "ok",
        "spelling" => [
            "type" => "no_correction",
            "using" => null,
            "correction" => null
        ],
        "npt" => null,
        "answer" => [],
        "web" => [],
        "image" => [],
        "video" => [],
        "news" => [],
        "related" => []
    ];

    $this->fuckhtml->load($html);

    $this->detect_block();

    $containers =
        $this->fuckhtml
        ->getElementsByClassName("results-standard", "ul");

    if(count($containers) === 0){

        // no standard result list: nothing else on the page is parsed
        return $out;
    }

    /*
        Get all search result entries
    */
    foreach($containers as $container){

        $this->fuckhtml->load($container);
        $items =
            $this->fuckhtml
            ->getElementsByTagName("li");

        foreach($items as $item){

            $data = [
                "title" => null,
                "description" => null,
                "url" => null,
                "date" => null,
                "type" => "web",
                "thumb" => [
                    "url" => null,
                    "ratio" => null
                ],
                "sublink" => [],
                "table" => []
            ];

            $this->fuckhtml->load($item);

            $title =
                $this->fuckhtml
                ->getElementsByClassName("title", "a");

            if(count($title) === 0){

                // <li> without a title link: not a result we can use
                continue;
            }

            $title = $title[0];

            $data["title"] =
                html_entity_decode(
                    $this->fuckhtml
                    ->getTextContent(
                        $title["innerHTML"]
                    )
                );

            $data["url"] =
                html_entity_decode(
                    $this->fuckhtml
                    ->getTextContent(
                        $title["attributes"]["href"]
                    )
                );

            $description =
                $this->fuckhtml
                ->getElementsByClassName(
                    "s", "p"
                );

            if(count($description) !== 0){

                $data["description"] =
                    $this->titledots(
                        html_entity_decode(
                            $this->fuckhtml
                            ->getTextContent(
                                $description[0]
                            )
                        )
                    );
            }

            $date =
                $this->fuckhtml
                ->getElementsByClassName(
                    "mdate",
                    "span"
                );

            if(count($date) !== 0){

                $data["date"] =
                    strtotime(
                        $this->fuckhtml
                        ->getTextContent(
                            $date[0]
                        )
                    );
            }

            $out["web"][] = $data;
        }
    }

    /*
        Get instant answers
    */
    $this->fuckhtml->load($html);

    $infoboxes =
        $this->fuckhtml
        ->getElementsByClassName(
            "infobox infobox-top",
            "div"
        );

    foreach($infoboxes as $infobox){

        $answer = [
            "title" => null,
            "description" => [],
            "url" => null,
            "thumb" => null,
            "table" => [],
            "sublink" => []
        ];

        // the infobox is split on <hr>: [0] title + short definition,
        // [1] thumbnail + description, [2] source link
        $infobox_html =
            explode(
                "<hr>",
                $infobox["innerHTML"]
            );

        $this->fuckhtml->load($infobox_html[0]);

        // title
        $headings =
            $this->fuckhtml
            ->getElementsByTagName("h1");

        if(count($headings) !== 0){

            $answer["title"] =
                $this->fuckhtml
                ->getTextContent(
                    $headings[0]
                );
        }

        // short definition
        $definition =
            $this->fuckhtml
            ->getElementsByTagName(
                "p"
            );

        if(count($definition) !== 0){

            $answer["description"][] = [
                "type" => "quote",
                "value" =>
                    $this->fuckhtml
                    ->getTextContent(
                        $definition[0]
                    )
            ];
        }

        // thumbnail + description only exist when a second part is present
        if(isset($infobox_html[1])){

            $this->fuckhtml->load($infobox_html[1]);

            $thumb =
                $this->fuckhtml
                ->getElementsByClassName("float-right", "img");

            if(count($thumb) !== 0){

                preg_match(
                    '/\/image\?img=([^&]+)/i',
                    $thumb[0]["attributes"]["src"],
                    $matches
                );

                if(count($matches) === 2){

                    // for some reason, if we dont get the image from mojeek
                    // it sometimes fail to fetch the right image URL
                    $answer["thumb"] =
                        "https://mojeek.com" .
                        $this->fuckhtml
                        ->getTextContent(
                            $thumb[0]["attributes"]["src"]
                        );
                }
            }

            // get description
            $ps =
                $this->fuckhtml
                ->getElementsByTagName("p");

            foreach($ps as $p){

                $this->fuckhtml->load($p);

                if(
                    preg_match(
                        '/^\s*<strong>/i',
                        $p["innerHTML"]
                    )
                ){

                    /*
                        Parse table: "<strong>Key:</strong> value"
                    */
                    $strong =
                        $this->fuckhtml
                        ->getElementsByTagName("strong")[0];

                    $p["innerHTML"] =
                        str_replace($strong["innerHTML"], "", $p["innerHTML"]);

                    $strong =
                        preg_replace(
                            '/:$/',
                            "",
                            ucfirst(
                                $this->fuckhtml
                                ->getTextContent(
                                    $strong
                                )
                            )
                        );

                    $answer["table"][trim($strong)] =
                        trim(
                            $this->fuckhtml
                            ->getTextContent(
                                $p
                            )
                        );

                    continue;
                }

                $as =
                    $this->fuckhtml
                    ->getElementsByClassName("svg-icon");

                if(count($as) !== 0){

                    /*
                        Parse websites
                    */
                    foreach($as as $a){

                        // the sublink label is the class that follows the
                        // first one; fall back to the only class present
                        $classes = explode(" ", $a["attributes"]["class"], 2);

                        $answer["sublink"][
                            ucfirst($classes[1] ?? $classes[0])
                        ] =
                            $this->fuckhtml
                            ->getTextContent(
                                $a["attributes"]["href"]
                            );
                    }

                    continue;
                }

                /*
                    Parse text content
                */
                $tags =
                    $this->fuckhtml
                    ->getElementsByTagName("*");

                $i = 0;
                foreach($tags as $tag){

                    $c = count($answer["description"]);

                    // remove tag from innerHTML
                    $p["innerHTML"] =
                        explode($tag["outerHTML"], $p["innerHTML"], 2);

                    if(count($p["innerHTML"]) === 2){

                        // separate paragraphs when the previous node was a link
                        if(
                            $i === 0 &&
                            $c !== 0 &&
                            $answer["description"][$c - 1]["type"] == "link"
                        ){

                            $append = "\n\n";
                        }else{

                            $append = "";
                        }

                        if($p["innerHTML"][0] != ""){
                            $answer["description"][] = [
                                "type" => "text",
                                "value" => $append . trim($p["innerHTML"][0])
                            ];
                        }

                        $p["innerHTML"] = $p["innerHTML"][1];
                    }else{

                        $p["innerHTML"] = $p["innerHTML"][0];
                    }

                    switch($tag["tagName"]){

                        case "a":

                            $value =
                                $this->fuckhtml
                                ->getTextContent(
                                    $tag
                                );

                            if(strtolower($value) == "wikipedia"){

                                // drop the attribution link, tidy the text before it
                                if($c !== 0){
                                    $answer["description"][$c - 1]["value"] =
                                        rtrim($answer["description"][$c - 1]["value"]);
                                }
                                break;
                            }

                            $answer["description"][] = [
                                "type" => "link",
                                "url" =>
                                    $this->fuckhtml
                                    ->getTextContent(
                                        $tag["attributes"]["href"]
                                    ),
                                "value" => $value
                            ];
                            break;
                    }

                    $i++;
                }
            }
        }

        // get URL
        if(isset($infobox_html[2])){

            $this->fuckhtml->load($infobox_html[2]);

            $links =
                $this->fuckhtml
                ->getElementsByTagName("a");

            if(count($links) !== 0){

                $answer["url"] =
                    $this->fuckhtml
                    ->getTextContent(
                        $links[0]["attributes"]["href"]
                    );
            }
        }

        // append answer
        $out["answer"][] = $answer;
    }

    /*
        Get news
    */
    $this->fuckhtml->load($html);

    $news =
        $this->fuckhtml
        ->getElementsByClassName(
            "results news-results",
            "div"
        );

    if(count($news) !== 0){

        $this->fuckhtml->load($news[0]);

        $lis =
            $this->fuckhtml
            ->getElementsByTagName("li");

        foreach($lis as $li){

            $this->fuckhtml->load($li);

            $a =
                $this->fuckhtml
                ->getElementsByClassName(
                    "ob",
                    "a"
                );

            if(count($a) === 0){

                continue;
            }

            $a = $a[0];

            // the date is the last " - " separated part of the first <span>
            $date = null;

            $spans =
                $this->fuckhtml
                ->getElementsByTagName("span");

            if(count($spans) !== 0){

                $parts =
                    explode(
                        " - ",
                        $this->fuckhtml
                        ->getTextContent(
                            $spans[0]
                        )
                    );

                $date =
                    strtotime(
                        $parts[count($parts) - 1]
                    );
            }

            $out["news"][] = [
                "title" =>
                    html_entity_decode(
                        $this->fuckhtml
                        ->getTextContent(
                            $a
                        )
                    ),
                "description" => null,
                "date" => $date,
                "thumb" => [
                    "url" => null,
                    "ratio" => null
                ],
                "url" =>
                    $this->fuckhtml
                    ->getTextContent(
                        $a["attributes"]["href"]
                    )
            ];
        }
    }

    /*
        Get next page
    */
    $this->fuckhtml->load($html);

    $pagination =
        $this->fuckhtml
        ->getElementsByClassName("pagination");

    // count() never returns false, so compare against 0: without this the
    // last page (no pagination element) reads $pagination[0] on an empty array
    if(count($pagination) !== 0){

        $this->fuckhtml->load($pagination[0]);
        $as =
            $this->fuckhtml
            ->getElementsByTagName("a");

        foreach($as as $a){

            if($a["innerHTML"] == "Next"){

                // keep the same proxy for the follow-up request
                $out["npt"] = $this->backend->store(
                    $this->fuckhtml
                    ->getTextContent(
                        $a["attributes"]["href"]
                    ),
                    "web",
                    $proxy
                );
            }
        }
    }

    return $out;
}
/**
 * Issue an HTTP GET request through the given proxy and return the raw
 * response body.
 *
 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
 * @param string $url   Target URL without query string.
 * @param array  $get   Query parameters; appended to $url when not empty.
 * @return string|bool  Whatever curl_exec() returned.
 * @throws Exception    With the cURL error message when the transfer fails.
 */
private function get($proxy, $url, $get = []){

    // append the query string only when parameters were supplied
    if($get !== []){

        $url .= "?" . http_build_query($get);
    }

    $request_headers = [
        "User-Agent: " . config::USER_AGENT,
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip",
        "Referer: https://beta.mwmbl.org/",
        "DNT: 1",
        "Sec-GPC: 1",
        "Connection: keep-alive",
        "Upgrade-Insecure-Requests: 1",
        "Sec-Fetch-Dest: document",
        "Sec-Fetch-Mode: navigate",
        "Sec-Fetch-Site: same-origin",
        "Priority: u=0, i",
        "Sec-Fetch-User: ?1"
    ];

    $handle = curl_init();

    curl_setopt_array(
        $handle,
        [
            CURLOPT_URL => $url,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_2_0, // use http2
            CURLOPT_ENCODING => "", // default encoding
            CURLOPT_HTTPHEADER => $request_headers,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_CONNECTTIMEOUT => 30,
            CURLOPT_TIMEOUT => 30
        ]
    );

    $this->backend->assign_proxy($handle, $proxy);

    $body = curl_exec($handle);

    if(curl_errno($handle) !== 0){

        throw new Exception(curl_error($handle));
    }

    curl_close($handle);
    return $body;
}
If you're getting a timeout, make sure you have curl-impersonate setup."); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + $results = + $this->fuckhtml + ->getElementsByClassName( + "result", + "li" + ); + + foreach($results as $result){ + + $this->fuckhtml->load($result); + + $p = + $this->fuckhtml + ->getElementsByTagName("p"); + + $sublinks = []; + + $mores = + $this->fuckhtml + ->getElementsByClassName( + "result-link-more", + "div" + ); + + foreach($mores as $more){ + + $this->fuckhtml->load($more); + + $as = + $this->fuckhtml + ->getElementsByClassName( + "more", + "a" + ); + + if(count($as) === 0){ + + // ?? invalid + continue; + } + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "more-title", + "span" + )[0] + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "more-extract", + "span" + )[0] + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $as[0] + ["attributes"] + ["href"] + ) + ]; + } + + // reset + $this->fuckhtml->load($result); + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "title", + $p + )[0] + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "extract", + $p + )[0] + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName("a") + [0] + ["attributes"] + ["href"] + ), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublinks, + "table" => [] + ]; + } + + return 
/**
 * Pinterest image scraper.
 *
 * Talks to the BaseSearchResource endpoint on ca.pinterest.com: the first
 * page is a GET that also collects the session cookies, follow-up pages are
 * POSTs that replay those cookies together with the CSRF token.
 */
class pinterest{

    public function __construct(){

        include "lib/backend.php";
        $this->backend = new backend("pinterest");
    }

    /**
     * This scraper exposes no filters.
     *
     * @param string $page Requested page type (unused).
     * @return array Always an empty array.
     */
    public function getfilters($page){

        return [];
    }

    /**
     * Send a request to Pinterest and return the raw response body.
     *
     * With $header_data_post === null a GET is sent, the Set-Cookie response
     * headers are collected and $cookies is overwritten with
     * ["csrf" => ..., "cookie" => ...]. Otherwise a POST is sent that uses
     * $cookies["csrf"] and $cookies["cookie"], and $header_data_post (the
     * search term) is embedded in the X-Pinterest-Source-Url header.
     *
     * $cookies now carries a default value: an optional parameter ($get)
     * declared before a required one is deprecated since PHP 8.0. Existing
     * callers, which pass every argument, are unaffected.
     *
     * @param mixed       $proxy            Proxy descriptor for backend::assign_proxy().
     * @param string      $url              Endpoint URL.
     * @param array       $get              Request parameters (query string or POST body).
     * @param array       $cookies          In/out cookie state, see above.
     * @param string|null $header_data_post Search term for POST mode, null for GET mode.
     * @return string|bool Whatever curl_exec() returned.
     * @throws Exception On cURL failure or when no csrftoken cookie was received.
     */
    private function get($proxy, $url, $get = [], &$cookies = [], $header_data_post = null){

        $curlproc = curl_init();

        // cookies seen in the response headers (GET mode only)
        $cookies_tmp = [];

        if($header_data_post === null){

            // handling GET

            // extract cookies
            curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){

                // cURL expects the number of bytes consumed
                $length = strlen($header);

                $header = explode(":", $header, 2);

                if(
                    isset($header[1]) &&
                    trim(strtolower($header[0])) == "set-cookie"
                ){

                    $cookie_tmp = explode("=", trim($header[1]), 2);

                    // keep only the value, drop attributes after the first ";"
                    $cookies_tmp[trim($cookie_tmp[0])] =
                        explode(";", $cookie_tmp[1] ?? "", 2)[0];
                }

                return $length;
            });

            curl_setopt($curlproc, CURLOPT_HTTPHEADER,
                ["User-Agent: " . config::USER_AGENT,
                "Accept: application/json, text/javascript, */*, q=0.01",
                "Accept-Language: en-US,en;q=0.5",
                "Accept-Encoding: gzip",
                "Referer: https://ca.pinterest.com/",
                "X-Requested-With: XMLHttpRequest",
                "X-APP-VERSION: 78f8764",
                "X-Pinterest-AppState: active",
                "X-Pinterest-Source-Url: /",
                "X-Pinterest-PWS-Handler: www/index.js",
                "screen-dpr: 1",
                "is-preload-enabled: 1",
                "DNT: 1",
                "Sec-GPC: 1",
                "Sec-Fetch-Dest: empty",
                "Sec-Fetch-Mode: cors",
                "Sec-Fetch-Site: same-origin",
                "Connection: keep-alive",
                "Alt-Used: ca.pinterest.com",
                "Priority: u=0",
                "TE: trailers"]
            );

            if($get !== []){
                $get = http_build_query($get);
                $url .= "?" . $get;
            }
        }else{

            // handling POST (pagination)
            $get = http_build_query($get);

            curl_setopt($curlproc, CURLOPT_HTTPHEADER,
                ["User-Agent: " . config::USER_AGENT,
                "Accept: application/json, text/javascript, */*, q=0.01",
                "Accept-Language: en-US,en;q=0.5",
                "Accept-Encoding: gzip",
                "Content-Type: application/x-www-form-urlencoded",
                "Content-Length: " . strlen($get),
                "Referer: https://ca.pinterest.com/",
                "X-Requested-With: XMLHttpRequest",
                "X-APP-VERSION: 78f8764",
                "X-CSRFToken: " . $cookies["csrf"],
                "X-Pinterest-AppState: active",
                "X-Pinterest-Source-Url: /search/pins/?rs=ac&len=2&q=" . urlencode($header_data_post) . "&eq=" . urlencode($header_data_post),
                "X-Pinterest-PWS-Handler: www/search/[scope].js",
                "screen-dpr: 1",
                "is-preload-enabled: 1",
                "Origin: https://ca.pinterest.com",
                "DNT: 1",
                "Sec-GPC: 1",
                "Sec-Fetch-Dest: empty",
                "Sec-Fetch-Mode: cors",
                "Sec-Fetch-Site: same-origin",
                "Connection: keep-alive",
                "Alt-Used: ca.pinterest.com",
                "Cookie: " . $cookies["cookie"],
                "TE: trailers"]
            );

            curl_setopt($curlproc, CURLOPT_POST, true);
            curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
        }

        curl_setopt($curlproc, CURLOPT_URL, $url);

        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

        // http2 bypass
        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

        $this->backend->assign_proxy($curlproc, $proxy);

        $data = curl_exec($curlproc);

        // read the error state first so the handle is released on every path
        $errno = curl_errno($curlproc);
        $error = curl_error($curlproc);
        curl_close($curlproc);

        if($errno){

            throw new Exception($error);
        }

        if($header_data_post === null){

            if(!isset($cookies_tmp["csrftoken"])){

                throw new Exception("Failed to grep CSRF token");
            }

            // rebuild a "name=value; name=value" Cookie header value
            $pairs = [];

            foreach($cookies_tmp as $cookie_name => $cookie_value){

                $pairs[] = $cookie_name . "=" . $cookie_value;
            }

            $cookies = [
                "csrf" => $cookies_tmp["csrftoken"],
                "cookie" => rtrim(implode("; ", $pairs), " ;")
            ];
        }

        return $data;
    }

    /**
     * Search Pinterest pins and return them as image results.
     *
     * Reads "npt" (next-page token) and "s" (search term) from $get. When a
     * token is given, the stored query, bookmark and cookies are replayed
     * through the proxy that served the first page.
     *
     * @param array $get Request parameters.
     * @return array ["status" => "ok", "npt" => string|null, "image" => array].
     * @throws Exception On an empty search term, transport or JSON failure,
     *                   a non-success API status, or a sensitivity notice.
     */
    public function image($get){

        if($get["npt"]){

            [$data, $proxy] =
                $this->backend->get(
                    $get["npt"], "images"
                );

            $data = json_decode($data, true);

            $search = $data["q"];
            $cookies = $data["cookies"];

            try{
                $json =
                    $this->get(
                        $proxy,
                        "https://ca.pinterest.com/resource/BaseSearchResource/get/",
                        [
                            "source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
                            "data" => json_encode(
                                [
                                    "options" => [
                                        "applied_unified_filters" => null,
                                        "appliedProductFilters" => "---",
                                        "article" => null,
                                        "auto_correction_disabled" => false,
                                        "corpus" => null,
                                        "customized_rerank_type" => null,
                                        "domains" => null,
                                        "dynamicPageSizeExpGroup" => null,
                                        "filters" => null,
                                        "journey_depth" => null,
                                        "page_size" => null,
                                        "price_max" => null,
                                        "price_min" => null,
                                        "query_pin_sigs" => null,
                                        "query" => $data["q"],
                                        "redux_normalize_feed" => true,
                                        "request_params" => null,
                                        "rs" => "typed",
                                        "scope" => "pins",
                                        "selected_one_bar_modules" => null,
                                        "source_id" => null,
                                        "source_module_id" => null,
                                        "source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
                                        "top_pin_id" => null,
                                        "top_pin_ids" => null,
                                        "bookmarks" => [
                                            $data["bookmark"]
                                        ]
                                    ],
                                    "context" => []
                                ],
                                JSON_UNESCAPED_SLASHES
                            )
                        ],
                        $cookies,
                        $search
                    );

            }catch(Exception $error){

                throw new Exception("Failed to fetch JSON");
            }

        }else{

            $search = $get["s"];
            if(strlen($search) === 0){

                throw new Exception("Search term is empty!");
            }

            // The request mirrors what the web client sends:
            //   GET /resource/BaseSearchResource/get/
            //       ?source_url=<search page path>
            //       &data=<JSON-encoded options/context object>
            //       &_=<millisecond timestamp>

            // NOTE(review): "rs" is set to the search term here while the
            // pagination branch uses "rs=typed" -- looks unintended, confirm
            // against the live site before changing the request.
            $source_url = "/search/pins/?q=" . urlencode($search) . "&rs=" . urlencode($search);

            $filter = [
                "source_url" => $source_url,
                "rs" => "typed",
                "data" =>
                    json_encode(
                        [
                            "options" => [
                                "applied_unified_filters" => null,
                                "appliedProductFilters" => "---",
                                "article" => null,
                                "corpus" => null,
                                "customized_rerank_type" => null,
                                "domains" => null,
                                "dynamicPageSizeExpGroup" => null,
                                "filters" => null,
                                "journey_depth" => null,
                                "page_size" => null,
                                "price_max" => null,
                                "price_min" => null,
                                "query_pin_sigs" => null,
                                "query" => $search,
                                "redux_normalize_feed" => true,
                                "request_params" => null,
                                "rs" => "ac",
                                "scope" => "pins", // pins, boards, videos,
                                "selected_one_bar_modules" => null,
                                "source_id" => null,
                                "source_module_id" => null,
                                "source_url" => $source_url,
                                "top_pin_id" => null,
                                "top_pin_ids" => null
                            ],
                            "context" => []
                        ]
                    ),
                // millisecond timestamp; computed numerically because the
                // string form of a float has a varying number of decimals
                "_" => (string)(int)floor(microtime(true) * 1000)
            ];

            $proxy = $this->backend->get_ip();
            $cookies = [];

            try{
                $json =
                    $this->get(
                        $proxy,
                        "https://ca.pinterest.com/resource/BaseSearchResource/get/",
                        $filter,
                        $cookies,
                        null
                    );

            }catch(Exception $error){

                throw new Exception("Failed to fetch JSON");
            }
        }

        $json = json_decode($json, true);

        if($json === null){

            throw new Exception("Failed to decode JSON");
        }

        $out = [
            "status" => "ok",
            "npt" => null,
            "image" => []
        ];

        if(
            !isset(
                $json["resource_response"]
                ["status"]
            )
        ){

            throw new Exception("Unknown API failure");
        }

        if($json["resource_response"]["status"] != "success"){

            $status = "Got non-OK response: " . $json["resource_response"]["status"];

            if(
                isset(
                    $json["resource_response"]["message"]
                )
            ){

                $status .= " - " . $json["resource_response"]["message"];
            }

            throw new Exception($status);
        }

        if(
            isset(
                $json["resource_response"]["sensitivity"]
                ["notices"][0]["description"]["text"]
            )
        ){

            throw new Exception(
                "Pinterest returned a notice: " .
                $json["resource_response"]["sensitivity"]["notices"][0]["description"]["text"]
            );
        }

        // get NPT
        if(isset($json["resource_response"]["bookmark"])){

            $out["npt"] =
                $this->backend->store(
                    json_encode([
                        "q" => $search,
                        "bookmark" => $json["resource_response"]["bookmark"],
                        "cookies" => $cookies
                    ]),
                    "images",
                    $proxy
                );
        }

        // a successful response without a result list yields no images
        $results = $json["resource_response"]["data"]["results"] ?? [];

        foreach($results as $item){

            $type = $item["type"] ?? null;

            if($type != "pin" && $type != "board"){

                continue;
            }

            /*
                Handle image object
            */
            $images = array_values($item["images"] ?? []);

            if($images === []){

                // nothing to show for this item
                continue;
            }

            $image = $images[count($images) - 1]; // original
            $thumb = $images[1] ?? $image; // 236x, falls back to the only rendition

            $title = [];

            if(
                isset($item["grid_title"]) &&
                trim($item["grid_title"]) != ""
            ){

                $title[] = $item["grid_title"];
            }

            if(
                isset($item["description"]) &&
                trim($item["description"]) != ""
            ){

                $title[] = $item["description"];
            }

            $title = implode(": ", $title);

            if(
                $title == "" &&
                isset($item["board"]["name"]) &&
                trim($item["board"]["name"]) != ""
            ){

                $title = $item["board"]["name"];
            }

            if($title == ""){

                $title = null;
            }

            $out["image"][] = [
                "title" => $title,
                "source" => [
                    [
                        "url" => $image["url"],
                        "width" => (int)$image["width"],
                        "height" => (int)$image["height"]
                    ],
                    [
                        "url" => $thumb["url"],
                        "width" => (int)$thumb["width"],
                        "height" => (int)$thumb["height"]
                    ]
                ],
                // link to the pin itself when no outbound link is provided
                "url" =>
                    $item["link"] ??
                    "https://ca.pinterest.com/pin/" . $item["id"]
            ];
        }

        return $out;
    }
}
"Yes", + "no" => "No" + ] + ] + ] + ); + break; + + case "images": + $base = array_merge( + $base, + [ + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "day" => "Past 24 hours", + "week" => "Past week", + "month" => "Past month" + ] + ], + "size" => [ + "display" => "Size", + "option" => [ + "any" => "Any size", + "large" => "Large", + "medium" => "Medium", + "small" => "Small" + ] + ], + "color" => [ + "display" => "Color", + "option" => [ + "any" => "Any color", + "coloronly" => "Color only", + "monochrome" => "Monochrome", + "black" => "Black", + "brown" => "Brown", + "gray" => "Gray", + "white" => "White", + "yellow" => "Yellow", + "orange" => "Orange", + "red" => "Red", + "pink" => "Pink", + "purple" => "Purple", + "blue" => "Blue", + "teal" => "Teal", + "green" => "Green" + ] + ], + "imagetype" => [ + "display" => "Type", + "option" => [ + "any" => "Any type", + "animatedgif" => "Animated GIF", + "photo" => "Photograph", + "transparent" => "Transparent" + ] + ], + "license" => [ + "display" => "License", + "option" => [ + "any" => "Any license", + "share" => "Non-commercial reproduction and sharing", + "sharecommercially" => "Reproduction and sharing", + "modify" => "Non-commercial reproduction, sharing and modification", + "modifycommercially" => "Reproduction, sharing and modification", + "public" => "Public domain" + ] + ] + ] + ); + break; + + case "videos": + $base = array_merge( + $base, + [ + "order" => [ + "display" => "Order by", + "option" => [ + "relevance" => "Relevance", + "views" => "Views", + "date" => "Most recent", + ] + ], + "source" => [ + "display" => "Source", + "option" => [ + "any" => "Any source", + "youtube" => "YouTube", + "dailymotion" => "Dailymotion", + ] + ] + ] + ); + break; + + case "news": + $base = array_merge( + $base, + [ + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "hour" => "Less than 1 hour ago", + "day" => "Past 24 hours", + "week" => "Past 
/**
 * Perform an HTTP GET request against Qwant through the given proxy.
 *
 * @param mixed  $proxy Proxy descriptor, handed untouched to backend->assign_proxy()
 * @param string $url   Endpoint URL without a query string
 * @param array  $get   Query parameters; when not empty they are appended to $url
 * @return string|bool  Whatever curl_exec() returned (response body on success)
 * @throws Exception    Carrying curl's error message when curl reports an error
 */
private function get($proxy, $url, $get = []){
	
	// only append a query string when parameters were supplied
	$target = $url;
	if($get !== []){
		
		$target .= "?" . http_build_query($get);
	}
	
	$curlproc = curl_init();
	
	curl_setopt_array(
		$curlproc,
		[
			CURLOPT_URL => $target,
			CURLOPT_ENCODING => "", // default encoding
			CURLOPT_HTTPHEADER => [
				"User-Agent: " . config::USER_AGENT,
				"Accept: application/json, text/plain, */*",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Connection: keep-alive",
				"Origin: https://www.qwant.com",
				"Referer: https://www.qwant.com/",
				"Sec-Fetch-Dest: empty",
				"Sec-Fetch-Mode: cors",
				"Sec-Fetch-Site: same-site",
				"TE: trailers"
			],
			// Bypass HTTP/2 check
			CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_2_0,
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_SSL_VERIFYHOST => 2,
			CURLOPT_SSL_VERIFYPEER => true,
			CURLOPT_CONNECTTIMEOUT => 30,
			CURLOPT_TIMEOUT => 30
		]
	);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$response = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		throw new Exception(curl_error($curlproc));
	}
	
	curl_close($curlproc);
	
	return $response;
}
+ "freshness" => $get["time"], + "count" => 10, + "locale" => $get["country"], + "offset" => 0, + "device" => "desktop", + "tgp" => 3, + "safesearch" => 0, + "displayed" => "true" + ]; + + switch($get["nsfw"]){ + + case "yes": $params["safesearch"] = 0; break; + case "maybe": $params["safesearch"] = 1; break; + case "no": $params["safesearch"] = 2; break; + } + } + /* + $handle = fopen("scraper/qwant_web.json", "r"); + $json = fread($handle, filesize("scraper/qwant_web.json")); + fclose($handle);*/ + + try{ + $json = + $this->get( + $proxy, + "https://fdn.qwant.com/v3/search/web", + $params + ); + + }catch(Exception $error){ + + throw new Exception("Could not fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === NULL){ + + throw new Exception("Failed to decode JSON"); + } + + if(isset($json["data"]["message"][0])){ + + throw new Exception("Server returned an error:\n" . $json["data"]["message"][0]); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + if( + $json["status"] != "success" && + $json["data"]["error_code"] === 5 + ){ + + // no results + return $out; + } + + $this->detect_errors($json); + + if(!isset($json["data"]["result"]["items"]["mainline"])){ + + throw new Exception("Server did not return a result object"); + } + + // data is OK, parse + + // get instant answer + if( + $get["extendedsearch"] == "yes" && + isset($json["data"]["result"]["items"]["sidebar"][0]["endpoint"]) + ){ + + try{ + $answer = + $this->get( + $proxy, + "https://api.qwant.com/v3" . 
+ $json["data"]["result"]["items"]["sidebar"][0]["endpoint"], + [] + ); + + $answer = json_decode($answer, true); + + if( + $answer === null || + $answer["status"] != "success" || + $answer["data"]["result"] === null + ){ + + throw new Exception(); + } + + // parse answer + $out["answer"][] = [ + "title" => $answer["data"]["result"]["title"], + "description" => [ + [ + "type" => "text", + "value" => $this->trimdots($answer["data"]["result"]["description"]) + ] + ], + "url" => $answer["data"]["result"]["url"], + "thumb" => + $answer["data"]["result"]["thumbnail"]["landscape"] == null ? + null : + $this->unshitimage($answer["data"]["result"]["thumbnail"]["landscape"]), + "table" => [], + "sublink" => [] + ]; + + }catch(Exception $error){ + + // do nothing in case of failure + } + + } + + // get word correction + if(isset($json["data"]["query"]["queryContext"]["alteredQuery"])){ + + $out["spelling"] = [ + "type" => "including", + "using" => $json["data"]["query"]["queryContext"]["alteredQuery"], + "correction" => $json["data"]["query"]["queryContext"]["alterationOverrideQuery"] + ]; + } + + // check for next page + if($json["data"]["result"]["lastPage"] === false){ + + $params["offset"] = $params["offset"] + 10; + + $out["npt"] = + $this->backend->store( + json_encode($params), + "web", + $proxy + ); + } + + // parse results + foreach($json["data"]["result"]["items"]["mainline"] as $item){ + + switch($item["type"]){ // ignores ads + + case "web": + + $first_iteration = true; + foreach($item["items"] as $result){ + + if(isset($result["thumbnailUrl"])){ + + $thumb = [ + "url" => $this->unshitimage($result["thumbnailUrl"]), + "ratio" => "16:9" + ]; + }else{ + + $thumb = [ + "url" => null, + "ratio" => null + ]; + } + + $sublinks = []; + if(isset($result["links"])){ + + foreach($result["links"] as $link){ + + $sublinks[] = [ + "title" => $this->trimdots($link["title"]), + "date" => null, + "description" => isset($link["desc"]) ? 
$this->trimdots($link["desc"]) : null, + "url" => $link["url"] + ]; + } + } + + // detect gibberish results + if( + $first_iteration && + !isset($result["urlPingSuffix"]) + ){ + + throw new Exception("Qwant returned gibberish results"); + } + + $out["web"][] = [ + "title" => $this->trimdots($result["title"]), + "description" => $this->trimdots($result["desc"]), + "url" => $result["url"], + "date" => null, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => [] + ]; + + $first_iteration = false; + } + break; + + case "images": + foreach($item["items"] as $image){ + + $out["image"][] = [ + "title" => $image["title"], + "source" => [ + [ + "url" => $image["media"], + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ + "url" => $this->unshitimage($image["thumbnail"]), + "width" => $image["thumb_width"], + "height" => $image["thumb_height"] + ] + ], + "url" => $image["url"] + ]; + } + break; + + case "videos": + foreach($item["items"] as $video){ + + $out["video"][] = [ + "title" => $video["title"], + "description" => null, + "date" => (int)$video["date"], + "duration" => $video["duration"] === null ? null : $video["duration"] / 1000, + "views" => null, + "thumb" => + $video["thumbnail"] === null ? 
+ [ + "url" => null, + "ratio" => null, + ] : + [ + "url" => $this->unshitimage($video["thumbnail"]), + "ratio" => "16:9", + ], + "url" => $video["url"] + ]; + } + break; + + case "related_searches": + foreach($item["items"] as $related){ + + $out["related"][] = $related["text"]; + } + break; + } + } + + return $out; + } + + + public function image($get){ + + if($get["npt"]){ + + [$params, $proxy] = + $this->backend->get( + $get["npt"], + "images" + ); + + $params = json_decode($params, true); + }else{ + + $search = $get["s"]; + + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + $params = [ + "t" => "images", + "q" => $search, + "count" => 125, + "locale" => $get["country"], + "offset" => 0, // increment by 125 + "device" => "desktop", + "tgp" => 3 + ]; + + if($get["time"] != "any"){ + + $params["freshness"] = $get["time"]; + } + + foreach(["size", "color", "imagetype", "license"] as $p){ + + if($get[$p] != "any"){ + + $params[$p] = $get[$p]; + } + } + + switch($get["nsfw"]){ + + case "yes": $params["safesearch"] = 0; break; + case "maybe": $params["safesearch"] = 1; break; + case "no": $params["safesearch"] = 2; break; + } + } + + try{ + $json = $this->get( + $proxy, + "https://api.qwant.com/v3/search/images", + $params, + ); + }catch(Exception $err){ + + throw new Exception("Failed to get JSON"); + } + + /* + $handle = fopen("scraper/yandex.json", "r"); + $json = fread($handle, filesize("scraper/yandex.json")); + fclose($handle);*/ + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + $this->detect_errors($json); + + if(isset($json["data"]["result"]["items"]["mainline"])){ + + throw new Exception("Qwant returned gibberish results"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + if($json["data"]["result"]["lastPage"] === false){ + + $params["offset"] = $params["offset"] + 125; + + $out["npt"] 
/**
 * Search Qwant for videos.
 *
 * Reads "s" (search term), "country" and "nsfw" from $get. Only the
 * first 50 results are requested; pagination is not implemented, so
 * "npt" is always null in the result.
 *
 * @param array $get Request parameters
 * @return array     Result with "status", "npt", "video", "author",
 *                   "livestream", "playlist" and "reel" keys
 * @throws Exception On an empty search term, a failed request, undecodable
 *                   JSON, an API error or a response in the web-search layout
 */
public function video($get){
	
	$search = $get["s"];
	if(strlen($search) === 0){
		
		throw new Exception("Search term is empty!");
	}
	
	$params = [
		"t" => "videos",
		"q" => $search,
		"count" => 50,
		"locale" => $get["country"],
		"offset" => 0, // dont implement pagination
		"device" => "desktop",
		"tgp" => 3
	];
	
	// translate the NSFW filter into Qwant's safesearch level;
	// any other value leaves the parameter out of the request
	if($get["nsfw"] == "yes"){
		
		$params["safesearch"] = 0;
	}elseif($get["nsfw"] == "maybe"){
		
		$params["safesearch"] = 1;
	}elseif($get["nsfw"] == "no"){
		
		$params["safesearch"] = 2;
	}
	
	try{
		$raw =
			$this->get(
				$this->backend->get_ip(),
				"https://api.qwant.com/v3/search/videos",
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Could not fetch JSON");
	}
	
	$json = json_decode($raw, true);
	
	if($json === null){
		
		throw new Exception("Could not parse JSON");
	}
	
	$this->detect_errors($json);
	
	// a "mainline" key belongs to the web search layout, not to video results
	if(isset($json["data"]["result"]["items"]["mainline"])){
		
		throw new Exception("Qwant returned gibberish results");
	}
	
	$videos = [];
	
	foreach($json["data"]["result"]["items"] as $video){
		
		$seconds = (int)$video["duration"];
		
		$videos[] = [
			"title" => $video["title"],
			"description" => $this->limitstrlen($video["desc"]),
			"author" => [
				"name" => $video["channel"],
				"url" => null,
				"avatar" => null
			],
			"date" => (int)$video["date"],
			"duration" => $seconds === 0 ? null : $seconds,
			"views" => null,
			"thumb" =>
				empty($video["thumbnail"]) ?
				[
					"url" => null,
					"ratio" => null
				] :
				[
					"url" => $this->unshitimage($video["thumbnail"]),
					"ratio" => "16:9"
				],
			"url" => preg_replace("/\?syndication=.+/", "", $video["url"])
		];
	}
	
	return [
		"status" => "ok",
		"npt" => null,
		"video" => $videos,
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
}
/**
 * Turn a Qwant thumbnail proxy URL into a direct Bing thumbnail URL.
 *
 * Qwant serves thumbnails from sN.qwant.com with the real Bing URL in
 * the "u" query parameter, for example:
 *   https://s1.qwant.com/thumbr/0x0/.../th.jpg?u=https%3A%2F%2Fwww.bing.com%2Fth%3Fid%3DOIP.xxx%26w%3D160&q=0
 *   https://s2.qwant.com/thumbr/474x289/.../th.jpg?u=https%3A%2F%2Ftse.mm.bing.net%2Fth%3Fid%3DOIP.xxx%26pid%3DApi&q=0
 *
 * The Bing image id is taken from the inner URL's "id" query parameter
 * or, failing that, from a "/th/id/<id>" path.
 *
 * @param string $url Thumbnail URL as returned by Qwant
 * @return string     "https://<bing host>/th?id=<id>", or $url unchanged
 *                    whenever it is not a recognisable proxy URL
 */
private function unshitimage($url){
	
	$proxy_parts = parse_url($url);
	
	if(
		!isset($proxy_parts["host"]) ||
		!isset($proxy_parts["query"])
	){
		
		// cant do anything
		return $url;
	}
	
	if(
		!preg_match(
			'/s[0-9]+\.qwant\.com$/',
			$proxy_parts["host"]
		)
	){
		
		// not a qwant proxy URL: there is no inner URL to extract.
		// Previously execution fell through with $bing_url undefined.
		return $url;
	}
	
	parse_str($proxy_parts["query"], $proxy_query);
	
	if(
		!isset($proxy_query["u"]) ||
		!is_string($proxy_query["u"]) // u[]=... would hand an array to parse_url()
	){
		
		// give up
		return $url;
	}
	
	// parse bing URL
	$bing = parse_url($proxy_query["u"]);
	
	if(
		$bing === false ||
		!isset($bing["host"])
	){
		
		// malformed inner URL, nothing to rebuild the link from
		return $url;
	}
	
	$id = null;
	
	if(isset($bing["query"])){
		
		parse_str($bing["query"], $bing_query);
		
		if(isset($bing_query["id"])){
			
			$id = $bing_query["id"];
		}
	}
	
	if($id === null){
		
		// fall back to the /th/id/<id> path form
		$pieces = explode("/th/id/", $bing["path"] ?? "", 2);
		
		if(count($pieces) !== 2){
			
			// malformed
			return $url;
		}
		
		$id = $pieces[1];
	}
	
	if(is_array($id)){
		
		// id[]=... in the query string, let proxy.php deal with it
		return $url;
	}
	
	return "https://" . $bing["host"] . "/th?id=" . rawurlencode($id);
}
/**
 * Search SoundCloud for tracks, users, albums and playlists.
 *
 * Either continues a previous search from the stored next-page token
 * ($get["npt"]) or starts a new one from $get["s"] and $get["type"].
 *
 * Endpoints under https://api-v2.soundcloud.com/ per type:
 *   any      -> search                           (facet=model)
 *   track    -> search/tracks
 *   author   -> search/users                     (facet=place)
 *   album    -> search/albums                    (facet=genre)
 *   playlist -> search/playlists_without_albums  (facet=genre)
 *   goplus   -> search/tracks                    (filter.content_tier=SUB_HIGH_TIER, facet=genre)
 *
 * @param array $get          Request parameters ("npt", "s", "type")
 * @param bool  $last_attempt When true, a response that is not valid JSON
 *                            fails immediately instead of being retried once
 *                            with a freshly scraped client token
 * @return array              Result with "status", "npt", "song", "playlist",
 *                            "album", "podcast", "author" and "user" keys
 * @throws Exception          On an empty search term, unknown type, failed
 *                            request or a response that stays undecodable
 */
public function music($get, $last_attempt = false){
	
	if($get["npt"]){
		
		[$params, $proxy] = $this->backend->get($get["npt"], "music");
		$params = json_decode($params, true);
		
		$url = $params["url"];
		unset($params["url"]);
		
		// the stored parameters carry the client token of the first page;
		// track stream URLs below need it (it used to be undefined here)
		$token = $params["client_id"] ?? "";
		
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		// endpoint path and type-specific parameters
		$endpoints = [
			"any" => ["search", ["facet" => "model"]],
			// NOTE(review): every other type sends facet=genre/place/model,
			// "facet_genre" looks like a typo for "facet" => "genre" - confirm
			"track" => ["search/tracks", ["facet_genre" => ""]],
			"author" => ["search/users", ["facet" => "place"]],
			"album" => ["search/albums", ["facet" => "genre"]],
			"playlist" => ["search/playlists_without_albums", ["facet" => "genre"]],
			"goplus" => [
				"search/tracks",
				[
					"filter.content_tier" => "SUB_HIGH_TIER",
					"facet" => "genre"
				]
			]
		];
		
		$type = $get["type"];
		
		if(
			!is_string($type) ||
			!isset($endpoints[$type])
		){
			
			// previously $url and $params were left undefined here
			throw new Exception("Invalid search type");
		}
		
		$proxy = $this->backend->get_ip();
		$token = $this->get_token($proxy);
		
		[$path, $extra] = $endpoints[$type];
		
		$url = "https://api-v2.soundcloud.com/" . $path;
		
		// array union keeps the parameter order: q, variant_ids,
		// type-specific parameters, then the common tail
		$params =
			[
				"q" => $search,
				"variant_ids" => ""
			] +
			$extra +
			[
				"client_id" => $token,
				"limit" => 20,
				"offset" => 0,
				"linked_partitioning" => 1,
				"app_version" => 1713542117,
				"app_locale" => "en"
			];
	}
	
	$attempts_left = $last_attempt === true ? 1 : 2;
	
	while(true){
		
		try{
			
			$json = $this->get($proxy, $url, $params);
			
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch JSON");
		}
		
		$json = json_decode($json, true);
		
		if($json !== null){
			
			break;
		}
		
		$attempts_left--;
		
		if($attempts_left === 0){
			
			throw new Exception("Fetched an invalid token (please report!!)");
		}
		
		// token might've expired. get_token() returns the cached value
		// for as long as it exists, so drop it before asking for a new one
		apcu_delete("sc_token");
		$token = $this->get_token($proxy);
		$params["client_id"] = $token;
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"song" => [],
		"playlist" => [],
		"album" => [],
		"podcast" => [],
		"author" => [],
		"user" => []
	];
	
	/*
		Get next page
	*/
	if(isset($json["next_href"])){
		
		$params["query_urn"] = $json["query_urn"] ?? null;
		$params["offset"] = $params["offset"] + 20;
		$params["url"] = $url; // removed again when the page is requested
		
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"music",
				$proxy
			);
	}
	
	/*
		Scrape items
	*/
	foreach(($json["collection"] ?? []) as $item){
		
		switch($item["kind"]){
			
			case "user":
				// parse author
				$out["author"][] = [
					"title" => $item["username"],
					"followers" => $item["followers_count"],
					"description" =>
						trim(
							$item["track_count"] . " songs. " .
							$this->limitstrlen((string)$item["description"])
						),
					"thumb" => [
						"url" => $item["avatar_url"],
						"ratio" => "1:1"
					],
					"url" => $item["permalink_url"]
				];
				break;
			
			case "playlist":
				// parse playlist: the description lists the track titles
				$titles = [];
				$count = 0;
				
				foreach($item["tracks"] as $song){
					
					$count++;
					
					// some tracks come without a title (still counted)
					if(!isset($song["title"])){
						
						continue;
					}
					
					$titles[] = $song["title"];
				}
				
				if(count($titles) !== 0){
					
					$description = trim($count . " songs. " . implode(", ", $titles));
				}else{
					
					$description = "";
				}
				
				// prefer the playlist artwork, then the first track's
				if(!empty($item["artwork_url"])){
					
					$thumb = [
						"ratio" => "1:1",
						"url" => $item["artwork_url"]
					];
					
				}elseif(!empty($item["tracks"][0]["artwork_url"])){
					
					$thumb = [
						"ratio" => "1:1",
						"url" => $item["tracks"][0]["artwork_url"]
					];
				}else{
					
					$thumb = [
						"ratio" => null,
						"url" => null
					];
				}
				
				$out["playlist"][] = [
					"title" => $item["title"],
					"description" => $this->limitstrlen($description),
					"author" => [
						"name" => $item["user"]["username"],
						"url" => $item["user"]["permalink_url"],
						"avatar" => $item["user"]["avatar_url"]
					],
					"thumb" => $thumb,
					"date" => strtotime($item["created_at"]),
					"duration" => $item["duration"] / 1000,
					"url" => $item["permalink_url"]
				];
				break;
			
			case "track":
				// tracks whose monetization model mentions a TIER get no stream,
				// and neither do tracks without any transcoding
				if(
					stripos((string)($item["monetization_model"] ?? ""), "TIER") === false &&
					isset($item["media"]["transcodings"][0]["url"])
				){
					
					$stream = [
						"endpoint" => "sc",
						"url" =>
							$item["media"]["transcodings"][0]["url"] .
							"?client_id=" . $token .
							"&track_authorization=" .
							($item["track_authorization"] ?? "")
					];
				}else{
					
					$stream = [
						"endpoint" => null,
						"url" => null
					];
				}
				
				// parse track
				$out["song"][] = [
					"title" => $item["title"],
					"description" => $item["description"] == "" ? null : $this->limitstrlen($item["description"]),
					"url" => $item["permalink_url"],
					"views" => $item["playback_count"],
					"author" => [
						"name" => $item["user"]["username"],
						"url" => $item["user"]["permalink_url"],
						"avatar" => $item["user"]["avatar_url"]
					],
					"thumb" => [
						"ratio" => "1:1",
						"url" => $item["artwork_url"]
					],
					"date" => strtotime($item["created_at"]),
					"duration" => (int)$item["full_duration"] / 1000,
					"stream" => $stream
				];
				break;
		}
	}
	
	return $out;
}
a/scraper/sepiasearch.php b/scraper/sepiasearch.php new file mode 100644 index 0000000..c59e12f --- /dev/null +++ b/scraper/sepiasearch.php @@ -0,0 +1,541 @@ +<?php + +class sepiasearch{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("sepiasearch"); + } + + public function getfilters($page){ + + return [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // &sensitiveContent=both + "no" => "No" // &sensitiveContent=false + ] + ], + "language" => [ + "display" => "Language", // &language= + "option" => [ + "any" => "Any language", + "en" => "English", + "fr" => "Français", + "ar" => "العربية", + "ca" => "Català", + "cs" => "Čeština", + "de" => "Deutsch", + "el" => "ελληνικά", + "eo" => "Esperanto", + "es" => "Español", + "eu" => "Euskara", + "fa" => "فارسی", + "fi" => "Suomi", + "gd" => "Gàidhlig", + "gl" => "Galego", + "hr" => "Hrvatski", + "hu" => "Magyar", + "is" => "Íslenska", + "it" => "Italiano", + "ja" => "日本語", + "kab" => "Taqbaylit", + "nl" => "Nederlands", + "no" => "Norsk", + "oc" => "Occitan", + "pl" => "Polski", + "pt" => "Português (Brasil)", + "pt-PT" => "Português (Portugal)", + "ru" => "Pусский", + "sk" => "Slovenčina", + "sq" => "Shqip", + "sv" => "Svenska", + "th" => "ไทย", + "tok" => "Toki Pona", + "tr" => "Türkçe", + "uk" => "украї́нська мо́ва", + "vi" => "Tiếng Việt", + "zh-Hans" => "简体中文(中国)", + "zh-Hant" => "繁體中文(台灣)" + ] + ], + "type" => [ + "display" => "Result type", // i handle this + "option" => [ + "videos" => "Videos", + "playlists" => "Playlists", + "channels" => "Channels" + ] + ], + "sort" => [ + "display" => "Sort by", + "option" => [ + "best" => "Best match", // no filter + "-publishedAt" => "Newest", // sort=-publishedAt + "publishedAt" => "Oldest" // sort=publishedAt + ] + ], + "newer" => [ // &startDate=2025-07-26T04:00:00.000Z + "display" => "Newer than", + "option" => "_DATE" + ], + "duration" => [ + "display" => "Duration", + "option" => [ + "any" => "Any 
/**
 * Perform an HTTP GET request against Sepia Search through the given proxy.
 *
 * @param mixed  $proxy Proxy descriptor, handed untouched to backend->assign_proxy()
 * @param string $url   Endpoint URL without a query string
 * @param array  $get   Query parameters; when not empty they are appended to $url
 * @return string|bool  Whatever curl_exec() returned (response body on success)
 * @throws Exception    Carrying curl's error message when curl reports an error
 */
private function get($proxy, $url, $get = []){
	
	// only append a query string when parameters were supplied
	$target = $url;
	if($get !== []){
		
		$target .= "?" . http_build_query($get);
	}
	
	$curlproc = curl_init();
	
	curl_setopt_array(
		$curlproc,
		[
			CURLOPT_URL => $target,
			CURLOPT_ENCODING => "", // default encoding
			CURLOPT_HTTPHEADER => [
				"User-Agent: " . config::USER_AGENT,
				"Accept: application/json, text/plain, */*",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip, deflate, br, zstd",
				"DNT: 1",
				"Sec-GPC: 1",
				"Connection: keep-alive",
				"Referer: https://sepiasearch.org/search",
				"Sec-Fetch-Dest: empty",
				"Sec-Fetch-Mode: cors",
				"Sec-Fetch-Site: same-origin",
				"Priority: u=0",
				"TE: trailers"
			],
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_SSL_VERIFYHOST => 2,
			CURLOPT_SSL_VERIFYPEER => true,
			CURLOPT_CONNECTTIMEOUT => 30,
			CURLOPT_TIMEOUT => 30
		]
	);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$response = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		throw new Exception(curl_error($curlproc));
	}
	
	curl_close($curlproc);
	return $response;
}
$npt["durationMin"] = 600; + break; + } + + if($get["category"] != "any"){ + + $npt["categoryOneOf[]"] = $get["category"]; + } + + if($get["display"] != "any"){ + + $npt["isLive"] = $get["display"]; + } + + if($get["license"] != "any"){ + + // typo in license, lol + $npt["licenceOneOf[]"] = $get["license"]; + } + } + + $type = $get["type"]; + } + + switch($type){ + + case "videos": + $url = "https://sepiasearch.org/api/v1/search/videos"; + break; + + case "channels": + $url = "https://sepiasearch.org/api/v1/search/video-channels"; + break; + + case "playlists": + $url = "https://sepiasearch.org/api/v1/search/video-playlists"; + break; + } + + //$json = file_get_contents("scraper/sepia.json"); + try{ + + $json = + $this->get( + $proxy, + $url, + $npt + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to parse JSON"); + } + + if(isset($json["errors"])){ + + $msg = []; + foreach($json["errors"] as $error){ + + if(isset($error["msg"])){ + + $msg[] = $error["msg"]; + } + } + + throw new Exception("Sepia Search returned error(s): " . 
implode(", ", $msg)); + } + + if(!isset($json["data"])){ + + throw new Exception("Sepia Search did not return a data object"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + + switch($get["type"]){ + + case "videos": + foreach($json["data"] as $video){ + + if(count($video["account"]["avatars"]) !== 0){ + + $avatar = + $video["account"]["avatars"][count($video["account"]["avatars"]) - 1]["url"]; + }else{ + + $avatar = null; + } + + if($video["thumbnailUrl"] === null){ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + }else{ + + $thumb = [ + "ratio" => "16:9", + "url" => $video["thumbnailUrl"] + ]; + } + + if($video["isLive"]){ + + $append = "livestream"; + }else{ + + $append = "video"; + } + + $out[$append][] = [ + "title" => $video["name"], + "description" => + $this->limitstrlen( + $this->titledots( + $video["description"] + ) + ), + "author" => [ + "name" => $video["account"]["displayName"] . " ({$video["account"]["name"]})", + "url" => $video["account"]["url"], + "avatar" => $avatar + ], + "date" => strtotime($video["publishedAt"]), + "duration" => $video["isLive"] ? 
"_LIVE" : $video["duration"], + "views" => $video["views"], + "thumb" => $thumb, + "url" => $video["url"] + ]; + } + break; + + case "playlists": + foreach($json["data"] as $playlist){ + + if(count($playlist["ownerAccount"]["avatars"]) !== 0){ + + $avatar = + $playlist["ownerAccount"]["avatars"][count($playlist["ownerAccount"]["avatars"]) - 1]["url"]; + }else{ + + $avatar = null; + } + + if($playlist["thumbnailUrl"] === null){ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + }else{ + + $thumb = [ + "ratio" => "16:9", + "url" => $playlist["thumbnailUrl"] + ]; + } + + $out["playlist"][] = [ + "title" => $playlist["displayName"], + "description" => + $this->limitstrlen( + $this->titledots( + $playlist["description"] + ) + ), + "author" => [ + "name" => $playlist["ownerAccount"]["displayName"] . " ({$playlist["ownerAccount"]["name"]})", + "url" => $playlist["ownerAccount"]["url"], + "avatar" => $avatar + ], + "date" => strtotime($playlist["createdAt"]), + "duration" => $playlist["videosLength"], + "views" => null, + "thumb" => $thumb, + "url" => $playlist["url"] + ]; + } + break; + + case "channels": + foreach($json["data"] as $channel){ + + if(count($channel["avatars"]) !== 0){ + + $thumb = [ + "ratio" => "1:1", + "url" => $channel["avatars"][count($channel["avatars"]) - 1]["url"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["author"][] = [ + "title" => $channel["displayName"] . " ({$channel["name"]})", + "followers" => $channel["followersCount"], + "description" => + $channel["videosCount"] . " videos. " . 
+ $this->limitstrlen( + $this->titledots( + $channel["description"] + ) + ), + "thumb" => $thumb, + "url" => $channel["url"] + ]; + } + break; + } + + // get next page + if($json["total"] - 20 > $npt["start"]){ + + $npt["start"] += 20; + + $npt = [ + "type" => $get["type"], + "npt" => $npt + ]; + + $out["npt"] = + $this->backend + ->store( + json_encode($npt), + "videos", + $proxy + ); + } + + return $out; + } + + private function titledots($title){ + + $substr = substr($title, -3); + + if( + $substr == "..." || + $substr == "…" + ){ + + return trim(substr($title, 0, -3), " \n\r\t\v\x00\0\x0B\xc2\xa0"); + } + + return trim($title, " \n\r\t\v\x00\0\x0B\xc2\xa0"); + } + + private function limitstrlen($text){ + + return + explode( + "\n", + wordwrap( + str_replace( + ["\n\r", "\r\n", "\n", "\r"], + " ", + $text + ), + 300, + "\n" + ), + 2 + )[0]; + } +} diff --git a/scraper/solofield.php b/scraper/solofield.php new file mode 100644 index 0000000..4fe10e4 --- /dev/null +++ b/scraper/solofield.php @@ -0,0 +1,668 @@ +<?php + +class solofield{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("solofield"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "no" => "No", + ] + ] + ]; + } + + private function get($proxy, $url, $get = []){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . 
/**
 * Scrape a solofield.net web search result page.
 *
 * A first request builds the query from $get. A continuation request
 * ($get["npt"] set) replays the query string and proxy stored with the
 * next-page token.
 *
 * @param array $get Request parameters: "s" (search terms), "nsfw"
 *                   ("yes" or "no") and "npt" (token or a falsy value).
 * @return array Result set; only "web" and "npt" are filled in here.
 * @throws Exception When the page cannot be fetched, or when
 *                   error_and_load() rejects the returned document.
 */
public function web($get){
	
	if($get["npt"]){
		
		[$query, $proxy] = $this->backend->get($get["npt"], "web");
		
		$url = "https://solofield.net/search?" . $query;
		$params = [];
	}else{
		
		$proxy = $this->backend->get_ip();
		
		$url = "https://solofield.net/search";
		$params = [
			"q" => $get["s"],
			"ie" => "UTF-8",
			"oe" => "UTF-8",
			"hl" => "ja", // changing this doesnt do anything
			"lr" => "lang_ja", // same here
			//"ls" => "", // ??
			"f" => ($get["nsfw"] == "yes" ? "off" : "on")
		];
	}
	
	try{
		
		$html =
			$this->get(
				$proxy,
				$url,
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch search page");
	}
	
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	
	// check for errors and load the result div
	if($this->error_and_load($html)){
		
		return $out;
	}
	
	$items =
		$this->fuckhtml
		->getElementsByClassName(
			"g0",
			"li"
		);
	
	foreach($items as $item){
		
		$this->fuckhtml->load($item);
		
		$title_tag =
			$this->fuckhtml
			->getElementsByClassName(
				"r",
				"h3"
			);
		
		if(count($title_tag) === 0){
			
			continue;
		}
		
		// the result link lives inside the title heading
		$this->fuckhtml->load($title_tag[0]);
		
		$anchors =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		
		if(count($anchors) === 0){
			
			// a heading without a link cannot be turned into a result
			continue;
		}
		
		$link =
			$this->fuckhtml
			->getTextContent(
				$anchors[0]
				["attributes"]
				["href"]
			);
		
		// back to the whole item for the thumbnail and the snippet
		$this->fuckhtml->load($item);
		
		$thumb = [
			"ratio" => null,
			"url" => null
		];
		
		$webshot =
			$this->fuckhtml
			->getElementsByClassName(
				"webshot",
				"img"
			);
		
		if(count($webshot) !== 0){
			
			$uri =
				$this->fuckhtml
				->getTextContent(
					$webshot[0]
					["attributes"]
					["src"]
				);
			
			// sources containing "now_printing" are treated as no thumbnail
			if(stripos($uri, "now_printing") === false){
				
				$thumb = [
					"ratio" => "1:1",
					"url" => "https://solofield.net" . $uri
				];
			}
		}
		
		$snippet =
			$this->fuckhtml
			->getElementsByClassName(
				"s",
				"div"
			);
		
		$out["web"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$title_tag[0]
				),
			"description" =>
				count($snippet) === 0 ?
				null :
				$this->fuckhtml
				->getTextContent(
					$snippet[0]
				),
			"url" => $link,
			"date" => null,
			"type" => "web",
			"thumb" => $thumb,
			"sublink" => [],
			"table" => []
		];
	}
	
	// get next page
	$this->get_npt($html, $proxy, $out, "web");
	
	return $out;
}
/**
 * Scrape a solofield.net video search result page.
 *
 * A first request builds the query from $get. A continuation request
 * ($get["npt"] set) replays the query string and proxy stored with the
 * next-page token.
 *
 * @param array $get Request parameters: "s" (search terms), "nsfw"
 *                   ("yes" or "no") and "npt" (token or a falsy value).
 * @return array Result set; only "video" and "npt" are filled in here.
 * @throws Exception When the page cannot be fetched, or when
 *                   error_and_load() rejects the returned document.
 */
public function video($get){
	
	if($get["npt"]){
		
		[$query, $proxy] = $this->backend->get($get["npt"], "videos");
		
		$url = "https://solofield.net/vsearch?" . $query;
		$params = [];
	}else{
		
		$proxy = $this->backend->get_ip();
		
		$url = "https://solofield.net/vsearch";
		$params = [
			"q" => $get["s"],
			"ie" => "UTF-8",
			"oe" => "UTF-8",
			"hl" => "ja", // changing this doesnt do anything
			//"lr" => "lang_ja", // same here
			"ls" => "", // ??
			"f" => ($get["nsfw"] == "yes" ? "off" : "on")
		];
	}
	
	try{
		
		$html =
			$this->get(
				$proxy,
				$url,
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch search page");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	
	// check for errors and load the result div
	if($this->error_and_load($html)){
		
		return $out;
	}
	
	$items =
		$this->fuckhtml
		->getElementsByTagName(
			"li"
		);
	
	foreach($items as $item){
		
		$this->fuckhtml->load($item);
		
		$as =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		
		if(count($as) === 0){
			
			continue;
		}
		
		$thumb = [
			"ratio" => null,
			"url" => null
		];
		
		$imgs =
			$this->fuckhtml
			->getElementsByTagName(
				"img"
			);
		
		if(count($imgs) !== 0){
			
			// decode the attribute like image() and web() do
			$thumb = [
				"ratio" => "16:9",
				"url" =>
					"https://solofield.net/" .
					$this->fuckhtml
					->getTextContent(
						$imgs[0]
						["attributes"]
						["src"]
					)
			];
		}
		
		$date = null;
		
		$date_span =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"style",
				"font-size: 10px;",
				"span"
			);
		
		if(count($date_span) !== 0){
			
			$date =
				$this->unfuckdate(
					$this->fuckhtml
					->getTextContent(
						$date_span[0]
					)
				);
		}
		
		$duration = null;
		
		$center_td =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"align",
				"center",
				"td"
			);
		
		if(count($center_td) === 2){
			
			// extract the cell text first, then convert it to seconds;
			// hms2int() splits a string and cannot take the element itself
			$duration =
				$this->hms2int(
					$this->fuckhtml
					->getTextContent(
						$center_td[0]
					)
				);
		}
		
		$out["video"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					// the title is read from the second link; fall back to
					// the only link when the item has just one
					$as[1] ?? $as[0]
				),
			"description" => null,
			"author" => [
				"name" => null,
				"url" => null,
				"avatar" => null
			],
			"date" => $date,
			"duration" => $duration,
			"views" => null,
			"thumb" => $thumb,
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$as[0]
					["attributes"]
					["href"]
				)
		];
	}
	
	// get next page
	$this->get_npt($html, $proxy, $out, "videos");
	
	return $out;
}
/**
 * Perform one HTTP request against Spotify through the assigned proxy.
 *
 * The request type selects the header set and how $get is transmitted:
 * req_web and req_api append $get to the URL as a query string, while
 * req_clientid sends $get as a JSON POST body.
 *
 * @param mixed       $proxy   Proxy handed to backend->assign_proxy().
 * @param string      $url     Target URL without query string.
 * @param array       $get     Query parameters, or the JSON payload for
 *                             req_clientid.
 * @param int         $reqtype One of self::req_web, self::req_api,
 *                             self::req_clientid.
 * @param string|null $bearer  Access token, only sent for req_api.
 * @param string|null $token   Client token, only sent for req_api.
 * @return string|bool Whatever curl_exec() returned.
 * @throws Exception On an unknown request type or a cURL error.
 */
private function get($proxy, $url, $get = [], $reqtype = self::req_web, $bearer = null, $token = null){
	
	// JSON body for POST requests, null for plain GET requests
	$post = null;
	
	switch($reqtype){
		
		case self::req_api:
			$headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: application/json",
				"Accept-Language: en",
				"app-platform: WebPlayer",
				"authorization: Bearer {$bearer}",
				"client-token: {$token}",
				"content-type: application/json;charset=UTF-8",
				"Origin: https://open.spotify.com",
				"Referer: https://open.spotify.com/",
				"DNT: 1",
				"Connection: keep-alive",
				"Sec-Fetch-Dest: empty",
				"Sec-Fetch-Mode: cors",
				"Sec-Fetch-Site: same-site",
				"spotify-app-version: 1.2.27.93.g7aee53d4",
				"TE: trailers"
			];
			break;
		
		case self::req_web:
			$headers = [
				"User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Sec-GPC: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: cross-site"
			];
			break;
		
		case self::req_clientid:
			$post = json_encode($get);
			
			$headers = [
				// same "Name: value" form as the other request types
				"User-Agent: " . config::USER_AGENT,
				"Accept: application/json",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip, deflate, br",
				"Referer: https://open.spotify.com/",
				"content-type: application/json",
				"Content-Length: " . strlen($post),
				"Origin: https://open.spotify.com",
				"DNT: 1",
				"Sec-GPC: 1",
				"Connection: keep-alive",
				"Sec-Fetch-Dest: empty",
				"Sec-Fetch-Mode: cors",
				"Sec-Fetch-Site: same-site",
				"TE: trailers"
			];
			break;
		
		default:
			// without this $headers would be undefined further down
			throw new Exception("Unknown request type");
	}
	
	if(
		$post === null &&
		$get !== []
	){
		
		$url .= "?" . http_build_query($get);
	}
	
	$curlproc = curl_init();
	
	if($post !== null){
		
		curl_setopt($curlproc, CURLOPT_POST, true);
		curl_setopt($curlproc, CURLOPT_POSTFIELDS, $post);
	}
	
	curl_setopt($curlproc, CURLOPT_URL, $url);
	
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
	
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		// read the message before releasing the handle
		$error = curl_error($curlproc);
		curl_close($curlproc);
		
		throw new Exception($error);
	}
	
	curl_close($curlproc);
	return $data;
}
https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchTracks&variables={"searchTerm":"asmr","offset":100,"limit":100,"numberOfTopResults":20,"includeAudiobooks":false}&extensions={"persistedQuery":{"version":1,"sha256Hash":"16c02d6304f5f721fc2eb39dacf2361a4543815112506a9c05c9e0bc9733a679"}} + */ + + /* + artists + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchArtists&variables={"searchTerm":"asmr","offset":0,"limit":30,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"b8840daafdda9a9ceadb7c5774731f63f9eca100445d2d94665f2dc58b45e2b9"}} + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchArtists&variables={"searchTerm":"asmr","offset":30,"limit":23,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"b8840daafdda9a9ceadb7c5774731f63f9eca100445d2d94665f2dc58b45e2b9"}} + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchArtists&variables={"searchTerm":"asmr","offset":53,"limit":30,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"b8840daafdda9a9ceadb7c5774731f63f9eca100445d2d94665f2dc58b45e2b9"}} + */ + + /* + playlists + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchPlaylists&variables={"searchTerm":"asmr","offset":0,"limit":30,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"19b4143a0500ccec189ca0f4a0316bc2c615ecb51ce993ba4d7d08afd1d87aa4"}} + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchPlaylists&variables={"searchTerm":"asmr","offset":30,"limit":3,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"19b4143a0500ccec189ca0f4a0316bc2c615ecb51ce993ba4d7d08afd1d87aa4"}} + */ + + /* + albums + 
https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchAlbums&variables={"searchTerm":"asmr","offset":33,"limit":30,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"e93b13cda461482da2940467eb2beed9368e9bb2fff37df3fb6633fc61271a27"}} + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchAlbums&variables={"searchTerm":"asmr","offset":33,"limit":30,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"e93b13cda461482da2940467eb2beed9368e9bb2fff37df3fb6633fc61271a27"}} + */ + + /* + podcasts & shows (contains authors, no pagination) + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchFullEpisodes&variables={"searchTerm":"asmr","offset":0,"limit":30}&extensions={"persistedQuery":{"version":1,"sha256Hash":"9f996251c9781fabce63f1a9980b5287ea33bc5e8c8953d0c4689b09936067a1"}} + */ + + /* + episodes + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchDesktop&variables={"searchTerm":"asmr","offset":0,"limit":10,"numberOfTopResults":5,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"da03293d92a2cfc5e24597dcdc652c0ad135e1c64a78fddbf1478a7e096bea44"}} + ??? 
https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchFullEpisodes&variables={"searchTerm":"asmr","offset":60,"limit":30}&extensions={"persistedQuery":{"version":1,"sha256Hash":"9f996251c9781fabce63f1a9980b5287ea33bc5e8c8953d0c4689b09936067a1"}} + */ + + /* + profiles + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchUsers&variables={"searchTerm":"asmr","offset":0,"limit":30,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"02026f48ab5001894e598904079b620ebc64f2d53b55ca20c3858abd3a46c5fb"}} + https://api-partner.spotify.com/pathfinder/v1/query?operationName=searchUsers&variables={"searchTerm":"asmr","offset":30,"limit":30,"numberOfTopResults":20,"includeAudiobooks":true}&extensions={"persistedQuery":{"version":1,"sha256Hash":"02026f48ab5001894e598904079b620ebc64f2d53b55ca20c3858abd3a46c5fb"}} + */ + + // get HTML + try{ + + $html = + $this->get( + $ip, + "https://open.spotify.com/search/" . + rawurlencode($search) . + ($category != "any" ? "/" . $category : ""), + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to get initial search page"); + } + + // grep bearer and client ID + $this->fuckhtml->load($html); + + $script = + $this->fuckhtml + ->getElementById( + "session", + "script" + ); + + if($script === null){ + + throw new Exception("Failed to grep bearer token"); + } + + $script = + json_decode( + $script["innerHTML"], + true + ); + + $bearer = $script["accessToken"]; + $client_id = $script["clientId"]; + + // hit client ID endpoint + try{ + + $token = + json_decode( + $this->get( + $ip, + "https://clienttoken.spotify.com/v1/clienttoken", + [ // !! 
that shit must be sent as json data + "client_data" => [ + "client_id" => $client_id, + "client_version" => "1.2.27.93.g7aee53d4", + "js_sdk_data" => [ + "device_brand" => "unknown", + "device_id" => "4c7ca20117ca12288ea8fc7118a9118c", + "device_model" => "unknown", + "device_name" => "computer", + "os" => "windows", + "os_version" => "NT 10.0" + ] + ] + ], + self::req_clientid + ), + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch token"); + } + + if($token === null){ + + throw new Exception("Failed to decode token"); + } + + $token = $token["granted_token"]["token"]; + + try{ + + switch($get["option"]){ + + case "any": + $variables = [ + "searchTerm" => $search, + "offset" => 0, + "limit" => 10, + "numberOfTopResults" => 5, + "includeAudiobooks" => true + ]; + break; + + case "audiobooks": + + break; + } + + $payload = + $this->get( + $ip, + "https://api-partner.spotify.com/pathfinder/v1/query", + [ + "operationName" => "searchDesktop", + "variables" => + json_encode( + [ + "searchTerm" => $search, + "offset" => 0, + "limit" => 10, + "numberOfTopResults" => 5, + "includeAudiobooks" => true + ] + ), + "extensions" => + json_encode( + [ + "persistedQuery" => [ + "version" => 1, + "sha256Hash" => "21969b655b795601fb2d2204a4243188e75fdc6d3520e7b9cd3f4db2aff9591e" // ? 
+ ] + ] + ) + ], + self::req_api, + $bearer, + $token + ); + + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON results"); + } + + if($payload == "Token expired"){ + + throw new Exception("Grepped spotify token has expired"); + } + + $payload = json_decode($payload, true); + + if($payload === null){ + + throw new Exception("Failed to decode JSON results"); + } + + //$payload = json_decode(file_get_contents("scraper/spotify.json"), true); + + $out = [ + "status" => "ok", + "npt" => null, + "song" => [], + "playlist" => [], + "album" => [], + "podcast" => [], + "author" => [], + "user" => [] + ]; + + // get songs + foreach($payload["data"]["searchV2"]["tracksV2"]["items"] as $result){ + + if(isset($result["item"])){ + + $result = $result["item"]; + } + + if(isset($result["data"])){ + + $result = $result["data"]; + } + + [$artist, $artist_link] = $this->get_artists($result["artists"]); + + $out["song"][] = [ + "title" => $result["name"], + "description" => null, + "url" => "https://open.spotify.com/track/" . $result["id"], + "views" => null, + "author" => [ + "name" => $artist, + "url" => $artist_link, + "avatar" => null + ], + "thumb" => $this->get_thumb($result["albumOfTrack"]["coverArt"]), + "date" => null, + "duration" => $result["duration"]["totalMilliseconds"] / 1000, + "stream" => [ + "endpoint" => "spotify", + "url" => "track." . $result["id"] + ] + ]; + } + + // get playlists + foreach($payload["data"]["searchV2"]["playlists"]["items"] as $playlist){ + + if(isset($playlist["data"])){ + + $playlist = $playlist["data"]; + } + + $avatar = $this->get_thumb($playlist["ownerV2"]["data"]["avatar"]); + + $out["playlist"][] = [ + "title" => $playlist["name"], + "description" => null, + "author" => [ + "name" => $playlist["ownerV2"]["data"]["name"], + "url" => + "https://open.spotify.com/user/" . 
+ explode( + ":", + $playlist["ownerV2"]["data"]["uri"], + 3 + )[2], + "avatar" => $avatar["url"] + ], + "thumb" => $this->get_thumb($playlist["images"]["items"][0]), + "date" => null, + "duration" => null, + "url" => + "https://open.spotify.com/playlist/" . + explode( + ":", + $playlist["uri"], + 3 + )[2] + ]; + } + + // get albums + foreach($payload["data"]["searchV2"]["albums"]["items"] as $album){ + + if(isset($album["data"])){ + + $album = $album["data"]; + } + + [$artist, $artist_link] = $this->get_artists($album["artists"]); + + $out["album"][] = [ + "title" => $album["name"], + "description" => null, + "author" => [ + "name" => $artist, + "url" => $artist_link, + "avatar" => null + ], + "thumb" => $this->get_thumb($album["coverArt"]), + "date" => mktime(0, 0, 0, 0, 32, $album["date"]["year"]), + "duration" => null, + "url" => + "https://open.spotify.com/album/" . + explode( + ":", + $album["uri"], + 3 + )[2] + ]; + } + + // get podcasts + foreach($payload["data"]["searchV2"]["podcasts"]["items"] as $podcast){ + + if(isset($podcast["data"])){ + + $podcast = $podcast["data"]; + } + + $description = []; + foreach($podcast["topics"]["items"] as $subject){ + + $description[] = $subject["title"]; + } + + $description = implode(", ", $description); + + if($description == ""){ + + $description = null; + } + + $out["podcast"][] = [ + "title" => $podcast["name"], + "description" => $description, + "author" => [ + "name" => $podcast["publisher"]["name"], + "url" => null, + "avatar" => null + ], + "thumb" => $this->get_thumb($podcast["coverArt"]), + "date" => null, + "duration" => null, + "url" => + "https://open.spotify.com/show/" . 
+ explode( + ":", + $podcast["uri"], + 3 + )[2], + "stream" => [ + "endpoint" => null, + "url" => null + ] + ]; + } + + // get audio books (put in podcasts) + foreach($payload["data"]["searchV2"]["audiobooks"]["items"] as $podcast){ + + if(isset($podcast["data"])){ + + $podcast = $podcast["data"]; + } + + $description = []; + foreach($podcast["topics"]["items"] as $subject){ + + $description[] = $subject["title"]; + } + + $description = implode(", ", $description); + + if($description == ""){ + + $description = null; + } + + $authors = []; + foreach($podcast["authors"] as $author){ + + $authors[] = $author["name"]; + } + + $authors = implode(", ", $authors); + + if($authors == ""){ + + $authors = null; + } + + $uri = + explode( + ":", + $podcast["uri"], + 3 + )[2]; + + $out["podcast"][] = [ + "title" => $podcast["name"], + "description" => $description, + "author" => [ + "name" => $authors, + "url" => null, + "avatar" => null + ], + "thumb" => $this->get_thumb($podcast["coverArt"]), + "date" => strtotime($podcast["publishDate"]["isoString"]), + "duration" => null, + "url" => "https://open.spotify.com/show/" . $uri, + "stream" => [ + "endpoint" => "spotify", + "url" => "episode." . $uri + ] + ]; + } + + // get episodes (and place them in podcasts) + foreach($payload["data"]["searchV2"]["episodes"]["items"] as $podcast){ + + if(isset($podcast["data"])){ + + $podcast = $podcast["data"]; + } + + $out["podcast"][] = [ + "title" => $podcast["name"], + "description" => $this->limitstrlen($podcast["description"]), + "author" => [ + "name" => + isset( + $podcast["podcastV2"]["data"]["publisher"]["name"] + ) ? + $podcast["podcastV2"]["data"]["publisher"]["name"] + : null, + "url" => null, + "avatar" => null + ], + "thumb" => $this->get_thumb($podcast["coverArt"]), + "date" => strtotime($podcast["releaseDate"]["isoString"]), + "duration" => $podcast["duration"]["totalMilliseconds"] / 1000, + "url" => + "https://open.spotify.com/show/" . 
+ explode( + ":", + $podcast["uri"], + 3 + )[2], + "stream" => [ + "endpoint" => null, + "url" => null + ] + ]; + } + + // get authors + foreach($payload["data"]["searchV2"]["artists"]["items"] as $user){ + + if(isset($user["data"])){ + + $user = $user["data"]; + } + + $avatar = $this->get_thumb($user["visuals"]["avatarImage"]); + + $out["author"][] = [ + "title" => + ( + $user["profile"]["verified"] === true ? + "✓ " : "" + ) . + $user["profile"]["name"], + "followers" => null, + "description" => null, + "thumb" => $avatar, + "url" => + "https://open.spotify.com/artist/" . + explode( + ":", + $user["uri"], + 3 + )[2] + ]; + } + + // get users + foreach($payload["data"]["searchV2"]["users"]["items"] as $user){ + + if(isset($user["data"])){ + + $user = $user["data"]; + } + + $avatar = $this->get_thumb($user["avatar"]); + + $out["user"][] = [ + "title" => $user["displayName"] . " (@{$user["id"]})", + "followers" => null, + "description" => null, + "thumb" => $avatar, + "url" => "https://open.spotify.com/user/" . $user["id"] + ]; + } + + return $out; + } + + private function get_artists($artists){ + + $artist_out = []; + + foreach($artists["items"] as $artist){ + + $artist_out[] = $artist["profile"]["name"]; + } + + $artist_out = + implode(", ", $artist_out); + + if($artist_out == ""){ + + return [null, null]; + } + + $artist_link = + $artist === null ? + null : + "https://open.spotify.com/artist/" . 
+ explode( + ":", + $artists["items"][0]["uri"] + )[2]; + + return [$artist_out, $artist_link]; + } + + private function get_thumb($cover){ + + $thumb_out = null; + + if($cover !== null){ + foreach($cover["sources"] as $thumb){ + + if( + $thumb_out === null || + (int)$thumb["width"] > $thumb_out["width"] + ){ + + $thumb_out = $thumb; + } + } + } + + if($thumb_out === null){ + + return [ + "url" => null, + "ratio" => null + ]; + }else{ + + return [ + "url" => $thumb_out["url"], + "ratio" => "1:1" + ]; + } + } + + private function limitstrlen($text){ + + return + explode( + "\n", + wordwrap( + str_replace( + ["\n\r", "\r\n", "\n", "\r"], + " ", + $text + ), + 300, + "\n" + ), + 2 + )[0]; + } +} diff --git a/scraper/startpage.php b/scraper/startpage.php new file mode 100644 index 0000000..e48a429 --- /dev/null +++ b/scraper/startpage.php @@ -0,0 +1,1584 @@ +<?php + +class startpage{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("startpage"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + switch($page){ + case "web": + return [ + "country" => [ + "display" => "Country", + "option" => [ + "any" => "All Regions", + "es_AR" => "Argentina", + "en_AU" => "Australia", + "de_AT" => "Austria", + "ru_BY" => "Belarus", + "fr_BE" => "Belgium (FR)", + "nl_BE" => "Belgium (NL)", + "bg_BG" => "Bulgaria", + "en_CA" => "Canada (EN)", + "fr_CA" => "Canada (FR)", + "es_CL" => "Chile", + "es_CO" => "Colombia", + "cs_CZ" => "Czech Republic", + "da_DK" => "Denmark", + "ar_EG" => "Egypt", + "et_EE" => "Estonia", + "fi_FI" => "Finland", + "fr_FR" => "France", + "de_DE" => "Germany", + "el_GR" => "Greece", + "hu_HU" => "Hungary", + "hi_IN" => "India (HI)", + "en_IN" => "India (EN)", + "id_ID" => "Indonesia (ID)", + "en_ID" => "Indonesia (EN)", + "en_IE" => "Ireland", + "it_IT" => "Italy", + "ja_JP" => "Japan", + "ko_KR" => "Korea", + "ms_MY" => "Malaysia (MS)", + "en_MY" => 
"Malaysia (EN)", + "es_MX" => "Mexico", + "nl_NL" => "Netherlands", + "en_NZ" => "New Zealand", + "no_NO" => "Norway", + "es_PE" => "Peru", + "fil_PH" => "Philippines (FIL)", + "en_PH" => "Philippines (EN)", + "pl_PL" => "Poland", + "pt_PT" => "Portugal", + "ro_RO" => "Romania", + "ru_RU" => "Russia", + "ms_SG" => "Singapore (MS)", + "en_SG" => "Singapore (EN)", + "es_ES" => "Spain (ES)", + "ca_ES" => "Spain (CA)", + "sv_SE" => "Sweden", + "de_CH" => "Switzerland (DE)", + "fr_CH" => "Switzerland (FR)", + "it_CH" => "Switzerland (IT)", + "tr_TR" => "Turkey", + "uk_UA" => "Ukraine", + "en_US" => "US (EN)", + "es_US" => "US (ES)", + "es_UY" => "Uruguay", + "es_VE" => "Venezuela", + "vi_VN" => "Vietnam (VI)", + "en_VN" => "Vietnam (EN)", + "en_ZA" => "South Africa" + ] + ], + "nsfw" => [ // qadf + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // qadf=none + "no" => "No" // qadf=heavy + ] + ], + "time" => [ // with_date + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year", + ] + ], + "extendedsearch" => [ + // undefined display, so it wont show in frontend + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ]; + break; + + case "images": + return [ + "nsfw" => [ // qadf + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // qadf=none + "no" => "No" // qadf=heavy + ] + ], + "size" => [ // flimgsize + "display" => "Size", + "option" => [ + "any" => "Any size", + "Small" => "Small", + "Medium" => "Medium", + "Large" => "Large", + "Wallpaper" => "Wallpaper", + // from here, image-size-select, var prefix = isz:lt,islt: + // each key is listed once: a repeated literal key silently overwrites the earlier entry + // NOTE(review): "qsvgs" is probably a typo of the "qsvga" size token - confirm against startpage before renaming + "qsvgs" => "Larger than 400x300", + "vga" => "Larger than 640x480", + "svga" => "Larger than 800x600", + "xga" => "Larger than 1024x768", + "2mp" => "Larger than 2 MP (1600x1200)", + "4mp" => "Larger than 4 MP (2272x1704)", + "6mp" => "Larger than 6 MP (2816x2112)", + "8mp" => "Larger than 8 MP 
(3264x2448)", + "10mp" => "Larger than 10 MP (3648x2736)", + "12mp" => "Larger than 12 MP (4096x3072)", + "15mp" => "Larger than 15 MP (4480x3360)", + "20mp" => "Larger than 20 MP (5120x3840)", + "40mp" => "Larger than 40 MP (7216x5412)", + "70mp" => "Larger than 70 MP (9600x7200)" + ] + ], + "color" => [ // flimgcolor + "display" => "Color", + "option" => [ + "any" => "Any color", + // from here, var prefix = ic: + "color" => "Color only", + "bnw" => "Black & white", // set to "gray" + // from here, var prefix = ic:specific,isc: + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "green" => "Green", + "teal" => "Teal", + "blue" => "Blue", + "purple" => "Purple", + "pink" => "Pink", + "white" => "White", + "gray" => "Gray", + "black" => "Black", + "brown" => "Brown" + ] + ], + "type" => [ // flimgtype + "display" => "Type", + "option" => [ + "any" => "Any type", + "AnimatedGif" => "Animated GIF", + "Clipart" => "Clip Art", + "Line" => "Line Drawing", + "Photo" => "Photograph", + "Transparent" => "Transparent Background" + ] + ], + "license" => [ // flimglicense + "display" => "License", + "option" => [ + "any" => "Any license", + "p" => "Public domain", + "s" => "Free to share", + "sc" => "Free to share commercially", + "m" => "Free to modify", + "mc" => "Free to modify commercially" + ] + ] + ]; + break; + + case "videos": + return [ + "nsfw" => [ // qadf + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // qadf=none + "no" => "No" // qadf=heavy + ] + ], + "sort" => [ + "display" => "Sort by", + "option" => [ + "relevance" => "Most relevant", + "popular" => "Most popular", + "recent" => "Most recent" + ] + ], + "duration" => [ // with_duration + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "short" => "Short", + "medium" => "Medium", + "long" => "Long" + ] + ] + ]; + break; + + case "news": + return [ + "nsfw" => [ // qadf + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // qadf=none + "no" => "No" // 
qadf=heavy + ] + ], + "time" => [ // with_date + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month" + ] + ] + ]; + break; + + //preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEazerbaijaniN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:21:58 GMT; Secure; Path=/ + //preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:22:52 GMT; Secure; Path=/ + } + } + + private function get($proxy, $url, $get = [], $post = false, $is_xhr = false){ + + $curlproc = curl_init(); + + if($post === true){ + + curl_setopt($curlproc, CURLOPT_POST, true); + curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get); + + }elseif($get !== []){ + + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + // http2 bypass + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($is_xhr === true){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: application/json", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://www.startpage.com/", + "Content-Type: application/json", + "Content-Length: " . 
strlen($get), + "Origin: https://www.startpage.com/", + "DNT: 1", + "Connection: keep-alive", + "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin", + "TE: trailers"] + ); + + }elseif($post === true){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://www.startpage.com/", + "Content-Type: application/x-www-form-urlencoded", + "Content-Length: " . strlen($get), + "DNT: 1", + "Connection: keep-alive", + "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . 
config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + [$post, $proxy] = $this->backend->get($get["npt"], "web"); + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $post, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + $get_instant_answer = false; + + }else{ + + $proxy = $this->backend->get_ip(); + + $params = [ + "query" => $get["s"], + "cat" => "web", + "pl" => "opensearch" + ]; + + if($get["nsfw"] == "no"){ + + $params["qadf"] = "heavy"; + $get_instant_answer = false; + }else{ + + $get_instant_answer = true; + } + + if($get["country"] !== "any"){ + + $params["qsr"] = $get["country"]; + } + + if($get["time"] !== "any"){ + + $params["with_date"] = 
$get["time"]; + } + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + //$html = file_get_contents("scraper/startpage.html"); + } + + $this->detect_captcha($html); + + if( + preg_match( + '/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),?$/m', + $html, + $matches + ) === 0 + ){ + + throw new Exception("Failed to grep JSON object"); + } + + $json = json_decode($matches[1], true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + //print_r($json); + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + // get npt + $out["npt"] = $this->parse_npt($json, "web", $proxy); + + foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){ + + if(!isset($category["display_type"])){ + + continue; + } + + switch($category["display_type"]){ + + case "web-google": + foreach($category["results"] as $result){ + + $sublinks = []; + + foreach($result["siteLinks"] as $sublink){ + + $sublinks[] = [ + "title" => $sublink["title"], + "description" => null, + "url" => $sublink["clickUrl"] + ]; + } + + $description = + explode( + "...", + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $result["description"] + ) + ) + ), + 2 + ); + + $date = strtotime(trim($description[0])); + + if( + $date === false || + count($description) !== 2 || + strlen($description[0]) > 14 + ){ + + // no date found + $description = + implode( + " ... 
", + $description + ); + + $date = null; + }else{ + + // date found + $description = ltrim($description[1]); + } + + $out["web"][] = [ + "title" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $result["title"] + ) + ) + ), + "description" => $description, + "url" => $result["clickUrl"], + "date" => $date, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublinks, + "table" => [] + ]; + } + break; + + case "images-qi-top": + foreach($category["results"] as $result){ + + $out["image"][] = [ + "title" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $result["title"] + ) + ) + ), + "source" => [ + [ + "url" => $result["rawImageUrl"], + "width" => (int)$result["width"], + "height" => (int)$result["height"] + ], + [ + "url" => $this->unshitimage($result["mdThumbnailUrl"]), + "width" => (int)$result["mdThumbnailWidth"], + "height" => (int)$result["mdThumbnailHeight"] + ] + ], + "url" => + $result["altClickUrl"] + ]; + } + break; + + case "spellsuggest-google": + $out["spelling"] = + [ + "type" => "including", + "using" => $json["render"]["query"], + "correction" => $category["results"][0]["query"] + ]; + break; + + case "dictionary-qi": + foreach($category["results"] as $result){ + + $answer = [ + "title" => $result["word"], + "description" => [], + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + foreach($result["lexical_categories"] as $lexic_type => $definitions){ + + $answer["description"][] = [ + "type" => "title", + "value" => $lexic_type + ]; + + $i = 0; + + foreach($definitions as $definition){ + + $text_definition = trim($definition["definition"]); + $text_example = trim($definition["example"]); + $text_synonyms = implode(", ", $definition["synonyms"]); + + if($text_definition != ""){ + + $i++; + + $c = count($answer["description"]) - 1; + if( + $c !== 0 && + $answer["description"][$c]["type"] == "text" + ){ + + 
$answer["description"][$c]["value"] .= + "\n\n" . $i . ". " . $text_definition; + + }else{ + + $answer["description"][] = [ + "type" => "text", + "value" => $i . ". " . $text_definition + ]; + } + } + + if($text_example != ""){ + + $answer["description"][] = [ + "type" => "quote", + "value" => $text_example + ]; + } + + if($text_synonyms != ""){ + + $answer["description"][] = [ + "type" => "text", + "value" => "Synonyms: " . $text_synonyms + ]; + } + } + } + + $out["answer"][] = $answer; + } + break; + } + } + + // parse instant answers + if( + $get["extendedsearch"] == "yes" && + $get_instant_answer === true + ){ + + // https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1 + try{ + $post = [ + "se" => "n0vze2y9dqwy", + "q" => $json["render"]["query"], + "results" => [], // populate + "enableKnowledgePanel" => true, + "enableMediaThumbBar" => false, + "enableSearchSuggestions" => false, + "enableTripadvisorProperties" => [], + "enableTripadvisorPlaces" => [], + "enableTripadvisorPlacesForLocations" => [], + "enableWebProducts" => false, + "tripadvisorPartnerId" => null, + "tripadvisorMapColorMode" => "light", + "tripadvisorDisablesKnowledgePanel" => false, + "instantAnswers" => [ + "smartAnswers", + "youtube", + "tripadvisor" + ], + "iaType" => null, + "forceEnhancedKnowledgePanel" => false, + "shoppingOnly" => false, + "allowAdultProducts" => true, + "lang" => "en", + "browserLang" => "en-US", + "browserTimezone" => "America/New_York", + "market" => null, + "userLocation" => null, + "userDate" => date("Y-m-d"), + "userAgentType" => "unknown" + ]; + + foreach($out["web"] as $result){ + + $post["results"][] = [ + "url" => $result["url"], + "title" => $result["title"] + ]; + } + + $post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE); + + $additional_data = + $this->get( + $proxy, + "https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . 
"&sr=1", + $post, + true, + true + ); + + $additional_data = json_decode($additional_data, true); + + if($additional_data === null){ + + throw new Exception("Failed to decode JSON"); // just break out, dont fail completely + } + + if(!isset($additional_data["knowledgePanel"])){ + + throw new Exception("Response has missing data (knowledgePanel)"); + } + + $additional_data = $additional_data["knowledgePanel"]; + + $answer = [ + "title" => $additional_data["meta"]["title"], + "description" => [ + [ + "type" => "quote", + "value" => $additional_data["meta"]["description"] + ] + ], + "url" => $additional_data["meta"]["origWikiUrl"], + "thumb" => $additional_data["meta"]["image"], + "table" => [], + "sublink" => [] + ]; + + // parse html for instant answer + $this->fuckhtml->load($additional_data["html"]); + + $div = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + // get description + $description = + $this->fuckhtml + ->getElementsByClassName( + "sx-kp-short-extract sx-kp-short-extract-complete", + $div + ); + + if(count($description) !== 0){ + + $answer["description"][] = [ + "type" => "text", + "value" => + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $description[0] + ) + ) + ]; + } + + // get socials + $socials = + $this->fuckhtml + ->getElementsByClassName( + "sx-wiki-social-link", + "a" + ); + + foreach($socials as $social){ + + $title = + $this->fuckhtml + ->getTextContent( + $social["attributes"]["title"] + ); + + $url = + $this->fuckhtml + ->getTextContent( + $social["attributes"]["href"] + ); + + switch($title){ + + case "Official Website": + $title = "Website"; + break; + } + + $answer["sublink"][$title] = $url; + } + + // get videos + $videos = + $this->fuckhtml + ->getElementsByClassName( + "sx-kp-video-grid-item", + $div + ); + + foreach($videos as $video){ + + $this->fuckhtml->load($video); + + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($as) === 0){ + + // ?? 
invalid + continue; + } + + $image = + $this->fuckhtml + ->getElementsByAttributeName( + "data-sx-src", + "img" + ); + + if(count($image) !== 0){ + + $thumb = [ + "ratio" => "16:9", + "url" => + $this->fuckhtml + ->getTextContent( + $image[0]["attributes"]["data-sx-src"] + ) + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["video"][] = [ + "title" => + $this->fuckhtml + ->getTextContent( + $as[0]["attributes"]["title"] + ), + "description" => null, + "date" => null, + "duration" => null, + "views" => null, + "thumb" => $thumb, + "url" => + $this->fuckhtml + ->getTextContent( + $as[0]["attributes"]["href"] + ) + ]; + } + + // reset + $this->fuckhtml->load($additional_data["html"]); + + // get table elements + $table = + $this->fuckhtml + ->getElementsByClassName( + "sx-infobox", + "table" + ); + + if(count($table) !== 0){ + + $trs = + $this->fuckhtml + ->getElementsByTagName( + "tr" + ); + + foreach($trs as $tr){ + + $this->fuckhtml->load($tr); + + // ok so startpage devs cant fucking code a table + // td = content + // th (AAAHH) = title + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + $ths = + $this->fuckhtml + ->getElementsByTagName( + "th" + ); + + if( + count($ths) === 1 && + count($tds) === 1 + ){ + + $title = + $this->fuckhtml + ->getTextContent( + $ths[0] + ); + + $description = []; + + $this->fuckhtml->load($tds[0]); + + $lis = + $this->fuckhtml + ->getElementsByTagName( + "li" + ); + + if(count($lis) !== 0){ + + foreach($lis as $li){ + + $description[] = + $this->fuckhtml + ->getTextContent( + $li + ); + } + + $description = implode(", ", $description); + }else{ + + $description = + $this->fuckhtml + ->getTextContent( + $tds[0] + ); + } + + $answer["table"][$title] = $description; + } + } + } + + $out["answer"][] = $answer; + + }catch(Exception $error){ + + // do nothing + //echo "error!"; + } + } + + return $out; + } + + public function image($get){ + + if($get["npt"]){ + + [$post, $proxy] = 
$this->backend->get($get["npt"], "images"); + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $post, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + try{ + + $proxy = $this->backend->get_ip(); + + $params = [ + "query" => $get["s"], + "cat" => "images", + "pl" => "opensearch" + ]; + + if($get["nsfw"] == "no"){ + + $params["qadf"] = "heavy"; + } + + if($get["size"] != "any"){ + + if( + $get["size"] == "Small" || + $get["size"] == "Medium" || + $get["size"] == "Large" || + $get["size"] == "Wallpaper" + ){ + + $params["flimgsize"] = $get["size"]; + }else{ + + $params["image-size-select"] = "isz:lt,islt:" . $get["size"]; + } + } + + if($get["color"] != "any"){ + + if($get["color"] == "color"){ + + $params["flimgcolor"] = "ic:color"; + }elseif($get["color"] == "bnw"){ + + $params["flimgcolor"] = "ic:gray"; + }else{ + + $params["flimgcolor"] = "ic:specific,isc:" . 
$get["color"]; + } + } + + if($get["type"] != "any"){ + + $params["flimgtype"] = $get["type"]; + } + + if($get["license"] != "any"){ + + $params["flimglicense"] = $get["license"]; + } + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/startpage.html"); + + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + } + + $this->detect_captcha($html); + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + if( + preg_match( + '/React\.createElement\(UIStartpage\.AppSerpImages, ?(.+)\),?$/m', + $html, + $matches + ) === 0 + ){ + + throw new Exception("Failed to grep JSON object"); + } + + $json = json_decode($matches[1], true); + + if($json === null){ + + throw new Exception("Failed to decode JSON object"); + } + + // get npt + $out["npt"] = $this->parse_npt($json, "images", $proxy); + + // get images + foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){ + + if($category["display_type"] != "images-bing"){ + + // ignore ads and !! suggestions !! 
@todo + continue; + } + + foreach($category["results"] as $image){ + + $out["image"][] = [ + "title" => $this->titledots($image["title"]), + "source" => [ + [ + "url" => $this->unshitimage($image["clickUrl"]), + "width" => (int)$image["width"], + "height" => (int)$image["height"] + ], + [ + "url" => $this->unshitimage($image["thumbnailUrl"]), + "width" => (int)$image["thumbnailWidth"], + "height" => (int)$image["thumbnailHeight"] + ] + ], + "url" => $image["altClickUrl"] + ]; + } + } + + return $out; + } + + public function video($get){ + + if($get["npt"]){ + + [$post, $proxy] = $this->backend->get($get["npt"], "videos"); + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $post, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + try{ + + $proxy = $this->backend->get_ip(); + + $params = [ + "query" => $get["s"], + "cat" => "video", + "pl" => "opensearch" + ]; + + if($get["nsfw"] == "no"){ + + $params["qadf"] = "heavy"; + } + + if($get["sort"] != "relevance"){ + + $params["sort_by"] = $get["sort"]; + } + + if($get["duration"] != "any"){ + + $params["with_duration"] = $get["duration"]; + } + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/startpage.html"); + + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + } + + $this->detect_captcha($html); + + if( + preg_match( + '/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),?$/m', + $html, + $matches + ) === 0 + ){ + + throw new Exception("Failed to get JSON object"); + } + + $json = json_decode($matches[1], true); + + if($json === null){ + + throw new Exception("Failed to decode JSON object"); + } + + 
$out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + // get npt + $out["npt"] = $this->parse_npt($json, "video", $proxy); + + // get results + foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){ + + if( + preg_match( + '/^video-/i', + $category["display_type"] + ) + ){ + + foreach($category["results"] as $video){ + + if( + isset($video["thumbnailUrl"]) && + $video["thumbnailUrl"] !== null + ){ + + $thumb = [ + "ratio" => "16:9", + "url" => $this->unshitimage($video["thumbnailUrl"]) + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["video"][] = [ + // drop the private-use markers U+E000/U+E001 from the title; written as + // escapes because the invisible literals do not survive copy/paste + // (an empty search string makes str_replace a no-op) + "title" => str_replace(["\u{E000}", "\u{E001}"], "", $video["title"]), + "description" => $this->limitstrlen($video["description"]), + "author" => [ + "name" => $video["channelTitle"], + "url" => null, + "avatar" => null + ], + "date" => strtotime($video["publishDate"]), + "duration" => $this->hms2int($category["display_type"] == "video-youtube" ? 
$video["duration"] : $video["duration"] / 1000), + "views" => (int)$video["viewCount"], + "thumb" => $thumb, + "url" => $video["clickUrl"] + ]; + } + } + } + + return $out; + } + + public function news($get){ + + if($get["npt"]){ + + [$post, $proxy] = $this->backend->get($get["npt"], "news"); + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $post, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + try{ + + $proxy = $this->backend->get_ip(); + + $params = [ + "query" => $get["s"], + "cat" => "news", + "pl" => "opensearch" + ]; + + if($get["nsfw"] == "no"){ + + $params["qadf"] = "heavy"; + } + + if($get["time"] != "any"){ + + $params["with_date"] = $get["time"]; + } + + try{ + $html = $this->get( + $proxy, + "https://www.startpage.com/sp/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/startpage.html"); + + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + } + + $this->detect_captcha($html); + + if( + preg_match( + '/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),?$/m', + $html, + $matches + ) === 0 + ){ + + throw new Exception("Failed to get JSON object"); + } + + $json = json_decode($matches[1], true); + + if($json === null){ + + throw new Exception("Failed to decode JSON object"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + // get npt + $out["npt"] = $this->parse_npt($json, "news", $proxy); + + foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){ + + if($category["display_type"] != "news-bing"){ + + // unsupported category + continue; + } + + foreach($category["results"] as $news){ + + if( + isset($news["thumbnailUrl"]) && + $news["thumbnailUrl"] !== null + ){ 
+ + $thumb = [ + "ratio" => "16:9", + "url" => $this->unshitimage($news["thumbnailUrl"]) + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["news"][] = [ + "title" => $this->titledots($this->remove_penguins($news["title"])), + "author" => $news["source"], + "description" => $this->titledots($this->remove_penguins($news["description"])), + "date" => (int)substr((string)$news["date"], 0, -3), + "thumb" => $thumb, + "url" => $news["clickUrl"] + ]; + } + } + + return $out; + } + + private function parse_npt($json, $pagetype, $proxy){ + + foreach($json["render"]["presenter"]["pagination"]["pages"] as $page){ + + if($page["name"] == "Next"){ + + parse_str( + explode( + "?", + $page["url"], + 2 + )[1], + $str + ); + + return + $this->backend->store( + http_build_query( + [ + "lui" => "english", + "language" => "english", + "query" => $str["q"], + "cat" => $pagetype, + "sc" => $str["sc"], + "t" => "device", + "segment" => "startpage.udog", + "page" => $str["page"] + ] + ), + $pagetype, + $proxy + ); + + break; + } + } + + return null; + } + + private function unshitimage($url){ + + // parse_url() yields null when the URL has no query string; cast so + // parse_str() always receives a string + $query = parse_url($url, PHP_URL_QUERY); + parse_str((string)$query, $query); + + if(isset($query["piurl"])){ + + if(strpos($query["piurl"], "gstatic.com/")){ + + return + explode( + "&", + $query["piurl"], + 2 + )[0]; + } + + if( + strpos($query["piurl"], "bing.net/") || + strpos($query["piurl"], "bing.com/") + ){ + + return + explode( + "&", + $query["piurl"], + 2 + )[0]; + } + + return $query["piurl"]; + } + + return $url; + } + + private function limitstrlen($text){ + + return + explode( + "\n", + wordwrap( + str_replace( + ["\n\r", "\r\n", "\n", "\r"], + " ", + $text + ), + 300, + "\n" + ), + 2 + )[0]; + } + + private function titledots($title){ + + // strip leading/trailing spaces, dots, control characters and "…" from $title. + // trim() compares single bytes, so listing the 3-byte "…" there also eats + // bytes of other UTF-8 characters (a leading “ or a trailing æ) and leaves + // broken text behind; the /u regex works on whole characters instead + $stripped = + preg_replace( + '/^[ .\t\n\r\x00\x0B…]+|[ .\t\n\r\x00\x0B…]+\z/u', + "", + $title + ); + + if($stripped === null){ + + // subject is not valid UTF-8: fall back to the byte-safe ASCII subset + return trim($title, " .\t\n\r\0\x0B"); + } + + return $stripped; + } + + private function hms2int($time){ + + $parts = explode(":", $time, 3); + $time = 0; + + if(count($parts) === 3){ + + // hours + $time = $time + ((int)$parts[0] * 3600); + 
array_shift($parts); + } + + if(count($parts) === 2){ + + // minutes + $time = $time + ((int)$parts[0] * 60); + array_shift($parts); + } + + // seconds + $time = $time + (int)$parts[0]; + + return $time; + } + + private function remove_penguins($text){ + + // removes the private-use characters U+E000 and U+E001 from $text and returns the result. + // they are written as escapes because the invisible literals are easily + // lost, and an empty search string turns str_replace into a no-op + return str_replace( + ["\u{E000}", "\u{E001}"], + "", + $text + ); + } + + private function detect_captcha($html){ + + $this->fuckhtml->load($html); + + $title = + $this->fuckhtml + ->getElementsByTagName( + "title" + ); + + if( + count($title) !== 0 && + $title[0]["innerHTML"] == "Redirecting..." + ){ + + // check if it's a captcha + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + foreach($as as $a){ + + if( + strpos( + $this->fuckhtml + ->getTextContent( + $a["innerHTML"] + ), + "https://www.startpage.com/sp/captcha" + ) !== false + ){ + + throw new Exception("Startpage returned a captcha"); + } + } + + throw new Exception("Startpage redirected the scraper to an unhandled page"); + } + } +} diff --git a/scraper/vimeo.php b/scraper/vimeo.php new file mode 100644 index 0000000..50bb21b --- /dev/null +++ b/scraper/vimeo.php @@ -0,0 +1,754 @@ +<?php + +class vimeo{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("vimeo"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return [ + "time" => [ + "display" => "Date uploaded", // &filter_uploaded= + "option" => [ + "any" => "Any time", + "today" => "Last 24 hours", + "this-week" => "Last 7 days", + "this-month" => "Last 30 days", + "this-year" => "Last 365 days", + ] + ], + "display" => [ + "display" => "Display", + "option" => [ + "video" => "Videos", + "ondemand" => "On-Demand ($$)", + "people" => "People", + "channel" => "Channels", + "group" => "Groups" + ] + ], + "sort" => [ + "display" => "Sort by", + "option" => [ + "relevance" => "Relevance", // no param + "recent" => "Newest", // &sort=latest&direction=desc + "popular" => "Most popular", // 
&sort=popularity&direction=desc + "a_z" => "Title, A to Z", // &sort=alphabetical&direction=asc + "z_a" => "Title, Z to A", // &sort=alphabetical&direction=desc + "longest" => "Longest", // &sort=duration&direction=desc + "shortest" => "Shortest", // &sort=duration&direction=asc + ] + ], + "duration" => [ + "display" => "Duration", // &filter_duration= + "option" => [ + "any" => "Any duration", + "short" => "Short (less than 4 minutes)", + "medium" => "Medium (4-10 minutes)", + "long" => "Long (over 10 minutes)" + ] + ], + "resolution" => [ + "display" => "Resolution", + "option" => [ + "any" => "Any resolution", + "4k" => "4K" // &filter_resolution=4k + ] + ], + "category" => [ + "display" => "Category", // &filter_category= + "option" => [ + "any" => "Any category", + "animation" => "Animation", + "comedy" => "Comedy", + "music" => "Music", + "experimental" => "Experimental", + "documentary" => "Documentary", + "identsandanimatedlogos" => "Idents and Animated Logos", + "industry" => "Industry", + "instructionals" => "Instructionals", + "narrative" => "Narrative", + "personal" => "Personal" + ] + ], + "live" => [ + "display" => "Live events", + "option" => [ + "any" => "Any", + "yes" => "Live now" // &filter_live=now + ] + ], + "hdr" => [ + "display" => "HDR", // &filter_hdr= + "option" => [ + "any" => "Any", + "hdr" => "Any HDR", + "dolby_vision" => "Dolby Vision", + "hdr10" => "HDR10", + "hdr10+" => "HDR10+" + ] + ], + "vimeo_360" => [ + "display" => "Vimeo 360°", // &filter_vimeo_360 + "option" => [ + "any" => "Any", + "spatial" => "Spatial", + "360" => "360°" + ] + ], + "price" => [ // &filter_price= + "display" => "Price", + "option" => [ + "any" => "Any price", + "free" => "Free", + "paid" => "Paid" + ] + ], + "collection" => [ + "display" => "Vimeo collections", + "option" => [ + "any" => "Any collection", + "staff_pick" => "Staff picks" // &filter_staffpicked=true + ] + ], + "license" => [ // &filter_license= + "display" => "License", + "option" => [ + 
"any" => "Any license", + "by-nc-nd" => "CC BY-NC-ND", + "by" => "CC BY", + "by-nc" => "CC BY-NC", + "by-nc-sa" => "CC BY-NC-SA", + "by-nd" => "CC BY-ND", + "by-sa" => "CC BY-SA", + "cc0" => "CC0" + ] + ] + ]; + } + + private function get($proxy, $url, $get = [], $jwt = false){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($jwt === false){ + + curl_setopt( + $curlproc, + CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + "Referer: https://vimeo.com/search", + "X-Requested-With: XMLHttpRequest", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin", + "Priority: u=4"] + ); + + }else{ + + curl_setopt( + $curlproc, + CURLOPT_HTTPHEADER, + ["User-Agent: " . 
config::USER_AGENT, + "Accept: application/vnd.vimeo.*+json;version=3.3", + "Accept-Language: en", + "Accept-Encoding: gzip, deflate, br, zstd", + "Referer: https://vimeo.com/", + "Content-Type: application/json", + "Authorization: jwt $jwt", + "Vimeo-Page: /search/[[...slug]]", + "Origin: https://vimeo.com", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-site", + "Priority: u=4"] + ); + } + + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function video($get){ + + // parse shit + if($get["npt"]){ + + [$npt, $proxy] = + $this->backend + ->get( + $get["npt"], + "videos" + ); + + $npt = json_decode($npt, true); + $pagetype = $npt["pagetype"]; + $npt = $npt["npt"]; + + $jwt = $this->get_jwt($proxy); + + try{ + + $json = + $this->get( + $proxy, + "https://api.vimeo.com" . 
$npt, + [], + $jwt + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + }else{ + + $proxy = null; + $jwt = $this->get_jwt($proxy); // this gives us a proxy by reference + + // parse filters + $npt = [ + "query" => $get["s"], + "page" => 1, + "per_page" => 24, + "facets" => "type" + ]; + + switch($get["display"]){ + + case "video": + $npt["filter_type"] = "clip"; + $npt["fields"] = "clip.name,stats.plays,clip.pictures,clip.user.name,clip.user.link,clip.user.pictures.sizes,clip.uri,clip.stats.plays,clip.duration,clip.created_time,clip.link,clip.description"; + break; + + case "ondemand": + $npt["filter_type"] = "ondemand"; + $npt["sizes"] = "296x744"; + $npt["fields"] = "ondemand.link,ondemand.name,ondemand.pictures.sizes,ondemand.metadata.interactions.buy,ondemand.metadata.interactions.rent,ondemand.uri"; + break; + + case "people": + $npt["filter_type"] = "people"; + $npt["fetch_user_profile"] = "1"; + $npt["fields"] = "people.name,people.location_details.formatted_address,people.metadata.public_videos.total,people.pictures.sizes,people.link,people.metadata.connections.followers.total,people.skills.name,people.skills.uri,people.background_video,people.uri"; + break; + + case "channel": + $npt["filter_type"] = "channel"; + $npt["fields"] = "channel.name,channel.metadata.connections.users.total,channel.metadata.connections.videos.total,channel.pictures.sizes,channel.link,channel.uri"; + break; + + case "group": + $npt["filter_type"] = "group"; + $npt["fields"] = "group.name,group.metadata.connections.users.total,group.metadata.connections.videos.total,group.pictures.sizes,group.link,group.uri"; + break; + } + + // only apply filters if we're searching for videos + if($get["display"] == "video"){ + + switch($get["sort"]){ + + case "relevance": break; // do nothing + + case "recent": + $npt["sort"] = "latest"; + $npt["direction"] = "desc"; + break; + + case "popular": + $npt["sort"] = "popularity"; + $npt["direction"] = "desc"; + 
break; + + case "a_z": + $npt["sort"] = "alphabetical"; + $npt["direction"] = "asc"; + break; + + case "z_a": + $npt["sort"] = "alphabetical"; + $npt["direction"] = "desc"; + break; + + case "longest": + $npt["sort"] = "duration"; + $npt["direction"] = "desc"; + break; + + case "shortest": + $npt["sort"] = "duration"; + $npt["direction"] = "asc"; + break; + } + + if($get["time"] != "any"){ + + $npt["filter_uploaded"] = $get["time"]; + } + + if($get["duration"] != "any"){ + + $npt["filter_duration"] = $get["duration"]; + } + + if($get["resolution"] != "any"){ + + $npt["filter_resolution"] = $get["resolution"]; + } + + if($get["category"] != "any"){ + + $npt["filter_category"] = $get["category"]; + } + + if($get["live"] != "any"){ + + $npt["filter_live"] = "now"; + } + + if($get["hdr"] != "any"){ + + $npt["filter_hdr"] = $get["hdr"]; + } + + if($get["vimeo_360"] != "any"){ + + $npt["filter_vimeo_360"] = $get["vimeo_360"]; + } + + if($get["price"] != "any"){ + + $npt["filter_price"] = $get["price"]; + } + + if($get["collection"] == "staff_pick"){ + + $npt["filter_staffpicked"] = "true"; + } + + if($get["license"] != "any"){ + + $npt["filter_license"] = $get["license"]; + } + } + + $pagetype = $npt["filter_type"]; + + try{ + + $json = + $this->get( + $proxy, + "https://api.vimeo.com/search", + $npt, + $jwt + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to parse JSON"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + if(isset($json["error"])){ + + $error = $json["error"]; + if(isset($json["developer_message"])){ + + $error .= " ({$json["developer_message"]})"; + } + + throw new Exception("Vimeo returned an error: " . 
$error); + } + + if(!isset($json["data"])){ + + throw new Exception("Vimeo did not return a data object"); + } + + switch($pagetype){ + + case "clip": + foreach($json["data"] as $video){ + + $video = $video["clip"]; + + if(isset($video["user"]["pictures"]["sizes"])){ + + $avatar = $video["user"]["pictures"]["sizes"][count($video["user"]["pictures"]["sizes"]) - 1]["link"]; + }else{ + + $avatar = null; + } + + $out["video"][] = [ + "title" => $video["name"], + "description" => + $this->limitstrlen( + $video["description"] + ), + "author" => [ + "name" => $video["user"]["name"], + "url" => $video["user"]["link"], + "avatar" => $avatar + ], + "date" => strtotime($video["created_time"]), + "duration" => (int)$video["duration"], + "views" => (int)$video["stats"]["plays"], + "thumb" => [ + "ratio" => "16:9", + "url" => $video["pictures"]["base_link"] + ], + "url" => $video["link"] + ]; + } + break; + + case "ondemand": + foreach($json["data"] as $video){ + + $video = $video["ondemand"]; + + $description = []; + if(isset($video["metadata"]["interactions"]["rent"]["display_price"])){ + + $description[] = "Rent for " . $video["metadata"]["interactions"]["rent"]["display_price"]; + } + + if(isset($video["metadata"]["interactions"]["buy"]["display_price"])){ + + $description[] = "Buy for " . 
$video["metadata"]["interactions"]["buy"]["display_price"]; + } + + $description = implode(", ", $description); + + $out["video"][] = [ + "title" => $video["name"], + "description" => $description, + "author" => [ + "name" => null, + "url" => null, + "avatar" => null + ], + "date" => null, + "duration" => null, + "views" => null, + "thumb" => [ + "ratio" => "9:16", + "url" => $video["pictures"]["sizes"][0]["link"] + ], + "url" => $video["link"] + ]; + } + break; + + case "people": + foreach($json["data"] as $user){ + + $user = $user["people"]; + + if( + isset($user["pictures"]["sizes"]) && + count($user["pictures"]["sizes"]) !== 0 + ){ + + $thumb = [ + "ratio" => "1:1", + "url" => $user["pictures"]["sizes"][count($user["pictures"]["sizes"]) - 1]["link"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["author"][] = [ + "title" => $user["name"], + "followers" => (int)$user["metadata"]["connections"]["followers"]["total"], + "description" => $user["metadata"]["public_videos"]["total"] . " videos.", + "thumb" => $thumb, + "url" => $user["link"] + ]; + } + break; + + case "channel": + case "group": + foreach($json["data"] as $channel){ + + $channel = $channel[$npt["filter_type"]]; + + if( + isset($channel["pictures"]["sizes"]) && + count($channel["pictures"]["sizes"]) !== 0 + ){ + + $thumb = [ + "ratio" => "16:9", + "url" => $channel["pictures"]["sizes"][count($channel["pictures"]["sizes"]) - 1]["link"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["author"][] = [ + "title" => $channel["name"], + "followers" => (int)$channel["metadata"]["connections"]["users"]["total"], + "description" => $channel["metadata"]["connections"]["videos"]["total"] . 
" videos.", + "thumb" => $thumb, + "url" => $channel["link"] + ]; + } + break; + } + + // + // get next page + // + if( + isset($json["paging"]["next"]) && + is_string($json["paging"]["next"]) + ){ + + $out["npt"] = + $this->backend + ->store( + json_encode([ + "npt" => $json["paging"]["next"], + "pagetype" => $pagetype + ]), + "videos", + $proxy + ); + } + + return $out; + } + + private function get_jwt(&$proxy){ + + // + // get jwt token + // it's probably safe to cache this across proxies, cause the jwt doesnt contain an userID + // only an appID, whatever shit that is + // we can only cache it for 5 minutes though, otherwise vimeo cries about it + // + if($proxy === null){ + + $proxy = $this->backend->get_ip(); + } + + $jwt = apcu_fetch("vimeo_jwt"); + + if($jwt === false){ + /* + $html = + $this->get( + $proxy, + "https://vimeo.com/search", + [], + false + ); + + $this->fuckhtml->load($html); + + $captcha = + $this->fuckhtml + ->getElementsByTagName( + "title" + ); + + if( + count($captcha) !== 0 && + $this->fuckhtml + ->getTextContent( + $captcha[0] + ) == "Vimeo / CAPTCHA Challenge" + ){ + + throw new Exception("Vimeo returned a Captcha"); + } + + $html = + explode( + '<script id="viewer-bootstrap" type="application/json">', + $html, + 2 + ); + + if(count($html) !== 2){ + + throw new Exception("Failed to find JWT json"); + } + + $jwt = + json_decode( + $this->fuckhtml + ->extract_json( + $html[1] + ), + true + ); + + if($jwt === null){ + + throw new Exception("Failed to decode JWT json"); + } + + if(!isset($jwt["jwt"])){ + + throw new Exception("Failed to grep JWT"); + } + + $jwt = $jwt["jwt"]; + */ + + try{ + $json = + $this->get( + $proxy, + "https://vimeo.com/_next/jwt", + [], + false + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JWT token"); + } + + $this->fuckhtml->load($json); + + $captcha = + $this->fuckhtml + ->getElementsByTagName( + "title" + ); + + if( + count($captcha) !== 0 && + $this->fuckhtml + ->getTextContent( + 
$captcha[0] + ) == "Vimeo / CAPTCHA Challenge" + ){ + + throw new Exception("Vimeo returned a Captcha"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("The JWT object could not be decoded"); + } + + if(!isset($json["token"])){ + + throw new Exception("Vimeo did not return a JWT"); + } + + $jwt = $json["token"]; + + apcu_store("vimeo_jwt", $jwt, 300); + } + + return $jwt; + } + + private function titledots($title){ + + $substr = substr($title, -3); + + if( + $substr == "..." || + $substr == "…" + ){ + + return trim(substr($title, 0, -3), " \n\r\t\v\x00\0\x0B\xc2\xa0"); + } + + return trim($title, " \n\r\t\v\x00\0\x0B\xc2\xa0"); + } + + private function limitstrlen($text){ + + return + explode( + "\n", + wordwrap( + str_replace( + ["\n\r", "\r\n", "\n", "\r"], + " ", + $text + ), + 300, + "\n" + ), + 2 + )[0]; + } +} diff --git a/scraper/vsco.php b/scraper/vsco.php new file mode 100644 index 0000000..8a7f057 --- /dev/null +++ b/scraper/vsco.php @@ -0,0 +1,257 @@ +<?php + +class vsco{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("vsco"); + } + + public function getfilters($page){ + + return []; + } + + private function get($proxy, $url, $get = [], $bearer = null){ + + $curlproc = curl_init(); + + if($get !== []){ + $get_tmp = http_build_query($get); + $url .= "?" . $get_tmp; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($bearer === null){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . 
config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: same-origin", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: */*", + "Accept-Language: en-US", + "Accept-Encoding: gzip", + "Referer: https://vsco.co/search/images/" . urlencode($get["query"]), + "authorization: Bearer " . $bearer, + "content-type: application/json", + "x-client-build: 1", + "x-client-platform: web", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin", + "Priority: u=0", + "TE: trailers"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + // http2 bypass + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function image($get){ + + if($get["npt"]){ + + [$data, $proxy] = + $this->backend->get( + $get["npt"], "images" + ); + + $data = json_decode($data, true); + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + // get bearer token + try{ + + $html = + $this->get( + $proxy, + "https://vsco.co/feed" + ); + + 
}catch(Exception $error){ + + throw new Exception("Failed to fetch feed page"); + } + + preg_match( + '/"tkn":"([A-z0-9]+)"/', + $html, + $bearer + ); + + if(!isset($bearer[1])){ + + throw new Exception("Failed to grep bearer token"); + } + + $data = [ + "pagination" => [ + "query" => $search, + "page" => 0, + "size" => 100 + ], + "bearer" => $bearer[1] + ]; + } + + try{ + + $json = + $this->get( + $proxy, + "https://vsco.co/api/2.0/search/images", + $data["pagination"], + $data["bearer"] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + if(!isset($json["results"])){ + + throw new Exception("Failed to access results object"); + } + + foreach($json["results"] as $image){ + + $image_domain = parse_url("https://" . $image["responsive_url"], PHP_URL_HOST); + $thumbnail = explode($image_domain, $image["responsive_url"], 2)[1]; + + if(substr($thumbnail, 0, 3) != "/1/"){ + + $thumbnail = + preg_replace( + '/^\/[^\/]+/', + "", + $thumbnail + ); + } + + $thumbnail = "https://img.vsco.co/cdn-cgi/image/width=480,height=360" . $thumbnail; + $size = + $this->image_ratio( + (int)$image["dimensions"]["width"], + (int)$image["dimensions"]["height"] + ); + + $out["image"][] = [ + "title" => $image["description"], + "source" => [ + [ + "url" => "https://" . $image["responsive_url"], + "width" => (int)$image["dimensions"]["width"], + "height" => (int)$image["dimensions"]["height"] + ], + [ + "url" => $thumbnail, + "width" => $size[0], + "height" => $size[1] + ] + ], + "url" => "https://" . $image["grid"]["domain"] . "/media/" . 
$image["imageId"] + ]; + } + + // get NPT + $max_page = ceil($json["total"] / 100); + $data["pagination"]["page"]++; + + if($max_page > $data["pagination"]["page"]){ + + $out["npt"] = + $this->backend->store( + json_encode($data), + "images", + $proxy + ); + } + + return $out; + } + + private function image_ratio($width, $height){ + + $ratio = [ + 480 / $width, + 360 / $height + ]; + + if($ratio[0] < $ratio[1]){ + + $ratio = $ratio[0]; + }else{ + + $ratio = $ratio[1]; + } + + return [ + floor($width * $ratio), + floor($height * $ratio) + ]; + } +} diff --git a/scraper/wiby.php b/scraper/wiby.php new file mode 100644 index 0000000..59f723c --- /dev/null +++ b/scraper/wiby.php @@ -0,0 +1,246 @@ +<?php + +class wiby{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("wiby"); + } + + public function getfilters($page){ + + if($page != "web"){ + + return []; + } + + return [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ], + "date" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "day" => "Past day", + "week" => "Past week", + "month" => "Past month", + "year" => "Past year", + ] + ] + ]; + } + + private function get($proxy, $url, $get = [], $nsfw){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . 
config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Cookie: ws={$nsfw}", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1"] + ); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + [$q, $proxy] = $this->backend->get($get["npt"], "web"); + $q = json_decode($q, true); + + $nsfw = $q["nsfw"]; + unset($q["nsfw"]); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + $date = $get["date"]; + $nsfw = $get["nsfw"] == "yes" ? "0" : "1"; + + $search = + str_replace( + [ + "!g", + "!gi", + "!gv", + "!gm", + "!b", + "!bi", + "!bv", + "!bm", + "!td", + "!tw", + "!tm", + "!ty", + "&g", + "&gi", + "&gv", + "&gm", + "&b", + "&bi", + "&bv", + "&bm", + "&td", + "&tw", + "&tm", + "&ty", + ], + "", + $search + ); + + switch($date){ + + case "day": $search = "!td " . $search; break; + case "week": $search = "!tw " . $search; break; + case "month": $search = "!tm " . $search; break; + case "year": $search = "!ty " . 
$search; break; + } + + $q = [ + "q" => $search + ]; + } + + try{ + $html = $this->get( + $proxy, + "https://wiby.me/", + $q, + $nsfw + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + + preg_match( + '/<p class="pin"><blockquote>(?:<\/p>)?<br><a class="more" href="\/\?q=[^"]+&p=([0-9]+)">Find more\.\.\.<\/a><\/blockquote>/', + $html, + $nextpage + ); + + if(count($nextpage) === 0){ + + $nextpage = null; + }else{ + + $nextpage = + $this->backend->store( + json_encode([ + "q" => $q["q"], + "p" => (int)$nextpage[1], + "nsfw" => $nsfw + ]), + "web", + $proxy + ); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => $nextpage, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + preg_match_all( + '/<blockquote>[\s]*<a .* href="(.*)">(.*)<\/a>.*<p>(.*)<\/p>[\s]*<\/blockquote>/Ui', + $html, + $links + ); + + for($i=0; $i<count($links[0]); $i++){ + + $out["web"][] = [ + "title" => $this->unescapehtml(trim($links[2][$i])), + "description" => $this->unescapehtml(trim(strip_tags($links[3][$i]), ".\n\r ")), + "url" => trim($links[1][$i]), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + + private function unescapehtml($str){ + + return html_entity_decode( + str_replace( + [ + "<br>", + "<br/>", + "</br>", + "<BR>", + "<BR/>", + "</BR>", + ], + "\n", + $str + ), + ENT_QUOTES | ENT_XML1, 'UTF-8' + ); + } +} diff --git a/scraper/yandex.php b/scraper/yandex.php new file mode 100644 index 0000000..f73c3fd --- /dev/null +++ b/scraper/yandex.php @@ -0,0 +1,1248 @@ +<?php + +class yandex{ + + /* + curl functions + */ + public function __construct(){ + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + + include "lib/backend.php"; + // backend included in the scraper 
functions + } + + private function get($proxy, $url, $get = [], $nsfw, $get_cookie = 1){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + // extract "i" cookie + if($get_cookie === 0){ + + $cookies_tmp = []; + curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ + + $length = strlen($header); + + $header = explode(":", $header, 2); + + if(trim(strtolower($header[0])) == "set-cookie"){ + + $cookie_tmp = explode("=", trim($header[1]), 2); + + $cookies_tmp[trim($cookie_tmp[0])] = + explode(";", $cookie_tmp[1], 2)[0]; + } + + return $length; + }); + } + + switch($nsfw){ + case "yes": $nsfw = "0"; break; + case "maybe": $nsfw = "1"; break; + case "no": $nsfw = "2"; break; + } + + switch($get_cookie){ + + case 0: + $cookie = ""; + break; + + case 1: + $cookie = "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw; + break; + + default: + $cookie = "Cookie: i=" . $get_cookie; + } + + $headers = + ["User-Agent: " . 
config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Encoding: gzip", + "Accept-Language: en-US,en;q=0.5", + "DNT: 1", + $cookie, + "Referer: https://yandex.com/images/search", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: cross-site", + "Upgrade-Insecure-Requests: 1"]; + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if($get_cookie === 0){ + + if(isset($cookies_tmp["i"])){ + + return $cookies_tmp["i"]; + }else{ + + throw new Exception("Failed to get Yandex clearance cookie"); + } + } + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function getfilters($pagetype){ + + switch($pagetype){ + + case "web": + return [ + "lang" => [ + "display" => "Language", + "option" => [ + "any" => "Any language", + "en" => "English", + "ru" => "Russian", + "be" => "Belorussian", + "fr" => "French", + "de" => "German", + "id" => "Indonesian", + "kk" => "Kazakh", + "tt" => "Tatar", + "tr" => "Turkish", + "uk" => "Ukrainian" + ] + ], + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "older" => [ + "display" => "Older than", + "option" => "_DATE" + ] + ]; + break; + + case "images": + return + [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "time" => [ + "display" => "Time posted", + "option" => [ + "any" 
=> "Any time", + "week" => "Last week" + ] + ], + "size" => [ + "display" => "Size", + "option" => [ + "any" => "Any size", + "small" => "Small", + "medium" => "Medium", + "large" => "Large", + "wallpaper" => "Wallpaper" + ] + ], + "color" => [ + "display" => "Colors", + "option" => [ + "any" => "All colors", + "color" => "Color images only", + "gray" => "Black and white", + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "cyan" => "Cyan", + "green" => "Green", + "blue" => "Blue", + "violet" => "Purple", + "white" => "White", + "black" => "Black" + ] + ], + "type" => [ + "display" => "Type", + "option" => [ + "any" => "All types", + "photo" => "Photos", + "clipart" => "White background", + "lineart" => "Drawings and sketches", + "face" => "People", + "demotivator" => "Demotivators" + ] + ], + "layout" => [ + "display" => "Layout", + "option" => [ + "any" => "All layouts", + "horizontal" => "Horizontal", + "vertical" => "Vertical", + "square" => "Square" + ] + ], + "format" => [ + "display" => "Format", + "option" => [ + "any" => "Any format", + "jpeg" => "JPEG", + "png" => "PNG", + "gif" => "GIF" + ] + ] + ]; + break; + + case "videos": + return [ + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ], + "time" => [ + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "9" => "Recently" + ] + ], + "duration" => [ + "display" => "Duration", + "option" => [ + "any" => "Any duration", + "short" => "Short" + ] + ] + ]; + break; + } + } + + public function web($get){ + + $this->backend = new backend("yandex_w"); + + // has captcha + // https://yandex.com/search/touch/?text=lol&app_platform=android&appsearch_header=1&ui=webmobileapp.yandex&app_version=23070603&app_id=ru.yandex.searchplugin&search_source=yandexcom_touch_native&clid=2218567 + + // https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712 + // 
&within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023 + + // get clearance cookie + if(($cookie = apcu_fetch("yandexweb_cookie")) === false){ + + $proxy = $this->backend->get_ip(); + + $cookie = + $this->get( + $proxy, + "https://yandex.ru/support2/smart-captcha/ru/", + [], + false, + 0 + ); + + apcu_store("yandexweb_cookie", $cookie); + } + + if($get["npt"]){ + + [$npt, $proxy] = $this->backend->get($get["npt"], "web"); + + $html = + $this->get( + $proxy, + "https://yandex.com" . $npt, + [], + "yes", + $cookie + ); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = !isset($proxy) ? $this->backend->get_ip() : $proxy; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + + $params = [ + "text" => $search, + "web" => "1", + "frame" => "1", + "searchid" => "3131712" + ]; + + if($lang != "any"){ + + $params["lang"] = $lang; + } + + if( + $newer === false && + $older !== false + ){ + + $newer = 0; + } + + if($newer !== false){ + + $params["from_day"] = date("j", $newer); + $params["from_month"] = date("n", $newer); + $params["from_year"] = date("Y", $newer); + + if($older === false){ + + $older = time(); + } + + $params["to_day"] = date("j", $older); + $params["to_month"] = date("n", $older); + $params["to_year"] = date("Y", $older); + } + + try{ + $html = + $this->get( + $proxy, + "https://yandex.com/search/site/", + $params, + "yes", + $cookie + ); + }catch(Exception $error){ + + throw new Exception("Could not get search page"); + } + + /* + $handle = fopen("scraper/yandex.html", "r"); + $html = fread($handle, filesize("scraper/yandex.html")); + fclose($handle);*/ + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + 
$this->fuckhtml->load($html); + + // Scrape page blocked error + $title = + $this->fuckhtml + ->getElementsByTagName("title"); + + if( + count($title) !== 0 && + $title[0]["innerHTML"] == "403" + ){ + + throw new Exception("Yandex blocked this proxy or 4get instance."); + } + + // get nextpage + $npt = + $this->fuckhtml + ->getElementsByClassName( + "b-pager__next", + "a" + ); + + if(count($npt) !== 0){ + + $out["npt"] = + $this->backend->store( + $this->fuckhtml + ->getTextContent( + $npt + [0] + ["attributes"] + ["href"] + ), + "web", + $proxy + ); + } + + // get items + $items = + $this->fuckhtml + ->getElementsByClassName( + "b-serp-item", + "li" + ); + + foreach($items as $item){ + + $this->fuckhtml->load($item); + + $link = + $this->fuckhtml + ->getElementsByClassName( + "b-serp-item__title-link", + "a" + )[0]; + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $link + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByClassName( + "b-serp-item__text", + "div" + )[0] + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $link + ["attributes"] + ["href"] + ), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + return $out; + } + + public function image($get){ + + $this->backend = new backend("yandex_i"); + + if($get["npt"]){ + + [$request, $proxy] = + $this->backend->get( + $get["npt"], + "images" + ); + + $request = json_decode($request, true); + + $nsfw = $request["nsfw"]; + unset($request["nsfw"]); + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + $nsfw = $get["nsfw"]; + $time = $get["time"]; + $size = $get["size"]; + $color = $get["color"]; + $type = $get["type"]; + $layout = $get["layout"]; + $format = $get["format"]; + /* + $handle = 
fopen("scraper/yandex.json", "r"); + $json = fread($handle, filesize("scraper/yandex.json")); + fclose($handle);*/ + + // SIZE + // large + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=large&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // medium + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=medium&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // small + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=small&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // ORIENTATION + // Horizontal + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&iorient=horizontal&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Vertical + // 227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&iorient=vertical&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Square + // 
227.0=1;203.0=1;76fe94.0=1;41d251.0=1;75.0=1;371.0=1;291.0=1;307.0=1;f797ee.0=1;1cf7c2.0=1;deca32.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&iorient=square&suggest_reqid=486139416166165501540886508227485&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // TYPE + // Photos + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=photo&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // White background + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=clipart&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Drawings and sketches + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=lineart&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // People + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=face&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Demotivators + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&text=minecraft&type=demotivator&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // COLOR + // Color images only + // 
307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=color&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Black and white + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=gray&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Red + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=red&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Orange + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=orange&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Yellow + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=yellow&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Cyan + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=cyan&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Green + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=green&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Blue + // 
307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=blue&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Purple + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=violet&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // White + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=white&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // Black + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&icolor=black&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // FORMAT + // jpeg + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&itype=jpg&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // png + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&itype=png&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // gif + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&itype=gifan&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // RECENT + // 
307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&recent=7D&text=minecraft&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + // WALLPAPER + // 307.0=1;371.0=1;291.0=1;203.0=1;deca32.0=1;f797ee.0=1;1cf7c2.0=1;41d251.0=1;267.0=1;bde197.0=1"},"extraContent":{"names":["i-react-ajax-adapter"]}}}&yu=4861394161661655015&isize=wallpaper&text=minecraft&wp=wh16x9_1920x1080&uinfo=sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080 + + + $request = [ + "format" => "json", + "request" => [ + "blocks" => [ + [ + "block" => "extra-content", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "i-global__params:ajax", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "search2:ajax", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "preview__isWallpaper", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "content_type_search", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "serp-controller", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "cookies_ajax", + "params" => (object)[], + "version" => 2 + ], + [ + "block" => "advanced-search-block", + "params" => (object)[], + "version" => 2 + ] + ], + "metadata" => [ + "bundles" => [ + "lb" => "AS?(E<X120" + ], + "assets" => [ + // las base + "las" => "justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;" + + // las default + //"las" => "justifier-height=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;227.0=1;203.0=1;76fe94.0=1;215f96.0=1;75.0=1" + ], + "extraContent" => [ + "names" => [ + "i-react-ajax-adapter" + ] + ] + ] + ] + ]; + + /* + Apply filters + */ + if($time == "week"){ + $request["recent"] = "7D"; + } + + if($size != "any"){ + + $request["isize"] = $size; + } + + if($type != "any"){ + + $request["type"] = $type; + } + 
+ if($color != "any"){ + + $request["icolor"] = $color; + } + + if($layout != "any"){ + + $request["iorient"] = $layout; + } + + if($format != "any"){ + + $request["itype"] = $format; + } + + $request["text"] = $search; + $request["uinfo"] = "sw-1920-sh-1080-ww-1125-wh-999-pd-1-wp-16x9_1920x1080"; + + $request["request"] = json_encode($request["request"]); + } + + try{ + $json = $this->get( + $proxy, + "https://yandex.com/images/search", + $request, + $nsfw, + "yandex_i" + ); + }catch(Exception $err){ + + throw new Exception("Failed to get JSON"); + } + + /* + $handle = fopen("scraper/yandex.json", "r"); + $json = fread($handle, filesize("scraper/yandex.json")); + fclose($handle);*/ + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if( + isset($json["type"]) && + $json["type"] == "captcha" + ){ + + throw new Exception("Yandex blocked this 4get instance. Please try again in ~7 minutes."); + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + // get html + $html = ""; + foreach($json["blocks"] as $block){ + + $html .= $block["html"]; + // get next page + if( + isset($block["params"]["nextPageUrl"]) && + !empty($block["params"]["nextPageUrl"]) + ){ + + $request["nsfw"] = $nsfw; + + if(isset($request["p"])){ + + $request["p"]++; + }else{ + + $request["p"] = 1; + } + + $out["npt"] = + $this->backend->store( + json_encode($request), + "images", + $proxy + ); + } + } + + $this->fuckhtml->load($html); + + // get search results + $data = null; + + foreach( + $this->fuckhtml + ->getElementsByClassName( + "Root", + "div" + ) as $div + ){ + + if(isset($div["attributes"]["data-state"])){ + + $tmp = json_decode( + $this->fuckhtml + ->getTextContent( + $div["attributes"]["data-state"] + ), + true + ); + + if(isset($tmp["initialState"]["serpList"])){ + + $data = $tmp; + break; + } + } + } + + if($data === null){ + + throw new Exception("Failed to extract JSON"); + } + + 
/*
	Run a Yandex video search and return the results in the scraper's
	video result layout.

	$get keys read here: "npt" (next page token) or, for a first page,
	"s" (search term), "nsfw", "time" and "duration".

	Returns an array with "status", "npt" and "video"; the "author",
	"livestream", "playlist" and "reel" lists are always returned empty.

	Throws Exception when the search term is empty, the request fails,
	the response is not valid JSON, or the response has no "blocks" key.
*/
public function video($get){
	
	$this->backend = new backend("yandex_v");
	
	if($get["npt"]){
		
		// resume from a stored request: the token payload is the JSON
		// encoded parameter list plus the nsfw setting
		[$params, $proxy] =
			$this->backend->get(
				$get["npt"],
				"video"
			);
		
		$params = json_decode($params, true);
		
		$nsfw = $params["nsfw"];
		unset($params["nsfw"]);
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		$nsfw = $get["nsfw"];
		$time = $get["time"];
		$duration = $get["duration"];
		
		// every requested block has the same shape: {"block":NAME,"params":{},"version":2}
		$blocks = [];
		foreach(
			[
				"extra-content",
				"i-global__params:ajax",
				"search2:ajax",
				"vital-incut",
				"content_type_search",
				"serp-controller",
				"cookies_ajax"
			] as $block_name
		){
			
			$blocks[] = [
				"block" => $block_name,
				"params" => (object)[], // must encode as {} and not []
				"version" => 2
			];
		}
		
		$params = [
			"tmpl_version" => "releases/frontend/video/v1.1168.0#8d942de0f4ebc4eb6b8f3c24ffbd1f8dbc5bbe63",
			"format" => "json",
			"request" => json_encode([
				"blocks" => $blocks,
				"metadata" => [
					"bundles" => [
						"lb" => "^G]!q<X120"
					],
					"assets" => [
						"las" => "react-with-dom=1;185.0=1;73.0=1;145.0=1;5a502a.0=1;32c342.0=1;b84ac8.0=1"
					],
					"extraContent" => [
						"names" => [
							"i-react-ajax-adapter"
						]
					]
				]
			]),
			"text" => $search
		];
		
		if($duration != "any"){
			
			$params["duration"] = $duration;
		}
		
		if($time != "any"){
			
			$params["within"] = $time;
		}
	}
	
	try{
		$json =
			$this->get(
				$proxy,
				"https://yandex.com/video/search",
				$params,
				$nsfw,
				"yandex_v"
			);
	}catch(Exception $error){
		
		// keep the original cause attached for debugging
		throw new Exception("Could not fetch JSON", 0, $error);
	}
	
	$json = json_decode($json, true);
	
	if($json === null){
		
		throw new Exception("Could not parse JSON");
	}
	
	if(!isset($json["blocks"])){
		
		throw new Exception("Yandex blocked this 4get instance. Please try again in 7~ minutes.");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	
	// the result markup is spread over the "html" member of the blocks
	$html = null;
	foreach($json["blocks"] as $block){
		
		if(isset($block["html"])){
			
			$html .= $block["html"];
		}
	}
	
	$this->fuckhtml->load($html);
	
	$div =
		$this->fuckhtml
		->getElementsByTagName("div");
	
	/*
		Get nextpage
	*/
	$npt =
		$this->fuckhtml
		->getElementsByClassName(
			"more more_direction_next i-bem",
			$div
		);
	
	if(count($npt) !== 0){
		
		// advance the page counter the same way image() does; always
		// storing "1" made every token point at the second page
		$params["p"] = isset($params["p"]) ? (int)$params["p"] + 1 : 1;
		$params["nsfw"] = $nsfw;
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"video",
				$proxy
			);
	}
	
	$items =
		$this->fuckhtml
		->getElementsByClassName(
			"serp-item",
			$div
		);
	
	foreach($items as $item){
		
		// result metadata lives in a JSON "data-video" attribute
		if(!isset($item["attributes"]["data-video"])){
			
			continue;
		}
		
		$data =
			json_decode(
				$this->fuckhtml
				->getTextContent(
					$item["attributes"]["data-video"]
				),
				true
			);
		
		$hosting = $data["counters"]["toHostingLoaded"] ?? null;
		
		if(
			!is_array($data) ||
			!isset($data["title"]) ||
			!isset($hosting["postfix"]["href"])
		){
			
			// nothing usable to link to
			continue;
		}
		
		$this->fuckhtml->load($item);
		
		$thumb =
			$this->fuckhtml
			->getElementsByClassName(
				"thumb-image__image",
				"img"
			);
		
		if(
			count($thumb) === 0 ||
			!isset($thumb[0]["attributes"]["src"])
		){
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}else{
			
			$thumb = [
				"url" =>
					// only a leading protocol-relative "//" gets a scheme.
					// str_replace()'s 4th argument is an output count, not
					// a limit, so it rewrote every "//" in the URL
					preg_replace(
						'/^\/\//',
						"https://",
						$this->fuckhtml
						->getTextContent(
							$thumb
							[0]
							["attributes"]
							["src"]
						)
					),
				"ratio" => "16:9"
			];
		}
		
		$smallinfos =
			$this->fuckhtml
			->getElementsByClassName(
				"serp-item__sitelinks-item",
				"div"
			);
		
		$date = null;
		$views = null;
		
		// the first sitelinks entry is skipped; the remaining ones are
		// read as either a date or a view count
		foreach(array_slice($smallinfos, 1) as $info){
			
			$info =
				$this->fuckhtml
				->getTextContent(
					$info
				);
			
			if($temp_date = strtotime($info)){
				
				$date = $temp_date;
			}else{
				
				$views = $this->parseviews($info);
			}
		}
		
		$description =
			$this->fuckhtml
			->getElementsByClassName(
				"serp-item__text serp-item__text_visibleText_always",
				"div"
			);
		
		if(count($description) === 0){
			
			$description = null;
		}else{
			
			$description =
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$description[0]
					)
				);
		}
		
		$out["video"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$this->titledots(
						$data["title"]
					)
				),
			"description" => $description,
			"author" => [
				"name" => null,
				"url" => null,
				"avatar" => null
			],
			"date" => $date,
			"duration" => (int)($hosting["stredParams"]["duration"] ?? 0),
			"views" => $views,
			"thumb" => $thumb,
			"url" =>
				// upgrade the scheme only; "http://" inside a query
				// string must stay untouched
				preg_replace(
					'/^http:\/\//',
					"https://",
					$this->fuckhtml
					->getTextContent(
						$hosting["postfix"]["href"]
					)
				)
		];
	}
	
	return $out;
}

/*
	Convert a view counter such as "1.5 mln. views" into a number.

	The first space separated word is read as the number and the second
	as an optional magnitude ("thsd.", "mln." or "bln."). Any other or
	missing second word leaves the number unscaled. Returns a float.
*/
private function parseviews($text){
	
	$text = explode(" ", trim($text));
	
	$num = (float)$text[0];
	$mod = $text[1] ?? ""; // a bare number has no magnitude word
	
	switch($mod){
		
		case "bln.": $num = $num * 1000000000; break;
		case "mln.": $num = $num * 1000000; break;
		case "thsd.": $num = $num * 1000; break;
	}
	
	return $num;
}

/*
	Trim $title and drop a trailing ellipsis, written either as three
	dots or as the single "…" character (also 3 bytes in UTF-8).
*/
private function titledots($title){
	
	$substr = substr($title, -3);
	
	if(
		$substr === "..." ||
		$substr === "…"
	){
		
		return trim(substr($title, 0, -3));
	}
	
	return trim($title);
}
|| + $substr == "…" + ){ + + return trim(substr($title, 0, -3)); + } + + return trim($title); + } +} diff --git a/scraper/yep.php b/scraper/yep.php new file mode 100644 index 0000000..bfe347f --- /dev/null +++ b/scraper/yep.php @@ -0,0 +1,741 @@ +<?php + +class yep{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("yep"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return [ + "country" => [ + "display" => "Country", + "option" => [ + "all" => "All regions", + "af" => "Afghanistan", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "br" => "Brazil", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "cv" => "Cabo Verde", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "co" => "Colombia", + "cg" => "Congo", + "cd" => "Congo, Democratic Republic", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czechia", + "ci" => "Côte d'Ivoire", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fo" => "Faroe 
Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gg" => "Guernsey", + "gn" => "Guinea", + "gy" => "Guyana", + "ht" => "Haiti", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "iq" => "Iraq", + "ie" => "Ireland", + "im" => "Isle of Man", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "je" => "Jersey", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "ly" => "Libya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mk" => "Macedonia", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States of", + "md" => "Moldova", + "mc" => "Monaco", + "mn" => "Mongolia", + "me" => "Montenegro", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "ps" => "Palestine, State of", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => 
"Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "re" => "Réunion", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "rs" => "Serbia", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "kr" => "Sourth Korea", + "za" => "South Africa", + "es" => "Spain", + "lk" => "Sri Lanka", + "sr" => "Suriname", + "se" => "Sweden", + "ch" => "Switzerland", + "tw" => "Taiwan", + "tj" => "Tajikistan", + "tz" => "Tanzania", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "gb" => "United Kingdom", + "us" => "United States", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Vietnam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", + "maybe" => "Maybe", + "no" => "No" + ] + ] + ]; + } + + private function get($proxy, $url, $get = []){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . 
$get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + // set ciphers + curl_setopt( + $curlproc, + CURLOPT_SSL_CIPHER_LIST, + "aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha" + ); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip, deflate, br, zstd", + "Referer: https://yep.com/", + "Origin: https://yep.com", + "DNT: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-site", + "Priority: u=4", + "TE: trailers"] + ); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + + + public function web($get){ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + + switch($nsfw){ + + case "yes": $nsfw = "off"; break; + case "maybe": $nsfw = "moderate"; break; + case "no": $nsfw = "strict"; break; + } + + $out = [ + "status" => 
"ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + try{ + + // https://api.yep.com/fs/2/search?client=web&gl=CA&no_correct=false&q=undefined+variable+javascript&safeSearch=off&type=web + $json = + $this->get( + $this->backend->get_ip(), + "https://api.yep.com/fs/2/search", + [ + "client" => "web", + "gl" => $country == "all" ? $country : strtoupper($country), + "limit" => "99999", + "no_correct" => "false", + "q" => $search, + "safeSearch" => $nsfw, + "type" => "web" + ] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $this->detect_cf($json); + + $json = json_decode($json, true); + //$json = json_decode(file_get_contents("scraper/yep.json"), true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(isset($json[1]["correction"])){ + + $out["spelling"] = [ + "type" => "not_many", + "using" => $search, + "correction" => $json[1]["correction"][1] + ]; + } + + if(isset($json[1]["results"])){ + foreach($json[1]["results"] as $item){ + + switch(strtolower($item["type"])){ + + case "organic": + $sublinks = []; + + if(isset($item["sitelinks"]["full"])){ + + foreach($item["sitelinks"]["full"] as $link){ + + $sublinks[] = [ + "title" => $link["title"], + "date" => null, + "description" => + $this->titledots( + strip_tags( + html_entity_decode( + $link["snippet"] + ) + ) + ), + "url" => $link["url"] + ]; + } + } + + $out["web"][] = [ + "title" => $item["title"], + "description" => + $this->titledots( + strip_tags( + html_entity_decode( + $item["snippet"] + ) + ) + ), + "url" => $item["url"], + "date" => strtotime($item["first_seen"]), + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublinks, + "table" => [] + ]; + break; + } + } + } + + if(isset($json[1]["featured_news"])){ + + 
foreach($json[1]["featured_news"] as $news){ + + $out["news"][] = [ + "title" => $news["title"], + "description" => + $this->titledots( + strip_tags( + html_entity_decode( + $news["snippet"] + ) + ) + ), + "date" => strtotime($news["first_seen"]), + "thumb" => + isset($news["img"]) ? + [ + "url" => $this->unshiturl($news["img"]), + "ratio" => "16:9" + ] : + [ + "url" => null, + "ratio" => null + ], + "url" => $news["url"] + ]; + } + } + + if(isset($json[1]["featured_images"])){ + + foreach($json[1]["featured_images"] as $image){ + + if( + $image["width"] !== 0 && + $image["height"] !== 0 + ){ + + $thumb_width = $image["width"] >= 260 ? 260 : $image["width"]; + $thumb_height = ceil($image["height"] * ($thumb_width / $image["width"])); + + $width = $image["width"]; + $height = $image["height"]; + }else{ + + $thumb_width = null; + $thumb_height = null; + $width = null; + $height = null; + } + + $out["image"][] = [ + "title" => $image["title"], + "source" => [ + [ + "url" => $image["image_id"], + "width" => $width, + "height" => $height + ], + [ + "url" => $image["src"], + "width" => $thumb_width, + "height" => $thumb_height + ] + ], + "url" => $image["host_page"] + ]; + } + } + + return $out; + } + + + + public function image($get){ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + + switch($nsfw){ + + case "yes": $nsfw = "off"; break; + case "maybe": $nsfw = "moderate"; break; + case "no": $nsfw = "strict"; break; + } + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + try{ + + $json = + $this->get( + $this->backend->get_ip(), // no nextpage! + "https://api.yep.com/fs/2/search", + [ + "client" => "web", + "gl" => $country == "all" ? 
$country : strtoupper($country), + "no_correct" => "false", + "q" => $search, + "safeSearch" => $nsfw, + "type" => "images" + ] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $this->detect_cf($json); + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(isset($json[1]["results"])){ + foreach($json[1]["results"] as $item){ + + if( + $item["width"] !== 0 && + $item["height"] !== 0 + ){ + + $thumb_width = $item["width"] >= 260 ? 260 : $item["width"]; + $thumb_height = ceil($item["height"] * ($thumb_width / $item["width"])); + + $width = $item["width"]; + $height = $item["height"]; + }else{ + + $thumb_width = null; + $thumb_height = null; + $width = null; + $height = null; + } + + $out["image"][] = [ + "title" => $item["title"], + "source" => [ + [ + "url" => $item["image_id"], + "width" => $width, + "height" => $height + ], + [ + "url" => $item["src"], + "width" => $thumb_width, + "height" => $thumb_height + ] + ], + "url" => $item["host_page"] + ]; + } + } + + return $out; + } + + + public function news($get){ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $country = $get["country"]; + $nsfw = $get["nsfw"]; + + switch($nsfw){ + + case "yes": $nsfw = "off"; break; + case "maybe": $nsfw = "moderate"; break; + case "no": $nsfw = "strict"; break; + } + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + try{ + + // https://api.yep.com/fs/2/search?client=web&gl=CA&no_correct=false&q=undefined+variable+javascript&safeSearch=off&type=web + $json = + $this->get( + $this->backend->get_ip(), + "https://api.yep.com/fs/2/search", + [ + "client" => "web", + "gl" => $country == "all" ? 
$country : strtoupper($country), + "limit" => "99999", + "no_correct" => "false", + "q" => $search, + "safeSearch" => $nsfw, + "type" => "news" + ] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $this->detect_cf($json); + + $json = json_decode($json, true); + //$json = json_decode(file_get_contents("scraper/yep.json"), true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(isset($json[1]["results"])){ + foreach($json[1]["results"] as $item){ + + $out["news"][] = [ + "title" => $item["title"], + "author" => null, + "description" => + $this->titledots( + strip_tags( + html_entity_decode( + $item["snippet"] + ) + ) + ), + "date" => strtotime($item["first_seen"]), + "thumb" => + isset($item["img"]) ? + [ + "url" => $this->unshiturl($item["img"]), + "ratio" => "16:9" + ] : + [ + "url" => null, + "ratio" => null + ], + "url" => $item["url"] + ]; + } + } + + return $out; + } + + + private function detect_cf($payload){ + + // detect cloudflare page + $this->fuckhtml->load($payload); + + if( + count( + $this->fuckhtml + ->getElementsByClassName( + "cf-wrapper", + "div" + ) + ) !== 0 + ){ + + throw new Exception("Blocked by Cloudflare. 
Please follow curl-impersonate installation instructions");
		}
	}


	/**
	 * Remove a trailing ellipsis marker ("..." or "…") from a snippet and
	 * trim surrounding whitespace.
	 *
	 * The marker is searched in the last 4 bytes so that one stray byte after
	 * it is tolerated. Only the marker (and whatever follows it) is removed;
	 * the text in front of it is kept intact.
	 *
	 * @param string $title Snippet text
	 * @return string Cleaned snippet
	 */
	private function titledots($title){

		$title = (string)$title;
		$tail = (string)substr($title, -4);

		foreach(["...", "…"] as $marker){

			$pos = strpos($tail, $marker);

			if($pos !== false){

				// cut at the marker itself instead of a fixed 4 bytes, so
				// "foo..." yields "foo" and not "fo"
				return
					trim(
						substr(
							$title,
							0,
							strlen($title) - strlen($tail) + $pos
						)
					);
			}
		}

		return trim($title);
	}

	/**
	 * Extract the real target from a redirect-style URL.
	 *
	 * If the query string carries a "url" parameter its value is returned,
	 * otherwise the input is returned unchanged.
	 *
	 * @param string $url URL to unwrap
	 * @return mixed Value of the "url" query parameter, or $url
	 */
	private function unshiturl($url){

		$query = parse_url($url, PHP_URL_QUERY);

		// parse_url() yields null when there is no query string and false
		// on malformed input; neither may be handed to parse_str()
		if(!is_string($query)){

			return $url;
		}

		parse_str($query, $params);

		if(isset($params["url"])){

			return $params["url"];
		}

		return $url;
	}
}
diff --git a/scraper/yt.php b/scraper/yt.php new file mode 100644 index 0000000..a27fd82 --- /dev/null +++ b/scraper/yt.php @@ -0,0 +1,1727 @@
<?php

//$yt = new youtube();
//header("Content-Type: application/json");
//echo json_encode($yt->video("minecraft", null, "today", "any", "any", "live", "relevance"));

class yt{

	public function __construct(){

		include "lib/backend.php";
		$this->backend = new backend("yt");
	}

	/**
	 * Describe the filters available for a search page.
	 *
	 * Only the "videos" page has filters; every other page gets an empty
	 * array. Option keys are the values later compared in ytfilter().
	 *
	 * @param string $page Page identifier
	 * @return array Filter name => ["display" => label, "option" => [key => label]]
	 */
	public function getfilters($page){

		if($page != "videos"){

			return [];
		}

		return [
			"date" => [
				"display" => "Time posted",
				"option" => [
					"any" => "Any time",
					"hour" => "Last hour",
					"today" => "Today",
					"week" => "This week",
					"month" => "This month",
					"year" => "This year"
				]
			],
			"type" => [
				"display" => "Type",
				"option" => [
					"video" => "Video",
					"channel" => "Channel",
					"playlist" => "Playlist",
					// key must be lowercase: ytfilter() matches case "movie"
					"movie" => "Movie"
				]
			],
			"duration" => [
				"display" => "Duration",
				"option" => [
					"any" => "Any duration",
					// "short" is encoded as the under-4-minutes filter and
					// "long" as the over-20-minutes filter in ytfilter()
					"short" => "Short (<4min)",
					"medium" => "Medium (4-20min)",
					"long" => "Long (>20min)"
				]
			],
			"feature" => [
				"display" => "Feature",
				"option" => [
					"any" => "No features",
					"live" => "Live",
					"4k" => "4K",
					"hd" => "HD",
					"subtitles" => "Subtitles/CC",
					"creativecommons" => "Creative Commons",
					"360" => "VR 360°",
					"vr180" => "VR 180°",
					"3d" => "3D",
					"hdr" => "HDR"
				]
			],
			"sort" => [
				"display" => "Sort by",
				"option" => [
					"relevance" => "Relevance",
"upload_date" => "Upload date", + "view_count" => "View count", + "rating" => "Rating" + ] + ] + ]; + } + + private function ytfilter($date, $type, $duration, $feature, $sort){ + + // ------------ + // INCOMPATIBLE FILTERS + // channel,playlist DURATION, FEATURES, SORT BY + // Movie Features=[live, subtitles, creative commons, 3d] + + // live, 3D + // Type[channel, playlist, movie] + + // UPLOAD DATE, DURATION, 4k, 360, VR180, HDR + // Type[channel, playlist] + + // ----------- + + // MUST BE TOGETHER + // Relevance,upload date Type=Video + + switch($type){ + + case "channel": + case "playlist": + if($duration != "any"){ $duration = "any"; } + if($feature != "any"){ $feature = "any"; } + if($sort != "any"){ $sort = "any"; } + break; + + case "movie": + if( + in_array( + $feature, + [ + "live", + "subtitles", + "creative_commons", + "3d" + ], + ) + ){ + + $feature = "any"; + } + break; + } + + switch($feature){ + + case "live": + case "3d": + if( + in_array( + $type, + [ + "channel", + "playlist", + "movie" + ], + ) + ){ + + $type = "video"; + } + break; + } + + if( + ( + $date != "any" || + $duration != "any" || + $feature == "4k" || + $feature == "360" || + $feature == "vr180" || + $feature == "hdr" + ) && + ( + $type == "channel" || + $type == "playlist" + ) + ){ + + $type = "video"; + } + + if( + $date == "any" && + $type == "video" && + $duration == "any" && + $feature == "any" && + $sort == "relevance" + ){ + + return null; + } + + //print_r([$date, $type, $duration, $feature, $sort]); + + /* + Encode hex data + */ + + // UPLOAD DATE + // hour EgQIARAB 12 04 08 01 10 01 + // today EgQIAhAB 12 04 08 02 10 01 + // week EgQIAxAB 12 04 08 03 10 01 + // month EgQIBBAB 12 04 08 04 10 01 + // year EgQIBRAB 12 04 08 05 10 01 + + // TYPE + // video EgIQAQ%253D%253D 12 02 10 01 + // channel EgIQAg%253D%253D 12 02 10 02 + // playlist EgIQAw%253D%253D 12 02 10 03 + // movie EgIQBA%253D%253D 12 02 10 04 + + // DURATION + // -4min EgIYAQ%253D%253D 12 02 18 01 + // 4-20min 
EgIYAw%253D%253D 12 02 18 03 + // 20+min EgIYAg%253D%253D 12 02 18 02 + + // FEATURE + // live EgJAAQ%253D%253D 12 02 40 01 + // 4K EgJwAQ%253D%253D 12 02 70 01 + // HD EgIgAQ%253D%253D 12 02 20 01 + // Subtitles/CC EgIoAQ%253D%253D 12 02 28 01 + // Creative Commons EgIwAQ%253D%253D 12 02 30 01 + // 360 EgJ4AQ%253D%253D 12 02 78 01 + // VR180 EgPQAQE%253D 12 03 d0 01 01 + // 3D EgI4AQ%253D%253D 12 02 38 01 + // HDR EgPIAQE%253D 12 03 c8 01 01 + // (location & purchased unused) + + // SORT BY + // Relevance CAASAhAB 08 00 12 02 10 01 (is nothing by default) + // Upload date CAI%253D 08 02 + // View count CAM%253D 08 03 + // Rating CAE%253D 08 01 + + // video + // 12 02 10 01 + + // under 4 minutes + // 12 02 18 01 + + // video + under 4 minutes + // 12 04 10 01 18 01 + + // video + under 4 minutes + HD + // 08 00 12 06 10 01 18 01 20 01 + + // video + under 4 minutes + upload date + // 08 02 12 04 10 01 18 01 + + // video + under 4 minutes + HD + upload date + // 08 02 12 06 10 01 18 01 20 01 + + // this year + video + under 4 minutes + HD + upload date + // 08 02 12 08 08 05 10 01 18 01 20 01 + + // this week + video + over 20 minutes + HD + view count + // 08 03 12 08 08 03 10 01 18 02 20 01 + + //echo urlencode(urlencode(base64_encode(hex2bin($str)))); + //echo bin2hex(base64_decode(urldecode(urldecode("CAI%253D")))); + + // week + video + 20min + rating + // 08 01 12 06 08 03 10 01 18 02 + + // week + video + 20min + live + rating + // 08 01 12 08 08 03 10 01 18 02 40 01 + + // live 12 02 40 01 + + $hex = null; + if( + $date == "any" && + $type == "video" && + $duration == "any" && + $feature == "any" && + $sort == "relevance" + ){ + + return $hex; + } + + $opcode = 0; + + if($date != "any"){ $opcode += 2; } + if($type != "any"){ $opcode += 2; } + if($duration != "any"){ $opcode += 2; } + + switch($feature){ + + case "live": + case "4k": + case "hd": + case "subtitles": + case "creativecommons": + case "360": + case "3d": + $opcode += 2; + break; + + case "hdr": 
+ case "vr180": + $opcode += 3; + break; + } + + switch($sort){ + + case "relevance": $hex .= "0800"; break; + case "upload_date": $hex .= "0802"; break; + case "view_count": $hex .= "0803"; break; + case "rating": $hex .= "0801"; break; + } + + $hex .= "12" . "0".$opcode; + + switch($date){ + + case "hour": $hex .= "0801"; break; + case "today": $hex .= "0802"; break; + case "week": $hex .= "0803"; break; + case "month": $hex .= "0804"; break; + case "year": $hex .= "0805"; break; + } + + switch($type){ + + case "video": $hex .= "1001"; break; + case "channel": $hex .= "1002"; break; + case "playlist": $hex .= "1003"; break; + case "movie": $hex .= "1004"; break; + } + + switch($duration){ + + case "short": $hex .= "1801"; break; + case "medium": $hex .= "1803"; break; + case "long": $hex .= "1802"; break; + } + + switch($feature){ + + case "live": $hex .= "4001"; break; + case "4k": $hex .= "7001"; break; + case "hd": $hex .= "2001"; break; + case "subtitles": $hex .= "2801"; break; + case "creativecommons": $hex .= "3001"; break; + case "360": $hex .= "7801"; break; + case "vr180": $hex .= "d00101"; break; + case "3d": $hex .= "3801"; break; + case "hdr": $hex .= "c80101"; break; + } + + //echo $hex . "\n\n"; + return urlencode(base64_encode(hex2bin($hex))); + } + + // me reading youtube's json + // https://imgur.com/X9hVlFX + + const req_web = 0; + const req_xhr = 1; + + private function get($proxy, $url, $get = [], $reqtype = self::req_web, $continuation = null){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + switch($reqtype){ + case self::req_web: + $headers = + ["User-Agent: " . 
config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip",
					"Cookie: PREF=tz=America.New_York",
					"DNT: 1",
					"Connection: keep-alive",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: none",
					"Sec-Fetch-User: ?1"];
				break;

			case self::req_xhr:
				// JSON POST request: $continuation is sent as the request body
				$headers =
					["User-Agent: " . config::USER_AGENT,
					"Accept: */*",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip",
					"Cookie: PREF=tz=America.New_York",
					// was "https://youtube.com.com/" (typo); use the host
					// the requests are actually sent to
					"Referer: https://www.youtube.com/",
					"Content-Type: application/json",
					"Content-Length: " . strlen($continuation),
					"DNT: 1",
					"Connection: keep-alive",
					"Sec-Fetch-Dest: empty",
					"Sec-Fetch-Mode: same-origin",
					"Sec-Fetch-Site: same-origin"];

				curl_setopt($curlproc, CURLOPT_POST, true);
				curl_setopt($curlproc, CURLOPT_POSTFIELDS, $continuation);
				break;
		}

		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

		$this->backend->assign_proxy($curlproc, $proxy);

		$data = curl_exec($curlproc);

		if(curl_errno($curlproc)){

			// read the message first, then release the handle so a failed
			// request does not leak it
			$message = curl_error($curlproc);
			curl_close($curlproc);

			throw new Exception($message);
		}

		curl_close($curlproc);
		return $data;
	}

	/**
	 * Run a video search, or continue one when $get["npt"] is set.
	 *
	 * Results are collected in $this->out under the keys initialised below.
	 *
	 * @param array $get Request parameters
	 * @return array The populated $this->out structure
	 * @throws Exception When the search term is empty or a fetch/parse step fails
	 */
	public function video($get){

		$this->out = [
			"status" => "ok",
			"npt" => null,
			"video" => [],
			"author" => [],
			"livestream" => [],
			"playlist" => [],
			"reel" => []
		];

		if($get["npt"]){

			// parse nextPage
			// https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false
			/*
			$handle = fopen("nextpage.json", "r");
			$json = 
fread($handle, filesize("nextpage.json")); + fclose($handle);*/ + + [$npt, $proxy] = + $this->backend->get( + $get["npt"], + "videos" + ); + + $npt = json_decode($npt, true); + + try{ + $json = $this->get( + $proxy, + "https://www.youtube.com/youtubei/v1/search", + [ + "key" => $npt["key"], + "prettyPrint" => "false" + ], + self::req_xhr, + json_encode($npt["post"]) + ); + }catch(Exception $error){ + + throw new Exception("Could not fetch results page"); + } + + $json = json_decode($json); + + foreach( + $json + ->onResponseReceivedCommands[0] + ->appendContinuationItemsAction + ->continuationItems[0] + ->itemSectionRenderer + ->contents + as $video + ){ + + $this->parsevideoobject($video); + } + + if( + !isset( + $json + ->onResponseReceivedCommands[0] + ->appendContinuationItemsAction + ->continuationItems[1] + ->continuationItemRenderer + ->continuationEndpoint + ->continuationCommand + ->token + ) + ){ + + $npt = null; + + }else{ + // prepare nextpage for later.. + $npt["post"]["continuation"] = + $json + ->onResponseReceivedCommands[0] + ->appendContinuationItemsAction + ->continuationItems[1] + ->continuationItemRenderer + ->continuationEndpoint + ->continuationCommand + ->token; + } + + $this->out["npt"] = $npt; + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + $date = $get["date"]; + $type = $get["type"]; + $duration = $get["duration"]; + $feature = $get["feature"]; + $sort = $get["sort"]; + + // parse ytInitialData + + $get = [ + "search_query" => $search + ]; + + if( + ( + $filter = + $this->ytfilter( + $date, + $type, + $duration, + $feature, + $sort + ) + ) !== null + ){ + + $get["sp"] = $filter; + } + + try{ + $json = $this->get( + $proxy, + "https://www.youtube.com/results", + $get + ); + }catch(Exception $error){ + + throw new Exception("Could not fetch results page"); + } + /* + $handle = fopen("test.html", "r"); + $json = fread($handle, 
filesize("test.html")); + fclose($handle); + */ + if( + !preg_match( + '/ytcfg\.set\(({".*})\); *window\.ytcfg/', + $json, + $ytconfig + ) + ){ + + throw new Exception("Could not get ytcfg"); + } + + $ytconfig = json_decode($ytconfig[1]); + + if( + !preg_match( + '/ytInitialData *= *({.*});<\/script>/', + $json, + $json + ) + ){ + + throw new Exception("Could not get ytInitialData"); + } + + $json = json_decode($json[1]); + + // generate POST data for nextpage + + $ytconfig->INNERTUBE_CONTEXT->client->screenWidthPoints = 1239; + $ytconfig->INNERTUBE_CONTEXT->client->screenHeightPoints = 999; + $ytconfig->INNERTUBE_CONTEXT->client->screenPixelDensity = 1; + $ytconfig->INNERTUBE_CONTEXT->client->screenDensityFloat = 1; + $ytconfig->INNERTUBE_CONTEXT->client->utcOffsetMinutes = -240; + $ytconfig->INNERTUBE_CONTEXT->request->internalExperimentFlags = []; + $ytconfig->INNERTUBE_CONTEXT->request->consistencyTokenJars = []; + + $ytconfig->INNERTUBE_CONTEXT->client->mainAppWebInfo = [ + "graftUrl" => $ytconfig->INNERTUBE_CONTEXT->client->originalUrl, + "webDisplayMode" => "WEB_DISPLAY_MODE_BROWSER", + "isWebNativeShareAvailable" => false + ]; + + $ytconfig->INNERTUBE_CONTEXT->adSignalsInfo = [ + "params" => [ + [ + "key" => "dt", + "value" => (string)$ytconfig->TIME_CREATED_MS + ], + [ + "key" => "flash", + "value" => "0" + ], + [ + "key" => "frm", + "value" => "0" + ], + [ + "key" => "u_tz", + "value" => "-240" + ], + [ + "key" => "u_his", + "value" => "3" + ], + [ + "key" => "u_h", + "value" => "1080" + ], + [ + "key" => "u_w", + "value" => "1920" + ], + [ + "key" => "u_ah", + "value" => "1080" + ], + [ + "key" => "u_cd", + "value" => "24" + ], + [ + "key" => "bc", + "value" => "31" + ], + [ + "key" => "bih", + "value" => "999" + ], + [ + "key" => "biw", + "value" => "1239" + ], + [ + "key" => "brdim", + "value" => "0,0,0,0,1920,0,1920,1061,1239,999" + ], + [ + "key" => "vis", + "value" => "1" + ], + [ + "key" => "wgl", + "value" => "true" + ], + [ + "key" => "ca_type", 
+ "value" => "image" + ] + ] + ]; + + /* + echo json_encode($json); + die();*/ + + // *inhales* + foreach( + $json + ->contents + ->twoColumnSearchResultsRenderer + ->primaryContents + ->sectionListRenderer + ->contents[0] + ->itemSectionRenderer + ->contents + as $video + ){ + + $this->parsevideoobject($video); + } + + // get additional data from secondaryContents + if( + isset( + $json + ->contents + ->twoColumnSearchResultsRenderer + ->secondaryContents + ->secondarySearchContainerRenderer + ->contents[0] + ->universalWatchCardRenderer + ) + ){ + + $video = + $json + ->contents + ->twoColumnSearchResultsRenderer + ->secondaryContents + ->secondarySearchContainerRenderer + ->contents[0] + ->universalWatchCardRenderer; + /* + echo json_encode($video); + die();*/ + + $author = + [ + "name" => + $video + ->header + ->watchCardRichHeaderRenderer + ->title + ->simpleText, + "url" => + "https://www.youtube.com/channel/" . + $video + ->header + ->watchCardRichHeaderRenderer + ->titleNavigationEndpoint + ->browseEndpoint + ->browseId, + "avatar" => null + ]; + + if( + isset( + $video + ->header + ->watchCardRichHeaderRenderer + ->avatar + ->thumbnails[0] + ->url + ) + ){ + + $author["avatar"] = + $video + ->header + ->watchCardRichHeaderRenderer + ->avatar + ->thumbnails[0] + ->url; + } + + // add video in callToAction if present + if( + isset( + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->lengthText + ) + ){ + + array_push( + $this->out["video"], + [ + "title" => + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->title + ->simpleText, + "description" => null, + "author" => $author, + "date" => + $this->textualdate2unix( + trim( + explode( + "•", + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->subtitle + ->simpleText + )[2] + ) + ), + "duration" => + $this->hms2int( + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->lengthText + ->simpleText + ), + "views" => + $this->truncatedcount2int( + trim( + explode( + "•", + $video + 
->callToAction + ->watchCardHeroVideoRenderer + ->subtitle + ->simpleText, + 2 + )[1] + ) + ), + "thumb" => [ + "url" => + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->heroImage + ->singleHeroImageRenderer + ->thumbnail + ->thumbnails[0] + ->url, + "ratio" => "16:9" + ], + "url" => + "https://www.youtube.com/watch?v=" . + $video + ->callToAction + ->watchCardHeroVideoRenderer + ->navigationEndpoint + ->watchEndpoint + ->videoId + ] + ); + } + + // get all playlists, ignore videos + $out = null; + + foreach( + $video + ->sections + as $section + ){ + + if( + isset( + $section + ->watchCardSectionSequenceRenderer + ->lists[0] + ->horizontalCardListRenderer + ->cards + ) + ){ + + $out = + $section + ->watchCardSectionSequenceRenderer + ->lists[0] + ->horizontalCardListRenderer + ->cards; + break; + } + } + + if($out !== null){ + + foreach( + $out as $video + ){ + + if( + !isset( + $video + ->searchRefinementCardRenderer + ) + ){ + + continue; + } + + $video = + $video + ->searchRefinementCardRenderer; + + array_push( + $this->out["playlist"], + [ + "title" => + $video + ->query + ->runs[0] + ->text, + "description" => null, + "author" => $author, + "date" => null, + "duration" => null, + "views" => null, + "thumb" => [ + "url" => + $video + ->thumbnail + ->thumbnails[0] + ->url, + "ratio" => "1:1" + ], + "url" => + "https://www.youtube.com" . 
+ $video + ->searchEndpoint + ->commandMetadata + ->webCommandMetadata + ->url + ] + ); + } + } + } + + foreach( + $json + ->contents + ->twoColumnSearchResultsRenderer + ->primaryContents + ->sectionListRenderer + ->contents + as $cont + ){ + + if(isset($cont->continuationItemRenderer)){ + + $this->out["npt"] = [ + "key" => + $ytconfig + ->INNERTUBE_API_KEY, + "post" => [ + "context" => + $ytconfig + ->INNERTUBE_CONTEXT, + "continuation" => + $cont + ->continuationItemRenderer + ->continuationEndpoint + ->continuationCommand + ->token + ] + ]; + break; + } + } + } + + if($this->out["npt"] !== null){ + + $this->out["npt"] = + $this->backend->store( + json_encode( + $this->out["npt"] + ), + "videos", + $proxy + ); + } + + return $this->out; + } + + private function parsevideoobject($video){ + + if(isset($video->videoRenderer)){ + + $video = $video->videoRenderer; + + $description = null; + + if(isset($video->detailedMetadataSnippets)){ + foreach( + $video + ->detailedMetadataSnippets[0] + ->snippetText + ->runs + as $description_part + ){ + + $description .= $description_part->text; + } + } + + if( + isset( + $video + ->badges[0] + ->metadataBadgeRenderer + ->icon + ->iconType + ) && + $video + ->badges[0] + ->metadataBadgeRenderer + ->icon + ->iconType + == "LIVE" + ){ + + $type = "livestream"; + $date = null; + $duration = "_LIVE"; + + if(isset($video->viewCountText->runs[0]->text)){ + + $views = + $this->views2int( + $video + ->viewCountText + ->runs[0] + ->text + ); + }else{ + + $views = null; + } + }else{ + + $type = "video"; + + if(isset($video->publishedTimeText->simpleText)){ + + $date = $this->textualdate2unix( + $video + ->publishedTimeText + ->simpleText + ); + }else{ + + $date = null; + } + + if(isset($video->lengthText->simpleText)){ + + $duration = + $this->hms2int( + $video + ->lengthText + ->simpleText + ); + }else{ + + $duration = null; + } + + if(isset($video->viewCountText->simpleText)){ + + $views = + $this->views2int( + $video + ->viewCountText 
+ ->simpleText + ); + }else{ + + $views = null; + } + } + + if( + $video + ->navigationEndpoint + ->commandMetadata + ->webCommandMetadata + ->webPageType + == "WEB_PAGE_TYPE_SHORTS" + ){ + + // haha you thought you could get me, youtube + // jokes on you i dont go outside + $type = "reel"; + } + + array_push( + $this->out[$type], + [ + "title" => + $video + ->title + ->runs[0] + ->text, + "description" => + $this->titledots($description), + "author" => [ + "name" => + $video + ->longBylineText + ->runs[0] + ->text, + "url" => + "https://www.youtube.com/channel/" . + $video + ->longBylineText + ->runs[0] + ->navigationEndpoint + ->browseEndpoint + ->browseId, + "avatar" => + $this->checkhttpspresence( + $video + ->channelThumbnailSupportedRenderers + ->channelThumbnailWithLinkRenderer + ->thumbnail + ->thumbnails[0] + ->url + ) + ], + "date" => $date, + "duration" => $duration, + "views" => $views, + "thumb" => [ + "url" => + $video + ->thumbnail + ->thumbnails[0] + ->url, + "ratio" => "16:9" + ], + "url" => + "https://www.youtube.com/watch?v=" . + $video + ->videoId + ] + ); + }elseif(isset($video->watchCardCompactVideoRenderer)){ + + $video = + $video + ->watchCardCompactVideoRenderer; + + array_push( + $this->out["video"], + [ + "title" => + $video + ->title + ->simpleText, + "description" => null, + "author" => [ + "name" => + $video + ->byline + ->runs[0] + ->text, + "url" => + "https://www.youtube.com/channel/" . 
+ $video + ->byline + ->runs[0] + ->navigationEndpoint + ->browseEndpoint + ->browseId, + "avatar" => null + ], + "date" => + $this->textualdate2unix( + trim( + explode( + "•", + $video + ->subtitle + ->simpleText, + 2 + )[1] + ) + ), + "duration" => + $this->hms2int( + $video + ->lengthText + ->simpleText + ), + "views" => + $this->truncatedcount2int( + trim( + explode( + "•", + $video + ->subtitle + ->simpleText, + 2 + )[0] + ) + ), + "thumb" => [ + "url" => + $video + ->thumbnail + ->thumbnails[0] + ->url, + "ratio" => "16:9" + ], + "url" => + "https://www.youtube.com/watch?v=" . + $video + ->navigationEndpoint + ->watchEndpoint + ->videoId + ] + ); + + }elseif(isset($video->reelShelfRenderer)){ + + foreach( + $video + ->reelShelfRenderer + ->items + as $reel + ){ + + $reel = + $reel + ->shortsLockupViewModel; + + array_push( + $this->out["reel"], + [ + "title" => + $reel + ->overlayMetadata + ->primaryText + ->content, + "description" => null, + "author" => [ + "name" => null, + "url" => null, + "avatar" => null + ], + "date" => null, + "duration" => null, + "views" => null, + "thumb" => [ + "url" => + $reel + ->thumbnail + ->sources[0] + ->url, + "ratio" => "9:16" + ], + "url" => + "https://www.youtube.com/watch?v=" . + $reel + ->onTap + ->innertubeCommand + ->reelWatchEndpoint + ->videoId + ] + ); + } + } + + elseif(isset($video->channelRenderer)){ + + $video = $video->channelRenderer; + + $description = null; + + if(isset($video->descriptionSnippet)){ + + foreach( + $video + ->descriptionSnippet + ->runs + as $description_part + ){ + + $description .= $description_part->text; + } + } + + array_push( + $this->out["author"], + [ + "title" => + $video + ->title + ->simpleText, + "followers" => + isset( + $video + ->videoCountText + ->simpleText + ) ? 
+ $this->truncatedcount2int( + $video + ->videoCountText + ->simpleText + ) : + 0, + "description" => $this->titledots($description), + "thumb" => + [ + "url" => + $this->checkhttpspresence( + $video + ->thumbnail + ->thumbnails[ + count( + $video + ->thumbnail + ->thumbnails + ) - 1 + ] + ->url + ), + "ratio" => "1:1" + ], + "url" => + "https://www.youtube.com/channel/" . + $video + ->channelId + ] + ); + } + + elseif(isset($video->shelfRenderer)){ + + if( + !is_object( + $video + ->shelfRenderer + ->content + ->verticalListRenderer + ) + ){ + return; + } + + foreach( + $video + ->shelfRenderer + ->content + ->verticalListRenderer + ->items + as $shelfvideo + ){ + + $this->parsevideoobject($shelfvideo); + } + + }elseif(isset($video->radioRenderer)){ + + $video = $video->radioRenderer; + + $description = + $video + ->videoCountText + ->runs[0] + ->text + . "."; + + $tmp = []; + foreach( + $video->videos + as $childvideo + ){ + + $tmp[] = + $childvideo + ->childVideoRenderer + ->title + ->simpleText; + } + + if(count($tmp) !== 0){ + + $description .= + " " . implode(", ", $tmp); + } + + array_push( + $this->out["playlist"], + [ + "title" => + $video + ->title + ->simpleText, + "description" => $description, + "author" => [ + "name" => + $video + ->longBylineText + ->simpleText, + "url" => null, + "avatar" => null + ], + "date" => null, + "duration" => null, + "views" => null, + "thumb" => [ + "url" => + $video + ->thumbnail + ->thumbnails[ + count( + $video + ->thumbnail + ->thumbnails + ) - 1 + ] + ->url, + "ratio" => "16:9" + ], + "url" => + "https://www.youtube.com/watch?v=" . + $video + ->videos[0] + ->childVideoRenderer + ->videoId . + "&list=" . + $video + ->playlistId . + "&start_radio=1" + ] + ); + + }elseif(isset($video->playlistRenderer)){ + + $video = $video->playlistRenderer; + + $description = $video->videoCount . 
" videos."; + + $tmp = []; + foreach( + $video + ->videos + as $childvideo + ){ + + $tmp[] = + $childvideo + ->childVideoRenderer + ->title + ->simpleText; + } + + if(count($tmp) !== 0){ + + $description .= + " " . implode(", ", $tmp); + } + + array_push( + $this->out["playlist"], + [ + "title" => + $video + ->title + ->simpleText, + "description" => $description, + "author" => [ + "name" => + $video + ->longBylineText + ->runs[0] + ->text, + "url" => + "https://www.youtube.com/channel/" . + $video + ->longBylineText + ->runs[0] + ->navigationEndpoint + ->browseEndpoint + ->browseId, + "picture" => null + ], + "date" => null, + "duration" => null, + "views" => null, + "thumb" => + [ + "url" => + $video + ->thumbnails[0] + ->thumbnails[ + count( + $video + ->thumbnails[0] + ->thumbnails + ) - 1 + ] + ->url, + "ratio" => "16:9" + ], + "url" => + "https://www.youtube.com/watch?v=" . + $video + ->videos[0] + ->childVideoRenderer + ->videoId . + "&list=" . + $video + ->playlistId . + "&start_radio=1" + ] + ); + + }/*else{ + if(!isset($video->searchPyvRenderer)){ + echo json_encode($video); + die();} + }*/ + } + + private function textualdate2unix($number){ + + $number = + explode( + " ", + str_replace( + [ + " ago", + "seconds", + "minutes", + "hours", + "days", + "weeks", + "months", + "years" + ], + [ + "", + "second", + "minute", + "hour", + "day", + "week", + "month", + "year" + ], + $number + ), + 2 + ); + + $time = 0; + switch($number[1]){ + + case "second": + $time = (int)$number[0]; + break; + + case "minute": + $time = (int)$number[0] * 60; + break; + + case "hour": + $time = (int)$number[0] * 3600; + break; + + case "day": + $time = (int)$number[0] * 86400; + break; + + case "week": + $time = (int)$number[0] * 604800; + break; + + case "month": + $time = (int)$number[0] * 2629746; + break; + + case "year": + $time = (int)$number[0] * 31556952; + break; + } + + return time() - $time; + } + + private function checkhttpspresence($link){ + + if(substr($link, 0, 2) 
== "//"){

			return "https:" . $link;
		}

		return $link;
	}

	/**
	 * Convert a textual duration (e.g. "1 hour 5 minutes") to seconds.
	 *
	 * When the text contains " - " separated segments, the second-to-last
	 * segment is the one parsed; otherwise the whole text is used.
	 *
	 * @param string $number Textual duration
	 * @return int Duration in seconds (0 when nothing matches)
	 */
	private function textualtime2int($number){

		$segments = explode(" - ", $number);
		$total = count($segments);

		$text = $total >= 2 ? $segments[$total - 2] : $segments[0];

		// drop spaces and singularise the units so one pattern matches all
		$text =
			str_replace(
				[" ", "seconds", "minutes", "hours"],
				["", "second", "minute", "hour"],
				$text
			);

		preg_match_all(
			'/([0-9]+)(second|minute|hour)/',
			$text,
			$matches,
			PREG_SET_ORDER
		);

		$factors = [
			"second" => 1,
			"minute" => 60,
			"hour" => 3600
		];

		$time = 0;

		foreach($matches as $match){

			$time += (int)$match[1] * $factors[$match[2]];
		}

		return $time;
	}

	/**
	 * Convert a view-count text such as "1,234 views" to an integer.
	 *
	 * @param string $views View-count text
	 * @return int Number before the first space, thousands separators removed
	 */
	private function views2int($views){

		$count = explode(" ", $views, 2)[0];

		return (int)str_replace(",", "", $count);
	}

	/**
	 * Convert "h:mm:ss", "m:ss" or "s" to a number of seconds.
	 *
	 * @param string $time Colon-separated duration
	 * @return int Duration in seconds
	 */
	private function hms2int($time){

		$seconds = 0;

		// each further component is worth 1/60 of the previous one
		foreach(explode(":", $time, 3) as $part){

			$seconds = ($seconds * 60) + (int)$part;
		}

		return $seconds;
	}

	/**
	 * Convert an abbreviated count such as "1.2M subscribers" to an integer.
	 *
	 * Only the text before the first space is considered. A trailing k, m
	 * or b (any case) multiplies the value by a thousand, a million or a
	 * billion. Empty or non-numeric input gives 0.
	 *
	 * @param string $number Abbreviated count
	 * @return int Expanded count
	 */
	private function truncatedcount2int($number){

		$token =
			str_replace(
				",", "",
				explode(" ", trim((string)$number), 2)[0]
			);

		// nothing to parse; also avoids indexing into an empty string
		if($token === ""){

			return 0;
		}

		$multipliers = [
			"k" => 1000,
			"m" => 1000000,
			"b" => 1000000000
		];

		$unit = strtolower(substr($token, -1));
		$multiplier = isset($multipliers[$unit]) ? $multipliers[$unit] : 1;

		// the float cast reads the leading number ("1.25M" => 1.25), so
		// decimals of any length scale correctly; round() removes float
		// noise and the result is always an integer
		return (int)round((float)$token * $multiplier);
	}

	/**
	 * Remove a trailing ellipsis ("..." or "…") and surrounding whitespace.
	 *
	 * Whitespace here means space, \n, \r, \t, vertical tab, NUL and the
	 * no-break space U+00A0.
	 *
	 * @param string|null $title Text to clean; null is treated as ""
	 * @return string Cleaned text
	 */
	private function titledots($title){

		// parsevideoobject() passes null when a result has no description
		$title = (string)$title;

		$tail = substr($title, -3);

		// "…" is 3 bytes in UTF-8, the same length as "..."
		if(
			$tail === "..." ||
			$tail === "…"
		){

			$title = substr($title, 0, -3);
		}

		// trim whole characters only: a byte list containing \xc2 and \xa0
		// would also cut into other multi-byte characters (e.g. "à", "¡")
		$trimmed =
			preg_replace(
				'/^[ \n\r\t\x0B\x00\x{A0}]+|[ \n\r\t\x0B\x00\x{A0}]+$/u',
				"",
				$title
			);

		if($trimmed === null){

			// not valid UTF-8: fall back to single-byte whitespace
			return trim($title, " \n\r\t\x0B\x00");
		}

		return $trimmed;
	}
} |