aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/startpage.php
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2025-08-11 01:55:15 +0000
committerlolcat <will@lolcat.ca>2025-08-11 01:55:15 +0000
commitcdf958d29333d448f4521f4d2faa2592b58e9b27 (patch)
tree528f2a0ffa789a6f4279d9f54a4a2aaf391f390f /scraper/startpage.php
downloadshittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.gz
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.bz2
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.zip
fix wikipedia crashgrafted
Diffstat (limited to 'scraper/startpage.php')
-rw-r--r--scraper/startpage.php1584
1 files changed, 1584 insertions, 0 deletions
diff --git a/scraper/startpage.php b/scraper/startpage.php
new file mode 100644
index 0000000..e48a429
--- /dev/null
+++ b/scraper/startpage.php
@@ -0,0 +1,1584 @@
+<?php
+
+class startpage{
+
+ public function __construct(){
+
+ include "lib/backend.php";
+ $this->backend = new backend("startpage");
+
+ include "lib/fuckhtml.php";
+ $this->fuckhtml = new fuckhtml();
+ }
+
+ public function getfilters($page){
+
+ switch($page){
+ case "web":
+ return [
+ "country" => [
+ "display" => "Country",
+ "option" => [
+ "any" => "All Regions",
+ "es_AR" => "Argentina",
+ "en_AU" => "Australia",
+ "de_AT" => "Austria",
+ "ru_BY" => "Belarus",
+ "fr_BE" => "Belgium (FR)",
+ "nl_BE" => "Belgium (NL)",
+ "bg_BG" => "Bulgaria",
+ "en_CA" => "Canada (EN)",
+ "fr_CA" => "Canada (FR)",
+ "es_CL" => "Chile",
+ "es_CO" => "Colombia",
+ "cs_CZ" => "Czech Republic",
+ "da_DK" => "Denmark",
+ "ar_EG" => "Egypt",
+ "et_EE" => "Estonia",
+ "fi_FI" => "Finland",
+ "fr_FR" => "France",
+ "de_DE" => "Germany",
+ "el_GR" => "Greece",
+ "hu_HU" => "Hungary",
+ "hi_IN" => "India (HI)",
+ "en_IN" => "India (EN)",
+ "id_ID" => "Indonesia (ID)",
+ "en_ID" => "Indonesia (EN)",
+ "en_IE" => "Ireland",
+ "it_IT" => "Italy",
+ "ja_JP" => "Japan",
+ "ko_KR" => "Korea",
+ "ms_MY" => "Malaysia (MS)",
+ "en_MY" => "Malaysia (EN)",
+ "es_MX" => "Mexico",
+ "nl_NL" => "Netherlands",
+ "en_NZ" => "New Zealand",
+ "no_NO" => "Norway",
+ "es_PE" => "Peru",
+ "fil_PH" => "Philippines (FIL)",
+ "en_PH" => "Philippines (EN)",
+ "pl_PL" => "Poland",
+ "pt_PT" => "Portugal",
+ "ro_RO" => "Romania",
+ "ru_RU" => "Russia",
+ "ms_SG" => "Singapore (MS)",
+ "en_SG" => "Singapore (EN)",
+ "es_ES" => "Spain (ES)",
+ "ca_ES" => "Spain (CA)",
+ "sv_SE" => "Sweden",
+ "de_CH" => "Switzerland (DE)",
+ "fr_CH" => "Switzerland (FR)",
+ "it_CH" => "Switzerland (IT)",
+ "tr_TR" => "Turkey",
+ "uk_UA" => "Ukraine",
+ "en_US" => "US (EN)",
+ "es_US" => "US (ES)",
+ "es_UY" => "Uruguay",
+ "es_VE" => "Venezuela",
+ "vi_VN" => "Vietnam (VI)",
+ "en_VN" => "Vietnam (EN)",
+ "en_ZA" => "South Africa"
+ ]
+ ],
+ "nsfw" => [ // qadf
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes", // qadf=none
+ "no" => "No" // qadf=heavy
+ ]
+ ],
+ "time" => [ // with_date
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "d" => "Past 24 hours",
+ "w" => "Past week",
+ "m" => "Past month",
+ "y" => "Past year",
+ ]
+ ],
+ "extendedsearch" => [
+ // undefined display, so it wont show in frontend
+ "option" => [
+ "yes" => "Yes",
+ "no" => "No"
+ ]
+ ]
+ ];
+ break;
+
+ case "images":
+ return [
+ "nsfw" => [ // qadf
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes", // qadf=none
+ "no" => "No" // qadf=heavy
+ ]
+ ],
+ "size" => [ // flimgsize
+ "display" => "Size",
+ "option" => [
+ "any" => "Any size",
+ "Small" => "Small",
+ "Medium" => "Medium",
+ "Large" => "Large",
+ "Wallpaper" => "Wallpaper",
+ // from here, image-size-select, var prefix = isz:lt,islt:
+ "qsvgs" => "Larger than 400x300",
+ "vga" => "Larger than 640x480",
+ "svga" => "Larger than 800x600",
+ "xga" => "Larger than 1024x768",
+ "qsvgs" => "Larger than 400x300",
+ "2mp" => "Larger than 2 MP (1600x1200)",
+ "4mp" => "Larger than 4 MP (2272x1704)",
+ "6mp" => "Larger than 6 MP (2816x2112)",
+ "8mp" => "Larger than 8 MP (3264x2448)",
+ "10mp" => "Larger than 10 MP (3648x2736)",
+ "12mp" => "Larger than 12 MP (4096x3072)",
+ "15mp" => "Larger than 15 MP (4480x3360)",
+ "20mp" => "Larger than 20 MP (5120x3840)",
+ "40mp" => "Larger than 40 MP (7216x5412)",
+ "70mp" => "Larger than 70 MP (9600x7200)"
+ ]
+ ],
+ "color" => [ // flimgcolor
+ "display" => "Color",
+ "option" => [
+ "any" => "Any color",
+ // from here, var prefix = ic:
+ "color" => "Color only",
+ "bnw" => "Black & white", // set to "gray"
+ // from here, var prefix = ic:specific,isc:
+ "red" => "Red",
+ "orange" => "Orange",
+ "yellow" => "Yellow",
+ "green" => "Green",
+ "teal" => "Teal",
+ "blue" => "Blue",
+ "purple" => "Purple",
+ "pink" => "Pink",
+ "white" => "White",
+ "gray" => "Gray",
+ "black" => "Black",
+ "brown" => "Brown"
+ ]
+ ],
+ "type" => [ // flimgtype
+ "display" => "Type",
+ "option" => [
+ "any" => "Any type",
+ "AnimatedGif" => "Animated GIF",
+ "Clipart" => "Clip Art",
+ "Line" => "Line Drawing",
+ "Photo" => "Photograph",
+ "Transparent" => "Transparent Background"
+ ]
+ ],
+ "license" => [ // flimglicense
+ "display" => "License",
+ "option" => [
+ "any" => "Any license",
+ "p" => "Public domain",
+ "s" => "Free to share",
+ "sc" => "Free to share commercially",
+ "m" => "Free to modify",
+ "mc" => "Free to modify commercially"
+ ]
+ ]
+ ];
+ break;
+
+ case "videos":
+ return [
+ "nsfw" => [ // qadf
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes", // qadf=none
+ "no" => "No" // qadf=heavy
+ ]
+ ],
+ "sort" => [
+ "display" => "Sort by",
+ "option" => [
+ "relevance" => "Most relevant",
+ "popular" => "Most popular",
+ "recent" => "Most recent"
+ ]
+ ],
+ "duration" => [ // with_duration
+ "display" => "Duration",
+ "option" => [
+ "any" => "Any duration",
+ "short" => "Short",
+ "medium" => "Medium",
+ "long" => "Long"
+ ]
+ ]
+ ];
+ break;
+
+ case "news":
+ return [
+ "nsfw" => [ // qadf
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes", // qadf=none
+ "no" => "No" // qadf=heavy
+ ]
+ ],
+ "time" => [ // with_date
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "d" => "Past 24 hours",
+ "w" => "Past week",
+ "m" => "Past month"
+ ]
+ ]
+ ];
+ break;
+
+ //preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEazerbaijaniN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:21:58 GMT; Secure; Path=/
+ //preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius; Domain=startpage.com; Expires=Mon, 28 Oct 2024 20:22:52 GMT; Secure; Path=/
+ }
+ }
+
+ private function get($proxy, $url, $get = [], $post = false, $is_xhr = false){
+
+ $curlproc = curl_init();
+
+ if($post === true){
+
+ curl_setopt($curlproc, CURLOPT_POST, true);
+ curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
+
+ }elseif($get !== []){
+
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
+
+ curl_setopt($curlproc, CURLOPT_URL, $url);
+
+ // http2 bypass
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+ if($is_xhr === true){
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: application/json",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "Referer: https://www.startpage.com/",
+ "Content-Type: application/json",
+ "Content-Length: " . strlen($get),
+ "Origin: https://www.startpage.com/",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius",
+ "Sec-Fetch-Dest: empty",
+ "Sec-Fetch-Mode: cors",
+ "Sec-Fetch-Site: same-origin",
+ "TE: trailers"]
+ );
+
+ }elseif($post === true){
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "Referer: https://www.startpage.com/",
+ "Content-Type: application/x-www-form-urlencoded",
+ "Content-Length: " . strlen($get),
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius",
+ "Upgrade-Insecure-Requests: 1",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: none",
+ "Sec-Fetch-User: ?1",
+ "Priority: u=0, i",
+ "TE: trailers"]
+ );
+ }else{
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Cookie: preferences=date_timeEEEworldN1Ndisable_family_filterEEE1N1Ndisable_open_in_new_windowEEE0N1Nenable_post_methodEEE1N1Nenable_proxy_safety_suggestEEE0N1Nenable_stay_controlEEE0N1Ninstant_answersEEE1N1Nlang_homepageEEEs%2Fdevice%2FenN1NlanguageEEEenglishN1Nlanguage_uiEEEenglishN1Nnum_of_resultsEEE20N1Nsearch_results_regionEEEallN1NsuggestionsEEE1N1Nwt_unitEEEcelsius",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: none",
+ "Sec-Fetch-User: ?1",
+ "Priority: u=0, i",
+ "TE: trailers"]
+ );
+ }
+
+ curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+ curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+ $this->backend->assign_proxy($curlproc, $proxy);
+
+ $data = curl_exec($curlproc);
+
+ if(curl_errno($curlproc)){
+
+ throw new Exception(curl_error($curlproc));
+ }
+
+ curl_close($curlproc);
+ return $data;
+ }
+
+ public function web($get){
+
+ if($get["npt"]){
+
+ [$post, $proxy] = $this->backend->get($get["npt"], "web");
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $post,
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+
+ $get_instant_answer = false;
+
+ }else{
+
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "query" => $get["s"],
+ "cat" => "web",
+ "pl" => "opensearch"
+ ];
+
+ if($get["nsfw"] == "no"){
+
+ $params["qadf"] = "heavy";
+ $get_instant_answer = false;
+ }else{
+
+ $get_instant_answer = true;
+ }
+
+ if($get["country"] !== "any"){
+
+ $params["qsr"] = $get["country"];
+ }
+
+ if($get["time"] !== "any"){
+
+ $params["with_date"] = $get["time"];
+ }
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+
+ //$html = file_get_contents("scraper/startpage.html");
+ }
+
+ $this->detect_captcha($html);
+
+ if(
+ preg_match(
+ '/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),?$/m',
+ $html,
+ $matches
+ ) === 0
+ ){
+
+ throw new Exception("Failed to grep JSON object");
+ }
+
+ $json = json_decode($matches[1], true);
+
+ if($json === null){
+
+ throw new Exception("Failed to decode JSON");
+ }
+
+ //print_r($json);
+
+ $out = [
+ "status" => "ok",
+ "spelling" => [
+ "type" => "no_correction",
+ "using" => null,
+ "correction" => null
+ ],
+ "npt" => null,
+ "answer" => [],
+ "web" => [],
+ "image" => [],
+ "video" => [],
+ "news" => [],
+ "related" => []
+ ];
+
+ // get npt
+ $out["npt"] = $this->parse_npt($json, "web", $proxy);
+
+ foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){
+
+ if(!isset($category["display_type"])){
+
+ continue;
+ }
+
+ switch($category["display_type"]){
+
+ case "web-google":
+ foreach($category["results"] as $result){
+
+ $sublinks = [];
+
+ foreach($result["siteLinks"] as $sublink){
+
+ $sublinks[] = [
+ "title" => $sublink["title"],
+ "description" => null,
+ "url" => $sublink["clickUrl"]
+ ];
+ }
+
+ $description =
+ explode(
+ "...",
+ $this->titledots(
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $result["description"]
+ )
+ )
+ ),
+ 2
+ );
+
+ $date = strtotime(trim($description[0]));
+
+ if(
+ $date === false ||
+ count($description) !== 2 ||
+ strlen($description[0]) > 14
+ ){
+
+ // no date found
+ $description =
+ implode(
+ " ... ",
+ $description
+ );
+
+ $date = null;
+ }else{
+
+ // date found
+ $description = ltrim($description[1]);
+ }
+
+ $out["web"][] = [
+ "title" =>
+ $this->titledots(
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $result["title"]
+ )
+ )
+ ),
+ "description" => $description,
+ "url" => $result["clickUrl"],
+ "date" => $date,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => $sublinks,
+ "table" => []
+ ];
+ }
+ break;
+
+ case "images-qi-top":
+ foreach($category["results"] as $result){
+
+ $out["image"][] = [
+ "title" =>
+ $this->titledots(
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $result["title"]
+ )
+ )
+ ),
+ "source" => [
+ [
+ "url" => $result["rawImageUrl"],
+ "width" => (int)$result["width"],
+ "height" => (int)$result["height"]
+ ],
+ [
+ "url" => $this->unshitimage($result["mdThumbnailUrl"]),
+ "width" => (int)$result["mdThumbnailWidth"],
+ "height" => (int)$result["mdThumbnailHeight"]
+ ]
+ ],
+ "url" =>
+ $result["altClickUrl"]
+ ];
+ }
+ break;
+
+ case "spellsuggest-google":
+ $out["spelling"] =
+ [
+ "type" => "including",
+ "using" => $json["render"]["query"],
+ "correction" => $category["results"][0]["query"]
+ ];
+ break;
+
+ case "dictionary-qi":
+ foreach($category["results"] as $result){
+
+ $answer = [
+ "title" => $result["word"],
+ "description" => [],
+ "url" => null,
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+
+ foreach($result["lexical_categories"] as $lexic_type => $definitions){
+
+ $answer["description"][] = [
+ "type" => "title",
+ "value" => $lexic_type
+ ];
+
+ $i = 0;
+
+ foreach($definitions as $definition){
+
+ $text_definition = trim($definition["definition"]);
+ $text_example = trim($definition["example"]);
+ $text_synonyms = implode(", ", $definition["synonyms"]);
+
+ if($text_definition != ""){
+
+ $i++;
+
+ $c = count($answer["description"]) - 1;
+ if(
+ $c !== 0 &&
+ $answer["description"][$c]["type"] == "text"
+ ){
+
+ $answer["description"][$c]["value"] .=
+ "\n\n" . $i . ". " . $text_definition;
+
+ }else{
+
+ $answer["description"][] = [
+ "type" => "text",
+ "value" => $i . ". " . $text_definition
+ ];
+ }
+ }
+
+ if($text_example != ""){
+
+ $answer["description"][] = [
+ "type" => "quote",
+ "value" => $text_example
+ ];
+ }
+
+ if($text_synonyms != ""){
+
+ $answer["description"][] = [
+ "type" => "text",
+ "value" => "Synonyms: " . $text_synonyms
+ ];
+ }
+ }
+ }
+
+ $out["answer"][] = $answer;
+ }
+ break;
+ }
+ }
+
+ // parse instant answers
+ if(
+ $get["extendedsearch"] == "yes" &&
+ $get_instant_answer === true
+ ){
+
+ // https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1
+ try{
+ $post = [
+ "se" => "n0vze2y9dqwy",
+ "q" => $json["render"]["query"],
+ "results" => [], // populate
+ "enableKnowledgePanel" => true,
+ "enableMediaThumbBar" => false,
+ "enableSearchSuggestions" => false,
+ "enableTripadvisorProperties" => [],
+ "enableTripadvisorPlaces" => [],
+ "enableTripadvisorPlacesForLocations" => [],
+ "enableWebProducts" => false,
+ "tripadvisorPartnerId" => null,
+ "tripadvisorMapColorMode" => "light",
+ "tripadvisorDisablesKnowledgePanel" => false,
+ "instantAnswers" => [
+ "smartAnswers",
+ "youtube",
+ "tripadvisor"
+ ],
+ "iaType" => null,
+ "forceEnhancedKnowledgePanel" => false,
+ "shoppingOnly" => false,
+ "allowAdultProducts" => true,
+ "lang" => "en",
+ "browserLang" => "en-US",
+ "browserTimezone" => "America/New_York",
+ "market" => null,
+ "userLocation" => null,
+ "userDate" => date("Y-m-d"),
+ "userAgentType" => "unknown"
+ ];
+
+ foreach($out["web"] as $result){
+
+ $post["results"][] = [
+ "url" => $result["url"],
+ "title" => $result["title"]
+ ];
+ }
+
+ $post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE);
+
+ $additional_data =
+ $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . "&sr=1",
+ $post,
+ true,
+ true
+ );
+
+ $additional_data = json_decode($additional_data, true);
+
+ if($additional_data === null){
+
+ throw new Exception("Failed to decode JSON"); // just break out, dont fail completely
+ }
+
+ if(!isset($additional_data["knowledgePanel"])){
+
+ throw new Exception("Response has missing data (knowledgePanel)");
+ }
+
+ $additional_data = $additional_data["knowledgePanel"];
+
+ $answer = [
+ "title" => $additional_data["meta"]["title"],
+ "description" => [
+ [
+ "type" => "quote",
+ "value" => $additional_data["meta"]["description"]
+ ]
+ ],
+ "url" => $additional_data["meta"]["origWikiUrl"],
+ "thumb" => $additional_data["meta"]["image"],
+ "table" => [],
+ "sublink" => []
+ ];
+
+ // parse html for instant answer
+ $this->fuckhtml->load($additional_data["html"]);
+
+ $div =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "div"
+ );
+
+ // get description
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "sx-kp-short-extract sx-kp-short-extract-complete",
+ $div
+ );
+
+ if(count($description) !== 0){
+
+ $answer["description"][] = [
+ "type" => "text",
+ "value" =>
+ html_entity_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
+ )
+ ];
+ }
+
+ // get socials
+ $socials =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "sx-wiki-social-link",
+ "a"
+ );
+
+ foreach($socials as $social){
+
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $social["attributes"]["title"]
+ );
+
+ $url =
+ $this->fuckhtml
+ ->getTextContent(
+ $social["attributes"]["href"]
+ );
+
+ switch($title){
+
+ case "Official Website":
+ $title = "Website";
+ break;
+ }
+
+ $answer["sublink"][$title] = $url;
+ }
+
+ // get videos
+ $videos =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "sx-kp-video-grid-item",
+ $div
+ );
+
+ foreach($videos as $video){
+
+ $this->fuckhtml->load($video);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) === 0){
+
+ // ?? invalid
+ continue;
+ }
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "data-sx-src",
+ "img"
+ );
+
+ if(count($image) !== 0){
+
+ $thumb = [
+ "ratio" => "16:9",
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $image[0]["attributes"]["data-sx-src"]
+ )
+ ];
+ }else{
+
+ $thumb = [
+ "ratio" => null,
+ "url" => null
+ ];
+ }
+
+ $out["video"][] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]["attributes"]["title"]
+ ),
+ "description" => null,
+ "date" => null,
+ "duration" => null,
+ "views" => null,
+ "thumb" => $thumb,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]["attributes"]["href"]
+ )
+ ];
+ }
+
+ // reset
+ $this->fuckhtml->load($additional_data["html"]);
+
+ // get table elements
+ $table =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "sx-infobox",
+ "table"
+ );
+
+ if(count($table) !== 0){
+
+ $trs =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "tr"
+ );
+
+ foreach($trs as $tr){
+
+ $this->fuckhtml->load($tr);
+
+ // ok so startpage devs cant fucking code a table
+ // td = content
+ // th (AAAHH) = title
+ $tds =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "td"
+ );
+
+ $ths =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "th"
+ );
+
+ if(
+ count($ths) === 1 &&
+ count($tds) === 1
+ ){
+
+ $title =
+ $this->fuckhtml
+ ->getTextContent(
+ $ths[0]
+ );
+
+ $description = [];
+
+ $this->fuckhtml->load($tds[0]);
+
+ $lis =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "li"
+ );
+
+ if(count($lis) !== 0){
+
+ foreach($lis as $li){
+
+ $description[] =
+ $this->fuckhtml
+ ->getTextContent(
+ $li
+ );
+ }
+
+ $description = implode(", ", $description);
+ }else{
+
+ $description =
+ $this->fuckhtml
+ ->getTextContent(
+ $tds[0]
+ );
+ }
+
+ $answer["table"][$title] = $description;
+ }
+ }
+ }
+
+ $out["answer"][] = $answer;
+
+ }catch(Exception $error){
+
+ // do nothing
+ //echo "error!";
+ }
+ }
+
+ return $out;
+ }
+
+ public function image($get){
+
+ if($get["npt"]){
+
+ [$post, $proxy] = $this->backend->get($get["npt"], "images");
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $post,
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+
+ }else{
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ try{
+
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "query" => $get["s"],
+ "cat" => "images",
+ "pl" => "opensearch"
+ ];
+
+ if($get["nsfw"] == "no"){
+
+ $params["qadf"] = "heavy";
+ }
+
+ if($get["size"] != "any"){
+
+ if(
+ $get["size"] == "Small" ||
+ $get["size"] == "Medium" ||
+ $get["size"] == "Large" ||
+ $get["size"] == "Wallpaper"
+ ){
+
+ $params["flimgsize"] = $get["size"];
+ }else{
+
+ $params["image-size-select"] = "isz:lt,islt:" . $get["size"];
+ }
+ }
+
+ if($get["color"] != "any"){
+
+ if($get["color"] == "color"){
+
+ $params["flimgcolor"] = "ic:color";
+ }elseif($get["color"] == "bnw"){
+
+ $params["flimgcolor"] = "ic:gray";
+ }else{
+
+ $params["flimgcolor"] = "ic:specific,isc:" . $get["color"];
+ }
+ }
+
+ if($get["type"] != "any"){
+
+ $params["flimgtype"] = $get["type"];
+ }
+
+ if($get["license"] != "any"){
+
+ $params["flimglicense"] = $get["license"];
+ }
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+ //$html = file_get_contents("scraper/startpage.html");
+
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+ }
+
+ $this->detect_captcha($html);
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "image" => []
+ ];
+
+ if(
+ preg_match(
+ '/React\.createElement\(UIStartpage\.AppSerpImages, ?(.+)\),?$/m',
+ $html,
+ $matches
+ ) === 0
+ ){
+
+ throw new Exception("Failed to grep JSON object");
+ }
+
+ $json = json_decode($matches[1], true);
+
+ if($json === null){
+
+ throw new Exception("Failed to decode JSON object");
+ }
+
+ // get npt
+ $out["npt"] = $this->parse_npt($json, "images", $proxy);
+
+ // get images
+ foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){
+
+ if($category["display_type"] != "images-bing"){
+
+ // ignore ads and !! suggestions !! @todo
+ continue;
+ }
+
+ foreach($category["results"] as $image){
+
+ $out["image"][] = [
+ "title" => $this->titledots($image["title"]),
+ "source" => [
+ [
+ "url" => $this->unshitimage($image["clickUrl"]),
+ "width" => (int)$image["width"],
+ "height" => (int)$image["height"]
+ ],
+ [
+ "url" => $this->unshitimage($image["thumbnailUrl"]),
+ "width" => (int)$image["thumbnailWidth"],
+ "height" => (int)$image["thumbnailHeight"]
+ ]
+ ],
+ "url" => $image["altClickUrl"]
+ ];
+ }
+ }
+
+ return $out;
+ }
+
+ public function video($get){
+
+ if($get["npt"]){
+
+ [$post, $proxy] = $this->backend->get($get["npt"], "videos");
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $post,
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+
+ }else{
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ try{
+
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "query" => $get["s"],
+ "cat" => "video",
+ "pl" => "opensearch"
+ ];
+
+ if($get["nsfw"] == "no"){
+
+ $params["qadf"] = "heavy";
+ }
+
+ if($get["sort"] != "relevance"){
+
+ $params["sort_by"] = $get["sort"];
+ }
+
+ if($get["duration"] != "any"){
+
+ $params["with_duration"] = $get["duration"];
+ }
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+ //$html = file_get_contents("scraper/startpage.html");
+
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+ }
+
+ $this->detect_captcha($html);
+
+ if(
+ preg_match(
+ '/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),?$/m',
+ $html,
+ $matches
+ ) === 0
+ ){
+
+ throw new Exception("Failed to get JSON object");
+ }
+
+ $json = json_decode($matches[1], true);
+
+ if($json === null){
+
+ throw new Exception("Failed to decode JSON object");
+ }
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "video" => [],
+ "author" => [],
+ "livestream" => [],
+ "playlist" => [],
+ "reel" => []
+ ];
+
+ // get npt
+ $out["npt"] = $this->parse_npt($json, "video", $proxy);
+
+ // get results
+ foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){
+
+ if(
+ preg_match(
+ '/^video-/i',
+ $category["display_type"]
+ )
+ ){
+
+ foreach($category["results"] as $video){
+
+ if(
+ isset($video["thumbnailUrl"]) &&
+ $video["thumbnailUrl"] !== null
+ ){
+
+ $thumb = [
+ "ratio" => "16:9",
+ "url" => $this->unshitimage($video["thumbnailUrl"])
+ ];
+ }else{
+
+ $thumb = [
+ "ratio" => null,
+ "url" => null
+ ];
+ }
+
+ $out["video"][] = [
+ "title" => str_replace(["", ""], "", $video["title"]),
+ "description" => $this->limitstrlen($video["description"]),
+ "author" => [
+ "name" => $video["channelTitle"],
+ "url" => null,
+ "avatar" => null
+ ],
+ "date" => strtotime($video["publishDate"]),
+ "duration" => $this->hms2int($category["display_type"] == "video-youtube" ? $video["duration"] : $video["duration"] / 1000),
+ "views" => (int)$video["viewCount"],
+ "thumb" => $thumb,
+ "url" => $video["clickUrl"]
+ ];
+ }
+ }
+ }
+
+ return $out;
+ }
+
+ public function news($get){
+
+ if($get["npt"]){
+
+ [$post, $proxy] = $this->backend->get($get["npt"], "news");
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $post,
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+
+ }else{
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ try{
+
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "query" => $get["s"],
+ "cat" => "news",
+ "pl" => "opensearch"
+ ];
+
+ if($get["nsfw"] == "no"){
+
+ $params["qadf"] = "heavy";
+ }
+
+ if($get["time"] != "any"){
+
+ $params["with_date"] = $get["time"];
+ }
+
+ try{
+ $html = $this->get(
+ $proxy,
+ "https://www.startpage.com/sp/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+ //$html = file_get_contents("scraper/startpage.html");
+
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
+ }
+
+ $this->detect_captcha($html);
+
+ if(
+ preg_match(
+ '/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),?$/m',
+ $html,
+ $matches
+ ) === 0
+ ){
+
+ throw new Exception("Failed to get JSON object");
+ }
+
+ $json = json_decode($matches[1], true);
+
+ if($json === null){
+
+ throw new Exception("Failed to decode JSON object");
+ }
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "news" => []
+ ];
+
+ // get npt
+ $out["npt"] = $this->parse_npt($json, "news", $proxy);
+
+ foreach($json["render"]["presenter"]["regions"]["mainline"] as $category){
+
+ if($category["display_type"] != "news-bing"){
+
+ // unsupported category
+ continue;
+ }
+
+ foreach($category["results"] as $news){
+
+ if(
+ isset($news["thumbnailUrl"]) &&
+ $news["thumbnailUrl"] !== null
+ ){
+
+ $thumb = [
+ "ratio" => "16:9",
+ "url" => $this->unshitimage($news["thumbnailUrl"])
+ ];
+ }else{
+
+ $thumb = [
+ "ratio" => null,
+ "url" => null
+ ];
+ }
+
+ $out["news"][] = [
+ "title" => $this->titledots($this->remove_penguins($news["title"])),
+ "author" => $news["source"],
+ "description" => $this->titledots($this->remove_penguins($news["description"])),
+ "date" => (int)substr((string)$news["date"], 0, -3),
+ "thumb" => $thumb,
+ "url" => $news["clickUrl"]
+ ];
+ }
+ }
+
+ return $out;
+ }
+
+ private function parse_npt($json, $pagetype, $proxy){
+
+ foreach($json["render"]["presenter"]["pagination"]["pages"] as $page){
+
+ if($page["name"] == "Next"){
+
+ parse_str(
+ explode(
+ "?",
+ $page["url"],
+ 2
+ )[1],
+ $str
+ );
+
+ return
+ $this->backend->store(
+ http_build_query(
+ [
+ "lui" => "english",
+ "language" => "english",
+ "query" => $str["q"],
+ "cat" => $pagetype,
+ "sc" => $str["sc"],
+ "t" => "device",
+ "segment" => "startpage.udog",
+ "page" => $str["page"]
+ ]
+ ),
+ $pagetype,
+ $proxy
+ );
+
+ break;
+ }
+ }
+
+ return null;
+ }
+
+ private function unshitimage($url){
+
+ $query = parse_url($url, PHP_URL_QUERY);
+ parse_str($query, $query);
+
+ if(isset($query["piurl"])){
+
+ if(strpos($query["piurl"], "gstatic.com/")){
+
+ return
+ explode(
+ "&",
+ $query["piurl"],
+ 2
+ )[0];
+ }
+
+ if(
+ strpos($query["piurl"], "bing.net/") ||
+ strpos($query["piurl"], "bing.com/")
+ ){
+
+ return
+ explode(
+ "&",
+ $query["piurl"],
+ 2
+ )[0];
+ }
+
+ return $query["piurl"];
+ }
+
+ return $url;
+ }
+
+ private function limitstrlen($text){
+
+ return
+ explode(
+ "\n",
+ wordwrap(
+ str_replace(
+ ["\n\r", "\r\n", "\n", "\r"],
+ " ",
+ $text
+ ),
+ 300,
+ "\n"
+ ),
+ 2
+ )[0];
+ }
+
+ private function titledots($title){
+
+ return trim($title, " .\t\n\r\0\x0B…");
+ }
+
+ private function hms2int($time){
+
+ $parts = explode(":", $time, 3);
+ $time = 0;
+
+ if(count($parts) === 3){
+
+ // hours
+ $time = $time + ((int)$parts[0] * 3600);
+ array_shift($parts);
+ }
+
+ if(count($parts) === 2){
+
+ // minutes
+ $time = $time + ((int)$parts[0] * 60);
+ array_shift($parts);
+ }
+
+ // seconds
+ $time = $time + (int)$parts[0];
+
+ return $time;
+ }
+
+ private function remove_penguins($text){
+
+ return str_replace(
+ ["", ""],
+ "",
+ $text
+ );
+ }
+
+ private function detect_captcha($html){
+
+ $this->fuckhtml->load($html);
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "title"
+ );
+
+ if(
+ count($title) !== 0 &&
+ $title[0]["innerHTML"] == "Redirecting..."
+ ){
+
+ // check if it's a captcha
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ foreach($as as $a){
+
+ if(
+ strpos(
+ $this->fuckhtml
+ ->getTextContent(
+ $a["innerHTML"]
+ ),
+ "https://www.startpage.com/sp/captcha"
+ ) !== false
+ ){
+
+ throw new Exception("Startpage returned a captcha");
+ }
+ }
+
+ throw new Exception("Startpage redirected the scraper to an unhandled page");
+ }
+ }
+}