aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/google.php
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2025-08-11 01:55:15 +0000
committerlolcat <will@lolcat.ca>2025-08-11 01:55:15 +0000
commitcdf958d29333d448f4521f4d2faa2592b58e9b27 (patch)
tree528f2a0ffa789a6f4279d9f54a4a2aaf391f390f /scraper/google.php
downloadshittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.gz
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.bz2
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.zip
fix wikipedia crashgrafted
Diffstat (limited to 'scraper/google.php')
-rw-r--r--scraper/google.php2989
1 files changed, 2989 insertions, 0 deletions
diff --git a/scraper/google.php b/scraper/google.php
new file mode 100644
index 0000000..0c73ea0
--- /dev/null
+++ b/scraper/google.php
@@ -0,0 +1,2989 @@
+<?php
+
+// @TODO check for consent.google.com page, if need be
+
+class google{
+
+ public function __construct(){
+
+ include "lib/fuckhtml.php";
+ $this->fuckhtml = new fuckhtml();
+
+ include "lib/backend.php";
+ $this->backend = new backend("google");
+ }
+
+ public function getfilters($page){
+
+ $base = [
+ "country" => [ // gl=<country> (image: cr=countryAF)
+ "display" => "Country",
+ "option" => [
+ "any" => "Instance's country",
+ "af" => "Afghanistan",
+ "al" => "Albania",
+ "dz" => "Algeria",
+ "as" => "American Samoa",
+ "ad" => "Andorra",
+ "ao" => "Angola",
+ "ai" => "Anguilla",
+ "aq" => "Antarctica",
+ "ag" => "Antigua and Barbuda",
+ "ar" => "Argentina",
+ "am" => "Armenia",
+ "aw" => "Aruba",
+ "au" => "Australia",
+ "at" => "Austria",
+ "az" => "Azerbaijan",
+ "bs" => "Bahamas",
+ "bh" => "Bahrain",
+ "bd" => "Bangladesh",
+ "bb" => "Barbados",
+ "by" => "Belarus",
+ "be" => "Belgium",
+ "bz" => "Belize",
+ "bj" => "Benin",
+ "bm" => "Bermuda",
+ "bt" => "Bhutan",
+ "bo" => "Bolivia",
+ "ba" => "Bosnia and Herzegovina",
+ "bw" => "Botswana",
+ "bv" => "Bouvet Island",
+ "br" => "Brazil",
+ "io" => "British Indian Ocean Territory",
+ "bn" => "Brunei Darussalam",
+ "bg" => "Bulgaria",
+ "bf" => "Burkina Faso",
+ "bi" => "Burundi",
+ "kh" => "Cambodia",
+ "cm" => "Cameroon",
+ "ca" => "Canada",
+ "cv" => "Cape Verde",
+ "ky" => "Cayman Islands",
+ "cf" => "Central African Republic",
+ "td" => "Chad",
+ "cl" => "Chile",
+ "cn" => "China",
+ "cx" => "Christmas Island",
+ "cc" => "Cocos (Keeling) Islands",
+ "co" => "Colombia",
+ "km" => "Comoros",
+ "cg" => "Congo",
+ "cd" => "Congo, the Democratic Republic",
+ "ck" => "Cook Islands",
+ "cr" => "Costa Rica",
+ "ci" => "Cote D'ivoire",
+ "hr" => "Croatia",
+ "cu" => "Cuba",
+ "cy" => "Cyprus",
+ "cz" => "Czech Republic",
+ "dk" => "Denmark",
+ "dj" => "Djibouti",
+ "dm" => "Dominica",
+ "do" => "Dominican Republic",
+ "ec" => "Ecuador",
+ "eg" => "Egypt",
+ "sv" => "El Salvador",
+ "gq" => "Equatorial Guinea",
+ "er" => "Eritrea",
+ "ee" => "Estonia",
+ "et" => "Ethiopia",
+ "fk" => "Falkland Islands (Malvinas)",
+ "fo" => "Faroe Islands",
+ "fj" => "Fiji",
+ "fi" => "Finland",
+ "fr" => "France",
+ "gf" => "French Guiana",
+ "pf" => "French Polynesia",
+ "tf" => "French Southern Territories",
+ "ga" => "Gabon",
+ "gm" => "Gambia",
+ "ge" => "Georgia",
+ "de" => "Germany",
+ "gh" => "Ghana",
+ "gi" => "Gibraltar",
+ "gr" => "Greece",
+ "gl" => "Greenland",
+ "gd" => "Grenada",
+ "gp" => "Guadeloupe",
+ "gu" => "Guam",
+ "gt" => "Guatemala",
+ "gn" => "Guinea",
+ "gw" => "Guinea-Bissau",
+ "gy" => "Guyana",
+ "ht" => "Haiti",
+ "hm" => "Heard Island and Mcdonald Islands",
+ "va" => "Holy See (Vatican City State)",
+ "hn" => "Honduras",
+ "hk" => "Hong Kong",
+ "hu" => "Hungary",
+ "is" => "Iceland",
+ "in" => "India",
+ "id" => "Indonesia",
+ "ir" => "Iran, Islamic Republic",
+ "iq" => "Iraq",
+ "ie" => "Ireland",
+ "il" => "Israel",
+ "it" => "Italy",
+ "jm" => "Jamaica",
+ "jp" => "Japan",
+ "jo" => "Jordan",
+ "kz" => "Kazakhstan",
+ "ke" => "Kenya",
+ "ki" => "Kiribati",
+ "kp" => "Korea, Democratic People's Republic",
+ "kr" => "Korea, Republic",
+ "kw" => "Kuwait",
+ "kg" => "Kyrgyzstan",
+ "la" => "Lao People's Democratic Republic",
+ "lv" => "Latvia",
+ "lb" => "Lebanon",
+ "ls" => "Lesotho",
+ "lr" => "Liberia",
+ "ly" => "Libyan Arab Jamahiriya",
+ "li" => "Liechtenstein",
+ "lt" => "Lithuania",
+ "lu" => "Luxembourg",
+ "mo" => "Macao",
+ "mk" => "Macedonia, the Former Yugosalv Republic",
+ "mg" => "Madagascar",
+ "mw" => "Malawi",
+ "my" => "Malaysia",
+ "mv" => "Maldives",
+ "ml" => "Mali",
+ "mt" => "Malta",
+ "mh" => "Marshall Islands",
+ "mq" => "Martinique",
+ "mr" => "Mauritania",
+ "mu" => "Mauritius",
+ "yt" => "Mayotte",
+ "mx" => "Mexico",
+ "fm" => "Micronesia, Federated States",
+ "md" => "Moldova, Republic",
+ "mc" => "Monaco",
+ "mn" => "Mongolia",
+ "ms" => "Montserrat",
+ "ma" => "Morocco",
+ "mz" => "Mozambique",
+ "mm" => "Myanmar",
+ "na" => "Namibia",
+ "nr" => "Nauru",
+ "np" => "Nepal",
+ "nl" => "Netherlands",
+ "an" => "Netherlands Antilles",
+ "nc" => "New Caledonia",
+ "nz" => "New Zealand",
+ "ni" => "Nicaragua",
+ "ne" => "Niger",
+ "ng" => "Nigeria",
+ "nu" => "Niue",
+ "nf" => "Norfolk Island",
+ "mp" => "Northern Mariana Islands",
+ "no" => "Norway",
+ "om" => "Oman",
+ "pk" => "Pakistan",
+ "pw" => "Palau",
+ "ps" => "Palestinian Territory, Occupied",
+ "pa" => "Panama",
+ "pg" => "Papua New Guinea",
+ "py" => "Paraguay",
+ "pe" => "Peru",
+ "ph" => "Philippines",
+ "pn" => "Pitcairn",
+ "pl" => "Poland",
+ "pt" => "Portugal",
+ "pr" => "Puerto Rico",
+ "qa" => "Qatar",
+ "re" => "Reunion",
+ "ro" => "Romania",
+ "ru" => "Russian Federation",
+ "rw" => "Rwanda",
+ "sh" => "Saint Helena",
+ "kn" => "Saint Kitts and Nevis",
+ "lc" => "Saint Lucia",
+ "pm" => "Saint Pierre and Miquelon",
+ "vc" => "Saint Vincent and the Grenadines",
+ "ws" => "Samoa",
+ "sm" => "San Marino",
+ "st" => "Sao Tome and Principe",
+ "sa" => "Saudi Arabia",
+ "sn" => "Senegal",
+ "cs" => "Serbia and Montenegro",
+ "sc" => "Seychelles",
+ "sl" => "Sierra Leone",
+ "sg" => "Singapore",
+ "sk" => "Slovakia",
+ "si" => "Slovenia",
+ "sb" => "Solomon Islands",
+ "so" => "Somalia",
+ "za" => "South Africa",
+ "gs" => "South Georgia and the South Sandwich Islands",
+ "es" => "Spain",
+ "lk" => "Sri Lanka",
+ "sd" => "Sudan",
+ "sr" => "Suriname",
+ "sj" => "Svalbard and Jan Mayen",
+ "sz" => "Swaziland",
+ "se" => "Sweden",
+ "ch" => "Switzerland",
+ "sy" => "Syrian Arab Republic",
+ "tw" => "Taiwan, Province of China",
+ "tj" => "Tajikistan",
+ "tz" => "Tanzania, United Republic",
+ "th" => "Thailand",
+ "tl" => "Timor-Leste",
+ "tg" => "Togo",
+ "tk" => "Tokelau",
+ "to" => "Tonga",
+ "tt" => "Trinidad and Tobago",
+ "tn" => "Tunisia",
+ "tr" => "Turkey",
+ "tm" => "Turkmenistan",
+ "tc" => "Turks and Caicos Islands",
+ "tv" => "Tuvalu",
+ "ug" => "Uganda",
+ "ua" => "Ukraine",
+ "ae" => "United Arab Emirates",
+ "uk" => "United Kingdom",
+ "us" => "United States",
+ "um" => "United States Minor Outlying Islands",
+ "uy" => "Uruguay",
+ "uz" => "Uzbekistan",
+ "vu" => "Vanuatu",
+ "ve" => "Venezuela",
+ "vn" => "Viet Nam",
+ "vg" => "Virgin Islands, British",
+ "vi" => "Virgin Islands, U.S.",
+ "wf" => "Wallis and Futuna",
+ "eh" => "Western Sahara",
+ "ye" => "Yemen",
+ "zm" => "Zambia",
+ "zw" => "Zimbabwe"
+ ]
+ ],
+ "nsfw" => [
+ "display" => "NSFW",
+ "option" => [
+ "yes" => "Yes", // safe=active
+ "no" => "No" // safe=off
+ ]
+ ]
+ ];
+
+ switch($page){
+
+ case "web":
+ return array_merge(
+ $base,
+ [
+ "lang" => [ // lr=<lang> (prefix lang with "lang_")
+ "display" => "Language",
+ "option" => [
+ "any" => "Any language",
+ "ar" => "Arabic",
+ "bg" => "Bulgarian",
+ "ca" => "Catalan",
+ "cs" => "Czech",
+ "da" => "Danish",
+ "de" => "German",
+ "el" => "Greek",
+ "en" => "English",
+ "es" => "Spanish",
+ "et" => "Estonian",
+ "fi" => "Finnish",
+ "fr" => "French",
+ "hr" => "Croatian",
+ "hu" => "Hungarian",
+ "id" => "Indonesian",
+ "is" => "Icelandic",
+ "it" => "Italian",
+ "iw" => "Hebrew",
+ "ja" => "Japanese",
+ "ko" => "Korean",
+ "lt" => "Lithuanian",
+ "lv" => "Latvian",
+ "nl" => "Dutch",
+ "no" => "Norwegian",
+ "pl" => "Polish",
+ "pt" => "Portuguese",
+ "ro" => "Romanian",
+ "ru" => "Russian",
+ "sk" => "Slovak",
+ "sl" => "Slovenian",
+ "sr" => "Serbian",
+ "sv" => "Swedish",
+ "tr" => "Turkish",
+ "zh-CN" => "Chinese (Simplified)",
+ "zh-TW" => "Chinese (Traditional)"
+ ]
+ ],
+ "newer" => [ // tbs
+ "display" => "Newer than",
+ "option" => "_DATE"
+ ],
+ "older" => [
+ "display" => "Older than",
+ "option" => "_DATE"
+ ],
+ "spellcheck" => [
+ "display" => "Spellcheck",
+ "option" => [
+ "yes" => "Yes",
+ "no" => "No"
+ ]
+ ]
+ ]
+ );
+ break;
+
+ case "images":
+ return array_merge(
+ $base,
+ [
+ "time" => [ // tbs=qdr:<time>
+ "display" => "Time posted",
+ "option" => [
+ "any" => "Any time",
+ "d" => "Past 24 hours",
+ "w" => "Past week",
+ "m" => "Past month",
+ "y" => "Past year"
+ ]
+ ],
+ "size" => [ // imgsz
+ "display" => "Size",
+ "option" => [
+ "any" => "Any size",
+ "l" => "Large",
+ "m" => "Medium",
+ "i" => "Icon",
+ "qsvga" => "Larger than 400x300",
+ "vga" => "Larger than 640x480",
+ "svga" => "Larger than 800x600",
+ "xga" => "Larger than 1024x768",
+ "2mp" => "Larger than 2MP",
+ "4mp" => "Larger than 4MP",
+ "6mp" => "Larger than 6MP",
+ "8mp" => "Larger than 8MP",
+ "10mp" => "Larger than 10MP",
+ "12mp" => "Larger than 12MP",
+ "15mp" => "Larger than 15MP",
+ "20mp" => "Larger than 20MP",
+ "40mp" => "Larger than 40MP",
+ "70mp" => "Larger than 70MP"
+ ]
+ ],
+ "ratio" => [ // imgar
+ "display" => "Aspect ratio",
+ "option" => [
+ "any" => "Any ratio",
+ "t|xt" => "Tall",
+ "s" => "Square",
+ "w" => "Wide",
+ "xw" => "Panoramic"
+ ]
+ ],
+ "color" => [ // imgc
+ "display" => "Color",
+ "option" => [
+ "any" => "Any color",
+ "color" => "Full color",
+ "bnw" => "Black & white",
+ "trans" => "Transparent",
+ // from here, imgcolor
+ "red" => "Red",
+ "orange" => "Orange",
+ "yellow" => "Yellow",
+ "green" => "Green",
+ "teal" => "Teal",
+ "blue" => "Blue",
+ "purple" => "Purple",
+ "pink" => "Pink",
+ "white" => "White",
+ "gray" => "Gray",
+ "black" => "Black",
+ "brown" => "Brown"
+ ]
+ ],
+ "type" => [ // tbs=itp:<type>
+ "display" => "Type",
+ "option" => [
+ "any" => "Any type",
+ "clipart" => "Clip Art",
+ "lineart" => "Line Drawing",
+ "animated" => "Animated"
+ ]
+ ],
+ "format" => [ // as_filetype
+ "display" => "Format",
+ "option" => [
+ "any" => "Any format",
+ "jpg" => "JPG",
+ "gif" => "GIF",
+ "png" => "PNG",
+ "bmp" => "BMP",
+ "svg" => "SVG",
+ "webp" => "WEBP",
+ "ico" => "ICO",
+ "craw" => "RAW"
+ ]
+ ],
+ "rights" => [ // tbs=sur:<rights>
+ "display" => "Usage rights",
+ "option" => [
+ "any" => "Any license",
+ "cl" => "Creative Commons licenses",
+ "ol" => "Commercial & other licenses"
+ ]
+ ]
+ ]
+ );
+ break;
+
+ case "videos":
+ return array_merge(
+ $base,
+ [
+ "newer" => [ // tbs
+ "display" => "Newer than",
+ "option" => "_DATE"
+ ],
+ "older" => [
+ "display" => "Older than",
+ "option" => "_DATE"
+ ],
+ "duration" => [
+ "display" => "Duration",
+ "option" => [
+ "any" => "Any duration",
+ "s" => "Short (0-4min)", // tbs=dur:s
+ "m" => "Medium (4-20min)", // tbs=dur:m
+ "l" => "Long (20+ min)" // tbs=dur:l
+ ]
+ ],
+ "quality" => [
+ "display" => "Quality",
+ "option" => [
+ "any" => "Any quality",
+ "h" => "High quality" // tbs=hq:h
+ ]
+ ],
+ "captions" => [
+ "display" => "Captions",
+ "option" => [
+ "any" => "No preference",
+ "yes" => "Closed captioned" // tbs=cc:1
+ ]
+ ]
+ ]
+ );
+ break;
+
+ case "news":
+ return array_merge(
+ $base,
+ [
+ "newer" => [ // tbs
+ "display" => "Newer than",
+ "option" => "_DATE"
+ ],
+ "older" => [
+ "display" => "Older than",
+ "option" => "_DATE"
+ ],
+ "sort" => [
+ "display" => "Sort",
+ "option" => [
+ "relevance" => "Relevance",
+ "date" => "Date" // sbd:1
+ ]
+ ]
+ ]
+ );
+ break;
+ }
+ }
+
+ private function get($proxy, $url, $get = [], $use_lynx = false){
+
+ $curlproc = curl_init();
+
+ if($use_lynx === false){
+
+ $headers = [
+ "User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "DNT: 1",
+ //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY",
+ "Connection: keep-alive",
+ "Upgrade-Insecure-Requests: 1",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: none",
+ "Sec-Fetch-User: ?1",
+ "Priority: u=1",
+ "TE: trailers"
+ ];
+
+ // use http2
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+ }else{
+
+ $headers = [
+ "Accept: text/html, text/plain, text/sgml, */*;q=0.01",
+ "Accept-Encoding: gzip, compress, bzip2",
+ "Accept-Language: en",
+ "User-Agent: Lynx/2.9.0dev.12 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/3.7.8"
+ ];
+ }
+
+ if($get !== []){
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
+
+ curl_setopt($curlproc, CURLOPT_URL, $url);
+
+ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+ curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+ curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+ // follow redirects
+ curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true);
+
+ $this->backend->assign_proxy($curlproc, $proxy);
+
+ $data = curl_exec($curlproc);
+
+ if(curl_errno($curlproc)){
+
+ throw new Exception(curl_error($curlproc));
+ }
+
+ curl_close($curlproc);
+
+ if($use_lynx){
+
+ return mb_convert_encoding($data, "UTF-8", "ISO-8859-1");
+ }
+
+ return $data;
+ }
+
+
+ private function scrape_dimg($html){
+
+ // get images loaded through javascript
+ $this->dimg = [];
+
+ preg_match_all(
+ '/function\(\){google\.ldi=({.*?});/',
+ $html,
+ $dimg
+ );
+
+ if(isset($dimg[1])){
+
+ foreach($dimg[1] as $i){
+
+ $tmp = json_decode($i, true);
+ foreach($tmp as $key => $value){
+
+ $this->dimg[$key] =
+ $this->unshit_thumb(
+ $value
+ );
+ }
+ }
+ }
+
+ // get additional javascript base64 images
+ preg_match_all(
+ '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/',
+ $html,
+ $dimg
+ );
+
+ if(isset($dimg[1])){
+
+ for($i=0; $i<count($dimg[1]); $i++){
+
+ $delims = explode(",", $dimg[2][$i]);
+ $string =
+ $this->fuckhtml
+ ->parseJsString(
+ $dimg[1][$i]
+ );
+
+ foreach($delims as $delim){
+
+ $this->dimg[trim($delim, "'")] = $string;
+ }
+ }
+ }
+ }
+
+
+ private function scrape_imagearr($html){
+ // get image links arrays
+ preg_match_all(
+ '/\[[0-9]+,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/',
+ $html,
+ $image_arr
+ );
+
+ $this->image_arr = [];
+ if(isset($image_arr[1])){
+
+ for($i=0; $i<count($image_arr[1]); $i++){
+
+ $original =
+ $this->fuckhtml
+ ->parseJsString(
+ $image_arr[5][$i]
+ );
+
+ if(
+ preg_match(
+ '/^x-raw-image/',
+ $original
+ )
+ ){
+
+ // only add thumbnail, google doesnt have OG resolution
+ $this->image_arr[$image_arr[1][$i]] = [
+ [
+ "url" =>
+ $this->unshit_thumb(
+ $this->fuckhtml
+ ->parseJsString(
+ $image_arr[2][$i]
+ )
+ ),
+ "width" => (int)$image_arr[7][$i], // pass the OG image width & height
+ "height" => (int)$image_arr[6][$i]
+ ]
+ ];
+
+ continue;
+ }
+
+ $this->image_arr[$image_arr[1][$i]] =
+ [
+ [
+ "url" => $original,
+ "width" => (int)$image_arr[7][$i],
+ "height" => (int)$image_arr[6][$i]
+ ],
+ [
+ "url" =>
+ $this->unshit_thumb(
+ $this->fuckhtml
+ ->parseJsString(
+ $image_arr[2][$i]
+ )
+ ),
+ "width" => (int)$image_arr[4][$i],
+ "height" => (int)$image_arr[3][$i]
+ ]
+ ];
+ }
+ }
+ }
+
+
+ private function getdimg($dimg){
+
+ return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null;
+ }
+
+
+ private function unshit_thumb($url){
+ // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj
+ // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA
+
+ $parts = parse_url($url);
+
+ if(
+ isset($parts["host"]) &&
+ preg_match(
+ '/tbn.*\.gstatic\.com/',
+ $parts["host"]
+ )
+ ){
+
+ parse_str($parts["query"], $params);
+
+ if(isset($params["q"])){
+
+ return "https://" . $parts["host"] . "/images?q=" . $params["q"];
+ }
+ }
+
+ return $url;
+ }
+
+
+ private function parsestyles(){
+
+ $styles = [];
+
+ $style_div =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "style"
+ );
+
+ $raw_styles = "";
+
+ foreach($style_div as $style){
+
+ $raw_styles .= $style["innerHTML"];
+ }
+
+ // filter out media/keyframe queries
+ $raw_styles =
+ preg_replace(
+ '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/',
+ "",
+ $raw_styles
+ );
+
+ // get styles
+ preg_match_all(
+ '/(.+?){([\S\s]*?)}/',
+ $raw_styles,
+ $matches
+ );
+
+ for($i=0; $i<count($matches[1]); $i++){
+
+ // get style values
+ preg_match_all(
+ '/([^:;]+):([^;]*?(?:\([^)]+\)[^;]*?)?)(?:;|$)/',
+ $matches[2][$i],
+ $values_regex
+ );
+
+ $values = [];
+ for($k=0; $k<count($values_regex[1]); $k++){
+
+ $values[trim($values_regex[1][$k])] =
+ strtolower(trim($values_regex[2][$k]));
+ }
+
+ $names = explode(",", $matches[1][$i]);
+
+ // h1,h2,h3 will each get their own array index
+ foreach($names as $name){
+
+ $name = trim($name, "}\t\n\r\0\x0B");
+
+ foreach($values as $key => $value){
+
+ $styles[$name][$key] = $value;
+ }
+ }
+ }
+
+ foreach($styles as $key => $values){
+
+ $styles[$key]["_c"] = count($values);
+ }
+
+ $this->styles = $styles;
+
+ // get CSS colors
+ $this->css_colors = [];
+
+ if(isset($this->styles[":root"])){
+
+ foreach($this->styles[":root"] as $key => $value){
+
+ $this->css_colors[$value] = strtolower($key);
+ }
+ }
+ }
+
+
+
+ private function getstyle($styles){
+
+ $styles["_c"] = count($styles);
+
+ foreach($this->styles as $style_key => $style_values){
+
+ if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){
+
+ $style_key =
+ explode(" ", $style_key);
+
+ $style_key = $style_key[count($style_key) - 1];
+
+ return
+ ltrim(
+ str_replace(
+ [".", "#"],
+ " ",
+ $style_key
+ )
+ );
+ }
+ }
+
+ return false;
+ }
+
+
+
+ private function getcolorvar($color){
+
+ if(isset($this->css_colors[$color])){
+
+ return $this->css_colors[$color];
+ }
+
+ return null;
+ }
+
+
+
+ public function web($get){
+
+ if($get["npt"]){
+
+ [$get, $proxy] = $this->backend->get($get["npt"], "web");
+
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://www.google.com" . $get,
+ [],
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ }else{
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $lang = $get["lang"];
+ $older = $get["older"];
+ $newer = $get["newer"];
+ $spellcheck = $get["spellcheck"];
+ $proxy = $this->backend->get_ip();
+
+ $offset = 0;
+
+ $params = [
+ "q" => $search,
+ "hl" => "en",
+ "num" => 20
+ ];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ // language
+ if($lang != "any"){
+
+ $params["lr"] = "lang_" . $lang;
+ }
+
+ // generate tbs
+ $tbs = [];
+
+ // get date
+ $older = $older === false ? null : date("m/d/Y", $older);
+ $newer = $newer === false ? null : date("m/d/Y", $newer);
+
+ if(
+ $older !== null ||
+ $newer !== null
+ ){
+
+ $tbs["cdr"] = "1";
+ $tbs["cd_min"] = $newer;
+ $tbs["cd_max"] = $older;
+ }
+
+ // spellcheck filter
+ if($spellcheck == "no"){
+
+ $params["nfpr"] = "1";
+ }
+
+ if(count($tbs) !== 0){
+
+ $params["tbs"] = "";
+
+ foreach($tbs as $key => $value){
+
+ $params["tbs"] .= $key . ":" . $value . ",";
+ }
+
+ $params["tbs"] = rtrim($params["tbs"], ",");
+ }
+
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://www.google.com/search",
+ $params,
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ //$html = file_get_contents("scraper/google.html");
+ }
+
+ $out = [
+ "status" => "ok",
+ "spelling" => [
+ "type" => "no_correction",
+ "using" => null,
+ "correction" => null
+ ],
+ "npt" => null,
+ "answer" => [],
+ "web" => [],
+ "image" => [],
+ "video" => [],
+ "news" => [],
+ "related" => []
+ ];
+
+ $this->fuckhtml->load($html);
+ $this->detect_sorry();
+
+ $this->parsestyles();
+
+ $boxes =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "border" => "thin solid #dadce0",
+ "padding" => "12px 16px 12px 16px",
+ "margin-bottom" => "10px",
+ "font-family" => "sans-serif"
+ ]),
+ "div"
+ );
+
+ $skip_next = false;
+
+ // get next page token
+ $npt =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "border" => "thin solid #dadce0",
+ "color" => "#70757a",
+ "font-size" => "14px",
+ "text-align" => "center",
+ "table-layout" => "fixed",
+ "width" => "100%"
+ ]),
+ "table"
+ );
+
+ if(count($npt) !== 0){
+
+ $this->fuckhtml->load($npt[0]);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ foreach($as as $a){
+
+ $text =
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ );
+
+ if(
+ $text == "Next&nbsp;>" ||
+ $text == ">"
+ ){
+
+ $out["npt"] =
+ $this->backend->store(
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]["href"]
+ ),
+ "web",
+ $proxy
+ );
+ }
+ }
+
+ $this->fuckhtml->load($html);
+ }
+
+ $first_box = true;
+ foreach($boxes as $box){
+
+ $this->fuckhtml->load($box);
+
+ if($first_box){
+
+ //
+ // Probe for word correction
+ //
+ $first_box = false;
+
+ $txt =
+ $this->fuckhtml
+ ->getTextContent($box);
+
+ if(
+ preg_match(
+ '/^Showing results for /',
+ $txt
+ )
+ ){
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) === 2){
+
+ $out["spelling"] = [
+ "type" => "including",
+ "using" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ ),
+ "correction" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $as[1]
+ )
+ ];
+ }
+ continue;
+ }
+ }
+
+ // probe for custom container
+ $container_title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "font-weight" => "bold"
+ ])
+ );
+
+ if(count($container_title) !== 0){
+
+ $container_title =
+ strtolower(
+ $this->fuckhtml
+ ->getTextContent(
+ $container_title[0]
+ )
+ );
+
+ if($container_title == "images"){
+
+ //
+ // Parse image carousel
+ //
+ $images =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "display" => "inline-block",
+ "padding" => "2px",
+ "padding-bottom" => "4px"
+ ]),
+ "a"
+ );
+
+ foreach($images as $image){
+
+ $this->fuckhtml->load($image);
+
+ $image_data =
+ $this->unshiturl(
+ $image["attributes"]["href"],
+ true
+ );
+
+ $img =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ )[0];
+
+ $out["image"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $img["attributes"]["alt"]
+ )
+ ),
+ "source" => [
+ [
+ "url" => $image_data["url"],
+ "width" => $image_data["image_width"],
+ "height" => $image_data["image_height"]
+ ],
+ [
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $img["attributes"]["src"]
+ ),
+ "width" => $image_data["thumb_width"],
+ "height" => $image_data["thumb_height"]
+ ]
+ ],
+ "url" => $image_data["ref"]
+ ];
+ }
+
+ continue;
+ }
+
+ if(
+ $container_title == "related searches" ||
+ $container_title == "people also search for"
+ ){
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#202124",
+ "font-size" => "13px",
+ "line-height" => "20px"
+ ]),
+ "span"
+ );
+
+ foreach($as as $a){
+
+ $out["related"][] =
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ );
+ }
+ continue;
+ }
+ }
+
+ // probe for website link
+ $link =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#1967d2",
+ "font-size" => "18px",
+ "line-height" => "24px"
+ ]),
+ "a"
+ );
+
+ if(count($link) !== 0){
+
+ //
+ // Parse search result
+ //
+
+ $this->fuckhtml->load($link[0]);
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#1967d2",
+ "font-size" => "18px",
+ "line-height" => "24px"
+ ]),
+ "span"
+ );
+
+ if(count($title) === 0){
+
+ continue;
+ }
+
+ $this->fuckhtml->load($box);
+
+ $sublinks = [];
+ $table = [];
+
+ $categories =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#202124",
+ "font-size" => "13px",
+ "line-height" => "20px"
+ ]),
+ "span"
+ );
+
+ $i = 0;
+ foreach($categories as $category){
+
+ $this->fuckhtml->load($category);
+
+ // probe for sublinks
+ $subs =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#1967d2"
+ ]),
+ "a"
+ );
+
+ if(count($subs) !== 0){
+
+ foreach($subs as $sub){
+
+ $url =
+ $this->unshiturl(
+ $this->fuckhtml
+ ->getTextContent(
+ $sub["attributes"]["href"]
+ )
+ );
+
+ if(
+ preg_match(
+ '/^https?:\/\//',
+ $url
+ )
+ ){
+
+ $sublinks[] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $sub
+ )
+ ),
+ "description" => null,
+ "url" =>
+ $this->unshiturl(
+ $this->fuckhtml
+ ->getTextContent(
+ $sub["attributes"]["href"]
+ )
+ ),
+ "date" => null
+ ];
+ }
+ }
+
+ unset($categories[$i]);
+ }
+
+ $i++;
+ }
+
+ // get description & date
+ $date = null;
+
+ $categories = array_values($categories);
+
+ //print_r($categories);
+
+ $c = count($categories) - 1;
+
+ $description =
+ $this->fuckhtml
+ ->getTextContent(
+ $categories[$c]
+ );
+
+ // remove last category since we're done with it
+ unset($categories[$c]);
+
+ // probe for date
+ $description_tmp = explode("·", $description, 2);
+ $date_tmp = strtotime(trim($description_tmp[0]));
+
+ if(
+ count($description_tmp) === 2 &&
+ strlen($description_tmp[0]) <= 20 &&
+ $date_tmp !== false
+ ){
+
+ $description =
+ ltrim(
+ $this->titledots(
+ $description_tmp[1]
+ )
+ );
+ $date = $date_tmp;
+ }else{
+
+ $description =
+ $this->titledots(
+ $description
+ );
+ }
+
+ // remaining categories should all be greytext
+ if(count($categories) !== 0){
+
+ $texts =
+ explode(
+ "·",
+ preg_replace(
+ '/\s+/',
+ " ",
+ $this->fuckhtml
+ ->getTextContent(
+ $categories[0]
+ )
+ )
+ );
+
+ foreach($texts as $text){
+
+ $text = trim($text);
+
+ if(
+ preg_match(
+ '/^Rating ([0-9.]+)(?: \(([0-9,]+)\))?/',
+ $text,
+ $rating
+ )
+ ){
+
+ $table["Rating"] = $rating[1];
+ if(isset($rating[2])){
+
+ $table["Rating"] .= " (" . $rating[2] . " votes)";
+ }
+
+ continue;
+ }
+
+ if(stripos($text, "stock") !== false){
+
+ $table["Stock"] = $text;
+ continue;
+ }
+ }
+ }
+
+ $out["web"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ ),
+ "description" => $description,
+ "url" =>
+ $this->unshiturl(
+ $link[0]["attributes"]["href"]
+ ),
+ "date" => $date,
+ "type" => "web",
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "sublink" => $sublinks,
+ "table" => $table
+ ];
+
+ continue;
+ }
+
+ // parse wikipedia heads
+ $wiki_title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#202124",
+ "font-size" => "18px",
+ "line-height" => "24px"
+ ]),
+ "span"
+ );
+
+ if(count($wiki_title) !== 0){
+
+ $wiki_title =
+ $this->fuckhtml
+ ->getTextContent(
+ $wiki_title[0]
+ );
+
+ if($wiki_title == "See results about"){
+
+ // ignore
+ continue;
+ }
+
+ if($wiki_title == "Top stories"){
+
+ //
+ // Parse news
+ //
+ $tds =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "td"
+ );
+
+ foreach($tds as $td){
+
+ $this->fuckhtml->load($td);
+
+ $a =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($a) === 0){
+
+ continue;
+ }
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#1967d2"
+ ]),
+ "span"
+ );
+
+ if(count($title) === 0){
+
+ continue;
+ }
+
+ $date = null;
+
+ $meta_div =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#70757a",
+ "font-size" => "13px",
+ "line-height" => "20px"
+ ]),
+ "span"
+ );
+
+ $meta_div =
+ explode(
+ "·",
+ $this->fuckhtml
+ ->getTextContent(
+ $meta_div[count($meta_div) - 1]
+ ),
+ 2
+ );
+
+ if(count($meta_div) === 2){
+
+ $date = strtotime($meta_div[count($meta_div) - 1]);
+
+ if($date === false){
+
+ $date = null;
+ }
+ }
+
+ $out["news"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ ),
+ "description" => null,
+ "date" => $date,
+ "thumb" => [
+ "url" => null,
+ "ratio" => null
+ ],
+ "url" =>
+ $this->unshiturl(
+ $a[0]["attributes"]["href"]
+ )
+ ];
+ }
+ continue;
+ }
+
+ //
+ // Parse wikipedia heads
+ //
+
+ $table_div =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "table"
+ );
+
+ if(count($table_div) === 0){
+
+ continue;
+ }
+
+ $this->fuckhtml->load($table_div[0]);
+
+ // remove table from box
+ $box["innerHTML"] =
+ str_replace(
+ $table_div[0]["outerHTML"],
+ "",
+ $box["innerHTML"]
+ );
+
+ // find wiki image
+ $thumb = null;
+
+ $img =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(count($img) !== 0){
+
+ $thumb =
+ $this->fuckhtml
+ ->getTextContent(
+ $img[0]["attributes"]["src"]
+ );
+ }
+
+ $tds =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "td"
+ );
+
+ $description = [];
+
+ foreach($tds as $td){
+
+ // probe for subtitle
+ $this->fuckhtml->load($td);
+
+ $subtext =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#70757a",
+ "font-size" => "13px",
+ "line-height" => "20px"
+ ])
+ );
+
+ if(count($subtext) !== 0){
+
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $subtext[0]
+ )
+ ];
+ break;
+ }
+ }
+
+ $this->fuckhtml->load($box);
+
+ // probe for word definition
+ $lists =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "ol"
+ );
+
+ if(count($lists) !== 0){
+
+ $description = [];
+
+ foreach($lists as $list){
+
+ $box["innerHTML"] =
+ explode(
+ $list["outerHTML"],
+ $box["innerHTML"],
+ 2
+ );
+
+ if(
+ count($box["innerHTML"]) === 1 ||
+ trim($box["innerHTML"][0]) == ""
+ ){
+
+ break;
+ }
+
+ $description[] = [
+ "type" => "title",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $box["innerHTML"][0]
+ )
+ ];
+
+ $this->fuckhtml->load($list);
+
+ $lis =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "li"
+ );
+
+ $increment = 1;
+
+ foreach($lis as $li){
+
+ $this->fuckhtml->load($li);
+
+ $list_items =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#202124",
+ "font-size" => "13px",
+ "line-height" => "20px"
+ ])
+ );
+
+ $first_item = true;
+ foreach($list_items as $it){
+
+ if($first_item){
+
+ $first_item = false;
+ $c = count($description);
+
+ if(
+ $c !== 0 &&
+ $description[$c - 1]["type"] == "text"
+ ){
+
+ $description[$c - 1]["value"] .=
+ "\n\n" .
+ $increment . ". " . $this->fuckhtml
+ ->getTextContent(
+ $it
+ );
+ }else{
+
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ $increment . ". " . $this->fuckhtml
+ ->getTextContent(
+ $it
+ )
+ ];
+ }
+ }else{
+
+ $description[] = [
+ "type" => "quote",
+ "value" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $it
+ )
+ ];
+ }
+
+ $increment++;
+ }
+ }
+
+ $box["innerHTML"] = $box["innerHTML"][1];
+ }
+
+ $out["answer"][] = [
+ "title" => $wiki_title,
+ "description" => $description,
+ "url" => null,
+ "thumb" => null,
+ "table" => [],
+ "sublink" => []
+ ];
+
+ continue;
+ }
+
+ // get separator between description and facts
+ $separator =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "height" => "4px"
+ ]),
+ "div"
+ );
+
+ $box_html = [];
+ $table = [];
+
+ if(count($separator) !== 0){
+
+ $box_html =
+ explode(
+ $separator[0]["outerHTML"],
+ $box["innerHTML"],
+ 2
+ );
+
+ if(count($box_html) === 2){
+
+ $box["innerHTML"] = $box_html[0];
+ }
+
+ $this->fuckhtml->load($box_html[1]);
+
+ // get all facts
+ $facts =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "div"
+ );
+
+ foreach($facts as $fact){
+
+ if($fact["level"] !== 1){ continue; }
+
+ $fact =
+ explode(
+ ":",
+ $this->fuckhtml
+ ->getTextContent(
+ $fact
+ )
+ );
+
+ $table[trim(preg_replace('/\s+/', " ", $fact[0]))] =
+ trim(preg_replace('/\s+/', " ", $fact[1]));
+ }
+
+ $this->fuckhtml->load($box);
+ }
+
+ // remove wikipedia link
+ $wiki_link =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "color" => "#1967d2"
+ ]),
+ "a"
+ );
+
+ $url = null;
+ if(count($wiki_link) !== 0){
+
+ foreach($wiki_link as $link){
+
+ if(
+ strtolower(
+ $this->fuckhtml
+ ->getTextContent(
+ $link
+ )
+ ) == "wikipedia"
+ ){
+
+ $box["innerHTML"] =
+ str_replace(
+ $link["outerHTML"],
+ "",
+ $box["innerHTML"]
+ );
+
+ $url =
+ $this->unshiturl(
+ $link["attributes"]["href"]
+ );
+
+ $this->fuckhtml->load($box);
+ break;
+ }
+ }
+ }
+
+ // remains of box should be description
+ $description[] = [
+ "type" => "text",
+ "value" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $box
+ )
+ )
+ ];
+
+ $out["answer"][] = [
+ "title" => $wiki_title,
+ "description" => $description,
+ "url" => $url,
+ "thumb" => $thumb,
+ "table" => $table,
+ "sublink" => []
+ ];
+ }
+ }
+
+ return $out;
+ }
+
+
+
+ public function video($get){
+
+ if($get["npt"]){
+
+ [$params, $proxy] = $this->backend->get($get["npt"], "video");
+ $params = json_decode($params, true);
+
+ }else{
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $older = $get["older"];
+ $newer = $get["newer"];
+ $duration = $get["duration"];
+ $quality = $get["quality"];
+ $captions = $get["captions"];
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "q" => $search,
+ "udm" => "7",
+ "hl" => "en",
+ "num" => 20
+ ];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ $tbs = [];
+
+ // get date
+ $older = $older === false ? null : date("m/d/Y", $older);
+ $newer = $newer === false ? null : date("m/d/Y", $newer);
+
+ if(
+ $older !== null ||
+ $newer !== null
+ ){
+
+ $tbs["cdr"] = "1";
+ $tbs["cd_min"] = $newer;
+ $tbs["cd_max"] = $older;
+ }
+
+ // duration
+ if($duration != "any"){
+
+ $tbs[] = "dur:" . $duration;
+ }
+
+ // quality
+ if($quality != "any"){
+
+ $tbs[] = "hq:" . $quality;
+ }
+
+ // captions
+ if($captions != "any"){
+
+ $tbs[] = "cc:" . $captions;
+ }
+
+ // append tbs
+ if(count($tbs) !== 0){
+
+ $params["tbs"] =
+ implode(",", $tbs);
+ }
+ }
+
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://www.google.com/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ if(!isset($params["start"])){
+
+ $params["start"] = 0;
+ }
+ $params["start"] += 20;
+
+ $this->fuckhtml->load($html);
+
+ //
+ // Parse web video page
+ //
+ $this->detect_sorry();
+
+ // parse all <style> tags
+ $this->parsestyles();
+
+ // get javascript images
+ $this->scrape_dimg($html);
+
+ $this->scrape_imagearr($html);
+
+ $out = [
+ "status" => "ok",
+ "npt" =>
+ $this->backend->store(
+ json_encode($params),
+ "videos",
+ $proxy
+ ),
+ "video" => [],
+ "author" => [],
+ "livestream" => [],
+ "playlist" => [],
+ "reel" => []
+ ];
+
+ $search_div =
+ $this->fuckhtml
+ ->getElementById(
+ "center_col"
+ );
+
+ if($search_div === false){
+
+ throw new Exception("Failed to grep search div");
+ }
+
+ $this->fuckhtml->load($search_div);
+
+ $results =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "margin" => "0px 0px 30px"
+ ]),
+ "div"
+ );
+
+ foreach($results as $result){
+
+ $this->fuckhtml->load($result);
+
+ $url =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($url) === 0){
+
+ // no url, weird, continue
+ continue;
+ }
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "h3"
+ );
+
+ if(count($title) === 0){
+
+ // no title, weird, continue
+ continue;
+ }
+
+ // get description
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "-webkit-box-orient" => "vertical",
+ "display" => "-webkit-box",
+ "-webkit-line-clamp" => "2",
+ "overflow" => "hidden",
+ "word-break" => "break-word"
+ ]),
+ "div"
+ );
+
+ if(count($description) === 0){
+
+ $description = null;
+ }else{
+
+ $description =
+ html_entity_decode(
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
+ )
+ );
+ }
+
+ // get author + date posted
+ $metadiv =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "margin-top" => "12px"
+ ]),
+ "div"
+ );
+
+ $author = null;
+ $date = null;
+
+ if(count($metadiv) !== 0){
+
+ $metadiv =
+ explode(
+ "·",
+ $this->fuckhtml
+ ->getTextContent(
+ $metadiv[0]
+ )
+ );
+
+ if(count($metadiv) === 3){
+
+ $author = trim($metadiv[1]);
+ $date = strtotime(trim($metadiv[2]));
+ }elseif(count($metadiv) === 2){
+
+ $author = trim($metadiv[0]);
+ $date = strtotime(trim($metadiv[1]));
+ }
+ }
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ $duration = null;
+
+ if(
+ count($image) !== 0 &&
+ isset($image[0]["attributes"]["id"])
+ ){
+
+ $thumb = [
+ "url" => $this->getdimg($image[0]["attributes"]["id"]),
+ "ratio" => "16:9"
+ ];
+
+ // get duration
+ $duration =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "background-color" => "rgba(0,0,0,0.6)",
+ "color" => "#fff",
+ "fill" => "#fff"
+ ])
+ );
+
+ if(count($duration) !== 0){
+
+ $duration =
+ $this->hms2int(
+ $this->fuckhtml
+ ->getTextContent(
+ $duration[0]
+ ));
+ }else{
+
+ $duration = null;
+ }
+ }
+
+ $out["video"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ ),
+ "description" => $description,
+ "author" => [
+ "name" => $author,
+ "url" => null,
+ "avatar" => null
+ ],
+ "date" => $date,
+ "duration" => $duration,
+ "views" => null,
+ "thumb" => $thumb,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $url[0]["attributes"]["href"]
+ )
+ ];
+ }
+
+ return $out;
+ }
+
+
+
+ public function news($get){
+
+ if($get["npt"]){
+
+ [$req, $proxy] = $this->backend->get($get["npt"], "news");
+ /*parse_str(
+ parse_url($req, PHP_URL_QUERY),
+ $search
+ );*/
+
+ try{
+
+ $html =
+ $this->get(
+ $proxy,
+ "https://www.google.com" . $req,
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ }else{
+ $search = $get["s"];
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $older = $get["older"];
+ $newer = $get["newer"];
+ $sort = $get["sort"];
+ $proxy = $this->backend->get_ip();
+
+ $params = [
+ "q" => $search,
+ "tbm" => "nws",
+ "hl" => "en",
+ "num" => "20"
+ ];
+
+ // country
+ if($country != "any"){
+
+ $params["gl"] = $country;
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ $tbs = [];
+
+ // get date
+ $older = $older === false ? null : date("m/d/Y", $older);
+ $newer = $newer === false ? null : date("m/d/Y", $newer);
+
+ if(
+ $older !== null ||
+ $newer !== null
+ ){
+
+ $tbs["cdr"] = "1";
+ $tbs["cd_min"] = $newer;
+ $tbs["cd_max"] = $older;
+ }
+
+ // relevance
+ if($sort == "date"){
+
+ $tbs["sbd"] = "1";
+ }
+
+ // append tbs
+ if(count($tbs) !== 0){
+
+ $params["tbs"] = "";
+
+ foreach($tbs as $key => $value){
+
+ $params["tbs"] .= $key . ":" . $value . ",";
+ }
+
+ $params["tbs"] = rtrim($params["tbs"], ",");
+ }
+
+ //$html = file_get_contents("scraper/google-news.html");
+
+ $html =
+ $this->get(
+ $proxy,
+ "https://www.google.com/search",
+ $params
+ );
+ }
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "news" => []
+ ];
+
+ $this->fuckhtml->load($html);
+
+ $this->detect_sorry();
+
+ // get images
+ $this->scrape_dimg($html);
+
+ // parse styles
+ $this->parsestyles();
+
+ $center_col =
+ $this->fuckhtml
+ ->getElementById(
+ "center_col",
+ "div"
+ );
+
+ if($center_col === null){
+
+ throw new Exception("Could not grep result div");
+ }
+
+ $this->fuckhtml->load($center_col);
+
+ // get next page
+ $npt =
+ $this->fuckhtml
+ ->getElementById(
+ "pnnext",
+ "a"
+ );
+
+ if($npt !== false){
+
+ $out["npt"] =
+ $this->backend->store(
+ $this->fuckhtml
+ ->getTextContent(
+ $npt["attributes"]
+ ["href"]
+ ),
+ "news",
+ $proxy
+ );
+ }
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "jsname",
+ "a"
+ );
+
+ foreach($as as $a){
+
+ $this->fuckhtml->load($a);
+
+ // get title
+ $title =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "role",
+ "heading",
+ "div"
+ );
+
+ if(count($title) === 0){
+
+ continue;
+ }
+
+ $title =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ );
+
+ // get thumbnail
+ $image =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "id",
+ "img"
+ );
+
+ // check for padded title node, if found, we're inside a carousel
+ $probe =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "padding" => "16px 16px 40px 16px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($probe) !== 0){
+
+ $probe = true;
+ }else{
+
+ $probe = false;
+ }
+
+ if(
+ count($image) !== 0 &&
+ !isset($image[0]["attributes"]["width"])
+ ){
+
+ $thumb = [
+ "url" =>
+ $this->getdimg(
+ $image[0]["attributes"]["id"]
+ ),
+ "ratio" => $probe === true ? "16:9" : "1:1"
+ ];
+ }else{
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+ }
+
+ $description = null;
+
+ if($probe === false){
+
+ $desc_divs =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "style",
+ "div"
+ );
+
+ foreach($desc_divs as $desc){
+
+ if(
+ strpos(
+ $desc["attributes"]["style"],
+ "margin-top:"
+ ) !== false
+ ){
+
+ $description =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $desc
+ )
+ );
+ break;
+ }
+ }
+ }
+
+ // get author
+ $author =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle(
+ [
+ "overflow" => "hidden",
+ "text-align" => "left",
+ "text-overflow" => "ellipsis",
+ "white-space" => "nowrap",
+ "margin-bottom" => "8px"
+ ]
+ ),
+ "div"
+ );
+
+ if(count($author) !== 0){
+
+ $author =
+ $this->fuckhtml
+ ->getTextContent(
+ $author[0]
+ );
+ }else{
+
+ $author = null;
+ }
+
+ // get date
+ $date = null;
+
+ $date_div =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "style",
+ "div"
+ );
+
+ foreach($date_div as $d){
+
+ $this->fuckhtml->load($d);
+
+ $span =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "span"
+ );
+
+ if(
+ strpos(
+ $d["attributes"]["style"],
+ "bottom:"
+ ) !== false
+ ){
+
+ $date =
+ strtotime(
+ $this->fuckhtml
+ ->getTextContent(
+ $span[count($span) - 1]
+ )
+ );
+ break;
+ }
+ }
+
+ $out["news"][] = [
+ "title" => $title,
+ "author" => $author,
+ "description" => $description,
+ "date" => $date,
+ "thumb" => $thumb,
+ "url" =>
+ $this->unshiturl(
+ $a["attributes"]
+ ["href"]
+ )
+ ];
+ }
+
+ return $out;
+ }
+
+
+
+
+ public function image($get){
+
+ // generate parameters
+ if($get["npt"]){
+
+ [$params, $proxy] =
+ $this->backend->get(
+ $get["npt"],
+ "images"
+ );
+
+ $params = json_decode($params, true);
+ }else{
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ $proxy = $this->backend->get_ip();
+ $country = $get["country"];
+ $nsfw = $get["nsfw"];
+ $time = $get["time"];
+ $size = $get["size"];
+ $ratio = $get["ratio"];
+ $color = $get["color"];
+ $type = $get["type"];
+ $format = $get["format"];
+ $rights = $get["rights"];
+
+ $params = [
+ "q" => $search,
+ "udm" => "2" // get images
+ ];
+
+ // country (image search uses cr instead of gl)
+ if($country != "any"){
+
+ $params["cr"] = "country" . strtoupper($country);
+ }
+
+ // nsfw
+ $params["safe"] = $nsfw == "yes" ? "off" : "active";
+
+ // generate tbs
+ $tbs = [];
+
+ // time
+ if($time != "any"){
+
+ $tbs["qdr"] = $time;
+ }
+
+ // size
+ if($size != "any"){
+
+ $params["imgsz"] = $size;
+ }
+
+ // ratio
+ if($ratio != "any"){
+
+ $params["imgar"] = $ratio;
+ }
+
+ // color
+ if($color != "any"){
+
+ if(
+ $color == "color" ||
+ $color == "trans"
+ ){
+
+ $params["imgc"] = $color;
+ }elseif($color == "bnw"){
+
+ $params["imgc"] = "gray";
+ }else{
+
+ $tbs["ic"] = "specific";
+ $tbs["isc"] = $color;
+ }
+ }
+
+ // type
+ if($type != "any"){
+
+ $tbs["itp"] = $type;
+ }
+
+ // format
+ if($format != "any"){
+
+ $params["as_filetype"] = $format;
+ }
+
+ // rights (tbs)
+ if($rights != "any"){
+
+ $tbs["sur"] = $rights;
+ }
+
+ // append tbs
+ if(count($tbs) !== 0){
+
+ $params["tbs"] = "";
+
+ foreach($tbs as $key => $value){
+
+ $params["tbs"] .= $key . ":" . $value . ",";
+ }
+
+ $params["tbs"] = rtrim($params["tbs"], ",");
+ }
+ }
+ /*
+ $handle = fopen("scraper/page.html", "r");
+ $html = fread($handle, filesize("scraper/page.html"));
+ fclose($handle);*/
+
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://www.google.com/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get search page");
+ }
+
+ $this->fuckhtml->load($html);
+
+ $this->detect_sorry();
+
+ // get javascript images
+ $this->scrape_imagearr($html);
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "image" => []
+ ];
+
+ $images =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "ivg-i",
+ "div"
+ );
+
+ foreach($images as $div){
+
+ $this->fuckhtml->load($div);
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName("img")[0];
+
+ // make sure we dont attempt to show an image we dont have data for
+ if(
+ isset($div["attributes"]["data-docid"]) &&
+ isset($this->image_arr[$div["attributes"]["data-docid"]])
+ ){
+
+ $source =
+ $this->image_arr[
+ $div["attributes"]["data-docid"]
+ ];
+ }else{
+
+ continue;
+ }
+
+ $out["image"][] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $image["attributes"]["alt"]
+ )
+ ),
+ "source" => $source,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $div["attributes"]["data-lpage"]
+ )
+ ];
+ }
+
+ // as usual, no way to check if there is a next page reliably
+ if(count($out["image"]) > 50){
+
+ if(!isset($params["start"])){
+
+ $params["start"] = 10;
+ }else{
+
+ $params["start"] += 10;
+ }
+
+ $out["npt"] =
+ $this->backend
+ ->store(
+ json_encode($params),
+ "image",
+ $proxy
+ );
+ }
+
+ return $out;
+ }
+
+ private function unshiturl($url, $return_size = false){
+
+ // decode
+ $url =
+ $this->fuckhtml
+ ->getTextContent(
+ $url
+ );
+
+ $url_parts = parse_url($url);
+
+ if(isset($url_parts["query"])){
+
+ parse_str($url_parts["query"], $query);
+ }else{
+
+ $query = [];
+ }
+
+ if(
+ !isset(
+ $url_parts["host"]
+ ) ||
+ stripos($url_parts["host"], "google.") !== false
+ ){
+
+ // no host, we have a tracking url
+ if(isset($query["imgurl"])){
+
+ $url = $query["imgurl"];
+ }
+ elseif(isset($query["q"])){
+
+ $url = $query["q"];
+ }
+ }
+
+ // rewrite URLs to remove extra tracking parameters
+ $domain = parse_url($url, PHP_URL_HOST);
+
+ if(
+ preg_match(
+ '/wikipedia.org$/',
+ $domain
+ )
+ ){
+
+ // rewrite wikipedia mobile URLs to desktop
+ $url =
+ $this->replacedomain(
+ $url,
+ preg_replace(
+ '/([a-z0-9]+)(\.m\.)/',
+ '$1.',
+ $domain
+ )
+ );
+ }
+
+ elseif(
+ preg_match(
+ '/imdb\.com$|youtube\.[^.]+$/',
+ $domain
+ )
+ ){
+
+ // rewrite imdb and youtube mobile URLs too
+ $url =
+ $this->replacedomain(
+ $url,
+ preg_replace(
+ '/^m\./',
+ "",
+ $domain
+ )
+ );
+
+ }
+
+ elseif(
+ preg_match(
+ '/play\.google\.[^.]+$/',
+ $domain
+ )
+ ){
+
+ // remove referrers from play.google.com
+ $u_query = parse_url($url, PHP_URL_QUERY);
+ if($u_query !== null){
+
+ parse_str($u_query, $u_query);
+ if(isset($u_query["referrer"])){ unset($u_query["referrer"]); }
+ if(isset($u_query["hl"])){ unset($u_query["hl"]); }
+ if(isset($u_query["gl"])){ unset($u_query["gl"]); }
+
+ $query = http_build_query($query);
+
+ $url =
+ str_replace(
+ $u_query,
+ $u_query,
+ $url
+ );
+ }
+ }
+
+ elseif(
+ preg_match(
+ '/twitter\.com$/',
+ $domain
+ )
+ ){
+ // remove more referrers from twitter.com
+ $u_query = parse_url($url, PHP_URL_QUERY);
+ if($u_query !== null){
+
+ parse_str($u_query, $u_query);
+ if(isset($u_query["ref_src"])){ unset($u_query["ref_src"]); }
+
+ $u_query = http_build_query($u_query);
+
+ $url =
+ str_replace(
+ $oldquery,
+ $u_query,
+ $url
+ );
+ }
+ }
+
+ elseif(
+ preg_match(
+ '/maps\.google\.[^.]+/',
+ $domain
+ )
+ ){
+
+ if(stripos($url, "maps?") !== false){
+
+ $u_query = parse_url($url, PHP_URL_QUERY);
+
+ if($u_query !== null){
+
+ parse_str($u_query, $u_query);
+
+ if(isset($u_query["daddr"])){
+
+ $url =
+ "https://maps.google.com/maps?daddr=" .
+ urlencode($u_query["daddr"]);
+ }
+ }
+ }
+ }
+
+ if($return_size){
+
+ return [
+ "url" => $url,
+ "ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null,
+ "thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null,
+ "thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null,
+ "image_width" => isset($query["w"]) ? (int)$query["w"] : null,
+ "image_height" => isset($query["h"]) ? (int)$query["h"] : null
+ ];
+ }
+
+ return $url;
+ }
+
+ private function replacedomain($url, $domain){
+
+ return
+ preg_replace(
+ '/(https?:\/\/)([^\/]+)/',
+ '$1' . $domain,
+ $url
+ );
+ }
+
+ private function titledots($title){
+
+ return trim($title, " .\t\n\r\0\x0B…");
+ }
+
+ private function hms2int($time){
+
+ $parts = explode(":", $time, 3);
+ $time = 0;
+
+ if(count($parts) === 3){
+
+ // hours
+ $time = $time + ((int)$parts[0] * 3600);
+ array_shift($parts);
+ }
+
+ if(count($parts) === 2){
+
+ // minutes
+ $time = $time + ((int)$parts[0] * 60);
+ array_shift($parts);
+ }
+
+ // seconds
+ $time = $time + (int)$parts[0];
+
+ return $time;
+ }
+
+ private function detect_sorry(){
+
+ $captcha_form =
+ $this->fuckhtml
+ ->getElementById(
+ "captcha-form",
+ "form"
+ );
+
+ if($captcha_form !== false){
+
+ throw new Exception("Google returned a captcha");
+ }
+ }
+}