aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/baidu.php
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/baidu.php')
-rw-r--r--scraper/baidu.php2229
1 files changed, 2229 insertions, 0 deletions
diff --git a/scraper/baidu.php b/scraper/baidu.php
new file mode 100644
index 0000000..efb14ca
--- /dev/null
+++ b/scraper/baidu.php
@@ -0,0 +1,2229 @@
+<?php
+
+class baidu{
+
+ /**
+  * Loads the helper libraries and resets the per-instance scraper state.
+  *
+  * include_once is used so that the library files are not executed a
+  * second time when they were already loaded (for example when this
+  * class is instantiated twice). Presumably those files declare the
+  * backend / fuckhtml classes, in which case a plain include would
+  * fail on the redeclaration -- TODO confirm against lib/.
+  */
+ public function __construct(){
+
+     include_once "lib/backend.php";
+     $this->backend = new backend("baidu");
+
+     include_once "lib/fuckhtml.php";
+     $this->fuckhtml = new fuckhtml();
+
+     // bookkeeping shared by redirect_add_url() and resolve_urls()
+     $this->handles = [];            // curl handles: [category][result index][sublink index]
+     $this->proc = null;             // curl_multi handle, created in resolve_urls()
+     $this->handle_category = null;  // category currently being queued
+     $this->handle_increment = 0;    // result index currently being queued
+     $this->sublink_increment = 0;   // 0 = main link, 1..n = sublinks
+
+     // cookie string replayed by get(); null until one is known
+     $this->cookie = null;
+ }
+
+ /**
+  * Describes the search filters offered for one result tab.
+  *
+  * Every filter maps its request key to a "display" label and an
+  * "option" entry, which is either the "_DATE" marker string or a
+  * value => label list of choices.
+  *
+  * @param string $page Tab name: "web", "images", "videos" or "news"
+  * @return array|null Filter definitions; null for any other tab
+  */
+ public function getfilters($page){
+
+     // loose comparisons, checked in the same order a switch() would
+     if($page == "web"){
+
+         $date_option = "_DATE";
+
+         return [
+             "newer" => [
+                 "display" => "Newer than",
+                 "option" => $date_option
+             ],
+             "older" => [
+                 "display" => "Older than",
+                 "option" => $date_option
+             ]
+         ];
+     }
+
+     if($page == "images"){
+
+         $filters = [];
+
+         $filters["sort"] = [
+             "display" => "Sort",
+             "option" => [
+                 "relevance" => "Relevance", // no param
+                 "latest" => "Latest", // &latest=1
+                 "hot" => "Hot" // &hot=1
+             ]
+         ];
+
+         $filters["size"] = [
+             "display" => "Size",
+             "option" => [
+                 "any" => "Any size",
+                 "7" => "Extra large (1080px+)", // &z=7
+                 "6" => "Large (600px~1080px)", // &z=6
+                 "5" => "Medium (300px~600px)", // &z=5
+                 "4" => "Small (1px~300px)" // &z=4
+             ]
+         ];
+
+         $filters["ratio"] = [
+             "display" => "Ratio",
+             "option" => [
+                 "any" => "Any ratio",
+                 "1" => "Tall vertical", // &imgratio=1
+                 "2" => "Vertical", // &imgratio=2
+                 "3" => "Square", // &imgratio=3
+                 "4" => "Horizontal", // &imgratio=4
+                 "5" => "Wide horizontal" // &imgratio=5
+             ]
+         ];
+
+         $filters["format"] = [
+             "display" => "Format",
+             "option" => [
+                 "any" => "Any format",
+                 "3" => "JPG", // &imgformat=3
+                 "5" => "JPEG", // &imgformat=5
+                 "4" => "PNG", // &imgformat=4
+                 "2" => "BMP", // &imgformat=2
+                 "6" => "GIF (Animated)" // &imgformat=6
+             ]
+         ];
+
+         // values are sent as &ic=<value>
+         $filters["color"] = [
+             "display" => "Color",
+             "option" => [
+                 "any" => "Any color",
+                 "1024" => "White",
+                 "2048" => "Black & White",
+                 "512" => "Black",
+                 "64" => "Magenta",
+                 "16" => "Blue",
+                 "1" => "Red",
+                 "2" => "Yellow",
+                 "32" => "Purple",
+                 "4" => "Green",
+                 "8" => "Teal",
+                 "256" => "Orange",
+                 "128" => "Brown"
+             ]
+         ];
+
+         $filters["type"] = [
+             "display" => "Type",
+             "option" => [
+                 "any" => "Any type",
+                 "hd" => "HD", // &hd=1
+                 "isImgSet" => "Photo album", // &isImgSet=1
+                 "copyright" => "Copyright" // &copyright=1
+             ]
+         ];
+
+         return $filters;
+     }
+
+     if($page == "videos"){
+
+         // the video tab exposes no filters
+         return [];
+     }
+
+     if($page == "news"){
+
+         $categories = [
+             "any" => "All news",
+             "media" => "Media websites", // &medium=1
+             "baijiahao" => "Baidu Baijiahao" // &medium=2
+         ];
+
+         return [
+             "category" => [
+                 "display" => "Category",
+                 "option" => $categories
+             ]
+         ];
+     }
+
+     // unknown tab: same result as falling out of the switch
+     return null;
+ }
+
+ /**
+  * Performs a GET request through the given proxy and returns the body.
+  *
+  * Cookies received through Set-Cookie headers are merged into
+  * $this->cookie (one entry per cookie name) and replayed on later calls.
+  *
+  * @param mixed $proxy Proxy descriptor handed to backend::assign_proxy()
+  * @param string $url Endpoint without query string
+  * @param array $get Query parameters, appended with http_build_query()
+  * @param string|false $referer false sends page-navigation style headers;
+  *                              a string sends XHR style headers with that Referer
+  * @return string|bool Whatever curl_exec() returned for the transfer
+  * @throws Exception carrying the curl error message when the transfer fails
+  */
+ private function get($proxy, $url, $get = [], $referer = false){
+
+     $curlproc = curl_init();
+
+     if($get !== []){
+
+         $url .= "?" . http_build_query($get);
+     }
+
+     // name => value of every Set-Cookie header seen in the response
+     $cookies_tmp = [];
+     curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
+
+         // curl expects the number of bytes consumed
+         $length = strlen($header);
+
+         $header = explode(":", $header, 2);
+
+         if(
+             count($header) === 2 &&
+             trim(strtolower($header[0])) == "set-cookie"
+         ){
+
+             $cookie_tmp = explode("=", trim($header[1]), 2);
+
+             // skip malformed cookies that carry no "name=value" pair
+             if(count($cookie_tmp) === 2){
+
+                 // keep only the value, drop attributes such as Path/Expires
+                 $cookies_tmp[trim($cookie_tmp[0])] =
+                     explode(";", $cookie_tmp[1], 2)[0];
+             }
+         }
+
+         return $length;
+     });
+
+     curl_setopt($curlproc, CURLOPT_URL, $url);
+
+     curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+     //
+     // Build the header list once; the order matches what was sent before
+     //
+     $has_cookie =
+         $this->cookie !== null &&
+         $this->cookie !== "";
+
+     $headers = [
+         "User-Agent: " . config::USER_AGENT,
+         $referer === false ?
+             "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" :
+             "Accept: application/json, text/plain, */*",
+         "Accept-Language: en-US,en;q=0.5",
+         "Accept-Encoding: gzip, deflate, br, zstd"
+     ];
+
+     if($referer !== false){
+
+         $headers[] = "Referer: {$referer}";
+     }
+
+     $headers[] = "DNT: 1";
+     $headers[] = "Sec-GPC: 1";
+     $headers[] = "Connection: keep-alive";
+
+     if($has_cookie){
+
+         // never send an empty Cookie header
+         $headers[] = "Cookie: {$this->cookie}";
+     }
+
+     $headers[] = "Upgrade-Insecure-Requests: 1";
+
+     if($referer === false){
+
+         $headers[] = "Sec-Fetch-Dest: document";
+         $headers[] = "Sec-Fetch-Mode: navigate";
+         $headers[] = "Sec-Fetch-Site: cross-site";
+         $headers[] = "Priority: u=0, i";
+     }else{
+
+         $headers[] = "Sec-Fetch-Dest: empty";
+         $headers[] = "Sec-Fetch-Mode: cors";
+         $headers[] = "Sec-Fetch-Site: same-origin";
+     }
+
+     curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+     curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+     curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+     curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+     curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+     $this->backend->assign_proxy($curlproc, $proxy);
+
+     $data = curl_exec($curlproc);
+
+     if(curl_errno($curlproc)){
+
+         // read the message before releasing the handle
+         $error = curl_error($curlproc);
+         curl_close($curlproc);
+
+         throw new Exception($error);
+     }
+
+     curl_close($curlproc);
+
+     //
+     // Store cookies: merge by name so a cookie that is set again
+     // replaces its old value instead of being appended a second time
+     //
+     if($cookies_tmp !== []){
+
+         $jar = [];
+
+         if($has_cookie){
+
+             foreach(explode(";", $this->cookie) as $pair){
+
+                 $pair = explode("=", trim($pair), 2);
+
+                 if(
+                     count($pair) === 2 &&
+                     $pair[0] !== ""
+                 ){
+
+                     $jar[$pair[0]] = $pair[1];
+                 }
+             }
+         }
+
+         foreach($cookies_tmp as $cookie_name => $cookie_value){
+
+             $jar[$cookie_name] = $cookie_value;
+         }
+
+         $pairs = [];
+         foreach($jar as $cookie_name => $cookie_value){
+
+             $pairs[] = $cookie_name . "=" . $cookie_value;
+         }
+
+         $this->cookie = implode("; ", $pairs);
+     }
+
+     return $data;
+ }
+
+ /**
+  * Queues a header-only request for a Baidu redirect link.
+  *
+  * URLs that do not look like http(s)://[www.]baidu.com/link?... are
+  * ignored. For the others a curl handle is attached to $this->proc and
+  * remembered in $this->handles under the current category / result /
+  * sublink counters so resolve_urls() can map the answer back.
+  *
+  * @param mixed $proxy Proxy descriptor handed to backend::assign_proxy()
+  * @param string $url Candidate link
+  * @return void
+  */
+ private function redirect_add_url($proxy, $url){
+
+     $redirect_probe =
+         preg_match(
+             '/^https?:\/\/(?:www\.)?baidu\.com\/link\?/',
+             $url
+         );
+
+     if($redirect_probe === 0){
+
+         // not a baidu redirect
+         return;
+     }
+
+     $request_headers = [
+         "User-Agent: " . config::USER_AGENT,
+         "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+         "Accept-Language: en-US,en;q=0.5",
+         "Accept-Encoding: gzip, deflate, br, zstd",
+         "DNT: 1",
+         "Sec-GPC: 1",
+         "Connection: keep-alive",
+         "Upgrade-Insecure-Requests: 1",
+         "Sec-Fetch-Dest: document",
+         "Sec-Fetch-Mode: navigate",
+         "Sec-Fetch-Site: none",
+         "Sec-Fetch-User: ?1",
+         "Priority: u=0, i"
+     ];
+
+     // applied one by one, in this order
+     $options = [
+         [CURLOPT_URL, $url],
+         [CURLOPT_ENCODING, ""], // default encoding
+         [CURLOPT_HTTPHEADER, $request_headers],
+         [CURLOPT_RETURNTRANSFER, true],
+         [CURLOPT_SSL_VERIFYHOST, 2],
+         [CURLOPT_SSL_VERIFYPEER, true],
+         [CURLOPT_CONNECTTIMEOUT, 30],
+         [CURLOPT_TIMEOUT, 30],
+         // only the response headers are wanted: include them, skip the body
+         [CURLOPT_HEADER, true],
+         [CURLOPT_NOBODY, true]
+     ];
+
+     $handle = curl_init();
+
+     foreach($options as [$option, $value]){
+
+         curl_setopt($handle, $option, $value);
+     }
+
+     $this->backend->assign_proxy($handle, $proxy);
+
+     curl_multi_add_handle($this->proc, $handle);
+
+     $category = $this->handle_category;
+     $result_index = $this->handle_increment;
+     $sublink_index = $this->sublink_increment;
+
+     $this->handles[$category][$result_index][$sublink_index] = $handle;
+ }
+
+ /**
+  * Replaces Baidu redirect links in a result collection with their targets.
+  *
+  * Every "url" (and every sublink "url") in the given categories is passed
+  * to redirect_add_url(); the queued requests run in parallel and the
+  * Location header of each response overwrites the original link.
+  * Entries whose response carries no Location header are left untouched.
+  *
+  * @param mixed $proxy Proxy descriptor forwarded to redirect_add_url()
+  * @param array $collection Result array, modified in place
+  * @param array $categories Keys of $collection to process, e.g. ["web", "video"]
+  * @return void
+  */
+ private function resolve_urls($proxy, &$collection, $categories){
+
+     $this->proc = curl_multi_init();
+
+     // forget handles of an earlier run, they are closed by now
+     $this->handles = [];
+
+     foreach($categories as $category){
+
+         $this->sublink_increment = 0;
+         $this->handle_increment = 0;
+         $this->handle_category = $category;
+
+         foreach(($collection[$category] ?? []) as $item){
+
+             // slot 0 is the result's own link
+             $this->sublink_increment = 0;
+             $this->redirect_add_url($proxy, $item["url"]);
+
+             if(isset($item["sublink"])){
+
+                 foreach($item["sublink"] as $sublink){
+
+                     // slots 1..n are the sublinks
+                     $this->sublink_increment++;
+                     $this->redirect_add_url($proxy, $sublink["url"]);
+                 }
+             }
+
+             $this->handle_increment++;
+         }
+     }
+
+     do{
+         $status = curl_multi_exec($this->proc, $active);
+
+         if(
+             $active &&
+             $status == CURLM_OK
+         ){
+
+             // wait for socket activity instead of spinning the CPU
+             if(curl_multi_select($this->proc, 1.0) === -1){
+
+                 // select failed, back off briefly before polling again
+                 usleep(1000);
+             }
+         }
+
+     }while($active && $status == CURLM_OK);
+
+     //
+     // All transfers finished, collect the redirect targets
+     //
+
+     foreach($this->handles as $category => $v){
+
+         foreach($v as $index => $data){
+
+             foreach($data as $sublinkindex => $handle){
+
+                 preg_match(
+                     '/location: ?(.*)$/im',
+                     curl_multi_getcontent($handle),
+                     $location
+                 );
+
+                 if(isset($location[1])){
+
+                     if($sublinkindex === 0){
+
+                         $collection[$category][$index]["url"] = trim($location[1]);
+                     }else{
+
+                         $collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]);
+                     }
+                 }
+
+                 curl_multi_remove_handle($this->proc, $handle);
+                 curl_close($handle);
+             }
+         }
+     }
+
+     $this->handles = [];
+
+     curl_multi_close($this->proc);
+ }
+
+ /**
+  * Swaps the scraped image carousel for the richer set found on Baidu's
+  * image detail page.
+  *
+  * Runs only when the second image entry links to
+  * https://image.baidu.com/search/detail. On any failure (request error,
+  * missing script element, undecodable or empty data) $data is left as is.
+  *
+  * @param mixed $proxy Proxy descriptor forwarded to get()
+  * @param array $data Result array, "image" is replaced in place
+  * @return void
+  */
+ private function resolve_images($proxy, &$data){
+
+     // get the image viewer that contains all of the images direct URLs
+     // for some reason, getting the second image's url in the set
+     // doesnt trigger the captcha
+
+     if(
+         !isset($data["image"][1]["url"]) ||
+         preg_match(
+             '/^https:\/\/image\.baidu\.com\/search\/detail/',
+             $data["image"][1]["url"]
+         ) === 0
+     ){
+
+         // we have an already resolved image link, do nothing
+         return;
+     }
+
+     try{
+
+         $html =
+             $this->get(
+                 $proxy,
+                 $data["image"][1]["url"],
+                 []
+             );
+     }catch(Exception $error){
+
+         // fallback to the limited dataset we have
+         return;
+     }
+
+     $this->fuckhtml->load($html);
+
+     $script =
+         $this->fuckhtml
+         ->getElementById(
+             "image-detail-data",
+             "script"
+         );
+
+     if(!$script){
+
+         return;
+     }
+
+     $json =
+         json_decode(
+             $script["innerHTML"],
+             true
+         );
+
+     if(
+         !isset($json["data"]["images"]) ||
+         !is_array($json["data"]["images"]) ||
+         count($json["data"]["images"]) === 0
+     ){
+
+         // do nothing
+         return;
+     }
+
+     $images = [];
+
+     foreach($json["data"]["images"] as $image){
+
+         if(
+             !isset($image["objurl"]) ||
+             !isset($image["thumburl"])
+         ){
+
+             // entry without usable image links
+             continue;
+         }
+
+         // thumbnail dimensions are read from the w/h query parameters
+         $thumb_size = [];
+         parse_str(
+             (string)parse_url($image["thumburl"], PHP_URL_QUERY),
+             $thumb_size
+         );
+
+         $images[] = [
+             "title" =>
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $image["titleShow"] ?? ""
+                 ),
+             "source" => [
+                 [
+                     "url" => $image["objurl"],
+                     "width" => (int)($image["width"] ?? 0),
+                     "height" => (int)($image["height"] ?? 0)
+                 ],
+                 [ // thumbnail
+                     "url" => $image["thumburl"],
+                     "width" => (int)($thumb_size["w"] ?? 0),
+                     "height" => (int)($thumb_size["h"] ?? 0)
+                 ]
+             ],
+             "url" => $image["fromUrl"] ?? null
+         ];
+     }
+
+     //
+     // Discard all previously scraped images and use data
+     // from the newly downloaded image carousel
+     // the imageset !!should!! be the same
+     //
+     // (only once we know the new set is not empty)
+     if(count($images) !== 0){
+
+         $data["image"] = $images;
+     }
+ }
+
+ /**
+  * Runs a Baidu web search, or continues one from a next-page token.
+  *
+  * @param array $get Request values; reads "npt", "s", "older" and "newer"
+  * @return array Result set produced by parse_search()
+  * @throws Exception "Failed to fetch search page" when the request fails
+  */
+ public function web($get){
+
+     if($get["npt"]){
+
+         // resume: the token holds the previous request and its cookies
+         [$json, $proxy] = $this->backend->get($get["npt"], "web");
+
+         $json = json_decode($json, true);
+         $this->cookie = $json["cookie"] ?? null;
+         $npt_data = $json["req"];
+
+         $npt_data["pn"] = $npt_data["pn"] + 20;
+
+     }else{
+
+         $proxy = $this->backend->get_ip();
+
+         // fetching https://www.baidu.com first would populate
+         // $this->cookie
+         // @TODO probably not needed, requests get blocked either way
+         //$this->get($proxy, "https://www.baidu.com", []);
+
+         $npt_data = [
+             "wd" => $get["s"],
+             "rn" => 20
+         ];
+
+         // &gpc=stf%3D0%2C1752638400|stftype%3D2
+         if(
+             $get["older"] !== false ||
+             $get["newer"] !== false
+         ){
+
+             // stf is presumably START,END (the sample above starts at 0):
+             // "newer than" is the lower bound, "older than" the upper one.
+             // A missing bound falls back to 0 / the current time.
+             $range_start = $get["newer"] !== false ? $get["newer"] : 0;
+             $range_end = $get["older"] !== false ? $get["older"] : time();
+
+             $npt_data["gpc"] = "stf={$range_start},{$range_end}|stftype=2";
+         }
+     }
+
+     try{
+
+         $html = $this->get(
+             $proxy,
+             "https://www.baidu.com/s",
+             $npt_data
+         );
+     }catch(Exception $error){
+
+         throw new Exception("Failed to fetch search page");
+     }
+
+     // the first page is requested without &pn; record offset 0 so the
+     // next-page token has something to increment
+     if(!isset($npt_data["pn"])){
+
+         $npt_data["pn"] = 0;
+     }
+
+     return $this->parse_search($proxy, "web", $npt_data, $html);
+ }
+
+ /**
+  * Parses a Baidu result page into the scraper's result array.
+  *
+  * Steps, in order:
+  *  - strip CR/LF from the markup and load it into fuckhtml
+  *  - run detect_ass()
+  *  - store a next-page token when getElementsByClassName("n", "a")
+  *    finds something
+  *  - collect related searches from the element with id "rs_new"
+  *  - walk every div whose id is purely numeric and treat it as a card
+  *    ("cosc-card": short videos / image carousel), a normal result
+  *    ("result") or a special result ("result-op": video carousel,
+  *    wiki-like and rich entries)
+  *  - run resolve_urls() on "web" and "video", then resolve_images()
+  *
+  * The order of the fuckhtml->load() calls matters: lookups made with a
+  * tag name as second argument search whatever was loaded last.
+  *
+  * @param mixed $proxy Passed to backend::store(), resolve_urls() and resolve_images()
+  * @param string $pagetype Page type stored with the next-page token
+  * @param array $npt_data Request parameters saved in the next-page token
+  * @param string $html Raw response markup
+  * @return array Keys: status, spelling, npt, answer, web, image, video, news, related
+  */
+ private function parse_search($proxy, $pagetype, $npt_data, $html){
+
+ // @HACK
+ // remove newlines from the html, cause it fucks with fuckhtml
+ $html = str_replace(["\n", "\r"], "", $html);
+
+ $out = [
+ "status" => "ok",
+ "spelling" => [
+ "type" => "no_correction",
+ "using" => null,
+ "correction" => null
+ ],
+ "npt" => null,
+ "answer" => [],
+ "web" => [],
+ "image" => [],
+ "video" => [],
+ "news" => [],
+ "related" => []
+ ];
+
+ $this->fuckhtml->load($html);
+
+ $this->detect_ass();
+
+ // every div that carries an id attribute; reused for the lookups below
+ $datafields =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "id",
+ "div"
+ );
+
+ //
+ // Get next page
+ //
+ $npt =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "n",
+ "a"
+ );
+
+ if(count($npt) !== 0){
+
+ // token = request parameters + cookies, bound to the proxy in use
+ $out["npt"] =
+ $this->backend->store(
+ json_encode([
+ "req" => $npt_data,
+ "cookie" => $this->cookie
+ ]),
+ $pagetype,
+ $proxy
+ );
+ }
+
+ //
+ // Get related searches
+ //
+ $related_container =
+ $this->fuckhtml
+ ->getElementById(
+ "rs_new",
+ $datafields
+ );
+
+ if($related_container){
+
+ $this->fuckhtml->load($related_container);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "c-color-link",
+ "a"
+ );
+
+ foreach($as as $a){
+
+ // keep only the text after the first ">" when there is one
+ $text =
+ explode(
+ ">",
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ ),
+ 2
+ );
+
+ $out["related"][] = $text[count($text) - 1];
+ }
+ }
+
+ foreach($datafields as $datafield){
+
+ if(
+ !isset($datafield["attributes"]["id"]) ||
+ preg_match(
+ '/^[0-9]+$/',
+ $datafield["attributes"]["id"]
+ ) === 0
+ ){
+
+ // not a search result
+ continue;
+ }
+
+ $this->fuckhtml->load($datafield);
+ $div =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "div"
+ );
+
+ //
+ // Don't parse as a search result if it's a card
+ //
+ $card =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "cosc-card",
+ $div
+ );
+
+ if(count($card) !== 0){
+
+ //
+ // Parse chinese youtube shorts
+ //
+ $ytshorts_probe =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "tts-b-item",
+ $div
+ );
+
+ if(count($ytshorts_probe) !== 0){
+
+ $videos =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "data-show",
+ "list",
+ $div
+ );
+
+ foreach($videos as $video){
+
+ $this->fuckhtml->load($video);
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "cosc-title-slot",
+ "span"
+ );
+
+ if(count($title) === 0){
+
+ continue;
+ }
+
+ $url =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($url) === 0){
+
+ continue;
+ }
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "cos-image-body",
+ "img"
+ );
+
+ if(count($image) === 0){
+
+ $image = [
+ "ratio" => null,
+ "url" => null
+ ];
+ }else{
+
+ $image = [
+ "ratio" => "1:1",
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $image[0]["attributes"]["src"]
+ )
+ ];
+ }
+
+ // get duration
+ $divs =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "class",
+ "div"
+ );
+
+ $duration = null;
+ foreach($divs as $probe){
+
+ if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){
+
+ $duration =
+ $this->hms2int(
+ $this->fuckhtml
+ ->getTextContent(
+ $probe
+ )
+ );
+ break;
+ }
+ }
+
+ $out["video"][] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ ),
+ "description" => null,
+ "date" => null,
+ "duration" => $duration,
+ "views" => null,
+ "thumb" => $image,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $url[0]["attributes"]["href"]
+ )
+ ];
+ }
+ }
+
+ //
+ // Parse image carousel
+ //
+ $is_image_carousel = false;
+ foreach($div as $d){
+
+ if(
+ isset($d["attributes"]["class"]) &&
+ strpos($d["attributes"]["class"], "image-container") !== false
+ ){
+
+ $is_image_carousel = true;
+ break;
+ }
+ }
+
+ if($is_image_carousel){
+
+ // the carousel data sits in a <!--s-data:...--> comment
+ preg_match(
+ '/<!--s-data:([\S\s]*)-->/U',
+ $datafield["innerHTML"],
+ $matches
+ );
+
+ if(isset($matches[1])){
+
+ // weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong
+ $json =
+ $this->fuckhtml
+ ->parseJsObject(
+ str_replace(
+ "-\-",
+ "--",
+ $matches[1]
+ )
+ );
+
+ if(
+ $json !== null &&
+ isset($json["imageList"][0]["images"])
+ ){
+
+ // parse image carousel
+ foreach($json["imageList"][0]["images"] as $image){
+
+ // NOTE(review): assumes thumburl has a query string with w and h
+ parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
+
+ $out["image"][] = [
+ "title" => "image",
+ "source" => [
+ [
+ "url" => $image["objurl"],
+ "width" => (int)$image["width"],
+ "height" => (int)$image["height"]
+ ],
+ [ // thumbnail
+ "url" => $image["thumburl"],
+ "width" => (int)$thumb_size["w"],
+ "height" => (int)$thumb_size["h"]
+ ]
+ ],
+ "url" => $image["jumpUrl"]
+ ];
+ }
+ }
+ }
+ }
+ continue;
+ }
+
+ if(!isset($datafield["attributes"]["mu"])){
+
+ // dont scrape if we dont have the direct link
+ continue;
+ }
+
+ // class:FYB_RD -> News garbage, IGNORE
+
+ $result =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "result",
+ [$datafield]
+ );
+
+ if(count($result) !== 0){
+
+ //
+ // Parse normal search result
+ //
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "sc-link",
+ "a"
+ );
+
+ if(count($title) === 0){
+
+ // should not happen
+ continue;
+ }
+
+ $title =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ );
+
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "c-color",
+ $div
+ );
+
+ if(count($description) !== 0){
+
+ $this->fuckhtml->load($description[0]);
+
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeName(
+ "class",
+ "span"
+ );
+
+ $found_desc = false;
+ foreach($description as $desc){
+
+ if(stripos($desc["attributes"]["class"], "summary-text") !== false){
+
+ $found_desc = true;
+ $description =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $desc
+ )
+ );
+ break;
+ }
+ }
+
+ if($found_desc === false){
+
+ $description = null;
+ }
+
+ // switch back to the whole result for the lookups below
+ $this->fuckhtml->load($datafield);
+ }else{
+
+ $description = null;
+ }
+
+ // parse date
+ $date_probe =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "cos-color-text-minor",
+ "span"
+ );
+
+ if(count($date_probe) !== 0){
+
+ $date =
+ $this->parse_time(
+ $this->fuckhtml
+ ->getTextContent(
+ $date_probe[0]
+ )
+ );
+ }else{
+
+ $date = null;
+ }
+
+ // parse image
+ $img =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(count($img) !== 0){
+
+ $image = [
+ "ratio" => "16:9",
+ "url" =>
+ $this->unfuckthumb(
+ $this->fuckhtml
+ ->getTextContent(
+ $img[0]["attributes"]["src"]
+ )
+ )
+ ];
+ }else{
+
+ $image = [
+ "ratio" => null,
+ "url" => null
+ ];
+ }
+
+ // get page type
+ $pagetype_probe =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "b"
+ );
+
+ // NOTE(review): this overwrites the $pagetype parameter; the next-page
+ // token above was already stored with the original value. The text of
+ // the last <b> element wins when there are several.
+ $pagetype = "web";
+ foreach($pagetype_probe as $probe){
+
+ $pagetype =
+ strtolower(
+ trim(
+ $this->fuckhtml
+ ->getTextContent(
+ $probe
+ ),
+ " 【】"
+ )
+ );
+ }
+
+ // get extra links
+ $sublinks = [];
+
+ foreach($div as $d){
+
+ if(
+ isset($d["attributes"]["class"]) &&
+ strpos($d["attributes"]["class"], "exta-link") !== false
+ ){
+
+ $this->fuckhtml->load($d);
+
+ $links =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "cos-space-mt-xs",
+ "div"
+ );
+
+ foreach($links as $link){
+
+ $this->fuckhtml->load($link);
+ $s_title =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "h3"
+ );
+
+ if(count($s_title) === 0){
+
+ // should not happen
+ continue;
+ }
+
+ // the link is JSON nested in JSON: data-click -> clk_info -> url
+ // NOTE(review): assumes the h3 carries a data-click attribute
+ $data2 =
+ json_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $s_title[0]["attributes"]["data-click"]
+ ),
+ true
+ );
+
+ if(!isset($data2["clk_info"])){
+
+ // wtf
+ continue;
+ }
+
+ $data2 =
+ json_decode(
+ $data2["clk_info"],
+ true
+ );
+
+ if(!isset($data2["url"])){
+
+ // no link, fuck off
+ continue;
+ }
+
+ $url =
+ rawurldecode(
+ $data2["url"]
+ );
+
+ $data =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "p"
+ );
+
+ $s_description = null;
+
+ if(count($data) !== 0){
+
+ // NOTE(review): assumes the p carries a sub-show-log attribute
+ $data =
+ json_decode(
+ $this->fuckhtml
+ ->getTextContent(
+ $data[0]["attributes"]["sub-show-log"]
+ ),
+ true
+ );
+
+ if(isset($data["ext"]["content"])){
+
+ $s_description = $data["ext"]["content"];
+ }
+ }
+
+ $sublinks[] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $s_title[0]
+ ),
+ "description" => $s_description,
+ "url" => $url,
+ "date" => null
+ ];
+ }
+ break;
+ }
+ }
+
+ $out["web"][] = [
+ "title" => $title,
+ "description" => $description,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $datafield["attributes"]["mu"]
+ ),
+ "date" => $date,
+ "type" => $pagetype,
+ "thumb" => $image,
+ "sublink" => $sublinks,
+ "table" => []
+ ];
+
+ continue;
+ }
+
+ // parse special result
+ $result =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "result-op",
+ [$datafield]
+ );
+
+ if(count($result) !== 0){
+
+ //
+ // Parse video carousel
+ //
+ if(
+ isset($datafield["attributes"]["tpl"]) &&
+ stripos($datafield["attributes"]["tpl"], "video") !== false
+ ){
+
+ preg_match(
+ '/<!--s-data:([\S\s]*)-->/U',
+ $datafield["innerHTML"],
+ $matches
+ );
+
+ if(isset($matches[1])){
+
+ $json =
+ json_decode(
+ $matches[1],
+ true
+ );
+
+ if($json !== null){
+
+ // NOTE(review): assumes "videoList" and the per-video keys exist
+ foreach($json["videoList"] as $video){
+
+ $out["video"][] = [
+ "title" => $video["title"],
+ "description" =>
+ $this->titledots(
+ $video["desc"]
+ ),
+ "date" =>
+ $this->parse_time(
+ $video["pubTime"]
+ ),
+ "duration" =>
+ $this->hms2int(
+ $video["duration"]
+ ),
+ "views" =>
+ $this->parse_viewcount(
+ $video["playCount"]
+ ),
+ "thumb" => [
+ "ratio" => "16:9",
+ "url" => $video["poster"]
+ ],
+ "url" => $video["bindProps"]["link"]
+ ];
+ }
+ }
+ }
+ continue;
+ }
+
+ //
+ // Special result div (wiki entries, rich divs)
+ //
+ $title =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "h3"
+ );
+
+ if(count($title) === 0){
+
+ // should have a title somewhere
+ continue;
+ }
+
+ // keep only the text after the first ">" when there is one
+ $title =
+ explode(
+ ">",
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ ),
+ 2
+ );
+
+ if(count($title) === 2){
+
+ $title = $title[1];
+ }else{
+
+ $title = $title[0];
+ }
+
+ // description fallbacks, tried in this order: p.sc-paragraph,
+ // p.c-color-gray2, div.c-color-text aria-label, span.c-font-normal
+ // probe for wiki-like entry
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "sc-paragraph",
+ "p"
+ );
+
+ if(count($description) === 0){
+
+ // try and get grey description
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "c-color-gray2",
+ "p"
+ );
+
+ if(count($description) === 0){
+
+ // probe for special social media description
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "c-color-text",
+ "div"
+ );
+
+ if(isset($description[0]["attributes"]["aria-label"])){
+
+ $description =
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ ["attributes"]
+ ["aria-label"]
+ );
+ }else{
+
+ // check for news tab description
+ $span =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "c-font-normal",
+ "span"
+ );
+
+ $description = null;
+
+ foreach($span as $s){
+
+ if(isset($s["attributes"]["aria-label"])){
+
+ // NOTE(review): reads the last span of the list, not the
+ // matching $s -- confirm this is intended
+ $description =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $span[count($span) - 1]
+ )
+ );
+
+ break;
+ }
+ }
+ }
+ }else{
+
+ $description =
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ );
+ }
+
+ }else{
+
+ // wiki-like entry: text sits between s-text markers of the last paragraph
+ preg_match(
+ '/<!--s-text-->([\S\s]*)<!--\/s-text-->/U',
+ $description[count($description) - 1]["innerHTML"],
+ $matches
+ );
+
+ if(isset($matches[1])){
+
+ $description =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $matches[1]
+ )
+ );
+ }else{
+
+ $description = null;
+ }
+ }
+
+ // get thumbnail
+ $thumb =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ if(count($thumb) !== 0){
+
+ $thumb = [
+ "ratio" => "1:1",
+ "url" =>
+ $this->unfuckthumb(
+ $this->fuckhtml
+ ->getTextContent(
+ $thumb[0]["attributes"]["src"]
+ )
+ )
+ ];
+ }else{
+
+ $thumb = [
+ "ratio" => null,
+ "url" => null
+ ];
+ }
+
+ // get sublinks
+ preg_match(
+ '/<!--s-data:([\S\s]*)-->/U',
+ $datafield["innerHTML"],
+ $matches
+ );
+
+ $sublinks = [];
+
+ if(isset($matches[1])){
+
+ $json =
+ json_decode(
+ $matches[1],
+ true
+ );
+
+ if($json !== null){
+
+ if(isset($json["buttons"])){
+
+ foreach($json["buttons"] as $button){
+
+ $sublinks[] = [
+ "title" => $button["text"],
+ "description" => null,
+ "date" => null,
+ "url" => $button["url"]
+ ];
+ }
+ }elseif(isset($json["mthreadList"])){
+
+ foreach($json["mthreadList"] as $thread){
+
+ $sublinks[] = [
+ "title" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $thread["title"]
+ ),
+ "description" => null,
+ "date" => null,
+ "url" => $thread["ttsInfo"]["titleUrl"]
+ ];
+ }
+ }
+ }
+ }
+
+ // get URL
+ // handle http://fakeurl.baidu.com bullshit
+ $url =
+ $this->fuckhtml
+ ->getTextContent(
+ $datafield["attributes"]["mu"]
+ );
+
+ if(
+ preg_match(
+ '/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/',
+ $url
+ )
+ ){
+
+ // we got some bullshit, get jumpUrl instead
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($as) !== 0){
+
+ $url =
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]["attributes"]["href"]
+ );
+ }
+ }
+
+ // get xueshu sublinks
+ // get list
+ $xueshu_list =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "op-xueshu-links-d20-list",
+ $div
+ );
+
+ if(count($xueshu_list) !== 0){
+
+ $this->fuckhtml->load($xueshu_list[0]);
+
+ $rows =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "c-row",
+ "div"
+ );
+
+ // remove "read more" bullshit
+ foreach($rows as $row){
+
+ if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
+
+ $xueshu_list[0]["innerHTML"] =
+ str_replace(
+ $row["outerHTML"],
+ "",
+ $xueshu_list[0]["innerHTML"]
+ );
+ }
+ }
+
+ $this->fuckhtml->load($xueshu_list[0]);
+
+ foreach($rows as $row){
+
+ $this->fuckhtml->load($row);
+
+ if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
+
+ continue;
+ }
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ foreach($as as $a){
+
+ $sublinks[] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $a
+ )
+ ),
+ "description" => null,
+ "date" => null,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]["href"]
+ )
+ ];
+ }
+ }
+ }
+
+ $out["web"][] = [
+ "title" => $title,
+ "description" => $description,
+ "url" => $url,
+ "date" => null,
+ "type" => "web",
+ "thumb" => $thumb,
+ "sublink" => $sublinks,
+ "table" => []
+ ];
+ continue;
+ }
+ }
+
+ //
+ // Remove tracking URLs and fetch additional image resources
+ //
+ $this->resolve_urls($proxy, $out, ["web", "video"]);
+ $this->resolve_images($proxy, $out);
+
+ return $out;
+ }
+
+ /**
+  * Runs a Baidu image search, or continues one from a next-page token.
+  *
+  * @param array $get Request values; reads "npt", "s", "sort", "size",
+  *                   "ratio", "format", "color" and "type"
+  * @return array Keys: status, npt, image
+  * @throws Exception when the request fails, the response is not JSON,
+  *                   Baidu reports an error or no image list is returned
+  */
+ public function image($get){
+
+     // https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1
+
+     if($get["npt"]){
+
+         [$params, $proxy] = $this->backend->get($get["npt"], "images");
+         $params = json_decode($params, true);
+
+         $params["pn"] = $params["pn"] + 60;
+
+     }else{
+
+         $proxy = $this->backend->get_ip();
+         $params = [
+             "word" => $get["s"],
+             "rn" => 60, // results/page
+             "pn" => 0, // item increment (0 * 60)
+             "newReq" => 1 // the JSON comes back malformed without it
+         ];
+
+         switch($get["sort"]){
+
+             case "latest": $params["latest"] = 1; break;
+             case "hot": $params["hot"] = 1; break;
+         }
+
+         if($get["size"] != "any"){
+
+             $params["z"] = $get["size"];
+         }
+
+         if($get["ratio"] != "any"){
+
+             $params["imgratio"] = $get["ratio"];
+         }
+
+         if($get["format"] != "any"){
+
+             $params["imgformat"] = $get["format"];
+         }
+
+         if($get["color"] != "any"){
+
+             $params["ic"] = $get["color"];
+         }
+
+         switch($get["type"]){
+
+             case "hd": $params["hd"] = 1; break;
+             case "isImgSet": $params["isImgSet"] = 1; break;
+             case "copyright": $params["copyright"] = 1; break;
+         }
+     }
+
+     // use the query stored with the request, so paginated calls send
+     // the same Referer as the first page
+     $query = $params["word"] ?? $get["s"];
+
+     try{
+
+         $response =
+             $this->get(
+                 $proxy,
+                 "https://image.baidu.com/search/acjson",
+                 $params,
+                 "https://image.baidu.com/search/index?tn=baiduimage&word=" . urlencode($query)
+             );
+     }catch(Exception $error){
+
+         throw new Exception("Failed to fetch JSON");
+     }
+
+     $json = json_decode($response, true);
+
+     if($json === null){
+
+         // detect captcha first: inspect the raw response, not the
+         // failed decode result
+         $this->fuckhtml->load($response);
+         $this->detect_ass();
+
+         // fallback to json decode error
+         throw new Exception("Failed to decode JSON");
+     }
+
+     if(
+         isset($json["message"]) &&
+         $json["message"] != "success"
+     ){
+
+         throw new Exception("Baidu returned an error: {$json["message"]}");
+     }
+
+     if(!isset($json["data"]["images"])){
+
+         throw new Exception("Baidu did not return an image object");
+     }
+
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "image" => []
+     ];
+
+     foreach($json["data"]["images"] as $image){
+
+         if(
+             !isset($image["objurl"]) ||
+             !isset($image["thumburl"])
+         ){
+
+             // entry without usable image links
+             continue;
+         }
+
+         // thumbnail dimensions are read from the w/h query parameters
+         $thumb_size = [];
+         parse_str(
+             (string)parse_url($image["thumburl"], PHP_URL_QUERY),
+             $thumb_size
+         );
+
+         $out["image"][] = [
+             "title" =>
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $image["titleShow"] ?? ""
+                 ),
+             "source" => [
+                 [
+                     "url" => $image["objurl"],
+                     "width" => (int)($image["width"] ?? 0),
+                     "height" => (int)($image["height"] ?? 0)
+                 ],
+                 [ // thumbnail
+                     "url" => $image["thumburl"],
+                     "width" => (int)($thumb_size["w"] ?? 0),
+                     "height" => (int)($thumb_size["h"] ?? 0)
+                 ]
+             ],
+             "url" => $image["fromUrl"] ?? null
+         ];
+     }
+
+     //
+     // Detect if there's a next page
+     //
+     if((int)($json["data"]["totalNum"] ?? 0) >= $params["pn"] + 60){
+
+         $out["npt"] =
+             $this->backend->store(
+                 json_encode($params),
+                 "images",
+                 $proxy
+             );
+     }
+
+     return $out;
+ }
+
+ /**
+  * Runs a Baidu video search, or continues one from a next-page token.
+  *
+  * The response is split on "<script>"; each chunk that carries a link
+  * in an HTML comment and a video-title anchor becomes one result.
+  *
+  * @param array $get Request values; reads "npt" and "s"
+  * @return array Keys: status, npt, video, author, livestream, playlist, reel
+  * @throws Exception "Failed to get search page" when the request fails
+  */
+ public function video($get){
+
+     // https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=jak%2Band%2Bdaxter&async=1&pn=0
+     // &pn is advanced by 10 per page below, matching the
+     // 10-results-per-page check at the end of this method
+
+     if($get["npt"]){
+
+         [$params, $proxy] = $this->backend->get($get["npt"], "videos");
+         $params = json_decode($params, true);
+
+         $params["pn"] = $params["pn"] + 10;
+     }else{
+
+         $proxy = $this->backend->get_ip();
+         $params = [
+             "pd" => "video",
+             "tn" => "vsearch",
+             "wd" => $get["s"],
+             "async" => 1,
+             "pn" => 0
+         ];
+     }
+
+     try{
+         $html =
+             $this->get(
+                 $proxy,
+                 "https://www.baidu.com/sf/vsearch",
+                 $params
+             );
+     }catch(Exception $error){
+
+         throw new Exception("Failed to get search page");
+     }
+
+     $html =
+         str_replace(
+             ["\r", "\n"],
+             "",
+             $html
+         );
+
+     $out = [
+         "status" => "ok",
+         "npt" => null,
+         "video" => [],
+         "author" => [],
+         "livestream" => [],
+         "playlist" => [],
+         "reel" => []
+     ];
+
+     $html = explode("<script>", $html);
+
+     foreach($html as $result){
+
+         $result = trim($result);
+
+         $this->fuckhtml->load($result);
+
+         // get URL: it is carried in an HTML comment of the chunk
+         preg_match(
+             '/<!-- *([^ ]*) *-->/',
+             $result,
+             $matches
+         );
+
+         if(!isset($matches[1])){
+
+             // no link, give up
+             continue;
+         }
+
+         $link = $matches[1];
+
+         // get title
+         $title =
+             $this->fuckhtml
+             ->getElementsByClassName(
+                 "video-title",
+                 "a"
+             );
+
+         if(count($title) === 0){
+
+             // should not happen
+             continue;
+         }
+
+         $title =
+             $this->fuckhtml
+             ->getTextContent(
+                 $title[0]
+             );
+
+         // get thumbnail
+         $img =
+             $this->fuckhtml
+             ->getElementsByClassName(
+                 "border-radius",
+                 "img"
+             );
+
+         if(count($img) !== 0){
+
+             $thumb = [
+                 "url" =>
+                     $this->unfuckthumb(
+                         $this->fuckhtml
+                         ->getTextContent(
+                             $img[0]["attributes"]["src"]
+                         )
+                     ),
+                 "ratio" => "16:9"
+             ];
+         }else{
+
+             $thumb = [
+                 "url" => null,
+                 "ratio" => null
+             ];
+         }
+
+         $span =
+             $this->fuckhtml
+             ->getElementsByTagName(
+                 "span"
+             );
+
+         // get duration
+         $duration =
+             $this->fuckhtml
+             ->getElementsByClassName(
+                 "video_play_timer",
+                 $span
+             );
+
+         if(count($duration) !== 0){
+
+             $duration =
+                 $this->hms2int(
+                     $this->fuckhtml
+                     ->getTextContent(
+                         $duration[0]
+                     )
+                 );
+         }else{
+
+             $duration = null;
+         }
+
+         // get author
+         // 来源:哔哩哔哩
+         $author =
+             $this->fuckhtml
+             ->getElementsByClassName(
+                 "wetSource",
+                 $span
+             );
+
+         if(count($author) !== 0){
+
+             $author =
+                 explode(
+                     ":",
+                     $this->fuckhtml
+                     ->getTextContent(
+                         $author[0]
+                     ),
+                     2
+                 );
+
+             // a label without separator carries no usable name
+             $author = $author[1] ?? null;
+         }else{
+
+             $author = null;
+         }
+
+         // get date posted
+         //发布时间:2024-05-06
+
+         // AND get description
+         // 简介:Our first look
+         $infospans =
+             array_merge(
+                 $this->fuckhtml
+                 ->getElementsByClassName(
+                     "c-font-normal",
+                     $span
+                 ),
+                 $this->fuckhtml
+                 ->getElementsByClassName(
+                     "c-font-normal",
+                     "div"
+                 )
+             );
+
+         $date = null;
+         $description = null;
+
+         foreach($infospans as $infospan){
+
+             // split "label:value"
+             $infospan =
+                 explode(
+                     ":",
+                     $this->fuckhtml
+                     ->getTextContent(
+                         $infospan
+                     ),
+                     2
+                 );
+
+             if(count($infospan) !== 2){
+
+                 // should not happen
+                 continue;
+             }
+
+             $infospan[1] =
+                 $this->fuckhtml
+                 ->getTextContent(
+                     $infospan[1]
+                 );
+
+             switch($infospan[0]){
+
+                 case "发布时间": // date posted
+                     $date = $this->parse_time($infospan[1]);
+                     break;
+
+                 case "简介": // description
+                     $description = $infospan[1];
+                     break;
+             }
+         }
+
+         $out["video"][] = [
+             "title" => $this->titledots($title),
+             "description" => $this->titledots($description),
+             "author" => [
+                 "name" => $author,
+                 "url" => null,
+                 "avatar" => null
+             ],
+             "date" => $date,
+             "duration" => $duration,
+             "views" => null,
+             "thumb" => $thumb,
+             "url" => $link
+         ];
+     }
+
+     if(count($out["video"]) === 10){
+
+         // a full page: assume there's another page after this
+         $out["npt"] =
+             $this->backend->store(
+                 json_encode($params),
+                 "videos",
+                 $proxy
+             );
+     }
+
+     return $out;
+ }
+
+ /**
+  * Scrape one page of Baidu news results.
+  *
+  * Reads $get["npt"] (next-page token, falsy on a first page) and
+  * $get["s"] (search terms, only read on a first page).
+  *
+  * @param array $get request parameters
+  * @return array "status", "npt" and a "news" list built from the "web"
+  *               entries returned by parse_search()
+  * @throws Exception when the search page cannot be fetched
+  */
+ public function news($get){
+
+     $first_page = !$get["npt"];
+
+     if($first_page){
+
+         // fresh search: pick a proxy and build the initial request
+         $proxy = $this->backend->get_ip();
+
+         $npt_data = [
+             "wd" => $get["s"],
+             "rn" => 20,
+             "tn" => "news"
+         ];
+
+         // @TODO add filters
+     }else{
+
+         // resume from the stored token: restore cookie + request params
+         [$stored, $proxy] = $this->backend->get($get["npt"], "news");
+
+         $stored = json_decode($stored, true);
+         $this->cookie = $stored["cookie"];
+         $npt_data = $stored["req"];
+
+         // move the result offset forward by one page
+         $npt_data["pn"] += 20;
+     }
+
+     try{
+
+         $html =
+             $this->get(
+                 $proxy,
+                 "https://www.baidu.com/s",
+                 $npt_data
+             );
+     }catch(Exception $error){
+
+         throw new Exception("Failed to fetch search page");
+     }
+
+     if($first_page){
+
+         // the first request is sent without an offset; it is only
+         // recorded afterwards, before the params go to parse_search()
+         $npt_data["pn"] = 0;
+     }
+
+     $data = $this->parse_search($proxy, "news", $npt_data, $html);
+
+     // reshape the generic "web" entries into news entries
+     $news = [];
+
+     foreach($data["web"] as $article){
+
+         $thumb_url = $article["thumb"]["url"];
+
+         $news[] = [
+             "title" => $article["title"],
+             "author" => null,
+             "description" => $article["description"],
+             "date" => $article["date"],
+             "thumb" => [
+                 "url" => $thumb_url,
+                 "ratio" => $thumb_url === null ? null : "16:9",
+             ],
+             "url" => $article["url"]
+         ];
+     }
+
+     return [
+         "status" => "ok",
+         "npt" => $data["npt"],
+         "news" => $news
+     ];
+ }
+
+ /**
+  * Clean up a Baidu thumbnail URL.
+  *
+  * - gimg*.baidu.com URLs: return the URL-decoded value of the "src="
+  *   parameter (the image the proxy URL wraps).
+  * - Any other URL containing "&": everything after the first "&" is
+  *   parsed as a query string, "size" is dropped, "q" (when present) is
+  *   set to "90" (presumably to request a larger, higher quality image),
+  *   and the URL is rebuilt.
+  * - Anything else is returned untouched.
+  *
+  * @param string $url thumbnail URL as scraped
+  * @return string rewritten URL, or $url itself when it cannot be parsed
+  */
+ private function unfuckthumb($url){
+
+     // probe for proxy URL
+     if(
+         preg_match(
+             '/^https?:\/\/gimg(?:[0-9]+)?\.baidu\.com/',
+             $url
+         )
+     ){
+
+         $parts = explode("src=", $url);
+         if(count($parts) !== 2){
+
+             // no (or more than one) src= parameter, leave it alone
+             return $url;
+         }
+
+         return urldecode(explode("&", $parts[1])[0]);
+     }
+
+     // these URLs carry their parameters after the first "&", not after "?"
+     $q = explode("&", $url, 2);
+
+     if(count($q) !== 2){
+
+         // nothing to rewrite
+         return $url;
+     }
+
+     // the parameters do not follow the spec, a stray "?" can show up
+     // in the middle of them:
+     // &fmt=auto?s=BB32F3A050471AEC72886934030090C4&sec=1753203600&t=0fb2194775d3bd3d1bb114b818479e0a
+     parse_str(str_replace("?", "&", $q[1]), $query);
+
+     unset($query["size"]);
+     if(isset($query["q"])){ $query["q"] = "90"; }
+
+     // glue the untouched prefix back on instead of str_replace()-ing the
+     // old tail: str_replace() matches the first occurrence anywhere in
+     // the URL, which is the wrong span when the tail repeats earlier on
+     return $q[0] . "&" . http_build_query($query);
+ }
+
+ /**
+  * Strip leading/trailing whitespace, NUL bytes, dots and ellipsis
+  * characters ("…") from a title or description.
+  *
+  * trim() cannot be used for this: it treats its character list as
+  * single bytes, so the 3-byte "…" would strip the bytes E2, 80 and A6
+  * individually and cut multi-byte characters in half (eg. a title
+  * ending in "一", which is E4 B8 80).
+  *
+  * @param string|null $title text to clean; null is treated as ""
+  * @return string cleaned text
+  */
+ private function titledots($title){
+
+     // callers pass null for missing fields
+     $title = (string)$title;
+
+     $trimmed =
+         preg_replace(
+             '/^[ .\t\n\r\0\x0B…]+|[ .\t\n\r\0\x0B…]+\z/u',
+             "",
+             $title
+         );
+
+     if($trimmed === null){
+
+         // not valid UTF-8, fall back to a byte trim of the ASCII set
+         return trim($title, " .\t\n\r\0\x0B");
+     }
+
+     return $trimmed;
+ }
+
+ /**
+  * Convert a "h:m:s", "m:s" or "s" duration string to seconds.
+  *
+  * The string is split on ":" into at most 3 fields; each field is
+  * cast to int, so non-numeric fields count as 0.
+  *
+  * @param string $time duration string
+  * @return int duration in seconds
+  */
+ private function hms2int($time){
+
+     $seconds = 0;
+
+     // fold left to right in base 60: every further field promotes
+     // what was read so far by one unit (s -> m -> h)
+     foreach(explode(":", $time, 3) as $field){
+
+         $seconds = ($seconds * 60) + (int)$field;
+     }
+
+     return $seconds;
+ }
+
+ /**
+  * Parse a Chinese view counter into an integer.
+  *
+  * Recognised forms:
+  *   "1.2亿次" => 120000000 (亿 = 100,000,000)
+  *   "1.5万次" => 15000     (万 = 10,000)
+  *   "345次"   => 345
+  *
+  * @param string $views counter text
+  * @return int|null view count, or null when no counter is found
+  */
+ private function parse_viewcount($views){
+
+     $units = [
+         "亿次" => 100000000,
+         "万次" => 10000
+     ];
+
+     foreach($units as $suffix => $multiplier){
+
+         if(
+             // allow a decimal part: a digits-only pattern reads
+             // "1.5万次" as "5万次"
+             preg_match(
+                 '/([0-9]+(?:\.[0-9]+)?)' . $suffix . '/',
+                 $views,
+                 $matches
+             )
+         ){
+
+             return (int)round((float)$matches[1] * $multiplier);
+         }
+     }
+
+     if(
+         // units
+         preg_match(
+             '/([0-9]+)次/',
+             $views,
+             $matches
+         )
+     ){
+
+         return (int)$matches[1];
+     }
+
+     return null;
+ }
+
+ /**
+  * Parse a Baidu date string into a unix timestamp.
+  *
+  * Recognised forms:
+  *   2023年8月7日 => yyyy/m/d
+  *   昨天11:45    => yesterday at 11:45
+  *   昨天         => yesterday
+  *   10分钟前     => 10 minutes ago
+  *   5小时前      => 5 hours ago
+  *   3天前        => 3 days ago
+  *   2周前        => 2 weeks ago
+  *   1个月前      => 1 month ago
+  * Anything else is handed to strtotime() as-is.
+  *
+  * @param string $time date text
+  * @return int|null timestamp, or null when the text cannot be parsed
+  */
+ private function parse_time($time){
+
+     // english expression for strtotime(), null = none matched
+     $expression = null;
+
+     if(
+         preg_match(
+             '/([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日/',
+             $time,
+             $matches
+         )
+     ){
+
+         $expression = "{$matches[1]}/{$matches[2]}/{$matches[3]}";
+
+     }elseif(
+         preg_match(
+             '/昨天(.*)/',
+             $time,
+             $matches
+         )
+     ){
+
+         $expression = "Yesterday {$matches[1]}";
+
+     }else{
+
+         // "<n><unit>前" => "<n> <unit> ago"
+         $units = [
+             "分钟前" => "minutes",
+             "小时前" => "hours",
+             "天前" => "days",
+             "周前" => "weeks",
+             "个月前" => "months"
+         ];
+
+         foreach($units as $suffix => $unit){
+
+             if(
+                 preg_match(
+                     '/([0-9]{1,4})' . $suffix . '/',
+                     $time,
+                     $matches
+                 )
+             ){
+
+                 $expression = "{$matches[1]} {$unit} ago";
+                 break;
+             }
+         }
+     }
+
+     // no pattern matched: attempt to parse as-is
+     $stamp = strtotime($expression === null ? $time : $expression);
+
+     // strtotime() signals failure with false; never leak that to the
+     // caller, every failure is reported as null
+     return $stamp === false ? null : $stamp;
+ }
+
+ /**
+  * Detect Baidu's captcha page.
+  *
+  * Looks at the <a> elements of whatever document $this->fuckhtml
+  * currently holds (presumably loaded by the caller beforehand) and
+  * throws when there are none at all, or when the first one links to
+  * wappass.baidu.com/static/captcha.
+  *
+  * @return void
+  * @throws Exception when the page looks like a captcha
+  */
+ private function detect_ass(){
+
+     $anchors =
+         $this->fuckhtml
+         ->getElementsByTagName(
+             "a"
+         );
+
+     if(count($anchors) === 0){
+
+         throw new Exception("Baidu returned a Captcha");
+     }
+
+     if(!isset($anchors[0]["attributes"]["href"])){
+
+         // first anchor has no href, so it cannot be the captcha link;
+         // bail out instead of reading an undefined key
+         return;
+     }
+
+     $href =
+         $this->fuckhtml
+         ->getTextContent(
+             $anchors[0]["attributes"]["href"]
+         );
+
+     if(
+         preg_match(
+             '/^https?:\/\/wappass\.baidu\.com\/static\/captcha/',
+             $href
+         )
+     ){
+
+         throw new Exception("Baidu returned a Captcha");
+     }
+ }
+}