aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/coccoc.php
diff options
context:
space:
mode:
author: lolcat <will@lolcat.ca>, 2025-08-11 01:55:15 +0000
committer: lolcat <will@lolcat.ca>, 2025-08-11 01:55:15 +0000
commit: cdf958d29333d448f4521f4d2faa2592b58e9b27 (patch)
tree: 528f2a0ffa789a6f4279d9f54a4a2aaf391f390f /scraper/coccoc.php
downloadshittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.gz
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.tar.bz2
shittyweb-search-cdf958d29333d448f4521f4d2faa2592b58e9b27.zip
fix wikipedia crash (grafted)
Diffstat (limited to 'scraper/coccoc.php')
-rw-r--r--scraper/coccoc.php672
1 files changed, 672 insertions, 0 deletions
diff --git a/scraper/coccoc.php b/scraper/coccoc.php
new file mode 100644
index 0000000..fd09556
--- /dev/null
+++ b/scraper/coccoc.php
@@ -0,0 +1,672 @@
+<?php
+
+class coccoc{
+
+ /**
+  * Load the helper libraries and create the helper objects used by
+  * the other methods of this class.
+  *
+  * Includes lib/backend.php and lib/fuckhtml.php, then stores a
+  * backend instance (constructed with the string "coccoc") and a
+  * fuckhtml instance on the object.
+  */
+ public function __construct(){
+
+ // backend is used in this class for assign_proxy(), get_ip(), get() and store()
+ include "lib/backend.php";
+ $this->backend = new backend("coccoc");
+
+ // fuckhtml is used in this class for extract_json() and getTextContent()
+ include "lib/fuckhtml.php";
+ $this->fuckhtml = new fuckhtml();
+ }
+
+
+	/**
+	 * Perform an HTTP GET request with cURL and return the response body.
+	 *
+	 * @param mixed  $proxy Passed unchanged to backend->assign_proxy()
+	 * @param string $url   Target URL without a query string
+	 * @param array  $get   Query parameters; appended via http_build_query() when not empty
+	 * @return string|bool  The value returned by curl_exec()
+	 * @throws Exception    With the curl_error() message when cURL reports an error
+	 */
+	private function get($proxy, $url, $get = []){
+
+		$handle = curl_init();
+
+		// only append a query string when there are parameters to send
+		$target = $url;
+
+		if($get !== []){
+
+			$target = $url . "?" . http_build_query($get);
+		}
+
+		// options that are applied before the proxy gets assigned
+		curl_setopt_array(
+			$handle,
+			[
+				CURLOPT_URL => $target,
+
+				// http2 bypass
+				CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_2_0,
+
+				CURLOPT_HTTPHEADER => [
+					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+					"Accept-Language: en-US,en;q=0.5",
+					"Accept-Encoding: gzip, deflate, br, zstd",
+					"DNT: 1",
+					"Sec-GPC: 1",
+					"Connection: keep-alive",
+					//"Cookie: _contentAB_15040_vi=V-06_01; split_test_search=new_search; uid=L_bauXyZBY1B; vid=uCVQJQSTgb9QGT3o; ls=1753742684; serp_version=29223843/7621a70; savedS=direct",
+					"Upgrade-Insecure-Requests: 1",
+					"Sec-Fetch-Dest: document",
+					"Sec-Fetch-Mode: navigate",
+					"Sec-Fetch-Site: cross-site",
+					"Priority: u=0, i"
+				]
+			]
+		);
+
+		$this->backend->assign_proxy($handle, $proxy);
+
+		// options that are applied after the proxy has been assigned
+		curl_setopt_array(
+			$handle,
+			[
+				CURLOPT_ENCODING => "", // default encoding
+				CURLOPT_RETURNTRANSFER => true,
+				CURLOPT_SSL_VERIFYHOST => 2,
+				CURLOPT_SSL_VERIFYPEER => true,
+				CURLOPT_CONNECTTIMEOUT => 30,
+				CURLOPT_TIMEOUT => 30
+			]
+		);
+
+		$body = curl_exec($handle);
+
+		// surface transport errors to the caller
+		if(curl_errno($handle)){
+
+			throw new Exception(curl_error($handle));
+		}
+
+		curl_close($handle);
+
+		return $body;
+	}
+
+	/**
+	 * Describe the search filters this scraper offers.
+	 *
+	 * @param mixed $pagetype Not used; the same filters are returned for every value
+	 * @return array Filter definitions keyed by "nsfw", "time" and "filter",
+	 *               each holding a "display" label and an "option" map
+	 */
+	public function getfilters($pagetype){
+
+		$nsfw = [
+			"display" => "NSFW",
+			"option" => [
+				"yes" => "Yes", // nsfw by default????
+				"no" => "No" // &safe=1
+			]
+		];
+
+		$time = [
+			"display" => "Time posted",
+			"option" => [
+				"any" => "Any time",
+				"1w" => "1 week ago",
+				"2w" => "2 weeks ago",
+				"1m" => "1 month ago",
+				"3m" => "3 months ago",
+				"6m" => "6 months ago",
+				"1Y" => "1 year ago"
+			]
+		];
+
+		$dedupe = [
+			"display" => "Remove duplicates",
+			"option" => [
+				"no" => "No",
+				"yes" => "Yes" // &filter=0
+			]
+		];
+
+		return [
+			"nsfw" => $nsfw,
+			"time" => $time,
+			"filter" => $dedupe
+		];
+	}
+
+	/**
+	 * Scrape a Coc Coc web search results page.
+	 *
+	 * @param array $get Request parameters. Reads "npt" (next page token) and,
+	 *                   when no token is given, "s" (search terms), "nsfw",
+	 *                   "time" and "filter".
+	 * @return array Result set with the keys status, spelling, npt, answer,
+	 *               web, image, video, news and related.
+	 * @throws Exception When the page cannot be fetched, when the embedded
+	 *                   window.composerResponse JSON cannot be found or
+	 *                   decoded, or when it holds no search_results.
+	 */
+	public function web($get){
+
+		if($get["npt"]){
+
+			// continue a previous search: the token yields the stored query and proxy
+			[$query, $proxy] =
+				$this->backend->get(
+					$get["npt"],
+					"web"
+				);
+
+			$query = json_decode($query, true);
+		}else{
+
+			$proxy = $this->backend->get_ip();
+
+			$query = [
+				"query" => $get["s"]
+			];
+
+			// add filters
+			if($get["nsfw"] == "no"){
+
+				$query["safe"] = 1;
+			}
+
+			if($get["time"] != "any"){
+
+				$query["tbs"] = $get["time"];
+			}
+
+			if($get["filter"] == "yes"){
+
+				$query["filter"] = 0;
+			}
+		}
+
+		try{
+
+			$html =
+				$this->get(
+					$proxy,
+					"https://coccoc.com/search",
+					$query
+				);
+		}catch(Exception $error){
+
+			// keep the underlying cause attached instead of discarding it
+			throw new Exception("Failed to get search page", 0, $error);
+		}
+		//$html = file_get_contents("scraper/coccoc.html");
+
+		// the result data is embedded in the page after "window.composerResponse"
+		$html = explode("window.composerResponse", $html, 2);
+
+		if(count($html) !== 2){
+
+			throw new Exception("Failed to grep window.composerResponse");
+		}
+
+		$html =
+			json_decode(
+				$this->fuckhtml
+				->extract_json(
+					ltrim($html[1], " =")
+				),
+				true
+			);
+
+		if($html === null){
+
+			throw new Exception("Failed to decode JSON");
+		}
+
+		if(!isset($html["search"]["search_results"])){
+
+			throw new Exception("Coc Coc did not return a search_results object");
+		}
+
+		$out = [
+			"status" => "ok",
+			"spelling" => [
+				"type" => "no_correction",
+				"using" => null,
+				"correction" => null
+			],
+			"npt" => null,
+			"answer" => [],
+			"web" => [],
+			"image" => [],
+			"video" => [],
+			"news" => [],
+			"related" => []
+		];
+
+		// word correction ("top" is not guaranteed to be present)
+		if(
+			isset($html["top"]) &&
+			is_array($html["top"])
+		){
+
+			foreach($html["top"] as $element){
+
+				if(isset($element["spellChecker"][0]["query"])){
+
+					$out["spelling"] = [
+						"type" => "not_many",
+						"using" => $html["search"]["query"] ?? null,
+						"correction" => $element["spellChecker"][0]["query"]
+					];
+				}
+			}
+		}
+
+		foreach($html["search"]["search_results"] as $result){
+
+			if(isset($result["type"])){
+
+				switch($result["type"]){
+
+					//
+					// Related searches
+					//
+					case "related_queries":
+						if(
+							isset($result["queries"]) &&
+							is_array($result["queries"])
+						){
+
+							$out["related"] = $result["queries"];
+						}
+						continue 2;
+
+					//
+					// Videos
+					//
+					case "video_hits":
+						if(
+							!isset($result["results"]) ||
+							!is_array($result["results"])
+						){
+
+							continue 2;
+						}
+
+						foreach($result["results"] as $video){
+
+							if(
+								!isset($video["title"]) ||
+								!isset($video["url"])
+							){
+
+								// malformed entry, nothing useful to show
+								continue;
+							}
+
+							if(!empty($video["image_url"])){
+
+								$thumb = [
+									"ratio" => "16:9",
+									"url" => $video["image_url"]
+								];
+							}else{
+
+								$thumb = [
+									"ratio" => null,
+									"url" => null
+								];
+							}
+
+							$out["video"][] = [
+								"title" =>
+									$this->titledots(
+										$this->fuckhtml
+										->getTextContent(
+											$video["title"]
+										)
+									),
+								"description" => null,
+								"author" => [
+									"name" => $video["uploader"] ?? null,
+									"url" => null,
+									"avatar" => null
+								],
+								// missing values become 0, as a cast of an unset key would
+								"date" => (int)($video["date"] ?? 0),
+								"duration" => (int)($video["duration"] ?? 0),
+								"views" => null,
+								"thumb" => $thumb,
+								"url" => $video["url"]
+							];
+						}
+						continue 2;
+				}
+			}
+
+			if(
+				!isset($result["title"]) ||
+				!isset($result["url"])
+			){
+
+				// should not happen
+				continue;
+			}
+
+			// optional rich metadata attached to the result
+			$rich = $result["rich"]["data"] ?? [];
+
+			if(isset($rich["image_url"])){
+
+				$thumb = [
+					"url" => $rich["image_url"],
+					"ratio" => "16:9"
+				];
+			}else{
+
+				$thumb = [
+					"url" => null,
+					"ratio" => null
+				];
+			}
+
+			$sublinks = [];
+
+			if(
+				isset($rich["linked_docs"]) &&
+				is_array($rich["linked_docs"])
+			){
+
+				foreach($rich["linked_docs"] as $sub){
+
+					if(!isset($sub["url"])){
+
+						// a sublink without a target is useless
+						continue;
+					}
+
+					$sublinks[] = [
+						"title" =>
+							$this->titledots(
+								$this->fuckhtml
+								->getTextContent(
+									$sub["title"] ?? ""
+								)
+							),
+						"description" =>
+							$this->titledots(
+								$this->fuckhtml
+								->getTextContent(
+									$sub["content"] ?? ""
+								)
+							),
+						"date" => null,
+						"url" => $sub["url"]
+					];
+				}
+			}
+
+			// get date
+			if(isset($result["date"])){
+
+				$date = (int)$result["date"];
+			}else{
+
+				$date = null;
+			}
+
+			// probe for metadata
+			$table = [];
+
+			if(isset($rich["rating"])){
+
+				$table["Rating"] = $rich["rating"];
+
+				// number_format() throws on non-numeric input, so check first
+				if(
+					isset($rich["num_rating"]) &&
+					is_numeric($rich["num_rating"])
+				){
+
+					$table["Rating"] .= " (" . number_format($rich["num_rating"]) . " ratings)";
+				}
+			}
+
+			if(isset($rich["views"])){
+
+				$table["Views"] =
+					is_numeric($rich["views"]) ?
+					number_format($rich["views"]) :
+					$rich["views"];
+			}
+
+			if(isset($rich["duration"])){
+
+				$table["Duration"] = $this->int2hms($rich["duration"]);
+			}
+
+			if(isset($rich["channel_name"])){
+
+				$table["Author"] = $rich["channel_name"];
+			}
+
+			if(isset($rich["video_quality"])){
+
+				$table["Quality"] = $rich["video_quality"];
+			}
+
+			if(isset($rich["category"])){
+
+				$table["Category"] = $rich["category"];
+			}
+
+			$out["web"][] = [
+				"title" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$result["title"]
+						)
+					),
+				"description" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$result["content"] ?? ""
+						)
+					),
+				"url" => $result["url"],
+				"date" => $date,
+				"type" => "web",
+				"thumb" => $thumb,
+				"sublink" => $sublinks,
+				"table" => $table
+			];
+		}
+
+		//
+		// Get wikipedia head
+		//
+		if(
+			isset($html["right"]) &&
+			is_array($html["right"])
+		){
+
+			foreach($html["right"] as $wiki){
+
+				$description = [];
+
+				if(isset($wiki["short_intro"])){
+
+					$description[] =
+						[
+							"type" => "quote",
+							"value" => $wiki["short_intro"],
+						];
+				}
+
+				if(isset($wiki["intro"])){
+
+					$description[] =
+						[
+							"type" => "text",
+							"value" => $wiki["intro"],
+						];
+				}
+
+				// get table elements
+				$table = [];
+
+				if(
+					isset($wiki["fields"]) &&
+					is_array($wiki["fields"])
+				){
+
+					foreach($wiki["fields"] as $element){
+
+						if(!isset($element["title"], $element["value"])){
+
+							continue;
+						}
+
+						// implode() throws when handed a non-array value
+						$table[$element["title"]] =
+							is_array($element["value"]) ?
+							implode(", ", $element["value"]) :
+							(string)$element["value"];
+					}
+				}
+
+				// get sublinks
+				$sublinks = [];
+
+				if(
+					isset($wiki["website"]) &&
+					is_string($wiki["website"])
+				){
+
+					if(
+						preg_match(
+							'/^http/',
+							$wiki["website"]
+						) === 0
+					){
+
+						// bare hostname, make it a usable link
+						$sublinks["Website"] = "https://" . $wiki["website"];
+					}else{
+
+						$sublinks["Website"] = $wiki["website"];
+					}
+				}
+
+				// "profiles" is optional, iterating a missing key would raise warnings
+				if(
+					isset($wiki["profiles"]) &&
+					is_array($wiki["profiles"])
+				){
+
+					foreach($wiki["profiles"] as $sitename => $url){
+
+						// use the part after the last underscore as the label
+						$sitename = explode("_", (string)$sitename);
+						$sitename = ucfirst($sitename[count($sitename) - 1]);
+
+						$sublinks[$sitename] = $url;
+					}
+				}
+
+				$out["answer"][] = [
+					"title" =>
+						$this->titledots(
+							$wiki["title"] ?? ""
+						),
+					"description" => $description,
+					"url" => null,
+					"thumb" => $wiki["image"]["contentUrl"] ?? null,
+					"table" => $table,
+					"sublink" => $sublinks
+				];
+			}
+		}
+
+		// get next page
+		$page = (int)($html["search"]["page"] ?? 0);
+		$max_page = (int)($html["search"]["max_page"] ?? 0);
+
+		if($page < $max_page){
+
+			// https://coccoc.com/composer?_=1754021153532&p=0&q=zbabduiqwhduwqhdnwq&reqid=bwcAs00q&s=direct&apiV=1
+			// ^json endpoint, but we can just do &page=2 lol
+
+			if(!isset($query["page"])){
+
+				$query["page"] = 2;
+			}else{
+
+				$query["page"]++;
+			}
+
+			$out["npt"] =
+				$this->backend
+				->store(
+					json_encode($query),
+					"web",
+					$proxy
+				);
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Scrape a Coc Coc video search results page.
+	 *
+	 * @param array $get Request parameters. Reads "npt" (next page token) and,
+	 *                   when no token is given, "s" (search terms), "nsfw",
+	 *                   "time" and "filter".
+	 * @return array Result set with the keys status, npt, video, author,
+	 *               livestream, playlist and reel. Only "video" and "npt"
+	 *               are filled in by this method.
+	 * @throws Exception When the page cannot be fetched, when the embedded
+	 *                   window.composerResponse JSON cannot be found or
+	 *                   decoded, or when Coc Coc reports an error other than
+	 *                   its "no results" message.
+	 */
+	public function video($get){
+
+		//$html = file_get_contents("scraper/coccoc.html");
+		if($get["npt"]){
+
+			// continue a previous search: the token yields the stored query and proxy
+			[$query, $proxy] =
+				$this->backend->get(
+					$get["npt"],
+					"videos"
+				);
+
+			$query = json_decode($query, true);
+		}else{
+
+			$proxy = $this->backend->get_ip();
+
+			$query = [
+				"query" => $get["s"],
+				"tbm" => "vid"
+			];
+
+			// add filters
+			if($get["nsfw"] == "no"){
+
+				$query["safe"] = 1;
+			}
+
+			if($get["time"] != "any"){
+
+				$query["tbs"] = $get["time"];
+			}
+
+			if($get["filter"] == "yes"){
+
+				$query["filter"] = 0;
+			}
+		}
+
+		try{
+
+			$html =
+				$this->get(
+					$proxy,
+					"https://coccoc.com/search",
+					$query
+				);
+		}catch(Exception $error){
+
+			// keep the underlying cause attached instead of discarding it
+			throw new Exception("Failed to get search page", 0, $error);
+		}
+
+		// the result data is embedded in the page after "window.composerResponse"
+		$html = explode("window.composerResponse", $html, 2);
+
+		if(count($html) !== 2){
+
+			throw new Exception("Failed to grep window.composerResponse");
+		}
+
+		$html =
+			json_decode(
+				$this->fuckhtml
+				->extract_json(
+					ltrim($html[1], " =")
+				),
+				true
+			);
+
+		if($html === null){
+
+			throw new Exception("Failed to decode JSON");
+		}
+
+		$out = [
+			"status" => "ok",
+			"npt" => null,
+			"video" => [],
+			"author" => [],
+			"livestream" => [],
+			"playlist" => [],
+			"reel" => []
+		];
+
+		if(!isset($html["search_video"]["search_results"])){
+
+			if(isset($html["search_video"]["error"]["title"])){
+
+				// this exact error title is treated as an empty result set
+				if($html["search_video"]["error"]["title"] == "Không tìm thấy kết quả nào"){
+
+					return $out;
+				}
+
+				throw new Exception("Coc Coc returned an error: " . $html["search_video"]["error"]["title"]);
+			}
+
+			throw new Exception("Coc Coc did not supply a search_results object");
+		}
+
+		foreach($html["search_video"]["search_results"] as $video){
+
+			if(
+				!isset($video["title"]) ||
+				!isset($video["url"])
+			){
+
+				// should not happen, same guard as the web results
+				continue;
+			}
+
+			// optional rich metadata attached to the result
+			$rich = $video["rich"]["data"] ?? [];
+
+			if(isset($rich["image_url"])){
+
+				$thumb = [
+					"ratio" => "16:9",
+					"url" => $rich["image_url"]
+				];
+			}else{
+
+				$thumb = [
+					"ratio" => null,
+					"url" => null
+				];
+			}
+
+			$out["video"][] = [
+				"title" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$video["title"]
+						)
+					),
+				"description" =>
+					$this->titledots(
+						$this->fuckhtml
+						->getTextContent(
+							$video["content"] ?? ""
+						)
+					),
+				"author" => [
+					"name" => $rich["channel_name"] ?? null,
+					"url" => null,
+					"avatar" => null
+				],
+				// cast like web() does so both scrapers hand out integer dates
+				"date" =>
+					isset($video["date"]) ?
+					(int)$video["date"] : null,
+				"duration" =>
+					isset($rich["duration"]) ?
+					(int)$rich["duration"] : null,
+				"views" => null,
+				"thumb" => $thumb,
+				"url" => $video["url"]
+			];
+		}
+
+		// get next page
+		$page = (int)($html["search_video"]["page"] ?? 0);
+		$max_page = (int)($html["search_video"]["max_page"] ?? 0);
+
+		if($page < $max_page){
+
+			if(!isset($query["page"])){
+
+				$query["page"] = 2;
+			}else{
+
+				$query["page"]++;
+			}
+
+			$out["npt"] =
+				$this->backend
+				->store(
+					json_encode($query),
+					"videos",
+					$proxy
+				);
+		}
+
+		return $out;
+	}
+
+	/**
+	 * Strip surrounding spaces, tabs, line breaks, NUL bytes, dots and
+	 * ellipsis characters from a piece of text.
+	 *
+	 * trim() works on single bytes, so listing the multi-byte "…" in its
+	 * character mask removes the bytes 0xE2, 0x80 and 0xA6 individually.
+	 * That cuts other UTF-8 characters in half, e.g. a leading "“" or a
+	 * trailing "À". A unicode-aware regular expression is used instead.
+	 *
+	 * @param string|null $title Text to clean up
+	 * @return string The text without the leading/trailing characters
+	 */
+	private function titledots($title){
+
+		$title = (string)$title;
+
+		$clean =
+			preg_replace(
+				'/\A[ .\t\n\r\x00\x0B…]+|[ .\t\n\r\x00\x0B…]+\z/u',
+				"",
+				$title
+			);
+
+		if($clean === null){
+
+			// preg_replace() returns null on failure (e.g. invalid UTF-8 input);
+			// fall back to a byte-wise trim of the single-byte characters only
+			return trim($title, " .\t\n\r\0\x0B");
+		}
+
+		return $clean;
+	}
+
+	/**
+	 * Format a duration in seconds as a zero-padded "HH:MM:SS" string.
+	 *
+	 * @param int $seconds Duration in seconds
+	 * @return string Formatted duration; the hour part grows beyond two digits when needed
+	 */
+	private function int2hms($seconds){
+
+		return sprintf(
+			"%02d:%02d:%02d",
+			floor($seconds / 3600), // whole hours
+			floor(($seconds % 3600) / 60), // whole minutes left after the hours
+			$seconds % 60 // seconds left after the minutes
+		);
+	}
+}