From cdf958d29333d448f4521f4d2faa2592b58e9b27 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sun, 10 Aug 2025 21:55:15 -0400 Subject: fix wikipedia crash --- scraper/sc.php | 512 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 512 insertions(+) create mode 100644 scraper/sc.php (limited to 'scraper/sc.php') diff --git a/scraper/sc.php b/scraper/sc.php new file mode 100644 index 0000000..7083c42 --- /dev/null +++ b/scraper/sc.php @@ -0,0 +1,512 @@ +backend = new backend("sc"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return [ + "type" => [ + "display" => "Type", + "option" => [ + "any" => "Any type", + "track" => "Tracks", + "author" => "People", + "album" => "Albums", + "playlist" => "Playlists", + "goplus" => "Go+ Tracks" + ] + ] + ]; + } + + private function get($proxy, $url, $get = [], $web_req = false){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + if($web_req === false){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://soundcloud.com/", + "Origin: https://soundcloud.com", + "DNT: 1", + "Connection: keep-alive", + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-site", + "Priority: u=1"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: cross-site", + "Priority: u=1", + "TE: trailers"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function music($get, $last_attempt = false){ + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "music"); + $params = json_decode($params, true); + + $url = $params["url"]; + unset($params["url"]); + + }else{ + + // normal search: + // https://api-v2.soundcloud.com/search?q=freddie%20dredd&variant_ids=&facet=model&user_id=351062-302234-707916-795081&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en + + // soundcloud go+ search: + // https://api-v2.soundcloud.com/search/tracks?q=freddie%20dredd&variant_ids=&filter.content_tier=SUB_HIGH_TIER&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en + + // tracks search: + // https://api-v2.soundcloud.com/search/tracks?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en + + // users search: + // https://api-v2.soundcloud.com/search/users?q=freddie%20dredd&variant_ids=&facet=place&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en + + // albums search: + // https://api-v2.soundcloud.com/search/albums?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en + + // playlists search: + // https://api-v2.soundcloud.com/search/playlists_without_albums?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $type = $get["type"]; + $proxy = $this->backend->get_ip(); + $token = $this->get_token($proxy); + + switch($type){ + + case "any": + $url = "https://api-v2.soundcloud.com/search"; + $params = [ + "q" => $search, + "variant_ids" => "", + "facet" => "model", + "client_id" => $token, + "limit" => 20, + "offset" => 0, + "linked_partitioning" => 1, + "app_version" => 1713542117, + "app_locale" => "en" + ]; + break; + + case "track": + $url = "https://api-v2.soundcloud.com/search/tracks"; + $params = [ + "q" => $search, + "variant_ids" => "", + "facet_genre" => "", + "client_id" => $token, + "limit" => 20, + "offset" => 0, + "linked_partitioning" => 1, + "app_version" => 1713542117, + "app_locale" => "en" + ]; + break; + + case "author": + $url = "https://api-v2.soundcloud.com/search/users"; + $params = [ + "q" => $search, + "variant_ids" => "", + "facet" => "place", + "client_id" => $token, + "limit" => 20, + "offset" => 0, + "linked_partitioning" => 1, + "app_version" => 1713542117, + "app_locale" => "en" + ]; + break; + + case "album": + $url = "https://api-v2.soundcloud.com/search/albums"; + $params = [ + "q" => $search, + "variant_ids" => "", + "facet" => "genre", + "client_id" => $token, + "limit" => 20, + "offset" => 0, + "linked_partitioning" => 1, + "app_version" => 1713542117, + "app_locale" => "en" + ]; + break; + + case "playlist": + $url = "https://api-v2.soundcloud.com/search/playlists_without_albums"; + $params = [ + "q" => $search, + "variant_ids" => "", + "facet" => "genre", + "client_id" => $token, + "limit" => 20, + "offset" => 0, + "linked_partitioning" => 1, + "app_version" => 1713542117, + "app_locale" => "en" + ]; + break; + + case "goplus": + $url = "https://api-v2.soundcloud.com/search/tracks"; + $params = [ + "q" => $search, + "variant_ids" => "", + "filter.content_tier" => "SUB_HIGH_TIER", + "facet" => "genre", + "client_id" => $token, + "limit" => 20, + "offset" => 0, + "linked_partitioning" => 1, + "app_version" => 1713542117, + "app_locale" => "en" + ]; + break; + } + } + + try{ + + $json = $this->get($proxy, $url, $params); + + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + /* + $handle = fopen("scraper/soundcloud.json", "r"); + $json = fread($handle, filesize("scraper/soundcloud.json")); + fclose($handle); + */ + + $json = json_decode($json, true); + + if($json === null){ + + if($last_attempt === true){ + + throw new Exception("Fetched an invalid token (please report!!)"); + } + + // token might've expired, get a new one and re-try search + $this->get_token($proxy); + return $this->music($get, true); + } + + $out = [ + "status" => "ok", + "npt" => null, + "song" => [], + "playlist" => [], + "album" => [], + "podcast" => [], + "author" => [], + "user" => [] + ]; + + /* + Get next page + */ + if(isset($json["next_href"])){ + + $params["query_urn"] = $json["query_urn"]; + $params["offset"] = $params["offset"] + 20; + $params["url"] = $url; // we will remove this later + + $out["npt"] = + $this->backend->store( + json_encode($params), + "music", + $proxy + ); + } + + /* + Scrape items + */ + foreach($json["collection"] as $item){ + + switch($item["kind"]){ + + case "user": + // parse author + $out["author"][] = [ + "title" => $item["username"], + "followers" => $item["followers_count"], + "description" => trim($item["track_count"] . " songs. " . $this->limitstrlen($item["description"])), + "thumb" => [ + "url" => $item["avatar_url"], + "ratio" => "1:1" + ], + "url" => $item["permalink_url"] + ]; + break; + + case "playlist": + // parse playlist + $description = []; + $count = 0; + + foreach($item["tracks"] as $song){ + + $count++; + + if(!isset($song["title"])){ + + continue; + } + + $description[] = $song["title"]; + } + + if(count($description) !== 0){ + + $description = trim($count . " songs. " . implode(", ", $description)); + }else{ + + $description = ""; + } + + if( + isset($item["artwork_url"]) && + !empty($item["artwork_url"]) + ){ + + $thumb = [ + "ratio" => "1:1", + "url" => $item["artwork_url"] + ]; + + }elseif( + isset($item["tracks"][0]["artwork_url"]) && + !empty($item["tracks"][0]["artwork_url"]) + ){ + + $thumb = [ + "ratio" => "1:1", + "url" => $item["tracks"][0]["artwork_url"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["playlist"][] = [ + "title" => $item["title"], + "description" => $this->limitstrlen($description), + "author" => [ + "name" => $item["user"]["username"], + "url" => $item["user"]["permalink_url"], + "avatar" => $item["user"]["avatar_url"] + ], + "thumb" => $thumb, + "date" => strtotime($item["created_at"]), + "duration" => $item["duration"] / 1000, + "url" => $item["permalink_url"] + ]; + break; + + case "track": + if(stripos($item["monetization_model"], "TIER") === false){ + + $stream = [ + "endpoint" => "sc", + "url" => + $item["media"]["transcodings"][0]["url"] . + "?client_id=" . $token . + "&track_authorization=" . + $item["track_authorization"] + ]; + }else{ + + $stream = [ + "endpoint" => null, + "url" => null + ]; + } + + // parse track + $out["song"][] = [ + "title" => $item["title"], + "description" => $item["description"] == "" ? null : $this->limitstrlen($item["description"]), + "url" => $item["permalink_url"], + "views" => $item["playback_count"], + "author" => [ + "name" => $item["user"]["username"], + "url" => $item["user"]["permalink_url"], + "avatar" => $item["user"]["avatar_url"] + ], + "thumb" => [ + "ratio" => "1:1", + "url" => $item["artwork_url"] + ], + "date" => strtotime($item["created_at"]), + "duration" => (int)$item["full_duration"] / 1000, + "stream" => $stream + ]; + break; + } + } + + return $out; + } + + public function get_token($proxy){ + + $token = apcu_fetch("sc_token"); + + if($token !== false){ + + return $token; + } + + // search through all javascript components on the main page + try{ + $html = + $this->get( + $proxy, + "https://soundcloud.com", + [], + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch front page"); + } + + $this->fuckhtml->load($html); + + $scripts = + $this->fuckhtml + ->getElementsByTagName( + "script" + ); + + foreach($scripts as $script){ + + if( + !isset($script["attributes"]["src"]) || + strpos($script["attributes"]["src"], "sndcdn.com") === false + ){ + + continue; + } + + try{ + $js = + $this->get( + $proxy, + $script["attributes"]["src"], + [] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search token"); + } + + preg_match( + '/client_id=([^"]+)/', + $js, + $token + ); + + if(isset($token[1])){ + + apcu_store("sc_token", $token[1]); + return $token[1]; + break; + } + } + + throw new Exception("Did not find a Soundcloud token in the Javascript blobs"); + } + + private function limitstrlen($text){ + + return + explode( + "\n", + wordwrap( + str_replace( + ["\n\r", "\r\n", "\n", "\r"], + " ", + $text + ), + 300, + "\n" + ), + 2 + )[0]; + } +} -- cgit v1.2.3