aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/sc.php
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/sc.php')
-rw-r--r--scraper/sc.php512
1 files changed, 512 insertions, 0 deletions
diff --git a/scraper/sc.php b/scraper/sc.php
new file mode 100644
index 0000000..7083c42
--- /dev/null
+++ b/scraper/sc.php
@@ -0,0 +1,512 @@
+<?php
+
+class sc{
+
+ public function __construct(){
+
+ include "lib/backend.php";
+ $this->backend = new backend("sc");
+
+ include "lib/fuckhtml.php";
+ $this->fuckhtml = new fuckhtml();
+ }
+
+ public function getfilters($page){
+
+ return [
+ "type" => [
+ "display" => "Type",
+ "option" => [
+ "any" => "Any type",
+ "track" => "Tracks",
+ "author" => "People",
+ "album" => "Albums",
+ "playlist" => "Playlists",
+ "goplus" => "Go+ Tracks"
+ ]
+ ]
+ ];
+ }
+
+ private function get($proxy, $url, $get = [], $web_req = false){
+
+ $curlproc = curl_init();
+
+ if($get !== []){
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
+
+ curl_setopt($curlproc, CURLOPT_URL, $url);
+
+ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+
+ // use http2
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+ if($web_req === false){
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "Referer: https://soundcloud.com/",
+ "Origin: https://soundcloud.com",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Sec-Fetch-Dest: empty",
+ "Sec-Fetch-Mode: cors",
+ "Sec-Fetch-Site: same-site",
+ "Priority: u=1"]
+ );
+ }else{
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Upgrade-Insecure-Requests: 1",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: cross-site",
+ "Priority: u=1",
+ "TE: trailers"]
+ );
+ }
+
+ curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+ curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+ curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+ curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+ $this->backend->assign_proxy($curlproc, $proxy);
+
+ $data = curl_exec($curlproc);
+
+ if(curl_errno($curlproc)){
+
+ throw new Exception(curl_error($curlproc));
+ }
+
+ curl_close($curlproc);
+ return $data;
+ }
+
+ public function music($get, $last_attempt = false){
+
+ if($get["npt"]){
+
+ [$params, $proxy] = $this->backend->get($get["npt"], "music");
+ $params = json_decode($params, true);
+
+ $url = $params["url"];
+ unset($params["url"]);
+
+ }else{
+
+ // normal search:
+ // https://api-v2.soundcloud.com/search?q=freddie%20dredd&variant_ids=&facet=model&user_id=351062-302234-707916-795081&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
+
+ // soundcloud go+ search:
+ // https://api-v2.soundcloud.com/search/tracks?q=freddie%20dredd&variant_ids=&filter.content_tier=SUB_HIGH_TIER&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
+
+ // tracks search:
+ // https://api-v2.soundcloud.com/search/tracks?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
+
+ // users search:
+ // https://api-v2.soundcloud.com/search/users?q=freddie%20dredd&variant_ids=&facet=place&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
+
+ // albums search:
+ // https://api-v2.soundcloud.com/search/albums?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
+
+ // playlists search:
+ // https://api-v2.soundcloud.com/search/playlists_without_albums?q=freddie%20dredd&variant_ids=&facet=genre&user_id=630591-269800-703400-765403&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
+
+ $search = $get["s"];
+ if(strlen($search) === 0){
+
+ throw new Exception("Search term is empty!");
+ }
+
+ $type = $get["type"];
+ $proxy = $this->backend->get_ip();
+ $token = $this->get_token($proxy);
+
+ switch($type){
+
+ case "any":
+ $url = "https://api-v2.soundcloud.com/search";
+ $params = [
+ "q" => $search,
+ "variant_ids" => "",
+ "facet" => "model",
+ "client_id" => $token,
+ "limit" => 20,
+ "offset" => 0,
+ "linked_partitioning" => 1,
+ "app_version" => 1713542117,
+ "app_locale" => "en"
+ ];
+ break;
+
+ case "track":
+ $url = "https://api-v2.soundcloud.com/search/tracks";
+ $params = [
+ "q" => $search,
+ "variant_ids" => "",
+ "facet_genre" => "",
+ "client_id" => $token,
+ "limit" => 20,
+ "offset" => 0,
+ "linked_partitioning" => 1,
+ "app_version" => 1713542117,
+ "app_locale" => "en"
+ ];
+ break;
+
+ case "author":
+ $url = "https://api-v2.soundcloud.com/search/users";
+ $params = [
+ "q" => $search,
+ "variant_ids" => "",
+ "facet" => "place",
+ "client_id" => $token,
+ "limit" => 20,
+ "offset" => 0,
+ "linked_partitioning" => 1,
+ "app_version" => 1713542117,
+ "app_locale" => "en"
+ ];
+ break;
+
+ case "album":
+ $url = "https://api-v2.soundcloud.com/search/albums";
+ $params = [
+ "q" => $search,
+ "variant_ids" => "",
+ "facet" => "genre",
+ "client_id" => $token,
+ "limit" => 20,
+ "offset" => 0,
+ "linked_partitioning" => 1,
+ "app_version" => 1713542117,
+ "app_locale" => "en"
+ ];
+ break;
+
+ case "playlist":
+ $url = "https://api-v2.soundcloud.com/search/playlists_without_albums";
+ $params = [
+ "q" => $search,
+ "variant_ids" => "",
+ "facet" => "genre",
+ "client_id" => $token,
+ "limit" => 20,
+ "offset" => 0,
+ "linked_partitioning" => 1,
+ "app_version" => 1713542117,
+ "app_locale" => "en"
+ ];
+ break;
+
+ case "goplus":
+ $url = "https://api-v2.soundcloud.com/search/tracks";
+ $params = [
+ "q" => $search,
+ "variant_ids" => "",
+ "filter.content_tier" => "SUB_HIGH_TIER",
+ "facet" => "genre",
+ "client_id" => $token,
+ "limit" => 20,
+ "offset" => 0,
+ "linked_partitioning" => 1,
+ "app_version" => 1713542117,
+ "app_locale" => "en"
+ ];
+ break;
+ }
+ }
+
+ try{
+
+ $json = $this->get($proxy, $url, $params);
+
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch JSON");
+ }
+
+ /*
+ $handle = fopen("scraper/soundcloud.json", "r");
+ $json = fread($handle, filesize("scraper/soundcloud.json"));
+ fclose($handle);
+ */
+
+ $json = json_decode($json, true);
+
+ if($json === null){
+
+ if($last_attempt === true){
+
+ throw new Exception("Fetched an invalid token (please report!!)");
+ }
+
+ // token might've expired, get a new one and re-try search
+ $this->get_token($proxy);
+ return $this->music($get, true);
+ }
+
+ $out = [
+ "status" => "ok",
+ "npt" => null,
+ "song" => [],
+ "playlist" => [],
+ "album" => [],
+ "podcast" => [],
+ "author" => [],
+ "user" => []
+ ];
+
+ /*
+ Get next page
+ */
+ if(isset($json["next_href"])){
+
+ $params["query_urn"] = $json["query_urn"];
+ $params["offset"] = $params["offset"] + 20;
+ $params["url"] = $url; // we will remove this later
+
+ $out["npt"] =
+ $this->backend->store(
+ json_encode($params),
+ "music",
+ $proxy
+ );
+ }
+
+ /*
+ Scrape items
+ */
+ foreach($json["collection"] as $item){
+
+ switch($item["kind"]){
+
+ case "user":
+ // parse author
+ $out["author"][] = [
+ "title" => $item["username"],
+ "followers" => $item["followers_count"],
+ "description" => trim($item["track_count"] . " songs. " . $this->limitstrlen($item["description"])),
+ "thumb" => [
+ "url" => $item["avatar_url"],
+ "ratio" => "1:1"
+ ],
+ "url" => $item["permalink_url"]
+ ];
+ break;
+
+ case "playlist":
+ // parse playlist
+ $description = [];
+ $count = 0;
+
+ foreach($item["tracks"] as $song){
+
+ $count++;
+
+ if(!isset($song["title"])){
+
+ continue;
+ }
+
+ $description[] = $song["title"];
+ }
+
+ if(count($description) !== 0){
+
+ $description = trim($count . " songs. " . implode(", ", $description));
+ }else{
+
+ $description = "";
+ }
+
+ if(
+ isset($item["artwork_url"]) &&
+ !empty($item["artwork_url"])
+ ){
+
+ $thumb = [
+ "ratio" => "1:1",
+ "url" => $item["artwork_url"]
+ ];
+
+ }elseif(
+ isset($item["tracks"][0]["artwork_url"]) &&
+ !empty($item["tracks"][0]["artwork_url"])
+ ){
+
+ $thumb = [
+ "ratio" => "1:1",
+ "url" => $item["tracks"][0]["artwork_url"]
+ ];
+ }else{
+
+ $thumb = [
+ "ratio" => null,
+ "url" => null
+ ];
+ }
+
+ $out["playlist"][] = [
+ "title" => $item["title"],
+ "description" => $this->limitstrlen($description),
+ "author" => [
+ "name" => $item["user"]["username"],
+ "url" => $item["user"]["permalink_url"],
+ "avatar" => $item["user"]["avatar_url"]
+ ],
+ "thumb" => $thumb,
+ "date" => strtotime($item["created_at"]),
+ "duration" => $item["duration"] / 1000,
+ "url" => $item["permalink_url"]
+ ];
+ break;
+
+ case "track":
+ if(stripos($item["monetization_model"], "TIER") === false){
+
+ $stream = [
+ "endpoint" => "sc",
+ "url" =>
+ $item["media"]["transcodings"][0]["url"] .
+ "?client_id=" . $token .
+ "&track_authorization=" .
+ $item["track_authorization"]
+ ];
+ }else{
+
+ $stream = [
+ "endpoint" => null,
+ "url" => null
+ ];
+ }
+
+ // parse track
+ $out["song"][] = [
+ "title" => $item["title"],
+ "description" => $item["description"] == "" ? null : $this->limitstrlen($item["description"]),
+ "url" => $item["permalink_url"],
+ "views" => $item["playback_count"],
+ "author" => [
+ "name" => $item["user"]["username"],
+ "url" => $item["user"]["permalink_url"],
+ "avatar" => $item["user"]["avatar_url"]
+ ],
+ "thumb" => [
+ "ratio" => "1:1",
+ "url" => $item["artwork_url"]
+ ],
+ "date" => strtotime($item["created_at"]),
+ "duration" => (int)$item["full_duration"] / 1000,
+ "stream" => $stream
+ ];
+ break;
+ }
+ }
+
+ return $out;
+ }
+
+ public function get_token($proxy){
+
+ $token = apcu_fetch("sc_token");
+
+ if($token !== false){
+
+ return $token;
+ }
+
+ // search through all javascript components on the main page
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://soundcloud.com",
+ [],
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch front page");
+ }
+
+ $this->fuckhtml->load($html);
+
+ $scripts =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "script"
+ );
+
+ foreach($scripts as $script){
+
+ if(
+ !isset($script["attributes"]["src"]) ||
+ strpos($script["attributes"]["src"], "sndcdn.com") === false
+ ){
+
+ continue;
+ }
+
+ try{
+ $js =
+ $this->get(
+ $proxy,
+ $script["attributes"]["src"],
+ []
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search token");
+ }
+
+ preg_match(
+ '/client_id=([^"]+)/',
+ $js,
+ $token
+ );
+
+ if(isset($token[1])){
+
+ apcu_store("sc_token", $token[1]);
+ return $token[1];
+ break;
+ }
+ }
+
+ throw new Exception("Did not find a Soundcloud token in the Javascript blobs");
+ }
+
+ private function limitstrlen($text){
+
+ return
+ explode(
+ "\n",
+ wordwrap(
+ str_replace(
+ ["\n\r", "\r\n", "\n", "\r"],
+ " ",
+ $text
+ ),
+ 300,
+ "\n"
+ ),
+ 2
+ )[0];
+ }
+}