youtube-transcript-api
Version:
A YouTube video transcript extractor based on reverse-engineered youtube-transcript.io
172 lines (153 loc) • 6.22 kB
JavaScript
import axios from "axios";
import * as cheerio from "cheerio";
/**
 * Builds a string made of randomly chosen hexadecimal digits.
 * Uses `Math.random()`, so the result is not cryptographically secure.
 * @param {number} size - Number of hex digits to produce
 * @returns {string} A lowercase hex string containing `size` digits
 */
function generateRandomHex(size) {
  let hex = "";
  // Array(size) keeps the original's argument validation and length semantics.
  for (let remaining = Array(size).length; remaining > 0; remaining--) {
    const nibble = Math.floor(Math.random() * 16);
    hex += nibble.toString(16);
  }
  return hex;
}
/**
 * Client that retrieves YouTube video transcripts through youtube-transcript.io.
 *
 * Constructing a client immediately starts loading the site's Firebase
 * configuration. `ready` settles when that load finishes; the public methods
 * wait for it themselves, so awaiting `ready` first is optional.
 */
class TranscriptClient {
  /**
   * Resolves once the Firebase configuration lookup has finished and rejects
   * if that lookup fails.
   * @type {Promise<void>}
   */
  ready;
  #instance; // Axios instance used for every request this client makes
  #firebase_cfg_creds; // Firebase configuration object scraped from the site

  /**
   * @param {object} [AxiosOptions] - Options forwarded to `axios.create`.
   *   Caller headers override the default User-Agent; `baseURL` is always
   *   forced to the youtube-transcript.io origin.
   */
  constructor(AxiosOptions) {
    this.#instance = axios.create({
      ...(AxiosOptions || {}),
      headers: {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:139.0) Gecko/20100101 Firefox/139.0",
        ...(AxiosOptions?.headers || {})
      },
      baseURL: "https://www.youtube-transcript.io/"
    });
    // Chain on the lookup promise directly (instead of wrapping it in
    // `new Promise(async ...)`) so a failed lookup rejects `ready` rather than
    // leaving it pending forever.
    this.ready = this.#get_firebase_cfg_creds().then(creds => {
      this.#firebase_cfg_creds = creds;
    });
  }

  /**
   * Downloads a page, then downloads each `<script src>` it references in
   * document order and returns the first value `extract` produces.
   * @param {string} path - Page path relative to the client's base URL
   * @param {(script: string) => any} extract - Returns the wanted value, or
   *   `undefined` when the script does not contain it
   * @returns {Promise<any>} The first extracted value, or `undefined` if no
   *   script yields one
   */
  async #scan_scripts(path, extract) {
    const { data: page } = await this.#instance.get(path);
    const $ = cheerio.load(page);
    for (const elem of $("script[src]").toArray()) {
      const { data: script } = await this.#instance.get($(elem).attr("src"));
      // A non-text response body cannot be pattern-matched; skip it.
      if (typeof script !== "string") continue;
      const found = extract(script);
      if (found !== undefined) return found;
    }
    return undefined;
  }

  /**
   * Gets Google Firebase configuration credentials from the site's scripts.
   * @returns {Promise<object|undefined>} The Firebase configuration object
   *   (the code reads `apiKey` and `appId` from it), or `undefined` if no
   *   script contains one
   */
  #get_firebase_cfg_creds() {
    return this.#scan_scripts("/", script => {
      const match = script.match(/\(\{[^}]*apiKey:"([^"]+)"[^}]*\}\)/);
      if (!match) return undefined;
      // SECURITY NOTE(review): this evaluates text downloaded from the remote
      // site as JavaScript. The matched object literal is not valid JSON
      // (unquoted keys), which is why it is evaluated rather than parsed.
      // Consider extracting the needed fields with a pattern instead.
      return Function("return " + match[0])();
    });
  }

  /**
   * Gets API authorization details from the Google Identity Platform by
   * calling the `accounts:signUp` endpoint with the scraped API key.
   * @returns {Promise<object>} The sign-up response body (its `idToken` is
   *   used as the bearer token)
   * @throws {Error} If no Firebase configuration was found during initialization
   */
  async #get_auth() {
    // Wait for initialization so callers need not await `ready` themselves.
    await this.ready;
    const creds = this.#firebase_cfg_creds;
    if (!creds) throw new Error("client not fully initialized!");
    const url = new URL("https://identitytoolkit.googleapis.com/v1/accounts:signUp");
    url.searchParams.set("key", creds.apiKey);
    const { data } = await this.#instance.post(url.href, {
      returnSecureToken: true
    }, {
      headers: {
        "X-Client-Version": "Firefox/JsCore/10.14.1/FirebaseCore-web",
        "X-Firebase-Client": JSON.stringify({
          "version": 2,
          "heartbeats": [
            {
              "agent": "fire-core/0.10.13 fire-core-esm2017/0.10.13 fire-js/ fire-js-all-app/10.14.1 fire-auth/1.7.9 fire-auth-esm2017/1.7.9",
              // Current UTC date as YYYY-MM-DD
              "dates": [
                new Date().toISOString().split('T')[0]
              ]
            }
          ]
        }),
        // The app ID with its first two characters removed
        "X-Firebase-gmpid": creds.appId.slice(2)
      }
    });
    return data;
  }

  /**
   * Gets the extra request header the site's own script attaches to its
   * transcript API call, scraped from the scripts of a video page.
   * @param {string} id - The YouTube video ID whose page is loaded
   * @returns {Promise<[string, string]>} The header name and header value
   * @throws {Error} If no script on the page contains the header
   */
  async #get_x_client_context(id) {
    const header = await this.#scan_scripts("/videos/" + id, script => {
      const match = script.match(/"([^"]+)"\s*:\s*"([^"]+)"\},body:JSON\.stringify\(\{ids:\[t\]\}\)/);
      return match ? [match[1], match[2]] : undefined;
    });
    if (!header) {
      throw new Error("could not locate the client context header in the site's scripts");
    }
    return header;
  }

  /**
   * Posts a list of video IDs to the transcript API with all required headers.
   * @param {string[]} ids - Video IDs to request; the first one selects the
   *   video page used to scrape the client context header
   * @param {object} [config] - Request configuration for the Axios HTTP client
   * @param {string} forbiddenMessage - Error message used for an HTTP 403 reply
   * @returns {Promise<any>} The response body of the transcript API
   * @throws {Error} With `forbiddenMessage` (and the Axios error as `cause`)
   *   on HTTP 403; any other failure is rethrown unchanged
   */
  async #request_transcripts(ids, config, forbiddenMessage) {
    // The two lookups are independent of each other, so run them together.
    const [auth, [headerName, headerValue]] = await Promise.all([
      this.#get_auth(),
      this.#get_x_client_context(ids[0])
    ]);
    try {
      const { data } = await this.#instance.post("/api/transcripts", {
        ids
      }, {
        ...(config || {}),
        headers: {
          ...(config?.headers || {}),
          Authorization: `Bearer ${auth.idToken}`,
          [headerName]: headerValue,
          'X-Hash': generateRandomHex(64)
        }
      });
      return data;
    } catch (e) {
      // Axios puts the HTTP status on `error.response`; `error.status` is kept
      // as a fallback for versions that also set it on the error itself.
      const status = e?.response?.status ?? e?.status;
      if (Number(status) === 403) throw new Error(forbiddenMessage, { cause: e });
      throw e;
    }
  }

  /**
   * Retrieves the transcript of a particular video.
   * @param {string} id - The YouTube video ID
   * @param {object} [config] - Request configurations for the Axios HTTP client
   * @returns A Promise that resolves to the transcript object
   * @throws {Error} 'invalid video ID' when the API answers with HTTP 403
   */
  async getTranscript(id, config) {
    const data = await this.#request_transcripts([id], config, 'invalid video ID');
    return data[0];
  }

  /**
   * Retrieves the transcript of multiple videos.
   * @param {string[]} ids - A non-empty list of YouTube video IDs
   * @param {object} [config] - Request configurations for the Axios HTTP client
   * @returns A Promise that resolves to an array of transcript objects
   * @throws {TypeError} If `ids` is not a non-empty array
   * @throws {Error} 'video not found or unavailable' when the API answers with HTTP 403
   */
  async bulkGetTranscript(ids, config) {
    if (!Array.isArray(ids) || ids.length === 0) {
      throw new TypeError("ids must be a non-empty array of video IDs");
    }
    return this.#request_transcripts(ids, config, 'video not found or unavailable');
  }
}
// Expose the TranscriptClient class as this module's default export.
export { TranscriptClient as default };