/*
 * animethemes-parser
 * Version:
 * Parser for /r/animethemes wiki
 * 232 lines (199 loc) • 6.78 kB
 * JavaScript
 */
const axios = require('axios').default;
const cheerio = require('cheerio');
const fs = require('fs');
/**
* @typedef {Object} Theme
* @property {String} name Song type, number and title e.g.
* - OP1 "sister's noise"
* @property {String} link Direct link to webm video on animethemes.moe
* @property {String} desc Link information (e.g. Webm (DVD, 480))
* @property {('opening'|'ending')} type Song type
* - opening
* - ending
* @property {String} episodes Episodes with this theme
* @property {String} notes Additional notes (NSFW, Spoilers)
*/
/**
* @typedef {Object} Anime
* @property {String} id MyAnimeList ID
* @property {String} title Anime title, usually in romaji
* @property {String} year Anime release year or decade if 1999 or older (XXs, e.g. 90s)
* @property {Array<Theme>} themes Themes
*/
/**
 * Parser for the /r/AnimeThemes wiki.
 *
 * Wiki pages are fetched through the module-level `biribiri` helper (which
 * returns a loaded cheerio instance) and every `<h3>` on a year page is
 * turned into an {@link Anime} with its {@link Theme} rows.
 */
class ThemeParser {
  constructor() {
    this.baseUrl = 'https://reddit.com';
  }

  /**
   * Get all themes available
   * @returns {Promise<Array<Anime>>} Anime from every year linked in the year index
   */
  async all() {
    this.animes = [];
    // The year index is fetched exactly like a year page, so reuse the shared
    // fetch-and-load helper instead of repeating the request code here.
    this.$ = await biribiri('/r/AnimeThemes/wiki/year_index');
    return this.parseLinks(); // Parse each year
  }

  /**
   * Get all animes from a year
   * @param {Number} n Year
   * @returns {Promise<Array<Anime>>}
   */
  async year(n) {
    const $ = await biribiri('/r/AnimeThemes/wiki/' + n); // Fetch and parse wiki page
    // Still published on the instance because parseAnime falls back to this.$
    // when it is called without an explicit cheerio instance.
    this.$ = $;
    const animes = [];
    $('h3').each((i, el) => { // Each series in year
      const parsed = this.parseAnime(el, $);
      parsed.year = n;
      animes.push(parsed);
    });
    return animes;
  }

  /**
   * Import from file generated by animethemes-parser
   * @param {string} [filename=output.json] Filename
   * @returns {Promise<Array<Anime>>} Rejects with the string "Not a valid file"
   * when the JSON is not an array whose first element is an object with an `id` key
   */
  async import(filename = "output.json") {
    const parsed = JSON.parse(fs.readFileSync(filename, 'utf8'));
    const first = Array.isArray(parsed) ? parsed[0] : undefined;
    // `'id' in x` throws a TypeError for undefined/primitives, so an empty
    // array must be rejected before the `in` check is reached.
    if (first === null || typeof first !== 'object' || !('id' in first)) {
      // Thrown as a plain string, as before, so callers matching on this exact
      // value keep working.
      throw "Not a valid file";
    }
    return parsed;
  }

  /**
   * Export themes to file
   * @param {string} [filename=output.json] Filename
   * @returns {Promise<Array<Anime>>} The anime that were written
   */
  async export(filename = "output.json") {
    const animes = await this.all(); // Parse all themes
    fs.writeFileSync(filename, JSON.stringify(animes)); // and save to file
    return animes;
  }

  /**
   * Parse every year linked from the index page currently loaded in `this.$`.
   *
   * All years are requested concurrently. The result is appended to
   * `this.animes` in the order the links appear on the index page. If any
   * year fails, the returned promise rejects with that error.
   * @returns {Promise<Array<Anime>>}
   */
  async parseLinks() {
    // Capture the index document now: year() reassigns this.$ later on.
    const $ = this.$;
    const yearIds = $('h3 a') // All year links
      .toArray()
      .map((link) => $(link).attr('href').split('/')[4]);

    this.finl = 0; // Finished tasks counter
    const perYear = await Promise.all(yearIds.map(async (id) => {
      const animes = await this.year(id); // Parse this year
      this.finl++; // Finished parsing
      return animes;
    }));

    this.animes = (this.animes || []).concat(...perYear);
    return this.animes;
  }

  /**
   * Build an Anime from a series heading and the table that follows it.
   * @param {*} dat `<h3>` element of the series
   * @param {*} [$=this.$] cheerio instance the element belongs to
   * @returns {Anime} `year` is left as null for the caller to fill in
   */
  parseAnime(dat, $ = this.$) {
    const heading = $(dat);
    const titleLink = heading.children('a'); // Title element
    const next = heading.next(); // Next element - other titles or table
    const theme = {
      id: titleLink.attr('href').split('/')[4], // Title link - mal id
      title: titleLink.text(),
      year: null,
      themes: [], // Stays empty when no table directly follows the heading
    };

    const tag = next.prop('tagName');
    if (tag === 'P') { // If next element is other titles
      theme.themes = this.parseTable(next.next()); // Next element should be a table
    } else if (tag === 'TABLE') { // Next element is table
      theme.themes = this.parseTable(next); // Parse table
    }
    return theme;
  }

  /**
   * Parse the theme rows of a table.
   * @param {*} table cheerio selection of the `<table>`, or of an element
   * before it (following siblings are scanned until a table is found)
   * @returns {Array<Theme>} Empty when no table is found among the siblings
   */
  parseTable(table) {
    // If for some reason it's not a table, check the following siblings.
    // Stop at the end of the sibling list rather than looping forever.
    let current = table;
    while (current.length > 0 && current.prop('tagName') !== 'TABLE') {
      current = current.next();
    }
    if (current.length === 0) {
      return [];
    }

    const themes = [];
    current.children('tbody').children('tr').each((i, row) => { // For each theme
      const $ = cheerio.load(row);
      const td = $('td'); // Theme row
      // Names can still carry a literal `&quot;` sequence (presumably left
      // over from the escaped wiki HTML); turn it back into a quote.
      const name = td.first().text().replace(/&quot;/g, '"'); // Theme name
      const linkEl = td.eq(1).children().first(); // link element
      themes.push({
        name,
        link: linkEl.attr('href'), // animethemes.moe link
        desc: linkEl.text(), // link description (eg. NCBD 1080)
        type: (name.startsWith('OP') ? 'opening' : 'ending'),
        episodes: td.eq(2).text(), // Episode notes
        notes: td.eq(3).text(), // Additional notes
      });
    });
    return themes; // Return all themes of this anime
  }
}
// Running from command line: export every theme to output.json and report
// how many anime were parsed.
if (require.main === module) {
  const parser = new ThemeParser();
  parser.export()
    .then((a) => {
      console.log("Parsed " + a.length + " anime. Written to output.json");
    })
    .catch((err) => {
      // Without this handler a failed request, parse or write would only
      // surface as an unhandled promise rejection.
      console.error(err);
      process.exitCode = 1;
    });
}
/**
 * Request a wiki page as JSON from reddit.com and load its `content_html`
 * (after passing it through `getHTML`) into cheerio.
 * @param {string} href Wiki page path, e.g. `/r/AnimeThemes/wiki/2020`
 * @returns {Promise<*>} Result of `cheerio.load` for the page
 */
async function biribiri(href) {
  const url = "https://reddit.com" + href + ".json";
  const requestOptions = {
    headers: {
      "User-Agent": "animethemes-scraper 1.0"
    }
  };
  const { data: body } = await axios.get(url, requestOptions);
  const pageHtml = getHTML(body.data.content_html);
  return cheerio.load(pageHtml);
}
/**
 * Turn entity-escaped angle brackets back into real tags.
 *
 * Callers pass the `content_html` field of a wiki JSON response, which is
 * expected to arrive with `<` and `>` escaped as `&lt;` and `&gt;`. Only
 * those two entities are converted; every other entity is left untouched.
 * @param {string} str Escaped HTML string
 * @returns {string} The string with `&lt;` / `&gt;` replaced by `<` / `>`
 */
function getHTML(str) {
  return str.replace(/&lt;/g, '<').replace(/&gt;/g, '>');
}
/**
 * Replace every occurrence of `find` in `str`.
 *
 * A string `find` is matched literally: regular-expression metacharacters in
 * it are escaped first. A RegExp `find` is used as a pattern with the global
 * flag. `replace` is inserted verbatim, so sequences such as `$&` are not
 * treated as replacement patterns.
 * @param {string} str String to search in
 * @param {string|RegExp} find Text (or pattern) to look for
 * @param {string} replace Replacement text
 * @returns {string} replaced
 */
function replaceAll(str, find, replace) {
  const pattern = find instanceof RegExp
    ? new RegExp(find, 'g')
    : new RegExp(String(find).replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g');
  // A replacer function keeps `$`-sequences in `replace` from being expanded.
  return str.replace(pattern, () => replace);
}
module.exports = ThemeParser; // For importing in code