/*
 * mal-scrape
 * Version: (not recorded)
 * Scrapes info from MyAnimeList.
 * Source listing: 601 lines (554 loc) • 19.8 kB, JavaScript
 */
const fetch = require('node-fetch')
const cheerio = require('cheerio')
const endpoints = require('./endpoints')
/**
 * MAL class
 * @class
 * @classdesc Scraper client for MyAnimeList. It downloads pages with `fetch`,
 * parses the HTML with cheerio and returns plain objects describing top lists,
 * search results, details, pictures and videos.
 */
class MAL {
  /**
   * @typedef Posters
   * @type {Object}
   * @property {string} src - the src to the item of the top page.
   * @property {Object} srcset - the srcset of the item (an object keyed by size descriptor, e.g. "1x"/"2x").
   * @property {string} id - the picture id of the img.
   * @property {string} big - the poster in a big size.
   * @property {string} huge - the poster in a huge size.
   */
  /**
   * @typedef TopInfo
   * @type {Object}
   * @property {string} runtime - the runtime of the item.
   * @property {number} members - the members of the item.
   * @property {number} episodes - the amount of episodes of an item (0 when unknown).
   * @property {string} type - the type of an item. this is different from the other type.
   */
  /**
   * @typedef Top
   * @type {Object}
   * @property {number} id - the id of the item.
   * @property {number} ranking - the ranking of the item in the list.
   * @property {string} title - the title of the item.
   * @property {string} href - the link to the item.
   * @property {number} score - the score of the item.
   * @property {Posters} posters - the poster of the item in different sizes.
   * @property {string} type - the type of the item.
   * @property {TopInfo} info - the basic info of the item.
   * @property {function} getDetails - returns a Promise with the details.
   * @property {function} getPictures - returns a Promise with the pictures.
   * @property {function} getPics - returns a Promise with the pictures.
   * @property {function} getImages - returns a Promise with the pictures.
   * @property {function} getVideos - returns a Promise with the videos.
   */
  /**
   * @typedef DetailsInformation
   * @type {Object}
   * @property {string} type - the type of an item. this is different from the other type.
   * @property {string} episodes - the amount of episodes of an item. this will become an int.
   * @property {string} status - the status of an item: finished airing and that kind of stuff.
   * @property {string} aired - from when to when it aired.
   * @property {string} premiered - when it premiered.
   * @property {string} broadcast - what day and time it broadcasts.
   * @property {string} producers - the producers. this will become an array.
   * @property {string} licensors - the licensors. this will become an array.
   * @property {string} studios - the studios. this will become an array.
   * @property {string} source - the source.
   * @property {string} duration - the duration. how long an episode is.
   * @property {string} rating - the rating, pg-13 or something.
   */
  /**
   * @typedef DetailsStatistics
   * @type {Object}
   * @property {string} score - the score of the item.
   * @property {string} ranked - the rank of the item.
   * @property {string} popularity - the popularity of the item.
   * @property {string} members - the amount of members of the item.
   * @property {string} favorites - the amount of favorites of the item.
   */
  /**
   * @typedef Details
   * @type {Object}
   * @property {string} title - the title of the item.
   * @property {string} type - the type of the item.
   * @property {number} score - the score of the item.
   * @property {number} rank - the rank of the item.
   * @property {number} popularity - the popularity of the item.
   * @property {number} members - the members of the item.
   * @property {string} synopsis - the synopsis of the item.
   * @property {string} poster - the poster of the item.
   * @property {Object} video - the video on the details page of the item. contains the href to the embedded youtube player and a youtube video id.
   * @property {string} href - the link to the items page.
   * @property {Object} alternativeTitles - the alternative titles of the item.
   * @property {DetailsInformation} information - detailed info of the item.
   * @property {DetailsStatistics} statistics - statistics of the item
   * @property {function} getPictures - returns a Promise with the pictures.
   * @property {function} getPics - returns a Promise with the pictures.
   * @property {function} getImages - returns a Promise with the pictures.
   * @property {function} getVideos - returns a Promise with the videos.
   */
  /**
   * @typedef SearchItem
   * @type {Object}
   * @property {string} title - the title of the item.
   * @property {string} href - the link to the items page.
   * @property {number} id - the id of the item.
   * @property {string} type - the type of the item.
   * @property {string} synopsis - a short synopsis of the item.
   * @property {Posters} posters - different sizes of the poster.
   * @property {function} getPictures - returns a Promise with the pictures.
   * @property {function} getPics - returns a Promise with the pictures.
   * @property {function} getImages - returns a Promise with the pictures.
   * @property {function} getVideos - returns a Promise with the videos.
   * @property {function} getDetails - returns a Promise with the details.
   */

  /**
   * constructor, instantiates the object
   * @param {string} prefix - it uses this in front of the request, you could use this to prevent cors errors in browsers
   * @param {string} url - the base url to use, default: https://myanimelist.net
   */
  constructor(prefix = '', url) {
    this.base = url || 'https://myanimelist.net'
    this.prefix = prefix
  }

  /**
   * Builds a query string from an options object, e.g. `{limit: 50}` -> `?limit=50`.
   * Keys and values are URL-encoded so search terms containing spaces, `&` or `#`
   * cannot corrupt the request.
   * @param {Object} [options] - the GET parameters; own enumerable keys only
   * @returns {string} the query string including the leading `?`, or '' when there are no options
   */
  _genOptions(options) {
    const source = options || {}
    const pairs = Object.keys(source).map(
      key => `${encodeURIComponent(key)}=${encodeURIComponent(source[key])}`
    )
    return pairs.length > 0 ? `?${pairs.join('&')}` : ''
  }

  /**
   * Fills the `{name}` placeholders of a path template with values from `param`.
   * Placeholders without a usable value are removed, and runs of slashes that
   * result from that are collapsed to a single slash.
   * @param {string} path - the path template, e.g. `/{type}/{id}`
   * @param {Object} [param] - the values for the placeholders
   * @returns {string} the resolved path
   */
  _parsePathParam(path, param = {}) {
    const values = param || {}
    // a replacer function keeps `$` sequences in values literal
    const filled = path.replace(/{(.*?)}/g, (token, name) => {
      const value = values[name]
      // 0 is a legitimate value; every other falsy value counts as "missing"
      return value || value === 0 ? String(value) : ''
    })
    return filled.replace(/\/\/+/g, '/') // remove double slashes
  }

  /**
   * Downloads a page and resolves with its body as text.
   * The URL is prefix + base + resolved path + query string.
   * @param {string} path - the path template of the page
   * @param {Object} [options] - the GET parameters
   * @param {Object} [param] - the values for the placeholders in the path
   * @returns {Promise<string>} a promise with the response body
   */
  _get(path, options, param) {
    const url =
      this.prefix +
      this.base +
      this._parsePathParam(path, param) +
      this._genOptions(options)
    return fetch(url).then(res => res.text())
  }

  /**
   * Turns a srcset string ("url 1x, url 2x") into an object keyed by descriptor.
   * @param {string} srcset - the raw srcset value
   * @returns {Object} descriptor -> url; empty when srcset is not a string
   */
  _parseSrcset(srcset) {
    const sizes = {}
    if (typeof srcset !== 'string') {
      return sizes
    }
    for (const candidate of srcset.split(/,\s+/)) {
      const entry = candidate.trim()
      if (entry === '') {
        continue
      }
      const piece = entry.split(/\s+/)
      sizes[piece[1]] = piece[0]
    }
    return sizes
  }

  /**
   * Derives the poster id and the big/huge poster urls from a thumbnail src.
   * The id is the part of the src after `<type>` up to and including the
   * extension; the code assumes a three-character extension.
   * @param {string} src - the thumbnail url
   * @param {string} type - anime or manga
   * @returns {{id: string, big: string, huge: string}} the derived values
   */
  _posterSizes(src, type) {
    const posterBase = `https://myanimelist.cdn-dena.com/images/${type}`
    const pos = src.indexOf(`${type}/`) + type.length
    const id = src.substring(pos, src.indexOf('.', pos) + 4)
    return {
      id,
      big: posterBase + id,
      huge: posterBase + id.replace('.', 'l.')
    }
  }

  /**
   * Expands the raw poster data on `out.posters` in place: `srcset` becomes an
   * object and `id`, `big` and `huge` are added.
   * Doesn't return anything, because `out` is modified directly.
   * @param {Object} out - an object with a `posters` property holding `src` and `srcset` strings
   * @param {string} type - anime or manga
   */
  _parsePosters(out, type) {
    const posters = out.posters
    const sizes = this._posterSizes(posters.src, type)
    posters.srcset = this._parseSrcset(posters.srcset)
    Object.assign(posters, sizes)
  }

  /**
   * Reads the poster data from the first `img` inside `elem` and expands it.
   * @param {Object} elem - a cheerio selection containing an `img` with data-src/data-srcset
   * @param {string} type - anime or manga
   * @returns {Posters} the data object of the image, extended with srcset/id/big/huge
   */
  _getPosters(elem, type) {
    const data = elem.find('img').data()
    const sizes = this._posterSizes(data.src, type)
    data.srcset = this._parseSrcset(data.srcset)
    return Object.assign(data, sizes)
  }

  /**
   * Parses one row of a top-ranking table.
   * @param {Object} tr - the cheerio selection of the row
   * @param {string} type - anime or manga
   * @returns {Top} the parsed item
   */
  _parseTopTr(tr, type = 'anime') {
    const trigger = tr.find('.detail .hoverinfo_trigger')
    const out = {
      id: parseInt(trigger.attr('id').replace('#area', ''), 10),
      ranking: parseInt(tr.find('.top-anime-rank-text').text(), 10),
      title: trigger
        .text()
        .replace(/\s\s+/g, ' ')
        .trim(),
      href: tr.find('.hoverinfo_trigger').attr('href'),
      score: parseFloat(tr.find('.score .text').text()),
      posters: this._getPosters(tr, type),
      type: type
    }

    // the information cell holds three lines: "<type> (<n> eps)", runtime, "<n> members"
    const [summary = '', runtime = '', members = ''] = tr
      .find('.information')
      .text()
      .trim()
      .split('\n')
      .map(line => line.replace(/\s\s+/g, ' '))

    out.info = {
      runtime: runtime.trim(),
      members: parseInt(
        members
          .replace(' members', '')
          .replace(/,/g, '') // every thousands separator, not just the first
          .trim(),
        10
      ),
      episodes:
        parseInt(summary.replace(/[a-zA-Z()?]/g, '').trim(), 10) || 0,
      type: summary
        .replace(/[0-9()?]/g, '')
        .replace(type === 'anime' ? 'eps' : 'vols', '')
        .trim()
    }

    // add functions
    out.getDetails = () => this.getDetails(out.id, out.type)
    this._addFunctionsToOutput(out)
    return out
  }

  /**
   * Adds the picture and video helper functions to an output object.
   * The helpers read `output.href` at call time.
   * @param {Object} output - the object to extend
   */
  _addFunctionsToOutput(output) {
    output.getPictures = () => this._getPictures(output.href)
    output.getPics = () => this._getPictures(output.href) // synonym
    output.getImages = () => this._getPictures(output.href) // synonym
    output.getVideos = () => this._getVideos(output.href)
  }

  /**
   * Downloads a top-ranking page and parses every `.ranking-list` row.
   * @param {string} endpoint - the path of the ranking page
   * @param {string} type - anime or manga
   * @param {Object} [options] - the GET options to give to the page
   * @returns {Promise<Top[]>} a promise with the parsed rows
   */
  _getTop(endpoint, type, options) {
    // anything thrown while parsing rejects the returned promise
    return this._get(endpoint, options).then(text => {
      const $ = cheerio.load(text)
      const output = []
      $('.top-ranking-table tbody')
        .children()
        .each((i, elem) => {
          const tr = $(elem)
          if (tr.hasClass('ranking-list')) {
            output.push(this._parseTopTr(tr, type))
          }
        })
      return output
    })
  }

  /**
   * get 50 of the top anime
   * @param {object} options - the GET options to give to the page
   * @returns {Promise<Top[]>} - a promise with the data
   */
  topAnime(options) {
    return this._getTop(endpoints.topAnime, 'anime', options)
  }

  /**
   * get 50 of the top manga
   * @param {object} options - the GET options to give to the page
   * @returns {Promise<Top[]>} - a promise with the data
   */
  topManga(options) {
    return this._getTop(endpoints.topManga, 'manga', options)
  }

  /**
   * get a top 50
   * @param {string} type - the type of the top list: anime or manga (may be omitted: `top(options)`)
   * @param {object} options - the GET options to give to the page
   * @returns {Promise<Top[]>} - a promise with the data; rejects with an Error for an unknown type
   */
  top(type = 'anime', options) {
    // support the short form top(options)
    const optionsOnly = typeof type === 'object'
    const kind = optionsOnly ? 'anime' : type
    const opts = optionsOnly ? type : options
    switch (kind) {
      case 'anime':
        return this.topAnime(opts)
      case 'manga':
        return this.topManga(opts)
      default:
        return Promise.reject(new Error('wrong type!'))
    }
  }

  /**
   * Converts the url of a huge picture ("...123l.jpg") to the big one ("...123.jpg")
   * by dropping the `l` right before the file extension.
   * @param {string} href - the url of the huge picture
   * @returns {string} the url of the big picture; unchanged when there is no `l` suffix
   */
  _toBigPicture(href) {
    return href.replace(/l(\.\w+(?:[?#].*)?)$/, '$1')
  }

  /**
   * Downloads the pictures page of an item.
   * @param {string} baseUrl - the link to the item
   * @returns {Promise<Object[]>} a promise with `{huge, big}` objects; gallery entries without href are skipped
   */
  _getPictures(baseUrl) {
    const url = this.prefix + baseUrl + '/pics'
    return fetch(url)
      .then(data => data.text())
      .then(html => {
        const $ = cheerio.load(html)
        const output = []
        $('.js-picture-gallery').each((i, elem) => {
          const huge = $(elem).attr('href')
          if (!huge) {
            return
          }
          output.push({ huge, big: this._toBigPicture(huge) })
        })
        return output
      })
  }

  /**
   * Extracts the video id from a youtube embed url.
   * @param {string} href - e.g. https://www.youtube.com/embed/<id>?autoplay=1
   * @returns {string} the part between `/embed/` and the query string, or '' when the url has no `/embed/`
   */
  _youtubeId(href) {
    const marker = '/embed/'
    const start = href.indexOf(marker)
    if (start === -1) {
      return ''
    }
    const from = start + marker.length
    const queryAt = href.indexOf('?', from)
    return queryAt === -1 ? href.substring(from) : href.substring(from, queryAt)
  }

  /**
   * Downloads the videos page of an item.
   * @param {string} baseUrl - the link to the item
   * @returns {Promise<Object[]>} a promise with `{href, youtube, thumbnail, id}` objects
   */
  _getVideos(baseUrl) {
    const url = this.prefix + baseUrl + '/video'
    return fetch(url)
      .then(data => data.text())
      .then(html => {
        const $ = cheerio.load(html)
        const output = []
        $('.video-list-outer .video-list').each((i, elem) => {
          const item = $(elem)
          const youtubeHref = item.attr('href') || ''
          // an entry without thumbnail must not fail the whole list
          const thumbnail = item.find('.thumbs').data() || {}
          const out = {
            href: youtubeHref,
            youtube: this._youtubeId(youtubeHref),
            thumbnail
          }
          out.id = thumbnail.animeId
          delete thumbnail.pinNoHover // unneeded data, is for the site or something
          output.push(out)
        })
        return output
      })
  }

  /**
   * Normalises a label: collapses whitespace, trims and drops the first colon.
   * @param {string} str - the raw label, e.g. "Episodes: "
   * @returns {string} the cleaned label, e.g. "Episodes"
   */
  _cleanLabel(str) {
    return str
      .replace(/\s\s+/g, ' ')
      .trim()
      .replace(':', '')
      .trim()
  }

  /**
   * turns a string into camelCase
   * @param {string} str - e.g. "Alternative Titles"
   * @returns {string} e.g. "alternativeTitles"
   */
  _camelize(str) {
    return str
      .replace(/(?:^\w|[A-Z]|\b\w)/g, (letter, index) =>
        index === 0 ? letter.toLowerCase() : letter.toUpperCase()
      )
      .replace(/\s+/g, '')
  }

  /**
   * Splits text of the form "Label: value Label: value ..." into an object.
   * Example: "Type: Movie Episodes: 1 Status: Finished Airing" becomes
   * `{type: 'Movie', episodes: '1', status: 'Finished Airing'}`.
   * A label is the word directly in front of ": ". Every label is paired with
   * the text that follows it, so text before the first label or an empty value
   * cannot shift the other values onto the wrong key.
   * @param {string} text - the whitespace-normalised text of a section
   * @returns {Object} camelCased label -> trimmed value
   */
  _parseLabelledText(text) {
    const labelPattern = /\w*: /g
    const labels = text.match(labelPattern) || []
    // the first piece is whatever precedes the first label, it has no key
    const values = text.split(labelPattern).slice(1)
    const out = {}
    labels.forEach((label, i) => {
      out[this._camelize(this._cleanLabel(label))] = (values[i] || '').trim()
    })
    return out
  }

  /**
   * Parses the sidebar sections (every `h2` with the elements up to the next `h2`)
   * of a details page.
   * @param {function} $ - the cheerio instance of the details page
   * @returns {Object} camelCased heading -> parsed section
   */
  _parseDetailSections($) {
    const sections = {}
    $('.js-scrollfix-bottom h2').each((i, elem) => {
      const heading = $(elem)
      const text = heading
        .nextUntil('h2')
        .text()
        .replace(/\s\s+/g, ' ')
        .trim()
      const key = this._camelize(this._cleanLabel(heading.text()))
      sections[key] = this._parseLabelledText(text)
    })
    return sections
  }

  /**
   * get the details of an anime or manga
   * @param {number} id - the id of the anime or manga
   * @param {string} type - the type of the thing you want the details from: anime or manga
   * @returns {Promise<Details>} - a promise with the data
   */
  getDetails(id, type = 'anime') {
    return this._get(endpoints.details, {}, { type: type, id: id }).then(
      text => {
        const $ = cheerio.load(text)
        const youtubeHref =
          $('.video-promotion .video-unit').attr('href') || ''
        const basics = {
          title: $('#contentWrapper h1 [itemProp=name]').text(),
          type: type,
          score: parseFloat(
            $('.score')
              .text()
              .replace(/\s\s+/g, ' ')
              .trim()
          ),
          rank: parseInt(
            $('.ranked strong')
              .text()
              .replace('#', ''),
            10
          ),
          popularity: parseInt(
            $('.popularity strong')
              .text()
              .replace('#', ''),
            10
          ),
          members: parseInt(
            $('.members strong')
              .text()
              .replace(/,/g, ''), // every thousands separator, not just the first
            10
          ),
          synopsis: $('[itemProp=description]').text(),
          poster: $('.js-scrollfix-bottom img.ac').attr('src'),
          video: {
            href: youtubeHref,
            youtube: this._youtubeId(youtubeHref)
          },
          href: $('#horiznav_nav ul')
            .children()
            .find('a')
            .attr('href')
        }
        const output = Object.assign({}, basics, this._parseDetailSections($))
        this._addFunctionsToOutput(output)
        return output
      }
    )
  }

  /**
   * Parses one row of the search result table.
   * @param {Object} elem - the cheerio selection of the row
   * @param {string} type - anime or manga
   * @returns {SearchItem} the parsed item
   */
  _parseSearchItems(elem, type = 'anime') {
    const link = elem.find('a.hoverinfo_trigger')
    const out = {
      title: elem.find('.hoverinfo_trigger strong').text(),
      href: link.attr('href'),
      id: parseInt(link.attr('id').replace('sarea', ''), 10),
      type: type,
      synopsis: elem
        .find('.pt4')
        .text()
        .replace('read more.', ''),
      posters: this._getPosters(elem, type)
      // TODO: info (type, score, episodes) is not parsed yet
    }
    this._addFunctionsToOutput(out)
    out.getDetails = () => this.getDetails(out.id, out.type)
    return out
  }

  /**
   * search for anime or manga, might also work for other things but I didn't test that
   * @param {string} q - the query/search terms
   * @param {string} type - the type of the thing you want to search: anime or manga (may be omitted: `search(q, options)`)
   * @param {object} options - the GET options to give to the page
   * @returns {Promise.<SearchItem[]>} - a promise with the data
   */
  search(q, type = 'anime', options) {
    // support the short form search(q, options)
    const optionsOnly = typeof type === 'object'
    const kind = optionsOnly ? 'anime' : type
    const opts = optionsOnly ? type : options
    return this._get(endpoints.search, Object.assign({}, { q }, opts), {
      type: kind
    }).then(text => {
      const $ = cheerio.load(text)
      const output = []
      $('.js-block-list tbody')
        .children()
        .each((i, elem) => {
          // the first row is the table header
          if (i !== 0) {
            output.push(this._parseSearchItems($(elem), kind))
          }
        })
      return output
    })
  }
}
// the MAL class is the module's only export (CommonJS)
module.exports = MAL