UNPKG

@johndoeantler/mal-scraper

Version:

Scrape everything you can from MyAnimeList.net

162 lines (140 loc) 5.4 kB
const axios = require('axios') const cheerio = require('cheerio') const { getResultsFromSearch } = require('./info.js') const BASE_URI = 'https://myanimelist.net/anime/' const NUMBER_REVIEWS_BY_PAGE = 20 const INITIAL_FIRST_PAGE_REVIEW = 1 /** * Return a formatter javascript date * @params malDate string a date in a string object * @return date The string parse to a date **/ const malDateToJsDate = (malDate) => { return new Date(malDate) } /** * Return a formatted javascript number * @params malNumber string a number in a string object * @return number The string parse to a number **/ const malNumberToJsNumber = (malNumber) => { return malNumber ? Number(malNumber) : 0 } const parsePage = ($) => { const items = $('.borderDark') const result = [] items.each(function (elem) { const notes = $(this).find('.spaceit.pt8 div') const reviewMore = $(this).find('.spaceit.pt8 span') // For presenting the review only without the notes $(this).find('.spaceit.pt8 div').remove() $(this).find('.spaceit.pt8 span').remove() $(this).find('.spaceit.pt8 a.js-toggle-review-button').remove() result.push({ author: $($(this).find('.spaceit td:nth-child(2) a')['0']).text().trim(), date: malDateToJsDate($($(this).find('.spaceit .mb8 div')['0']).text().trim()), seen: $(this).find('.spaceit .mb8 .lightLink').text().trim(), overall: malNumberToJsNumber($(notes).find('tr:nth-child(1) td:nth-child(2)').text().trim()), story: malNumberToJsNumber($(notes).find('tr:nth-child(2) td:nth-child(2)').text().trim()), animation: malNumberToJsNumber($(notes).find('tr:nth-child(3) td:nth-child(2)').text().trim()), sound: malNumberToJsNumber($(notes).find('tr:nth-child(4) td:nth-child(2)').text().trim()), character: malNumberToJsNumber($(notes).find('tr:nth-child(5) td:nth-child(2)').text().trim()), enjoyment: malNumberToJsNumber($(notes).find('tr:nth-child(6) td:nth-child(2)').text().trim()), review: $(this).find('.spaceit.pt8').text().trim() + $(reviewMore).text().trim() }) }) return result } const 
searchPage = (url, limit, skip, p, res = []) => { return new Promise((resolve, reject) => { axios.get(url, { params: { p } }).then(({ data }) => { const $ = cheerio.load(data) const tmpRes = parsePage($) res = res.concat(tmpRes) // If there is some skip to do, we splice the first result of the first page if (skip !== 0) { res.splice(0, skip) skip = 0 } if (res.length <= limit) { p++ searchPage(url, limit, skip, p, res) .then((data) => resolve(data)) .catch(/* istanbul ignore next */(err) => reject(err)) } else { // If our limit is under the number of result in the page, we remove the excess if (res.length !== limit) { const nbrElementToRemove = res.length - limit res.splice(-nbrElementToRemove, nbrElementToRemove) } resolve(res) } }).catch(/* istanbul ignore next */(err) => reject(err)) }) } const getReviewsFromName = (name, limit, skip, p) => { return new Promise((resolve, reject) => { getResultsFromSearch(name).then((items) => { const { url } = items[0] searchPage(`${encodeURI(url)}/reviews`, limit, skip, p) .then((data) => resolve(data)) .catch(/* istanbul ignore next */(err) => reject(err)) }).catch(/* istanbul ignore next */(err) => reject(err)) }) } const getReviewsFromNameAndId = (id, name, limit, skip, p) => { return new Promise((resolve, reject) => { searchPage(`${BASE_URI}${id}/${encodeURI(name)}/reviews`, limit, skip, p) .then((data) => resolve(data)) .catch(/* istanbul ignore next */(err) => reject(err)) }) } /** * Return the starting page of the query depending of the number of element to skip * @params skip number The number of element to skip * @return number page to start the query **/ const startingPage = (skip) => { return skip !== 0 ? 
Math.floor(skip / NUMBER_REVIEWS_BY_PAGE) + 1 : INITIAL_FIRST_PAGE_REVIEW } /** * Return the number of skip remaining after skipping x page * @params skip number Total number of skip of the call * @params p number Number of page to skip * @return number Number of skip remaining in the first page **/ const skipByPage = (skip, p) => { return skip !== 0 ? Math.max(0, skip - ((p - 1) * NUMBER_REVIEWS_BY_PAGE)) : 0 } const getReviewsList = (obj) => { return new Promise((resolve, reject) => { if (!obj || typeof obj !== 'object') { reject(new Error('[Mal-Scraper]: No id nor name received.')) return } const { id, name, limit } = obj let skip = obj.skip ? obj.skip : 0 if ((obj.id && (!name || isNaN(+id))) || typeof name !== 'string') { reject(new Error('[Mal-Scraper]: Malformed input. ID or name is malformed or missing.')) return } const p = startingPage(skip) skip = skipByPage(skip, p) if (obj.id) { getReviewsFromNameAndId(id, name, limit, skip, p) .then((data) => resolve(data)) .catch(/* istanbul ignore next */(err) => reject(err)) } else { getReviewsFromName(name, limit, skip, p) .then((data) => resolve(data)) .catch(/* istanbul ignore next */(err) => reject(err)) } }) } module.exports = { getReviewsList }