// @tidic/nautiljon-scraper
// Version:
// Nautiljon's anime and manga website scraping tool
// 272 lines (235 loc) • 16 kB
// JavaScript
module.exports = {
searchScraper ($, genre, totalLength) {
let scrapedUrls;
if (genre === 'anime') {
if (!$('tbody')[1]?.children) return []
scrapedUrls = $('tbody')[1].children.map(e => {
let arr = $(e).find(".acenter").toArray();
return {
name: $(e).find('.left.vtop span + a').text() || null,
url: $(e).find('.left.vtop span + a') ? "https://nautiljon.com" + $(e).find('.left.vtop span + a').attr('href') : null,
imageUrl: $(e).find('.image img').attr('src') ? "https://nautiljon.com" + $(e).find('.image img').attr('src').replace("imagesmin", "images") : null,
description: $(e).find('.left.vtop p').text()?.replace(" Lire la suite", "") || null,
format: $(arr[1]).text() || null,
diffusion: $(arr[2]).text() || null,
episodesNumber: $(arr[4]).text() || null,
startDate: $(arr[5]).text() || null,
endDate: $(arr[6]).text() || null,
score: $(arr[7]).text() || null
}
});
}
else if (genre === 'manga') {
if (!$('tbody')[1]?.children) return []
scrapedUrls = $('tbody')[1].children.map(e => {
let arr = $(e).find(".acenter").toArray();
return {
name: $(e).find('.left.vtop span + a').text(),
url: $(e).find('.left.vtop span + a') ? "https://nautiljon.com" + $(e).find('.left.vtop span + a').attr('href') : null,
imageUrl: $(e).find('.image img').attr('src') ? "https://nautiljon.com" + $(e).find('.image img').attr('src').replace("imagesmin", "images") : null,
description: $(e).find('.left.vtop p').text()?.replace(" Lire la suite", "") || null,
type: $(arr[1]).text() || null,
volumesNumber: $(arr[2]).text() || null,
startDate: $(arr[6]).text() || null,
score: $(arr[7]).text() || null
}
});
}
return scrapedUrls.slice(0, totalLength);
},
dataPageScraper ($, url) {
let genre = url.split("/")[3];
if (genre === 'animes') {
//relations
let state = true, lastTitle, relations = {};
$('.top_bloc').filter(function() { return $(this).text().indexOf('Fiches liées') > -1;}).find(".relative.imagesBorder").toArray().filter(e => {
if (!state) return false;
let matchTitle = $(e).find('h3').text();
if (matchTitle.length > 0) {
if (!["Animes", "Manga", "Mangas", "Anime"].includes(matchTitle)) {
state = false;
return false;
}
relations[matchTitle] = [];
lastTitle = matchTitle;
}
relations[lastTitle].push({
name: $(e).find('a').text() || null,
url: $(e).find('a').attr('href') ? "https://nautiljon.com/" + $(e).find('a').attr('href') : null,
relationType: $(e).html().split("</div>")[2]?.trim() || $(e).html().split("</div>")[1]?.trim() || null,
imageUrl: $(e).find('img').attr('src') ? "https://nautiljon.com/" + $(e).find('img').attr('src').replace("imagesmin", "images") : null,
additionnalInformations: $(e).find('div:not([style])').text() || null
})
})
// Récupération intelligente des plateformes VOD
let vodPlatforms = [];
// Méthode 1: Chercher dans les spans avec "streaming" ou "simulcast"
$('span').filter(function() {
return $(this).text().includes('streaming') || $(this).text().includes('simulcast');
}).parent().find('a').each((i, el) => {
const text = $(el).text().trim();
const platforms = ['ADN', 'Crunchyroll', 'Netflix', 'Amazon', 'Funimation', 'Wakanim', 'Anime Digital Network'];
if (platforms.some(platform => text.includes(platform))) {
vodPlatforms.push(text);
}
});
// Méthode 2: Chercher directement les liens vers les plateformes connues
if (vodPlatforms.length === 0) {
const platformLinks = ['ADN', 'Crunchyroll', 'Netflix', 'Amazon Prime Video', 'Funimation', 'Wakanim'];
platformLinks.forEach(platform => {
$('a').each((i, el) => {
const text = $(el).text().trim();
const href = $(el).attr('href') || '';
// Vérifier si c'est un lien vers une plateforme et pas vers une actualité
if (text === platform && (href.includes('/societes/') || href.includes('http'))) {
if (!vodPlatforms.includes(platform)) {
vodPlatforms.push(platform);
}
}
});
});
}
// Récupération améliorée des épisodes
let episodesList = [];
// Chercher le tableau d'épisodes avec 3 colonnes (numéro, titre, date)
$('h2').filter(function() {
return $(this).text().indexOf('Épisodes') > -1;
}).next().find('table').each((i, table) => {
const rows = $(table).find('tr');
if (rows.length > 10) { // Table avec beaucoup d'épisodes
// Vérifier si c'est le bon tableau (3 colonnes)
const firstDataRow = rows.eq(1);
const cells = firstDataRow.find('td');
if (cells.length === 3) {
// C'est le bon tableau !
rows.slice(1).each((j, row) => {
const cells = $(row).find('td');
if (cells.length === 3) {
const episodeNum = $(cells[0]).text().trim();
const titleText = $(cells[1]).text().trim();
const dateText = $(cells[2]).text().trim();
// Séparer le titre anglais et français
let englishTitle = null;
let frenchTitle = null;
// Le format semble être "English Title!French Title" ou parfois avec des annotations
if (titleText.includes('!')) {
const parts = titleText.split('!');
englishTitle = parts[0].trim();
// Prendre tout ce qui suit le premier "!" comme titre français
frenchTitle = parts.slice(1).join('!').trim();
} else {
// Si pas de "!", c'est probablement juste un titre
englishTitle = titleText;
frenchTitle = titleText;
}
// Nettoyer les annotations comme "[Semi filler]"
englishTitle = englishTitle.replace(/\[.*?\]/g, '').trim();
frenchTitle = frenchTitle.replace(/\[.*?\]/g, '').trim();
episodesList.push({
name: englishTitle,
frenchName: frenchTitle,
episode: episodeNum,
date: dateText
});
}
});
}
}
});
return {
name: $('.h1titre > [itemprop="name"]')?.text() || null,
japName: $('span').filter(function() { return $(this).text().indexOf('Titre original : ') > -1;}).parent().html()?.split("</span>")[1].trim() || null,
alternateName: $('[itemprop="alternateName"]').html() || null,
url: url,
imageUrl: $('.image_fiche.fleft a img').attr('src') ? `https://nautiljon.com${$('.image_fiche.fleft a img').attr('src').replace('/mini', '')}` : null,
country: $('.flag').next().text() || null,
score: $('[itemprop="ratingValue"]').text() || null,
votersNumber: $('[itemprop="ratingCount"]').text() || null,
format: $('span').filter(function() { return $(this).text().indexOf('Format : ') > -1;}).next().text() ||
$('span').filter(function() { return $(this).text().indexOf('Type : ') > -1;}).next().text() ||
'Série TV', // Valeur par défaut si non trouvé
source: $('span').filter(function() { return $(this).text().indexOf('Origine : ') > -1;}).next().text() || null,
startDate: $('[itemprop="datePublished"]').text() || null,
endDate: $('[itemprop="datePublished"]').parent().html()?.split("</span>")[2].split("au")[1]?.trim() || null,
genres: $('[itemprop="genre"]').toArray().map(e => $(e).text()),
studio: $('span').filter(function() { return $(this).text().indexOf('Studio d\'animation : ') > -1;}).parent().find('[itemprop="legalName"]').text() || null,
vodPlatform: vodPlatforms,
description: $($(".description").html()?.split('<div class="groupe"')[0].trim()).text() || null,
pictures: $('h3').filter(function() { return $(this).text().indexOf('Captures d\'écran') > -1;}).next().find('a').toArray().map(e => "https://nautiljon.com" + e.attribs.href),
trailer: $('.unTrailerA')[0]?.attribs.href || null,
episodes: {
totalNumber: $('[itemprop="numberOfEpisodes"]').text() || null,
duration: $('[itemprop="numberOfEpisodes"]').parent().html()?.split('</span>')[1]?.split(" x ")[1] || null,
listEpisodes: episodesList
},
relations: relations,
news: {
french: $('#fiche_news li').toArray().map(e => { return {
name: $(e).find('.sim[href]').text()?.split(":").map(e => e.trim())?.slice(1).join(" ") || null,
url: $(e).find('.sim[href]') ? "https://nautiljon.com" + $(e).find('.sim[href]').attr('href') : null,
date: $(e).find('.sim[href]').text()?.split(":")[0]?.trim() || null,
description: $(e).find('.introNews').text() || null,
imageUrl: $(e).find('img') ? "https://nautiljon.com" + $(e).find('img').attr('src') : null,
}})
}
}
}
else if (genre === "mangas") {
//relations
let state = true, lastTitle, relations = {};
$('.top_bloc').filter(function() { return $(this).text().indexOf('Fiches liées') > -1;}).find(".relative.imagesBorder").toArray().filter(e => {
if (!state) return false;
let matchTitle = $(e).find('h3').text();
if (matchTitle.length > 0) {
if (!["Animes", "Manga", "Mangas", "Anime"].includes(matchTitle)) {
state = false;
return false;
}
relations[matchTitle] = [];
lastTitle = matchTitle;
}
relations[lastTitle].push({
name: $(e).find('a').text() || null,
url: $(e).find('a').attr('href') ? "https://nautiljon.com/" + $(e).find('a').attr('href') : null,
relationType: $(e).html().split("</div>")[2]?.trim() || $(e).html().split("</div>")[1]?.trim() || null,
imageUrl: $(e).find('img').attr('src') ? "https://nautiljon.com/" + $(e).find('img').attr('src').replace("imagesmin", "images") : null,
additionnalInformations: $(e).find('div:not([style])').text() || null
})
})
return {
name: $('.h1titre > [itemprop="name"]')?.text() || null,
japName: $('span').filter(function() { return $(this).text().indexOf('Titre original : ') > -1;}).parent().html()?.split("</span>")[1].trim() || null,
alternateName: $('[itemprop="alternateName"]').html() || null,
url: url,
imageUrl: $('.image_fiche.fleft a img').attr('src') ? `https://nautiljon.com${$('.image_fiche.fleft a img').attr('src').replace('/mini', '')}` : null,
score: $('[itemprop="ratingValue"]').text() || null,
votersNumber: $('[itemprop="ratingCount"]').text() || null,
country: $('.flag').parent().text()?.split(":")[1]?.split("-")[0]?.trim() || null,
type: $('span').filter(function() { return $(this).text().indexOf('Type : ') > -1;}).next().text() || null,
startDate: $('[itemprop="datePublished"]').text() || null,
status: $('span').filter(function() { return $(this).text().indexOf('Nb volumes VO : ') > -1;}).parent().html()?.split("</span>")[1]?.split("(")[1]?.slice(0, -1) || null,
volumesNumber: $('span').filter(function() { return $(this).text().indexOf('Nb volumes VO : ') > -1;}).parent().html()?.split("</span>")[1]?.split("(")[0]?.trim() || null,
genres: $('[itemprop="genre"]').toArray().map(e => $(e).text()),
author: {
story: $('li [itemprop="author"] [itemprop="name"]').text() || null,
art: $('li [itemprop="illustrator"] [itemprop="name"]').text() || $('li [itemprop="author"] [itemprop="name"]').text() || null
},
editor: {
VO: $('span').filter(function() { return $(this).text().indexOf('Éditeur VO : ') > -1;}).parent().find('[itemprop="legalName"]').text() || null,
VF: $('span').filter(function() { return $(this).text().indexOf('Éditeur VF : ') > -1;}).parent().find('[itemprop="legalName"]').text() || null
},
description: $($(".description").html()?.split('<div class="groupe"')[0].trim()).text() || null,
relations: relations,
news: {
french: $('#fiche_news li').toArray().map(e => { return {
name: $(e).find('.sim[href]').text()?.split(":").map(e => e.trim())?.slice(1).join(" ") || null,
url: $(e).find('.sim[href]') ? "https://nautiljon.com" + $(e).find('.sim[href]').attr('href') : null,
date: $(e).find('.sim[href]').text()?.split(":")[0]?.trim() || null,
description: $(e).find('.introNews').text() || null,
imageUrl: $(e).find('img') ? "https://nautiljon.com" + $(e).find('img').attr('src') : null,
}})
}
}
}
},
}