UNPKG

@tidic/nautiljon-scraper

Version:

Nautiljon's anime and manga website scraping tool

272 lines (235 loc) 16 kB
module.exports = { searchScraper ($, genre, totalLength) { let scrapedUrls; if (genre === 'anime') { if (!$('tbody')[1]?.children) return [] scrapedUrls = $('tbody')[1].children.map(e => { let arr = $(e).find(".acenter").toArray(); return { name: $(e).find('.left.vtop span + a').text() || null, url: $(e).find('.left.vtop span + a') ? "https://nautiljon.com" + $(e).find('.left.vtop span + a').attr('href') : null, imageUrl: $(e).find('.image img').attr('src') ? "https://nautiljon.com" + $(e).find('.image img').attr('src').replace("imagesmin", "images") : null, description: $(e).find('.left.vtop p').text()?.replace(" Lire la suite", "") || null, format: $(arr[1]).text() || null, diffusion: $(arr[2]).text() || null, episodesNumber: $(arr[4]).text() || null, startDate: $(arr[5]).text() || null, endDate: $(arr[6]).text() || null, score: $(arr[7]).text() || null } }); } else if (genre === 'manga') { if (!$('tbody')[1]?.children) return [] scrapedUrls = $('tbody')[1].children.map(e => { let arr = $(e).find(".acenter").toArray(); return { name: $(e).find('.left.vtop span + a').text(), url: $(e).find('.left.vtop span + a') ? "https://nautiljon.com" + $(e).find('.left.vtop span + a').attr('href') : null, imageUrl: $(e).find('.image img').attr('src') ? "https://nautiljon.com" + $(e).find('.image img').attr('src').replace("imagesmin", "images") : null, description: $(e).find('.left.vtop p').text()?.replace(" Lire la suite", "") || null, type: $(arr[1]).text() || null, volumesNumber: $(arr[2]).text() || null, startDate: $(arr[6]).text() || null, score: $(arr[7]).text() || null } }); } return scrapedUrls.slice(0, totalLength); }, dataPageScraper ($, url) { let genre = url.split("/")[3]; if (genre === 'animes') { //relations let state = true, lastTitle, relations = {}; $('.top_bloc').filter(function() { return $(this).text().indexOf('Fiches liées') > -1;}).find(".relative.imagesBorder").toArray().filter(e => { if (!state) return false; let matchTitle = $(e).find('h3').text(); if (matchTitle.length > 0) { if (!["Animes", "Manga", "Mangas", "Anime"].includes(matchTitle)) { state = false; return false; } relations[matchTitle] = []; lastTitle = matchTitle; } relations[lastTitle].push({ name: $(e).find('a').text() || null, url: $(e).find('a').attr('href') ? "https://nautiljon.com/" + $(e).find('a').attr('href') : null, relationType: $(e).html().split("</div>")[2]?.trim() || $(e).html().split("</div>")[1]?.trim() || null, imageUrl: $(e).find('img').attr('src') ? "https://nautiljon.com/" + $(e).find('img').attr('src').replace("imagesmin", "images") : null, additionnalInformations: $(e).find('div:not([style])').text() || null }) }) // Récupération intelligente des plateformes VOD let vodPlatforms = []; // Méthode 1: Chercher dans les spans avec "streaming" ou "simulcast" $('span').filter(function() { return $(this).text().includes('streaming') || $(this).text().includes('simulcast'); }).parent().find('a').each((i, el) => { const text = $(el).text().trim(); const platforms = ['ADN', 'Crunchyroll', 'Netflix', 'Amazon', 'Funimation', 'Wakanim', 'Anime Digital Network']; if (platforms.some(platform => text.includes(platform))) { vodPlatforms.push(text); } }); // Méthode 2: Chercher directement les liens vers les plateformes connues if (vodPlatforms.length === 0) { const platformLinks = ['ADN', 'Crunchyroll', 'Netflix', 'Amazon Prime Video', 'Funimation', 'Wakanim']; platformLinks.forEach(platform => { $('a').each((i, el) => { const text = $(el).text().trim(); const href = $(el).attr('href') || ''; // Vérifier si c'est un lien vers une plateforme et pas vers une actualité if (text === platform && (href.includes('/societes/') || href.includes('http'))) { if (!vodPlatforms.includes(platform)) { vodPlatforms.push(platform); } } }); }); } // Récupération améliorée des épisodes let episodesList = []; // Chercher le tableau d'épisodes avec 3 colonnes (numéro, titre, date) $('h2').filter(function() { return $(this).text().indexOf('Épisodes') > -1; }).next().find('table').each((i, table) => { const rows = $(table).find('tr'); if (rows.length > 10) { // Table avec beaucoup d'épisodes // Vérifier si c'est le bon tableau (3 colonnes) const firstDataRow = rows.eq(1); const cells = firstDataRow.find('td'); if (cells.length === 3) { // C'est le bon tableau ! rows.slice(1).each((j, row) => { const cells = $(row).find('td'); if (cells.length === 3) { const episodeNum = $(cells[0]).text().trim(); const titleText = $(cells[1]).text().trim(); const dateText = $(cells[2]).text().trim(); // Séparer le titre anglais et français let englishTitle = null; let frenchTitle = null; // Le format semble être "English Title!French Title" ou parfois avec des annotations if (titleText.includes('!')) { const parts = titleText.split('!'); englishTitle = parts[0].trim(); // Prendre tout ce qui suit le premier "!" comme titre français frenchTitle = parts.slice(1).join('!').trim(); } else { // Si pas de "!", c'est probablement juste un titre englishTitle = titleText; frenchTitle = titleText; } // Nettoyer les annotations comme "[Semi filler]" englishTitle = englishTitle.replace(/\[.*?\]/g, '').trim(); frenchTitle = frenchTitle.replace(/\[.*?\]/g, '').trim(); episodesList.push({ name: englishTitle, frenchName: frenchTitle, episode: episodeNum, date: dateText }); } }); } } }); return { name: $('.h1titre > [itemprop="name"]')?.text() || null, japName: $('span').filter(function() { return $(this).text().indexOf('Titre original : ') > -1;}).parent().html()?.split("</span>")[1].trim() || null, alternateName: $('[itemprop="alternateName"]').html() || null, url: url, imageUrl: $('.image_fiche.fleft a img').attr('src') ? `https://nautiljon.com${$('.image_fiche.fleft a img').attr('src').replace('/mini', '')}` : null, country: $('.flag').next().text() || null, score: $('[itemprop="ratingValue"]').text() || null, votersNumber: $('[itemprop="ratingCount"]').text() || null, format: $('span').filter(function() { return $(this).text().indexOf('Format : ') > -1;}).next().text() || $('span').filter(function() { return $(this).text().indexOf('Type : ') > -1;}).next().text() || 'Série TV', // Valeur par défaut si non trouvé source: $('span').filter(function() { return $(this).text().indexOf('Origine : ') > -1;}).next().text() || null, startDate: $('[itemprop="datePublished"]').text() || null, endDate: $('[itemprop="datePublished"]').parent().html()?.split("</span>")[2].split("au")[1]?.trim() || null, genres: $('[itemprop="genre"]').toArray().map(e => $(e).text()), studio: $('span').filter(function() { return $(this).text().indexOf('Studio d\'animation : ') > -1;}).parent().find('[itemprop="legalName"]').text() || null, vodPlatform: vodPlatforms, description: $($(".description").html()?.split('<div class="groupe"')[0].trim()).text() || null, pictures: $('h3').filter(function() { return $(this).text().indexOf('Captures d\'écran') > -1;}).next().find('a').toArray().map(e => "https://nautiljon.com" + e.attribs.href), trailer: $('.unTrailerA')[0]?.attribs.href || null, episodes: { totalNumber: $('[itemprop="numberOfEpisodes"]').text() || null, duration: $('[itemprop="numberOfEpisodes"]').parent().html()?.split('</span>')[1]?.split(" x ")[1] || null, listEpisodes: episodesList }, relations: relations, news: { french: $('#fiche_news li').toArray().map(e => { return { name: $(e).find('.sim[href]').text()?.split(":").map(e => e.trim())?.slice(1).join(" ") || null, url: $(e).find('.sim[href]') ? "https://nautiljon.com" + $(e).find('.sim[href]').attr('href') : null, date: $(e).find('.sim[href]').text()?.split(":")[0]?.trim() || null, description: $(e).find('.introNews').text() || null, imageUrl: $(e).find('img') ? "https://nautiljon.com" + $(e).find('img').attr('src') : null, }}) } } } else if (genre === "mangas") { //relations let state = true, lastTitle, relations = {}; $('.top_bloc').filter(function() { return $(this).text().indexOf('Fiches liées') > -1;}).find(".relative.imagesBorder").toArray().filter(e => { if (!state) return false; let matchTitle = $(e).find('h3').text(); if (matchTitle.length > 0) { if (!["Animes", "Manga", "Mangas", "Anime"].includes(matchTitle)) { state = false; return false; } relations[matchTitle] = []; lastTitle = matchTitle; } relations[lastTitle].push({ name: $(e).find('a').text() || null, url: $(e).find('a').attr('href') ? "https://nautiljon.com/" + $(e).find('a').attr('href') : null, relationType: $(e).html().split("</div>")[2]?.trim() || $(e).html().split("</div>")[1]?.trim() || null, imageUrl: $(e).find('img').attr('src') ? "https://nautiljon.com/" + $(e).find('img').attr('src').replace("imagesmin", "images") : null, additionnalInformations: $(e).find('div:not([style])').text() || null }) }) return { name: $('.h1titre > [itemprop="name"]')?.text() || null, japName: $('span').filter(function() { return $(this).text().indexOf('Titre original : ') > -1;}).parent().html()?.split("</span>")[1].trim() || null, alternateName: $('[itemprop="alternateName"]').html() || null, url: url, imageUrl: $('.image_fiche.fleft a img').attr('src') ? `https://nautiljon.com${$('.image_fiche.fleft a img').attr('src').replace('/mini', '')}` : null, score: $('[itemprop="ratingValue"]').text() || null, votersNumber: $('[itemprop="ratingCount"]').text() || null, country: $('.flag').parent().text()?.split(":")[1]?.split("-")[0]?.trim() || null, type: $('span').filter(function() { return $(this).text().indexOf('Type : ') > -1;}).next().text() || null, startDate: $('[itemprop="datePublished"]').text() || null, status: $('span').filter(function() { return $(this).text().indexOf('Nb volumes VO : ') > -1;}).parent().html()?.split("</span>")[1]?.split("(")[1]?.slice(0, -1) || null, volumesNumber: $('span').filter(function() { return $(this).text().indexOf('Nb volumes VO : ') > -1;}).parent().html()?.split("</span>")[1]?.split("(")[0]?.trim() || null, genres: $('[itemprop="genre"]').toArray().map(e => $(e).text()), author: { story: $('li [itemprop="author"] [itemprop="name"]').text() || null, art: $('li [itemprop="illustrator"] [itemprop="name"]').text() || $('li [itemprop="author"] [itemprop="name"]').text() || null }, editor: { VO: $('span').filter(function() { return $(this).text().indexOf('Éditeur VO : ') > -1;}).parent().find('[itemprop="legalName"]').text() || null, VF: $('span').filter(function() { return $(this).text().indexOf('Éditeur VF : ') > -1;}).parent().find('[itemprop="legalName"]').text() || null }, description: $($(".description").html()?.split('<div class="groupe"')[0].trim()).text() || null, relations: relations, news: { french: $('#fiche_news li').toArray().map(e => { return { name: $(e).find('.sim[href]').text()?.split(":").map(e => e.trim())?.slice(1).join(" ") || null, url: $(e).find('.sim[href]') ? "https://nautiljon.com" + $(e).find('.sim[href]').attr('href') : null, date: $(e).find('.sim[href]').text()?.split(":")[0]?.trim() || null, description: $(e).find('.introNews').text() || null, imageUrl: $(e).find('img') ? "https://nautiljon.com" + $(e).find('img').attr('src') : null, }}) } } } }, }