/*
 * manga-crawl-lib
 * A library for scraping manga from various websites.
 * (Listing metadata: 495 lines, 456 loc, 15 kB, text/typescript.)
 */
/* eslint-disable @typescript-eslint/no-floating-promises */
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import puppeteer, { Browser } from 'puppeteer';
import {
AbstractMangaFactory,
chapter,
genre,
image_chapter,
responseChapter,
responseDetailManga,
responseListManga,
} from '../types/type';
import { not_null } from '../utils/validate';
import {
NETTRUYEN_SORT_FILTER,
NETTRUYEN_STATUS_FILTER,
} from '../constants/filter';
/** Puppeteer page type, derived from `Browser` so no additional import is required. */
type DocumentPage = Awaited<ReturnType<Browser['newPage']>>;

/**
 * CSS selectors describing one listing layout. `item` and `pager` are resolved
 * against the page; `href`, `title` and `thumbnail` against each item.
 */
type ListSelectors = Record<
  'item' | 'pager' | 'href' | 'title' | 'thumbnail',
  string
>;

/**
 * Scraper that reads manga listings, details and chapter images from the site
 * at `baseUrl` through a single Puppeteer browser launched in the constructor.
 *
 * Every public method opens its own page, blocks all non-document requests,
 * scrapes the DOM and closes the page again before returning.
 */
export class Nettruyen implements AbstractMangaFactory {
  baseUrl: string;
  // Initialised empty; nothing in this class populates it.
  all_genres: genre[];
  browser: Promise<Browser>;

  /** Layout of the search and genre listing pages. */
  private static readonly FILTERED_LIST: ListSelectors = {
    item: '#ctl00_divCenter > div.Module.Module-170 > div > div.items > div > div.item > figure',
    pager: '#ctl00_mainContent_ctl01_divPager',
    href: 'div.image > a',
    title: 'figcaption > h3 > a',
    thumbnail: 'div.image > a > img',
  };

  /** Layout of the home ("latest update") listing page. */
  private static readonly LATEST_LIST: ListSelectors = {
    item: '#ctl00_divCenter > div > div > div.items > div.row > div.item',
    pager: '#ctl00_mainContent_ctl00_divPager',
    href: 'figure > figcaption > h3 > a',
    title: 'figure > figcaption > h3 > a',
    thumbnail: '.image > a > img',
  };

  /**
   * @param baseUrl Site origin that every relative path is appended to.
   *   The browser launch is started here but not awaited; each method awaits
   *   the stored promise before using it.
   */
  constructor(baseUrl: string) {
    this.baseUrl = baseUrl;
    this.browser = puppeteer.launch({
      headless: 'new',
    });
    this.all_genres = [];
  }

  /**
   * Closes the shared browser. Optional for callers; the instance cannot be
   * used for scraping afterwards.
   */
  async close(): Promise<void> {
    await (await this.browser).close();
  }

  /**
   * Searches the site for `keyword`.
   *
   * @param keyword Raw (not yet URL-encoded) search text; it is encoded here.
   * @param page 1-based result page; the `page` query parameter is only sent
   *   when it is greater than 1.
   * @returns The items found on that page plus pager information.
   *   `totalData` counts the items on this page only, and `totalPage` is 0
   *   when the page has no pager element.
   */
  async search(keyword: string, page = 1): Promise<responseListManga> {
    // Encode the keyword so characters such as `&`, `#` or `+` cannot alter the query string.
    const query = `keyword=${encodeURIComponent(keyword)}${
      page > 1 ? `&page=${page}` : ''
    }`;
    return this.withDocumentPage<responseListManga>(
      `${this.baseUrl}/tim-truyen?${query}`,
      (doc) => this.scrapeList(doc, page, Nettruyen.FILTERED_LIST)
    );
  }

  /**
   * Lists the manga of one genre.
   *
   * @param genre Genre whose `path` is appended to `baseUrl`.
   * @param page Optional result page; reported as 1 in the response when omitted.
   * @param status Optional status filter; when only `sort` is given, `-1` is sent.
   * @param sort Optional sort filter.
   * @returns The items found on that page plus pager information
   *   (`totalPage` is 0 when the page has no pager element).
   */
  async getListByGenre(
    genre: genre,
    page?: number,
    status?: NETTRUYEN_STATUS_FILTER,
    sort?: NETTRUYEN_SORT_FILTER
  ): Promise<responseListManga> {
    const params: string[] = [];
    if (sort !== undefined) {
      params.push(`sort=${sort}`, `status=${status ?? '-1'}`);
    } else if (status !== undefined) {
      params.push(`status=${status}`);
    }
    if (page !== undefined) {
      params.push(`page=${page}`);
    }
    const query = params.length > 0 ? `?${params.join('&')}` : '';
    return this.withDocumentPage<responseListManga>(
      `${this.baseUrl}${genre.path}${query}`,
      (doc) => this.scrapeList(doc, page ?? 1, Nettruyen.FILTERED_LIST)
    );
  }

  /**
   * Reads the title and image list of one chapter.
   *
   * @param url_chapter Absolute URL of the chapter page to load.
   * @param url Echoed back as `url` in the response and used as `parent_href`
   *   of scraped neighbour chapters; defaults to `''`.
   * @param path Echoed back as `path` in the response; defaults to `''`.
   * @param prev_chapter When given, returned as-is instead of scraping the
   *   "previous" navigation link.
   * @param next_chapter When given, returned as-is instead of scraping the
   *   "next" navigation link.
   * @returns Chapter data; a scraped neighbour is `null` when its link href is `#`.
   * @throws Error when the reader container is missing from the page.
   */
  async getDataChapter(
    url_chapter: string,
    url?: string,
    path?: string,
    prev_chapter?: chapter,
    next_chapter?: chapter
  ): Promise<responseChapter> {
    const parentUrl = url ?? '';
    const parentPath = path ?? '';
    return this.withDocumentPage<responseChapter>(url_chapter, async (doc) => {
      const reader = await doc.$(
        '#ctl00_divCenter > div > div.reading-detail.box_doc'
      );
      if (reader === null) {
        throw new Error(`Chapter reader container not found at ${url_chapter}`);
      }
      const title = not_null(
        await doc.$eval(
          '#ctl00_divCenter > div > div:nth-child(1) > div.top > h1',
          (el) => el.textContent
        )
      );
      const imageHandles = await reader.$$('div.page-chapter > img');
      const chapter_data: image_chapter[] = await Promise.all(
        imageHandles.map(async (img, index) => {
          const attrs = await img.evaluate((el) => ({
            src_origin: el.getAttribute('data-original'),
            src_cdn: el.getAttribute('data-cdn'),
            alt: el.getAttribute('alt'),
          }));
          const origin = not_null(attrs.src_origin);
          const cdn = not_null(attrs.src_cdn);
          const alt = not_null(attrs.alt);
          return {
            _id: index,
            src_origin: Nettruyen.absolutize(origin),
            // An empty data-cdn attribute means "no CDN copy": omit the key entirely.
            ...(cdn !== '' ? { src_cdn: Nettruyen.absolutize(cdn) } : {}),
            alt,
          };
        })
      );
      const prev =
        prev_chapter !== undefined
          ? prev_chapter
          : await this.readSiblingChapter(
              doc,
              '#chapterNav > a.prev.a_prev',
              parentUrl
            );
      const next =
        next_chapter !== undefined
          ? next_chapter
          : await this.readSiblingChapter(
              doc,
              '#chapterNav > a.next.a_next',
              parentUrl
            );
      return {
        url: parentUrl,
        path: parentPath,
        chapter_data,
        title,
        next_chapter: next,
        prev_chapter: prev,
      };
    });
  }

  /**
   * Reads the detail page of one manga: metadata, genres and chapter list.
   *
   * @param url Absolute URL of the manga page; `path` in the response is this
   *   URL with the first `baseUrl.length` characters removed.
   * @throws Error when the main content container is missing from the page.
   */
  async getDetailManga(url: string): Promise<responseDetailManga> {
    return this.withDocumentPage<responseDetailManga>(url, async (doc) => {
      const content = await doc.$('#ctl00_divCenter');
      if (content === null) {
        throw new Error(`Manga detail container not found at ${url}`);
      }
      // Shared prefix of most selectors inside the info column.
      const info =
        '#item-detail > div.detail-info > div > div.col-xs-8.col-info';
      const textOf = (selector: string): Promise<string | null> =>
        content.$eval(selector, (el) => el.textContent);

      const title = await textOf('article > h1');
      const description =
        (await textOf('#item-detail > div.detail-content > p')) ?? '';
      const thumbnail =
        (await content.$eval(
          '#item-detail > div.detail-info > div > div.col-xs-4.col-image > img',
          (el) => el.getAttribute('src')
        )) ?? '';
      const author = await textOf(`${info} > ul > li.author.row > p.col-xs-8`);
      const status = await textOf(`${info} > ul > li.status.row > p.col-xs-8`);

      const genreLinks = await content.$$(
        `${info} > ul > li.kind.row > p.col-xs-8 > a`
      );
      const genres: genre[] = await Promise.all(
        genreLinks.map(async (link) => {
          const raw = await link.evaluate((el) => ({
            href: el.getAttribute('href'),
            name: el.textContent,
          }));
          const href = not_null(raw.href);
          return {
            url: href,
            path: href.substring(this.baseUrl.length),
            name: not_null(raw.name),
          };
        })
      );

      const views = await textOf(
        '#item-detail > div.detail-info > div.row > div.col-xs-8.col-info > ul > li:last-child > p.col-xs-8'
      );

      const chapterRows = await content.$$('.list-chapter > nav > ul > li');
      const chapters: chapter[] = await Promise.all(
        chapterRows.map(async (row) => {
          const anchor = await row.$eval('.col-xs-5.chapter > a', (el) => ({
            title: el.textContent,
            url: el.getAttribute('href'),
          }));
          const lastUpdate = await row.$eval(
            '.col-xs-4.text-center.no-wrap.small',
            (el) => el.textContent
          );
          const chapterViews = await row.$eval(
            '.col-xs-3.text-center.small',
            (el) => el.textContent
          );
          const chapterUrl = not_null(anchor.url);
          return {
            title: not_null(anchor.title),
            url: chapterUrl,
            path: chapterUrl.substring(this.baseUrl.length),
            parent_href: url,
            last_update: not_null(lastUpdate),
            views: not_null(chapterViews),
          };
        })
      );

      const rate = not_null(
        await textOf(`${info} > div.mrt5.mrb10 > span > span:nth-child(1)`)
      );
      const rate_number = not_null(
        await textOf(`${info} > div.mrt5.mrb10 > span > span:nth-child(3)`)
      );
      const follows = not_null(
        await textOf(`${info} > div.follow > span > b`)
      );

      return {
        title: not_null(title),
        path: url.substring(this.baseUrl.length),
        author: not_null(author),
        url,
        status: not_null(status),
        genres,
        views: not_null(views),
        rate,
        rate_number,
        follows,
        chapters,
        description,
        thumbnail: Nettruyen.absolutize(thumbnail),
      };
    });
  }

  /**
   * Lists the manga shown on the site's home page.
   *
   * @param page 1-based page; the `page` query parameter is only sent when it
   *   is greater than 1.
   * @returns The items found on that page plus pager information
   *   (`totalPage` is 0 when the page has no pager element).
   */
  async getListLatestUpdate(page = 1): Promise<responseListManga> {
    const target = `${this.baseUrl}${page > 1 ? `/?page=${page}` : ''}`;
    return this.withDocumentPage<responseListManga>(target, (doc) =>
      this.scrapeList(doc, page, Nettruyen.LATEST_LIST)
    );
  }

  /** Prefixes protocol-relative URLs (`//host/...`) with `https:`; other values pass through. */
  private static absolutize(src: string): string {
    return src.startsWith('//') ? `https:${src}` : src;
  }

  /**
   * Opens a fresh page that only loads `document` requests, navigates to
   * `url`, runs `scrape` and always closes the page afterwards.
   */
  private async withDocumentPage<T>(
    url: string,
    scrape: (page: DocumentPage) => Promise<T>
  ): Promise<T> {
    const page = await (await this.browser).newPage();
    try {
      await page.setRequestInterception(true);
      page.on('request', (req) => {
        const settled =
          req.resourceType() === 'document' ? req.continue() : req.abort();
        // A request may still be pending when the page is closed; its
        // rejection carries no useful information for the scrape.
        settled.catch(() => undefined);
      });
      await page.goto(url);
      return await scrape(page);
    } finally {
      // Best-effort cleanup: a failing close must not replace the scrape
      // result or the error that is already propagating.
      await page.close().catch(() => undefined);
    }
  }

  /**
   * Reads pager state from `pagerSelector`. When the pager element is absent
   * the listing is treated as un-paged: no next/prev and `totalPage` 0.
   */
  private async readPager(
    page: DocumentPage,
    pagerSelector: string,
    currentPage: number
  ): Promise<{ canNext: boolean; canPrev: boolean; totalPage: number }> {
    const exists = async (selector: string): Promise<boolean> =>
      (await page.$(selector)) !== null;

    if (!(await exists(pagerSelector))) {
      return { canNext: false, canPrev: false, totalPage: 0 };
    }
    const canNext = await exists(`${pagerSelector} > ul > li > a.next-page`);
    const canPrev = await exists(`${pagerSelector} > ul > li > a.prev-page`);
    const lastHref = not_null(
      await page.$eval(`${pagerSelector} > ul > li:last-child > a`, (el) =>
        el.getAttribute('href')
      )
    );
    const parsed = parseInt(lastHref.split('page=')[1] ?? '', 10);
    // NOTE(review): when the last pager link carries no page number we report
    // the current page instead of NaN — this assumes such a link only occurs
    // on the final page; confirm against the site's markup.
    const totalPage = Number.isNaN(parsed) ? currentPage : parsed;
    return { canNext, canPrev, totalPage };
  }

  /** Scrapes every listing item on `page` and combines it with the pager state. */
  private async scrapeList(
    page: DocumentPage,
    currentPage: number,
    selectors: ListSelectors
  ): Promise<responseListManga> {
    const items = await page.$$(selectors.item);
    const pager = await this.readPager(page, selectors.pager, currentPage);
    const data = await Promise.all(
      items.map(async (item, index) => {
        const href = not_null(
          await item.$eval(selectors.href, (el) => el.getAttribute('href'))
        );
        const title = not_null(
          await item.$eval(selectors.title, (el) => el.textContent)
        );
        const thumbnail = not_null(
          await item.$eval(selectors.thumbnail, (el) =>
            el.getAttribute('data-original')
          )
        );
        return {
          _id: index,
          title,
          image_thumbnail: Nettruyen.absolutize(thumbnail),
          href,
        };
      })
    );
    return {
      totalData: items.length,
      totalPage: pager.totalPage,
      currentPage,
      canNext: pager.canNext,
      canPrev: pager.canPrev,
      data,
    };
  }

  /**
   * Reads the href of a chapter navigation link and turns it into a partial
   * `chapter`, or `null` when the href is the `#` placeholder.
   */
  private async readSiblingChapter(
    page: DocumentPage,
    selector: string,
    parentUrl: string
  ): Promise<chapter | null> {
    const href = not_null(
      await page.$eval(selector, (el) => el.getAttribute('href'))
    );
    if (href === '#') {
      return null;
    }
    // Only these three fields are known here; the remaining `chapter` fields stay unset.
    const sibling = {} as chapter;
    sibling.url = href;
    sibling.parent_href = parentUrl;
    // NOTE(review): `path` is derived from the parent URL, not from `href`,
    // unlike the chapter paths built in getDetailManga — kept as-is; confirm intent.
    sibling.path = parentUrl.substring(this.baseUrl.length);
    return sibling;
  }
}