manga-crawl-lib
Version:
A library for scraping manga from various websites.
357 lines (337 loc) • 11.3 kB
text/typescript
/* eslint-disable @typescript-eslint/no-floating-promises */
import puppeteer, { Browser } from 'puppeteer';
import {
AbstractMangaFactory,
chapter,
genre,
image_chapter,
responseChapter,
responseDetailManga,
responseListManga,
} from '../types/type';
import { not_null } from '../utils/validate';
import {
NETTRUYEN_SORT_FILTER,
NETTRUYEN_STATUS_FILTER,
} from '../constants/filter';
export class Blogtruyen implements AbstractMangaFactory {
baseUrl: string;
all_genres: genre[];
browser: Promise<Browser>;
constructor(baseUrl: string) {
this.baseUrl = baseUrl;
this.browser = puppeteer.launch({
headless: 'new',
});
this.all_genres = [] as genre[];
}
async search(keyword: string, page = 1): Promise<responseListManga> {
const _page = await (await this.browser).newPage();
await _page.setRequestInterception(true);
_page.on('request', (req) => {
if (req.resourceType() !== 'document') req.abort();
else req.continue();
});
await _page.goto(
`${this.baseUrl}/timkiem/nangcao/1/0/-1/-1?txt=${keyword}${
page > 1 ? `&page=${page}` : ``
}`
);
const element = await _page.$$(
'#wrapper > section.main-content > div:nth-child(1) > div:nth-child(1) > section > article > section > div.list > p:not(.uppercase)'
);
const totalPage = parseInt(
not_null(
await _page.$eval(
'#wrapper > section.main-content > div:nth-child(1) > div:nth-child(1) > section > article > section > div.row > div > nav > ul > li:last-child > a',
(el) => el.getAttribute('href')
)
).split(`/timkiem/nangcao/1/0/-1/-1?txt=${keyword}&p=`)[1]
);
const data = await Promise.all(
element.map(async (e, i) => {
const href = not_null(
await e.$eval('span:nth-child(1) > a', (el) =>
el.getAttribute('href')
)
);
const title = not_null(
await e.$eval('span:nth-child(1) > a', (el) => el.textContent)
);
return {
_id: i,
title,
image_thumbnail:
'!! Not working at this time | Will be fixed soon !!',
href: this.baseUrl + href,
};
})
);
return {
totalData: element.length,
totalPage,
currentPage: page !== undefined ? page : 1,
canNext: page === totalPage ? false : true,
canPrev: page === 1 ? false : true,
data,
};
}
async getListByGenre(
genre: genre,
page = 1,
status?: NETTRUYEN_STATUS_FILTER,
sort?: NETTRUYEN_SORT_FILTER
): Promise<responseListManga> {
const path = genre.path;
const _page = await (await this.browser).newPage();
await _page.setRequestInterception(true);
_page.on('request', (req) => {
if (req.resourceType() !== 'document') req.abort();
else req.continue();
});
await _page.goto(`${this.baseUrl}${path}`);
const element = await _page.$$(
'#wrapper > section.main-content > div:nth-child(1) > div:nth-child(1) > article > section > div.list > p:not(.uppercase)'
);
let totalPage = parseInt(
not_null(
await _page.$eval(
'#wrapper > section.main-content > div > div:nth-child(1) > article > section > div.paging > span:last-child > a',
(el) => el.getAttribute('href')
)
)
.split('javascript:LoadMangaPage(')[1]
.split(')')[0]
);
return {
totalData: element.length,
totalPage,
currentPage: page !== undefined ? page : 1,
canNext: page === totalPage ? false : true,
canPrev: page === 1 ? false : true,
data: await Promise.all(
element.map(async (e, i) => {
const href = not_null(
await e.$eval('span.tiptip > a', (el) => el.getAttribute('href'))
);
const title = not_null(
await e.$eval('span.tiptip > a', (el) => el.textContent)
);
return {
_id: i,
title,
image_thumbnail:
'!! Not working at this time | Will be fixed soon !!',
href: this.baseUrl + href.trim().slice(1),
};
})
),
};
}
async getDataChapter(
url_chapter: string,
url?: string,
path?: string,
prev_chapter?: chapter,
next_chapter?: chapter
): Promise<responseChapter> {
url = url !== undefined ? url : '';
path = path !== undefined ? path : '';
const _page = await (await this.browser).newPage();
await _page.goto(url_chapter);
const content = await _page.$('#readonline > section');
const title = not_null(
await _page.$eval('#readonline > header > h1', (el) => el.textContent)
);
const images: image_chapter[] = await Promise.all(
(
await content!.$$('#content > img')
).map(async (e, i) => {
const _data_image = await e.evaluate((el) => {
return {
src_origin: el.getAttribute('src'),
src_cdn: 'N|A',
};
});
return {
_id: i,
src_origin: not_null(_data_image.src_origin),
src_cdn: not_null(_data_image.src_cdn),
alt: not_null(title + ' id : ' + i),
};
})
);
const next: chapter = {} as chapter;
if (next_chapter === undefined) {
const next_chapter_get = await _page.$eval(
'#readonline > section > div:first-child > a:first-child ',
(el) => {
return {
url_chapter: el.getAttribute('href'),
};
}
);
next.url = this.baseUrl + not_null(next_chapter_get.url_chapter).trim();
next.parent_href = url_chapter;
next.path = not_null(next_chapter_get.url_chapter);
}
const prev: chapter = {} as chapter;
if (prev_chapter === undefined) {
const prev_chapter_get = await _page.$eval(
'#readonline > section > div:first-child > a:last-child ',
(el) => {
return {
url_chapter: el.getAttribute('href'),
};
}
);
prev.url = this.baseUrl + not_null(prev_chapter_get.url_chapter).trim();
prev.parent_href = url_chapter;
prev.path = not_null(prev_chapter_get.url_chapter);
}
return {
url,
path,
chapter_data: images,
title,
next_chapter: next_chapter === undefined ? next : next_chapter,
prev_chapter: prev_chapter === undefined ? prev : prev_chapter,
};
}
async getDetailManga(url: string): Promise<responseDetailManga> {
const _page = await (await this.browser).newPage();
await _page.setRequestInterception(true);
_page.on('request', (req) => {
if (req.resourceType() !== 'document') req.abort();
else req.continue();
});
await _page.goto(url);
const content = await _page.$(
'#wrapper > section.main-content > div.row > div.col-md-8 > section.manga-detail'
);
const title = await content!.$eval(
'h1.entry-title',
(el) => el.textContent
);
const path = url.substring(`${this.baseUrl}`.length);
const author = await content!.$eval(
'div.description > p:nth-child(2) > a',
(el) => el.textContent
);
const status = await content!.$eval(
'div.description > p:nth-child(5) > span',
(el) => el.textContent
);
const genres: genre[] = await Promise.all(
(
await content!.$$('div.description > p:nth-child(4) > span > a')
).map(async (e) => {
const data = await e.evaluate((el) => {
return {
url: el.getAttribute('href'),
path: el.getAttribute('href'),
name: el.textContent,
};
});
return {
url: this.baseUrl + not_null(data.url).trim(),
path: not_null(data.url).trim().slice(1),
name: not_null(data.name),
};
})
);
const views = await content!.$eval('#PageViews', (el) => el.textContent);
const chapters: chapter[] = await Promise.all(
(
await content!.$$(
'div.list-chapters > #loadChapter > #list-chapters > p'
)
).map(async (e) => {
const chapter_anchor = await e.$eval('span.title > a', (el) => {
const data = {
title: el.textContent,
url: el.getAttribute('href'),
};
return {
title: data.title,
url: data.url,
};
});
const last_update = await e.$eval(
'span.publishedDate',
(el) => el.textContent
);
return {
title: not_null(chapter_anchor.title).trim(),
url: this.baseUrl + not_null(chapter_anchor.url).trim(),
path: not_null(chapter_anchor.url).trim(),
parent_href: url,
last_update: not_null(last_update),
};
})
);
const follows = not_null(
await content!.$eval('#LikeCount', (el) => el.textContent)
);
return {
title: not_null(title).trim(),
path,
author: not_null(author).trim(),
url,
status: not_null(status).trim(),
genres,
views: not_null(views),
follows,
chapters,
};
}
async getListLatestUpdate(page = 1): Promise<responseListManga> {
const _page = await (await this.browser).newPage();
await _page.setRequestInterception(true);
_page.on('request', (req) => {
if (req.resourceType() !== 'document') req.abort();
else req.continue();
});
await _page.goto(`${this.baseUrl}${page > 1 ? `/page-${page}` :``}`);
const element = await _page.$$(
'#wrapper > section.main-content > div > div:nth-child(1) > section.list-mainpage > div:nth-child(1) > div > div'
);
const totalPage = parseInt(
not_null(
await _page.$eval(
'#wrapper > section.main-content > div > div:nth-child(1) > section.list-mainpage > div:nth-child(2) > div > nav > ul > li:last-child > a',
(el) => el.getAttribute('href')
)
).split('/page-')[1]
);
return {
totalData: element.length,
totalPage,
currentPage: page,
canNext: page === totalPage ? false : true,
canPrev: page === 1 ? false : true,
data: await Promise.all(
element.map(async (e, i) => {
const image_thumbnail: string = await e.$eval(
'div.fl-l > a > img',
(el) => el.getAttribute('src')!
);
const link = await e.$eval('div.fl-r > h3 > a', (el) => {
return {
title: el.textContent,
href: el.getAttribute('href'),
};
});
return {
_id: i,
title: not_null(link.title),
href: this.baseUrl + not_null(link.href),
image_thumbnail: image_thumbnail.startsWith('//')
? `https:${image_thumbnail}`
: image_thumbnail,
};
})
),
};
}
}