manga-crawl-lib
Version:
A library for scraping manga from various websites.
90 lines (82 loc) • 2.28 kB
text/typescript
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import { CheerioAPI } from 'cheerio';
import { not_null } from '../utils/validate';
import { Page } from 'puppeteer';
/**
 * Options accepted by `useGetDataItemsManga`.
 *
 * Exactly one scraping backend is used per call: `cheerioApi` when it is
 * defined, otherwise `puppeteer`.
 */
interface MangaDataParams {
/** Loaded Cheerio document. When defined it is used and `puppeteer` is ignored. */
cheerioApi?: CheerioAPI;
/** Puppeteer page, queried only when `cheerioApi` is undefined; must be supplied in that case. */
puppeteer?: Page;
/** Selector matching each item wrapper; one result entry is produced per match. */
wrapSelector: string;
/** Selector, resolved inside a wrapper, of the element whose text becomes the title. */
titleSelector: string;
/** Selector, resolved inside a wrapper, of the element carrying the thumbnail. */
thumbnailSelector: string;
/** Name of the attribute on the thumbnail element that is read as the image URL. */
thumbnailAttr: string;
/** Selector, resolved inside a wrapper, of the element whose `href` attribute is read. */
hrefSelector: string;
}
/** Trims a scraped title and strips every line break left over from the markup. */
const normalizeTitle = (raw: string): string => raw.trim().replace(/\n/g, '');

/** Gives protocol-relative URLs (`//host/path`) an explicit `https:` scheme. */
const normalizeThumbnailUrl = (url: string): string =>
  url.startsWith('//') ? `https:${url}` : url;

/**
 * Extracts the manga entries found under `wrapSelector`, one result per
 * matching wrapper element.
 *
 * The Cheerio document is used when `cheerioApi` is provided; otherwise the
 * Puppeteer page is queried. Both backends apply the same normalization, so
 * the same markup yields the same output: titles are trimmed and stripped of
 * line breaks, and protocol-relative thumbnail URLs are prefixed with
 * `https:`.
 *
 * @param params - Scraping backend plus the selectors describing one item.
 * @returns The extracted items; `_id` is the wrapper's index in match order.
 * @throws Error when neither `cheerioApi` nor `puppeteer` is supplied.
 * @throws Whatever `not_null` raises when an attribute or text value is
 *   missing (helper lives in `../utils/validate`; presumably it throws on
 *   null/undefined — confirm there), and Puppeteer's own error when a
 *   selector matches nothing inside a wrapper.
 */
export const useGetDataItemsManga = async (
  params: MangaDataParams
): Promise<
  {
    _id: number;
    title: string;
    image_thumbnail: string;
    href: string;
  }[]
> => {
  const {
    cheerioApi,
    puppeteer,
    wrapSelector,
    titleSelector,
    thumbnailSelector,
    thumbnailAttr,
    hrefSelector,
  } = params;

  if (cheerioApi !== undefined) {
    return cheerioApi(wrapSelector)
      .toArray()
      .map((element, index) => {
        const item = cheerioApi(element);
        return {
          _id: index,
          title: normalizeTitle(item.find(titleSelector).text()),
          image_thumbnail: normalizeThumbnailUrl(
            not_null(item.find(thumbnailSelector).attr(thumbnailAttr))
          ),
          href: not_null(item.find(hrefSelector).attr('href')),
        };
      });
  }

  // Fail with a clear message instead of dereferencing `undefined`.
  if (puppeteer === undefined) {
    throw new Error(
      'useGetDataItemsManga: either `cheerioApi` or `puppeteer` must be provided'
    );
  }

  const wrapItems = await puppeteer.$$(wrapSelector);
  return Promise.all(
    wrapItems.map(async (item, index) => {
      // The three lookups are independent browser round-trips, so run them
      // concurrently. `$eval` rejects when the selector matches nothing, and
      // the null attribute/text cases are funnelled through `not_null` below
      // rather than being asserted away.
      const [thumbnail, href, title] = await Promise.all([
        item.$eval(
          thumbnailSelector,
          (el, attr) => el.getAttribute(attr),
          thumbnailAttr
        ),
        item.$eval(hrefSelector, (el) => el.getAttribute('href')),
        item.$eval(titleSelector, (el) => el.textContent),
      ]);

      return {
        _id: index,
        title: normalizeTitle(not_null(title)),
        href: not_null(href),
        image_thumbnail: normalizeThumbnailUrl(not_null(thumbnail)),
      };
    })
  );
};