scrapper-tools
Version:
Modern way to scrape modern websites
357 lines (303 loc) • 10.3 kB
text/typescript
import AsyncLock from "async-lock"
import debug from "debug"
import playwright, {
Browser,
BrowserContext,
BrowserContextOptions,
CDPSession,
ChromiumBrowserContext,
Page,
} from "playwright"
import * as functionsToInject from "./functionToInject"
import { PathLike } from "fs-extra"
import { pageStealth } from "playwright-mini"
// Namespaced `debug` loggers used throughout this module for error and informational output.
let error = debug("scrapper_tools:fastpage:error")
let info = debug("scrapper_tools:fastpage:info")
// Shared AsyncLock; this module acquires it with per-instance keys around browser launch and close.
let lock = new AsyncLock()
/**
 * Proxy settings stored per instance and handed unchanged to the browser
 * launch options as `proxy`.
 *
 * NOTE(review): field meanings are presumably those of Playwright's launch
 * `proxy` option — confirm against the Playwright documentation.
 */
interface BrowserTypeLaunchOptionsProxy {
server: string
bypass?: string
username?: string
password?: string
}
/**
 * Settings and live handles for a single named fastPage instance.
 */
interface ConfigValue {
  /** Browser context that pages are opened from; unset until the browser is launched. */
  browserHandle?: BrowserContext
  /** Browser object backing `browserHandle` when no `userDataDir` is configured. */
  nonPersistantBrowserHandle?: any
  /** Playwright engine to launch. */
  browser: "chromium" | "firefox" | "webkit"
  /** Forwarded as the `proxy` launch option when set. */
  proxy?: BrowserTypeLaunchOptionsProxy
  /** Forwarded as the `headless` launch option. */
  headless: boolean
  /** Forwarded as the `devtools` launch option. */
  devtools: boolean
  /** When set, a persistent context is launched from this directory. */
  userDataDir?: string
  /** Used for Chromium's `--window-size` argument and for the context viewport. */
  windowSize: { width: number; height: number }
  /** Block requests whose resource type is `font`. */
  blockFonts: boolean
  /** Block requests whose resource type is `image`. */
  blockImages: boolean
  /** Block requests whose resource type is `stylesheet`. */
  blockCSS: boolean
  /** Applied to every page as both the navigation timeout and the default timeout. */
  defaultNavigationTimeout: number
  /** Extension paths, comma-joined into Chromium's extension launch arguments. */
  extensions: Array<string>
  /** When true, page errors are written to the error logger. */
  showPageError: boolean
  /** User agent used when creating a browser context. */
  userAgent: string
  /** Extra command-line arguments appended to the launch arguments. */
  args: Array<string>
  /** Registered hooks; entries are `{ name, action }` objects. */
  hooks: any
  /** When true, stealth patches are applied to each new page. */
  enableStealth: boolean
  /** Forwarded as the `downloadsPath` launch option when truthy. */
  downloadDir: any | PathLike
}
/**
 * Baseline settings that named instances are copied from.
 *
 * Array and object values here are shared by reference with any shallow
 * (`{ ...defaultConfig }`) copy, so avoid mutating them in place.
 */
let defaultConfig: ConfigValue = {
// Handles start unset; they are filled in when a browser is launched.
browserHandle: undefined,
browser: "chromium",
nonPersistantBrowserHandle: undefined,
proxy: undefined,
headless: false,
devtools: false,
userDataDir: undefined,
// NOTE(review): 595x842 matches A4 page size in points — presumably intentional; confirm.
windowSize: { width: 595, height: 842 },
blockFonts: false,
blockImages: false,
blockCSS: false,
enableStealth: true,
// 30 * 1000 = 30000; presumably milliseconds (30 s), as Playwright timeouts are — confirm.
defaultNavigationTimeout: 30 * 1000,
extensions: [],
showPageError: false,
userAgent:
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
args: [],
hooks: [],
downloadDir: null,
}
/** Registry of instance configurations, keyed by instance name. */
interface Config {
[name: string]: ConfigValue
}
/**
 * Live registry of instance configurations, seeded with the `default` instance.
 *
 * The nested arrays and the `windowSize` object are copied rather than shared,
 * so that mutating the default instance (for example by pushing an extra launch
 * argument or hook) cannot alter `defaultConfig` itself.
 */
let config: Config = {
  default: {
    ...defaultConfig,
    windowSize: { ...defaultConfig.windowSize },
    extensions: [...defaultConfig.extensions],
    args: [...defaultConfig.args],
    hooks: [...defaultConfig.hooks],
  },
}
/**
 * Runs every registered hook whose `name` matches, in registration order.
 *
 * Each hook's `action` is awaited before the next one starts, so the returned
 * promise settles only once all matching hooks have finished. If an action
 * throws or rejects, the error propagates to the caller and the remaining
 * hooks are not run.
 *
 * @param hooks - List of `{ name, action }` entries; a missing list is treated as empty.
 * @param name - Hook name to dispatch.
 * @param args - Arguments forwarded unchanged to each matching action.
 */
async function loadHooks(hooks: any, name: string, ...args: any[]): Promise<void> {
  const matching = (hooks || []).filter((hook: any) => hook.name === name)
  for (const hook of matching) {
    // An async callback inside `forEach` is fire-and-forget; await each action explicitly.
    await hook.action(...args)
  }
}
/**
 * Returns the browsing handle for `instanceName`, launching the browser on first use.
 *
 * Launching is serialized per instance through the shared lock, so concurrent
 * callers end up reusing one handle. When `userDataDir` is configured a
 * persistent context is launched; otherwise a browser is launched and a fresh
 * context is created on it.
 *
 * NOTE(review): the declared return type is `Browser`, but the value returned
 * is the stored `browserHandle` (a browser context). The signature is kept
 * unchanged for compatibility with existing callers.
 *
 * @param instanceName - Key of the instance in `config`.
 * @returns The cached or newly created handle.
 * @throws If the instance has no configuration, or if launching fails; the
 *   error is written to the error logger and rethrown.
 */
async function browser(instanceName: string): Promise<Browser> {
  return await lock
    .acquire("instance_" + instanceName, async function () {
      const ic = config[instanceName]
      if (!ic) {
        throw new Error(`No configuration found for fastPage instance "${instanceName}"`)
      }
      if (ic.browserHandle) {
        return ic.browserHandle
      }

      let args: Array<string> = [...ic.args]
      if (ic.browser === "chromium") {
        args = args.concat([
          "--no-sandbox",
          "--allow-running-insecure-content",
          "--disable-background-timer-throttling",
          "--disable-backgrounding-occluded-windows",
          "--disable-renderer-backgrounding",
          "--disable-web-security",
          `--window-size=${ic.windowSize.width},${ic.windowSize.height}`,
        ])
        if (ic.extensions.length > 0) {
          const extensionList = ic.extensions.join(",")
          args.push(
            `--disable-extensions-except=${extensionList}`,
            `--load-extension=${extensionList}`
          )
        }
      }

      const launchOption: any = {
        headless: ic.headless,
        args,
        devtools: ic.devtools,
        acceptDownloads: true,
      }
      if (ic.downloadDir) {
        launchOption.downloadsPath = ic.downloadDir
      }
      if (ic.proxy) {
        launchOption.proxy = ic.proxy
      }

      if (ic.userDataDir) {
        // NOTE(review): userAgent, viewport, ignoreHTTPSErrors and bypassCSP are only
        // applied in the non-persistent branch below — confirm whether that is intended.
        ic.browserHandle = await playwright[ic.browser].launchPersistentContext(ic.userDataDir, {
          acceptDownloads: true,
          colorScheme: "dark",
          ...launchOption,
        })
      } else {
        const launched = await playwright[ic.browser].launch(launchOption)
        const contextOption: BrowserContextOptions = {
          ignoreHTTPSErrors: true,
          acceptDownloads: true,
          bypassCSP: true,
          userAgent: ic.userAgent,
          colorScheme: "dark",
          viewport: {
            width: ic.windowSize.width,
            height: ic.windowSize.height,
          },
        }
        ic.nonPersistantBrowserHandle = launched
        try {
          ic.browserHandle = await launched.newContext(contextOption)
        } catch (err) {
          // Context creation failed: close the browser we just started so it is not
          // orphaned (the next call would otherwise launch another one on top of it).
          ic.nonPersistantBrowserHandle = undefined
          // Best-effort cleanup; the original failure is the error worth reporting.
          await launched.close().catch(() => undefined)
          throw err
        }
      }
      return ic.browserHandle
    })
    .catch((err: any) => {
      error("Error on starting new page: Lock Error ->", err)
      throw err
    })
}
/**
 * Applies the instance's page-level settings to a freshly opened page.
 *
 * In order: runs the `make_page_faster` hooks, sets the navigation and default
 * timeouts, opens a CDP session (Chromium only), applies stealth patches when
 * enabled, injects the helper functions, attaches page-error logging when
 * enabled, installs resource blocking when any `block*` flag is set, and
 * finally marks the page lifecycle state as active through the CDP session.
 *
 * @param page - Page to configure.
 * @param instanceName - Key of the instance in `config` whose settings apply.
 * @returns The same page plus the CDP session, or `null` for non-Chromium engines.
 * @throws If the instance has no configuration, or if any setup step fails.
 */
export async function makePageFaster(
  page: Page,
  instanceName: string
): Promise<{ session: CDPSession | null; page: Page }> {
  const instanceConfig: typeof defaultConfig = config[instanceName]
  if (!instanceConfig) {
    throw new Error(`No configuration found for fastPage instance "${instanceName}"`)
  }

  await loadHooks(instanceConfig["hooks"], "make_page_faster", page)

  page.setDefaultNavigationTimeout(instanceConfig.defaultNavigationTimeout)
  page.setDefaultTimeout(instanceConfig.defaultNavigationTimeout)

  let session: null | CDPSession = null
  if (instanceConfig.browser === "chromium") {
    session = await (page.context() as ChromiumBrowserContext).newCDPSession(page)
  }

  if (instanceConfig.enableStealth === true) {
    await pageStealth(page)
  }

  // NOTE(review): a script tag lives in the current document only; if these helpers
  // are needed after navigation, an init script may be the better fit — confirm.
  await page.addScriptTag({
    content: `${functionsToInject.waitForElement} ${functionsToInject.waitForElementToBeRemoved} ${functionsToInject.delay}`,
  })

  if (instanceConfig.showPageError === true) {
    // Register once; a second identical listener would log every error twice.
    page.on("pageerror", (pageerr: any) => {
      error("Page Error occurred: ", pageerr)
    })
  }

  if (instanceConfig.blockCSS || instanceConfig.blockFonts || instanceConfig.blockImages) {
    // Requests can only be aborted or continued through a route handler; the objects
    // delivered by the "request" event have no abort()/continue() methods.
    await page.route("**/*", (route: any) => {
      const resourceType = route.request().resourceType()
      // Flags are read on every request so later config changes still take effect.
      const blocked =
        (instanceConfig.blockImages && resourceType === "image") ||
        (instanceConfig.blockFonts && resourceType === "font") ||
        (instanceConfig.blockCSS && resourceType === "stylesheet")
      const settled = blocked ? route.abort() : route.continue()
      // The page may close while a request is in flight; log instead of leaving
      // an unhandled rejection.
      settled.catch((err: any) => {
        error("Failed to resolve intercepted request: ", err)
      })
    })
  }

  if (session) {
    await session.send("Page.setWebLifecycleState", {
      state: "active",
    })
  }

  return { session, page }
}
/**
 * Builds the control API for one named browser instance.
 *
 * Every method operates on `config[instanceName]`. The instance configuration
 * is created lazily from the current `default` instance the first time a
 * method needs it, so calling `init()` beforehand is optional.
 *
 * @param instanceName - Name of the instance to control; defaults to `"default"`.
 * @returns An object of methods for configuring, opening pages on, and closing the instance.
 */
export function fastPage(instanceName = "default") {
  /**
   * Copies `base` for this instance without sharing mutable state: nested arrays
   * and `windowSize` are cloned, and live browser handles are never inherited
   * from `base` — only this instance's own existing handles are carried over.
   */
  function cloneConfig(base: typeof defaultConfig): typeof defaultConfig {
    const previous = config[instanceName]
    return {
      ...base,
      browserHandle: previous ? previous.browserHandle : undefined,
      nonPersistantBrowserHandle: previous ? previous.nonPersistantBrowserHandle : undefined,
      windowSize: { ...base.windowSize },
      extensions: [...base.extensions],
      args: [...base.args],
      hooks: [...base.hooks],
    }
  }

  /** Returns this instance's config, creating it from the current default when missing. */
  function ensureConfig(): typeof defaultConfig {
    if (!config[instanceName]) {
      info("Fast Page", "Using default config")
      config[instanceName] = cloneConfig(config.default)
    }
    return config[instanceName]
  }

  /**
   * (Re)initializes this instance's settings.
   *
   * @param useCurrentDefaultConfig - When true, copy the current `default`
   *   instance's settings; when false, copy the built-in `defaultConfig`.
   */
  async function init(useCurrentDefaultConfig = true) {
    config[instanceName] = cloneConfig(useCurrentDefaultConfig ? config.default : defaultConfig)
  }

  return {
    init: init,

    /** Returns the instance's browsing handle, launching the browser if needed. */
    getBrowserHandle: async (): Promise<Browser> => {
      ensureConfig()
      return await browser(instanceName)
    },

    /** Opens a new page with the instance's page-level settings applied. */
    newPage: async (): Promise<Page> => {
      info("Fast Page", "Launching new page ")
      ensureConfig()
      const brow = await browser(instanceName)
      const { page } = await makePageFaster(await brow.newPage(), instanceName)
      return page
    },

    /** Like `newPage`, but also returns the CDP session (`null` when none was created). */
    newPage1: async (): Promise<{ session: CDPSession | null; page: Page }> => {
      info("Fast Page", "Launching new page with session ")
      ensureConfig()
      const brow = await browser(instanceName)
      const { page, session } = await makePageFaster(await brow.newPage(), instanceName)
      return { page, session }
    },

    /**
     * Closes the instance's browser, if any, and clears its stored handles.
     * Failures are logged to the console rather than thrown; the promise then
     * resolves to `undefined` instead of `"closed"`.
     */
    closeBrowser: async () => {
      info("Fast Page", "Requesting to close browser ")
      return await lock
        .acquire("instance_close_" + instanceName, async function () {
          const ic = config[instanceName]
          if (!ic) {
            // Never initialized, so there is nothing to close.
            return "closed"
          }
          // Prefer the browser object when present: closing it also disposes the context.
          const handle = ic.nonPersistantBrowserHandle || ic.browserHandle
          try {
            if (handle) {
              await handle.close()
            }
          } finally {
            // Drop the handles even when close() fails so a later call relaunches
            // instead of reusing a half-closed browser.
            ic.browserHandle = undefined
            ic.nonPersistantBrowserHandle = undefined
          }
          return "closed"
        })
        .catch((err: any) => console.log("Error on closing browser: Lock Error ->", err))
    },

    setProxy: (value: BrowserTypeLaunchOptionsProxy) => {
      info("Fast Page", "Setting proxy to ", value)
      ensureConfig().proxy = value
    },
    setDefaultBrowser: (name: "chromium" | "firefox" | "webkit") => {
      ensureConfig().browser = name
    },
    setShowPageError: (value: boolean) => {
      info("Fast Page", "Setting show page error to ", value)
      ensureConfig().showPageError = value
    },
    setHeadless: (value: boolean = false) => {
      info("Fast Page", "Setting headless to ", value)
      ensureConfig().headless = value
    },
    setDevtools: (value: boolean = true) => {
      info("Fast Page", "Setting devtools to ", value)
      ensureConfig().devtools = value
    },
    setUserDataDir: (value: string) => {
      info("Fast Page", "Storing chrome cache in ", value)
      ensureConfig().userDataDir = value
    },
    setUserAgent: (value: string) => {
      info("Fast Page", "Setting user agent in ", value)
      ensureConfig().userAgent = value
    },
    setWindowSizeArg: (value: { width: number; height: number }) => {
      info("Fast Page", "Setting window size to ", value)
      ensureConfig().windowSize = value
    },
    setExtensionsPaths: (value: Array<string>) => {
      ensureConfig().extensions = value
    },
    setStealth: (value: boolean) => {
      ensureConfig().enableStealth = value
    },
    setDefaultNavigationTimeout: (value: number) => {
      info("Fast Page", "Default navigation timeout", value)
      ensureConfig().defaultNavigationTimeout = value
    },
    setDownloadDir: (value: PathLike) => {
      info("Fast Page", "Download directory", value)
      ensureConfig().downloadDir = value
    },
    blockImages: (value: boolean = true) => {
      info("Fast Page", "Block Image", value)
      ensureConfig().blockImages = value
    },
    blockFonts: (value: boolean = true) => {
      info("Fast Page", "Block Font", value)
      ensureConfig().blockFonts = value
    },
    blockCSS: (value: boolean = true) => {
      info("Fast Page", "Block CSS", value)
      ensureConfig().blockCSS = value
    },

    /**
     * Returns the configuration of the named instance, or the whole registry
     * when `null` is passed. Note that the parameter defaults to `"default"`,
     * not to the instance this API object was created for.
     */
    getConfig(instanceName: string = "default") {
      if (instanceName === null) {
        return config
      }
      return config[instanceName]
    },

    /** Registers `action` to run whenever hooks named `name` are dispatched. */
    addHook(name: string, action: Function) {
      ensureConfig().hooks.push({ name, action })
    },

    /** Appends an extra command-line argument used at the next browser launch. */
    addArg(arg: any) {
      ensureConfig().args.push(arg)
    },
  }
}