UNPKG

@letsscrapedata/proxy

Version:

proxy manager used to scrape data

676 lines (669 loc) 27.5 kB
import EventEmitter from 'node:events';
import { LogFunction } from '@letsscrapedata/utils';

/*
 * Default priority constants.
 * NOTE(review): presumably these are the default `priority` values for the
 * corresponding kinds of proxy/package (smaller value = higher priority, see
 * DefaultProxyAttributesInPackage.priority) - confirm against the implementation.
 */
declare const DefaultStaticProxyPriority = 5;
declare const DefaultProxyPackagePriority = 10;
declare const DefaultResidentialProxyPriority = 10;
declare const DefaultIspProxyPriority = 10;
declare const DefaultDatacenterProxyPriority = 10;
declare const DefaultMobileProxyPriority = 10;

type DateTime = number;
type Seconds = number;
type MilliSeconds = number;

type ProxyIpType = "residential" | "isp" | "datacenter" | "mobile";

/**
 * * static: the IP does not change; valid only when proxyIpType is isp or datacenter
 * * rotating: the IP changes for each request or session
 */
type ProxyDurationType = "static" | "rotating";

type ProxySharedType = "dedicated" | "shared";

/**
 * Abbreviation of proxyIpType and proxySharedType, such as "rd" for "residential" and "dedicated".
 */
type ProxyIpSharedType = "rd" | "rs" | "id" | "is" | "dd" | "ds" | "md" | "ms";

/**
 * Billing model of a proxy package.
 *
 * Note: the traffic-based literal is spelled "trafic" in this API.
 */
type BillingModel = "period" | "trafic";

type ProxyProtocal = "HTTP" | "SOCKS";
type WhenApplyProxy = "apply" | "allocate" | "applyAndAllocate";
type ProxyAccountStatus = "normal" | "disabled";
type ProxyPackageStatus = "normal" | "disabled";
type ProxyStatus = "idle" | "busy" | "discarded";
type WhenCheckProxy = "none" | "apply" | "allocate" | "applyAndAllocate";
type StickySession = boolean;

/**
 * proxyAccountId: `${accountType}-${accountName}`
 */
type ProxyAccountId = string;

type PackageName = string;

/**
 * Complete URL of a proxy: [protocol://][username:password]hostname[:port]
 * * The proxy uses the http protocol by default.
 */
type CompleteUrl = string;

/**
 * (Expected) geo location of a proxy.
 * * To simplify the management logic, it is recommended to use only country.
 *   If you really need to specify a city, use city.
 * * Refer to checkGeoLocations().
 */
interface GeoLocation {
    region?: string;
    /**
     * ISO 3166 country code
     */
    country?: string;
    state?: string;
    city?: string;
    asn?: number;
    zip?: string;
    longitude?: number;
    latitude?: number;
}

/**
 * Name of the event whose data is Proxy[], the newly added proxies.
 * * Both ProxyAccountManager and ProxyManager may emit this event.
 * * Listen to this event on the ProxyManager instance if ProxyManager is used;
 *   otherwise listen to it on the ProxyAccountManager instance.
 */
declare const NewProxyEvent = "newproxy";

type MiscInOptions = Record<string, any>;

/**
 * Options used to create a ProxyAccountManager instance.
 * * Different ProxyAccountManagers have different default values.
 * * It is better to define the specific attributes of interface MiscInOptions if misc is used.
 */
interface ProxyAccountManagerOptions {
    accountName?: string;
    description?: string;
    username?: string;
    password?: string;
    apiKey?: string;
    whenCheckProxy?: WhenCheckProxy;
    discardAbnormalProxy?: boolean;
    abnormalLockSeconds?: number;
    refreshIntervalSecs?: number;
    misc?: MiscInOptions;
}

type RefreshOptions = {
    /**
     * Whether to refresh once now
     * @default true
     */
    refreshNow?: boolean;
    /**
     * @default {}
     */
    misc?: MiscInOptions;
};

interface PackageAttributesInAccount {
    /**
     * Type of proxy account, usually the name of the proxy supplier.
     * It is recommended to use lowercase characters, such as "brightdata".
     * * Reminder: use vendorName to represent the real proxy vendor, although in
     *   most cases the two are the same!
     */
    accountType: string;
    /**
     * Only used to identify a proxy account. You can use a name that is easily
     * recognizable to users, or use "accountType-username", but do not include a
     * password or token.
     */
    accountName: string;
}

/**
 * pk: accountType & accountName
 */
interface ProxyAccount extends PackageAttributesInAccount {
    status: ProxyAccountStatus;
    proxyPackageNum: number;
    ipTotal: number;
    ipBalance: number;
    ipIdle: number;
    gbTotal: number;
    gbBalance: number;
    whenCheckProxy: WhenCheckProxy;
    description: string;
    createTime: DateTime;
}

/**
 * All proxies in a proxyPackage have the same values for the following attributes:
 * * proxyIpType
 * * proxyDurationType
 * * proxySharedType
 * * proxyIpSharedType
 * * billingModel
 * * packageName
 * * vendorName
 */
interface ProxyAttributesInPackage {
    proxyIpType: ProxyIpType;
    proxyDurationType: ProxyDurationType;
    proxySharedType: ProxySharedType;
    proxyIpSharedType: ProxyIpSharedType;
    billingModel: BillingModel;
    packageName: PackageName;
    /**
     * vendorName is usually the same as accountType and is used:
     * * when one proxy account manages proxies of multiple vendors
     * * to support specific features of specified proxies, such as residential proxy (session) from brightdata
     * @default accountType
     */
    vendorName: string;
}

/**
 * All proxies in a proxyPackage generally share, but may vary in, the following attributes:
 * * maxSessionDuraion (the field name is spelled this way in this API)
 * * priority
 * * maxUsersPerIp
 * * maxConcurrencyPerUser
 * * bandwidth
 */
interface DefaultProxyAttributesInPackage {
    /**
     * Max validity period of a session, in seconds, such as 900 seconds (15 minutes):
     * * only used for rotating proxy
     * * 0 means that sessions are not supported and a different IP address is used for each request
     * @default 0
     */
    maxSessionDuraion: Seconds;
    /**
     * Priority of the proxy; the smaller the value, the higher the priority.
     * * It is recommended to use the default value, so that the task scheduler can
     *   determine the priority based on the different attributes of the proxy.
     * @default 10
     */
    priority: number;
    /**
     * * Max real users per (static or dynamic) proxy in this package,
     * * or how many users can share this proxy.
     * * Mainly used for dedicated proxy with billingModel "period", or shared proxy
     *   gateway with billingModel "trafic".
     * * Suggestions:
     *   * dedicated/period: residential(4~8), ISP(2~4), datacenter(1~2), other(1)
     *   * shared proxy gateway (rotating): depends on traffic (GB) and expected number of concurrent users
     *   * others: 1
     * @default 1
     */
    maxUsersPerIp: number;
    /**
     * Max concurrency (browser pages or api requests) per user.
     * * Mainly used for proxy with billingModel "period".
     * * Max concurrency per proxy is determined by `maxUsersPerIp * maxConcurrencyPerUser` or by bandwidth.
     * @default 10 (period), or 1 (trafic)
     */
    maxConcurrencyPerUser: number;
    /**
     * Proxy's network bandwidth in kbps
     * @default 0, which means unknown
     */
    bandwidth: number;
}

interface ProxyPackageOptions extends ProxyAttributesInPackage, DefaultProxyAttributesInPackage {}

interface ProxyPackageBasic
    extends PackageAttributesInAccount,
        ProxyAttributesInPackage,
        DefaultProxyAttributesInPackage {
    status: ProxyPackageStatus;
    /**
     * Max validity period of a proxy, in seconds, such as 900 seconds (15 minutes):
     * * mainly used for static proxy
     */
    maxProxyDuration: number;
    geoLocations: GeoLocation[];
    /**
     * Expire time of this package.
     *
     * NOTE(review): the comment originally attached to this field described
     * priority instead ("The smaller the value, the higher the priority; static
     * proxies (valid now and billingModel is period) should have the highest
     * priority; @default 10") and looks misplaced. The unit is presumably UNIX
     * seconds, as for BasicProxy.expireTime - confirm.
     */
    expireTime: DateTime;
    /**
     * Number of total proxies in this package (billingModel: "period")
     */
    ipTotal: number;
    /**
     * Number of unallocated or allocated/idle proxies in this package (billingModel: "period")
     */
    ipBalance: number;
    /**
     * Number of allocated and idle proxies in this package (billingModel: "period")
     */
    ipIdle: number;
    /**
     * Total traffic (GB) in this package (billingModel: "trafic")
     */
    gbTotal: number;
    /**
     * Remaining traffic (GB) in this package (billingModel: "trafic")
     */
    gbBalance: number;
    whenApplyProxy: WhenApplyProxy;
    description: string;
}

type ProxyAccountIdentifier = PackageAttributesInAccount;
type ProxyPackageIdentifier = Pick<ProxyPackageBasic, "accountType" | "accountName" | "packageName">;

/**
 * Used to manage proxies internally.
 */
interface ProxyPackageInfo extends ProxyPackageBasic {
    proxyInfos: ProxyInfo[];
}

/**
 * Used by users of this library.
 */
interface ProxyPackage extends ProxyPackageBasic {
    proxyNum: number;
}

interface SimpleProxy {
    /**
     * Id of the proxy:
     * * string; all proxies from different vendors should have a unique proxyId !!!
     * * refer to function parseCompleteUrl
     * * It's best that different proxyIds mean different actual IPs, especially
     *   when proxyDurationType is not "rotating" !!!
     * * It can be the same as proxyUrl if proxyUrl meets the above conditions.
     * * When proxyUrl is "", proxyId is preferably an external network IP, or none.
     * * proxyId should not include the password.
     */
    proxyId: string;
    /**
     * * The proxyUrl includes the proxy protocol, host or IP, and port, but does
     *   not include the username and password.
     * * When proxyUrl is "", no proxy will be set for browser and API requests.
     */
    proxyUrl: string;
    host?: string;
    /**
     * 0 means default port, such as 80 for http
     */
    port?: number;
    username?: string;
    password?: string;
}

interface BasicProxy extends SimpleProxy {
    /**
     * Expiry UNIX time (seconds) of the proxy
     * * 0 means never expires
     */
    expireTime: number;
    geoLocation?: GeoLocation;
    /**
     * Creation UNIX time (seconds) of the proxy
     */
    createTime: number;
    /**
     * Whether the proxy can be freed for later reuse.
     * * If freeable is false, the proxy will be deleted after being allocated by calling getProxies.
     */
    freeable: boolean;
}

/**
 * Used to manage proxies internally.
 */
interface ProxyInfo extends BasicProxy, DefaultProxyAttributesInPackage {
    /**
     * If status is "busy", freeable must be true.
     */
    status: ProxyStatus;
    /**
     * Last abnormal UNIX time (seconds) of the proxy
     * * valid only for static proxy
     * * set by calling lockProxy (don't lock a proxy if it is used to access many different websites)
     */
    lastAbnormalTime: number;
}

/**
 * Used by users of this library.
 */
interface Proxy
    extends PackageAttributesInAccount,
        ProxyAttributesInPackage,
        DefaultProxyAttributesInPackage,
        BasicProxy {
    expireTime: number;
}

interface ProxyRequirements {
    /**
     * length===0: any proxyIpType/proxySharedType; length>0: any one of the listed ProxyIpSharedType values
     * @default []
     */
    proxyIpSharedTypes?: ProxyIpSharedType[];
    /**
     * The minimum effective time of the proxyUrl
     * * <=0: ignored, or no requirements
     * * >0: Proxy.expireTime - currentTime >= minProxyValidSeconds
     * @default 0
     */
    minProxyValidSeconds?: number;
    /**
     * The minimum effective time of the actual proxy IP
     * * <=0: ignored, or no requirements
     * * >0: Proxy.proxyDurationType!=="rotating" && Proxy.expireTime - currentTime >= minIpValidSeconds
     * @default 0
     */
    minIpValidSeconds?: number;
    /**
     * It's better to specify minProxyValidSeconds [ and minIpValidSeconds ] instead of proxyDurationTypes.
     */
    proxyDurationTypes?: ProxyDurationType[];
    /**
     * length===0: any billingModel; length>0: any one of billingModels
     * @default []
     */
    billingModels?: BillingModel[];
    /**
     * length===0: any geoLocation; length>0: any one of geoLocations
     * @default []
     */
    geoLocations?: GeoLocation[];
    /**
     * length===0: any vendorName; length>0: any one of vendorNames
     * @default []
     */
    vendorNames?: string[];
}

/**
 * Proxy account manager manages proxies that an account has purchased from a proxy vendor. Basic concepts:
 * * ProxyAccount: an account usually manages the proxies provided by a proxy vendor.
 *   A proxyAccount may purchase 0 or many proxyPackages. A joint proxy account
 *   manages proxies provided by many vendors.
 * * ProxyPackage: a package that you buy from a proxy vendor. Each proxy package
 *   usually contains many proxies of the same type, which will expire later.
 * * Proxy: a network proxy that can be used to scrape data
 * * NewProxyEvent: event emitted when new proxies are added
 */
interface ProxyAccountManager extends EventEmitter {
    /*
     * (Comment without a following declaration in this file.)
     * Each type of proxyAccountManager decides which parameters are required or optional:
     * * the constructor should throw an error if the parameters are invalid.
     */

    /**
     * Set new options of this manager; each type of proxyAccountManager decides which options can be updated.
     * @param options
     */
    setOptions(options: ProxyAccountManagerOptions): boolean;
    /**
     * Start to refresh proxies:
     * * refresh once immediately if options.refreshNow is true
     * * refresh periodically if refreshIntervalSecs of ProxyAccountManager is greater than 0
     * @param options
     */
    startRefresh(options: RefreshOptions): Promise<boolean>;
    /**
     * Stop refreshing periodically.
     */
    stopRefresh(): boolean;
    /**
     * Get proxy packages that meet the conditions.
     * * used only by ProxyManager
     * @param proxyRequirements
     */
    _getProxyPackages(proxyRequirements: ProxyRequirements): ProxyPackageInfo[];
    /**
     * Get proxies, which meet the conditions, from a package.
     * * used only by ProxyManager; users should use getProxies()
     * @param proxyRequirements
     * @param proxyNum default 1
     * @param onlyApplied default false, whether to get proxies only from applied proxies
     * @param onlyApply default false; false - apply and allocate, true - apply and not allocate
     */
    _getProxiesFromPackage(
        proxyPackageInfo: ProxyPackageInfo,
        proxyRequirements: ProxyRequirements,
        proxyNum?: number,
        onlyApplied?: boolean,
        onlyApply?: boolean
    ): Promise<Proxy[]>;

    /*
     * (Comment without a following declaration in this file.)
     * Get the applied proxies that meet proxyRequirements
     * @param proxyRequirements
     * @param proxyNum default 0, <=0: all, >0: the number of proxies to get
     * @param onlyApply default false; false - apply and allocate, true - apply and not allocate
     */

    /**
     * Get proxies that meet the conditions, can be used now, and have the highest
     * priority (and the minimum expireTime if same priority).
     * * The number of proxies returned may be less than proxyNum.
     * * Returns [] if there are no required proxies.
     * * Throws an error if there is an exception.
     * @param proxyRequirements
     * @param proxyNum default 1, the number of proxies to get
     * @param onlyApplied default false, whether to get proxies only from applied proxies
     */
    getProxies(proxyRequirements: ProxyRequirements, proxyNum?: number, onlyApplied?: boolean): Promise<Proxy[]>;
    /**
     * Discard a proxy that is expired or invalid. This proxy will not be used again.
     * @param proxy
     */
    discardProxy(proxy: Proxy): boolean;
    /**
     * Free a busy proxy for later use.
     * @param proxy
     */
    freeProxy(proxy: Proxy): boolean;
    /**
     * Lock a proxy that cannot be used now and can be used again later (usually lock **static** proxy):
     * * Free this proxy if it is busy (in use)
     * * Lock this proxy by updating lastAbnormalTime of this proxy (status of proxy is "idle")
     * * Please don't lock a proxy if it is used to access many different websites
     * @param proxy
     */
    lockProxy(proxy: Proxy): boolean;
    /**
     * Optional method, launched manually and used by some types of
     * proxyAccountManager; for example, GeneralAccount uses this to add a new
     * package or add new proxies.
     * * It's best to refresh periodically in the startRefresh method.
     * * Use this method only when it is required.
     * @param opType
     * @param data
     */
    update(opType?: string, data?: object): Promise<boolean>;
    /**
     * Adjust the priority of packages
     * @param priority
     * @param packageNames
     */
    adjustPriorityOfPackages(priority: number, packageNames: PackageName[]): boolean;
    /**
     * Enable packages
     * @param packageNames
     */
    enablePackages(packageNames: PackageName[]): boolean;
    /**
     * Disable packages
     * @param packageNames
     */
    disablePackages(packageNames: PackageName[]): boolean;
    /**
     * Enable this proxyAccount
     */
    enable(): boolean;
    /**
     * Disable this proxyAccount
     */
    disable(): boolean;
    proxyAccountId(): ProxyAccountId;
    status(): ProxyAccountStatus;
    proxyAccount(): ProxyAccount;
    proxyPackages(): ProxyPackage[];
    proxies(): Proxy[];
    /**
     * Close this proxy account manager (proxyAccount); it cannot be used again.
     */
    close(): Promise<boolean>;
}

/**
 * Proxy manager manages proxies that many accounts have purchased from different proxy vendors. Basic concepts:
 * * ProxyAccount: an account manages the proxies provided by a proxy vendor.
 *   A proxyAccount may purchase 0 or many proxyPackages.
 * * ProxyPackage: a package that you buy from a proxy vendor. Each proxy package
 *   usually contains many proxies of the same type, which will expire later.
 * * Proxy: a network proxy that can be used to scrape data
 */
interface ProxyManager$1 extends EventEmitter {
    /**
     * Add a proxyAccount
     * @param proxyAccount
     */
    addProxyAccount(proxyAccount: ProxyAccountManager): boolean;

    /*
     * (Comment without a following declaration in this file.)
     * Get the applied proxies that meet proxyRequirements
     * @param proxyRequirements
     * @param proxyNum default 0, <=0: all, >0: the number of proxies to get
     * @param onlyApply default false; false - apply and allocate, true - apply and not allocate
     */

    /**
     * Get proxies that meet the conditions, can be used now, and have the highest
     * priority (and the minimum expireTime if same priority).
     * * The number of proxies returned may be less than proxyNum.
     * * Returns [] if there are no required proxies.
     * * Throws an error if there is an exception.
     * @param proxyRequirements
     * @param proxyNum default 1, the number of proxies to get
     * @param onlyApplied default false, whether to get proxies only from applied proxies
     */
    getProxies(proxyRequirements: ProxyRequirements, proxyNum?: number, onlyApplied?: boolean): Promise<Proxy[]>;
    /**
     * Discard a proxy that is expired (static/dynamic/rotating) or invalid (dynamic).
     * This proxy will not be used again.
     * @param proxy
     */
    discardProxy(proxy: Proxy): boolean;
    /**
     * Free a busy proxy for later use (usually free **static** proxy)
     * @param proxy
     */
    freeProxy(proxy: Proxy): boolean;
    /**
     * Lock a proxy that cannot be used now and can be used again later (usually lock **static** proxy):
     * * Free this proxy if it is busy (in use)
     * * Lock this proxy
     * * Update lastAbnormalTime of this proxy (currentTime)
     * * This proxy will be unlocked automatically later by proxyManager
     * @param proxy
     */
    lockProxy(proxy: Proxy): boolean;
    /**
     * Adjust the priority of packages
     * @param priority
     * @param proxyPackageIdentifiers
     */
    adjustPriorityOfPackages(priority: number, proxyPackageIdentifiers: ProxyPackageIdentifier[]): boolean;
    /**
     * Enable packages
     * @param proxyPackageIdentifiers
     */
    enablePackages(proxyPackageIdentifiers: ProxyPackageIdentifier[]): boolean;
    /**
     * Disable packages
     * @param proxyPackageIdentifiers
     */
    disablePackages(proxyPackageIdentifiers: ProxyPackageIdentifier[]): boolean;
    /**
     * Enable proxyAccounts
     * @param proxyAccountIdentifiers
     */
    enableProxyAccounts(proxyAccountIdentifiers: ProxyAccountIdentifier[]): boolean;
    /**
     * Disable proxyAccounts
     * @param proxyAccountIdentifiers
     */
    disableProxyAccounts(proxyAccountIdentifiers: ProxyAccountIdentifier[]): boolean;
    proxyAccountManager(accountType: string, accountName: string): ProxyAccountManager | undefined;
    proxyAccounts(): ProxyAccount[];
    proxyPackages(): ProxyPackage[];
    proxies(): Proxy[];
    /**
     * Close this proxy manager; it cannot be used again.
     */
    close(): Promise<boolean>;
}

declare function setProxyLogFun(logFun: LogFunction): boolean;

/**
 * Class declared as implementing the ProxyManager$1 interface; see that
 * interface for the documentation of each method.
 */
declare class ProxyManager extends EventEmitter implements ProxyManager$1 {
    #private;
    constructor();
    addProxyAccount(proxyAccount: ProxyAccountManager): boolean;
    getProxies(proxyRequirements: ProxyRequirements, proxyNum?: number, onlyApplied?: boolean): Promise<Proxy[]>;
    discardProxy(proxy: Proxy): boolean;
    freeProxy(proxy: Proxy): boolean;
    lockProxy(proxy: Proxy): boolean;
    adjustPriorityOfPackages(priority: number, proxyPackageIdentifiers: ProxyPackageIdentifier[]): boolean;
    enablePackages(proxyPackageIdentifiers: ProxyPackageIdentifier[]): boolean;
    disablePackages(proxyPackageIdentifiers: ProxyPackageIdentifier[]): boolean;
    enableProxyAccounts(proxyAccountIdentifiers: ProxyAccountIdentifier[]): boolean;
    disableProxyAccounts(proxyAccountIdentifiers: ProxyAccountIdentifier[]): boolean;
    proxyAccountManager(accountType: string, accountName: string): ProxyAccountManager | undefined;
    proxyAccounts(): ProxyAccount[];
    proxyPackages(): ProxyPackage[];
    proxies(): Proxy[];
    close(): Promise<boolean>;
}

interface GeneralAccountOptions {
    /**
     * @default `general-${GeneralAccount.nextAccountIdx}`
     */
    accountName?: string;
    /**
     * @default ""
     */
    description?: string;
    /**
     * @default "applyAndAllocate"
     */
    whenCheckProxy?: WhenCheckProxy;
    /**
     * @default false
     */
    discardAbnormalProxy?: boolean;
    /**
     * @default 3600
     */
    abnormalLockSeconds?: number;
    /**
     * Refresh interval for new proxies (seconds):
     * * To disable refreshing proxies regularly, set this value to 0
     * * Call startRefresh to start regular refresh
     * @default 60
     */
    refreshIntervalSecs?: number;
}

type GeneralOpType = "addProxy" | "addPackage";

interface AddProxyData {
    packageName: string;
    completeUrls: CompleteUrl[];
}

interface GeneralPackageOptions
    extends Omit<ProxyAttributesInPackage, "packageName" | "proxyIpSharedType">,
        Partial<DefaultProxyAttributesInPackage>,
        Partial<Pick<ProxyPackageBasic, "maxProxyDuration" | "geoLocations" | "description">> {
    completeUrls?: CompleteUrl[];
    /**
     * File bound to the package for adding new proxies; per the GeneralAccount
     * description, each non-blank line is the CompleteUrl of a proxy.
     */
    filename?: string;
    /**
     * URL bound to the package for adding new proxies; per the GeneralAccount
     * description, it may include "${packageName}" and is requested with "GET".
     */
    requestUrl?: string;
}

/**
 * GeneralAccount is used to manage proxies that meet the following conditions:
 * * All packages in a GeneralAccount have a unique packageName.
 * * The packageName is always proxyIpType-proxyDurationType-proxySharedType-billingModel-vendorName.
 * * All proxies in a package have the same maxSessionDuration/priority/maxUsersPerIp/maxConcurrencyPerUser/bandwidth.
 *   (You can create many GeneralAccounts that have the same packageName and different maxSessionDuration etc.)
 * * All proxies are freeable and their geolocation is unknown (or undefined).
 * * Each packageName can be bound to a filename or requestUrl to add new proxies regularly.
 * * Each non-blank line in the file represents the complete URL (aka CompleteUrl) of a proxy.
 *   * If the file content does not change, it is considered that there is no new proxy.
 * * The requestUrl can include "${packageName}", which will be replaced with packageName:
 *   * The method is always "GET".
 *   * The json data in the response must include completeUrls (CompleteUrl[]),
 *     such as { completeUrls: ["http://username:password@proxyIp:port"] }.
 * * How to add a new proxy:
 *   * Update the content of filename if filename is valid.
 *   * Send requestUrl to get new proxies if requestUrl is valid.
 *   * Call the update function. NOTE(review): the original comment said opType
 *     "add", but GeneralOpType only allows "addProxy" | "addPackage"; presumably
 *     "addProxy" is meant - confirm.
 * * Ignored if the new proxy already exists in this GeneralAccount.
 * * Ignored if the CompleteUrl is invalid.
 */
declare class GeneralAccount extends EventEmitter implements ProxyAccountManager {
    #private;
    /**
     * Manage general proxies; default option members:
     * * accountName = `${accountType}-${nextAccountIdx}`
     * * description = "",
     * * whenCheckProxy = "applyAndAllocate",
     * * discardAbnormalProxy = false,
     * * abnormalLockSeconds = 3600,
     * * refreshIntervalSecs = 60
     * @param options
     */
    constructor(options?: GeneralAccountOptions);
    proxyAccountId(): string;
    setOptions(options: GeneralAccountOptions): boolean;
    startRefresh(options?: RefreshOptions): Promise<boolean>;
    stopRefresh(): boolean;
    _getProxyPackages(proxyRequirements: ProxyRequirements): ProxyPackageInfo[];
    // The last parameter is spelled `onlyAppy` here; it is in the position of
    // `onlyApply` in ProxyAccountManager._getProxiesFromPackage.
    _getProxiesFromPackage(
        proxyPackageInfo: ProxyPackageInfo,
        proxyRequirements: ProxyRequirements,
        proxyNum?: number,
        onlyApplied?: boolean,
        onlyAppy?: boolean
    ): Promise<Proxy[]>;
    getProxies(proxyRequirements: ProxyRequirements, proxyNum?: number, onlyApplied?: boolean): Promise<Proxy[]>;
    discardProxy(proxy: Proxy): boolean;
    freeProxy(proxy: Proxy): boolean;
    lockProxy(proxy: Proxy): boolean;
    update(opType: GeneralOpType, data: GeneralPackageOptions | AddProxyData): Promise<boolean>;
    adjustPriorityOfPackages(priority: number, packageNames?: string[] | null): boolean;
    enablePackages(packageNames?: string[] | null): boolean;
    disablePackages(packageNames?: string[] | null): boolean;
    enable(): boolean;
    disable(): boolean;
    status(): ProxyAccountStatus;
    proxyAccount(): ProxyAccount;
    proxyPackages(): ProxyPackage[];
    proxies(): Proxy[];
    close(): Promise<boolean>;
}

declare function checkProxy(proxyUrl: string): Promise<boolean>;
declare function checkGeoLocations(requiredGls: GeoLocation[], glsInPpi: GeoLocation[]): boolean;
declare function getDefaultMaxUsersPerIp(
    proxyIpType: ProxyIpType,
    proxySharedType: ProxySharedType,
    billingModel: BillingModel
): number;
declare function getDefaultMaxConcurrencyPerUser(
    proxyIpType: ProxyIpType,
    proxySharedType: ProxySharedType,
    billingModel: BillingModel
): number;
declare function doesProxyMeetRequirements(proxy: Proxy, proxyRequirements: ProxyRequirements): boolean;

export {
    type AddProxyData,
    type BasicProxy,
    type BillingModel,
    type CompleteUrl,
    type DateTime,
    DefaultDatacenterProxyPriority,
    DefaultIspProxyPriority,
    DefaultMobileProxyPriority,
    type DefaultProxyAttributesInPackage,
    DefaultProxyPackagePriority,
    DefaultResidentialProxyPriority,
    DefaultStaticProxyPriority,
    GeneralAccount,
    type GeneralAccountOptions,
    type GeneralPackageOptions,
    type GeoLocation,
    type MilliSeconds,
    type MiscInOptions,
    NewProxyEvent,
    type PackageAttributesInAccount,
    type PackageName,
    type Proxy,
    type ProxyAccount,
    type ProxyAccountId,
    type ProxyAccountIdentifier,
    type ProxyAccountManager,
    type ProxyAccountManagerOptions,
    type ProxyAccountStatus,
    type ProxyAttributesInPackage,
    type ProxyDurationType,
    type ProxyInfo,
    type ProxyIpSharedType,
    type ProxyIpType,
    ProxyManager,
    type ProxyPackage,
    type ProxyPackageBasic,
    type ProxyPackageIdentifier,
    type ProxyPackageInfo,
    type ProxyPackageOptions,
    type ProxyPackageStatus,
    type ProxyProtocal,
    type ProxyRequirements,
    type ProxySharedType,
    type ProxyStatus,
    type RefreshOptions,
    type Seconds,
    type SimpleProxy,
    type StickySession,
    type WhenApplyProxy,
    type WhenCheckProxy,
    checkGeoLocations,
    checkProxy,
    doesProxyMeetRequirements,
    getDefaultMaxConcurrencyPerUser,
    getDefaultMaxUsersPerIp,
    setProxyLogFun,
};