UNPKG

third-party-web

Version:

Categorized data on third party entities on the web.

152 lines (130 loc) 4.58 kB
/** Captures the text between `://` and the next `/` (or end of input), minus a trailing `:port`. */
const DOMAIN_IN_URL_REGEX = /:\/\/(\S*?)(:\d+)?(\/|$)/;
/** A dot-separated host name, or the literal `localhost`. */
const DOMAIN_CHARACTERS = /(?:[a-z0-9.-]+\.[a-z0-9]+|localhost)/i;
/** Four dot-separated groups of 1-3 digits (octet ranges are not validated). */
const IP_REGEX = /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/;
/** The last two labels, or the last three when the suffix looks like `co.uk`, `com.au`, `gov.uk`, `ne.jp`, ... */
const ROOT_DOMAIN_REGEX = /[^.]+\.([^.]+|(gov|com|co|ne)\.\w{2})$/i;
/** A bare URL scheme (RFC 3986 section 3.1); used to check what precedes the first `://`. */
const URL_SCHEME_REGEX = /^[a-z][a-z0-9+.-]*$/i;

/**
 * Pulls the host name out of an origin, URL, or bare domain string.
 *
 * When the input starts with `scheme://`, only the captured host is searched for
 * domain characters. Searching the whole URL would let a path segment or file name
 * (e.g. `content.js` in `chrome-extension://<id>/content.js`) be reported as the
 * domain whenever the host itself contains no dot.
 *
 * @param {string} originOrURL
 * @return {string|null} The domain; may be an empty string when the URL has an empty host.
 */
function extractDomain(originOrURL) {
  const urlMatch = originOrURL.match(DOMAIN_IN_URL_REGEX);
  // A `://` that is not preceded by a bare scheme sits inside a path or query
  // string (e.g. `example.com/?u=http://other.org/`), so the whole input is searched.
  const startsWithScheme =
    urlMatch !== null && URL_SCHEME_REGEX.test(originOrURL.slice(0, urlMatch.index));
  const haystack = startsWithScheme ? urlMatch[1] : originOrURL;

  const domainMatch = haystack.match(DOMAIN_CHARACTERS);
  if (domainMatch) return domainMatch[0];

  // No dot-separated name found: fall back to the raw captured host, if any.
  return urlMatch ? urlMatch[1] : null;
}

/**
 * @param {string} originOrURL
 * @return {[string|null, string|null]} - The first item is the root domain, the second item is the domain.
 */
function parseDomains(originOrURL) {
  if (typeof originOrURL !== 'string') return [null, null];
  if (originOrURL.length > 10000 || originOrURL.startsWith('data:')) return [null, null];

  const domain = extractDomain(originOrURL);
  if (!domain) return [null, null];
  // An IPv4-looking address has no meaningful root domain; use it for both slots.
  if (IP_REGEX.test(domain)) return [domain, domain];

  const rootMatch = domain.match(ROOT_DOMAIN_REGEX);
  const rootDomain = (rootMatch && rootMatch[0]) || domain;
  return [rootDomain, domain];
}

/**
 * @param {string} originOrURL
 * @return {string|null} The root domain of the input, or null when no domain can be parsed.
 */
function getRootDomain(originOrURL) {
  return parseDomains(originOrURL)[0];
}

/**
 * Drops the left-most label of `domain`, unless it is already no longer than `rootDomain`.
 *
 * @param {string} domain
 * @param {string} rootDomain
 * @return {string}
 */
function sliceSubdomainFromDomain(domain, rootDomain) {
  if (domain.length <= rootDomain.length) return domain;
  return domain
    .split('.')
    .slice(1)
    .join('.');
}

/**
 * Looks up the entity for a URL: exact domain first, then each progressively
 * shorter subdomain against the wildcard-subdomain map, then the root domain.
 *
 * @param {Map<string, object>} entityByDomain
 * @param {Map<string, object>} entityBySubDomain
 * @param {Map<string, object>} entityByRootDomain
 * @param {string} originOrURL
 * @return {object|undefined}
 */
function getEntityInDataset(entityByDomain, entityBySubDomain, entityByRootDomain, originOrURL) {
  const [rootDomain, domain] = parseDomains(originOrURL);
  if (!domain || !rootDomain) return undefined;
  if (entityByDomain.has(domain)) return entityByDomain.get(domain);

  for (
    let subdomain = domain;
    subdomain.length > rootDomain.length;
    subdomain = sliceSubdomainFromDomain(subdomain, rootDomain)
  ) {
    if (entityBySubDomain.has(subdomain)) return entityBySubDomain.get(subdomain);
  }

  if (entityByRootDomain.has(rootDomain)) return entityByRootDomain.get(rootDomain);
  return undefined;
}

/**
 * Finds the first product of the URL's entity whose URL pattern matches:
 * RegExp patterns are tested, string patterns are matched as substrings.
 *
 * @param {Map<string, object>} entityByDomain
 * @param {Map<string, object>} entityBySubDomain
 * @param {Map<string, object>} entityByRootDomain
 * @param {string} originOrURL
 * @return {object|undefined}
 */
function getProductInDataset(entityByDomain, entityBySubDomain, entityByRootDomain, originOrURL) {
  const entity = getEntityInDataset(
    entityByDomain,
    entityBySubDomain,
    entityByRootDomain,
    originOrURL
  );

  const products = entity && entity.products;
  if (!products) return undefined;
  if (typeof originOrURL !== 'string') return undefined;

  for (const product of products) {
    for (const pattern of product.urlPatterns) {
      if (pattern instanceof RegExp && pattern.test(originOrURL)) return product;
      if (typeof pattern === 'string' && originOrURL.includes(pattern)) return product;
    }
  }

  return undefined;
}

/**
 * Shallow-copies every entity, filling in `company`/`categories` defaults and
 * normalizing its products. String URL patterns prefixed with `REGEXP:` are
 * compiled to RegExp objects; all other patterns (including values that are
 * already RegExp objects) are kept as they are.
 *
 * @param {object[]} entities
 * @return {object[]}
 */
function cloneEntities(entities) {
  return entities.map(entity_ => {
    const entity = {
      company: entity_.name,
      categories: [entity_.category],
      ...entity_,
    };

    const products = (entity_.products || []).map(product => ({
      company: entity.company,
      category: entity.category,
      categories: [entity.category],
      facades: [],
      ...product,
      urlPatterns: (product.urlPatterns || []).map(s =>
        // Only strings can carry the `REGEXP:` marker; calling startsWith on a
        // RegExp entry would throw.
        typeof s === 'string' && s.startsWith('REGEXP:')
          ? new RegExp(s.slice('REGEXP:'.length))
          : s
      ),
    }));

    entity.products = products;
    return entity;
  });
}

/**
 * Builds the lookup API for a dataset of entities.
 *
 * @param {object[]} entities_ Entities, each with `name`, `category` and `domains`.
 *   The input objects are copied, not mutated.
 * @return {{getEntity: function(string): (object|undefined), getProduct: function(string): (object|undefined), getRootDomain: function(string): (string|null), entities: object[]}}
 * @throws {Error} When the same domain is listed more than once in the dataset.
 */
function createAPIFromDataset(entities_) {
  const entities = cloneEntities(entities_);
  const entityByDomain = new Map();
  const entityByRootDomain = new Map();
  const entityBySubDomain = new Map();

  for (const entity of entities) {
    entity.totalExecutionTime = Number(entity.totalExecutionTime) || 0;
    entity.totalOccurrences = Number(entity.totalOccurrences) || 0;
    // NaN (0 / 0) for entities without any recorded occurrences.
    entity.averageExecutionTime = entity.totalExecutionTime / entity.totalOccurrences;

    for (const domain of entity.domains) {
      if (entityByDomain.has(domain)) {
        const duplicate = entityByDomain.get(domain);
        throw new Error(`Duplicate domain ${domain} (${entity.name} and ${duplicate.name})`);
      }

      entityByDomain.set(domain, entity);

      const rootDomain = getRootDomain(domain);
      if (domain.startsWith('*.')) {
        // `*.root.tld` covers the whole root domain; `*.sub.root.tld` covers
        // only that subdomain and everything below it.
        const wildcardDomain = domain.slice(2);
        if (wildcardDomain === rootDomain) entityByRootDomain.set(rootDomain, entity);
        else entityBySubDomain.set(wildcardDomain, entity);
      }
    }
  }

  const getEntity = getEntityInDataset.bind(
    null,
    entityByDomain,
    entityBySubDomain,
    entityByRootDomain
  );
  const getProduct = getProductInDataset.bind(
    null,
    entityByDomain,
    entityBySubDomain,
    entityByRootDomain
  );
  return {getEntity, getProduct, getRootDomain, entities};
}

module.exports = {createAPIFromDataset};