UNPKG

user-agents

Version:

A JavaScript library for generating random user agents.

154 lines (131 loc) 4.9 kB
import fs from 'fs';
import { argv } from 'process';
import { fileURLToPath } from 'url';
import { gzipSync } from 'zlib';

import dynamoose from 'dynamoose';
import { Item } from 'dynamoose/dist/Item.js';
import stableStringify from 'fast-json-stable-stringify';
import isbot from 'isbot';
import random from 'random';
import UAParser from 'ua-parser-js';

import { UserAgentData } from './user-agent';

// Point dynamoose at a DynamoDB client pinned to the region used by this script.
const ddb = new dynamoose.aws.ddb.DynamoDB({
  region: 'us-east-2',
});
dynamoose.aws.ddb.set(ddb);

// Model for the submissions table. `create: false` / `update: false` mean the
// table is never created or altered by this script.
const SubmissionModel = dynamoose.model<
  {
    ip: string;
    profile: { [key: string]: unknown };
  } & UserAgentData &
    Item
>(
  'userAgentsAnalyticsSubmissionTable',
  new dynamoose.Schema(
    {
      id: {
        type: String,
        hashKey: true,
      },
      ip: String,
      profile: Object,
    },
    {
      // Profiles are free-form, so keep every nested property under `profile`.
      saveUnknown: ['profile.**'],
      timestamps: { createdAt: 'timestamp', updatedAt: undefined },
    },
  ),
  { create: false, update: false },
);

/**
 * Builds the weighted user agent table from the submissions of the last day.
 *
 * Submissions are de-duplicated per IP address, filtered for bots and
 * malformed user agent strings, counted per distinct profile, perturbed with
 * random noise, sorted by descending weight, truncated to `limit` entries and
 * finally normalized so that the returned weights sum to one.
 *
 * @param limit - Maximum number of profiles to keep after sorting by weight.
 * @returns The retained profiles, each carrying its normalized `weight` and a
 *   derived `deviceCategory`.
 */
const getUserAgentTable = async (limit = 1e4) => {
  const minimumTimestamp = Date.now() - 1 * 24 * 60 * 60 * 1000;

  // Hoisted so the patterns are not rebuilt for every submission.
  const surroundingQuotes = /^"|"$/g;
  const outsidePrintableAscii = /[^\x20-\x7E]/;

  // Scan through all recent profiles keeping track of the count of each.
  let lastKey = null;
  const countsByProfile: { [stringifiedProfile: string]: number } = {};
  const ipAddressAlreadySeen: { [ipAddress: string]: boolean } = {};
  do {
    const scan = SubmissionModel.scan(
      new dynamoose.Condition().filter('timestamp').gt(minimumTimestamp),
    );
    if (lastKey) {
      // Resume from where the previous page of results stopped.
      scan.startAt(lastKey);
    }

    const response = await scan.exec();
    response.forEach(({ ip, profile }) => {
      // Only count one profile per IP address.
      if (ipAddressAlreadySeen[ip]) return;
      ipAddressAlreadySeen[ip] = true;

      // Skip submissions without a string user agent. Without this guard the
      // `.replace()` call below throws and aborts the whole table build.
      const rawUserAgent = profile.userAgent;
      if (typeof rawUserAgent !== 'string') return;

      // Filter out bots like Googlebot and YandexBot.
      if (isbot(rawUserAgent)) return;

      // Strip extra quotes that some browsers include around the user agent string.
      const userAgent = rawUserAgent.replace(surroundingQuotes, '');
      profile.userAgent = userAgent;

      // Filter out spam/fake user agents with characters outside printable ASCII.
      // Real browser UA strings are always pure ASCII per the HTTP spec.
      if (outsidePrintableAscii.test(userAgent)) return;

      // Filter out the literal `{userAgent}` placeholder string that some clients send.
      if (userAgent === '{userAgent}') return;

      // Track the counts for this exact profile. The stable stringification
      // makes key order irrelevant when comparing profiles.
      const stringifiedProfile = stableStringify(profile);
      countsByProfile[stringifiedProfile] = (countsByProfile[stringifiedProfile] ?? 0) + 1;
    });

    lastKey = response.lastKey;
  } while (lastKey);

  // Add some noise to the counts/weights: each count is replaced by half the
  // sum of `2 * count` squared standard-normal draws, whose expected value is
  // the original count.
  const sampleStandardNormal = random.normal();
  for (const [stringifiedProfile, count] of Object.entries(countsByProfile)) {
    let sumOfSquares = 0;
    for (let draw = 0; draw < 2 * count; draw += 1) {
      sumOfSquares += sampleStandardNormal() ** 2;
    }
    countsByProfile[stringifiedProfile] = sumOfSquares / 2;
  }

  // Accumulate the profiles and add/remove a few properties to match the historical format.
  const profiles: UserAgentData[] = [];
  for (const [stringifiedProfile, weight] of Object.entries(countsByProfile)) {
    const profile = JSON.parse(stringifiedProfile);
    profile.weight = weight;
    delete profile.sessionId;

    // Find the device category.
    const device = new UAParser(profile.userAgent).getDevice();
    // Sketchy, but I validated this on historical data and it is a 100% match.
    // Anything that is not explicitly a mobile or a tablet counts as desktop.
    profile.deviceCategory =
      device.type === 'mobile' || device.type === 'tablet' ? device.type : 'desktop';

    profiles.push(profile);
    // Drop the stringified copy as soon as it has been parsed.
    delete countsByProfile[stringifiedProfile];
  }

  // Sort by descending weight.
  profiles.sort((a, b) => b.weight - a.weight);

  // Apply the count limit and normalize the weights.
  profiles.splice(limit);
  const totalWeight = profiles.reduce((total, profile) => total + profile.weight, 0);
  profiles.forEach((profile) => {
    profile.weight /= totalWeight;
  });

  return profiles;
};

// Command-line entry point: only runs when this module is the script being executed.
if (fileURLToPath(import.meta.url) === argv[1]) {
  const filename = argv[2];
  if (!filename) {
    throw new Error('An output filename must be passed as an argument to the command.');
  }
  getUserAgentTable()
    .then((userAgents) => {
      const stringifiedUserAgents = JSON.stringify(userAgents, null, 2);
      // Compress the content if the extension ends with `.gz`.
      const content = filename.endsWith('.gz')
        ? gzipSync(stringifiedUserAgents)
        : stringifiedUserAgents;
      fs.writeFileSync(filename, content);
    })
    .catch((error) => {
      console.error(error);
      process.exit(1);
    });
}

export default getUserAgentTable;