diff --git a/packages/generator-networks-creator/src/generator-networks-creator.ts b/packages/generator-networks-creator/src/generator-networks-creator.ts index b245cd5b..b54e9764 100755 --- a/packages/generator-networks-creator/src/generator-networks-creator.ts +++ b/packages/generator-networks-creator/src/generator-networks-creator.ts @@ -1,8 +1,9 @@ import fs from 'fs'; -import path from 'path'; +import path, { parse } from 'path'; import { BayesianNetwork } from 'generative-bayesian-network'; import fetch from 'node-fetch'; +import { UAParser } from 'ua-parser-js'; const browserHttpNodeName = '*BROWSER_HTTP'; const httpVersionNodeName = '*HTTP_VERSION'; @@ -20,42 +21,119 @@ const nonGeneratedNodes = [ const STRINGIFIED_PREFIX = '*STRINGIFIED*'; -const PLUGIN_CHARACTERISTICS_ATTRIBUTES = [ - 'plugins', - 'mimeTypes', -]; - -async function prepareRecords(records: Record[], preprocessingType: string) : Promise[]> { - const cleanedRecords = records - .filter(( - { - requestFingerprint: { headers }, - browserFingerprint, - }) => { - return (headers['user-agent'] ?? headers['User-Agent']) === browserFingerprint.userAgent; - }) - .filter( - ({ - browserFingerprint: { - screen: { width, height }, - userAgent, - }, - }) => ((width >= 1280 && width > height) || (width < height && /phone|android|mobile/i.test(userAgent))), - ) - .map((record) => ({ ...record, userAgent: record.browserFingerprint.userAgent } as any)); +const PLUGIN_CHARACTERISTICS_ATTRIBUTES = ['plugins', 'mimeTypes']; + +async function prepareRecords( + records: Record[], + preprocessingType: string +): Promise[]> { + const cleanedRecords = []; + + for (const record of records) { + const { + requestFingerprint: { headers }, + browserFingerprint: fingerprint, + } = record; + + // The webdriver attribute should not be truthy + if (fingerprint.webdriver) continue; + + const validPluginAndMime = + 'plugins' in fingerprint && + 'mimeTypes' in fingerprint && + fingerprint.plugins.length > 0 && + fingerprint.mimeTypes.length > 0; + + // The plugins and mimeTypes should be present and non-empty + if (!validPluginAndMime) continue; + + const validUserAgent = + fingerprint.userAgent === + (headers['user-agent'] ?? headers['User-Agent']); + + // The userAgent should match the one in the headers + if (!validUserAgent) continue; + + const validUserAgentData = + !('userAgentData' in fingerprint) || + ('brands' in fingerprint.userAgentData && + 'mobile' in fingerprint.userAgentData && + 'platform' in fingerprint.userAgentData && + fingerprint.userAgentData.brands.length === 3); + + // The userAgentData should have the correct structure + if (!validUserAgentData) continue; + + const validLanguage = + fingerprint.language && + 'languages' in fingerprint && + fingerprint.languages.length > 0 && + fingerprint.language === fingerprint.languages[0]; + + // The language should be the first in the list + if (!validLanguage) continue; + + const parsedUserAgent = await UAParser( + fingerprint.userAgent, + headers + ).withClientHints(); + + const validBrowser = + parsedUserAgent.browser.name && + [ + 'Edge', + 'Chrome', + 'Chrome Mobile', + 'Firefox', + 'Safari', + 'Safari Mobile', + ].includes(parsedUserAgent.browser.name); + + // The browser should be one of the supported ones + if (!validBrowser) continue; + + const desktopFingerprint = + parsedUserAgent.device.type === undefined || + !['wearable', 'mobile'].includes(parsedUserAgent.device.type); + + const validDeviceType = + parsedUserAgent.device.type === 'mobile' || + parsedUserAgent.device.type === 'tablet' || + desktopFingerprint; + + // The device type should be mobile, tablet or desktop + if (!validDeviceType) continue; + + const validTouchSupport = + desktopFingerprint || fingerprint.userAgentData?.mobile !== true + ? fingerprint.maxTouchPoints === 0 + : fingerprint.maxTouchPoints > 0; + + // The maxTouchPoints should be 0 for desktops and > 0 for mobile devices + if (!validTouchSupport) continue; + + cleanedRecords.push({ + ...record, + userAgent: record.browserFingerprint.userAgent, + } as any); + } // TODO this could break if the list is not there anymore // The robots list is available under the MIT license, for details see https://github.com/atmire/COUNTER-Robots/blob/master/LICENSE - const robotUserAgents = await fetch('https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json') - .then(async (res) => res.json()) as {pattern: string}[]; + const robotUserAgents = (await fetch( + 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json' + ).then(async (res) => res.json())) as { pattern: string }[]; const deconstructedRecords = []; const userAgents = new Set(); for (let x = 0; x < cleanedRecords.length; x++) { let record = cleanedRecords[x]; const { userAgent } = record as { userAgent: string }; - let useRecord = !userAgent.match(/(bot|bots|slurp|spider|crawler|crawl)\b/i) - && !robotUserAgents.some((robot) => userAgent.match(new RegExp(robot.pattern, 'i'))); + let useRecord = + !userAgent.match(/(bot|bots|slurp|spider|crawler|crawl)\b/i) && + !robotUserAgents.some((robot) => + userAgent.match(new RegExp(robot.pattern, 'i')) + ); if (useRecord) { if (preprocessingType === 'headers') { @@ -70,7 +148,9 @@ async function prepareRecords(records: Record[], preprocessingType: } } - if (useRecord) { deconstructedRecords.push(record); } else { + if (useRecord) { + deconstructedRecords.push(record); + } else { userAgents.add(userAgent); } } @@ -97,8 +177,12 @@ async function prepareRecords(records: Record[], preprocessingType: return reorganizedRecords; } + export class GeneratorNetworksCreator { - private getDeviceOS(userAgent: string) : { device: string; operatingSystem: string } { + private getDeviceOS(userAgent: string): { + device: string; + operatingSystem: string; + } { let operatingSystem = missingValueDatasetToken; if (/windows/i.test(userAgent)) { operatingSystem = 'windows'; @@ -120,7 +204,9 @@ export class GeneratorNetworksCreator { return { device, operatingSystem }; } - private getBrowserNameVersion(userAgent: string) : `${string}/${string}` | typeof missingValueDatasetToken { + private getBrowserNameVersion( + userAgent: string + ): `${string}/${string}` | typeof missingValueDatasetToken { const canonicalNames = { chrome: 'chrome', crios: 'chrome', @@ -133,10 +219,12 @@ export class GeneratorNetworksCreator { edgios: 'edge', } as Record; - const unsupportedBrowsers = /opr|yabrowser|SamsungBrowser|UCBrowser|vivaldi/i; + const unsupportedBrowsers = + /opr|yabrowser|SamsungBrowser|UCBrowser|vivaldi/i; const edge = /(edg(a|ios|e)?)\/([0-9.]*)/i; const safari = /Version\/([\d.]+)( Mobile\/[a-z0-9]+)? Safari/i; - const supportedBrowsers = /(firefox|fxios|chrome|crios|safari)\/([0-9.]*)/i; + const supportedBrowsers = + /(firefox|fxios|chrome|crios|safari)\/([0-9.]*)/i; if (unsupportedBrowsers.test(userAgent)) { return missingValueDatasetToken; @@ -160,25 +248,52 @@ export class GeneratorNetworksCreator { return missingValueDatasetToken; } - async prepareHeaderGeneratorFiles(datasetPath: string, resultsPath: string) { + async prepareHeaderGeneratorFiles( + datasetPath: string, + resultsPath: string + ) { const datasetText = fs.readFileSync(datasetPath, { encoding: 'utf8' }); - const records = await prepareRecords(JSON.parse(datasetText), 'headers'); - - const inputGeneratorNetwork = new BayesianNetwork({ path: path.join(__dirname, 'network_structures', 'input-network-structure.zip') }); - const headerGeneratorNetwork = new BayesianNetwork({ path: path.join(__dirname, 'network_structures', 'header-network-structure.zip') }); + const records = await prepareRecords( + JSON.parse(datasetText), + 'headers' + ); + + const inputGeneratorNetwork = new BayesianNetwork({ + path: path.join( + __dirname, + 'network_structures', + 'input-network-structure.zip' + ), + }); + const headerGeneratorNetwork = new BayesianNetwork({ + path: path.join( + __dirname, + 'network_structures', + 'header-network-structure.zip' + ), + }); // eslint-disable-next-line dot-notation - const desiredHeaderAttributes = Object.keys(headerGeneratorNetwork['nodesByName']) - .filter((attribute) => !nonGeneratedNodes.includes(attribute)); + const desiredHeaderAttributes = Object.keys( + headerGeneratorNetwork['nodesByName'] + ).filter((attribute) => !nonGeneratedNodes.includes(attribute)); let selectedRecords = records.map((record) => { - return Object.entries(record).reduce((acc: typeof record, [key, value]) => { - if (desiredHeaderAttributes.includes(key)) acc[key] = value ?? missingValueDatasetToken; - return acc; - }, {}); + return Object.entries(record).reduce( + (acc: typeof record, [key, value]) => { + if (desiredHeaderAttributes.includes(key)) + acc[key] = value ?? missingValueDatasetToken; + return acc; + }, + {} + ); }); selectedRecords = selectedRecords.map((record) => { - const userAgent = (record['user-agent'] !== missingValueDatasetToken ? record['user-agent'] : record['User-Agent']).toLowerCase(); + const userAgent = ( + record['user-agent'] !== missingValueDatasetToken + ? record['user-agent'] + : record['User-Agent'] + ).toLowerCase(); const browser = this.getBrowserNameVersion(userAgent); const { device, operatingSystem } = this.getDeviceOS(userAgent); @@ -188,72 +303,131 @@ export class GeneratorNetworksCreator { [browserNodeName]: browser, [operatingSystemNodeName]: operatingSystem, [deviceNodeName]: device, - [browserHttpNodeName]: `${browser}|${(record[httpVersionNodeName] as string).startsWith('_1') ? '1' : '2'}`, + [browserHttpNodeName]: `${browser}|${ + (record[httpVersionNodeName] as string).startsWith('_1') + ? '1' + : '2' + }`, }; }); headerGeneratorNetwork.setProbabilitiesAccordingToData(selectedRecords); inputGeneratorNetwork.setProbabilitiesAccordingToData(selectedRecords); - const inputNetworkDefinitionPath = path.join(resultsPath, 'input-network-definition.zip'); - const headerNetworkDefinitionPath = path.join(resultsPath, 'header-network-definition.zip'); - const browserHelperFilePath = path.join(resultsPath, 'browser-helper-file.json'); - - headerGeneratorNetwork.saveNetworkDefinition({ path: headerNetworkDefinitionPath }); - inputGeneratorNetwork.saveNetworkDefinition({ path: inputNetworkDefinitionPath }); + const inputNetworkDefinitionPath = path.join( + resultsPath, + 'input-network-definition.zip' + ); + const headerNetworkDefinitionPath = path.join( + resultsPath, + 'header-network-definition.zip' + ); + const browserHelperFilePath = path.join( + resultsPath, + 'browser-helper-file.json' + ); + + headerGeneratorNetwork.saveNetworkDefinition({ + path: headerNetworkDefinitionPath, + }); + inputGeneratorNetwork.saveNetworkDefinition({ + path: inputNetworkDefinitionPath, + }); - const uniqueBrowsersAndHttps = Array.from(new Set(selectedRecords.map((record) => record[browserHttpNodeName]))); - fs.writeFileSync(browserHelperFilePath, JSON.stringify(uniqueBrowsersAndHttps)); + const uniqueBrowsersAndHttps = Array.from( + new Set( + selectedRecords.map((record) => record[browserHttpNodeName]) + ) + ); + fs.writeFileSync( + browserHelperFilePath, + JSON.stringify(uniqueBrowsersAndHttps) + ); } - async prepareFingerprintGeneratorFiles(datasetPath: string, resultsPath: string) { - const datasetText = fs.readFileSync(datasetPath, { encoding: 'utf8' }).replace(/^\ufeff/, ''); - const records = await prepareRecords(JSON.parse(datasetText), 'fingerprints'); + async prepareFingerprintGeneratorFiles( + datasetPath: string, + resultsPath: string + ) { + const datasetText = fs + .readFileSync(datasetPath, { encoding: 'utf8' }) + .replace(/^\ufeff/, ''); + const records = await prepareRecords( + JSON.parse(datasetText), + 'fingerprints' + ); for (let x = 0; x < records.length; x++) { // eslint-disable-next-line no-console - if (x % 1000 === 0) console.log(`Processing record ${x} of ${records.length}`); + if (x % 1000 === 0) + console.log(`Processing record ${x} of ${records.length}`); const record = records[x]; const pluginCharacteristics = {} as { [key: string]: string }; for (const pluginCharacteristicsAttribute of PLUGIN_CHARACTERISTICS_ATTRIBUTES) { if (pluginCharacteristicsAttribute in record) { if (record[pluginCharacteristicsAttribute] !== '') { - pluginCharacteristics[pluginCharacteristicsAttribute] = record[pluginCharacteristicsAttribute]; + pluginCharacteristics[pluginCharacteristicsAttribute] = + record[pluginCharacteristicsAttribute]; } delete record[pluginCharacteristicsAttribute]; } } - record.pluginsData = Object.keys(pluginCharacteristics).length !== 0 ? pluginCharacteristics : missingValueDatasetToken; + record.pluginsData = + Object.keys(pluginCharacteristics).length !== 0 + ? pluginCharacteristics + : missingValueDatasetToken; for (const attribute of Object.keys(record)) { if ([null, '', undefined].includes(record[attribute])) { record[attribute] = missingValueDatasetToken; } else { - record[attribute] = (typeof record[attribute] === 'string' || record[attribute] instanceof String) - ? record[attribute] - : (STRINGIFIED_PREFIX + JSON.stringify(record[attribute])); + record[attribute] = + typeof record[attribute] === 'string' || + record[attribute] instanceof String + ? record[attribute] + : STRINGIFIED_PREFIX + + JSON.stringify(record[attribute]); } } records[x] = record; } - const fingerprintGeneratorNetwork = new BayesianNetwork({ path: path.join(__dirname, 'network_structures', 'fingerprint-network-structure.zip') }); + const fingerprintGeneratorNetwork = new BayesianNetwork({ + path: path.join( + __dirname, + 'network_structures', + 'fingerprint-network-structure.zip' + ), + }); // eslint-disable-next-line dot-notation - const desiredFingerprintAttributes = Object.keys(fingerprintGeneratorNetwork['nodesByName']); + const desiredFingerprintAttributes = Object.keys( + fingerprintGeneratorNetwork['nodesByName'] + ); const selectedRecords = records.map((record) => { - return Object.entries(record).reduce((acc: typeof record, [key, value]) => { - if (desiredFingerprintAttributes.includes(key)) acc[key] = value ?? missingValueDatasetToken; - return acc; - }, {}); + return Object.entries(record).reduce( + (acc: typeof record, [key, value]) => { + if (desiredFingerprintAttributes.includes(key)) + acc[key] = value ?? missingValueDatasetToken; + return acc; + }, + {} + ); }); - const fingerprintNetworkDefinitionPath = path.join(resultsPath, 'fingerprint-network-definition.zip'); + const fingerprintNetworkDefinitionPath = path.join( + resultsPath, + 'fingerprint-network-definition.zip' + ); // eslint-disable-next-line no-console console.log('Building the fingerprint network...'); - fingerprintGeneratorNetwork.setProbabilitiesAccordingToData(selectedRecords); - fingerprintGeneratorNetwork.saveNetworkDefinition({ path: fingerprintNetworkDefinitionPath }); + fingerprintGeneratorNetwork.setProbabilitiesAccordingToData( + selectedRecords + ); + fingerprintGeneratorNetwork.saveNetworkDefinition({ + path: fingerprintNetworkDefinitionPath, + }); } }