From 666db54bad69e8fe05ea89f1e293012672f176cc Mon Sep 17 00:00:00 2001 From: Benjamin Altpeter Date: Mon, 5 Jun 2023 17:20:24 +0200 Subject: [PATCH] Fixes #6: Indicator matching (#15) --- README.md | 48 +++++++++++++++++- docs/README.md | 94 ++++++++++++++++++++++++++--------- package.json | 2 + src/index.ts | 129 ++++++++++++++++++++++++++++++++++++++++++++++--- yarn.lock | 10 ++++ 5 files changed, 249 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 7c8677a..4edcc67 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,14 @@ For research into mobile privacy and complaints against tracking, it is important to know what data is being transmitted in a request to a tracking server. But these requests are in a huge variety of different formats and often heavily nested and/or obfuscated, which hinders straightforward automatic analysis. TrackHAR aims to address this problem. It takes recorded traffic in a [HAR files](http://www.softwareishard.com/blog/har-12-spec/) as the input and returns a parsed list of the transmitted data (and, optionally, additional metadata like the tracking company and location in the data) for each request it can handle. -To achieve this, TrackHAR uses adapters written for specific tracking endpoints. In our [research](https://benjamin-altpeter.de/doc/thesis-consent-dialogs.pdf), we have found that generic approaches (like indicator matching in the raw transmitted plain text or [base64-encoded](https://github.com/baltpeter/base64-search) request content) are not sufficient due to the frankly ridiculous nesting and obfuscation we observed. In addition, approaches that search for static honey data values can never capture dynamic data types such as free disk space and current RAM usage, or low-entropy values like the operating system version (e.g. `11`). -However, we have also noticed that there is a comparatively small number of tracking endpoints which make up a large portion of all app traffic. 
This makes our adapter-based approach feasible to detect most of the transmitted tracking data. But it will never be possible to write an adapter for every request. As such, we plan to implement [support for indicator matching](https://github.com/tweaselORG/TrackHAR/issues/6) as a fallback for requests not covered by any adapter in the future. +To achieve this, TrackHAR uses two complementary approaches: adapter-based parsing and indicator matching. + +* **Adapter-based parsing**: Our main approach is to use adapters written for specific tracking endpoints. In our [research](https://benjamin-altpeter.de/doc/thesis-consent-dialogs.pdf), we have found that generic approaches (like indicator matching in the raw transmitted plain text or [base64-encoded](https://github.com/baltpeter/base64-search) request content) are not sufficient due to the frankly ridiculous nesting and obfuscation we observed. In addition, approaches that search for static honey data values can never capture dynamic data types such as free disk space and current RAM usage, or low-entropy values like the operating system version (e.g. `11`). + However, we have also noticed that there is a comparatively small number of tracking endpoints which make up a large portion of all app traffic. This makes our adapter-based approach feasible to detect most of the transmitted tracking data. + +* **Indicator matching**: But it will never be possible to write an adapter for every request. Thus, we use indicator matching as a fallback for requests not covered by any adapter. Indicator matching relies on the user providing known honey data values (such as the advertising ID or geolocation) that are then searched for in the requests. TrackHAR supports indicator matching for plain text, base64-encoded and URL-encoded values in the request headers, path, or body. It also tries to match case-insensitively where possible. 
+ +Note that TrackHAR is designed to err on the side of matching too little instead of overmatching. Both the adapters and indicator matching can miss transmitted tracking data. Conversely, however, you can be sure that any data that TrackHAR detects is actually transmitted. This is beneficial for research but also for legal enforcement against tracking. An important additional goal of TrackHAR is to produce outputs that make it possible to automatically generate human-readable documentation that allows people to comprehend why we detected each data transmission. This is especially important to submit complaints against illegal tracking to the data protection authorities. The generation of these reports is not handled by TrackHAR itself, but this requirement influences the design of our adapters and return values. As a result, the adapters are not regular functions that know how to handle a request, but implement a specific custom decoding "language" that can more easily be parsed and reasoned about automatically. This documentation is generated in [tweaselORG/tracker-wiki](https://github.com/tweaselORG/tracker-wiki) and hosted at [trackers.tweasel.org](https://trackers.tweasel.org). 
@@ -107,6 +113,44 @@ undefined } ``` +If you want to enable indicator matching for requests not handled by any adapter, you need to provide an object with indicator values for certain properties: + +```ts +import { readFile } from 'fs/promises'; +import { process as processHar } from 'trackhar'; + +(async () => { + const har = await readFile(process.argv[2], 'utf-8'); + + const indicators = { + localIp: [ '10.0.0.2', 'fd31:4159::a2a1' ], + idfa: '6a1c1487-a0af-4223-b142-a0f4621d0311' + }; + + const data = await processHar(JSON.parse(har), { indicatorValues: indicators }); + for (const request of data) console.log(request, '\n'); +})(); +``` + +With this, we can see that our device's advertising ID was transmitted in the first request, after all: + +```ts +[ + { + adapter: 'indicators', + property: 'idfa', + context: 'body', + path: '$[12]', + reasoning: 'indicator matching (base64)', + value: 'NmExYzE0ODctYTBhZi00MjIzLWIxNDItYTBmNDYyMWQwMzEx' + } +] + +// [second request as before…] +``` + +In this case, it was not transmitted as plain text but base64-encoded. TrackHAR was still able to detect it. The `path` indicates the index into the body where the IDFA was found. + ## Contributing adapters As stated, TrackHAR uses so-called adapters to detect tracking traffic. They are JavaScript objects defining a decoding algorithm for the request and the paths to the transmitted data in the decoded request. For each endpoint of a tracker, a separate adapter needs to be defined. To determine which adapter fits a request, the URL is matched against the `endpointUrls` of the adapter, which can either just use string matching or a regular expression. If one of the endpoints matches, the adapter is chosen to analyze the request. Where the same endpoint expects different data formats, multiple adapters with identical `endpointUrls` might be required. In that case, the `match` function of an adapter will be used to determine which adapter to apply to a request. 
The first adapter to return `true` in its matching method is chosen. Only one adapter can match a request at a time. diff --git a/docs/README.md b/docs/README.md index 5e36097..cdd161a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,6 +13,7 @@ trackhar - [DataPath](README.md#datapath) - [DecodingStep](README.md#decodingstep) - [Identifier](README.md#identifier) +- [IndicatorValues](README.md#indicatorvalues) - [JsonPath](README.md#jsonpath) - [Path](README.md#path) - [Property](README.md#property) @@ -56,30 +57,37 @@ The first adapter that matches a request will be used to decode it. #### Defined in -[index.ts:166](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L166) +[index.ts:168](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L168) ___ ### AnnotatedResult -Ƭ **AnnotatedResult**: { `adapter`: `string` ; `property`: [`Property`](README.md#property) ; `value`: [`TrackingDataValue`](README.md#trackingdatavalue) } & [`DataPath`](README.md#datapath)[] +Ƭ **AnnotatedResult**: { `adapter`: `string` ; `property`: `LiteralUnion`<[`Property`](README.md#property), `string`\> ; `reasoning`: [`DataPath`](README.md#datapath)[``"reasoning"``] \| ``"indicator matching (plain text)"`` \| ``"indicator matching (base64)"`` \| ``"indicator matching (URL-encoded)"`` ; `value`: [`TrackingDataValue`](README.md#trackingdatavalue) } & `Omit`<[`DataPath`](README.md#datapath), ``"reasoning"``\>[] Extended version of the [Result](README.md#result) type that includes additional metadata about the detected tracking. Each entry in the array is one instance of a tracking data value that was found in a request, with the following properties: -- `adapter`: The adapter that detected the tracking data (`/`). +- `adapter`: The adapter that detected the tracking data (`/`) or `indicators` if the entry + was detected through indicator matching. - `property`: The type of tracking data that was detected. 
- `value`: The actual value of the tracking data that was transmitted. - `context`: The part of the request in which the tracking data was found (e.g. `body`, `path`). - `path`: A JSONPath expression indicating where this match was found. Note that while we try to keep this path as close as possible to the format used by the tracker, it refers to the decoded request, after our processing steps. This is unavoidable as the trackers don't transmit in a standardized format. + + If indicator matching was used to detect this entry, the path will point to the first character of the match in the + respective part of the request. - `reasoning`: An explanation of how we concluded that this is information is actually the type of data we labelled it as. This can either be a standardized description, or a URL to a more in-depth research report. + If indicator matching was used to detect this entry, the reasoning will be `indicator matching` followed by the + encoding that was used to match the indicator value in parentheses. + #### Defined in -[index.ts:298](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L298) +[index.ts:371](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L371) ___ @@ -109,7 +117,7 @@ A part of a request, to explain where some information was found. 
#### Defined in -[index.ts:18](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L18) +[index.ts:20](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L20) ___ @@ -129,7 +137,7 @@ A description of where a certain piece of tracking data can be found in the deco #### Defined in -[index.ts:148](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L148) +[index.ts:150](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L150) ___ @@ -165,7 +173,7 @@ The following `function`s are available: #### Defined in -[index.ts:139](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L139) +[index.ts:141](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L141) ___ @@ -178,7 +186,35 @@ An identifer for a variable or nested property on the global state in the decodi #### Defined in -[index.ts:107](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L107) +[index.ts:109](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L109) + +___ + +### IndicatorValues + +Ƭ **IndicatorValues**: `Partial`<`Record`<`LiteralUnion`<[`Property`](README.md#property), `string`\>, [`ArrayOrSingle`](README.md#arrayorsingle)<`string`\>\>\> + +A mapping from properties (standardized names for certain types of tracking data) to indicator values (known honey +data strings that appear in the request if the property is present). Indicator values can be provided as arrays or +single strings. They are automatically matched against their encoded versions (e.g. base64 and URL-encoded). Where +possible, they are matched case-insensitively. + +**`Example`** + +```ts +{ + "localIp": ["10.0.0.2", "fd31:4159::a2a1"], + "idfa": "6a1c1487-a0af-4223-b142-a0f4621d0311" +} +``` + +This example means that if the string `10.0.0.2` or `fd31:4159::a2a1` is found in the request, it indicates that the +local IP is being transmitted. 
Similarly, if the string `6a1c1487-a0af-4223-b142-a0f4621d0311` is found in the +request, it indicates that the advertising ID is being transmitted. + +#### Defined in + +[index.ts:409](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L409) ___ @@ -190,7 +226,7 @@ A JSONPath expression to be parsed by https://github.com/JSONPath-Plus/JSONPath. #### Defined in -[index.ts:11](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L11) +[index.ts:13](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L13) ___ @@ -203,7 +239,7 @@ process of a request. #### Defined in -[index.ts:102](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L102) +[index.ts:104](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L104) ___ @@ -218,7 +254,7 @@ by the tracker. #### Defined in -[index.ts:43](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L43) +[index.ts:45](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L45) ___ @@ -252,14 +288,17 @@ ___ ### Result -Ƭ **Result**: `Partial`<`Record`<[`Property`](README.md#property), [`TrackingDataValue`](README.md#trackingdatavalue)[]\>\> +Ƭ **Result**: `Partial`<`Record`<`LiteralUnion`<[`Property`](README.md#property), `string`\>, [`TrackingDataValue`](README.md#trackingdatavalue)[]\>\> A mapping from properties (standardized names for certain types of tracking data) to the actual instances of values of that property found in a request. +If indicator matching is enabled, it is not possible to distinguish between instances detected through adapter and +indicator matching. + #### Defined in -[index.ts:303](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L303) +[index.ts:388](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L388) ___ @@ -280,7 +319,7 @@ A tracking company that we have adapters for. 
#### Defined in -[index.ts:21](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L21) +[index.ts:23](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L23) ___ @@ -292,7 +331,7 @@ Some value transmitted by a tracker. We don't have any type information about it #### Defined in -[index.ts:15](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L15) +[index.ts:17](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L17) ___ @@ -304,7 +343,7 @@ A variable on the global state used in the decoding process of a request. This d #### Defined in -[index.ts:97](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L97) +[index.ts:99](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L99) ## Variables @@ -321,16 +360,19 @@ generate the information in [`tracker-wiki`](https://github.com/tweaselORG/track #### Defined in -[index.ts:347](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L347) +[index.ts:460](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L460) ## Functions ### process -▸ **process**<`ValuesOnly`\>(`har`, `options?`): `Promise`<`ValuesOnly` extends ``true`` ? (`undefined` \| `Partial`<`Record`<[`Property`](README.md#property), `any`[]\>\>)[] : (`undefined` \| [`AnnotatedResult`](README.md#annotatedresult))[]\> +▸ **process**<`ValuesOnly`\>(`har`, `options?`): `Promise`<`ValuesOnly` extends ``true`` ? (`undefined` \| `Partial`<`Record`<`LiteralUnion`<[`Property`](README.md#property), `string`\>, `any`[]\>\>)[] : (`undefined` \| [`AnnotatedResult`](README.md#annotatedresult))[]\> Parse the requests in a HAR traffic dump and extract tracking data. +This always tries to parse requests with the tracker-specific adapters first. If none of them can handle a request, +and `options.indicatorValues` is provided, it will fall back to indicator matching. + #### Type parameters | Name | Type | @@ -342,25 +384,27 @@ Parse the requests in a HAR traffic dump and extract tracking data. 
| Name | Type | Description | | :------ | :------ | :------ | | `har` | `Har` | A traffic dump in HAR format. | -| `options?` | `Object` | An optional object that can configure the following options: - `valuesOnly`: By default, the result contains not just the values but also various metadata (like the adapter that processed the request). If you only need the values, you can set this option to `true` to get a simpler result. | +| `options?` | `Object` | An optional object that can configure the following options: - `valuesOnly`: By default, the result contains not just the values but also various metadata (like the adapter that processed the request). If you only need the values, you can set this option to `true` to get a simpler result. - `indicatorValues`: An object that specifies known honey data values for certain properties. If no adapter could match the request but indicator values are provided, this function will fall back to indicator matching and try to find the indicator values in the request headers, path or body. See [IndicatorValues](README.md#indicatorvalues). | +| `options.indicatorValues?` | `Partial`<`Record`<`LiteralUnion`<[`Property`](README.md#property), `string`\>, [`ArrayOrSingle`](README.md#arrayorsingle)<`string`\>\>\> | - | | `options.valuesOnly?` | `ValuesOnly` | - | #### Returns -`Promise`<`ValuesOnly` extends ``true`` ? (`undefined` \| `Partial`<`Record`<[`Property`](README.md#property), `any`[]\>\>)[] : (`undefined` \| [`AnnotatedResult`](README.md#annotatedresult))[]\> +`Promise`<`ValuesOnly` extends ``true`` ? (`undefined` \| `Partial`<`Record`<`LiteralUnion`<[`Property`](README.md#property), `string`\>, `any`[]\>\>)[] : (`undefined` \| [`AnnotatedResult`](README.md#annotatedresult))[]\> An array of results, corresponding to each request in the HAR file. If a request could not be processed - (i.e. if no adapter was found that could handle it), the corresponding entry in the array will be `undefined`. + (i.e. 
if no adapter was found that could handle it and indicator matching, if enabled, didn't produce any results), + the corresponding entry in the array will be `undefined`. #### Defined in -[index.ts:318](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L318) +[index.ts:431](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L431) ___ ### processRequest -▸ **processRequest**(`request`): `undefined` \| [`AnnotatedResult`](README.md#annotatedresult) +▸ **processRequest**(`request`, `options?`): `undefined` \| [`AnnotatedResult`](README.md#annotatedresult) Parse a single request in our internal request representation and extract tracking data as an annotated result from it. @@ -374,6 +418,8 @@ This is not needed for the main purposes of this library, but can be useful for | Name | Type | Description | | :------ | :------ | :------ | | `request` | [`Request`](README.md#request) | The request to process in our internal request format. | +| `options?` | `Object` | An optional object that can configure the following options: - `indicatorValues`: An object that specifies known honey data values for certain properties. If no adapter could match the request but indicator values are provided, this function will fall back to indicator matching and try to find the indicator values in the request headers, path or body. See [IndicatorValues](README.md#indicatorvalues). 
| +| `options.indicatorValues?` | `Partial`<`Record`<`LiteralUnion`<[`Property`](README.md#property), `string`\>, [`ArrayOrSingle`](README.md#arrayorsingle)<`string`\>\>\> | - | #### Returns @@ -381,4 +427,4 @@ This is not needed for the main purposes of this library, but can be useful for #### Defined in -[index.ts:261](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L261) +[index.ts:268](https://github.com/tweaselORG/TrackHAR/blob/main/src/index.ts#L268) diff --git a/package.json b/package.json index d5a5ccd..e87b597 100644 --- a/package.json +++ b/package.json @@ -51,7 +51,9 @@ "prettier": "@baltpeter/prettier-config", "dependencies": { "@types/har-format": "^1.2.10", + "base64-search": "^1.0.0", "cross-dirname": "^0.1.0", + "escape-string-regexp": "^5.0.0", "jsonpath-plus": "^7.2.0", "protobufjs": "^7.2.3", "qs": "^6.11.1" diff --git a/src/index.ts b/src/index.ts index 6331baa..62a1ee1 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,3 +1,5 @@ +import { base64Regex } from 'base64-search'; +import escapeStringRegexp from 'escape-string-regexp'; import type { Har } from 'har-format'; import { JSONPath } from 'jsonpath-plus'; import type { LiteralUnion } from 'type-fest'; @@ -257,11 +259,75 @@ const adapterForRequest = (r: Request) => * This is not needed for the main purposes of this library, but can be useful for more advanced use cases. * * @param request The request to process in our internal request format. + * @param options An optional object that can configure the following options: + * + * - `indicatorValues`: An object that specifies known honey data values for certain properties. If no adapter could match + * the request but indicator values are provided, this function will fall back to indicator matching and try to + * find the indicator values in the request headers, path or body. See {@link IndicatorValues}. 
*/ -export const processRequest = (request: Request): AnnotatedResult | undefined => { +export const processRequest = ( + request: Request, + options?: { indicatorValues?: IndicatorValues } +): AnnotatedResult | undefined => { const adapter = adapterForRequest(request); - if (!adapter) return undefined; + if (!adapter) { + if (!options?.indicatorValues) return undefined; + + // If no adapter could match the request but the user provided indicator values, we fall back to indicator + // matching. + const indicators = Object.entries(options.indicatorValues) + .map(([property, valueOrValues]) => + (Array.isArray(valueOrValues) ? valueOrValues : [valueOrValues]) + .filter((value): value is string => value !== undefined) + .map((value) => ({ + property: property as keyof IndicatorValues, + indicatorValue: value, + })) + ) + .flat(); + + const indicatorMatches = indicators + .map(({ property, indicatorValue }) => + (['header', 'path', 'body'] as const).map((context) => + (['plain text', 'base64', 'URL-encoded'] as const).map((encoding) => { + const haystack = + context === 'body' + ? request.content || '' + : context === 'path' + ? request.path + : (request.headers || []).map(({ name, value }) => `${name}: ${value}`).join('\n'); + const encodedIndicatorValue = + encoding === 'plain text' + ? indicatorValue + : encoding === 'base64' + ? base64Regex(indicatorValue) + : encodeURIComponent(indicatorValue); + // We don't want to match multiple times if the encoding is equivalent to plain text. + if (encoding !== 'plain text' && encodedIndicatorValue === indicatorValue) return undefined; + + const caseInsensitive = ['plain text', 'URL-encoded'].includes(encoding) ? 
'i' : ''; + const matches = haystack.matchAll( + new RegExp(escapeStringRegexp(encodedIndicatorValue), `g${caseInsensitive}`) + ); + return [...matches].map((m) => ({ + adapter: 'indicators', + property, + context, + path: `$[${m.index}]`, + reasoning: `indicator matching (${encoding})` as const, + value: m[0], + })); + }) + ) + ) + .flat(3) + .filter((r): r is Exclude => r !== undefined); + if (indicatorMatches.length > 0) return indicatorMatches; + return undefined; + } + + // If an adapter matched, we only return its results. const decodedRequest = decodeRequest(request, adapter.decodingSteps); const flattenedPaths = Object.entries(adapter.containedDataPaths) @@ -285,41 +351,88 @@ export const processRequest = (request: Request): AnnotatedResult | undefined => * Extended version of the {@link Result} type that includes additional metadata about the detected tracking. Each entry * in the array is one instance of a tracking data value that was found in a request, with the following properties: * - * - `adapter`: The adapter that detected the tracking data (`/`). + * - `adapter`: The adapter that detected the tracking data (`/`) or `indicators` if the entry + * was detected through indicator matching. * - `property`: The type of tracking data that was detected. * - `value`: The actual value of the tracking data that was transmitted. * - `context`: The part of the request in which the tracking data was found (e.g. `body`, `path`). * - `path`: A JSONPath expression indicating where this match was found. Note that while we try to keep this path as * close as possible to the format used by the tracker, it refers to the decoded request, after our processing steps. * This is unavoidable as the trackers don't transmit in a standardized format. + * + * If indicator matching was used to detect this entry, the path will point to the first character of the match in the + * respective part of the request. 
* - `reasoning`: An explanation of how we concluded that this is information is actually the type of data we labelled it * as. This can either be a standardized description, or a URL to a more in-depth research report. + * + * If indicator matching was used to detect this entry, the reasoning will be `indicator matching` followed by the + * encoding that was used to match the indicator value in parentheses. */ -export type AnnotatedResult = ({ adapter: string; property: Property; value: TrackingDataValue } & DataPath)[]; +export type AnnotatedResult = ({ + adapter: string; + property: LiteralUnion; + value: TrackingDataValue; + reasoning: + | DataPath['reasoning'] + | 'indicator matching (plain text)' + | 'indicator matching (base64)' + | 'indicator matching (URL-encoded)'; +} & Omit)[]; /** * A mapping from properties (standardized names for certain types of tracking data) to the actual instances of values * of that property found in a request. + * + * If indicator matching is enabled, it is not possible to distinguish between instances detected through adapter and + * indicator matching. + */ +export type Result = Partial, TrackingDataValue[]>>; + +/** + * A mapping from properties (standardized names for certain types of tracking data) to indicator values (known honey + * data strings that appear in the request if the property is present). Indicator values can be provided as arrays or + * single strings. They are automatically matched against their encoded versions (e.g. base64 and URL-encoded). Where + * possible, they are matched case-insensitively. + * + * @example + * + * ```ts + * { + * "localIp": ["10.0.0.2", "fd31:4159::a2a1"], + * "idfa": "6a1c1487-a0af-4223-b142-a0f4621d0311" + * } + * ``` + * + * This example means that if the string `10.0.0.2` or `fd31:4159::a2a1` is found in the request, it indicates that the + * local IP is being transmitted. 
Similarly, if the string `6a1c1487-a0af-4223-b142-a0f4621d0311` is found in the + * request, it indicates that the advertising ID is being transmitted. */ -export type Result = Partial>; +export type IndicatorValues = Partial, ArrayOrSingle>>; /** * Parse the requests in a HAR traffic dump and extract tracking data. * + * This always tries to parse requests with the tracker-specific adapters first. If none of them can handle a request, + * and `options.indicatorValues` is provided, it will fall back to indicator matching. + * * @param har A traffic dump in HAR format. * @param options An optional object that can configure the following options: * * - `valuesOnly`: By default, the result contains not just the values but also various metadata (like the adapter that * processed the request). If you only need the values, you can set this option to `true` to get a simpler * result. + * - `indicatorValues`: An object that specifies known honey data values for certain properties. If no adapter could match + * the request but indicator values are provided, this function will fall back to indicator matching and try to + * find the indicator values in the request headers, path or body. See {@link IndicatorValues}. * * @returns An array of results, corresponding to each request in the HAR file. If a request could not be processed - * (i.e. if no adapter was found that could handle it), the corresponding entry in the array will be `undefined`. + * (i.e. if no adapter was found that could handle it and indicator matching, if enabled, didn't produce any results), + * the corresponding entry in the array will be `undefined`. */ export const process = async ( har: Har, - options?: { valuesOnly?: ValuesOnly } + options?: { valuesOnly?: ValuesOnly; indicatorValues?: IndicatorValues } ): Promise => { - const res = await Promise.all(unhar(har).map(processRequest)); + const res = await Promise.all(unhar(har).map((r) => processRequest(r, options))); const ret = options?.valuesOnly ? 
res.map((req) => diff --git a/yarn.lock b/yarn.lock index 16230a7..f663b56 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1198,6 +1198,11 @@ base-x@^3.0.8: dependencies: safe-buffer "^5.0.1" +base64-search@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/base64-search/-/base64-search-1.0.0.tgz#8e3fe74d9999f056c26f69a3d3b7eb7b23f4c6f9" + integrity sha512-EHQrX/E0N/ODCli/npLoWGX5US3jWl5DEUHboNMlRDCbSWVK1dk5xcYSrcjz48bByzf1S0OI+VS4iDuY9Y85JA== + binary-searching@^2.0.5: version "2.0.5" resolved "https://registry.yarnpkg.com/binary-searching/-/binary-searching-2.0.5.tgz#ab6d08d51cd1b58878ae208ab61988f885b22dd3" @@ -1690,6 +1695,11 @@ escape-string-regexp@^4.0.0: resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz#14ba83a5d373e3d311e5afca29cf5bfad965bf34" integrity sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA== +escape-string-regexp@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz#4683126b500b61762f2dbebace1806e8be31b1c8" + integrity sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw== + eslint-config-prettier@^8.6.0: version "8.8.0" resolved "https://registry.yarnpkg.com/eslint-config-prettier/-/eslint-config-prettier-8.8.0.tgz#bfda738d412adc917fd7b038857110efe98c9348"