From d592576b9b02cb486a0fe966e9a29b7a8975558b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Berson?= Date: Fri, 23 Nov 2018 16:18:33 +0100 Subject: [PATCH 1/2] Add support for CSP (Content Security Policy) filters --- example/background.ts | 63 ++++++++++++++++++++++++-------------- index.ts | 2 +- src/utils.ts | 27 +++++++++++++++++ test/engine.test.ts | 70 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 137 insertions(+), 25 deletions(-) diff --git a/example/background.ts b/example/background.ts index 4f8d5ce46b..ae046cb046 100644 --- a/example/background.ts +++ b/example/background.ts @@ -83,39 +83,56 @@ chrome.tabs.onActivated.addListener(({ tabId }) => { updateBadgeCount(tabId); }); -loadAdblocker().then((engine) => { - function listener({ tabId, type, url }) { - let source; - if (tabs.has(tabId)) { - source = tabs.get(tabId).source; - } - const result = engine.match({ - sourceUrl: source, - type, - url, - }); - - if (result.redirect) { - incrementBlockedCounter(tabId); - return { redirectUrl: result.redirect }; - } else if (result.match) { - incrementBlockedCounter(tabId); - return { cancel: true }; - } - - return {}; +function requestFromDetails({ tabId, type, url }) { + let source; + if (tabs.has(tabId)) { + source = tabs.get(tabId).source; } + return { + sourceUrl: source, + type, + url, + }; +} +loadAdblocker().then((engine) => { // Start listening to requests, and allow 'blocking' so that we can cancel // some of them (or redirect). 
chrome.webRequest.onBeforeRequest.addListener( - listener, + (details) => { + const result = engine.match(requestFromDetails(details)); + + if (result.redirect) { + incrementBlockedCounter(details.tabId); + return { redirectUrl: result.redirect }; + } else if (result.match) { + incrementBlockedCounter(details.tabId); + return { cancel: true }; + } + + return {}; + }, { - urls: ['*://*/*'], + urls: ['<all_urls>'], }, ['blocking'], ); + chrome.webRequest.onHeadersReceived.addListener( + (details) => { + if (details.type !== 'main_frame') { + return {}; + } + + return adblocker.updateResponseHeadersWithCSP( + details, + engine.getCSPDirectives(requestFromDetails(details)), + ); + }, + { urls: ['<all_urls>'] }, + ['blocking', 'responseHeaders'], + ); + // Start listening to messages coming from the content-script chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => { // Extract hostname from sender's URL diff --git a/index.ts b/index.ts index 6a78d0600f..4d58f86228 100644 --- a/index.ts +++ b/index.ts @@ -17,4 +17,4 @@ export { f, parseList } from './src/parsing/list'; export { compactTokens, hasEmptyIntersection, mergeCompactSets } from './src/compact-set'; export { fetchLists, fetchResources } from './src/fetch'; -export { tokenize, fastHash } from './src/utils'; +export { tokenize, fastHash, updateResponseHeadersWithCSP } from './src/utils'; diff --git a/src/utils.ts b/src/utils.ts index 98c8532f96..003a4d5ebf 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -240,3 +240,30 @@ export function binSearch(arr: Uint32Array, elt: number): boolean { } return false; } + +export function updateResponseHeadersWithCSP( + details: chrome.webRequest.WebResponseHeadersDetails, + policies: string | undefined, +): chrome.webRequest.BlockingResponse { + if (policies === undefined) { + return {}; + } + + let responseHeaders = details.responseHeaders || []; + const CSP_HEADER_NAME = 'content-security-policy'; + + // Collect existing CSP headers from response + responseHeaders.forEach(({
name, value }) => { + if (name.toLowerCase() === CSP_HEADER_NAME) { + policies += `; ${value}`; + } + }); + + // Remove all CSP headers from response + responseHeaders = responseHeaders.filter(({ name }) => name.toLowerCase() !== CSP_HEADER_NAME); + + // Add updated CSP header + responseHeaders.push({ name: CSP_HEADER_NAME, value: policies }); + + return { responseHeaders }; +} diff --git a/test/engine.test.ts b/test/engine.test.ts index b1c12c0d41..9e2aae997e 100644 --- a/test/engine.test.ts +++ b/test/engine.test.ts @@ -1,7 +1,7 @@ import Engine from '../src/engine/engine'; import requests from './data/requests'; -function createEngine(filters: string, enableOptimizations: boolean) { +function createEngine(filters: string, enableOptimizations: boolean = true) { const newEngine = new Engine({ enableOptimizations, loadCosmeticFilters: true, @@ -26,6 +26,74 @@ function createEngine(filters: string, enableOptimizations: boolean) { } describe('#FiltersEngine', () => { + describe('cps policies', () => { + it('no policy in engine', () => { + expect( + createEngine('this is not a csp').getCSPDirectives({ + url: 'https://foo.com', + }), + ).toBeUndefined(); + }); + + it('does not match request', () => { + expect( + createEngine('||bar.com$csp=bar').getCSPDirectives({ + url: 'https://foo.com', + }), + ).toBeUndefined(); + }); + + it('matches request (1 policy)', () => { + expect( + createEngine('||foo.com$csp=bar').getCSPDirectives({ + url: 'https://foo.com', + }), + ).toEqual('bar'); + }); + + it('matches request (2 policy)', () => { + const policies = createEngine(` +||foo.com$csp=bar +$csp=baz,domain=bar.com +`).getCSPDirectives({ + sourceUrl: 'https://bar.com', + url: 'https://foo.com', + }); + + expect(policies).not.toBeUndefined(); + if (policies !== undefined) { + expect(policies.split('; ').sort()).toEqual(['bar', 'baz']); + } + }); + + it('matches request (1 policy with one exception)', () => { + expect( + createEngine(` +||foo.com$csp=bar +@@$csp=baz 
+$csp=baz,domain=bar.com +`).getCSPDirectives({ + sourceUrl: 'https://bar.com', + url: 'https://foo.com', + }), + ).toEqual('bar'); + }); + + it('exception global exception', () => { + expect( + createEngine(` +@@$csp,domain=bar.com +||foo.com$csp=bar +@@$csp=baz +$csp=baz,domain=bar.com +`).getCSPDirectives({ + sourceUrl: 'https://bar.com', + url: 'https://foo.com', + }), + ).toBeUndefined(); + }); + }); + describe('network filters', () => { const allRequestFilters = requests.map(({ filters }) => filters.join('\n')).join('\n'); From 596c415c8ad4833dcbd08d3c5934dfff450e6720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Berson?= Date: Mon, 26 Nov 2018 18:13:50 +0100 Subject: [PATCH 2/2] Optimize serialization + properly handle unicode in filters --- CHANGELOG.md | 1 + bench/micro.js | 2 +- bench/utils.js | 2 +- example/rollup.config.js | 4 +- package.json | 1 + rollup.config.js | 4 +- src/data-view.ts | 169 ++++++++++++++++++++++++++ src/dynamic-data-view.ts | 216 --------------------------------- src/encoding.ts | 20 --- src/engine/optimizer.ts | 55 +++------ src/parsing/cosmetic-filter.ts | 14 ++- src/parsing/network-filter.ts | 41 +++---- src/serialization.ts | 97 +++++++-------- src/utils.ts | 9 ++ test/serialization.test.ts | 15 +-- 15 files changed, 290 insertions(+), 360 deletions(-) create mode 100644 src/data-view.ts delete mode 100644 src/dynamic-data-view.ts delete mode 100644 src/encoding.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 89609d3425..54d134db40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ *2018-11-29* + * Optimize serialization and properly handle unicode in filters [#61](https://github.com/cliqz-oss/adblocker/pull/61) * Fix fuzzy matching by allowing tokens of any size [#61](https://github.com/cliqz-oss/adblocker/pull/62) * Add support for CSP (Content Security Policy) filters [#60](https://github.com/cliqz-oss/adblocker/pull/60) * Add hard-coded circumvention logic (+ IL defuser) 
[#59](https://github.com/cliqz-oss/adblocker/pull/59) diff --git a/bench/micro.js b/bench/micro.js index 6682c6700d..3dd17b7088 100644 --- a/bench/micro.js +++ b/bench/micro.js @@ -1,4 +1,4 @@ -const adblocker = require('../dist/adblocker.umd.min.js'); +const adblocker = require('../dist/adblocker.cjs.js'); const { createEngine } = require('./utils'); diff --git a/bench/utils.js b/bench/utils.js index 00c808ece3..d7cd33dbe8 100644 --- a/bench/utils.js +++ b/bench/utils.js @@ -1,5 +1,5 @@ const fs = require('fs'); -const adblocker = require('../dist/adblocker.umd.min.js'); +const adblocker = require('../dist/adblocker.cjs.js'); function createEngine(lists, resources, options = {}, serialize = false) { const engine = new adblocker.FiltersEngine({ diff --git a/example/rollup.config.js b/example/rollup.config.js index e275fd1203..788dd9936e 100644 --- a/example/rollup.config.js +++ b/example/rollup.config.js @@ -3,7 +3,9 @@ import commonjs from 'rollup-plugin-commonjs'; const plugins = [ - resolve(), + resolve({ + preferBuiltins: false, + }), commonjs(), ]; diff --git a/package.json b/package.json index b842020042..66abdbb327 100644 --- a/package.json +++ b/package.json @@ -56,6 +56,7 @@ "typescript": "^3.1.6" }, "dependencies": { + "punycode": "^2.1.1", "tldts": "^3.0.0", "tslib": "^1.9.3" } diff --git a/rollup.config.js b/rollup.config.js index 44877e9cea..99a59f9db1 100644 --- a/rollup.config.js +++ b/rollup.config.js @@ -3,7 +3,9 @@ import commonjs from 'rollup-plugin-commonjs'; import pkg from './package.json'; const plugins = [ - resolve(), + resolve({ + preferBuiltins: false, + }), commonjs(), ]; diff --git a/src/data-view.ts b/src/data-view.ts new file mode 100644 index 0000000000..231a6c1ff8 --- /dev/null +++ b/src/data-view.ts @@ -0,0 +1,169 @@ +import * as punycode from 'punycode'; +import { hasUnicode } from './utils'; + +/** + * @class StaticDataView + * + * This abstraction allows to serialize efficiently low-level values of types: + * String, uint8, 
uint16, uint32 while hiding the complexity of managing the + * current offset and growing. It should always be instantiated with a + * big-enough length because this will not allow for resizing. + * + * This class is also more efficient than the built-in `DataView`. + * + * The way this is used in practice is that you write pairs of functions to + * serialize (respectively deserialize) a given structure/class (with code being + * pretty symmetrical). In the serializer you `pushX` values, and in the + * deserializer you use `getX` functions to get back the values. + */ +export default class StaticDataView { + protected buffer: Uint8Array; + protected pos: number; + + constructor(length: number, buffer?: Uint8Array) { + this.buffer = buffer !== undefined ? buffer : new Uint8Array(length); + this.pos = 0; + } + + public seekZero(): void { + this.pos = 0; + } + + public crop(): Uint8Array { + if (this.pos >= this.buffer.byteLength) { + throw new Error( + `StaticDataView too small: ${this.buffer.byteLength}, but required ${this.pos - 1} bytes`, + ); + } + return this.buffer.subarray(0, this.pos); + } + + public set(buffer: Uint8Array): void { + this.buffer = new Uint8Array(buffer); + this.seekZero(); + } + + public pushByte(octet: number): void { + this.pushUint8(octet); + } + + public getByte(): number { + return this.getUint8(); + } + + public pushUint8(uint8: number): void { + this.buffer[this.pos] = uint8; + this.pos += 1; + } + + public getUint8(): number { + const uint8 = this.buffer[this.pos]; + this.pos += 1; + return uint8; + } + + public pushUint16(uint16: number): void { + this.buffer[this.pos] = uint16 >>> 8; + this.buffer[this.pos + 1] = uint16; + this.pos += 2; + } + + public getUint16(): number { + const uint16 = ((this.buffer[this.pos] << 8) | this.buffer[this.pos + 1]) >>> 0; + this.pos += 2; + return uint16; + } + + public pushUint32(uint32: number): void { + this.buffer[this.pos] = uint32 >>> 24; + this.buffer[this.pos + 1] = uint32 >>> 16; +
this.buffer[this.pos + 2] = uint32 >>> 8; + this.buffer[this.pos + 3] = uint32; + this.pos += 4; + } + + public pushUint32Array(arr: Uint32Array | undefined): void { + if (arr === undefined) { + this.pushUint16(0); + } else { + this.pushUint16(arr.length); + for (let i = 0; i < arr.length; i += 1) { + this.pushUint32(arr[i]); + } + } + } + + public getUint32Array(): Uint32Array | undefined { + const length = this.getUint16(); + if (length === 0) { + return undefined; + } + const arr = new Uint32Array(length); + for (let i = 0; i < length; i += 1) { + arr[i] = this.getUint32(); + } + return arr; + } + + public getUint32(): number { + const uint32 = + (((this.buffer[this.pos] << 24) >>> 0) + + ((this.buffer[this.pos + 1] << 16) | + (this.buffer[this.pos + 2] << 8) | + this.buffer[this.pos + 3])) >>> + 0; + this.pos += 4; + return uint32; + } + + public pushUTF8(str: string | undefined): void { + if (str === undefined) { + this.pushUint16(0); + } else { + this.pushUint16(str.length); + if (hasUnicode(str)) { + this.pushASCII(punycode.encode(str)); + } else { + this.pushASCII(str); + } + } + } + + public getUTF8(): string | undefined { + const length = this.getUint16(); + if (length === 0) { + return undefined; + } + + const str = this.getASCII(); + if (str === undefined || str.length === length) { + return str; + } + return punycode.decode(str); + } + + public pushASCII(str: string | undefined): void { + if (str === undefined) { + this.pushUint16(0); + } else { + this.pushUint16(str.length); + const len = str.length; + const offset = this.pos; + for (let i = 0; i < len; i += 1) { + this.buffer[offset + i] = str.charCodeAt(i); + } + this.pos += len; + } + } + + public getASCII(): string | undefined { + const byteLength = this.getUint16(); + + if (byteLength === 0) { + return undefined; + } + + this.pos += byteLength; + return String.fromCharCode.apply(null, this.buffer.subarray(this.pos - byteLength, this.pos)); + } +} diff --git a/src/dynamic-data-view.ts 
b/src/dynamic-data-view.ts deleted file mode 100644 index 7436e552b6..0000000000 --- a/src/dynamic-data-view.ts +++ /dev/null @@ -1,216 +0,0 @@ -import { decode, encode } from './encoding'; - -/** - * @class DynamicDataView - * - * This abstraction allows to serialize efficiently low-level values of types: - * String, uint8, uint16, uint32 while hiding the complexity of managing the - * current offset and growing. If initialized with a big enough `length`, it - * might also not require any resize (thus enabling serializationg with a single - * memory allocation). - * - * This class is also more efficient than the built-in `DataView`. - * - * The way this is used in practice is that you write pairs of function to - * serialize (respectively) deserialize a given structure/class (with code being - * pretty symetrical). In the serializer you `pushX` values, and in the - * deserializer you use `getX` functions to get back the values. - */ -export default class DynamicDataView { - private buffer: Uint8Array; - private pos: number; - - constructor(length: number) { - this.buffer = new Uint8Array(length); - this.pos = 0; - } - - public seek(pos: number = 0): void { - this.pos = pos; - } - - public crop(): Uint8Array { - return this.buffer.subarray(0, this.pos); - } - - public set(buffer: Uint8Array): void { - this.buffer = new Uint8Array(buffer); - this.seek(0); - } - - public pushBytes(bytes: Uint8Array): void { - this.checkShouldResize(bytes.byteLength); - this.buffer.set(bytes, this.pos); - this.pos += bytes.byteLength; - } - - public pushByte(octet: number): void { - this.pushUint8(octet); - } - - public pushUint8(uint8: number): void { - this.checkShouldResize(1); - this.buffer[this.pos] = uint8; - this.pos += 1; - } - - public pushUint16(uint16: number): void { - this.checkShouldResize(2); - this.buffer[this.pos] = uint16 >>> 8; - this.buffer[this.pos + 1] = uint16; - this.pos += 2; - } - - public pushUint32(uint32: number): void { - this.checkShouldResize(4); - 
this.buffer[this.pos] = uint32 >>> 24; - this.buffer[this.pos + 1] = uint32 >>> 16; - this.buffer[this.pos + 2] = uint32 >>> 8; - this.buffer[this.pos + 3] = uint32; - this.pos += 4; - } - - public pushUTF8(str: string): void { - const buffer = encode(str); - this.pushUint16(buffer.byteLength); - this.pushBytes(buffer); - } - - public pushUint32Array(arr: Uint32Array | undefined): void { - if (arr === undefined) { - this.pushUint16(0); - } else { - this.pushUint16(arr.length); - for (let i = 0; i < arr.length; i += 1) { - this.pushUint32(arr[i]); - } - } - } - - /** - * This method is very optimistic and will assume that by default every string - * is ascii only, but fallback to a slower utf-8 method if a non-ascii char is - * encountered in the process of pushing the string. - * - * WARNING: Currently only strings of size <= 65k can be stored. - */ - public pushStr(str: string | undefined): void { - if (str === undefined) { - // Special handling for empty strings - this.checkShouldResize(2); - this.pushUint16(0); - } else { - // Keep track of original position to be able to fallback - // to pushUTF8 if we encounter non-ascii characters. 
- const originalPos = this.pos; - let foundUnicode = false; - - this.checkShouldResize(2 + str.length); - this.pushUint16(str.length); - - const offset = this.pos; - const buffer = this.buffer; - for (let i = 0; i < str.length && !foundUnicode; i += 1) { - const ch = str.charCodeAt(i); - buffer[offset + i] = ch; - foundUnicode = foundUnicode || ch > 127; - } - - if (foundUnicode) { - // Fallback to a slower utf-8 text encoder - this.pos = originalPos; - this.pushUTF8(str); - } else { - this.pos += str.length; - } - } - } - - // Read next value - - public getBytes(n: number): Uint8Array { - const bytes = this.buffer.subarray(this.pos, this.pos + n); - this.pos += n; - return bytes; - } - - public getByte(): number { - return this.getUint8(); - } - - public getUint8(): number { - const uint8 = this.buffer[this.pos]; - this.pos += 1; - return uint8; - } - - public getUint16(): number { - const uint16 = ((this.buffer[this.pos] << 8) | this.buffer[this.pos + 1]) >>> 0; - this.pos += 2; - return uint16; - } - - public getUint32(): number { - const uint32 = - (((this.buffer[this.pos] << 24) >>> 0) + - ((this.buffer[this.pos + 1] << 16) | - (this.buffer[this.pos + 2] << 8) | - this.buffer[this.pos + 3])) >>> - 0; - this.pos += 4; - return uint32; - } - - public getUTF8(): string { - return decode(this.getBytes(this.getUint16())); - } - - public getStr(): string | undefined { - // Keep track of original position to be able to fallback - // to getUTF8 if we encounter non-ascii characters. - const originalPos = this.pos; - const size = this.getUint16(); - - // Special handling for empty strings - if (size === 0) { - return undefined; - } - - // Check if there is a non-ascii character in the string. 
- let i = 0; - for (; i < size && this.buffer[this.pos + i] <= 127; i += 1) { - /* empty */ - } - - if (i < size) { - this.pos = originalPos; - return this.getUTF8(); - } - - return String.fromCharCode.apply(null, this.getBytes(size)); - } - - public getUint32Array(): Uint32Array | undefined { - const length = this.getUint16(); - if (length > 0) { - const arr = new Uint32Array(length); - for (let i = 0; i < length; i += 1) { - arr[i] = this.getUint32(); - } - return arr; - } - return undefined; - } - - private checkShouldResize(n: number): void { - if (this.pos + n >= this.buffer.byteLength) { - this.resize(n); - } - } - - private resize(n: number = 0): void { - const newBuffer = new Uint8Array(Math.floor((this.pos + n) * 1.5)); - newBuffer.set(this.buffer); - this.buffer = newBuffer; - } -} diff --git a/src/encoding.ts b/src/encoding.ts deleted file mode 100644 index 4895dbbbc4..0000000000 --- a/src/encoding.ts +++ /dev/null @@ -1,20 +0,0 @@ -function fromString(str: string): Uint8Array { - const res = new Uint8Array(str.length); - const len = str.length; - for (let i = 0; i < len; i += 1) { - res[i] = str.charCodeAt(i); - } - return res; -} - -declare function escape(s: string): string; -declare function unescape(s: string): string; - -// http://ecmanaut.blogspot.de/2006/07/encoding-decoding-utf8-in-javascript.html -export function encode(s: string): Uint8Array { - return fromString(unescape(encodeURIComponent(s))); -} - -export function decode(bytes: Uint8Array): string { - return decodeURIComponent(escape(String.fromCharCode.apply(null, bytes))); -} diff --git a/src/engine/optimizer.ts b/src/engine/optimizer.ts index aedcd9ba3a..d9cd63696a 100644 --- a/src/engine/optimizer.ts +++ b/src/engine/optimizer.ts @@ -1,4 +1,5 @@ -import { NetworkFilter } from '../parsing/network-filter'; +import { NETWORK_FILTER_MASK, NetworkFilter } from '../parsing/network-filter'; +import { setBit } from '../utils'; function processRegex(r: RegExp): string { return 
`(?:${r.source})`; @@ -66,13 +67,6 @@ const OPTIMIZATIONS: IOptimization[] = [ { description: 'Group idential filter with same mask but different domains in single filters', fusion: (filters: NetworkFilter[]) => { - const filter = new NetworkFilter(filters[0]); - - // Keep track of original filters view rawLine attribute - if (filter.rawLine !== undefined) { - filter.rawLine = filters.map(({ rawLine }) => rawLine).join(' <+> '); - } - const domains: Set = new Set(); const notDomains: Set = new Set(); @@ -90,19 +84,15 @@ const OPTIMIZATIONS: IOptimization[] = [ } } - if (domains.size > 0) { - filter.optDomains = new Uint32Array(domains); - } else { - filter.optDomains = undefined; - } - - if (notDomains.size > 0) { - filter.optNotDomains = new Uint32Array(notDomains); - } else { - filter.optNotDomains = undefined; - } - - return filter; + return new NetworkFilter({ + ...filters[0], + optDomains: domains.size > 0 ? new Uint32Array(domains) : undefined, + optNotDomains: notDomains.size > 0 ? new Uint32Array(notDomains) : undefined, + rawLine: + filters[0].rawLine !== undefined + ? 
filters.map(({ rawLine }) => rawLine).join(' <+> ') + : undefined, + }); }, groupByCriteria: (filter: NetworkFilter) => filter.getHostname() + filter.getFilter() + filter.getMask() + filter.getRedirect(), @@ -114,13 +104,6 @@ const OPTIMIZATIONS: IOptimization[] = [ { description: 'Group simple patterns, into a single filter', fusion: (filters: NetworkFilter[]) => { - const filter = new NetworkFilter(filters[0]); - - // Keep track of original filters view rawLine attribute - if (filter.rawLine !== undefined) { - filter.rawLine = filters.map(({ rawLine }) => rawLine).join(' <+> '); - } - const patterns: string[] = []; for (let i = 0; i < filters.length; i += 1) { const f = filters[i]; @@ -135,13 +118,15 @@ const OPTIMIZATIONS: IOptimization[] = [ } } - if (patterns.length > 0) { - filter.setRegex(new RegExp(patterns.join('|'))); - } else { - filter.filter = undefined; - } - - return filter; + return new NetworkFilter({ + ...filters[0], + mask: setBit(filters[0].mask, NETWORK_FILTER_MASK.isRegex), + rawLine: + filters[0].rawLine !== undefined + ? 
filters.map(({ rawLine }) => rawLine).join(' <+> ') + : undefined, + regex: new RegExp(patterns.join('|')), + }); }, groupByCriteria: (filter: NetworkFilter) => '' + filter.getMask(), select: (filter: NetworkFilter) => diff --git a/src/parsing/cosmetic-filter.ts b/src/parsing/cosmetic-filter.ts index 8d82da3784..ead9b2a3fc 100644 --- a/src/parsing/cosmetic-filter.ts +++ b/src/parsing/cosmetic-filter.ts @@ -1,4 +1,5 @@ -import { fastStartsWithFrom, getBit, setBit, tokenizeCSS } from '../utils'; +import * as punycode from 'punycode'; +import { fastStartsWithFrom, getBit, hasUnicode, setBit, tokenizeCSS } from '../utils'; import IFilter from './interface'; /** @@ -53,14 +54,12 @@ const TOKENS_BUFFER = new Uint32Array(200); * - xpath */ export class CosmeticFilter implements IFilter { - public mask: number; - public selector?: string; - public hostnames?: string; + public readonly mask: number; + public selector?: string; // TODO - set to read-only + public readonly hostnames?: string; - // For debug only public id?: number; public rawLine?: string; - private hostnamesArray?: string[]; constructor({ @@ -263,6 +262,9 @@ export function parseCosmeticFilter(line: string): CosmeticFilter | null { // Parse hostnames if (sharpIndex > 0) { hostnames = line.slice(0, sharpIndex); + if (hasUnicode(hostnames)) { + hostnames = punycode.encode(hostnames); + } } // Deal with script:inject and script:contains diff --git a/src/parsing/network-filter.ts b/src/parsing/network-filter.ts index 08e5893ebd..27b932ab61 100644 --- a/src/parsing/network-filter.ts +++ b/src/parsing/network-filter.ts @@ -1,3 +1,4 @@ +import * as punycode from 'punycode'; import { RequestType } from '../request'; import { clearBit, @@ -6,6 +7,7 @@ import { fastStartsWith, fastStartsWithFrom, getBit, + hasUnicode, setBit, tokenize, tokenizeFilter, @@ -17,7 +19,7 @@ const TOKENS_BUFFER = new Uint32Array(200); /** * Masks used to store options of network filters in a bitmask. 
*/ -const enum NETWORK_FILTER_MASK { +export const enum NETWORK_FILTER_MASK { // Content Policy Type fromImage = 1 << 0, fromMedia = 1 << 1, @@ -173,14 +175,13 @@ const MATCH_ALL = new RegExp(''); // - genericblock // 2. Replace `split` with `substr` export class NetworkFilter implements IFilter { - public mask: number; - - public filter?: string; - public optDomains?: Uint32Array; - public optNotDomains?: Uint32Array; - public redirect?: string; - public hostname?: string; - public csp?: string; + public readonly mask: number; + public readonly filter?: string; + public readonly optDomains?: Uint32Array; + public readonly optNotDomains?: Uint32Array; + public readonly redirect?: string; + public readonly hostname?: string; + public readonly csp?: string; // Set only in debug mode public rawLine?: string; @@ -188,7 +189,7 @@ export class NetworkFilter implements IFilter { public id?: number; private fuzzySignature?: Uint32Array; private regex?: RegExp; - private optimized: boolean; + private optimized: boolean = false; constructor({ csp, @@ -200,17 +201,18 @@ export class NetworkFilter implements IFilter { optNotDomains, rawLine, redirect, - }: { mask: number } & Partial) { - this.mask = mask; - this.id = id; - this.optimized = false; + regex, + }: { mask: number; regex?: RegExp } & Partial) { this.csp = csp; this.filter = filter; this.hostname = hostname; + this.id = id; + this.mask = mask; this.optDomains = optDomains; this.optNotDomains = optNotDomains; this.rawLine = rawLine; this.redirect = redirect; + this.regex = regex; } public isCosmeticFilter() { @@ -406,14 +408,6 @@ export class NetworkFilter implements IFilter { return this.filter || ''; } - /** - * Special method, should only be used by the filter optimizer - */ - public setRegex(re: RegExp): void { - this.regex = re; - this.mask = setBit(this.mask, NETWORK_FILTER_MASK.isRegex); - } - public getRegex(): RegExp { this.optimize(); return this.regex || MATCH_ALL; @@ -998,6 +992,9 @@ export function 
parseNetworkFilter(rawLine: string): NetworkFilter | null { hostname = hostname.slice(4); } hostname = hostname.toLowerCase(); + if (hasUnicode(hostname)) { + hostname = punycode.toASCII(hostname); + } } return new NetworkFilter({ diff --git a/src/serialization.ts b/src/serialization.ts index 2fb274a68a..2579adebcd 100644 --- a/src/serialization.ts +++ b/src/serialization.ts @@ -1,11 +1,9 @@ /** * This modules contains all functions and utils to serialize the adblocker - * efficiently. The central part if `DynamicDataView`, a dynamically growing - * ArrayBuffer exposing an API allowing to set values of type: String, uint8, - * uint16 and uint32 efficiently. + * efficiently. The central part is `StaticDataView`. */ -import DynamicDataView from './dynamic-data-view'; +import StaticDataView from './data-view'; import Engine from './engine/engine'; import IList from './engine/list'; import ReverseIndex, { IBucket, newBucket } from './engine/reverse-index'; @@ -49,7 +47,7 @@ import { NetworkFilter } from './parsing/network-filter'; * * first byte could contain the mask as well if small enough. * * when packing ascii string, store several of them in each byte. */ -function serializeNetworkFilter(filter: NetworkFilter, buffer: DynamicDataView): void { +function serializeNetworkFilter(filter: NetworkFilter, buffer: StaticDataView): void { // Check number of optional parts (e.g.: filter, hostname, etc.)
let numberOfOptionalParts = 0; @@ -75,12 +73,12 @@ function serializeNetworkFilter(filter: NetworkFilter, buffer: DynamicDataView): return; } - buffer.pushStr(filter.hostname); + buffer.pushASCII(filter.hostname); if (numberOfOptionalParts === 1) { return; } - buffer.pushStr(filter.filter); + buffer.pushASCII(filter.filter); if (numberOfOptionalParts === 2) { return; } @@ -95,19 +93,19 @@ function serializeNetworkFilter(filter: NetworkFilter, buffer: DynamicDataView): return; } - buffer.pushStr(filter.redirect); + buffer.pushASCII(filter.redirect); if (numberOfOptionalParts === 5) { return; } - buffer.pushStr(filter.csp); + buffer.pushASCII(filter.csp); } /** * Deserialize network filters. The code accessing the buffer should be * symetrical to the one in `serializeNetworkFilter`. */ -function deserializeNetworkFilter(buffer: DynamicDataView): NetworkFilter { +function deserializeNetworkFilter(buffer: StaticDataView): NetworkFilter { const id = buffer.getUint32(); const mask = buffer.getUint32(); const numberOfOptionalParts = buffer.getUint8(); @@ -120,10 +118,10 @@ function deserializeNetworkFilter(buffer: DynamicDataView): NetworkFilter { let csp: string | undefined; if (numberOfOptionalParts > 0) { - hostname = buffer.getStr(); + hostname = buffer.getASCII(); } if (numberOfOptionalParts > 1) { - filter = buffer.getStr(); + filter = buffer.getASCII(); } if (numberOfOptionalParts > 2) { optDomains = buffer.getUint32Array(); @@ -132,10 +130,10 @@ function deserializeNetworkFilter(buffer: DynamicDataView): NetworkFilter { optNotDomains = buffer.getUint32Array(); } if (numberOfOptionalParts > 4) { - redirect = buffer.getStr(); + redirect = buffer.getASCII(); } if (numberOfOptionalParts > 5) { - csp = buffer.getStr(); + csp = buffer.getASCII(); } return new NetworkFilter({ @@ -162,39 +160,39 @@ function deserializeNetworkFilter(buffer: DynamicDataView): NetworkFilter { * Improvements similar to the onces mentioned in `serializeNetworkFilters` * could be applied here, 
to get a more compact representation. */ -function serializeCosmeticFilter(filter: CosmeticFilter, buffer: DynamicDataView): void { +function serializeCosmeticFilter(filter: CosmeticFilter, buffer: StaticDataView): void { buffer.pushUint32(filter.getId()); buffer.pushUint8(filter.mask); - buffer.pushStr(filter.selector); - buffer.pushStr(filter.hostnames); + buffer.pushASCII(filter.hostnames); + buffer.pushUTF8(filter.selector); } /** * Deserialize cosmetic filters. The code accessing the buffer should be * symetrical to the one in `serializeCosmeticFilter`. */ -function deserializeCosmeticFilter(buffer: DynamicDataView): CosmeticFilter { +function deserializeCosmeticFilter(buffer: StaticDataView): CosmeticFilter { const id = buffer.getUint32(); const mask = buffer.getUint8(); - const selector = buffer.getStr(); - const hostnames = buffer.getStr(); + const hostnames = buffer.getASCII(); + const selector = buffer.getUTF8(); return new CosmeticFilter({ - hostnames: hostnames || undefined, + hostnames, id, mask, - selector: selector || undefined, + selector, }); } -function serializeNetworkFilters(filters: NetworkFilter[], buffer: DynamicDataView): void { +function serializeNetworkFilters(filters: NetworkFilter[], buffer: StaticDataView): void { buffer.pushUint32(filters.length); for (let i = 0; i < filters.length; i += 1) { serializeNetworkFilter(filters[i], buffer); } } -function serializeCosmeticFilters(filters: CosmeticFilter[], buffer: DynamicDataView): void { +function serializeCosmeticFilters(filters: CosmeticFilter[], buffer: StaticDataView): void { buffer.pushUint32(filters.length); for (let i = 0; i < filters.length; i += 1) { serializeCosmeticFilter(filters[i], buffer); @@ -202,7 +200,7 @@ function serializeCosmeticFilters(filters: CosmeticFilter[], buffer: DynamicData } function deserializeNetworkFilters( - buffer: DynamicDataView, + buffer: StaticDataView, allFilters: Map, ): NetworkFilter[] { const length = buffer.getUint32(); @@ -217,7 +215,7 @@ 
function deserializeNetworkFilters( } function deserializeCosmeticFilters( - buffer: DynamicDataView, + buffer: StaticDataView, allFilters: Map, ): CosmeticFilter[] { const length = buffer.getUint32(); @@ -231,13 +229,13 @@ function deserializeCosmeticFilters( return filters; } -function serializeLists(buffer: DynamicDataView, lists: Map): void { +function serializeLists(buffer: StaticDataView, lists: Map): void { // Serialize number of lists buffer.pushUint8(lists.size); lists.forEach((list, asset) => { - buffer.pushStr(asset); - buffer.pushStr(list.checksum); + buffer.pushASCII(asset); + buffer.pushASCII(list.checksum); serializeCosmeticFilters(list.cosmetics, buffer); serializeNetworkFilters(list.csp, buffer); serializeNetworkFilters(list.exceptions, buffer); @@ -248,7 +246,7 @@ function serializeLists(buffer: DynamicDataView, lists: Map): voi } function deserializeLists( - buffer: DynamicDataView, + buffer: StaticDataView, ): { cosmeticFilters: Map; networkFilters: Map; @@ -261,8 +259,8 @@ function deserializeLists( // Get number of assets const size = buffer.getUint8(); for (let i = 0; i < size; i += 1) { - lists.set(buffer.getStr(), { - checksum: buffer.getStr(), + lists.set(buffer.getASCII(), { + checksum: buffer.getASCII(), cosmetics: deserializeCosmeticFilters(buffer, cosmeticFilters), csp: deserializeNetworkFilters(buffer, networkFilters), exceptions: deserializeNetworkFilters(buffer, networkFilters), @@ -279,7 +277,7 @@ function deserializeLists( }; } -function serializeBucket(token: number, filters: T[], buffer: DynamicDataView) { +function serializeBucket(token: number, filters: T[], buffer: StaticDataView) { buffer.pushUint16(filters.length); buffer.pushUint32(token); @@ -289,7 +287,7 @@ function serializeBucket(token: number, filters: T[], buffer: } function deserializeBucket( - buffer: DynamicDataView, + buffer: StaticDataView, filters: Map, ): { token: number; @@ -315,7 +313,7 @@ function deserializeBucket( function serializeReverseIndex( 
reverseIndex: ReverseIndex, - buffer: DynamicDataView, + buffer: StaticDataView, ): void { const index = reverseIndex.index; @@ -328,7 +326,7 @@ function serializeReverseIndex( } function deserializeReverseIndex( - buffer: DynamicDataView, + buffer: StaticDataView, index: ReverseIndex, filters: Map, ): ReverseIndex { @@ -348,21 +346,21 @@ function deserializeReverseIndex( return index; } -function serializeResources(engine: Engine, buffer: DynamicDataView): void { +function serializeResources(engine: Engine, buffer: StaticDataView): void { // Serialize `resourceChecksum` - buffer.pushStr(engine.resourceChecksum); + buffer.pushASCII(engine.resourceChecksum); // Serialize `resources` buffer.pushUint8(engine.resources.size); engine.resources.forEach(({ contentType, data }, name) => { - buffer.pushStr(name); - buffer.pushStr(contentType); - buffer.pushStr(data); + buffer.pushASCII(name); + buffer.pushASCII(contentType); + buffer.pushASCII(data); }); } function deserializeResources( - buffer: DynamicDataView, + buffer: StaticDataView, ): { js: Map; resources: Map; @@ -370,14 +368,14 @@ function deserializeResources( } { const js = new Map(); const resources = new Map(); - const resourceChecksum = buffer.getStr() || ''; + const resourceChecksum = buffer.getASCII() || ''; // Deserialize `resources` const resourcesSize = buffer.getUint8(); for (let i = 0; i < resourcesSize; i += 1) { - resources.set(buffer.getStr(), { - contentType: buffer.getStr(), - data: buffer.getStr(), + resources.set(buffer.getASCII(), { + contentType: buffer.getASCII(), + data: buffer.getASCII(), }); } @@ -402,8 +400,8 @@ function deserializeResources( */ function serializeEngine(engine: Engine): Uint8Array { // Create a big buffer! It does not have to be the right size since - // `DynamicDataView` is able to resize itself dynamically if needed. - const buffer = new DynamicDataView(4000000); + // `StaticDataView` is able to resize itself dynamically if needed. 
+ const buffer = new StaticDataView(8000000); buffer.pushUint8(engine.version); @@ -430,8 +428,7 @@ function serializeEngine(engine: Engine): Uint8Array { } function deserializeEngine(serialized: Uint8Array, version: number): Engine { - const buffer = new DynamicDataView(0); - buffer.set(serialized); + const buffer = new StaticDataView(0, serialized); // Before starting deserialization, we make sure that the version of the // serialized engine is the same as the current source code. If not, we start diff --git a/src/utils.ts b/src/utils.ts index 003a4d5ebf..ca4ad6f6d9 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -267,3 +267,12 @@ export function updateResponseHeadersWithCSP( return { responseHeaders }; } + +export function hasUnicode(str: string): boolean { + for (let i = 0; i < str.length; i += 1) { + if (str.charCodeAt(i) > 127) { + return true; + } + } + return false; +} diff --git a/test/serialization.test.ts b/test/serialization.test.ts index 138bc0f29c..ff5dae7f7e 100644 --- a/test/serialization.test.ts +++ b/test/serialization.test.ts @@ -1,6 +1,6 @@ import { loadAllLists, loadResources } from './utils'; -import DynamicDataView from '../src/dynamic-data-view'; +import StaticDataView from '../src/data-view'; import Engine from '../src/engine/engine'; import ReverseIndex from '../src/engine/reverse-index'; import { parseList } from '../src/parsing/list'; @@ -20,20 +20,21 @@ describe('Serialization', () => { const { networkFilters, cosmeticFilters } = parseList(loadAllLists()); describe('filters', () => { + const buffer = new StaticDataView(1000000); it('cosmetic', () => { cosmeticFilters.forEach((filter) => { - const buffer = new DynamicDataView(100); + buffer.seekZero(); serializeCosmeticFilter(filter, buffer); - buffer.seek(0); + buffer.seekZero(); expect(deserializeCosmeticFilter(buffer)).toEqual(filter); }); }); it('network', () => { networkFilters.forEach((filter) => { - const buffer = new DynamicDataView(100); + buffer.seekZero(); 
serializeNetworkFilter(filter, buffer); - buffer.seek(0); + buffer.seekZero(); expect(deserializeNetworkFilter(buffer)).toEqual(filter); }); }); @@ -54,9 +55,9 @@ describe('Serialization', () => { ); // Serialize index - const buffer = new DynamicDataView(4000000); + const buffer = new StaticDataView(4000000); serializeReverseIndex(reverseIndex, buffer); - buffer.seek(0); + buffer.seekZero(); const deserialized: any = {}; deserializeReverseIndex(buffer, deserialized, filters);