Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Opt in crawl #71

Merged
merged 5 commits into from
Jul 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Available options:
- `-a, --disable-anti-bot` - disable the simple built-in anti-bot detection script injected into every frame
- `--chromium-version <version_number>` - use custom version of Chromium (e.g. "843427") instead of using the default
- `--config <path>` - path to a config file that allows you to set all the above settings (and more). Note that CLI flags have a higher priority than settings passed via config. You can find a sample config file in `tests/cli/sampleConfig.json`.
- `--run-autoconsent` - enable autoconsent opt-out of cookie popups (requires the `cmps` collector)
- `--autoconsent-action <action>` - automatic autoconsent action (requires the `cmps` collector). Possible values: optIn, optOut

### Use it as a module

Expand Down
4 changes: 2 additions & 2 deletions cli/crawl-cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ program
.option('-r, --region-code <region>', 'optional 2 letter region code. Used for metadata only.')
.option('-a, --disable-anti-bot', 'disable anti bot detection protections injected to every frame')
.option('--config <path>', 'crawl configuration file')
.option('--run-autoconsent', 'run autoconsent opt-outs on pages')
.option('--autoconsent-action <action>', 'dismiss cookie popups. Possible values: optout, optin')
.option('--chromium-version <version_number>', 'use custom version of chromium')
.parse(process.argv);

Expand Down Expand Up @@ -205,7 +205,7 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
// @ts-ignore
const config = crawlConfig.figureOut(program);
const collectorFlags = {
runAutoconsent: Boolean(program.runAutoconsent),
autoconsentAction: program.autoconsentAction,
};
/**
* @type {BaseCollector[]}
Expand Down
1 change: 1 addition & 0 deletions collectors/APICalls/TrackerTracker.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class TrackerTracker {
const error = (typeof e === 'string') ? e : e.message;
if (
!error.includes('Target closed.') && // we don't care if tab was closed during this operation
!error.includes('Session closed.') && // we don't care if tab was closed during this operation
!error.includes('Breakpoint at specified location already exists.') &&
!error.includes('Cannot find context with specified id') &&
!error.includes('API unavailable in given context.') // some APIs are unavailable on HTTP or in a worker
Expand Down
2 changes: 1 addition & 1 deletion collectors/BaseCollector.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class BaseCollector {
* @property {import('puppeteer').BrowserContext} context
* @property {URL} url
* @property {function(...any):void} log
* @property {Object.<string, boolean>} collectorFlags
* @property {Object.<string, string>} collectorFlags
*/

module.exports = BaseCollector;
92 changes: 59 additions & 33 deletions collectors/CMPCollector.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,32 @@ const BaseCollector = require('./BaseCollector');

// @ts-ignore
const baseContentScript = fs.readFileSync(
path.join(__dirname, "../node_modules/@duckduckgo/autoconsent/dist/autoconsent.standalone.js"),
path.join(__dirname, "../node_modules/@duckduckgo/autoconsent/dist/autoconsent.playwright.js"),
sammacbeth marked this conversation as resolved.
Show resolved Hide resolved
"utf8"
);

const contentScript = `
window.autoconsentSendMessage = (msg) => {
window.cdpAutoconsentSendMessage(JSON.stringify(msg));
};
` + baseContentScript;

const worldName = 'cmpcollector';

/**
* @param {import('@duckduckgo/autoconsent/lib/types').Config} config
* @param {String|Error} e
*/
function generateContentScript(config) {
return baseContentScript + `
window.initAutoconsentStandalone(${JSON.stringify(config)});
`;
function isIgnoredEvalError(e) {
// ignore evaluation errors (sometimes frames reload too fast)
const error = (typeof e === 'string') ? e : e.message;
return (
error.includes('No frame for given id found') ||
error.includes('Target closed.') ||
error.includes('Session closed.') ||
error.includes('Cannot find context with specified id')
sammacbeth marked this conversation as resolved.
Show resolved Hide resolved
);
}

const worldName = 'cmpcollector';
class CMPCollector extends BaseCollector {

id() {
Expand All @@ -31,15 +43,8 @@ class CMPCollector extends BaseCollector {
*/
init(options) {
this.log = options.log;
this.doOptOut = options.collectorFlags.runAutoconsent;
this.shortTimeouts = options.collectorFlags.shortTimeouts; // used to speed up unit tests
this.contentScript = generateContentScript({
enabled: true,
autoAction: this.doOptOut ? 'optOut' : null,
disabledCmps: [],
enablePrehide: true,
detectRetries: 20,
});
this.autoAction = /** @type {import('@duckduckgo/autoconsent/lib/types').AutoAction} */ (options.collectorFlags.autoconsentAction);
/** @type {import('@duckduckgo/autoconsent/lib/messages').ContentScriptMessage[]} */
this.receivedMsgs = [];
this.selfTestFrame = null;
Expand Down Expand Up @@ -95,27 +100,30 @@ class CMPCollector extends BaseCollector {
});
this.isolated2pageworld.set(executionContextId, context.id);
await this._cdpClient.send('Runtime.evaluate', {
expression: this.contentScript,
expression: contentScript,
contextId: executionContextId,
});
} catch (e) {
// ignore evaluation errors (sometimes frames reload too fast)
this.log(`Error evaluating content script: ${e}`);
if (!isIgnoredEvalError(e)) {
this.log(`Error evaluating content script: ${e}`);
}
}
});

this._cdpClient.on('Runtime.bindingCalled', async ({name, payload, executionContextId}) => {
if (name === 'autoconsentStandaloneSendMessage') {
if (name === 'cdpAutoconsentSendMessage') {
try {
const msg = JSON.parse(payload);
await this.handleMessage(msg, executionContextId);
} catch (e) {
this.log(`Could not handle autoconsent message ${payload}`);
if (!isIgnoredEvalError(e)) {
this.log(`Could not handle autoconsent message ${payload}`, e);
}
}
}
});
await this._cdpClient.send('Runtime.addBinding', {
name: 'autoconsentStandaloneSendMessage',
name: 'cdpAutoconsentSendMessage',
executionContextName: worldName,
});
}
Expand All @@ -131,6 +139,21 @@ class CMPCollector extends BaseCollector {
async handleMessage(msg, executionContextId) {
this.receivedMsgs.push(msg);
switch (msg.type) {
case 'init': {
/** @type {import('@duckduckgo/autoconsent/lib/types').Config} */
const autoconsentConfig = {
enabled: true,
autoAction: this.autoAction,
disabledCmps: [],
enablePrehide: true,
detectRetries: 20,
};
await this._cdpClient.send('Runtime.evaluate', {
expression: `autoconsentReceiveMessage({ type: "initResp", config: ${JSON.stringify(autoconsentConfig)} })`,
contextId: executionContextId,
});
break;
}
case 'optInResult':
case 'optOutResult': {
if (msg.scheduleSelfTest) {
Expand All @@ -141,7 +164,7 @@ class CMPCollector extends BaseCollector {
case 'autoconsentDone': {
if (this.selfTestFrame) {
await this._cdpClient.send('Runtime.evaluate', {
expression: `autoconsentStandaloneReceiveMessage({ type: "selfTest" })`,
expression: `autoconsentReceiveMessage({ type: "selfTest" })`,
allowUnsafeEvalBlockedByCSP: true,
contextId: this.selfTestFrame,
});
Expand All @@ -161,7 +184,7 @@ class CMPCollector extends BaseCollector {
}

await this._cdpClient.send('Runtime.evaluate', {
expression: `autoconsentStandaloneReceiveMessage({ id: "${msg.id}", type: "evalResp", result: ${JSON.stringify(evalResult)} })`,
expression: `autoconsentReceiveMessage({ id: "${msg.id}", type: "evalResp", result: ${JSON.stringify(evalResult)} })`,
allowUnsafeEvalBlockedByCSP: true,
contextId: executionContextId,
});
Expand Down Expand Up @@ -204,17 +227,18 @@ class CMPCollector extends BaseCollector {
return;
}

if (!this.doOptOut) {
if (!this.autoAction) {
return;
}

// did we opt-out?
const optOutDone = /** @type {import('@duckduckgo/autoconsent/lib/messages').OptOutResultMessage} */ (await this.waitForMessage({
type: 'optOutResult',
const resultType = this.autoAction === 'optOut' ? 'optOutResult' : 'optInResult';
const autoActionDone = /** @type {import('@duckduckgo/autoconsent/lib/messages').OptOutResultMessage|import('@duckduckgo/autoconsent/lib/messages').OptInResultMessage} */ (await this.waitForMessage({
type: resultType,
cmp: detectedMsg.cmp
}));
if (optOutDone) {
if (!optOutDone.result) {
if (autoActionDone) {
if (!autoActionDone.result) {
return;
}
}
Expand Down Expand Up @@ -280,14 +304,16 @@ class CMPCollector extends BaseCollector {
const found = this.findMessage({type: 'popupFound', cmp: msg.cmp});
if (found) {
result.open = true;
if (this.doOptOut) {
if (this.autoAction) {
const resultType = this.autoAction === 'optOut' ? 'optOutResult' : 'optInResult';
result.started = true;
const optOutResult = /** @type {import('@duckduckgo/autoconsent/lib/messages').OptOutResultMessage} */ (this.findMessage({
type: 'optOutResult',
const autoActionResult = /** @type {import('@duckduckgo/autoconsent/lib/messages').OptOutResultMessage|import('@duckduckgo/autoconsent/lib/messages').OptInResultMessage} */ (this.findMessage({
type: resultType,
cmp: msg.cmp,
}));

result.succeeded = optOutResult.result;
if (autoActionResult) {
result.succeeded = autoActionResult.result;
}
}
}
results.push(result);
Expand Down
4 changes: 2 additions & 2 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ function openBrowser(log, proxyHost, executablePath) {
/**
* @param {puppeteer.BrowserContext} context
* @param {URL} url
* @param {{collectors: import('./collectors/BaseCollector')[], log: function(...any):void, urlFilter: function(string, string):boolean, emulateMobile: boolean, emulateUserAgent: boolean, runInEveryFrame: function():void, maxLoadTimeMs: number, extraExecutionTimeMs: number, collectorFlags: Object.<string, boolean>}} data
* @param {{collectors: import('./collectors/BaseCollector')[], log: function(...any):void, urlFilter: function(string, string):boolean, emulateMobile: boolean, emulateUserAgent: boolean, runInEveryFrame: function():void, maxLoadTimeMs: number, extraExecutionTimeMs: number, collectorFlags: Object.<string, string>}} data
*
* @returns {Promise<CollectResult>}
*/
Expand Down Expand Up @@ -277,7 +277,7 @@ function isThirdPartyRequest(documentUrl, requestUrl) {

/**
* @param {URL} url
* @param {{collectors?: import('./collectors/BaseCollector')[], log?: function(...any):void, filterOutFirstParty?: boolean, emulateMobile?: boolean, emulateUserAgent?: boolean, proxyHost?: string, browserContext?: puppeteer.BrowserContext, runInEveryFrame?: function():void, executablePath?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>}} options
* @param {{collectors?: import('./collectors/BaseCollector')[], log?: function(...any):void, filterOutFirstParty?: boolean, emulateMobile?: boolean, emulateUserAgent?: boolean, proxyHost?: string, browserContext?: puppeteer.BrowserContext, runInEveryFrame?: function():void, executablePath?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, string>}} options
* @returns {Promise<CollectResult>}
*/
module.exports = async (url, options) => {
Expand Down
2 changes: 1 addition & 1 deletion crawlerConductor.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ const MAX_NUMBER_OF_RETRIES = 2;
* @param {string} executablePath
* @param {number} maxLoadTimeMs
* @param {number} extraExecutionTimeMs
* @param {Object.<string, boolean>} collectorFlags
* @param {Object.<string, string>} collectorFlags
*/
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) {
const url = new URL(urlString);
Expand Down
Loading