From 99fc4b6f87e247b125f4c2d07970e03268599609 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 19 Sep 2024 16:35:06 +0200 Subject: [PATCH 01/44] deprecated --- definitions/output/response_bodies.js | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 definitions/output/response_bodies.js diff --git a/definitions/output/response_bodies.js b/definitions/output/response_bodies.js deleted file mode 100644 index 20af03e..0000000 --- a/definitions/output/response_bodies.js +++ /dev/null @@ -1,20 +0,0 @@ -const current_month_underscored = constants.fn_date_underscored(constants.current_month); - -constants.clients.forEach(client => { - publish(current_month_underscored + "_" + client, { - type: "table", - schema: "response_bodies", - tags: ["crawl_results_legacy"] - }).query(ctx => ` -SELECT - page, - url, - SUBSTRING(response_body, 0, 2 * 1024 * 1024) AS response_body, - LENGTH(response_body) >= 2 * 1024 * 1024 AS truncated -FROM ${ctx.ref("all", "requests")} -WHERE date = '${constants.current_month}' AND - client = '${client}' AND - is_root_page AND - response_body IS NOT NULL - `); -}); From 0dbb96cb9d17874deacd65a12b06e91687ee2533 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:15:02 +0200 Subject: [PATCH 02/44] backfill draft --- definitions/output/all/requests.js | 84 ++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index fd8c1e8..0043ef1 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -51,3 +51,87 @@ SELECT * EXCEPT (rank) FROM ${ctx.ref("crawl_staging", "requests")} ${constants.dev_TABLESAMPLE} WHERE date = '${constants.current_month}' AND client = 'mobile' AND is_root_page = FALSE AND (type != 'script' OR type IS NULL) `) + +let monthRange = []; +for ( + let month = 
'2016-01-01'; + month < '2022-07-01'; + month = constants.fn_past_month(month)) { + monthRange.push(month) +} + +monthRange.forEach((month, i) => { + operate(`requests_backfill_from_response_bodies ${month}`).tags([ + "response_bodies_deprecated" + ]).queries(ctx => ` +DELETE FROM ${ctx.resolve("all", "requests")} +WHERE date = '${month}'; + +INSERT INTO ${ctx.resolve("all", "requests")} +SELECT + '${month}' AS date, + COALESCE(response_bodies._TABLE_SUFFIX, requests._TABLE_SUFFIX) AS client, + COALESCE(response_bodies.page, requests.page) AS page, + TRUE AS is_root_page, + COALESCE(response_bodies.page, requests.page) AS root_page, + COALESCE(response_bodies.url, requests.url) AS url, + IF( + AND( + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._request_type') AS STRING) = "Document", + MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64) + ), + TRUE, + FALSE + ) AS is_main_document, + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64) AS index, + requests.payload AS payload, + JSON_OBJECT( + "time", SAFE_CAST(JSON_VALUE(requests.payload, '$.time') AS INTEGER), + "method", JSON_VALUE(requests.payload, '$._method'), + "redirectUrl", + "reqHttpVersion", JSON_VALUE(requests.payload, '$.request.httpVersion'), + "reqHeadersSize", JSON_VALUE(requests.payload, '$.request.headersSize'), + "reqBodySize", JSON_VALUE(requests.payload, '$.request.bodySize'), + "reqCookieLen", + "status", JSON_VALUE(requests.payload, '$.response.status'), + "respHttpVersion", JSON_VALUE(requests.payload, '$.response.httpVersion'), + "respHeadersSize", JSON_VALUE(requests.payload, '$.response.headersSize'), + "respBodySize", JSON_VALUE(requests.payload, '$.response.bodySize'), + "respSize", JSON_VALUE(requests.payload, '$.response.content.size'), + "respCookieLen", + "expAge", + "mimeType", JSON_VALUE(requests.payload, '$.response.content.mimeType'), + "_cdn_provider", 
JSON_VALUE(requests.payload, '$._cdn_provider'), + "_gzip_save", JSON_VALUE(requests.payload, '$._gzip_save'), + "ext", + "format", + ) AS summary, + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.request.headers') AS JSON) AS request_headers, + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.headers') AS JSON) AS response_headers, + response_bodies.response_body AS response_body +FROM ${ctx.resolve("response_bodies", `${constants.fn_date_underscored(month)}_*`)} AS response_bodies +FULL OUTER JOIN ${ctx.resolve("requests", `${constants.fn_date_underscored(month)}_*`)} AS requests +USING (page, url); + `) +}) + + + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.time') AS INTEGER) AS time, + JSON_EXTRACT_SCALAR(summary, '$.method') AS method, + JSON_EXTRACT_SCALAR(summary, '$.redirectUrl') AS redirectUrl, + JSON_EXTRACT_SCALAR(summary, '$.reqHttpVersion') AS reqHttpVersion, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqHeadersSize') AS INTEGER) AS reqHeadersSize, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqBodySize') AS INTEGER) AS reqBodySize, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqCookieLen') AS INTEGER) AS reqCookieLen, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.status') AS INTEGER) AS status, + JSON_EXTRACT_SCALAR(summary, '$.respHttpVersion') AS respHttpVersion, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respHeadersSize') AS INTEGER) AS respHeadersSize, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respBodySize') AS INTEGER) AS respBodySize, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respSize') AS INTEGER) AS respSize, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respCookieLen') AS INTEGER) AS respCookieLen, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.expAge') AS INTEGER) AS expAge, + JSON_EXTRACT_SCALAR(summary, '$.mimeType') AS mimeType, + JSON_EXTRACT_SCALAR(summary, '$._cdn_provider') AS _cdn_provider, + SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._gzip_save') AS INTEGER) AS _gzip_save, + JSON_EXTRACT_SCALAR(summary, '$.ext') AS ext, + 
JSON_EXTRACT_SCALAR(summary, '$.format') AS format, From 1c30ac315be658d0ec7380ea81595fa469a9ed12 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:18:09 +0200 Subject: [PATCH 03/44] cleanup --- definitions/output/all/requests.js | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index 0043ef1..5c89909 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -114,24 +114,3 @@ FULL OUTER JOIN ${ctx.resolve("requests", `${constants.fn_date_underscored(month USING (page, url); `) }) - - - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.time') AS INTEGER) AS time, - JSON_EXTRACT_SCALAR(summary, '$.method') AS method, - JSON_EXTRACT_SCALAR(summary, '$.redirectUrl') AS redirectUrl, - JSON_EXTRACT_SCALAR(summary, '$.reqHttpVersion') AS reqHttpVersion, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqHeadersSize') AS INTEGER) AS reqHeadersSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqBodySize') AS INTEGER) AS reqBodySize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqCookieLen') AS INTEGER) AS reqCookieLen, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.status') AS INTEGER) AS status, - JSON_EXTRACT_SCALAR(summary, '$.respHttpVersion') AS respHttpVersion, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respHeadersSize') AS INTEGER) AS respHeadersSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respBodySize') AS INTEGER) AS respBodySize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respSize') AS INTEGER) AS respSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respCookieLen') AS INTEGER) AS respCookieLen, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.expAge') AS INTEGER) AS expAge, - JSON_EXTRACT_SCALAR(summary, '$.mimeType') AS mimeType, - JSON_EXTRACT_SCALAR(summary, '$._cdn_provider') AS _cdn_provider, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._gzip_save') AS INTEGER) AS _gzip_save, 
- JSON_EXTRACT_SCALAR(summary, '$.ext') AS ext, - JSON_EXTRACT_SCALAR(summary, '$.format') AS format, From 78e4a23e59715d64efa6878c6ee47415712eb273 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:19:47 +0200 Subject: [PATCH 04/44] null placeholders --- definitions/output/all/requests.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index 5c89909..281632a 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -88,23 +88,23 @@ SELECT JSON_OBJECT( "time", SAFE_CAST(JSON_VALUE(requests.payload, '$.time') AS INTEGER), "method", JSON_VALUE(requests.payload, '$._method'), - "redirectUrl", + "redirectUrl", NULL, "reqHttpVersion", JSON_VALUE(requests.payload, '$.request.httpVersion'), "reqHeadersSize", JSON_VALUE(requests.payload, '$.request.headersSize'), "reqBodySize", JSON_VALUE(requests.payload, '$.request.bodySize'), - "reqCookieLen", + "reqCookieLen", NULL, "status", JSON_VALUE(requests.payload, '$.response.status'), "respHttpVersion", JSON_VALUE(requests.payload, '$.response.httpVersion'), "respHeadersSize", JSON_VALUE(requests.payload, '$.response.headersSize'), "respBodySize", JSON_VALUE(requests.payload, '$.response.bodySize'), "respSize", JSON_VALUE(requests.payload, '$.response.content.size'), - "respCookieLen", - "expAge", + "respCookieLen", NULL, + "expAge", NULL, "mimeType", JSON_VALUE(requests.payload, '$.response.content.mimeType'), "_cdn_provider", JSON_VALUE(requests.payload, '$._cdn_provider'), "_gzip_save", JSON_VALUE(requests.payload, '$._gzip_save'), - "ext", - "format", + "ext", NULL, + "format", NULL, ) AS summary, SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.request.headers') AS JSON) AS request_headers, SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.headers') AS JSON) AS response_headers, From 67387858cae5272f5ce86dd9242bd68875321277 
Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:26:26 +0200 Subject: [PATCH 05/44] sql fix --- definitions/output/all/requests.js | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index 281632a..16faeaf 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -76,10 +76,8 @@ SELECT COALESCE(response_bodies.page, requests.page) AS root_page, COALESCE(response_bodies.url, requests.url) AS url, IF( - AND( - SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._request_type') AS STRING) = "Document", - MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64) - ), + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._request_type') AS STRING) = "Document" AND + MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64), TRUE, FALSE ) AS is_main_document, @@ -104,10 +102,10 @@ SELECT "_cdn_provider", JSON_VALUE(requests.payload, '$._cdn_provider'), "_gzip_save", JSON_VALUE(requests.payload, '$._gzip_save'), "ext", NULL, - "format", NULL, + "format", NULL ) AS summary, - SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.request.headers') AS JSON) AS request_headers, - SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$.response.headers') AS JSON) AS response_headers, + JSON_QUERY(payload, '$.request.headers') AS request_headers, + JSON_QUERY(payload, '$.response.headers') AS response_headers, response_bodies.response_body AS response_body FROM ${ctx.resolve("response_bodies", `${constants.fn_date_underscored(month)}_*`)} AS response_bodies FULL OUTER JOIN ${ctx.resolve("requests", `${constants.fn_date_underscored(month)}_*`)} AS requests From 4d69fc6af754c9cf3405ac9b70128d0d2ec1f14c Mon Sep 17 00:00:00 2001 From: Max 
Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:36:26 +0200 Subject: [PATCH 06/44] fix month range --- definitions/output/all/requests.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index 16faeaf..9d8293f 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -54,8 +54,8 @@ WHERE date = '${constants.current_month}' AND client = 'mobile' AND is_root_page let monthRange = []; for ( - let month = '2016-01-01'; - month < '2022-07-01'; + let month = '2022-06-01'; + month >= '2016-01-01'; month = constants.fn_past_month(month)) { monthRange.push(month) } From 52a8eec0d2d1a1c083dbbbbdd44c186a2095e440 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:40:01 +0200 Subject: [PATCH 07/44] literal table names --- definitions/output/all/requests.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index 9d8293f..9dba572 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -61,8 +61,8 @@ for ( } monthRange.forEach((month, i) => { - operate(`requests_backfill_from_response_bodies ${month}`).tags([ - "response_bodies_deprecated" + operate(`requests_backfill ${month}`).tags([ + "requests_backfill" ]).queries(ctx => ` DELETE FROM ${ctx.resolve("all", "requests")} WHERE date = '${month}'; @@ -107,8 +107,8 @@ SELECT JSON_QUERY(payload, '$.request.headers') AS request_headers, JSON_QUERY(payload, '$.response.headers') AS response_headers, response_bodies.response_body AS response_body -FROM ${ctx.resolve("response_bodies", `${constants.fn_date_underscored(month)}_*`)} AS response_bodies -FULL OUTER JOIN ${ctx.resolve("requests", `${constants.fn_date_underscored(month)}_*`)} AS requests +FROM 
\`response_bodies.${constants.fn_date_underscored(month)}_*\` AS response_bodies +FULL OUTER JOIN \`requests.${constants.fn_date_underscored(month)}_*\` AS requests USING (page, url); `) }) From a7b7a533c39dd1116540142d2ef2308946df2e44 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Fri, 27 Sep 2024 20:23:09 +0000 Subject: [PATCH 08/44] backfill tested --- definitions/extra/test_env.js | 22 +-- definitions/output/all/requests.js | 61 -------- definitions/output/all/requests_backfill.js | 163 ++++++++++++++++++++ includes/constants.js | 4 +- 4 files changed, 177 insertions(+), 73 deletions(-) create mode 100644 definitions/output/all/requests_backfill.js diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index 57f56bc..1bf34b8 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,26 +1,28 @@ -const two_months_ago = constants.fn_past_month(constants.fn_past_month(constants.current_month)); +const last_month = constants.fn_past_month(constants.current_month); operate("test_env", { hasOutput: true, disabled: true // MUST NOT be commented in main branch }).queries(ctx => ` -CREATE OR REPLACE TABLE ${ctx.ref("all", "pages")} AS -SELECT * -FROM httparchive.all.pages ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; - CREATE OR REPLACE TABLE ${ctx.ref("all", "requests")} AS SELECT * FROM httparchive.all.requests ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +WHERE date = '${last_month}'; +`) + +/* +CREATE OR REPLACE TABLE ${ctx.ref("all", "pages")} AS +SELECT * +FROM httparchive.all.pages ${constants.dev_TABLESAMPLE} +WHERE date = '${last_month}'; CREATE OR REPLACE TABLE ${ctx.ref("all", "parsed_css")} AS SELECT * FROM httparchive.all.parsed_css ${constants.dev_TABLESAMPLE} -WHERE date = '${two_months_ago}'; +WHERE date = '${last_month}'; CREATE OR REPLACE TABLE ${ctx.ref("core_web_vitals", "technologies")} AS SELECT * FROM httparchive.core_web_vitals.technologies -WHERE date = 
'${two_months_ago}' -`) +WHERE date = '${last_month}' +*/ \ No newline at end of file diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index 9dba572..fd8c1e8 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -51,64 +51,3 @@ SELECT * EXCEPT (rank) FROM ${ctx.ref("crawl_staging", "requests")} ${constants.dev_TABLESAMPLE} WHERE date = '${constants.current_month}' AND client = 'mobile' AND is_root_page = FALSE AND (type != 'script' OR type IS NULL) `) - -let monthRange = []; -for ( - let month = '2022-06-01'; - month >= '2016-01-01'; - month = constants.fn_past_month(month)) { - monthRange.push(month) -} - -monthRange.forEach((month, i) => { - operate(`requests_backfill ${month}`).tags([ - "requests_backfill" - ]).queries(ctx => ` -DELETE FROM ${ctx.resolve("all", "requests")} -WHERE date = '${month}'; - -INSERT INTO ${ctx.resolve("all", "requests")} -SELECT - '${month}' AS date, - COALESCE(response_bodies._TABLE_SUFFIX, requests._TABLE_SUFFIX) AS client, - COALESCE(response_bodies.page, requests.page) AS page, - TRUE AS is_root_page, - COALESCE(response_bodies.page, requests.page) AS root_page, - COALESCE(response_bodies.url, requests.url) AS url, - IF( - SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._request_type') AS STRING) = "Document" AND - MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64), - TRUE, - FALSE - ) AS is_main_document, - SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64) AS index, - requests.payload AS payload, - JSON_OBJECT( - "time", SAFE_CAST(JSON_VALUE(requests.payload, '$.time') AS INTEGER), - "method", JSON_VALUE(requests.payload, '$._method'), - "redirectUrl", NULL, - "reqHttpVersion", JSON_VALUE(requests.payload, '$.request.httpVersion'), - "reqHeadersSize", JSON_VALUE(requests.payload, '$.request.headersSize'), - "reqBodySize", JSON_VALUE(requests.payload, 
'$.request.bodySize'), - "reqCookieLen", NULL, - "status", JSON_VALUE(requests.payload, '$.response.status'), - "respHttpVersion", JSON_VALUE(requests.payload, '$.response.httpVersion'), - "respHeadersSize", JSON_VALUE(requests.payload, '$.response.headersSize'), - "respBodySize", JSON_VALUE(requests.payload, '$.response.bodySize'), - "respSize", JSON_VALUE(requests.payload, '$.response.content.size'), - "respCookieLen", NULL, - "expAge", NULL, - "mimeType", JSON_VALUE(requests.payload, '$.response.content.mimeType'), - "_cdn_provider", JSON_VALUE(requests.payload, '$._cdn_provider'), - "_gzip_save", JSON_VALUE(requests.payload, '$._gzip_save'), - "ext", NULL, - "format", NULL - ) AS summary, - JSON_QUERY(payload, '$.request.headers') AS request_headers, - JSON_QUERY(payload, '$.response.headers') AS response_headers, - response_bodies.response_body AS response_body -FROM \`response_bodies.${constants.fn_date_underscored(month)}_*\` AS response_bodies -FULL OUTER JOIN \`requests.${constants.fn_date_underscored(month)}_*\` AS requests -USING (page, url); - `) -}) diff --git a/definitions/output/all/requests_backfill.js b/definitions/output/all/requests_backfill.js new file mode 100644 index 0000000..f0df478 --- /dev/null +++ b/definitions/output/all/requests_backfill.js @@ -0,0 +1,163 @@ + +let monthRange = []; +for ( + let month = '2022-08-01'; + month >= '2022-08-01'; //2016-01-01 + month = constants.fn_past_month(month)) { + monthRange.push(month) +} + +monthRange.forEach((month, i) => { + constants.clients.forEach(client => { + operate(`requests_backfill ${month}_${client}`).tags([ + "requests_backfill" + ]).queries(ctx => ` +DELETE FROM ${ctx.resolve("all", "requests")} +WHERE date = '${month}'; + +CREATE TEMP FUNCTION get_ext_from_url(url STRING) +RETURNS STRING +LANGUAGE js +AS """ + try { + let ret_ext = url; + + // Remove query parameters + const i_q = ret_ext.indexOf("?"); + if (i_q > -1) { + ret_ext = ret_ext.substring(0, i_q); + } + + // Get the last 
segment of the path after the last "/" + ret_ext = ret_ext.substring(ret_ext.lastIndexOf("/") + 1); + + // Find the position of the last dot + const i_dot = ret_ext.lastIndexOf("."); + + if (i_dot === -1) { + // No dot means no extension + ret_ext = ""; + } else { + // Extract the extension + ret_ext = ret_ext.substring(i_dot + 1); + + // Weed out overly long extensions + if (ret_ext.length > 5) { + ret_ext = ""; + } + } + + return ret_ext.toLowerCase(); + } catch (e) { + return ""; // Return an empty string in case of any errors + } +"""; + +CREATE TEMP FUNCTION get_type(mime_typ STRING, ext STRING) +RETURNS STRING +LANGUAGE js +AS """ + try { + mime_typ = mime_typ.toLowerCase(); + + // Order by most unique types first + const uniqueTypes = ["font", "css", "image", "script", "video", "audio", "xml"]; + for (let typ of uniqueTypes) { + if (mime_typ.includes(typ)) { + return typ; + } + } + + // Special cases + if (mime_typ.includes("json") || ["js", "json"].includes(ext)) { + return "script"; + } else if (["eot", "ttf", "woff", "woff2", "otf"].includes(ext)) { + return "font"; + } else if ( + ["png", "gif", "jpg", "jpeg", "webp", "ico", "svg", "avif", "jxl", "heic", "heif"].includes(ext) + ) { + return "image"; + } else if (ext === "css") { + return "css"; + } else if (ext === "xml") { + return "xml"; + } else if ( + ["mp4", "webm", "ts", "m4v", "m4s", "mov", "ogv", "swf", "f4v", "flv"].includes(ext) || + ["flash", "webm", "mp4", "flv"].some(typ => mime_typ.includes(typ)) + ) { + return "video"; + } else if (mime_typ.includes("wasm") || ext === "wasm") { + return "wasm"; + } else if (mime_typ.includes("html") || ["html", "htm"].includes(ext)) { + return "html"; + } else if (mime_typ.includes("text")) { + // Put "text" last because it is often misused, so extension should take precedence. 
+ return "text"; + } else { + return "other"; + } + } catch (e) { + return "other"; // Return "other" if there's any error + } +"""; + +CREATE TEMP FUNCTION parse_headers(headers STRING) +RETURNS ARRAY> +LANGUAGE js +AS """ + try { + return JSON.parse(headers).map(header => { + return { name: header.name, value: header.value }; + }); + } catch (e) { + return []; + } +"""; + +INSERT INTO ${ctx.resolve("all", "requests")} +SELECT + DATE('${month}') AS date, + '${client}' AS client, + requests.page AS page, + TRUE AS is_root_page, + requests.page AS root_page, + requests.url AS url, + IF( + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._request_type') AS STRING) = "Document" AND + MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64), + TRUE, + FALSE + ) AS is_main_document, + get_type(JSON_VALUE(requests.payload, '$.response.content.mimeType'), get_ext_from_url(requests.url)) AS type, + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64) AS index, + requests.payload AS payload, + TO_JSON_STRING( STRUCT( + SAFE_CAST(JSON_VALUE(requests.payload, '$.time') AS INTEGER) AS time, + JSON_VALUE(requests.payload, '$._method') AS method, + NULL AS redirectUrl, + JSON_VALUE(requests.payload, '$.request.httpVersion') AS reqHttpVersion, + JSON_VALUE(requests.payload, '$.request.headersSize') AS reqHeadersSize, + JSON_VALUE(requests.payload, '$.request.bodySize') AS reqBodySize, + NULL AS reqCookieLen, + JSON_VALUE(requests.payload, '$.response.status') AS status, + JSON_VALUE(requests.payload, '$.response.httpVersion') AS respHttpVersion, + JSON_VALUE(requests.payload, '$.response.headersSize') AS respHeadersSize, + JSON_VALUE(requests.payload, '$.response.bodySize') AS respBodySize, + JSON_VALUE(requests.payload, '$.response.content.size') AS respSize, + NULL AS respCookieLen, + NULL AS expAge, + JSON_VALUE(requests.payload, '$.response.content.mimeType') AS mimeType, + 
JSON_VALUE(requests.payload, '$._cdn_provider') AS _cdn_provide, + JSON_VALUE(requests.payload, '$._gzip_save') AS _gzip_save, + NULL AS ext, + NULL AS format + )) AS summary, + parse_headers(JSON_QUERY(payload, '$.request.headers')) AS request_headers, + parse_headers(JSON_QUERY(payload, '$.response.headers')) AS response_headers, + response_bodies.body AS response_body +FROM requests.${constants.fn_date_underscored(month)}_${client} AS requests ${constants.dev_TABLESAMPLE} +LEFT JOIN response_bodies.${constants.fn_date_underscored(month)}_${client} AS response_bodies ${constants.dev_TABLESAMPLE} +USING (page, url); + `) + }) +}) \ No newline at end of file diff --git a/includes/constants.js b/includes/constants.js index 40054b2..441dd51 100644 --- a/includes/constants.js +++ b/includes/constants.js @@ -11,7 +11,7 @@ const booleans = ['TRUE', 'FALSE'], [ dev_TABLESAMPLE, - dev_rank5000_filter + dev_rank_filter ] = dataform.projectConfig.vars.env_name == "dev" ? [ "TABLESAMPLE SYSTEM (0.001 PERCENT)", "AND rank = 5000" @@ -25,5 +25,5 @@ module.exports = { clients, booleans, dev_TABLESAMPLE, - dev_rank5000_filter + dev_rank_filter }; From 96fee1506abb12a7b89b4f6f5611ae45c295e27c Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Fri, 27 Sep 2024 20:24:41 +0000 Subject: [PATCH 09/44] dates reset --- definitions/output/all/requests_backfill.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/all/requests_backfill.js b/definitions/output/all/requests_backfill.js index f0df478..e5881e5 100644 --- a/definitions/output/all/requests_backfill.js +++ b/definitions/output/all/requests_backfill.js @@ -1,8 +1,8 @@ let monthRange = []; for ( - let month = '2022-08-01'; - month >= '2022-08-01'; //2016-01-01 + let month = '2022-06-01'; + month >= '2016-01-01'; month = constants.fn_past_month(month)) { monthRange.push(month) } From a82927c43f20bdd1f9d0bdf9de303080f4e5525e Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Fri, 27 Sep 2024 
23:13:48 +0000 Subject: [PATCH 10/44] requests_summary --- .../output/all/requests_summary_backfill.js | 195 ++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 definitions/output/all/requests_summary_backfill.js diff --git a/definitions/output/all/requests_summary_backfill.js b/definitions/output/all/requests_summary_backfill.js new file mode 100644 index 0000000..4039e91 --- /dev/null +++ b/definitions/output/all/requests_summary_backfill.js @@ -0,0 +1,195 @@ +let monthRange = []; +for ( + let month = '2015-12-01'; + month >= '2011-06-01'; + month = constants.fn_past_month(month)) { + monthRange.push(month) +} + +monthRange.forEach((month, i) => { + if(month > "2014-06-01"){ + flag1 = true; + } else { + flag1 = false; + } + + constants.clients.forEach(client => { + operate(`requests_backfill_summary ${month}_${client}`).tags([ + "requests_backfill" + ]).queries(ctx => ` +DELETE FROM ${ctx.resolve("all", "requests")} +WHERE date = '${month}'; + +CREATE TEMP FUNCTION get_ext_from_url(url STRING) +RETURNS STRING +LANGUAGE js +AS """ + try { + let ret_ext = url; + + // Remove query parameters + const i_q = ret_ext.indexOf("?"); + if (i_q > -1) { + ret_ext = ret_ext.substring(0, i_q); + } + + // Get the last segment of the path after the last "/" + ret_ext = ret_ext.substring(ret_ext.lastIndexOf("/") + 1); + + // Find the position of the last dot + const i_dot = ret_ext.lastIndexOf("."); + + if (i_dot === -1) { + // No dot means no extension + ret_ext = ""; + } else { + // Extract the extension + ret_ext = ret_ext.substring(i_dot + 1); + + // Weed out overly long extensions + if (ret_ext.length > 5) { + ret_ext = ""; + } + } + + return ret_ext.toLowerCase(); + } catch (e) { + return ""; // Return an empty string in case of any errors + } +"""; + +CREATE TEMP FUNCTION get_type(mime_typ STRING, ext STRING) +RETURNS STRING +LANGUAGE js +AS """ + try { + mime_typ = mime_typ.toLowerCase(); + + // Order by most unique types first + const uniqueTypes = 
["font", "css", "image", "script", "video", "audio", "xml"]; + for (let typ of uniqueTypes) { + if (mime_typ.includes(typ)) { + return typ; + } + } + + // Special cases + if (mime_typ.includes("json") || ["js", "json"].includes(ext)) { + return "script"; + } else if (["eot", "ttf", "woff", "woff2", "otf"].includes(ext)) { + return "font"; + } else if ( + ["png", "gif", "jpg", "jpeg", "webp", "ico", "svg", "avif", "jxl", "heic", "heif"].includes(ext) + ) { + return "image"; + } else if (ext === "css") { + return "css"; + } else if (ext === "xml") { + return "xml"; + } else if ( + ["mp4", "webm", "ts", "m4v", "m4s", "mov", "ogv", "swf", "f4v", "flv"].includes(ext) || + ["flash", "webm", "mp4", "flv"].some(typ => mime_typ.includes(typ)) + ) { + return "video"; + } else if (mime_typ.includes("wasm") || ext === "wasm") { + return "wasm"; + } else if (mime_typ.includes("html") || ["html", "htm"].includes(ext)) { + return "html"; + } else if (mime_typ.includes("text")) { + // Put "text" last because it is often misused, so extension should take precedence. 
+ return "text"; + } else { + return "other"; + } + } catch (e) { + return "other"; // Return "other" if there's any error + } +"""; + +CREATE TEMP FUNCTION parse_headers(headers STRING) +RETURNS ARRAY> +LANGUAGE js +AS """ + try { + return JSON.parse(headers).map(header => { + return { name: header.name, value: header.value }; + }); + } catch (e) { + return []; + } +"""; + +INSERT INTO ${ctx.resolve("all", "requests")} +SELECT + DATE('${month}') AS date, + '${client}' AS client, + pages.url AS page, + TRUE AS is_root_page, + pages.url AS root_page, + requests.url AS url, + requests.firstHTML AS is_main_document, + get_type(requests.mimeType, get_ext_from_url(requests.url)) AS type, + IF(requests.firstReq, 1, NULL) AS index, + NULL AS payload, + TO_JSON_STRING( STRUCT( + requests.time AS time, + requests.method AS method, + requests.redirectUrl AS redirectUrl, + requests.reqHttpVersion AS reqHttpVersion, + requests.reqHeadersSize AS reqHeadersSize, + requests.reqBodySize AS reqBodySize, + requests.reqCookieLen AS reqCookieLen, + requests.reqOtherHeaders AS reqOtherHeaders, + requests.status AS status, + requests.respHttpVersion AS respHttpVersion, + requests.respHeadersSize AS respHeadersSize, + requests.respBodySize AS respBodySize, + requests.respSize AS respSize, + requests.respCookieLen AS respCookieLen, + requests.respOtherHeaders AS respOtherHeaders, + requests.expAge AS expAge, + requests.mimeType AS mimeType + ${flag1 ? 
",requests._cdn_provider AS _cdn_provider,requests._gzip_save AS _gzip_save" : ""} + )) AS summary, + ARRAY>[ + ('Accept', requests.req_accept), + ("Accept-Charset", requests.req_accept_charset), + ("Accept-Encoding", requests.req_accept_encoding), + ("Accept-Language", requests.req_accept_language), + ("Connection", requests.req_connection), + ("Host", requests.req_host), + ("If-Modified-Since", requests.req_if_modified_since), + ("If-None-Match", requests.req_if_none_match), + ("Referer", requests.req_referer), + ("User-Agent", requests.req_user_agent) + ] AS request_headers, + ARRAY>[ + ("Accept-Ranges", requests.resp_accept_ranges), + ("Age", requests.resp_age), + ("Cache-Control", requests.resp_cache_control), + ("Connection", requests.resp_connection), + ("Content-Encoding", requests.resp_content_encoding), + ("Content-Length", requests.resp_content_language), + ("Content-Length", requests.resp_content_length), + ("Content-Location", requests.resp_content_location), + ("Content-Type", requests.resp_content_type), + ("Date", requests.resp_date), + ("ETag", requests.resp_etag), + ("Expires", requests.resp_expires), + ("Keep-Alive", requests.resp_keep_alive), + ("Last-Modified", requests.resp_last_modified), + ("Location", requests.resp_location), + ("Pragma", requests.resp_pragma), + ("Server", requests.resp_server), + ("Transfer-Encoding", requests.resp_transfer_encoding), + ("Vary", requests.resp_vary), + ("Via", requests.resp_via), + ("X-Powered-By", requests.resp_x_powered_by) + ] AS response_headers, + NULL AS response_body +FROM summary_requests.${constants.fn_date_underscored(month)}_${client} AS requests ${constants.dev_TABLESAMPLE} +LEFT JOIN summary_pages.${constants.fn_date_underscored(month)}_${client} AS pages ${constants.dev_TABLESAMPLE} +USING(pageid); + `) + }) +}) \ No newline at end of file From c5048782ad790873208eb3222a41b718f490dbce Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Sun, 29 Sep 2024 12:00:51 +0000 Subject: [PATCH 11/44] 
requests backfill for mid month --- definitions/output/all/requests_backfill.js | 31 ++++++++++------- .../output/all/requests_summary_backfill.js | 34 +++++++++++-------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/definitions/output/all/requests_backfill.js b/definitions/output/all/requests_backfill.js index e5881e5..63450c6 100644 --- a/definitions/output/all/requests_backfill.js +++ b/definitions/output/all/requests_backfill.js @@ -1,19 +1,24 @@ - -let monthRange = []; +let datesRange = []; for ( - let month = '2022-06-01'; - month >= '2016-01-01'; - month = constants.fn_past_month(month)) { - monthRange.push(month) + let date = '2016-02-01'; // 2022-06-01 + date >= '2016-01-01'; // 2016-01-01 + date = constants.fn_past_month(date)) { + datesRange.push(date) + + if (date <= "2018-12-01") { + midMonth = new Date(date); + midMonth.setDate(15); + datesRange.push(midMonth.toISOString().substring(0, 10)) + } } -monthRange.forEach((month, i) => { +datesRange.forEach((date, i) => { constants.clients.forEach(client => { - operate(`requests_backfill ${month}_${client}`).tags([ + operate(`requests_backfill ${date}_${client}`).tags([ "requests_backfill" ]).queries(ctx => ` DELETE FROM ${ctx.resolve("all", "requests")} -WHERE date = '${month}'; +WHERE date = '${date}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) RETURNS STRING @@ -116,7 +121,7 @@ AS """ INSERT INTO ${ctx.resolve("all", "requests")} SELECT - DATE('${month}') AS date, + DATE('${date}') AS date, '${client}' AS client, requests.page AS page, TRUE AS is_root_page, @@ -155,9 +160,9 @@ SELECT parse_headers(JSON_QUERY(payload, '$.request.headers')) AS request_headers, parse_headers(JSON_QUERY(payload, '$.response.headers')) AS response_headers, response_bodies.body AS response_body -FROM requests.${constants.fn_date_underscored(month)}_${client} AS requests ${constants.dev_TABLESAMPLE} -LEFT JOIN response_bodies.${constants.fn_date_underscored(month)}_${client} AS response_bodies 
${constants.dev_TABLESAMPLE} +FROM requests.${constants.fn_date_underscored(date)}_${client} AS requests ${constants.dev_TABLESAMPLE} +LEFT JOIN response_bodies.${constants.fn_date_underscored(date)}_${client} AS response_bodies ${constants.dev_TABLESAMPLE} USING (page, url); `) }) -}) \ No newline at end of file +}) diff --git a/definitions/output/all/requests_summary_backfill.js b/definitions/output/all/requests_summary_backfill.js index 4039e91..9610012 100644 --- a/definitions/output/all/requests_summary_backfill.js +++ b/definitions/output/all/requests_summary_backfill.js @@ -1,24 +1,28 @@ -let monthRange = []; +let datesRange = []; for ( - let month = '2015-12-01'; - month >= '2011-06-01'; - month = constants.fn_past_month(month)) { - monthRange.push(month) + let date = '2011-06-01'; // 2015-12-01 + date >= '2011-06-01'; + date = constants.fn_past_month(date)) { + datesRange.push(date) + + midMonth = new Date(date); + midMonth.setDate(15); + datesRange.push(midMonth.toISOString().substring(0, 10)) } -monthRange.forEach((month, i) => { - if(month > "2014-06-01"){ - flag1 = true; +datesRange.forEach((date, i) => { + if(date > "2014-06-01"){ + add_dimensions = true; } else { - flag1 = false; + add_dimensions = false; } constants.clients.forEach(client => { - operate(`requests_backfill_summary ${month}_${client}`).tags([ + operate(`requests_backfill_summary ${date}_${client}`).tags([ "requests_backfill" ]).queries(ctx => ` DELETE FROM ${ctx.resolve("all", "requests")} -WHERE date = '${month}'; +WHERE date = '${date}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) RETURNS STRING @@ -121,7 +125,7 @@ AS """ INSERT INTO ${ctx.resolve("all", "requests")} SELECT - DATE('${month}') AS date, + DATE('${date}') AS date, '${client}' AS client, pages.url AS page, TRUE AS is_root_page, @@ -149,7 +153,7 @@ SELECT requests.respOtherHeaders AS respOtherHeaders, requests.expAge AS expAge, requests.mimeType AS mimeType - ${flag1 ? 
",requests._cdn_provider AS _cdn_provider,requests._gzip_save AS _gzip_save" : ""} + ${add_dimensions ? ",requests._cdn_provider AS _cdn_provider,requests._gzip_save AS _gzip_save" : ""} )) AS summary, ARRAY>[ ('Accept', requests.req_accept), @@ -187,8 +191,8 @@ SELECT ("X-Powered-By", requests.resp_x_powered_by) ] AS response_headers, NULL AS response_body -FROM summary_requests.${constants.fn_date_underscored(month)}_${client} AS requests ${constants.dev_TABLESAMPLE} -LEFT JOIN summary_pages.${constants.fn_date_underscored(month)}_${client} AS pages ${constants.dev_TABLESAMPLE} +FROM summary_requests.${constants.fn_date_underscored(date)}_${client} AS requests ${constants.dev_TABLESAMPLE} +LEFT JOIN summary_pages.${constants.fn_date_underscored(date)}_${client} AS pages ${constants.dev_TABLESAMPLE} USING(pageid); `) }) From 9dc4cf0d34fb583f709a40a9fcd4db02d4c99333 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Sun, 29 Sep 2024 18:05:28 +0000 Subject: [PATCH 12/44] remove legacy pipelines --- definitions/output/pages.js | 20 ----- definitions/output/requests.js | 22 ----- definitions/output/summary_pages.js | 108 ------------------------- definitions/output/summary_requests.js | 80 ------------------ 4 files changed, 230 deletions(-) delete mode 100644 definitions/output/pages.js delete mode 100644 definitions/output/requests.js delete mode 100644 definitions/output/summary_pages.js delete mode 100644 definitions/output/summary_requests.js diff --git a/definitions/output/pages.js b/definitions/output/pages.js deleted file mode 100644 index 2c94f4f..0000000 --- a/definitions/output/pages.js +++ /dev/null @@ -1,20 +0,0 @@ -const current_month_underscored = constants.fn_date_underscored(constants.current_month); - -constants.clients.forEach(client => { - publish( - current_month_underscored + "_" + client, { - type: "table", - schema: "pages", - tags: ["crawl_results_legacy"] - }).query(ctx => ` -SELECT - page AS url, - payload -FROM ${ctx.ref("all", "pages")} 
-WHERE date = '${constants.current_month}' AND - client = '${client}' AND - is_root_page AND - payload IS NOT NULL AND - LENGTH(payload) <= 2 * 1024 * 1024 -- legacy tables have a different limit - `); -}) diff --git a/definitions/output/requests.js b/definitions/output/requests.js deleted file mode 100644 index 00e0c54..0000000 --- a/definitions/output/requests.js +++ /dev/null @@ -1,22 +0,0 @@ -const current_month_underscored = constants.fn_date_underscored(constants.current_month); - -constants.clients.forEach(client => { - publish( - current_month_underscored + "_" + client, { - type: "table", - schema: "requests", - tags: ["crawl_results_legacy"] - }).query(ctx => ` -SELECT - page, - url, - payload -FROM ${ctx.ref("all", "requests")} -WHERE date = '${constants.current_month}' AND - client = '${client}' AND - is_root_page AND - payload IS NOT NULL AND - LENGTH(payload) <= 2 * 1024 * 1024 AND -- legacy tables have a different limit - SAFE.PARSE_JSON(payload) IS NOT NULL - `); -}) diff --git a/definitions/output/summary_pages.js b/definitions/output/summary_pages.js deleted file mode 100644 index 850960e..0000000 --- a/definitions/output/summary_pages.js +++ /dev/null @@ -1,108 +0,0 @@ -const current_month_underscored = constants.fn_date_underscored(constants.current_month); - -constants.clients.forEach(client => { - publish(current_month_underscored + "_" + client, { - type: "table", - schema: "summary_pages", - tags: ["crawl_results_legacy"] - }).query(ctx => ` -SELECT - SAFE_CAST(JSON_EXTRACT_SCALAR(METADATA, '$.page_id') AS INTEGER) AS pageid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.createDate') AS INTEGER) AS createDate, - JSON_EXTRACT_SCALAR(summary, '$.archive') AS archive, - JSON_EXTRACT_SCALAR(summary, '$.label') AS label, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.crawlid') AS INTEGER) AS crawlid, - JSON_EXTRACT_SCALAR(summary, '$.wptid') AS wptid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.wptrun') AS INTEGER) AS wptrun, - 
JSON_EXTRACT_SCALAR(summary, '$.url') AS url, - JSON_EXTRACT_SCALAR(summary, '$.urlShort') AS urlShort, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.urlhash') AS INTEGER) AS urlhash, - JSON_EXTRACT_SCALAR(summary, '$.cdn') AS cdn, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.startedDateTime') AS INTEGER) AS startedDateTime, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.TTFB') AS INTEGER) AS TTFB, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.renderStart') AS INTEGER) AS renderStart, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.onContentLoaded') AS INTEGER) AS onContentLoaded, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.onLoad') AS INTEGER) AS onLoad, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.fullyLoaded') AS INTEGER) AS fullyLoaded, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.visualComplete') AS INTEGER) AS visualComplete, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.PageSpeed') AS INTEGER) AS PageSpeed, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.SpeedIndex') AS INTEGER) AS SpeedIndex, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.rank') AS INTEGER) AS rank, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqTotal') AS INTEGER) AS reqTotal, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqHtml') AS INTEGER) AS reqHtml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqJS') AS INTEGER) AS reqJS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqCss') AS INTEGER) AS reqCSS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqImg') AS INTEGER) AS reqImg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqGif') AS INTEGER) AS reqGif, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqJpg') AS INTEGER) AS reqJpg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqPng') AS INTEGER) AS reqPng, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqFont') AS INTEGER) AS reqFont, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqFlash') AS INTEGER) AS reqFlash, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqJson') AS INTEGER) AS reqJson, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqOther') AS INTEGER) AS reqOther, - 
SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesTotal') AS INTEGER) AS bytesTotal, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesHtml') AS INTEGER) AS bytesHtml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesJS') AS INTEGER) AS bytesJS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesCss') AS INTEGER) AS bytesCSS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesImg') AS INTEGER) AS bytesImg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesGif') AS INTEGER) AS bytesGif, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesJpg') AS INTEGER) AS bytesJpg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesPng') AS INTEGER) AS bytesPng, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesFont') AS INTEGER) AS bytesFont, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesFlash') AS INTEGER) AS bytesFlash, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesJson') AS INTEGER) AS bytesJson, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesOther') AS INTEGER) AS bytesOther, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesHtmlDoc') AS INTEGER) AS bytesHtmlDoc, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numDomains') AS INTEGER) AS numDomains, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxDomainReqs') AS INTEGER) AS maxDomainReqs, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numRedirects') AS INTEGER) AS numRedirects, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numErrors') AS INTEGER) AS numErrors, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numGlibs') AS INTEGER) AS numGlibs, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numHttps') AS INTEGER) AS numHttps, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numCompressed') AS INTEGER) AS numCompressed, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numDomElements') AS INTEGER) AS numDomElements, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxageNull') AS INTEGER) AS maxageNull, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage0') AS INTEGER) AS maxage0, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage1') AS INTEGER) AS maxage1, - 
SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage30') AS INTEGER) AS maxage30, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage365') AS INTEGER) AS maxage365, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxageMore') AS INTEGER) AS maxageMore, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.gzipTotal') AS INTEGER) AS gzipTotal, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.gzipSavings') AS INTEGER) AS gzipSavings, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._connections') AS INTEGER) AS _connections, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._adult_site') AS BOOLEAN) AS _adult_site, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.avg_dom_depth') AS INTEGER) AS avg_dom_depth, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.document_height') AS INTEGER) AS document_height, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.document_width') AS INTEGER) AS document_width, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.localstorage_size') AS INTEGER) AS localstorage_size, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.sessionstorage_size') AS INTEGER) AS sessionstorage_size, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_iframes') AS INTEGER) AS num_iframes, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_scripts') AS INTEGER) AS num_scripts, - JSON_EXTRACT_SCALAR(summary, '$.doctype') AS doctype, - JSON_EXTRACT_SCALAR(summary, '$.meta_viewport') AS meta_viewport, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqAudio') AS INTEGER) AS reqAudio, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqVideo') AS INTEGER) AS reqVideo, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqText') AS INTEGER) AS reqText, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqXml') AS INTEGER) AS reqXml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqWebp') AS INTEGER) AS reqWebp, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqSvg') AS INTEGER) AS reqSvg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesAudio') AS INTEGER) AS bytesAudio, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesVideo') AS INTEGER) AS bytesVideo, - 
SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesText') AS INTEGER) AS bytesText, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesXml') AS INTEGER) AS bytesXml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesWebp') AS INTEGER) AS bytesWebp, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesSvg') AS INTEGER) AS bytesSvg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_scripts_async') AS INTEGER) AS num_scripts_async, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_scripts_sync') AS INTEGER) AS num_scripts_sync, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.usertiming') AS INTEGER) AS usertiming, - metadata -FROM ${ctx.ref("all", "pages")} -WHERE - date = '${constants.current_month}' AND - client = '${client}' AND - is_root_page AND - summary IS NOT NULL AND - JSON_EXTRACT_SCALAR(metadata, '$.page_id') IS NOT NULL AND - JSON_EXTRACT_SCALAR(metadata, '$.page_id') != '' - `); -}); diff --git a/definitions/output/summary_requests.js b/definitions/output/summary_requests.js deleted file mode 100644 index 80e4664..0000000 --- a/definitions/output/summary_requests.js +++ /dev/null @@ -1,80 +0,0 @@ -const current_month_underscored = constants.fn_date_underscored(constants.current_month); - -constants.clients.forEach(client => { - publish(current_month_underscored + "_" + client, { - type: "table", - schema: "summary_requests", - tags: ["crawl_results_legacy"] - }).query(ctx => ` -SELECT - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.requestid') AS INTEGER) AS requestid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.pageid') AS INTEGER) AS pageid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.startedDateTime') AS INTEGER) AS startedDateTime, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.time') AS INTEGER) AS time, - JSON_EXTRACT_SCALAR(summary, '$.method') AS method, - JSON_EXTRACT_SCALAR(summary, '$.url') AS url, - JSON_EXTRACT_SCALAR(summary, '$.urlShort') AS urlShort, - JSON_EXTRACT_SCALAR(summary, '$.redirectUrl') AS redirectUrl, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, 
'$.firstReq') AS BOOLEAN) AS firstReq, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.firstHtml') AS BOOLEAN) AS firstHtml, - JSON_EXTRACT_SCALAR(summary, '$.reqHttpVersion') AS reqHttpVersion, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqHeadersSize') AS INTEGER) AS reqHeadersSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqBodySize') AS INTEGER) AS reqBodySize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqCookieLen') AS INTEGER) AS reqCookieLen, - JSON_EXTRACT_SCALAR(summary, '$.reqOtherHeaders') AS reqOtherHeaders, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.status') AS INTEGER) AS status, - JSON_EXTRACT_SCALAR(summary, '$.respHttpVersion') AS respHttpVersion, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respHeadersSize') AS INTEGER) AS respHeadersSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respBodySize') AS INTEGER) AS respBodySize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respSize') AS INTEGER) AS respSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respCookieLen') AS INTEGER) AS respCookieLen, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.expAge') AS INTEGER) AS expAge, - JSON_EXTRACT_SCALAR(summary, '$.mimeType') AS mimeType, - JSON_EXTRACT_SCALAR(summary, '$.respOtherHeaders') AS respOtherHeaders, - JSON_EXTRACT_SCALAR(summary, '$.req_accept') AS req_accept, - JSON_EXTRACT_SCALAR(summary, '$.req_accept_charset') AS req_accept_charset, - JSON_EXTRACT_SCALAR(summary, '$.req_accept_encoding') AS req_accept_encoding, - JSON_EXTRACT_SCALAR(summary, '$.req_accept_language') AS req_accept_language, - JSON_EXTRACT_SCALAR(summary, '$.req_connection') AS req_connection, - JSON_EXTRACT_SCALAR(summary, '$.req_host') AS req_host, - JSON_EXTRACT_SCALAR(summary, '$.req_if_modified_since') AS req_if_modified_since, - JSON_EXTRACT_SCALAR(summary, '$.req_if_none_match') AS req_if_none_match, - JSON_EXTRACT_SCALAR(summary, '$.req_referer') AS req_referer, - JSON_EXTRACT_SCALAR(summary, '$.req_user_agent') AS req_user_agent, - JSON_EXTRACT_SCALAR(summary, 
'$.resp_accept_ranges') AS resp_accept_ranges, - JSON_EXTRACT_SCALAR(summary, '$.resp_age') AS resp_age, - JSON_EXTRACT_SCALAR(summary, '$.resp_cache_control') AS resp_cache_control, - JSON_EXTRACT_SCALAR(summary, '$.resp_connection') AS resp_connection, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_encoding') AS resp_content_encoding, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_language') AS resp_content_language, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_length') AS resp_content_length, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_location') AS resp_content_location, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_type') AS resp_content_type, - JSON_EXTRACT_SCALAR(summary, '$.resp_date') AS resp_date, - JSON_EXTRACT_SCALAR(summary, '$.resp_etag') AS resp_etag, - JSON_EXTRACT_SCALAR(summary, '$.resp_expires') AS resp_expires, - JSON_EXTRACT_SCALAR(summary, '$.resp_keep_alive') AS resp_keep_alive, - JSON_EXTRACT_SCALAR(summary, '$.resp_last_modified') AS resp_last_modified, - JSON_EXTRACT_SCALAR(summary, '$.resp_location') AS resp_location, - JSON_EXTRACT_SCALAR(summary, '$.resp_pragma') AS resp_pragma, - JSON_EXTRACT_SCALAR(summary, '$.resp_server') AS resp_server, - JSON_EXTRACT_SCALAR(summary, '$.resp_transfer_encoding') AS resp_transfer_encoding, - JSON_EXTRACT_SCALAR(summary, '$.resp_vary') AS resp_vary, - JSON_EXTRACT_SCALAR(summary, '$.resp_via') AS resp_via, - JSON_EXTRACT_SCALAR(summary, '$.resp_x_powered_by') AS resp_x_powered_by, - JSON_EXTRACT_SCALAR(summary, '$._cdn_provider') AS _cdn_provider, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._gzip_save') AS INTEGER) AS _gzip_save, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.crawlid') AS INTEGER) AS crawlid, - JSON_EXTRACT_SCALAR(summary, '$.type') AS type, - JSON_EXTRACT_SCALAR(summary, '$.ext') AS ext, - JSON_EXTRACT_SCALAR(summary, '$.format') AS format, -FROM ${ctx.ref("all", "requests")} -WHERE - date = '${constants.current_month}' AND - client = '${client}' AND - is_root_page AND - summary 
IS NOT NULL AND - JSON_EXTRACT_SCALAR(summary, '$.requestid') IS NOT NULL AND - JSON_EXTRACT_SCALAR(summary, '$.requestid') != '' - `); -}); From c316e25c41c313d4cb331410871eb29125463e47 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Sun, 29 Sep 2024 18:33:26 +0000 Subject: [PATCH 13/44] checked against new schema --- ...uests_backfill.js => backfill_requests.js} | 81 ++++++++++++------- definitions/sources/declares.js | 7 ++ 2 files changed, 60 insertions(+), 28 deletions(-) rename definitions/output/all/{requests_backfill.js => backfill_requests.js} (70%) diff --git a/definitions/output/all/requests_backfill.js b/definitions/output/all/backfill_requests.js similarity index 70% rename from definitions/output/all/requests_backfill.js rename to definitions/output/all/backfill_requests.js index 63450c6..f9e346a 100644 --- a/definitions/output/all/requests_backfill.js +++ b/definitions/output/all/backfill_requests.js @@ -1,24 +1,41 @@ -let datesRange = []; +const iterations = [] +const clients = constants.clients + for ( - let date = '2016-02-01'; // 2022-06-01 - date >= '2016-01-01'; // 2016-01-01 - date = constants.fn_past_month(date)) { - datesRange.push(date) - - if (date <= "2018-12-01") { - midMonth = new Date(date); - midMonth.setDate(15); - datesRange.push(midMonth.toISOString().substring(0, 10)) - } + let date = "2016-01-01"; // 2022-06-01 + date >= "2016-01-01"; // 2016-01-01 + date = constants.fn_past_month(date) +) { + clients.forEach((client) => { + iterations.push({ + date: date, + client: client, + }) + }) + + if (date <= "2018-12-01") { + midMonth = new Date(date) + midMonth.setDate(15) + + clients.forEach((client) => { + iterations.push({ + date: midMonth.toISOString().substring(0, 10), + client: client, + }) + }) + } } -datesRange.forEach((date, i) => { - constants.clients.forEach(client => { - operate(`requests_backfill ${date}_${client}`).tags([ - "requests_backfill" - ]).queries(ctx => ` +operate("") + +iterations.forEach((iteration, i) => { + 
operate(`requests_backfill ${iteration.date} ${iteration.client}`).tags([ + "requests_backfill" + ]).dependencies([ + i===0 ? "" : `requests_backfill ${iterations[i-1].date} ${iterations[i-1].client}` + ]).queries(ctx => ` DELETE FROM ${ctx.resolve("all", "requests")} -WHERE date = '${date}'; +WHERE date = '${iteration.date}' AND client = '${iteration.client}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) RETURNS STRING @@ -119,24 +136,25 @@ AS """ } """; -INSERT INTO ${ctx.resolve("all", "requests")} +INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve("all", "requests")} SELECT - DATE('${date}') AS date, - '${client}' AS client, + DATE('${iteration.date}') AS date, + '${iteration.client}' AS client, requests.page AS page, TRUE AS is_root_page, requests.page AS root_page, + crux.rank AS rank, requests.url AS url, IF( SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._request_type') AS STRING) = "Document" AND - MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64), + MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY requests.page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64), TRUE, FALSE ) AS is_main_document, get_type(JSON_VALUE(requests.payload, '$.response.content.mimeType'), get_ext_from_url(requests.url)) AS type, SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64) AS index, - requests.payload AS payload, - TO_JSON_STRING( STRUCT( + SAFE.PARSE_JSON(requests.payload, wide_number_mode => 'round') AS payload, + TO_JSON( STRUCT( SAFE_CAST(JSON_VALUE(requests.payload, '$.time') AS INTEGER) AS time, JSON_VALUE(requests.payload, '$._method') AS method, NULL AS redirectUrl, @@ -160,9 +178,16 @@ SELECT parse_headers(JSON_QUERY(payload, '$.request.headers')) AS request_headers, parse_headers(JSON_QUERY(payload, '$.response.headers')) AS response_headers, response_bodies.body AS response_body -FROM 
requests.${constants.fn_date_underscored(date)}_${client} AS requests ${constants.dev_TABLESAMPLE} -LEFT JOIN response_bodies.${constants.fn_date_underscored(date)}_${client} AS response_bodies ${constants.dev_TABLESAMPLE} -USING (page, url); - `) - }) +FROM requests.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} +) AS crux +ON requests.page = crux.page +LEFT JOIN response_bodies.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.dev_TABLESAMPLE} +ON requests.page = response_bodies.page AND requests.url = response_bodies.url; + `) }) diff --git a/definitions/sources/declares.js b/definitions/sources/declares.js index ca29d1e..b91c5db 100644 --- a/definitions/sources/declares.js +++ b/definitions/sources/declares.js @@ -24,3 +24,10 @@ GROUP BY yyyymm HAVING COUNT(1) = 0 `); } + + +declare({ + database: "chrome-ux-report", + schema: "experimental", + name: "global", +}); From e3cf47b25f90bd8526ca5da9dba7af6366c19566 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Sun, 29 Sep 2024 18:46:04 +0000 Subject: [PATCH 14/44] adjusted to a new schema --- ...ckfill.js => backfill_requests_summary.js} | 73 ++++++++++++------- 1 file changed, 47 insertions(+), 26 deletions(-) rename definitions/output/all/{requests_summary_backfill.js => backfill_requests_summary.js} (78%) diff --git a/definitions/output/all/requests_summary_backfill.js b/definitions/output/all/backfill_requests_summary.js similarity index 78% rename from definitions/output/all/requests_summary_backfill.js rename to definitions/output/all/backfill_requests_summary.js index 9610012..d5c18cf 100644 --- 
a/definitions/output/all/requests_summary_backfill.js +++ b/definitions/output/all/backfill_requests_summary.js @@ -1,28 +1,41 @@ -let datesRange = []; +const iterations = [], + clients = constants.clients; + for ( - let date = '2011-06-01'; // 2015-12-01 - date >= '2011-06-01'; - date = constants.fn_past_month(date)) { - datesRange.push(date) - - midMonth = new Date(date); - midMonth.setDate(15); - datesRange.push(midMonth.toISOString().substring(0, 10)) + let date = "2016-01-01"; // 2022-06-01 + date >= "2016-01-01"; // 2016-01-01 + date = constants.fn_past_month(date) +) { + clients.forEach((client) => { + iterations.push({ + date: date, + client: client, + }) + }) + + midMonth = new Date(date) + midMonth.setDate(15) + + clients.forEach((client) => { + iterations.push({ + date: midMonth.toISOString().substring(0, 10), + client: client, + }) + }) } -datesRange.forEach((date, i) => { - if(date > "2014-06-01"){ +iterations.forEach((iteration, i) => { + if(iteration.date > "2014-06-01"){ add_dimensions = true; } else { add_dimensions = false; } - constants.clients.forEach(client => { - operate(`requests_backfill_summary ${date}_${client}`).tags([ - "requests_backfill" - ]).queries(ctx => ` + operate(`requests_backfill_summary ${iteration.date}_${iteration.client}`).tags([ + "requests_backfill" + ]).queries(ctx => ` DELETE FROM ${ctx.resolve("all", "requests")} -WHERE date = '${date}'; +WHERE date = '${iteration.date}' AND client = '${iteration.client}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) RETURNS STRING @@ -123,19 +136,20 @@ AS """ } """; -INSERT INTO ${ctx.resolve("all", "requests")} +INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve("all", "requests")} SELECT - DATE('${date}') AS date, - '${client}' AS client, + DATE('${iteration.date}') AS date, + '${iteration.client}' AS client, pages.url AS page, TRUE AS is_root_page, pages.url AS root_page, + crux.rank AS rank, requests.url AS url, requests.firstHTML AS is_main_document, 
get_type(requests.mimeType, get_ext_from_url(requests.url)) AS type, IF(requests.firstReq, 1, NULL) AS index, NULL AS payload, - TO_JSON_STRING( STRUCT( + TO_JSON( STRUCT( requests.time AS time, requests.method AS method, requests.redirectUrl AS redirectUrl, @@ -191,9 +205,16 @@ SELECT ("X-Powered-By", requests.resp_x_powered_by) ] AS response_headers, NULL AS response_body -FROM summary_requests.${constants.fn_date_underscored(date)}_${client} AS requests ${constants.dev_TABLESAMPLE} -LEFT JOIN summary_pages.${constants.fn_date_underscored(date)}_${client} AS pages ${constants.dev_TABLESAMPLE} -USING(pageid); - `) - }) -}) \ No newline at end of file +FROM summary_requests.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} +LEFT JOIN summary_pages.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} +ON requests.pageid = pages.pageid +LEFT JOIN ( + SELECT DISTINCT + CONCAT(origin, '/') AS page, + experimental.popularity.rank AS rank + FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} +) AS crux +ON pages.url = crux.page; + `) +}) From 8832ffef50800abcb666d75a7040783031593b75 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Sun, 29 Sep 2024 23:58:10 +0000 Subject: [PATCH 15/44] backfill_pages --- definitions/output/all/backfill_pages.js | 122 ++++++++++++++++ definitions/output/all/backfill_requests.js | 6 +- .../output/all/backfill_summary_pages.js | 133 ++++++++++++++++++ ...ummary.js => backfill_summary_requests.js} | 69 ++++++++- definitions/sources/declares.js | 1 - 5 files changed, 323 insertions(+), 8 deletions(-) create mode 100644 definitions/output/all/backfill_pages.js create mode 100644 definitions/output/all/backfill_summary_pages.js rename definitions/output/all/{backfill_requests_summary.js => backfill_summary_requests.js} (78%) 
diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js
new file mode 100644
index 0000000..2d84e5e
--- /dev/null
+++ b/definitions/output/all/backfill_pages.js
@@ -0,0 +1,122 @@
+const iterations = []
+const clients = constants.clients
+// Collect one {date, client} entry per backfill run, in the order the operations are chained below.
+for (
+  let date = "2016-01-01"; // 2022-06-01
+  date >= "2016-01-01"; // 2016-01-01
+  date = constants.fn_past_month(date)
+) {
+  clients.forEach((client) => {
+    iterations.push({
+      date: date,
+      client: client,
+    })
+  })
+  // Dates on or before 2018-12-01 additionally get an iteration for the 15th of the same month.
+  if (date <= "2018-12-01") {
+    const midMonth = new Date(date) // a date-only ISO string is parsed as UTC midnight
+    midMonth.setUTCDate(15) // stay in UTC so toISOString() below cannot shift the day in zones behind UTC
+
+    clients.forEach((client) => {
+      iterations.push({
+        date: midMonth.toISOString().substring(0, 10),
+        client: client,
+      })
+    })
+  }
+}
+// Register one operation per iteration; each depends on the previous one, so they run one after another.
+iterations.forEach((iteration, i) => {
+  operate(`backfill_pages ${iteration.date} ${iteration.client}`).tags([
+    "backfill_pages"
+  ]).dependencies(
+    i === 0 ? [] : [`backfill_pages ${iterations[i-1].date} ${iterations[i-1].client}`] // first one waits for nothing
+  ).queries(ctx => `
+DELETE FROM \`all_dev.pages_stable\`
+WHERE date = '${iteration.date}' AND client = '${iteration.client}';
+
+CREATE TEMPORARY FUNCTION GET_OTHER_CUSTOM_METRICS(
+  jsonObject JSON,
+  keys ARRAY<STRING>
+) RETURNS JSON
+LANGUAGE js AS """
+try {
+  let other_metrics = {};
+  keys.forEach(function(key) {
+    other_metrics[key.substr(1)] = JSON.parse(jsonObject[key]);
+  });
+  return other_metrics;
+} catch (e) {
+  return null;
+}
+""";
+
+INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve("all", "pages")}
+SELECT
+  DATE('${iteration.date}') AS date,
+  '${iteration.client}' AS client,
+  pages.url AS page,
+  TRUE AS is_root_page,
+  pages.url AS root_page,
+  crux.rank AS rank,
+  JSON_VALUE(payload, "$.testID") AS wptid,
+  SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload,
+  NULL AS summary,
+  STRUCT<
+    a11y JSON,
+    cms JSON,
+    css_variables JSON,
+    cookies JSON,
+    ecommerce JSON,
+    element_count JSON,
+    javascript JSON,
+    markup JSON,
+    media JSON,
+    origin_trials JSON,
+    performance JSON,
+    privacy JSON,
+    responsive_images JSON,
+    robots_txt JSON,
+    security JSON,
+    structured_data JSON,
+    third_parties JSON,
+    well_known JSON,
+    wpt_bodies JSON,
+    other JSON
+  >(
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._a11y"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._cms"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._css-variables"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._cookies"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._ecommerce"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._element_count"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._javascript"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._markup"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._media"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._origin-trials"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._performance"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._privacy"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._responsive_images"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._robots_txt"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._security"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._structured-data"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._third-parties"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._well-known"), wide_number_mode => 'round'),
+    SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._wpt_bodies"), wide_number_mode => 'round'),
+    GET_OTHER_CUSTOM_METRICS(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), ["_Colordepth", "_Dpi", "_Images", "_Resolution", "_almanac", "_avg_dom_depth", "_css", "_doctype", "_document_height", "_document_width", "_event-names", "_fugu-apis", "_has_shadow_root", "_img-loading-attr", "_initiators", "_inline_style_bytes", "_lib-detector-version", "_localstorage_size", "_meta_viewport", "_num_iframes", "_num_scripts", "_num_scripts_async", "_num_scripts_sync", "_pwa", "_quirks_mode", "_sass", "_sessionstorage_size", "_usertiming"])
+  ) AS custom_metrics,
+  NULL AS lighthouse,
+  NULL AS features,
+  NULL AS technologies,
+  JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._metadata") AS metadata
+FROM pages.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE}
+LEFT JOIN (
+  SELECT DISTINCT
+    CONCAT(origin, '/') AS page,
+    experimental.popularity.rank AS rank
+  FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")}
+  WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')}
+) AS crux
+ON pages.url = crux.page;
+  `)
+})
diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/all/backfill_requests.js
index f9e346a..2f75e35 100644
---
a/definitions/output/all/backfill_requests.js +++ b/definitions/output/all/backfill_requests.js @@ -29,10 +29,10 @@ for ( operate("") iterations.forEach((iteration, i) => { - operate(`requests_backfill ${iteration.date} ${iteration.client}`).tags([ - "requests_backfill" + operate(`backfill_requests ${iteration.date} ${iteration.client}`).tags([ + "backfill_requests" ]).dependencies([ - i===0 ? "" : `requests_backfill ${iterations[i-1].date} ${iterations[i-1].client}` + i===0 ? "" : `backfill_requests ${iterations[i-1].date} ${iterations[i-1].client}` ]).queries(ctx => ` DELETE FROM ${ctx.resolve("all", "requests")} WHERE date = '${iteration.date}' AND client = '${iteration.client}'; diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/all/backfill_summary_pages.js new file mode 100644 index 0000000..c717456 --- /dev/null +++ b/definitions/output/all/backfill_summary_pages.js @@ -0,0 +1,133 @@ +const iterations = [] +const clients = constants.clients + +for ( + let date = "2015-12-01"; + date >= "2015-12-01"; // 2011-06-01 + date = constants.fn_past_month(date) +) { + clients.forEach((client) => { + iterations.push({ + date: date, + client: client, + }) + }) + + midMonth = new Date(date) + midMonth.setDate(15) + + clients.forEach((client) => { + iterations.push({ + date: midMonth.toISOString().substring(0, 10), + client: client, + }) + }) + +} + +iterations.forEach((iteration, i) => { + operate(`backfill_summary_pages ${iteration.date} ${iteration.client}`).tags([ + "pages_backfill" + ]).dependencies([ + i===0 ? 
"" : `backfill_summary_pages ${iterations[i-1].date} ${iterations[i-1].client}` + ]).queries(ctx => ` +DELETE FROM \`all_dev.pages_stable\` +WHERE date = '${iteration.date}' AND client = '${iteration.client}'; + +INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve("all", "pages")} +SELECT + DATE('${iteration.date}') AS date, + '${iteration.client}' AS client, + pages.url AS page, + TRUE AS is_root_page, + pages.url AS root_page, + CASE + WHEN rank<=1000 THEN 1000 + WHEN rank<=5000 THEN 5000 + ELSE NULL + END AS rank, + wptid, + TO_JSON( STRUCT( + pageid, + createDate, + archive, + label, + crawlid, + wptid, + wptrun, + url, + urlShort, + urlhash, + cdn, + startedDateTime, + TTFB, + renderStart, + onContentLoaded, + onLoad, + fullyLoaded, + visualComplete, + PageSpeed, + SpeedIndex, + rank, + reqTotal, + reqHtml, + reqJS, + reqCSS, + reqImg, + reqGif, + reqJpg, + reqPng, + reqFont, + reqFlash, + reqJson, + reqOther, + bytesTotal, + bytesHtml, + bytesJS, + bytesCSS, + bytesImg, + bytesGif, + bytesJpg, + bytesPng, + bytesFont, + bytesFlash, + bytesJson, + bytesOther, + bytesHtmlDoc, + numDomains, + maxDomainReqs, + numRedirects, + numErrors, + numGlibs, + numHttps, + numCompressed, + numDomElements, + maxageNull, + maxage0, + maxage1, + maxage30, + maxage365, + maxageMore, + gzipTotal, + gzipSavings, + _connections, + _adult_site, + avg_dom_depth, + document_height, + document_width, + localstorage_size, + sessionstorage_size, + num_iframes, + num_scripts, + doctype, + meta_viewport + )) AS payload, + NULL AS summary, + NULL AS custom_metrics, + NULL AS lighthouse, + NULL AS features, + NULL AS technologies, + NULL AS metadata +FROM summary_pages.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE}; + `) +}) diff --git a/definitions/output/all/backfill_requests_summary.js b/definitions/output/all/backfill_summary_requests.js similarity index 78% rename from definitions/output/all/backfill_requests_summary.js 
rename to definitions/output/all/backfill_summary_requests.js index d5c18cf..4b496d5 100644 --- a/definitions/output/all/backfill_requests_summary.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -2,8 +2,8 @@ const iterations = [], clients = constants.clients; for ( - let date = "2016-01-01"; // 2022-06-01 - date >= "2016-01-01"; // 2016-01-01 + let date = "2015-12-01"; + date >= "2015-12-01"; // 2011-06-01 date = constants.fn_past_month(date) ) { clients.forEach((client) => { @@ -31,8 +31,10 @@ iterations.forEach((iteration, i) => { add_dimensions = false; } - operate(`requests_backfill_summary ${iteration.date}_${iteration.client}`).tags([ + operate(`backfill_summary_requests ${iteration.date} ${iteration.client}`).tags([ "requests_backfill" + ]).dependencies([ + i===0 ? "" : `backfill_summary_requests ${iterations[i-1].date} ${iterations[i-1].client}` ]).queries(ctx => ` DELETE FROM ${ctx.resolve("all", "requests")} WHERE date = '${iteration.date}' AND client = '${iteration.client}'; @@ -148,7 +150,66 @@ SELECT requests.firstHTML AS is_main_document, get_type(requests.mimeType, get_ext_from_url(requests.url)) AS type, IF(requests.firstReq, 1, NULL) AS index, - NULL AS payload, + TO_JSON( STRUCT( + requests.requestid, + requests.pageid, + requests.startedDateTime, + requests.time, + requests.method, + requests.url, + requests.urlShort, + requests.redirectUrl, + requests.firstReq, + requests.firstHtml, + requests.reqHttpVersion, + requests.reqHeadersSize, + requests.reqBodySize, + requests.reqCookieLen, + requests.reqOtherHeaders, + requests.status, + requests.respHttpVersion, + requests.respHeadersSize, + requests.respBodySize, + requests.respSize, + requests.respCookieLen, + requests.expAge, + requests.mimeType, + requests.respOtherHeaders, + requests.req_accept, + requests.req_accept_charset, + requests.req_accept_encoding, + requests.req_accept_language, + requests.req_connection, + requests.req_host, + requests.req_if_modified_since, + 
requests.req_if_none_match, + requests.req_referer, + requests.req_user_agent, + requests.resp_accept_ranges, + requests.resp_age, + requests.resp_cache_control, + requests.resp_connection, + requests.resp_content_encoding, + requests.resp_content_language, + requests.resp_content_length, + requests.resp_content_location, + requests.resp_content_type, + requests.resp_date, + requests.resp_etag, + requests.resp_expires, + requests.resp_keep_alive, + requests.resp_last_modified, + requests.resp_location, + requests.resp_pragma, + requests.resp_server, + requests.resp_transfer_encoding, + requests.resp_vary, + requests.resp_via, + requests.resp_x_powered_by, + requests._cdn_provider, + requests._gzip_save, + requests.crawlid + )) AS payload, TO_JSON( STRUCT( requests.time AS time, requests.method AS method, diff --git a/definitions/sources/declares.js b/definitions/sources/declares.js index b91c5db..bef41d9 100644 --- a/definitions/sources/declares.js +++ b/definitions/sources/declares.js @@ -25,7 +25,6 @@ HAVING COUNT(1) = 0 `); } - declare({ database: "chrome-ux-report", schema: "experimental", From 06d6cb447ed4cc6976a2d28ffa14aed6dc8d0ced Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Mon, 30 Sep 2024 09:29:35 +0000 Subject: [PATCH 16/44] legacy removed --- README.md | 13 ------------- src/index.js | 2 +- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/README.md b/README.md index e56ba3d..502e02d 100644 --- a/README.md +++ b/README.md @@ -16,18 +16,6 @@ Tag: `cwv_tech_report` - [x] httparchive.core_web_vitals.technologies -### Legacy crawl tables (to be deprecated) - -Tag: `crawl_results_legacy` - -- [x] httparchive.lighthouse.YYYY_MM_DD_client -- [x] httparchive.pages.YYYY_MM_DD_client -- [x] httparchive.requests.YYYY_MM_DD_client -- [x] httparchive.response_bodies.YYYY_MM_DD_client -- [x] httparchive.summary_pages.YYYY_MM_DD_client -- [x] httparchive.summary_requests.YYYY_MM_DD_client -- [x] httparchive.technologies.YYYY_MM_DD_client - ## 
Schedules 1. [crawl-complete](https://console.cloud.google.com/cloudpubsub/subscription/detail/dataformTrigger?authuser=7&project=httparchive) PubSub subscription @@ -35,7 +23,6 @@ Tag: `crawl_results_legacy` Tags: - crawl_results_all - - crawl_results_legacy 2. [bq-poller-cwv-tech-report](https://console.cloud.google.com/cloudscheduler/jobs/edit/us-east4/bq-poller-cwv-tech-report?authuser=7&project=httparchive) Scheduler diff --git a/src/index.js b/src/index.js index 23d3bd5..56e3133 100644 --- a/src/index.js +++ b/src/index.js @@ -34,7 +34,7 @@ FROM ( action: "runDataformRepo", actionArgs: { repoName: "crawl-data", - tags: ["crawl_results_all", "crawl_results_legacy"] + tags: ["crawl_results_all"] } } }; From 9ba236d59a9872db63f20de42f7d9575dad8da4c Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Mon, 30 Sep 2024 09:32:32 +0000 Subject: [PATCH 17/44] remove legacy datasets --- definitions/output/lighthouse.js | 22 ---------------------- definitions/output/technologies.js | 24 ------------------------ 2 files changed, 46 deletions(-) delete mode 100644 definitions/output/lighthouse.js delete mode 100644 definitions/output/technologies.js diff --git a/definitions/output/lighthouse.js b/definitions/output/lighthouse.js deleted file mode 100644 index 6acbdc4..0000000 --- a/definitions/output/lighthouse.js +++ /dev/null @@ -1,22 +0,0 @@ -const current_month_underscored = constants.fn_date_underscored(constants.current_month); - -constants.clients.forEach(client => { - publish( - current_month_underscored + "_" + client, { - type: "table", - schema: "lighthouse", - tags: ["crawl_results_legacy"] - } - ).query(ctx => ` -SELECT - page AS url, - lighthouse AS report -FROM ${ctx.ref("all", "pages")} -WHERE - date = '${constants.current_month}' - AND client = '${client}' - AND is_root_page - AND lighthouse IS NOT NULL - AND LENGTH(lighthouse) <= 2 * 1024 * 1024 -- legacy tables have a different limit - `); -}) diff --git a/definitions/output/technologies.js 
b/definitions/output/technologies.js deleted file mode 100644 index 116e1be..0000000 --- a/definitions/output/technologies.js +++ /dev/null @@ -1,24 +0,0 @@ -const current_month_underscored = constants.fn_date_underscored(constants.current_month); - -constants.clients.forEach(client => { - publish( - current_month_underscored + "_" + client, { - type: "table", - schema: "technologies", - tags: ["crawl_results_legacy"] - }).query(ctx => ` -SELECT DISTINCT - page as url, - category, - tech.technology AS app, - info -FROM ${ctx.ref("all", "pages")}, -UNNEST (technologies) AS tech, -UNNEST (tech.categories) AS category, -UNNEST (tech.info) AS info -WHERE date = '${constants.current_month}' AND - client = '${client}' AND - is_root_page AND - tech.technology IS NOT NULL - `); -}) From 57da6fb4172e527a439573ea14c114de7488043d Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Mon, 30 Sep 2024 15:10:38 +0000 Subject: [PATCH 18/44] metrics sorted --- definitions/output/all/backfill_pages.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index 2d84e5e..e8d95db 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -65,8 +65,8 @@ SELECT STRUCT< a11y JSON, cms JSON, - css_variables JSON, cookies JSON, + css_variables JSON, ecommerce JSON, element_count JSON, javascript JSON, @@ -86,8 +86,8 @@ SELECT >( SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._a11y"), wide_number_mode => 'round'), SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._cms"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._css-variables"), wide_number_mode => 'round'), SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._cookies"), wide_number_mode => 'round'), + 
SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._css-variables"), wide_number_mode => 'round'), SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._ecommerce"), wide_number_mode => 'round'), SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._element_count"), wide_number_mode => 'round'), SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._javascript"), wide_number_mode => 'round'), From 3683a89049001752f9896a9f77a9d01abd45d6d7 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Mon, 30 Sep 2024 20:01:54 +0000 Subject: [PATCH 19/44] parse features --- definitions/output/all/backfill_pages.js | 33 +++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index e8d95db..85c6fed 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -51,6 +51,37 @@ try { } """; +CREATE TEMP FUNCTION GET_FEATURES(payload STRING) +RETURNS ARRAY<STRUCT<feature STRING, id STRING, type STRING>> LANGUAGE js AS +''' + function getFeatureNames(featureMap, featureType) { + try { + return Object.entries(featureMap).map(([key, value]) => { + // After Feb 2020 keys are feature IDs. + if (value.name) { + return {'feature': value.name, 'type': featureType, 'id': key}; + } + // Prior to Feb 2020 keys fell back to IDs if the name was unknown. + if (idPattern.test(key)) { + return {'feature': '', 'type': featureType, 'id': key.match(idPattern)[1]}; + } + // Prior to Feb 2020 keys were names by default. 
+ return {'feature': key, 'type': featureType, 'id': ''}; + }); + } catch (e) { + return []; + } + } + + var $ = JSON.parse(payload); + if (!$._blinkFeatureFirstUsed) return []; + + var idPattern = new RegExp('^Feature_(\d+)$'); + return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default') + .concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css')) + .concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); +'''; + INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve("all", "pages")} SELECT DATE('${iteration.date}') AS date, @@ -106,7 +137,7 @@ SELECT GET_OTHER_CUSTOM_METRICS(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), ["_Colordepth", "_Dpi", "_Images", "_Resolution", "_almanac", "_avg_dom_depth", "_css", "_doctype", "_document_height", "_document_width", "_event-names", "_fugu-apis", "_has_shadow_root", "_img-loading-attr", "_initiators", "_inline_style_bytes", "_lib-detector-version", "_localstorage_size", "_meta_viewport", "_num_iframes", "_num_scripts", "_num_scripts_async", "_num_scripts_sync", "_pwa", "_quirks_mode", "_sass", "_sessionstorage_size", "_usertiming"]) ) AS custom_metrics, NULL AS lighthouse, - NULL AS features, + GET_FEATURES(payload) AS features, NULL AS technologies, JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._metadata") AS metadata FROM pages.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} From 2870012b71cdeb166f22274b03f63a9121962c16 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 14 Oct 2024 23:26:14 +0200 Subject: [PATCH 20/44] lint --- definitions/output/all/backfill_pages.js | 27 ++++++++------- definitions/output/all/backfill_requests.js | 25 +++++++------- .../output/all/backfill_summary_pages.js | 20 +++++------ .../output/all/backfill_summary_requests.js | 34 ++++++++++--------- 4 files changed, 55 insertions(+), 51 deletions(-) diff 
--git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index 85c6fed..aab9537 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -1,26 +1,27 @@ const iterations = [] const clients = constants.clients +let midMonth for ( - let date = "2016-01-01"; // 2022-06-01 - date >= "2016-01-01"; // 2016-01-01 + let date = '2016-01-01'; // 2022-06-01 + date >= '2016-01-01'; // 2016-01-01 date = constants.fn_past_month(date) ) { clients.forEach((client) => { iterations.push({ - date: date, - client: client, + date, + client }) }) - if (date <= "2018-12-01") { + if (date <= '2018-12-01') { midMonth = new Date(date) midMonth.setDate(15) clients.forEach((client) => { iterations.push({ date: midMonth.toISOString().substring(0, 10), - client: client, + client }) }) } @@ -28,9 +29,9 @@ for ( iterations.forEach((iteration, i) => { operate(`backfill_pages ${iteration.date} ${iteration.client}`).tags([ - "backfill_pages" + 'backfill_pages' ]).dependencies([ - i===0 ? "" : `backfill_pages ${iterations[i-1].date} ${iterations[i-1].client}` + i === 0 ? 
'' : `backfill_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM \`all_dev.pages_stable\` WHERE date = '${iteration.date}' AND client = '${iteration.client}'; @@ -72,17 +73,17 @@ RETURNS ARRAY> LANGUAGE js AS return []; } } - + var $ = JSON.parse(payload); if (!$._blinkFeatureFirstUsed) return []; - - var idPattern = new RegExp('^Feature_(\d+)$'); + + var idPattern = new RegExp('^Feature_(\\\\d+)$'); return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default') .concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css')) .concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); '''; -INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve("all", "pages")} +INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve('all', 'pages')} SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, @@ -145,7 +146,7 @@ LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank - FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} ) AS crux ON pages.url = crux.page; diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/all/backfill_requests.js index 2f75e35..09e86b3 100644 --- a/definitions/output/all/backfill_requests.js +++ b/definitions/output/all/backfill_requests.js @@ -1,40 +1,41 @@ const iterations = [] const clients = constants.clients +let midMonth for ( - let date = "2016-01-01"; // 2022-06-01 - date >= "2016-01-01"; // 2016-01-01 + let date = '2016-01-01'; // 2022-06-01 + date >= '2016-01-01'; // 2016-01-01 date = constants.fn_past_month(date) ) { clients.forEach((client) => { iterations.push({ - date: date, - client: client, + date, + client }) }) - if (date <= "2018-12-01") { + if (date <= '2018-12-01') { midMonth = new Date(date) 
midMonth.setDate(15) clients.forEach((client) => { iterations.push({ date: midMonth.toISOString().substring(0, 10), - client: client, + client }) }) } } -operate("") +operate('') iterations.forEach((iteration, i) => { operate(`backfill_requests ${iteration.date} ${iteration.client}`).tags([ - "backfill_requests" + 'backfill_requests' ]).dependencies([ - i===0 ? "" : `backfill_requests ${iterations[i-1].date} ${iterations[i-1].client}` + i === 0 ? '' : `backfill_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM ${ctx.resolve("all", "requests")} +DELETE FROM ${ctx.resolve('all', 'requests')} WHERE date = '${iteration.date}' AND client = '${iteration.client}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) @@ -136,7 +137,7 @@ AS """ } """; -INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve("all", "requests")} +INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve('all', 'requests')} SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, @@ -183,7 +184,7 @@ LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank - FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} ) AS crux ON requests.page = crux.page diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/all/backfill_summary_pages.js index c717456..15ffad7 100644 --- a/definitions/output/all/backfill_summary_pages.js +++ b/definitions/output/all/backfill_summary_pages.js @@ -1,15 +1,16 @@ const iterations = [] const clients = constants.clients +let midMonth for ( - let date = "2015-12-01"; - date >= "2015-12-01"; // 2011-06-01 + let date = '2015-12-01'; + date >= '2015-12-01'; // 2011-06-01 date = constants.fn_past_month(date) ) { clients.forEach((client) => { iterations.push({ - date: date, - client: 
client, + date, + client }) }) @@ -19,29 +20,28 @@ for ( clients.forEach((client) => { iterations.push({ date: midMonth.toISOString().substring(0, 10), - client: client, + client }) }) - } iterations.forEach((iteration, i) => { operate(`backfill_summary_pages ${iteration.date} ${iteration.client}`).tags([ - "pages_backfill" + 'pages_backfill' ]).dependencies([ - i===0 ? "" : `backfill_summary_pages ${iterations[i-1].date} ${iterations[i-1].client}` + i === 0 ? '' : `backfill_summary_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM \`all_dev.pages_stable\` WHERE date = '${iteration.date}' AND client = '${iteration.client}'; -INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve("all", "pages")} +INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve('all', 'pages')} SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, pages.url AS page, TRUE AS is_root_page, pages.url AS root_page, - CASE + CASE WHEN rank<=1000 THEN 1000 WHEN rank<=5000 THEN 5000 ELSE NULL diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/all/backfill_summary_requests.js index 4b496d5..77fd55b 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -1,15 +1,16 @@ -const iterations = [], - clients = constants.clients; +const iterations = [] +const clients = constants.clients +let midMonth for ( - let date = "2015-12-01"; - date >= "2015-12-01"; // 2011-06-01 + let date = '2015-12-01'; + date >= '2015-12-01'; // 2011-06-01 date = constants.fn_past_month(date) ) { clients.forEach((client) => { iterations.push({ - date: date, - client: client, + date, + client }) }) @@ -19,24 +20,25 @@ for ( clients.forEach((client) => { iterations.push({ date: midMonth.toISOString().substring(0, 10), - client: client, + client }) }) } +let addDimensions iterations.forEach((iteration, i) => { - if(iteration.date > "2014-06-01"){ - add_dimensions = true; 
+ if (iteration.date > '2014-06-01') { + addDimensions = true } else { - add_dimensions = false; + addDimensions = false } operate(`backfill_summary_requests ${iteration.date} ${iteration.client}`).tags([ - "requests_backfill" + 'requests_backfill' ]).dependencies([ - i===0 ? "" : `backfill_summary_requests ${iterations[i-1].date} ${iterations[i-1].client}` + i === 0 ? '' : `backfill_summary_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM ${ctx.resolve("all", "requests")} +DELETE FROM ${ctx.resolve('all', 'requests')} WHERE date = '${iteration.date}' AND client = '${iteration.client}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) @@ -138,7 +140,7 @@ AS """ } """; -INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve("all", "requests")} +INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve('all', 'requests')} SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, @@ -228,7 +230,7 @@ SELECT requests.respOtherHeaders AS respOtherHeaders, requests.expAge AS expAge, requests.mimeType AS mimeType - ${add_dimensions ? ",requests._cdn_provider AS _cdn_provider,requests._gzip_save AS _gzip_save" : ""} + ${addDimensions ? 
',requests._cdn_provider AS _cdn_provider,requests._gzip_save AS _gzip_save' : ''} )) AS summary, ARRAY<STRUCT<name STRING, value STRING>>[ ('Accept', requests.req_accept), @@ -273,7 +275,7 @@ LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank - FROM ${ctx.resolve("chrome-ux-report", "experimental", "global")} + FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} ) AS crux ON pages.url = crux.page; From 992802f22bcbe7cffc51c6deb7f98e21d25b0df7 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 14 Oct 2024 23:32:39 +0200 Subject: [PATCH 21/44] jscpd off --- .github/workflows/linter.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linter.yaml b/.github/workflows/linter.yaml index b7ea4d5..60c9064 100644 --- a/.github/workflows/linter.yaml +++ b/.github/workflows/linter.yaml @@ -30,5 +30,6 @@ jobs: env: DEFAULT_BRANCH: main GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VALIDATE_JSCPD: false VALIDATE_JAVASCRIPT_PRETTIER: false VALIDATE_MARKDOWN_PRETTIER: false From 23c29b90c609c4457a0848f25b9384ce547bf6b9 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 15 Oct 2024 00:00:57 +0200 Subject: [PATCH 22/44] update js variable names --- definitions/output/all/backfill_pages.js | 6 +++--- definitions/output/all/backfill_requests.js | 8 ++++---- definitions/output/all/backfill_summary_pages.js | 4 ++-- definitions/output/all/backfill_summary_requests.js | 8 ++++---- package.json | 6 ++---- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index aab9537..c5eea5c 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -5,7 +5,7 @@ let midMonth for ( let date = 
'2016-01-01'; // 2022-06-01 date 
>= '2016-01-01'; // 2016-01-01 - date = constants.fn_past_month(date) + date = constants.fnPastMonth(date) ) { clients.forEach((client) => { iterations.push({ @@ -141,13 +141,13 @@ SELECT GET_FEATURES(payload) AS features, NULL AS technologies, JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._metadata") AS metadata -FROM pages.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} +FROM pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} + WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} ) AS crux ON pages.url = crux.page; `) diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/all/backfill_requests.js index 09e86b3..01c419a 100644 --- a/definitions/output/all/backfill_requests.js +++ b/definitions/output/all/backfill_requests.js @@ -5,7 +5,7 @@ let midMonth for ( let date = '2016-01-01'; // 2022-06-01 date >= '2016-01-01'; // 2016-01-01 - date = constants.fn_past_month(date) + date = constants.fnPastMonth(date) ) { clients.forEach((client) => { iterations.push({ @@ -179,16 +179,16 @@ SELECT parse_headers(JSON_QUERY(payload, '$.request.headers')) AS request_headers, parse_headers(JSON_QUERY(payload, '$.response.headers')) AS response_headers, response_bodies.body AS response_body -FROM requests.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} +FROM requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank FROM 
${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} + WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} ) AS crux ON requests.page = crux.page -LEFT JOIN response_bodies.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.dev_TABLESAMPLE} +LEFT JOIN response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.dev_TABLESAMPLE} ON requests.page = response_bodies.page AND requests.url = response_bodies.url; `) }) diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/all/backfill_summary_pages.js index 15ffad7..cb30d7e 100644 --- a/definitions/output/all/backfill_summary_pages.js +++ b/definitions/output/all/backfill_summary_pages.js @@ -5,7 +5,7 @@ let midMonth for ( let date = '2015-12-01'; date >= '2015-12-01'; // 2011-06-01 - date = constants.fn_past_month(date) + date = constants.fnPastMonth(date) ) { clients.forEach((client) => { iterations.push({ @@ -128,6 +128,6 @@ SELECT NULL AS features, NULL AS technologies, NULL AS metadata -FROM summary_pages.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE}; +FROM summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE}; `) }) diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/all/backfill_summary_requests.js index 77fd55b..17ef69a 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -5,7 +5,7 @@ let midMonth for ( let date = '2015-12-01'; date >= '2015-12-01'; // 2011-06-01 - date = constants.fn_past_month(date) + date = constants.fnPastMonth(date) ) { clients.forEach((client) => { iterations.push({ @@ -268,15 +268,15 @@ SELECT 
("X-Powered-By", requests.resp_x_powered_by) ] AS response_headers, NULL AS response_body -FROM summary_requests.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} -LEFT JOIN summary_pages.${constants.fn_date_underscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} +FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} ON requests.pageid = pages.pageid LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fn_past_month(iteration.date).substring(0, 7).replace('-', '')} + WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} ) AS crux ON pages.url = crux.page; `) diff --git a/package.json b/package.json index dc111f8..ded4db7 100644 --- a/package.json +++ b/package.json @@ -4,11 +4,9 @@ "@dataform/core": "3.0.6" }, "scripts": { - "start": "dataform run", - "compile": "dataform compile", - "test": "dataform test", "format": "npx standard --fix; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint . --fix", - "lint": "npx standard; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint ." 
+ "lint": "npx standard; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint .; dataform compile", + "test": "dataform test" }, "standard": { "globals": [ From 14b9585e8b63caf8347e58d4b1b8e0f24ab0191c Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Mon, 14 Oct 2024 22:04:42 +0000 Subject: [PATCH 23/44] other cm format --- definitions/output/all/backfill_pages.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index 85c6fed..ea909df 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -134,7 +134,10 @@ SELECT SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._third-parties"), wide_number_mode => 'round'), SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._well-known"), wide_number_mode => 'round'), SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._wpt_bodies"), wide_number_mode => 'round'), - GET_OTHER_CUSTOM_METRICS(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), ["_Colordepth", "_Dpi", "_Images", "_Resolution", "_almanac", "_avg_dom_depth", "_css", "_doctype", "_document_height", "_document_width", "_event-names", "_fugu-apis", "_has_shadow_root", "_img-loading-attr", "_initiators", "_inline_style_bytes", "_lib-detector-version", "_localstorage_size", "_meta_viewport", "_num_iframes", "_num_scripts", "_num_scripts_async", "_num_scripts_sync", "_pwa", "_quirks_mode", "_sass", "_sessionstorage_size", "_usertiming"]) + GET_OTHER_CUSTOM_METRICS( + SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + ["_Colordepth", "_Dpi", "_Images", "_Resolution", "_almanac", "_avg_dom_depth", "_css", "_doctype", "_document_height", "_document_width", "_event-names", "_fugu-apis", "_has_shadow_root", "_img-loading-attr", "_initiators", "_inline_style_bytes", 
"_lib-detector-version", "_localstorage_size", "_meta_viewport", "_num_iframes", "_num_scripts", "_num_scripts_async", "_num_scripts_sync", "_pwa", "_quirks_mode", "_sass", "_sessionstorage_size", "_usertiming"] + ) ) AS custom_metrics, NULL AS lighthouse, GET_FEATURES(payload) AS features, From 4cafc6f4285651492bda19ee5d250e6ea6d29465 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sat, 19 Oct 2024 23:45:06 +0200 Subject: [PATCH 24/44] pages completed --- definitions/output/all/backfill_pages.js | 242 +++++++++++++++++++---- 1 file changed, 203 insertions(+), 39 deletions(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index ccfb104..45aab5f 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -3,8 +3,8 @@ const clients = constants.clients let midMonth for ( - let date = '2016-01-01'; // 2022-06-01 - date >= '2016-01-01'; // 2016-01-01 + let date = '2016-01-01'; + date >= '2016-01-01'; date = constants.fnPastMonth(date) ) { clients.forEach((client) => { @@ -33,8 +33,9 @@ iterations.forEach((iteration, i) => { ]).dependencies([ i === 0 ? 
'' : `backfill_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM \`all_dev.pages_stable\` -WHERE date = '${iteration.date}' AND client = '${iteration.client}'; +DELETE FROM all_dev.pages_stable +WHERE date = '${iteration.date}' + AND client = '${iteration.client}'; CREATE TEMPORARY FUNCTION GET_OTHER_CUSTOM_METRICS( jsonObject JSON, @@ -52,7 +53,7 @@ try { } """; -CREATE TEMP FUNCTION GET_FEATURES(payload STRING) +CREATE TEMP FUNCTION GET_FEATURES(payload JSON) RETURNS ARRAY> LANGUAGE js AS ''' function getFeatureNames(featureMap, featureType) { @@ -74,16 +75,16 @@ RETURNS ARRAY> LANGUAGE js AS } } - var $ = JSON.parse(payload); - if (!$._blinkFeatureFirstUsed) return []; + let blinkFeatureFirstUsed = payload._blinkFeatureFirstUsed; + if (!blinkFeatureFirstUsed) return []; var idPattern = new RegExp('^Feature_(\\\\d+)$'); - return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default') - .concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css')) - .concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); + return getFeatureNames(blinkFeatureFirstUsed.Features, 'default') + .concat(getFeatureNames(blinkFeatureFirstUsed.CSSFeatures, 'css')) + .concat(getFeatureNames(blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); '''; -INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve('all', 'pages')} +INSERT INTO all_dev.pages_stable SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, @@ -91,9 +92,140 @@ SELECT TRUE AS is_root_page, pages.url AS root_page, crux.rank AS rank, - JSON_VALUE(payload, "$.testID") AS wptid, - SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, - NULL AS summary, + STRING(payload.testID) AS wptid, + JSON_REMOVE( + payload, + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', 
+ '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + TO_JSON( STRUCT( + SpeedIndex, + TTFB, + _connections, + bytesAudio, + bytesCSS, + bytesFlash, + bytesFont, + bytesGif, + bytesHtml, + bytesHtmlDoc, + bytesImg, + bytesJpg, + bytesJS, + bytesJson, + bytesOther, + bytesPng, + bytesSvg, + bytesText, + bytesTotal, + bytesVideo, + bytesWebp, + bytesXml, + cdn, + payload._CrUX, + fullyLoaded, + gzipSavings, + gzipTotal, + maxDomainReqs, + maxage0, + maxage1, + maxage30, + maxage365, + maxageMore, + maxageNull, + numCompressed, + numDomElements, + numDomains, + numErrors, + numGlibs, + numHttps, + numRedirects, + onContentLoaded, + onLoad, + renderStart, + reqAudio, + reqCSS, + reqFlash, + reqFont, + reqGif, + reqHtml, + reqImg, + reqJpg, + reqJS, + reqJson, + reqOther, + reqPng, + reqSvg, + reqText, + reqTotal, + reqVideo, + reqWebp, + reqXml, + visualComplete + )) AS summary, STRUCT< a11y JSON, cms JSON, @@ -116,35 +248,44 @@ SELECT 
wpt_bodies JSON, other JSON >( - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._a11y"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._cms"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._cookies"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._css-variables"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._ecommerce"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._element_count"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._javascript"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._markup"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._media"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._origin-trials"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._performance"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._privacy"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._responsive_images"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._robots_txt"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._security"), wide_number_mode => 'round'), - 
SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._structured-data"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._third-parties"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._well-known"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._wpt_bodies"), wide_number_mode => 'round'), + payload._a11y, + payload._cms, + payload._cookies, + payload["_css-variables"], + payload._ecommerce, + payload._element_count, + payload._javascript, + payload._markup, + payload._media, + payload["_origin-trials"], + payload._performance, + payload._privacy, + payload._responsive_images, + payload._robots_txt, + payload._security, + payload["_structured-data"], + payload["_third-parties"], + payload["_well-known"], + payload._wpt_bodies, GET_OTHER_CUSTOM_METRICS( - SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + payload, ["_Colordepth", "_Dpi", "_Images", "_Resolution", "_almanac", "_avg_dom_depth", "_css", "_doctype", "_document_height", "_document_width", "_event-names", "_fugu-apis", "_has_shadow_root", "_img-loading-attr", "_initiators", "_inline_style_bytes", "_lib-detector-version", "_localstorage_size", "_meta_viewport", "_num_iframes", "_num_scripts", "_num_scripts_async", "_num_scripts_sync", "_pwa", "_quirks_mode", "_sass", "_sessionstorage_size", "_usertiming"] ) ) AS custom_metrics, NULL AS lighthouse, - GET_FEATURES(payload) AS features, - NULL AS technologies, - JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), "$._metadata") AS metadata -FROM pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} + GET_FEATURES(pages.payload) AS features, + tech.technologies AS technologies, + pages.payload._metadata AS metadata +FROM ( + SELECT + * 
EXCEPT(payload), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload + FROM pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} +) AS pages + +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} AS summary_pages +ON pages.url = summary_pages.url + LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, @@ -152,6 +293,29 @@ LEFT JOIN ( FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} ) AS crux -ON pages.url = crux.page; +ON pages.url = crux.page + +LEFT JOIN ( + SELECT + page, + ARRAY_AGG(technology) AS technologies + FROM( + SELECT + url AS page, + STRUCT< + technology STRING, + categories ARRAY, + info ARRAY + >( + app, + ARRAY_AGG(category), + ARRAY_AGG(info) + ) AS technology + FROM technologies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} + GROUP BY page, app + ) + GROUP BY page +) AS tech +ON pages.url = tech.page; `) }) From d1dfd492a47607d635b2e35d3c707ec30cbac685 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 01:07:04 +0200 Subject: [PATCH 25/44] summary_pages completed --- .../output/all/backfill_summary_pages.js | 238 ++++++++++++------ 1 file changed, 158 insertions(+), 80 deletions(-) diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/all/backfill_summary_pages.js index cb30d7e..29ae1bf 100644 --- a/definitions/output/all/backfill_summary_pages.js +++ b/definitions/output/all/backfill_summary_pages.js @@ -4,7 +4,7 @@ const clients = constants.clients let midMonth for ( let date = '2015-12-01'; - date >= '2015-12-01'; // 2011-06-01 + date >= '2015-12-01'; date = constants.fnPastMonth(date) ) { clients.forEach((client) => { @@ -25,16 +25,122 @@ for ( }) } +function 
summaryObject (date) { + let list = '' + + if (date >= '2010-11-15') { + list += ` + fullyLoaded, + bytesCSS, + bytesFlash, + bytesFont, + bytesGif, + bytesHtml, + bytesHtmlDoc, + bytesImg, + bytesJpg, + bytesJS, + bytesJson, + bytesOther, + bytesPng, + bytesTotal, + cdn, + gzipSavings, + gzipTotal, + maxage0, + maxage1, + maxage30, + maxage365, + maxageMore, + maxageNull, + maxDomainReqs, + numCompressed, + numDomains, + numDomElements, + numErrors, + numGlibs, + numHttps, + numRedirects, + onContentLoaded, + onLoad, + renderStart, + reqCSS, + reqFlash, + reqFont, + reqGif, + reqHtml, + reqImg, + reqJpg, + reqJS, + reqJson, + reqOther, + reqPng, + reqTotal, + SpeedIndex, + TTFB, + visualComplete,` + } + + if (date >= '2014-05-15') { + list += ` + _connections,` + } + + if (date >= '2015-05-01') { + list += ` + bytesAudio, + bytesSvg, + bytesText, + bytesVideo, + bytesWebp, + bytesXml, + reqAudio, + reqSvg, + reqText, + reqVideo, + reqWebp, + reqXml,` + } + + return list +} + +function customMetrics (date) { + let list = '' + + if (date >= '2014-06-01' && date !== '2014-05-15') { + list += ` + avg_dom_depth, + doctype, + document_height, + document_width, + localstorage_size, + meta_viewport, + num_iframes, + num_scripts, + sessionstorage_size,` + } + + if (date >= '2015-11-01') { + list += ` + num_scripts_async, + num_scripts_sync,` + } + + return list +} + iterations.forEach((iteration, i) => { operate(`backfill_summary_pages ${iteration.date} ${iteration.client}`).tags([ 'pages_backfill' ]).dependencies([ i === 0 ? 
'' : `backfill_summary_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM \`all_dev.pages_stable\` -WHERE date = '${iteration.date}' AND client = '${iteration.client}'; +DELETE FROM all_dev.pages_stable +WHERE date = '${iteration.date}' + AND client = '${iteration.client}'; -INSERT INTO \`all_dev.pages_stable\` --${ctx.resolve('all', 'pages')} +INSERT INTO all_dev.pages_stable SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, @@ -47,83 +153,55 @@ SELECT ELSE NULL END AS rank, wptid, + NULL AS payload, TO_JSON( STRUCT( - pageid, - createDate, - archive, - label, - crawlid, - wptid, - wptrun, - url, - urlShort, - urlhash, - cdn, - startedDateTime, - TTFB, - renderStart, - onContentLoaded, - onLoad, - fullyLoaded, - visualComplete, - PageSpeed, - SpeedIndex, - rank, - reqTotal, - reqHtml, - reqJS, - reqCSS, - reqImg, - reqGif, - reqJpg, - reqPng, - reqFont, - reqFlash, - reqJson, - reqOther, - bytesTotal, - bytesHtml, - bytesJS, - bytesCSS, - bytesImg, - bytesGif, - bytesJpg, - bytesPng, - bytesFont, - bytesFlash, - bytesJson, - bytesOther, - bytesHtmlDoc, - numDomains, - maxDomainReqs, - numRedirects, - numErrors, - numGlibs, - numHttps, - numCompressed, - numDomElements, - maxageNull, - maxage0, - maxage1, - maxage30, - maxage365, - maxageMore, - gzipTotal, - gzipSavings, - _connections, - _adult_site, - avg_dom_depth, - document_height, - document_width, - localstorage_size, - sessionstorage_size, - num_iframes, - num_scripts, - doctype, - meta_viewport - )) AS payload, - NULL AS summary, - NULL AS custom_metrics, + ${summaryObject(iteration.date)} + )) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known 
JSON, + wpt_bodies JSON, + other JSON + >( + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + TO_JSON( STRUCT( + ${customMetrics(iteration.date)} + )) + ) AS custom_metrics, NULL AS lighthouse, NULL AS features, NULL AS technologies, From 1244e95af78f5480bc8d01d58903a473852622a4 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 02:21:17 +0200 Subject: [PATCH 26/44] without other headers --- .../output/all/backfill_summary_requests.js | 224 +++++++----------- 1 file changed, 90 insertions(+), 134 deletions(-) diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/all/backfill_summary_requests.js index 17ef69a..cd1f994 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -25,14 +25,41 @@ for ( }) } -let addDimensions -iterations.forEach((iteration, i) => { - if (iteration.date > '2014-06-01') { - addDimensions = true - } else { - addDimensions = false +function summaryObject (date) { + let list = '' + if (date >= '2010-11-15') { + list += ` + expAge, + method, + mimeType, + redirectUrl, + reqBodySize, + reqCookieLen, + reqHeadersSize, + respBodySize, + respCookieLen, + respHeadersSize, + respHttpVersion, + respSize, + status, + time,` + } + if (date >= '2014-05-15') { + list += ` + _cdn_provider,` + } + if (date >= '2014-05-01') { + list += ` + _gzip_save,` + } + if (date >= '2015-05-01') { + list += ` + format,` } + return list +} +iterations.forEach((iteration, i) => { operate(`backfill_summary_requests ${iteration.date} ${iteration.client}`).tags([ 'requests_backfill' ]).dependencies([ @@ -132,152 +159,81 @@ RETURNS ARRAY> LANGUAGE js AS """ try { - return JSON.parse(headers).map(header => { - return { name: header.name, value: header.value }; + return headers.split(', ').map(header => { + const 
[name, value] = header.split(' = '); + return { name: name.trim(), value: value.trim() }; }); } catch (e) { return []; } """; -INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve('all', 'requests')} +INSERT INTO all_dev.requests_stable SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, pages.url AS page, TRUE AS is_root_page, pages.url AS root_page, - crux.rank AS rank, + pages.rank AS rank, requests.url AS url, requests.firstHTML AS is_main_document, - get_type(requests.mimeType, get_ext_from_url(requests.url)) AS type, + get_type(requests.mimeType, requests.ext_from_url) AS type, IF(requests.firstReq, 1, NULL) AS index, + NULL AS payload, TO_JSON( STRUCT( - requests.requestid, - requests.pageid, - requests.startedDateTime, - requests.time, - requests.method, - requests.url, - requests.urlShort, - requests.redirectUrl, - requests.firstReq, - requests.firstHtml, - requests.reqHttpVersion, - requests.reqHeadersSize, - requests.reqBodySize, - requests.reqCookieLen, - requests.reqOtherHeaders, - requests.status, - requests.respHttpVersion, - requests.respHeadersSize, - requests.respBodySize, - requests.respSize, - requests.respCookieLen, - requests.expAge, - requests.mimeType, - requests.respOtherHeaders, - requests.req_accept, - requests.req_accept_charset, - requests.req_accept_encoding, - requests.req_accept_language, - requests.req_connection, - requests.req_host, - requests.req_if_modified_since, - requests.req_if_none_match, - requests.req_referer, - requests.req_user_agent, - requests.resp_accept_ranges, - requests.resp_age, - requests.resp_cache_control, - requests.resp_connection, - requests.resp_content_encoding, - requests.resp_content_language, - requests.resp_content_length, - requests.resp_content_location, - requests.resp_content_type, - requests.resp_date, - requests.resp_etag, - requests.resp_expires, - requests.resp_keep_alive, - requests.resp_last_modified, - requests.resp_location, - requests.resp_pragma, - 
requests.resp_server, - requests.resp_transfer_encoding, - requests.resp_vary, - requests.resp_via, - requests.resp_x_powered_by, - requests._cdn_provider, - requests._gzip_save, - requests.crawlid - )) AS payload, - TO_JSON( STRUCT( - requests.time AS time, - requests.method AS method, - requests.redirectUrl AS redirectUrl, - requests.reqHttpVersion AS reqHttpVersion, - requests.reqHeadersSize AS reqHeadersSize, - requests.reqBodySize AS reqBodySize, - requests.reqCookieLen AS reqCookieLen, - requests.reqOtherHeaders AS reqOtherHeaders, - requests.status AS status, - requests.respHttpVersion AS respHttpVersion, - requests.respHeadersSize AS respHeadersSize, - requests.respBodySize AS respBodySize, - requests.respSize AS respSize, - requests.respCookieLen AS respCookieLen, - requests.respOtherHeaders AS respOtherHeaders, - requests.expAge AS expAge, - requests.mimeType AS mimeType - ${addDimensions ? ',requests._cdn_provider AS _cdn_provider,requests._gzip_save AS _gzip_save' : ''} + ext_from_url AS ext, + ${summaryObject(iteration.date)} )) AS summary, - ARRAY>[ - ('Accept', requests.req_accept), - ("Accept-Charset", requests.req_accept_charset), - ("Accept-Encoding", requests.req_accept_encoding), - ("Accept-Language", requests.req_accept_language), - ("Connection", requests.req_connection), - ("Host", requests.req_host), - ("If-Modified-Since", requests.req_if_modified_since), - ("If-None-Match", requests.req_if_none_match), - ("Referer", requests.req_referer), - ("User-Agent", requests.req_user_agent) - ] AS request_headers, - ARRAY>[ - ("Accept-Ranges", requests.resp_accept_ranges), - ("Age", requests.resp_age), - ("Cache-Control", requests.resp_cache_control), - ("Connection", requests.resp_connection), - ("Content-Encoding", requests.resp_content_encoding), - ("Content-Length", requests.resp_content_language), - ("Content-Length", requests.resp_content_length), - ("Content-Location", requests.resp_content_location), - ("Content-Type", 
requests.resp_content_type), - ("Date", requests.resp_date), - ("ETag", requests.resp_etag), - ("Expires", requests.resp_expires), - ("Keep-Alive", requests.resp_keep_alive), - ("Last-Modified", requests.resp_last_modified), - ("Location", requests.resp_location), - ("Pragma", requests.resp_pragma), - ("Server", requests.resp_server), - ("Transfer-Encoding", requests.resp_transfer_encoding), - ("Vary", requests.resp_vary), - ("Via", requests.resp_via), - ("X-Powered-By", requests.resp_x_powered_by) - ] AS response_headers, + ARRAY_CONCAT( + ARRAY>[ + ('Accept', requests.req_accept), + ("Accept-Charset", requests.req_accept_charset), + ("Accept-Encoding", requests.req_accept_encoding), + ("Accept-Language", requests.req_accept_language), + ("Connection", requests.req_connection), + ("Host", requests.req_host), + ("If-Modified-Since", requests.req_if_modified_since), + ("If-None-Match", requests.req_if_none_match), + ("Referer", requests.req_referer), + ("User-Agent", requests.req_user_agent) + ], + parse_headers(requests.reqOtherHeaders) + ) AS request_headers, + ARRAY_CONCAT( + ARRAY>[ + ("Accept-Ranges", requests.resp_accept_ranges), + ("Age", requests.resp_age), + ("Cache-Control", requests.resp_cache_control), + ("Connection", requests.resp_connection), + ("Content-Encoding", requests.resp_content_encoding), + ("Content-Language", requests.resp_content_language), + ("Content-Length", requests.resp_content_length), + ("Content-Location", requests.resp_content_location), + ("Content-Type", requests.resp_content_type), + ("Date", requests.resp_date), + ("ETag", requests.resp_etag), + ("Expires", requests.resp_expires), + ("Keep-Alive", requests.resp_keep_alive), + ("Last-Modified", requests.resp_last_modified), + ("Location", requests.resp_location), + ("Pragma", requests.resp_pragma), + ("Server", requests.resp_server), + ("Transfer-Encoding", requests.resp_transfer_encoding), + ("Vary", requests.resp_vary), + ("Via", requests.resp_via), + ("X-Powered-By", 
requests.resp_x_powered_by) + ], + parse_headers(requests.respOtherHeaders) + ) AS response_headers, NULL AS response_body -FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} +FROM ( + SELECT + *, + get_ext_from_url(requests.url) AS ext_from_url + FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.dev_TABLESAMPLE} +) AS requests LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} -ON requests.pageid = pages.pageid -LEFT JOIN ( - SELECT DISTINCT - CONCAT(origin, '/') AS page, - experimental.popularity.rank AS rank - FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} -) AS crux -ON pages.url = crux.page; +ON requests.pageid = pages.pageid; `) }) From e55d8b441162b85be4079c5f60d7083ab93c2cc9 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 02:28:55 +0200 Subject: [PATCH 27/44] fix --- definitions/output/all/backfill_pages.js | 2 +- definitions/output/all/backfill_summary_pages.js | 9 +-------- definitions/output/all/backfill_summary_requests.js | 4 ++-- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index 45aab5f..64cdb3e 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -283,7 +283,7 @@ FROM ( FROM pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} ) AS pages -LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} AS summary_pages +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS 
summary_pages ${constants.devTABLESAMPLE} ON pages.url = summary_pages.url LEFT JOIN ( diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/all/backfill_summary_pages.js index 29ae1bf..fa045d7 100644 --- a/definitions/output/all/backfill_summary_pages.js +++ b/definitions/output/all/backfill_summary_pages.js @@ -27,7 +27,6 @@ for ( function summaryObject (date) { let list = '' - if (date >= '2010-11-15') { list += ` fullyLoaded, @@ -80,12 +79,10 @@ function summaryObject (date) { TTFB, visualComplete,` } - if (date >= '2014-05-15') { list += ` _connections,` } - if (date >= '2015-05-01') { list += ` bytesAudio, @@ -101,13 +98,11 @@ function summaryObject (date) { reqWebp, reqXml,` } - return list } function customMetrics (date) { let list = '' - if (date >= '2014-06-01' && date !== '2014-05-15') { list += ` avg_dom_depth, @@ -120,13 +115,11 @@ function customMetrics (date) { num_scripts, sessionstorage_size,` } - if (date >= '2015-11-01') { list += ` num_scripts_async, num_scripts_sync,` } - return list } @@ -206,6 +199,6 @@ SELECT NULL AS features, NULL AS technologies, NULL AS metadata -FROM summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE}; +FROM summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.devTABLESAMPLE}; `) }) diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/all/backfill_summary_requests.js index cd1f994..e563ea8 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -231,9 +231,9 @@ FROM ( SELECT *, get_ext_from_url(requests.url) AS ext_from_url - FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.dev_TABLESAMPLE} + FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} ) AS requests -LEFT 
JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.dev_TABLESAMPLE} +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.devTABLESAMPLE} ON requests.pageid = pages.pageid; `) }) From c7afc1120dab6a9803ee425f7cd8b01ddf023220 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 02:36:19 +0200 Subject: [PATCH 28/44] fix --- definitions/output/all/backfill_summary_pages.js | 14 +++++++------- .../output/all/backfill_summary_requests.js | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/all/backfill_summary_pages.js index fa045d7..86e0900 100644 --- a/definitions/output/all/backfill_summary_pages.js +++ b/definitions/output/all/backfill_summary_pages.js @@ -77,14 +77,14 @@ function summaryObject (date) { reqTotal, SpeedIndex, TTFB, - visualComplete,` + visualComplete` } if (date >= '2014-05-15') { - list += ` - _connections,` + list += `, + _connections` } if (date >= '2015-05-01') { - list += ` + list += `, bytesAudio, bytesSvg, bytesText, @@ -96,7 +96,7 @@ function summaryObject (date) { reqText, reqVideo, reqWebp, - reqXml,` + reqXml` } return list } @@ -116,9 +116,9 @@ function customMetrics (date) { sessionstorage_size,` } if (date >= '2015-11-01') { - list += ` + list += `, num_scripts_async, - num_scripts_sync,` + num_scripts_sync` } return list } diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/all/backfill_summary_requests.js index e563ea8..c584bca 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -42,19 +42,19 @@ function summaryObject (date) { respHttpVersion, respSize, status, - time,` + time` } if (date >= '2014-05-15') { - list += ` - _cdn_provider,` + list += `, + 
_cdn_provider` } if (date >= '2014-05-01') { - list += ` - _gzip_save,` + list += `, + _gzip_save` } if (date >= '2015-05-01') { - list += ` - format,` + list += `, + format` } return list } From e03a3538c5f151ac0e53498848edcc0c45ddb106 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 02:50:32 +0200 Subject: [PATCH 29/44] fix --- definitions/output/all/backfill_summary_pages.js | 2 +- definitions/output/all/backfill_summary_requests.js | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/all/backfill_summary_pages.js index 86e0900..858fe88 100644 --- a/definitions/output/all/backfill_summary_pages.js +++ b/definitions/output/all/backfill_summary_pages.js @@ -113,7 +113,7 @@ function customMetrics (date) { meta_viewport, num_iframes, num_scripts, - sessionstorage_size,` + sessionstorage_size` } if (date >= '2015-11-01') { list += `, diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/all/backfill_summary_requests.js index c584bca..a18ece8 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -4,7 +4,7 @@ const clients = constants.clients let midMonth for ( let date = '2015-12-01'; - date >= '2015-12-01'; // 2011-06-01 + date >= '2015-12-01'; date = constants.fnPastMonth(date) ) { clients.forEach((client) => { @@ -230,7 +230,7 @@ SELECT FROM ( SELECT *, - get_ext_from_url(requests.url) AS ext_from_url + get_ext_from_url(url) AS ext_from_url FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} ) AS requests LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS pages ${constants.devTABLESAMPLE} From 4a6101a1c026757eb0bcad7881c204b8a2c83fc2 Mon Sep 17 00:00:00 2001 From: Max Ostapenko 
<1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 14:09:01 +0200 Subject: [PATCH 30/44] actual reprocessing queries --- definitions/output/all/reprocess_pages.js | 71 +++++++++++--------- definitions/output/all/reprocess_requests.js | 56 +++++++-------- 2 files changed, 63 insertions(+), 64 deletions(-) diff --git a/definitions/output/all/reprocess_pages.js b/definitions/output/all/reprocess_pages.js index 22f65f8..1e7b5a8 100644 --- a/definitions/output/all/reprocess_pages.js +++ b/definitions/output/all/reprocess_pages.js @@ -3,8 +3,6 @@ operate('all_pages_stable_pre').tags( ).queries(` CREATE SCHEMA IF NOT EXISTS all_dev; --- DROP TABLE IF EXISTS \`all_dev.pages_stable\`; - CREATE TABLE IF NOT EXISTS \`all_dev.pages_stable\` ( date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), @@ -61,9 +59,8 @@ OPTIONS( const iterations = [] const clients = constants.clients -// From 2022-07-01 till today for ( - let month = constants.currentMonth; month >= '2024-09-01'; month = constants.fnPastMonth(month)) { + let month = '2022-03-01'; month >= '2022-03-01'; month = constants.fnPastMonth(month)) { clients.forEach((client) => { iterations.push({ month, @@ -92,7 +89,7 @@ SELECT rank, wptid, JSON_REMOVE( - SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + payload, '$._metadata', '$._detected', '$._detected_apps', @@ -161,7 +158,7 @@ SELECT ) AS payload, JSON_SET( JSON_REMOVE( - SAFE.PARSE_JSON(summary, wide_number_mode => 'round'), + summary, '$._adult_site', '$.archive', '$.avg_dom_depth', @@ -191,7 +188,7 @@ SELECT '$.wptrun' ), '$.crux', - JSON_QUERY(SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), '$._CrUX') + payload._CrUX ) AS summary, STRUCT< a11y JSON, @@ -215,27 +212,27 @@ SELECT wpt_bodies JSON, other JSON >( - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.a11y'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.cms'), - 
JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.cookies'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.css-variables'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.ecommerce'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.element_count'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.javascript'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.markup'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.media'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.origin-trials'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.performance'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.privacy'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.responsive_images'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.robots_txt'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.security'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.structured-data'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.third-parties'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.well-known'), - JSON_QUERY(SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), '$.wpt_bodies'), + custom_metrics.a11y, + custom_metrics.cms, + custom_metrics.cookies, + custom_metrics["css-variables"], + custom_metrics.ecommerce, + custom_metrics.element_count, + custom_metrics.javascript, + custom_metrics.markup, + custom_metrics.media, + custom_metrics["origin-trials"], + custom_metrics.performance, + custom_metrics.privacy, + custom_metrics.responsive_images, + custom_metrics.robots_txt, + custom_metrics.security, + 
custom_metrics["structured-data"], + custom_metrics["third-parties"], + custom_metrics["well-known"], + custom_metrics.wpt_bodies, JSON_REMOVE( - SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round'), + custom_metrics, '$.a11y', '$.cms', '$.cookies', @@ -257,18 +254,26 @@ SELECT '$.wpt_bodies' ) ) AS custom_metrics, - SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + lighthouse, features, technologies, JSON_REMOVE( - SAFE.PARSE_JSON(metadata, wide_number_mode => 'round'), + metadata, '$.page_id', '$.parent_page_id', '$.root_page_id' ) AS metadata -FROM \`all.pages\` -WHERE - date = '${iteration.month}' AND - client = '${iteration.client}' ${constants.devRankFilter}; +FROM ( + SELECT + * EXCEPT (custom_metrics, lighthouse, metadata, payload, summary), + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round') AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata, + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM \`all.pages\` + WHERE date = '${iteration.month}' AND + client = '${iteration.client}' ${constants.devRankFilter} +); `) }) diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js index 5c77204..2859534 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/all/reprocess_requests.js @@ -3,8 +3,6 @@ operate('all_requests_stable_pre').tags( ).queries(` CREATE SCHEMA IF NOT EXISTS all_dev; --- DROP TABLE IF EXISTS \`all_dev.requests_stable\`; - CREATE TABLE IF NOT EXISTS \`all_dev.requests_stable\` ( date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), @@ -37,36 +35,30 @@ OPTIONS( `) const iterations = [] -const types = ['= "script"', '= "image"', 'NOT IN ("script", "image")'] -// From 2022-07-01 till today for ( 
- let month = constants.currentMonth; month >= '2024-09-01'; month = constants.fnPastMonth(month)) { + let month = '2022-03-01'; month >= '2022-03-01'; month = constants.fnPastMonth(month)) { constants.clients.forEach((client) => { constants.booleans.forEach((isRootPage) => { - types.forEach((type) => { - iterations.push({ - month, - client, - isRootPage, - type - }) + iterations.push({ + month, + client, + isRootPage }) }) }) } iterations.forEach((iteration, i) => { - operate(`all_requests_stable ${iteration.month} ${iteration.client} ${iteration.isRootPage} ${i}`).tags( + operate(`all_requests_stable ${iteration.month} ${iteration.client} ${iteration.isRootPage}`).tags( ['all_requests_stable'] ).dependencies([ - i === 0 ? 'all_requests_stable_pre' : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage} ${i - 1}` + i === 0 ? 'all_requests_stable_pre' : `all_requests_stable3 ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` ]).queries(ctx => ` DELETE FROM \`all_dev.requests_stable\` WHERE date = '${iteration.month}' AND client = '${iteration.client}' - AND is_root_page = ${iteration.isRootPage} - AND type ${iteration.type}; + AND is_root_page = ${iteration.isRootPage}; CREATE TEMP FUNCTION PRUNE_HEADERS( jsonObject JSON @@ -86,25 +78,25 @@ try { INSERT INTO \`all_dev.requests_stable\` SELECT - requests.date, - requests.client, + date, + client, requests.page, - requests.is_root_page, - requests.root_page, + is_root_page, + root_page, crux.rank, - requests.url, - requests.is_main_document, - requests.type, - requests.index, + url, + is_main_document, + type, + index, JSON_REMOVE( - SAFE.PARSE_JSON(payload, wide_number_mode => 'round'), + payload, '$._headers', '$.request.headers', '$.response.headers' ) AS payload, PRUNE_HEADERS( JSON_REMOVE( - SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'), + summary, '$.crawlid', '$.firstHtml', '$.firstReq', @@ -118,16 
+110,18 @@ SELECT '$.urlShort' ) ) as summary, - requests.request_headers, - requests.response_headers, - requests.response_body + request_headers, + response_headers, + response_body FROM ( - SELECT * + SELECT + * EXCEPT (payload, summary), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary FROM \`all.requests\` ${constants.devTABLESAMPLE} WHERE date = '${iteration.month}' AND client = '${iteration.client}' AND is_root_page = ${iteration.isRootPage} - AND type ${iteration.type} ) AS requests LEFT JOIN ( SELECT DISTINCT From 4eb39ae4517402ef399194005e0a81e17570c988 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 15:03:07 +0200 Subject: [PATCH 31/44] fix --- definitions/output/all/reprocess_requests.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js index 2859534..3d7bf42 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/all/reprocess_requests.js @@ -53,7 +53,7 @@ iterations.forEach((iteration, i) => { operate(`all_requests_stable ${iteration.month} ${iteration.client} ${iteration.isRootPage}`).tags( ['all_requests_stable'] ).dependencies([ - i === 0 ? 'all_requests_stable_pre' : `all_requests_stable3 ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` + i === 0 ? 
'all_requests_stable_pre' : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` ]).queries(ctx => ` DELETE FROM \`all_dev.requests_stable\` WHERE date = '${iteration.month}' From 8d54b1b9e38ff6cf9af9af3a63edf61348380ccf Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 15:45:36 +0200 Subject: [PATCH 32/44] requests complete --- definitions/output/all/backfill_requests.js | 71 +++++++++++++-------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/all/backfill_requests.js index 01c419a..4c2dfd2 100644 --- a/definitions/output/all/backfill_requests.js +++ b/definitions/output/all/backfill_requests.js @@ -35,8 +35,9 @@ iterations.forEach((iteration, i) => { ]).dependencies([ i === 0 ? '' : `backfill_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM ${ctx.resolve('all', 'requests')} -WHERE date = '${iteration.date}' AND client = '${iteration.client}'; +DELETE FROM all_dev.requests_stable +WHERE date = '${iteration.date}' + AND client = '${iteration.client}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) RETURNS STRING @@ -124,12 +125,12 @@ AS """ } """; -CREATE TEMP FUNCTION parse_headers(headers STRING) +CREATE TEMP FUNCTION parse_headers(headers JSON) RETURNS ARRAY> LANGUAGE js AS """ try { - return JSON.parse(headers).map(header => { + return headers.map(header => { return { name: header.name, value: header.value }; }); } catch (e) { @@ -137,7 +138,7 @@ AS """ } """; -INSERT INTO \`all_dev.requests_stable\` --${ctx.resolve('all', 'requests')} +INSERT INTO all_dev.requests_stable SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, @@ -147,39 +148,51 @@ SELECT crux.rank AS rank, requests.url AS url, IF( - SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._request_type') AS STRING) = "Document" AND - 
MIN(SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64)) OVER (PARTITION BY requests.page) = SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64), + SAFE_CAST(payload._request_type AS STRING) = "Document" AND + MIN(SAFE_CAST(payload._index AS INT64)) OVER (PARTITION BY requests.page) = SAFE_CAST(payload._index AS INT64), TRUE, FALSE ) AS is_main_document, - get_type(JSON_VALUE(requests.payload, '$.response.content.mimeType'), get_ext_from_url(requests.url)) AS type, - SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._index') AS INT64) AS index, - SAFE.PARSE_JSON(requests.payload, wide_number_mode => 'round') AS payload, + get_type(SAFE_CAST(payload.response.content.mimeType AS STRING), ext_from_url) AS type, + SAFE_CAST(payload._index AS INT64) AS index, + JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) AS payload, TO_JSON( STRUCT( - SAFE_CAST(JSON_VALUE(requests.payload, '$.time') AS INTEGER) AS time, - JSON_VALUE(requests.payload, '$._method') AS method, + payload.time, + payload._method AS method, NULL AS redirectUrl, - JSON_VALUE(requests.payload, '$.request.httpVersion') AS reqHttpVersion, - JSON_VALUE(requests.payload, '$.request.headersSize') AS reqHeadersSize, - JSON_VALUE(requests.payload, '$.request.bodySize') AS reqBodySize, + IFNULL(SAFE_CAST(payload._protocol AS STRING), SAFE_CAST(payload.request.httpVersion AS STRING) AS reqHttpVersion, + payload.request.headersSize AS reqHeadersSize, + payload.request.bodySize AS reqBodySize, NULL AS reqCookieLen, - JSON_VALUE(requests.payload, '$.response.status') AS status, - JSON_VALUE(requests.payload, '$.response.httpVersion') AS respHttpVersion, - JSON_VALUE(requests.payload, '$.response.headersSize') AS respHeadersSize, - JSON_VALUE(requests.payload, '$.response.bodySize') AS respBodySize, - JSON_VALUE(requests.payload, '$.response.content.size') AS respSize, + payload.response.status, + payload.response.httpVersion AS respHttpVersion, + 
payload.response.headersSize AS respHeadersSize, + payload.response.bodySize AS respBodySize, + payload.response.content.size AS respSize, NULL AS respCookieLen, NULL AS expAge, - JSON_VALUE(requests.payload, '$.response.content.mimeType') AS mimeType, - JSON_VALUE(requests.payload, '$._cdn_provider') AS _cdn_provide, - JSON_VALUE(requests.payload, '$._gzip_save') AS _gzip_save, - NULL AS ext, + payload.response.content.mimeType, + payload._cdn_provider, + payload._gzip_save, + ext_from_url AS ext, NULL AS format )) AS summary, - parse_headers(JSON_QUERY(payload, '$.request.headers')) AS request_headers, - parse_headers(JSON_QUERY(payload, '$.response.headers')) AS response_headers, + parse_headers(payload.request.headers) AS request_headers, + parse_headers(payload.response.headers) AS response_headers, response_bodies.body AS response_body -FROM requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS requests ${constants.dev_TABLESAMPLE} +FROM ( + SELECT + * EXCEPT (payload), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + get_ext_from_url(requests.url) AS ext_from_url + FROM requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.dev_TABLESAMPLE} +) AS requests + LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, @@ -188,7 +201,9 @@ LEFT JOIN ( WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} ) AS crux ON requests.page = crux.page + LEFT JOIN response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.dev_TABLESAMPLE} -ON requests.page = response_bodies.page AND requests.url = response_bodies.url; +ON requests.page = response_bodies.page + AND requests.url = response_bodies.url; `) }) From a38efe068ff7685110fbce3176eb9ae3bd461c08 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 16:36:35 +0200 Subject: [PATCH 33/44] fix 
casts --- definitions/output/all/backfill_requests.js | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/all/backfill_requests.js index 4c2dfd2..594e2ae 100644 --- a/definitions/output/all/backfill_requests.js +++ b/definitions/output/all/backfill_requests.js @@ -3,8 +3,8 @@ const clients = constants.clients let midMonth for ( - let date = '2016-01-01'; // 2022-06-01 - date >= '2016-01-01'; // 2016-01-01 + let date = '2016-01-01'; + date >= '2016-01-01'; date = constants.fnPastMonth(date) ) { clients.forEach((client) => { @@ -148,13 +148,13 @@ SELECT crux.rank AS rank, requests.url AS url, IF( - SAFE_CAST(payload._request_type AS STRING) = "Document" AND - MIN(SAFE_CAST(payload._index AS INT64)) OVER (PARTITION BY requests.page) = SAFE_CAST(payload._index AS INT64), + STRING(payload._request_type) = "Document" AND + MIN(INT64(payload._index)) OVER (PARTITION BY requests.page) = INT64(payload._index), TRUE, FALSE ) AS is_main_document, - get_type(SAFE_CAST(payload.response.content.mimeType AS STRING), ext_from_url) AS type, - SAFE_CAST(payload._index AS INT64) AS index, + get_type(STRING(payload.response.content.mimeType), ext_from_url) AS type, + INT64(payload._index) AS index, JSON_REMOVE( payload, '$._headers', @@ -165,7 +165,7 @@ SELECT payload.time, payload._method AS method, NULL AS redirectUrl, - IFNULL(SAFE_CAST(payload._protocol AS STRING), SAFE_CAST(payload.request.httpVersion AS STRING) AS reqHttpVersion, + IFNULL(STRING(payload._protocol), STRING(payload.request.httpVersion)) AS reqHttpVersion, payload.request.headersSize AS reqHeadersSize, payload.request.bodySize AS reqBodySize, NULL AS reqCookieLen, @@ -189,8 +189,8 @@ FROM ( SELECT * EXCEPT (payload), SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, - get_ext_from_url(requests.url) AS ext_from_url - FROM requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} 
${constants.dev_TABLESAMPLE} + get_ext_from_url(url) AS ext_from_url + FROM requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} ) AS requests LEFT JOIN ( @@ -202,7 +202,7 @@ LEFT JOIN ( ) AS crux ON requests.page = crux.page -LEFT JOIN response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.dev_TABLESAMPLE} +LEFT JOIN response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.devTABLESAMPLE} ON requests.page = response_bodies.page AND requests.url = response_bodies.url; `) From 86fff73ec0877014448044c75179de2219c6f931 Mon Sep 17 00:00:00 2001 From: Max Ostapenko Date: Sun, 20 Oct 2024 16:09:07 +0000 Subject: [PATCH 34/44] wptid from summary --- definitions/output/all/backfill_pages.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index 64cdb3e..7132147 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -92,7 +92,7 @@ SELECT TRUE AS is_root_page, pages.url AS root_page, crux.rank AS rank, - STRING(payload.testID) AS wptid, + summary_pages.wptid, JSON_REMOVE( payload, '$._metadata', From e030acce63242e61bfbd9a537b1be758e8b7288d Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Sun, 20 Oct 2024 18:09:25 +0200 Subject: [PATCH 35/44] Update definitions/output/all/backfill_requests.js --- definitions/output/all/backfill_requests.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/all/backfill_requests.js index 594e2ae..4f10388 100644 --- a/definitions/output/all/backfill_requests.js +++ b/definitions/output/all/backfill_requests.js @@ -184,7 +184,7 @@ SELECT )) AS summary, parse_headers(payload.request.headers) AS 
request_headers, parse_headers(payload.response.headers) AS response_headers, - response_bodies.body AS response_body + IF(requests.type = 'image', NULL, response_bodies.body) AS response_body FROM ( SELECT * EXCEPT (payload), From c8e234302c5d6121443c751ddcbf8af5342132db Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 21 Oct 2024 20:49:56 +0200 Subject: [PATCH 36/44] summary update --- definitions/output/all/backfill_pages.js | 7 +- definitions/output/all/backfill_requests.js | 274 +++++++++++++------ definitions/output/all/reprocess_requests.js | 2 +- 3 files changed, 188 insertions(+), 95 deletions(-) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/all/backfill_pages.js index 7132147..411a1a9 100644 --- a/definitions/output/all/backfill_pages.js +++ b/definitions/output/all/backfill_pages.js @@ -53,8 +53,9 @@ try { } """; -CREATE TEMP FUNCTION GET_FEATURES(payload JSON) -RETURNS ARRAY> LANGUAGE js AS +CREATE TEMP FUNCTION getFeatures(payload JSON) +RETURNS ARRAY> +LANGUAGE js AS ''' function getFeatureNames(featureMap, featureType) { try { @@ -273,7 +274,7 @@ SELECT ) ) AS custom_metrics, NULL AS lighthouse, - GET_FEATURES(pages.payload) AS features, + getFeatures(pages.payload) AS features, tech.technologies AS technologies, pages.payload._metadata AS metadata FROM ( diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/all/backfill_requests.js index 4f10388..9293d7a 100644 --- a/definitions/output/all/backfill_requests.js +++ b/definitions/output/all/backfill_requests.js @@ -43,86 +43,120 @@ CREATE TEMP FUNCTION get_ext_from_url(url STRING) RETURNS STRING LANGUAGE js AS """ - try { - let ret_ext = url; +try { + let ret_ext = url; - // Remove query parameters - const i_q = ret_ext.indexOf("?"); - if (i_q > -1) { - ret_ext = ret_ext.substring(0, i_q); - } + // Remove query parameters + const i_q = ret_ext.indexOf("?"); + if (i_q > -1) { + ret_ext = 
ret_ext.substring(0, i_q); + } + + // Get the last segment of the path after the last "/" + ret_ext = ret_ext.substring(ret_ext.lastIndexOf("/") + 1); - // Get the last segment of the path after the last "/" - ret_ext = ret_ext.substring(ret_ext.lastIndexOf("/") + 1); + // Find the position of the last dot + const i_dot = ret_ext.lastIndexOf("."); - // Find the position of the last dot - const i_dot = ret_ext.lastIndexOf("."); + if (i_dot === -1) { + // No dot means no extension + ret_ext = ""; + } else { + // Extract the extension + ret_ext = ret_ext.substring(i_dot + 1); - if (i_dot === -1) { - // No dot means no extension + // Weed out overly long extensions + if (ret_ext.length > 5) { ret_ext = ""; - } else { - // Extract the extension - ret_ext = ret_ext.substring(i_dot + 1); + } + } - // Weed out overly long extensions - if (ret_ext.length > 5) { - ret_ext = ""; - } + return ret_ext.toLowerCase(); +} catch (e) { + return ""; // Return an empty string in case of any errors +} +"""; + +CREATE TEMP FUNCTION prettyType(mimeTyp STRING, ext STRING) +RETURNS STRING +LANGUAGE js +AS """ +try { + mimeTyp = mimeTyp.toLowerCase(); + + // Order by most unique first. + // Do NOT do html because "text/html" is often misused for other types. We catch it below. 
+ const types = ["font", "css", "image", "script", "video", "audio", "xml"]; + for (const typ of types) { + if (mimeTyp.includes(typ)) { + return typ; } + } - return ret_ext.toLowerCase(); - } catch (e) { - return ""; // Return an empty string in case of any errors + // Special cases found manually + if (ext === "js") { + return "script"; + } else if (mimeTyp.includes("json") || ext === "json") { + return "json"; + } else if (["eot", "ttf", "woff", "woff2", "otf"].includes(ext)) { + return "font"; + } else if (["png", "gif", "jpg", "jpeg", "webp", "ico", "svg", "avif", "jxl", "heic", "heif"].includes(ext)) { + return "image"; + } else if (ext === "css") { + return "css"; + } else if (ext === "xml") { + return "xml"; + } else if ( + ["flash", "webm", "mp4", "flv"].some((typ) => mimeTyp.includes(typ)) || + ["mp4", "webm", "ts", "m4v", "m4s", "mov", "ogv", "swf", "f4v", "flv"].includes(ext) + ) { + return "video"; + } else if (mimeTyp.includes("wasm") || ext === "wasm") { + return "wasm"; + } else if (mimeTyp.includes("html") || ["html", "htm"].includes(ext)) { + return "html"; // Catch "text/html" mime type + } else if (mimeTyp.includes("text")) { + return "text"; // Put "text" LAST because it's often misused, so ext should take precedence + } else { + return "other"; } +} catch (e) { + return "other"; // Return "other" if there's any error +} """; -CREATE TEMP FUNCTION get_type(mime_typ STRING, ext STRING) +CREATE TEMP FUNCTION getFormat(prettyTyp STRING, mimeTyp STRING, ext STRING) RETURNS STRING LANGUAGE js AS """ - try { - mime_typ = mime_typ.toLowerCase(); - - // Order by most unique types first - const uniqueTypes = ["font", "css", "image", "script", "video", "audio", "xml"]; - for (let typ of uniqueTypes) { - if (mime_typ.includes(typ)) { - return typ; +try { + if (prettyTyp === "image") { + // Order by most popular first. 
+ const imageTypes = ["jpg", "png", "gif", "webp", "svg", "ico", "avif", "jxl", "heic", "heif"]; + for (const typ of imageTypes) { + if (mimeTyp.includes(typ) || typ === ext) { + return typ; + } } - } + if (mimeTyp.includes("jpeg")) { + return "jpg"; + } + } - // Special cases - if (mime_typ.includes("json") || ["js", "json"].includes(ext)) { - return "script"; - } else if (["eot", "ttf", "woff", "woff2", "otf"].includes(ext)) { - return "font"; - } else if ( - ["png", "gif", "jpg", "jpeg", "webp", "ico", "svg", "avif", "jxl", "heic", "heif"].includes(ext) - ) { - return "image"; - } else if (ext === "css") { - return "css"; - } else if (ext === "xml") { - return "xml"; - } else if ( - ["mp4", "webm", "ts", "m4v", "m4s", "mov", "ogv", "swf", "f4v", "flv"].includes(ext) || - ["flash", "webm", "mp4", "flv"].some(typ => mime_typ.includes(typ)) - ) { - return "video"; - } else if (mime_typ.includes("wasm") || ext === "wasm") { - return "wasm"; - } else if (mime_typ.includes("html") || ["html", "htm"].includes(ext)) { - return "html"; - } else if (mime_typ.includes("text")) { - // Put "text" last because it is often misused, so extension should take precedence. - return "text"; - } else { - return "other"; - } - } catch (e) { - return "other"; // Return "other" if there's any error + if (prettyTyp === "video") { + // Order by most popular first. 
+ const videoTypes = ["flash", "swf", "mp4", "flv", "f4v"]; + for (const typ of videoTypes) { + if (mimeTyp.includes(typ) || typ === ext) { + return typ; + } + } } + + return ""; +} catch (e) { + return ""; +} """; CREATE TEMP FUNCTION parse_headers(headers JSON) @@ -138,6 +172,61 @@ AS """ } """; +CREATE TEMP FUNCTION getCookieLen(headers JSON, cookieName STRING) +RETURNS INT64 +LANGUAGE js +AS """ + try { + const cookies = headers.find(header => header.name.toLowerCase() === cookieName) + if (!cookies) { + return 0 + } else if (typeof cookies === 'object') { + return cookies.value.length + } else if (Array.isArray(cookies)) { + return cookies.values().reduce((acc, cookie) => acc + cookie.value.length, 0) + } else { + return 0 + } + } catch (e) { + return 0; // Return 0 in case of any errors + } +"""; + +CREATE TEMP FUNCTION getExpAge(startedDateTime STRING, request JSON, response JSON) +RETURNS INT64 +LANGUAGE js +AS """ + try { + expAge = 0; + + // Get the Cache-Control header value + const cacheControl = request.headers.find(header => header.name.toLowerCase() === 'cache-control').value; + + // Handle no-cache scenarios + if (cacheControl && (cacheControl.includes("must-revalidate") || cacheControl.includes("no-cache") || cacheControl.includes("no-store"))) { + expAge = 0; + } + + // Handle max-age directive in Cache-Control header + else if (cacheControl && cacheControl.includes("max-age")) { + const maxAgeValue = cacheControl.match(/max-age=(\\d+)/)[1]; + expAge = min(2**63 - 1, parseInt(maxAgeValue); + } + + // Handle Expires header in the response + else if (response.headers.find(header => header.name.toLowerCase() === 'expires')) { + const respDate = response.headers.find(header => header.name.toLowerCase() === 'date').value + startDate = new Date(respDate).getTime() ? 
respDate : startedDateTime; + const endDate = new Date(response.headers.find(header => header.name.toLowerCase() === 'expires').value).getTime(); + expAge = endDate - startDate; + } + + return expAge; + } catch (e) { + return 0; // Return 0 in case of any errors + } +"""; + INSERT INTO all_dev.requests_stable SELECT DATE('${iteration.date}') AS date, @@ -149,48 +238,51 @@ SELECT requests.url AS url, IF( STRING(payload._request_type) = "Document" AND - MIN(INT64(payload._index)) OVER (PARTITION BY requests.page) = INT64(payload._index), + MIN(index) OVER (PARTITION BY requests.page) = index, TRUE, FALSE ) AS is_main_document, - get_type(STRING(payload.response.content.mimeType), ext_from_url) AS type, - INT64(payload._index) AS index, - JSON_REMOVE( - payload, - '$._headers', - '$.request.headers', - '$.response.headers' - ) AS payload, + type, + index, + payload, TO_JSON( STRUCT( payload.time, payload._method AS method, - NULL AS redirectUrl, - IFNULL(STRING(payload._protocol), STRING(payload.request.httpVersion)) AS reqHttpVersion, - payload.request.headersSize AS reqHeadersSize, - payload.request.bodySize AS reqBodySize, - NULL AS reqCookieLen, - payload.response.status, - payload.response.httpVersion AS respHttpVersion, - payload.response.headersSize AS respHeadersSize, - payload.response.bodySize AS respBodySize, - payload.response.content.size AS respSize, - NULL AS respCookieLen, - NULL AS expAge, - payload.response.content.mimeType, + response.url AS redirectUrl, + IFNULL(STRING(payload._protocol), STRING(request.httpVersion)) AS reqHttpVersion, + request.headersSize AS reqHeadersSize, + request.bodySize AS reqBodySize, + getCookieLen(request.headers, 'cookie') AS reqCookieLen, + response.status, + response.httpVersion AS respHttpVersion, + response.headersSize AS respHeadersSize, + response.bodySize AS respBodySize, + response.content.size AS respSize, + getCookieLen(response.headers, 'set-cookie') AS respCookieLen, + 
getExpAge(STRING(payload.startedDateTime), request, response) AS expAge, + response.content.mimeType, payload._cdn_provider, payload._gzip_save, - ext_from_url AS ext, - NULL AS format + ext, + getFormat(type, response.content.mimeType, ext) AS format )) AS summary, parse_headers(payload.request.headers) AS request_headers, parse_headers(payload.response.headers) AS response_headers, IF(requests.type = 'image', NULL, response_bodies.body) AS response_body FROM ( - SELECT - * EXCEPT (payload), - SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, - get_ext_from_url(url) AS ext_from_url FROM requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} + |> SET payload = SAFE.PARSE_JSON(payload, wide_number_mode => 'round') + |> EXTEND get_ext_from_url(url) AS ext + |> EXTEND prettyType(STRING(payload.response.content.mimeType), ext_from_url) AS type + |> EXTEND INT64(payload._index) AS index + |> EXTEND payload.request AS request + |> EXTEND payload.response AS response + |> SET payload = JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) ) AS requests LEFT JOIN ( diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js index 3d7bf42..e59982c 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/all/reprocess_requests.js @@ -72,7 +72,7 @@ try { } return jsonObject; } catch (e) { - return null; + return jsonObject; } '''; From 603243490324fcb7f5d6d67ca1414415c59d3f14 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 21 Oct 2024 21:15:52 +0200 Subject: [PATCH 37/44] only valid other headers --- definitions/output/all/backfill_summary_requests.js | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/all/backfill_summary_requests.js index 
a18ece8..1df1096 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/all/backfill_summary_requests.js @@ -159,12 +159,15 @@ RETURNS ARRAY> LANGUAGE js AS """ try { - return headers.split(', ').map(header => { - const [name, value] = header.split(' = '); - return { name: name.trim(), value: value.trim() }; - }); + const parsedHeaders = headers.split(', ').map(header => { + const [name, value] = header.split(' = ') + if (name && value) { + return { name: name.trim(), value: value.trim() } + } + }) + return parsedHeaders.filter(Object) } catch (e) { - return []; + return e } """; From e61df0a538cfc1fcc0cfcd54118061c7f8f3b6ca Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 21 Oct 2024 21:27:11 +0200 Subject: [PATCH 38/44] move tables --- definitions/output/{all => crawl}/backfill_pages.js | 4 ++-- definitions/output/{all => crawl}/backfill_requests.js | 4 ++-- .../output/{all => crawl}/backfill_summary_pages.js | 4 ++-- .../output/{all => crawl}/backfill_summary_requests.js | 4 ++-- definitions/output/{all => crawl}/reprocess_pages.js | 8 ++++---- definitions/output/{all => crawl}/reprocess_requests.js | 8 ++++---- 6 files changed, 16 insertions(+), 16 deletions(-) rename definitions/output/{all => crawl}/backfill_pages.js (99%) rename definitions/output/{all => crawl}/backfill_requests.js (99%) rename definitions/output/{all => crawl}/backfill_summary_pages.js (98%) rename definitions/output/{all => crawl}/backfill_summary_requests.js (98%) rename definitions/output/{all => crawl}/reprocess_pages.js (97%) rename definitions/output/{all => crawl}/reprocess_requests.js (96%) diff --git a/definitions/output/all/backfill_pages.js b/definitions/output/crawl/backfill_pages.js similarity index 99% rename from definitions/output/all/backfill_pages.js rename to definitions/output/crawl/backfill_pages.js index 411a1a9..30da147 100644 --- a/definitions/output/all/backfill_pages.js +++ 
b/definitions/output/crawl/backfill_pages.js @@ -33,7 +33,7 @@ iterations.forEach((iteration, i) => { ]).dependencies([ i === 0 ? '' : `backfill_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM all_dev.pages_stable +DELETE FROM crawl.pages WHERE date = '${iteration.date}' AND client = '${iteration.client}'; @@ -85,7 +85,7 @@ LANGUAGE js AS .concat(getFeatureNames(blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); '''; -INSERT INTO all_dev.pages_stable +INSERT INTO crawl.pages SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, diff --git a/definitions/output/all/backfill_requests.js b/definitions/output/crawl/backfill_requests.js similarity index 99% rename from definitions/output/all/backfill_requests.js rename to definitions/output/crawl/backfill_requests.js index 9293d7a..14cb923 100644 --- a/definitions/output/all/backfill_requests.js +++ b/definitions/output/crawl/backfill_requests.js @@ -35,7 +35,7 @@ iterations.forEach((iteration, i) => { ]).dependencies([ i === 0 ? '' : `backfill_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM all_dev.requests_stable +DELETE FROM crawl.requests WHERE date = '${iteration.date}' AND client = '${iteration.client}'; @@ -227,7 +227,7 @@ AS """ } """; -INSERT INTO all_dev.requests_stable +INSERT INTO crawl.requests SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, diff --git a/definitions/output/all/backfill_summary_pages.js b/definitions/output/crawl/backfill_summary_pages.js similarity index 98% rename from definitions/output/all/backfill_summary_pages.js rename to definitions/output/crawl/backfill_summary_pages.js index 858fe88..592d051 100644 --- a/definitions/output/all/backfill_summary_pages.js +++ b/definitions/output/crawl/backfill_summary_pages.js @@ -129,11 +129,11 @@ iterations.forEach((iteration, i) => { ]).dependencies([ i === 0 ? 
'' : `backfill_summary_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM all_dev.pages_stable +DELETE FROM crawl.pages WHERE date = '${iteration.date}' AND client = '${iteration.client}'; -INSERT INTO all_dev.pages_stable +INSERT INTO crawl.pages SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, diff --git a/definitions/output/all/backfill_summary_requests.js b/definitions/output/crawl/backfill_summary_requests.js similarity index 98% rename from definitions/output/all/backfill_summary_requests.js rename to definitions/output/crawl/backfill_summary_requests.js index 1df1096..b9e7ab2 100644 --- a/definitions/output/all/backfill_summary_requests.js +++ b/definitions/output/crawl/backfill_summary_requests.js @@ -65,7 +65,7 @@ iterations.forEach((iteration, i) => { ]).dependencies([ i === 0 ? '' : `backfill_summary_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM ${ctx.resolve('all', 'requests')} +DELETE FROM crawl.requests WHERE date = '${iteration.date}' AND client = '${iteration.client}'; CREATE TEMP FUNCTION get_ext_from_url(url STRING) @@ -171,7 +171,7 @@ AS """ } """; -INSERT INTO all_dev.requests_stable +INSERT INTO crawl.requests SELECT DATE('${iteration.date}') AS date, '${iteration.client}' AS client, diff --git a/definitions/output/all/reprocess_pages.js b/definitions/output/crawl/reprocess_pages.js similarity index 97% rename from definitions/output/all/reprocess_pages.js rename to definitions/output/crawl/reprocess_pages.js index 1e7b5a8..7ebdb5f 100644 --- a/definitions/output/all/reprocess_pages.js +++ b/definitions/output/crawl/reprocess_pages.js @@ -1,9 +1,9 @@ operate('all_pages_stable_pre').tags( ['all_pages_stable'] ).queries(` -CREATE SCHEMA IF NOT EXISTS all_dev; +CREATE SCHEMA IF NOT EXISTS crawl; -CREATE TABLE IF NOT EXISTS \`all_dev.pages_stable\` +CREATE TABLE IF NOT EXISTS crawl.pages ( date DATE NOT NULL 
OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), @@ -75,11 +75,11 @@ iterations.forEach((iteration, i) => { ]).dependencies([ i === 0 ? 'all_pages_stable_pre' : `all_pages_stable_update ${iterations[i - 1].month} ${iterations[i - 1].client}` ]).queries(ctx => ` -DELETE FROM \`all_dev.pages_stable\` +DELETE FROM crawl.pages WHERE date = '${iteration.month}' AND client = '${iteration.client}'; -INSERT INTO \`all_dev.pages_stable\` +INSERT INTO crawl.pages SELECT date, client, diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/crawl/reprocess_requests.js similarity index 96% rename from definitions/output/all/reprocess_requests.js rename to definitions/output/crawl/reprocess_requests.js index e59982c..eabd356 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/crawl/reprocess_requests.js @@ -1,9 +1,9 @@ operate('all_requests_stable_pre').tags( ['all_requests_stable'] ).queries(` -CREATE SCHEMA IF NOT EXISTS all_dev; +CREATE SCHEMA IF NOT EXISTS crawl; -CREATE TABLE IF NOT EXISTS \`all_dev.requests_stable\` +CREATE TABLE IF NOT EXISTS crawl.requests ( date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), @@ -55,7 +55,7 @@ iterations.forEach((iteration, i) => { ).dependencies([ i === 0 ? 
'all_requests_stable_pre' : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` ]).queries(ctx => ` -DELETE FROM \`all_dev.requests_stable\` +DELETE FROM crawl.requests WHERE date = '${iteration.month}' AND client = '${iteration.client}' AND is_root_page = ${iteration.isRootPage}; @@ -76,7 +76,7 @@ try { } '''; -INSERT INTO \`all_dev.requests_stable\` +INSERT INTO crawl.requests SELECT date, client, From b2e7b7daa48bb5e6b04b8e3745949b01b5c6fc66 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:41:12 +0200 Subject: [PATCH 39/44] fix json parsing --- definitions/output/crawl/backfill_pages.js | 68 +++++++++++++--------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/definitions/output/crawl/backfill_pages.js b/definitions/output/crawl/backfill_pages.js index 30da147..7425d79 100644 --- a/definitions/output/crawl/backfill_pages.js +++ b/definitions/output/crawl/backfill_pages.js @@ -37,23 +37,29 @@ DELETE FROM crawl.pages WHERE date = '${iteration.date}' AND client = '${iteration.client}'; -CREATE TEMPORARY FUNCTION GET_OTHER_CUSTOM_METRICS( - jsonObject JSON, +CREATE TEMPORARY FUNCTION getOtherCustomMetrics( + payload JSON, keys ARRAY ) RETURNS JSON LANGUAGE js AS """ try { - let other_metrics = {}; - keys.forEach(function(key) { - other_metrics[key.substr(1)] = JSON.parse(jsonObject[key]); + let otherMetrics = {}; + let value = null; + keys.forEach(function (key) { + try { + value = JSON.parse(payload[key]) + } catch (e) { + value = payload[key] + } + otherMetrics[key.substr(1)] = value }); - return other_metrics; + return otherMetrics; } catch (e) { return null; } """; -CREATE TEMP FUNCTION getFeatures(payload JSON) +CREATE TEMP FUNCTION getFeatures(blinkFeatureFirstUsed JSON) RETURNS ARRAY> LANGUAGE js AS ''' @@ -76,7 +82,6 @@ LANGUAGE js AS } } - let blinkFeatureFirstUsed = payload._blinkFeatureFirstUsed; if 
(!blinkFeatureFirstUsed) return []; var idPattern = new RegExp('^Feature_(\\\\d+)$'); @@ -92,7 +97,14 @@ SELECT pages.url AS page, TRUE AS is_root_page, pages.url AS root_page, - crux.rank AS rank, + COALESCE( + summary_pages.rank, + CASE + WHEN summary_pages.rank <= 1000 THEN 1000 + WHEN summary_pages.rank <= 5000 THEN 5000 + ELSE NULL + END + ) AS rank, summary_pages.wptid, JSON_REMOVE( payload, @@ -249,39 +261,39 @@ SELECT wpt_bodies JSON, other JSON >( - payload._a11y, - payload._cms, - payload._cookies, + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._a11y"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._cms"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._cookies"), wide_number_mode => 'round'), payload["_css-variables"], - payload._ecommerce, - payload._element_count, - payload._javascript, - payload._markup, - payload._media, + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._ecommerce"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._element_count"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._javascript"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._markup"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._media"), wide_number_mode => 'round'), payload["_origin-trials"], payload._performance, - payload._privacy, - payload._responsive_images, - payload._robots_txt, - payload._security, - payload["_structured-data"], - payload["_third-parties"], - payload["_well-known"], - payload._wpt_bodies, - GET_OTHER_CUSTOM_METRICS( + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._privacy"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._responsive_images"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._robots_txt"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._security"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, 
"$._structured-data"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._third-parties"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._well-known"), wide_number_mode => 'round'), + SAFE.PARSE_JSON(JSON_VALUE(payload, "$._wpt_bodies"), wide_number_mode => 'round'), + getOtherCustomMetrics( payload, ["_Colordepth", "_Dpi", "_Images", "_Resolution", "_almanac", "_avg_dom_depth", "_css", "_doctype", "_document_height", "_document_width", "_event-names", "_fugu-apis", "_has_shadow_root", "_img-loading-attr", "_initiators", "_inline_style_bytes", "_lib-detector-version", "_localstorage_size", "_meta_viewport", "_num_iframes", "_num_scripts", "_num_scripts_async", "_num_scripts_sync", "_pwa", "_quirks_mode", "_sass", "_sessionstorage_size", "_usertiming"] ) ) AS custom_metrics, NULL AS lighthouse, - getFeatures(pages.payload) AS features, + getFeatures(payload._blinkFeatureFirstUsed) AS features, tech.technologies AS technologies, pages.payload._metadata AS metadata FROM ( SELECT * EXCEPT(payload), SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload - FROM pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} + FROM \`pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client}\` ${constants.devTABLESAMPLE} ) AS pages LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS summary_pages ${constants.devTABLESAMPLE} From 14816f8060189272bb7f471386f830f3bb47318a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:12:30 +0200 Subject: [PATCH 40/44] fix summary metrics --- definitions/output/crawl/backfill_pages.js | 2 +- definitions/output/crawl/backfill_requests.js | 91 ++++++++++--------- 2 files changed, 47 insertions(+), 46 deletions(-) diff --git a/definitions/output/crawl/backfill_pages.js b/definitions/output/crawl/backfill_pages.js index 
7425d79..4c6faee 100644 --- a/definitions/output/crawl/backfill_pages.js +++ b/definitions/output/crawl/backfill_pages.js @@ -98,7 +98,7 @@ SELECT TRUE AS is_root_page, pages.url AS root_page, COALESCE( - summary_pages.rank, + crux.rank, CASE WHEN summary_pages.rank <= 1000 THEN 1000 WHEN summary_pages.rank <= 5000 THEN 5000 diff --git a/definitions/output/crawl/backfill_requests.js b/definitions/output/crawl/backfill_requests.js index 14cb923..038df1b 100644 --- a/definitions/output/crawl/backfill_requests.js +++ b/definitions/output/crawl/backfill_requests.js @@ -39,10 +39,9 @@ DELETE FROM crawl.requests WHERE date = '${iteration.date}' AND client = '${iteration.client}'; -CREATE TEMP FUNCTION get_ext_from_url(url STRING) +CREATE TEMP FUNCTION getExtFromURL(url STRING) RETURNS STRING -LANGUAGE js -AS """ +LANGUAGE js AS """ try { let ret_ext = url; @@ -79,8 +78,7 @@ try { CREATE TEMP FUNCTION prettyType(mimeTyp STRING, ext STRING) RETURNS STRING -LANGUAGE js -AS """ +LANGUAGE js AS """ try { mimeTyp = mimeTyp.toLowerCase(); @@ -127,8 +125,7 @@ try { CREATE TEMP FUNCTION getFormat(prettyTyp STRING, mimeTyp STRING, ext STRING) RETURNS STRING -LANGUAGE js -AS """ +LANGUAGE js AS """ try { if (prettyTyp === "image") { // Order by most popular first. 
@@ -159,13 +156,12 @@ try { } """; -CREATE TEMP FUNCTION parse_headers(headers JSON) +CREATE TEMP FUNCTION parseHeaders(headers JSON) RETURNS ARRAY> -LANGUAGE js -AS """ +LANGUAGE js AS """ try { return headers.map(header => { - return { name: header.name, value: header.value }; + return { name: header.name.toLowerCase(), value: header.value }; }); } catch (e) { return []; @@ -174,14 +170,11 @@ AS """ CREATE TEMP FUNCTION getCookieLen(headers JSON, cookieName STRING) RETURNS INT64 -LANGUAGE js -AS """ +LANGUAGE js AS """ try { - const cookies = headers.find(header => header.name.toLowerCase() === cookieName) + const cookies = headers.filter(header => header.name.toLowerCase() === headerName) if (!cookies) { return 0 - } else if (typeof cookies === 'object') { - return cookies.value.length } else if (Array.isArray(cookies)) { return cookies.values().reduce((acc, cookie) => acc + cookie.value.length, 0) } else { @@ -192,38 +185,36 @@ AS """ } """; -CREATE TEMP FUNCTION getExpAge(startedDateTime STRING, request JSON, response JSON) +CREATE TEMP FUNCTION getExpAge(startedDateTime STRING, responseHeaders JSON) RETURNS INT64 -LANGUAGE js -AS """ +LANGUAGE js AS """ try { - expAge = 0; + const cacheControlRegExp = /max-age=(\\\\d+)/ // Get the Cache-Control header value - const cacheControl = request.headers.find(header => header.name.toLowerCase() === 'cache-control').value; + const cacheControl = responseHeaders.find(header => header.name.toLowerCase() === 'cache-control')?.value // Handle no-cache scenarios - if (cacheControl && (cacheControl.includes("must-revalidate") || cacheControl.includes("no-cache") || cacheControl.includes("no-store"))) { - expAge = 0; - } + if (cacheControl && (cacheControl.includes('must-revalidate') || cacheControl.includes('no-cache') || cacheControl.includes('no-store'))) { + return 0 + } else if (cacheControl && cacheControlRegExp.test(cacheControl)) { // Handle max-age directive in Cache-Control header + const maxAgeValue = 
parseInt(cacheControlRegExp.exec(cacheControl)[1]) + return Math.min(2 ** 63 - 1, maxAgeValue) + } else if ( // Handle Expires header in the response + responseHeaders.find(header => header.name.toLowerCase() === 'expires') + ) { + const respDate = responseHeaders.find(header => header.name.toLowerCase() === 'date')?.value + const startDate = new Date(respDate)?.getTime() || Date.parse(startedDateTime) - // Handle max-age directive in Cache-Control header - else if (cacheControl && cacheControl.includes("max-age")) { - const maxAgeValue = cacheControl.match(/max-age=(\\d+)/)[1]; - expAge = min(2**63 - 1, parseInt(maxAgeValue); - } + const expDate = responseHeaders.find(header => header.name.toLowerCase() === 'expires')?.value + const endDate = new Date(expDate)?.getTime() || 0 - // Handle Expires header in the response - else if (response.headers.find(header => header.name.toLowerCase() === 'expires')) { - const respDate = response.headers.find(header => header.name.toLowerCase() === 'date').value - startDate = new Date(respDate).getTime() ? 
respDate : startedDateTime; - const endDate = new Date(response.headers.find(header => header.name.toLowerCase() === 'expires').value).getTime(); - expAge = endDate - startDate; + return Math.max((endDate - startDate) / 1000, 0) } - return expAge; + return 0 } catch (e) { - return 0; // Return 0 in case of any errors + return 0 // Return 0 in case of any errors } """; @@ -234,7 +225,14 @@ SELECT requests.page AS page, TRUE AS is_root_page, requests.page AS root_page, - crux.rank AS rank, + COALESCE( + crux.rank, + CASE + WHEN summary_pages.rank <= 1000 THEN 1000 + WHEN summary_pages.rank <= 5000 THEN 5000 + ELSE NULL + END + ) AS rank, requests.url AS url, IF( STRING(payload._request_type) = "Document" AND @@ -259,21 +257,21 @@ SELECT response.bodySize AS respBodySize, response.content.size AS respSize, getCookieLen(response.headers, 'set-cookie') AS respCookieLen, - getExpAge(STRING(payload.startedDateTime), request, response) AS expAge, + getExpAge(STRING(payload.startedDateTime), response.headers) AS expAge, response.content.mimeType, payload._cdn_provider, payload._gzip_save, ext, - getFormat(type, response.content.mimeType, ext) AS format + getFormat(type, STRING(response.content.mimeType), ext) AS format )) AS summary, - parse_headers(payload.request.headers) AS request_headers, - parse_headers(payload.response.headers) AS response_headers, + parseHeaders(request.headers) AS request_headers, + parseHeaders(response.headers) AS response_headers, IF(requests.type = 'image', NULL, response_bodies.body) AS response_body FROM ( - FROM requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} + FROM \`requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client}\` ${constants.devTABLESAMPLE} |> SET payload = SAFE.PARSE_JSON(payload, wide_number_mode => 'round') - |> EXTEND get_ext_from_url(url) AS ext - |> EXTEND prettyType(STRING(payload.response.content.mimeType), ext_from_url) AS type + |> EXTEND 
getExtFromURL(url) AS ext + |> EXTEND prettyType(STRING(payload.response.content.mimeType), ext) AS type |> EXTEND INT64(payload._index) AS index |> EXTEND payload.request AS request |> EXTEND payload.response AS response @@ -294,6 +292,9 @@ LEFT JOIN ( ) AS crux ON requests.page = crux.page +LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS summary_pages ${constants.devTABLESAMPLE} +ON requests.page = summary_pages.url + LEFT JOIN response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.devTABLESAMPLE} ON requests.page = response_bodies.page AND requests.url = response_bodies.url; From 2898c8218abb8e7b983e58a037404ba2ffa4be83 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 22 Oct 2024 16:42:43 +0200 Subject: [PATCH 41/44] crawl pipeline updated --- README.md | 13 +- definitions/output/all/pages.js | 2 +- definitions/output/all/parsed_css.js | 21 +- definitions/output/all/requests.js | 2 +- definitions/output/crawl/pages.js | 468 ++++++++++++++++++ definitions/output/crawl/parsed_css.js | 32 ++ definitions/output/crawl/reprocess_pages.js | 10 +- .../output/crawl/reprocess_requests.js | 14 +- definitions/output/crawl/requests.js | 162 ++++++ definitions/output/sample_data/pages_10k.js | 4 +- .../output/sample_data/parsed_css_10k.js | 4 +- .../output/sample_data/requests_10k.js | 10 +- src/index.js | 2 +- 13 files changed, 697 insertions(+), 47 deletions(-) create mode 100644 definitions/output/crawl/pages.js create mode 100644 definitions/output/crawl/parsed_css.js create mode 100644 definitions/output/crawl/requests.js diff --git a/README.md b/README.md index 04cc1f9..abfb0cc 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ The pipelines are run in Dataform service in Google Cloud Platform (GCP) and are ### Crawl results -Tag: `crawl_results_all` +Tag: `crawl_complete` -- httparchive.all.pages -- 
httparchive.all.parsed_css -- httparchive.all.requests +- httparchive.crawl.pages +- httparchive.crawl.parsed_css +- httparchive.crawl.requests ### Core Web Vitals Technology Report @@ -39,6 +39,9 @@ Consumers: Tag: `crawl_results_legacy` +- httparchive.all.pages +- httparchive.all.parsed_css +- httparchive.all.requests - httparchive.lighthouse.YYYY_MM_DD_client - httparchive.pages.YYYY_MM_DD_client - httparchive.requests.YYYY_MM_DD_client @@ -51,7 +54,7 @@ Tag: `crawl_results_legacy` 1. [crawl-complete](https://console.cloud.google.com/cloudpubsub/subscription/detail/dataformTrigger?authuser=7&project=httparchive) PubSub subscription - Tags: ["crawl_results_all", "blink_features_report", "crawl_results_legacy"] + Tags: ["crawl_complete", "blink_features_report", "crawl_results_legacy"] 2. [bq-poller-cwv-tech-report](https://console.cloud.google.com/cloudscheduler/jobs/edit/us-east4/bq-poller-cwv-tech-report?authuser=7&project=httparchive) Scheduler diff --git a/definitions/output/all/pages.js b/definitions/output/all/pages.js index 308c958..e5b3490 100644 --- a/definitions/output/all/pages.js +++ b/definitions/output/all/pages.js @@ -7,7 +7,7 @@ publish('pages', { clusterBy: ['client', 'is_root_page', 'rank'], requirePartitionFilter: true }, - tags: ['crawl_results_all'] + tags: ['crawl_results_legacy'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}'; diff --git a/definitions/output/all/parsed_css.js b/definitions/output/all/parsed_css.js index 78ffcb4..c370e59 100644 --- a/definitions/output/all/parsed_css.js +++ b/definitions/output/all/parsed_css.js @@ -7,21 +7,10 @@ publish('parsed_css', { clusterBy: ['client', 'is_root_page', 'rank', 'page'], requirePartitionFilter: true }, - tags: ['crawl_results_all'] + tags: ['crawl_results_legacy'] }).preOps(ctx => ` -DELETE FROM ${ctx.self()} -WHERE date = '${constants.currentMonth}'; -`).query(ctx => ` -SELECT * -FROM ${ctx.ref('crawl_staging', 'parsed_css')} -WHERE date = 
'${constants.currentMonth}' - AND client = 'desktop' - ${constants.devRankFilter} -`).postOps(ctx => ` -INSERT INTO ${ctx.self()} -SELECT * -FROM ${ctx.ref('crawl_staging', 'parsed_css')} -WHERE date = '${constants.currentMonth}' - AND client = 'mobile' - ${constants.devRankFilter}; +DROP SNAPSHOT TABLE IF EXISTS ${ctx.self()}; + +CREATE SNAPSHOT TABLE ${ctx.self()} +CLONE ${ctx.resolve('crawl', 'parsed_css')}; `) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js index f91eada..9e50b0e 100644 --- a/definitions/output/all/requests.js +++ b/definitions/output/all/requests.js @@ -7,7 +7,7 @@ publish('requests', { clusterBy: ['client', 'is_root_page', 'is_main_document', 'type'], requirePartitionFilter: true }, - tags: ['crawl_results_all'] + tags: ['crawl_results_legacy'] }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE date = '${constants.currentMonth}'; diff --git a/definitions/output/crawl/pages.js b/definitions/output/crawl/pages.js new file mode 100644 index 0000000..85cd6bd --- /dev/null +++ b/definitions/output/crawl/pages.js @@ -0,0 +1,468 @@ +publish('pages', { + type: 'incremental', + protected: true, + schema: 'crawl', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'rank', 'page'], + requirePartitionFilter: true + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +CREATE SCHEMA IF NOT EXISTS crawl; + +CREATE TABLE IF NOT EXISTS ${ctx.self()} +( + date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), + client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), + page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), + is_root_page BOOL NOT NULL OPTIONS(description='Whether the page is the root of the origin'), + root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested, the origin followed by /'), + rank INT64 OPTIONS(description='Site popularity rank, from CrUX'), + wptid STRING 
OPTIONS(description='ID of the WebPageTest results'), + payload JSON OPTIONS(description='JSON-encoded WebPageTest results for the page'), + summary JSON OPTIONS(description='JSON-encoded summarization of the page-level data'), + custom_metrics STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + > OPTIONS(description='Custom metrics from WebPageTest'), + lighthouse JSON OPTIONS(description='JSON-encoded Lighthouse report'), + features ARRAY> OPTIONS(description='Blink features detected at runtime (see https://chromestatus.com/features)'), + technologies ARRAY OPTIONS(description='List of categories to which this technology belongs'), + info ARRAY OPTIONS(description='Additional metadata about the detected technology, ie version number') + >> OPTIONS(description='Technologies detected at runtime (see https://www.wappalyzer.com/)'), + metadata JSON OPTIONS(description='Additional metadata about the test') +) +PARTITION BY date +CLUSTER BY client, is_root_page, rank, page +OPTIONS( + require_partition_filter=true +); + +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'desktop'; +`).query(ctx => ` +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + JSON_REMOVE( + payload, + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + '$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + 
'$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + JSON_SET( + JSON_REMOVE( + summary, + '$._adult_site', + '$.archive', + '$.avg_dom_depth', + '$.crawlid', + '$.createDate', + '$.doctype', + '$.document_height', + '$.document_width', + '$.label', + '$.localstorage_size', + '$.meta_viewport', + '$.metadata', + '$.num_iframes', + '$.num_scripts_async', + '$.num_scripts_sync', + '$.num_scripts', + '$.pageid', + '$.PageSpeed', + '$.rank', + '$.sessionstorage_size', + '$.startedDateTime', + '$.url', + '$.urlhash', + '$.urlShort', + '$.usertiming', + '$.wptid', + '$.wptrun' + ), + '$.crux', + payload._CrUX + ) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + custom_metrics.a11y, + custom_metrics.cms, + custom_metrics.cookies, + custom_metrics["css-variables"], + custom_metrics.ecommerce, + 
custom_metrics.element_count, + custom_metrics.javascript, + custom_metrics.markup, + custom_metrics.media, + custom_metrics["origin-trials"], + custom_metrics.performance, + custom_metrics.privacy, + custom_metrics.responsive_images, + custom_metrics.robots_txt, + custom_metrics.security, + custom_metrics["structured-data"], + custom_metrics["third-parties"], + custom_metrics["well-known"], + custom_metrics.wpt_bodies, + JSON_REMOVE( + custom_metrics, + '$.a11y', + '$.cms', + '$.cookies', + '$.css-variables', + '$.ecommerce', + '$.element_count', + '$.javascript', + '$.markup', + '$.media', + '$.origin-trials', + '$.performance', + '$.privacy', + '$.responsive_images', + '$.robots_txt', + '$.security', + '$.structured-data', + '$.third-parties', + '$.well-known', + '$.wpt_bodies' + ) + ) AS custom_metrics, + lighthouse, + features, + technologies, + JSON_REMOVE( + metadata, + '$.page_id', + '$.parent_page_id', + '$.root_page_id' + ) AS metadata +FROM ( + SELECT + * EXCEPT (custom_metrics, lighthouse, metadata, payload, summary), + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round') AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata, + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'pages')} + WHERE date = '${constants.currentMonth}' AND + client = 'desktop' + ${constants.devRankFilter} +) +`).postOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'mobile'; + +INSERT INTO ${ctx.self()} +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + JSON_REMOVE( + payload, + '$._metadata', + '$._detected', + '$._detected_apps', + '$._detected_technologies', + '$._detected_raw', + '$._custom', + '$._00_reset', + '$._a11y', + '$._ads', + '$._almanac', + '$._aurora', + 
'$._avg_dom_depth', + '$._cms', + '$._Colordepth', + '$._cookies', + '$._crawl_links', + '$._css-variables', + '$._css', + '$._doctype', + '$._document_height', + '$._document_width', + '$._Dpi', + '$._ecommerce', + '$._element_count', + '$._event-names', + '$._fugu-apis', + '$._generated-content', + '$._has_shadow_root', + '$._Images', + '$._img-loading-attr', + '$._initiators', + '$._inline_style_bytes', + '$._javascript', + '$._lib-detector-version', + '$._localstorage_size', + '$._markup', + '$._media', + '$._meta_viewport', + '$._num_iframes', + '$._num_scripts_async', + '$._num_scripts_sync', + '$._num_scripts', + '$._observers', + '$._origin-trials', + '$._parsed_css', + '$._performance', + '$._privacy-sandbox', + '$._privacy', + '$._pwa', + '$._quirks_mode', + '$._Resolution', + '$._responsive_images', + '$._robots_meta', + '$._robots_txt', + '$._sass', + '$._security', + '$._sessionstorage_size', + '$._structured-data', + '$._third-parties', + '$._usertiming', + '$._valid-head', + '$._well-known', + '$._wpt_bodies', + '$._blinkFeatureFirstUsed', + '$._CrUX' + ) AS payload, + JSON_SET( + JSON_REMOVE( + summary, + '$._adult_site', + '$.archive', + '$.avg_dom_depth', + '$.crawlid', + '$.createDate', + '$.doctype', + '$.document_height', + '$.document_width', + '$.label', + '$.localstorage_size', + '$.meta_viewport', + '$.metadata', + '$.num_iframes', + '$.num_scripts_async', + '$.num_scripts_sync', + '$.num_scripts', + '$.pageid', + '$.PageSpeed', + '$.rank', + '$.sessionstorage_size', + '$.startedDateTime', + '$.url', + '$.urlhash', + '$.urlShort', + '$.usertiming', + '$.wptid', + '$.wptrun' + ), + '$.crux', + payload._CrUX + ) AS summary, + STRUCT< + a11y JSON, + cms JSON, + cookies JSON, + css_variables JSON, + ecommerce JSON, + element_count JSON, + javascript JSON, + markup JSON, + media JSON, + origin_trials JSON, + performance JSON, + privacy JSON, + responsive_images JSON, + robots_txt JSON, + security JSON, + structured_data JSON, + third_parties 
JSON, + well_known JSON, + wpt_bodies JSON, + other JSON + >( + custom_metrics.a11y, + custom_metrics.cms, + custom_metrics.cookies, + custom_metrics["css-variables"], + custom_metrics.ecommerce, + custom_metrics.element_count, + custom_metrics.javascript, + custom_metrics.markup, + custom_metrics.media, + custom_metrics["origin-trials"], + custom_metrics.performance, + custom_metrics.privacy, + custom_metrics.responsive_images, + custom_metrics.robots_txt, + custom_metrics.security, + custom_metrics["structured-data"], + custom_metrics["third-parties"], + custom_metrics["well-known"], + custom_metrics.wpt_bodies, + JSON_REMOVE( + custom_metrics, + '$.a11y', + '$.cms', + '$.cookies', + '$.css-variables', + '$.ecommerce', + '$.element_count', + '$.javascript', + '$.markup', + '$.media', + '$.origin-trials', + '$.performance', + '$.privacy', + '$.responsive_images', + '$.robots_txt', + '$.security', + '$.structured-data', + '$.third-parties', + '$.well-known', + '$.wpt_bodies' + ) + ) AS custom_metrics, + lighthouse, + features, + technologies, + JSON_REMOVE( + metadata, + '$.page_id', + '$.parent_page_id', + '$.root_page_id' + ) AS metadata +FROM ( + SELECT + * EXCEPT (custom_metrics, lighthouse, metadata, payload, summary), + SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round') AS custom_metrics, + SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, + SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata, + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'pages')} + WHERE date = '${constants.currentMonth}' AND + client = 'mobile' + ${constants.devRankFilter} +) +`) diff --git a/definitions/output/crawl/parsed_css.js b/definitions/output/crawl/parsed_css.js new file mode 100644 index 0000000..529bbe0 --- /dev/null +++ b/definitions/output/crawl/parsed_css.js @@ -0,0 +1,32 @@ +publish('parsed_css', { + type: 
'incremental', + protected: true, + schema: 'crawl', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'rank', 'page'], + requirePartitionFilter: true + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' + AND client = 'desktop'; +`).query(ctx => ` +SELECT * +FROM ${ctx.ref('crawl_staging', 'parsed_css')} +WHERE date = '${constants.currentMonth}' + AND client = 'desktop' + ${constants.devRankFilter} +`).postOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' + AND client = 'mobile'; + +INSERT INTO ${ctx.self()} +SELECT * +FROM ${ctx.ref('crawl_staging', 'parsed_css')} +WHERE date = '${constants.currentMonth}' + AND client = 'mobile' + ${constants.devRankFilter}; +`) diff --git a/definitions/output/crawl/reprocess_pages.js b/definitions/output/crawl/reprocess_pages.js index 7ebdb5f..f6f7b9e 100644 --- a/definitions/output/crawl/reprocess_pages.js +++ b/definitions/output/crawl/reprocess_pages.js @@ -1,5 +1,5 @@ -operate('all_pages_stable_pre').tags( - ['all_pages_stable'] +operate('reprocess_pages_pre').tags( + ['reprocess_pages'] ).queries(` CREATE SCHEMA IF NOT EXISTS crawl; @@ -70,10 +70,10 @@ for ( } iterations.forEach((iteration, i) => { - operate(`all_pages_stable_update ${iteration.month} ${iteration.client}`).tags([ - 'all_pages_stable' + operate(`reprocess_pages ${iteration.month} ${iteration.client}`).tags([ + 'reprocess_pages' ]).dependencies([ - i === 0 ? 'all_pages_stable_pre' : `all_pages_stable_update ${iterations[i - 1].month} ${iterations[i - 1].client}` + i === 0 ? 
'reprocess_pages_pre' : `reprocess_pages ${iterations[i - 1].month} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM crawl.pages WHERE date = '${iteration.month}' AND diff --git a/definitions/output/crawl/reprocess_requests.js b/definitions/output/crawl/reprocess_requests.js index eabd356..6434398 100644 --- a/definitions/output/crawl/reprocess_requests.js +++ b/definitions/output/crawl/reprocess_requests.js @@ -1,5 +1,5 @@ -operate('all_requests_stable_pre').tags( - ['all_requests_stable'] +operate('reprocess_requests_pre').tags( + ['reprocess_requests'] ).queries(` CREATE SCHEMA IF NOT EXISTS crawl; @@ -50,17 +50,17 @@ for ( } iterations.forEach((iteration, i) => { - operate(`all_requests_stable ${iteration.month} ${iteration.client} ${iteration.isRootPage}`).tags( - ['all_requests_stable'] + operate(`reprocess_requests ${iteration.month} ${iteration.client} ${iteration.isRootPage}`).tags( + ['reprocess_requests'] ).dependencies([ - i === 0 ? 'all_requests_stable_pre' : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` + i === 0 ? 
'reprocess_requests_pre' : `reprocess_requests ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` ]).queries(ctx => ` DELETE FROM crawl.requests WHERE date = '${iteration.month}' AND client = '${iteration.client}' AND is_root_page = ${iteration.isRootPage}; -CREATE TEMP FUNCTION PRUNE_HEADERS( +CREATE TEMP FUNCTION pruneHeaders( jsonObject JSON ) RETURNS JSON LANGUAGE js AS ''' @@ -94,7 +94,7 @@ SELECT '$.request.headers', '$.response.headers' ) AS payload, - PRUNE_HEADERS( + pruneHeaders( JSON_REMOVE( summary, '$.crawlid', diff --git a/definitions/output/crawl/requests.js b/definitions/output/crawl/requests.js new file mode 100644 index 0000000..5ca4922 --- /dev/null +++ b/definitions/output/crawl/requests.js @@ -0,0 +1,162 @@ +publish('requests', { + type: 'incremental', + protected: true, + schema: 'crawl', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'type', 'rank'], + requirePartitionFilter: true + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +CREATE SCHEMA IF NOT EXISTS crawl; + +CREATE TABLE IF NOT EXISTS ${ctx.self()} +( + date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), + client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), + page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), + is_root_page BOOL OPTIONS(description='Whether the page is the root of the origin.'), + root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested'), + rank INT64 OPTIONS(description='Site popularity rank, from CrUX'), + url STRING NOT NULL OPTIONS(description='The URL of the request'), + is_main_document BOOL NOT NULL OPTIONS(description='Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects'), + type STRING OPTIONS(description='Simplified description of the type of resource (script, html, css, text, other, etc)'), 
+ index INT64 OPTIONS(description='The sequential 0-based index of the request'), + payload JSON OPTIONS(description='JSON-encoded WebPageTest result data for this request'), + summary JSON OPTIONS(description='JSON-encoded summarization of request data'), + request_headers ARRAY> OPTIONS(description='Request headers'), + response_headers ARRAY> OPTIONS(description='Response headers'), + response_body STRING OPTIONS(description='Text-based response body') +) +PARTITION BY date +CLUSTER BY client, is_root_page, type, rank +OPTIONS( + require_partition_filter=true +); + +CREATE TEMP FUNCTION pruneHeaders( + jsonObject JSON +) RETURNS JSON +LANGUAGE js AS ''' +try { + for (const [key, value] of Object.entries(jsonObject)) { + if(key.startsWith('req_') || key.startsWith('resp_')) { + delete jsonObject[key] + } + } + return jsonObject +} catch (e) { + return jsonObject +} +'''; + +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'desktop'; +`).query(ctx => ` +SELECT + date, + client, + requests.page, + is_root_page, + root_page, + crux.rank, + url, + is_main_document, + type, + index, + JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) AS payload, + pruneHeaders( + JSON_REMOVE( + summary, + '$.crawlid', + '$.firstHtml', + '$.firstReq', + '$.pageid', + '$.reqOtherHeaders', + '$.requestid', + '$.respOtherHeaders', + '$.startedDateTime', + '$.type', + '$.url', + '$.urlShort' + ) + ) as summary, + request_headers, + response_headers, + response_body +FROM ( + SELECT + * EXCEPT (payload, summary), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'requests')} + WHERE date = '${constants.currentMonth}' + AND client = 'desktop' + ${constants.devTABLESAMPLE} +) +`).postOps(ctx => ` +DELETE FROM ${ctx.self()} +WHERE date = '${constants.currentMonth}' AND + client = 'mobile'; + +INSERT INTO 
${ctx.self()} +SELECT + date, + client, + requests.page, + is_root_page, + root_page, + crux.rank, + url, + is_main_document, + type, + index, + JSON_REMOVE( + payload, + '$._headers', + '$.request.headers', + '$.response.headers' + ) AS payload, + pruneHeaders( + JSON_REMOVE( + summary, + '$.crawlid', + '$.firstHtml', + '$.firstReq', + '$.pageid', + '$.reqOtherHeaders', + '$.requestid', + '$.respOtherHeaders', + '$.startedDateTime', + '$.type', + '$.url', + '$.urlShort' + ) + ) as summary, + request_headers, + response_headers, + response_body +FROM ( + SELECT + * EXCEPT (payload, summary), + SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, + SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary + FROM ${ctx.ref('crawl_staging', 'requests')} + WHERE date = '${constants.currentMonth}' + AND client = 'mobile' + ${constants.devTABLESAMPLE} +) +`) diff --git a/definitions/output/sample_data/pages_10k.js b/definitions/output/sample_data/pages_10k.js index 875baa6..eb6e4c2 100644 --- a/definitions/output/sample_data/pages_10k.js +++ b/definitions/output/sample_data/pages_10k.js @@ -5,12 +5,12 @@ publish('pages_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'rank'] }, - tags: ['crawl_results_all'] + tags: ['crawl_complete'] }).preOps(ctx => ` DROP TABLE IF EXISTS ${ctx.self()}; `).query(ctx => ` SELECT * -FROM ${ctx.ref('all', 'pages')} +FROM ${ctx.ref('crawl', 'pages')} WHERE date = '${constants.currentMonth}' AND rank <= 10000 `) diff --git a/definitions/output/sample_data/parsed_css_10k.js b/definitions/output/sample_data/parsed_css_10k.js index fd08f07..b22feba 100644 --- a/definitions/output/sample_data/parsed_css_10k.js +++ b/definitions/output/sample_data/parsed_css_10k.js @@ -5,12 +5,12 @@ publish('parsed_css_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'rank', 'page'] }, - tags: ['crawl_results_all'] + tags: ['crawl_complete'] }).preOps(ctx => ` DROP TABLE IF EXISTS ${ctx.self()}; `).query(ctx 
=> ` SELECT * -FROM ${ctx.ref('all', 'parsed_css')} +FROM ${ctx.ref('crawl', 'parsed_css')} WHERE date = '${constants.currentMonth}' AND rank <= 10000 `) diff --git a/definitions/output/sample_data/requests_10k.js b/definitions/output/sample_data/requests_10k.js index dc09cc1..08e8e14 100644 --- a/definitions/output/sample_data/requests_10k.js +++ b/definitions/output/sample_data/requests_10k.js @@ -5,16 +5,12 @@ publish('requests_10k', { partitionBy: 'date', clusterBy: ['client', 'is_root_page', 'is_main_document', 'type'] }, - tags: ['crawl_results_all'] + tags: ['crawl_complete'] }).preOps(ctx => ` DROP TABLE IF EXISTS ${ctx.self()}; `).query(ctx => ` SELECT * -FROM ${ctx.ref('all', 'requests')} +FROM ${ctx.ref('crawl', 'requests')} WHERE date = '${constants.currentMonth}' AND - -- rank <= 10000 -- TODO: use rank filtering when https://github.com/HTTPArchive/dataform/pull/5 is complete - page IN ( - SELECT page - FROM ${ctx.ref('sample_data', 'pages_10k')} - ) + rank <= 10000 `) diff --git a/src/index.js b/src/index.js index 8677da6..7d62b11 100644 --- a/src/index.js +++ b/src/index.js @@ -35,7 +35,7 @@ FROM ( actionArgs: { repoName: 'crawl-data', tags: [ - 'crawl_results_all', + 'crawl_complete', 'blink_features_report', 'crawl_results_legacy' ] From d94ca116299154ada00912a06c400243ecbb4fda Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:29:31 +0200 Subject: [PATCH 42/44] update dependents --- definitions/extra/test_env.js | 3 +- definitions/output/all/parsed_css.js | 2 +- definitions/output/blink_features/features.js | 33 +--------- definitions/output/blink_features/usage.js | 2 +- .../output/core_web_vitals/technologies.js | 55 +++++++---------- definitions/output/crawl/backfill_pages.js | 4 +- definitions/output/crawl/backfill_requests.js | 4 +- .../output/crawl/backfill_summary_pages.js | 2 +- .../output/crawl/backfill_summary_requests.js | 2 +- 
definitions/output/crawl/reprocess_pages.js | 60 +------------------ .../output/crawl/reprocess_requests.js | 38 +----------- 11 files changed, 35 insertions(+), 170 deletions(-) diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js index f804bfe..c9c4b03 100644 --- a/definitions/extra/test_env.js +++ b/definitions/extra/test_env.js @@ -1,4 +1,5 @@ const date = constants.currentMonth +operate('test') // List of resources to be copied to the test environment. Comment out the ones you don't need. const resourcesList = [ @@ -15,7 +16,7 @@ const resourcesList = [ resourcesList.forEach(resource => { operate( `test_table ${resource.datasetId}_dev_dev_${resource.tableId}` - ).queries(` + ).dependencies(['test']).queries(` CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev; DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId}; diff --git a/definitions/output/all/parsed_css.js b/definitions/output/all/parsed_css.js index c370e59..fcf9f48 100644 --- a/definitions/output/all/parsed_css.js +++ b/definitions/output/all/parsed_css.js @@ -12,5 +12,5 @@ publish('parsed_css', { DROP SNAPSHOT TABLE IF EXISTS ${ctx.self()}; CREATE SNAPSHOT TABLE ${ctx.self()} -CLONE ${ctx.resolve('crawl', 'parsed_css')}; +CLONE ${ctx.ref('crawl', 'parsed_css')}; `) diff --git a/definitions/output/blink_features/features.js b/definitions/output/blink_features/features.js index 260f2e4..f3b6d4b 100644 --- a/definitions/output/blink_features/features.js +++ b/definitions/output/blink_features/features.js @@ -10,37 +10,6 @@ publish('features', { }).preOps(ctx => ` DELETE FROM ${ctx.self()} WHERE yyyymmdd = DATE '${constants.currentMonth}'; - -CREATE TEMP FUNCTION features(payload STRING) -RETURNS ARRAY> LANGUAGE js AS -''' -function getFeatureNames(featureMap, featureType) { - try { - return Object.entries(featureMap).map(([key, value]) => { - // After Feb 2020 keys are feature IDs. 
- if (value.name) { - return {'name': value.name, 'type': featureType, 'id': key}; - } - // Prior to Feb 2020 keys fell back to IDs if the name was unknown. - if (idPattern.test(key)) { - return {'name': '', 'type': featureType, 'id': key.match(idPattern)[1]}; - } - // Prior to Feb 2020 keys were names by default. - return {'name': key, 'type': featureType, 'id': ''}; - }); - } catch (e) { - return []; - } -} - -var $ = JSON.parse(payload); -if (!$._blinkFeatureFirstUsed) return []; - -var idPattern = new RegExp('^Feature_(\\\\d+)$'); -return getFeatureNames($._blinkFeatureFirstUsed.Features, 'default') - .concat(getFeatureNames($._blinkFeatureFirstUsed.CSSFeatures, 'css')) - .concat(getFeatureNames($._blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')); -'''; `).query(ctx => ` SELECT date AS yyyymmdd, @@ -58,7 +27,7 @@ FROM ( payload, rank, feature - FROM ${ctx.ref('all', 'pages')}, + FROM ${ctx.ref('crawl', 'pages')}, UNNEST(features) AS feature WHERE date = '${constants.currentMonth}' AND diff --git a/definitions/output/blink_features/usage.js b/definitions/output/blink_features/usage.js index 131bb14..0eb0878 100644 --- a/definitions/output/blink_features/usage.js +++ b/definitions/output/blink_features/usage.js @@ -42,7 +42,7 @@ JOIN ( date, client, COUNT(DISTINCT page) AS total_urls - FROM ${ctx.ref('all', 'pages')} + FROM ${ctx.ref('crawl', 'pages')} WHERE date = '${constants.currentMonth}' AND is_root_page = TRUE diff --git a/definitions/output/core_web_vitals/technologies.js b/definitions/output/core_web_vitals/technologies.js index 9502afd..7ec0e20 100644 --- a/definitions/output/core_web_vitals/technologies.js +++ b/definitions/output/core_web_vitals/technologies.js @@ -22,23 +22,6 @@ CREATE TEMP FUNCTION IS_GOOD(good FLOAT64, needs_improvement FLOAT64, poor FLOAT CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor FLOAT64) RETURNS BOOL AS ( good + needs_improvement + poor > 0 ); - -CREATE TEMP FUNCTION 
GET_LIGHTHOUSE_CATEGORY_SCORES(categories STRING) -RETURNS STRUCT -LANGUAGE js AS ''' -try { - const $ = JSON.parse(categories); - return { - accessibility: $.accessibility?.score, - best_practices: $['best-practices']?.score, - performance: $.performance?.score, - pwa: $.pwa?.score, - seo: $.seo?.score - }; -} catch (e) { - return {}; -} -'''; `).query(ctx => ` WITH geo_summary AS ( SELECT @@ -111,7 +94,7 @@ technologies AS ( client, page AS url FROM - ${ctx.resolve('all', 'pages')}, + ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology WHERE date = '${pastMonth}' @@ -124,7 +107,7 @@ UNION ALL client, page AS url FROM - ${ctx.resolve('all', 'pages')} + ${ctx.ref('crawl', 'pages')} WHERE date = '${pastMonth}' ${constants.devRankFilter} @@ -135,7 +118,7 @@ categories AS ( technology.technology AS app, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM - ${ctx.resolve('all', 'pages')}, + ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE @@ -148,7 +131,7 @@ UNION ALL 'ALL' AS app, ARRAY_TO_STRING(ARRAY_AGG(DISTINCT category IGNORE NULLS ORDER BY category), ', ') AS category FROM - ${ctx.resolve('all', 'pages')}, + ${ctx.ref('crawl', 'pages')}, UNNEST(technologies) AS technology, UNNEST(technology.categories) AS category WHERE @@ -162,12 +145,16 @@ summary_stats AS ( client, page AS url, root_page AS root_page_url, - CAST(JSON_VALUE(summary, '$.bytesTotal') AS INT64) AS bytesTotal, - CAST(JSON_VALUE(summary, '$.bytesJS') AS INT64) AS bytesJS, - CAST(JSON_VALUE(summary, '$.bytesImg') AS INT64) AS bytesImg, - GET_LIGHTHOUSE_CATEGORY_SCORES(JSON_QUERY(lighthouse, '$.categories')) AS lighthouse_category + SAFE.INT64(summary.bytesTotal) AS bytesTotal, + SAFE.INT64(summary.bytesJS) AS bytesJS, + SAFE.INT64(summary.bytesImg) AS bytesImg, + SAFE.FLOAT64(lighthouse_category.accessibility.score) AS accessibility, + 
SAFE.FLOAT64(lighthouse_category['best-practices'].score) AS best_practices, + SAFE.FLOAT64(lighthouse_category.performance.score) AS performance, + SAFE.FLOAT64(lighthouse_category.pwa.score) AS pwa, + SAFE.FLOAT64(lighthouse_category.seo.score) AS seo FROM - ${ctx.resolve('all', 'pages')} + ${ctx.ref('crawl', 'pages')} WHERE date = '${pastMonth}' ${constants.devRankFilter} @@ -179,14 +166,14 @@ lab_data AS ( root_page_url, app, ANY_VALUE(category) AS category, - CAST(AVG(bytesTotal) AS INT64) AS bytesTotal, - CAST(AVG(bytesJS) AS INT64) AS bytesJS, - CAST(AVG(bytesImg) AS INT64) AS bytesImg, - CAST(AVG(lighthouse_category.accessibility) AS NUMERIC) AS accessibility, - CAST(AVG(lighthouse_category.best_practices) AS NUMERIC) AS best_practices, - CAST(AVG(lighthouse_category.performance) AS NUMERIC) AS performance, - CAST(AVG(lighthouse_category.pwa) AS NUMERIC) AS pwa, - CAST(AVG(lighthouse_category.seo) AS NUMERIC) AS seo + AVG(bytesTotal) AS bytesTotal, + AVG(bytesJS) AS bytesJS, + AVG(bytesImg) AS bytesImg, + AVG(accessibility) AS accessibility, + AVG(best_practices) AS best_practices, + AVG(performance) AS performance, + AVG(pwa) AS pwa, + AVG(seo) AS seo FROM summary_stats JOIN diff --git a/definitions/output/crawl/backfill_pages.js b/definitions/output/crawl/backfill_pages.js index 4c6faee..05d75e4 100644 --- a/definitions/output/crawl/backfill_pages.js +++ b/definitions/output/crawl/backfill_pages.js @@ -1,6 +1,8 @@ const iterations = [] const clients = constants.clients +operate('backfill') + let midMonth for ( let date = '2016-01-01'; @@ -31,7 +33,7 @@ iterations.forEach((iteration, i) => { operate(`backfill_pages ${iteration.date} ${iteration.client}`).tags([ 'backfill_pages' ]).dependencies([ - i === 0 ? '' : `backfill_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` + i === 0 ? 
'backfill' : `backfill_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM crawl.pages WHERE date = '${iteration.date}' diff --git a/definitions/output/crawl/backfill_requests.js b/definitions/output/crawl/backfill_requests.js index 038df1b..8a57e77 100644 --- a/definitions/output/crawl/backfill_requests.js +++ b/definitions/output/crawl/backfill_requests.js @@ -27,13 +27,11 @@ for ( } } -operate('') - iterations.forEach((iteration, i) => { operate(`backfill_requests ${iteration.date} ${iteration.client}`).tags([ 'backfill_requests' ]).dependencies([ - i === 0 ? '' : `backfill_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` + i === 0 ? 'backfill' : `backfill_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM crawl.requests WHERE date = '${iteration.date}' diff --git a/definitions/output/crawl/backfill_summary_pages.js b/definitions/output/crawl/backfill_summary_pages.js index 592d051..56c8aa0 100644 --- a/definitions/output/crawl/backfill_summary_pages.js +++ b/definitions/output/crawl/backfill_summary_pages.js @@ -127,7 +127,7 @@ iterations.forEach((iteration, i) => { operate(`backfill_summary_pages ${iteration.date} ${iteration.client}`).tags([ 'pages_backfill' ]).dependencies([ - i === 0 ? '' : `backfill_summary_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` + i === 0 ? 
'backfill' : `backfill_summary_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM crawl.pages WHERE date = '${iteration.date}' diff --git a/definitions/output/crawl/backfill_summary_requests.js b/definitions/output/crawl/backfill_summary_requests.js index b9e7ab2..ca84286 100644 --- a/definitions/output/crawl/backfill_summary_requests.js +++ b/definitions/output/crawl/backfill_summary_requests.js @@ -63,7 +63,7 @@ iterations.forEach((iteration, i) => { operate(`backfill_summary_requests ${iteration.date} ${iteration.client}`).tags([ 'requests_backfill' ]).dependencies([ - i === 0 ? '' : `backfill_summary_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` + i === 0 ? 'backfill' : `backfill_summary_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM crawl.requests WHERE date = '${iteration.date}' AND client = '${iteration.client}'; diff --git a/definitions/output/crawl/reprocess_pages.js b/definitions/output/crawl/reprocess_pages.js index f6f7b9e..91d8f14 100644 --- a/definitions/output/crawl/reprocess_pages.js +++ b/definitions/output/crawl/reprocess_pages.js @@ -1,60 +1,4 @@ -operate('reprocess_pages_pre').tags( - ['reprocess_pages'] -).queries(` -CREATE SCHEMA IF NOT EXISTS crawl; - -CREATE TABLE IF NOT EXISTS crawl.pages -( - date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), - client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), - page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), - is_root_page BOOL NOT NULL OPTIONS(description='Whether the page is the root of the origin'), - root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested, the origin followed by /'), - rank INT64 OPTIONS(description='Site popularity rank, from CrUX'), - wptid STRING OPTIONS(description='ID of the WebPageTest results'), - payload JSON 
OPTIONS(description='JSON-encoded WebPageTest results for the page'), - summary JSON OPTIONS(description='JSON-encoded summarization of the page-level data'), - custom_metrics STRUCT< - a11y JSON, - cms JSON, - cookies JSON, - css_variables JSON, - ecommerce JSON, - element_count JSON, - javascript JSON, - markup JSON, - media JSON, - origin_trials JSON, - performance JSON, - privacy JSON, - responsive_images JSON, - robots_txt JSON, - security JSON, - structured_data JSON, - third_parties JSON, - well_known JSON, - wpt_bodies JSON, - other JSON - > OPTIONS(description='Custom metrics from WebPageTest'), - lighthouse JSON OPTIONS(description='JSON-encoded Lighthouse report'), - features ARRAY> OPTIONS(description='Blink features detected at runtime (see https://chromestatus.com/features)'), - technologies ARRAY OPTIONS(description='List of categories to which this technology belongs'), - info ARRAY OPTIONS(description='Additional metadata about the detected technology, ie version number') - >> OPTIONS(description='Technologies detected at runtime (see https://www.wappalyzer.com/)'), - metadata JSON OPTIONS(description='Additional metadata about the test') -) -PARTITION BY date -CLUSTER BY client, is_root_page, rank, page -OPTIONS( - require_partition_filter=true -); -`) +operate('reprocess') const iterations = [] const clients = constants.clients @@ -73,7 +17,7 @@ iterations.forEach((iteration, i) => { operate(`reprocess_pages ${iteration.month} ${iteration.client}`).tags([ 'reprocess_pages' ]).dependencies([ - i === 0 ? 'reprocess_pages_pre' : `reprocess_pages ${iterations[i - 1].month} ${iterations[i - 1].client}` + i === 0 ? 
'reprocess' : `reprocess_pages ${iterations[i - 1].month} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM crawl.pages WHERE date = '${iteration.month}' AND diff --git a/definitions/output/crawl/reprocess_requests.js b/definitions/output/crawl/reprocess_requests.js index 6434398..6e20494 100644 --- a/definitions/output/crawl/reprocess_requests.js +++ b/definitions/output/crawl/reprocess_requests.js @@ -1,39 +1,3 @@ -operate('reprocess_requests_pre').tags( - ['reprocess_requests'] -).queries(` -CREATE SCHEMA IF NOT EXISTS crawl; - -CREATE TABLE IF NOT EXISTS crawl.requests -( - date DATE NOT NULL OPTIONS(description='YYYY-MM-DD format of the HTTP Archive monthly crawl'), - client STRING NOT NULL OPTIONS(description='Test environment: desktop or mobile'), - page STRING NOT NULL OPTIONS(description='The URL of the page being tested'), - is_root_page BOOL OPTIONS(description='Whether the page is the root of the origin.'), - root_page STRING NOT NULL OPTIONS(description='The URL of the root page being tested'), - rank INT64 OPTIONS(description='Site popularity rank, from CrUX'), - url STRING NOT NULL OPTIONS(description='The URL of the request'), - is_main_document BOOL NOT NULL OPTIONS(description='Whether this request corresponds with the main HTML document of the page, which is the first HTML request after redirects'), - type STRING OPTIONS(description='Simplified description of the type of resource (script, html, css, text, other, etc)'), - index INT64 OPTIONS(description='The sequential 0-based index of the request'), - payload JSON OPTIONS(description='JSON-encoded WebPageTest result data for this request'), - summary JSON OPTIONS(description='JSON-encoded summarization of request data'), - request_headers ARRAY> OPTIONS(description='Request headers'), - response_headers ARRAY> OPTIONS(description='Response headers'), - response_body STRING OPTIONS(description='Text-based response body') -) -PARTITION BY date -CLUSTER BY client, is_root_page, type, 
rank -OPTIONS( - require_partition_filter=true -); -`) - const iterations = [] for ( @@ -53,7 +17,7 @@ iterations.forEach((iteration, i) => { operate(`reprocess_requests ${iteration.month} ${iteration.client} ${iteration.isRootPage}`).tags( ['reprocess_requests'] ).dependencies([ - i === 0 ? 'reprocess_requests_pre' : `reprocess_requests ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` + i === 0 ? 'reprocess' : `reprocess_requests ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` ]).queries(ctx => ` DELETE FROM crawl.requests WHERE date = '${iteration.month}' From 01101dbabf1bc18d30fc250e8584c030c4923712 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:57:25 +0100 Subject: [PATCH 43/44] response_bodies adjustment --- definitions/output/crawl/backfill_requests.js | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/definitions/output/crawl/backfill_requests.js b/definitions/output/crawl/backfill_requests.js index 8a57e77..b7cac9c 100644 --- a/definitions/output/crawl/backfill_requests.js +++ b/definitions/output/crawl/backfill_requests.js @@ -27,6 +27,10 @@ for ( } } +function getResponseBodiesColumnName(date) { + return date >= '2024-02-01' ? 
'response_body' : 'body'; +} + iterations.forEach((iteration, i) => { operate(`backfill_requests ${iteration.date} ${iteration.client}`).tags([ 'backfill_requests' @@ -264,7 +268,7 @@ SELECT )) AS summary, parseHeaders(request.headers) AS request_headers, parseHeaders(response.headers) AS response_headers, - IF(requests.type = 'image', NULL, response_bodies.body) AS response_body + IF(requests.type = 'image', NULL, response_bodies.response_body) AS response_body FROM ( FROM \`requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client}\` ${constants.devTABLESAMPLE} |> SET payload = SAFE.PARSE_JSON(payload, wide_number_mode => 'round') @@ -293,7 +297,14 @@ ON requests.page = crux.page LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS summary_pages ${constants.devTABLESAMPLE} ON requests.page = summary_pages.url -LEFT JOIN response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS response_bodies ${constants.devTABLESAMPLE} +LEFT JOIN ( + SELECT + page, + url, + ANY_VALUE(${getResponseBodiesColumnName(iteration.date)}) AS response_body + FROM response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} + GROUP BY page, url +) AS response_bodies ${constants.devTABLESAMPLE} ON requests.page = response_bodies.page AND requests.url = response_bodies.url; `) From 5ac19db87ce44e0448c7e3e1185b9afc6c4c358f Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 1 Nov 2024 10:10:11 +0100 Subject: [PATCH 44/44] lint --- definitions/output/crawl/backfill_requests.js | 4 ++-- definitions/sources/chrome-ux-report.js | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/definitions/output/crawl/backfill_requests.js b/definitions/output/crawl/backfill_requests.js index b7cac9c..1be37ea 100644 --- a/definitions/output/crawl/backfill_requests.js +++ b/definitions/output/crawl/backfill_requests.js @@ -27,8 +27,8 
@@ for ( } } -function getResponseBodiesColumnName(date) { - return date >= '2024-02-01' ? 'response_body' : 'body'; +function getResponseBodiesColumnName (date) { + return date >= '2024-02-01' ? 'response_body' : 'body' } iterations.forEach((iteration, i) => { diff --git a/definitions/sources/chrome-ux-report.js b/definitions/sources/chrome-ux-report.js index afe7abd..ff98f44 100644 --- a/definitions/sources/chrome-ux-report.js +++ b/definitions/sources/chrome-ux-report.js @@ -14,7 +14,7 @@ FROM ${ctx.ref(database, 'materialized', 'country_summary')} |> WHERE yyyymm = ${pastMonthYYYYMM} |> AGGREGATE COUNT(DISTINCT country_code) AS cnt_countries |> WHERE cnt_countries != 238 -|> SELECT 'Table data doesn't match 238 countries' AS error_message; +|> SELECT "Table data doesn't match 238 countries" AS error_message `) declare({ @@ -28,7 +28,7 @@ FROM ${ctx.ref(database, 'materialized', 'device_summary')} |> WHERE date = ''${pastMonth}'' |> AGGREGATE COUNT(DISTINCT device) AS cnt_devices, COUNT(DISTINCT rank) AS cnt_ranks |> WHERE cnt_devices != 3 OR cnt_ranks != 10 -|> SELECT 'Table data doesn't match 3 unique devices and 10 ranks' AS error_message; +|> SELECT "Table data doesn't match 3 unique devices and 10 ranks" AS error_message `) declare({