From 84ea8f041b614aff6742dd2e47493da480caa233 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:56:18 +0200 Subject: [PATCH 1/3] split by root and type --- definitions/output/all/reprocess_pages.js | 3 +- definitions/output/all/reprocess_requests.js | 33 ++++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/definitions/output/all/reprocess_pages.js b/definitions/output/all/reprocess_pages.js index bc60f87..20b7206 100644 --- a/definitions/output/all/reprocess_pages.js +++ b/definitions/output/all/reprocess_pages.js @@ -81,7 +81,8 @@ iterations.forEach((iteration, i) => { i === 0 ? "all_pages_stable_pre" : `all_pages_stable_update ${iterations[i - 1].month} ${iterations[i - 1].client}` ]).queries(ctx => ` DELETE FROM \`all_dev.pages_stable\` -WHERE date = "${iteration.month}"; +WHERE date = "${iteration.month}" AND + client = "${iteration.client}"; INSERT INTO \`all_dev.pages_stable\` SELECT diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js index 0cd6650..6ce0579 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/all/reprocess_requests.js @@ -36,29 +36,39 @@ OPTIONS( ); `); -const iterations = []; -const clients = constants.clients; +const + iterations = [], + types = ["= 'script'", "= 'image'", "NOT IN ('script', 'image')"]; for ( let month = constants.current_month; month >= '2024-09-01'; // 2022-07-01 month = constants.fn_past_month(month)) { - clients.forEach((client) => { - iterations.push({ - month: month, - client: client + constants.clients.forEach((client) => { + constants.booleans.forEach((is_root_page) => { + types.forEach((type) => { + iterations.push({ + month: month, + client: client, + is_root_page: is_root_page, + type: type + }) + }) }) }) } iterations.forEach((iteration, i) => { - operate(`all_requests_stable ${iteration.month} ${iteration.client}`).tags( + operate(`all_requests_stable ${iteration.month} ${iteration.client} ${iteration.is_root_page} ${i}`).tags( ["all_requests_stable"] ).dependencies([ - i === 0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client}` + i === 0 ? "all_requests_stable_pre" : `all_requests_stable ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].is_root_page} ${i-1}` ]).queries(ctx => ` DELETE FROM \`all_dev.requests_stable\` -WHERE date = "${iteration.month}"; +WHERE date = "${iteration.month}" + AND client = '${iteration.client}' + AND is_root_page = ${iteration.is_root_page} + AND type ${iteration.type}; CREATE TEMP FUNCTION PRUNE_HEADERS( jsonObject JSON @@ -114,7 +124,10 @@ FROM ( SELECT * FROM \`all.requests\` ${constants.dev_TABLESAMPLE} WHERE date = '${iteration.month}' - AND client = '${iteration.client}') AS requests + AND client = '${iteration.client}' + AND is_root_page = ${iteration.is_root_page} + AND type ${iteration.type} +) AS requests LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, From c69edaf04ef04fe3aba47d9e182b911454039a45 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 9 Oct 2024 17:12:07 +0200 Subject: [PATCH 2/3] trim summary.type --- definitions/output/all/reprocess_requests.js | 1 + 1 file changed, 1 insertion(+) diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js index 6ce0579..c786a44 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/all/reprocess_requests.js @@ -113,6 +113,7 @@ SELECT '$.requestid', '$.respOtherHeaders', '$.startedDateTime', + '$.type', '$.url', '$.urlShort' ) From f4df85bf4fb33c3482559df7bbe17b54f98547f2 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:27:32 +0200 Subject: [PATCH 3/3] trim request.summary --- definitions/output/all/reprocess_requests.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/definitions/output/all/reprocess_requests.js b/definitions/output/all/reprocess_requests.js index c786a44..a154e66 100644 --- a/definitions/output/all/reprocess_requests.js +++ b/definitions/output/all/reprocess_requests.js @@ -107,8 +107,10 @@ SELECT PRUNE_HEADERS( JSON_REMOVE( SAFE.PARSE_JSON(requests.summary, wide_number_mode => 'round'), + '$.crawlid', '$.firstHtml', '$.firstReq', + '$.pageid', '$.reqOtherHeaders', '$.requestid', '$.respOtherHeaders',