From 10bcccd1fc0d78afcca504da4b6959a75af46f9c Mon Sep 17 00:00:00 2001 From: Jacky Jiang Date: Mon, 16 Sep 2024 22:14:34 +1000 Subject: [PATCH] #3564: Add rel="canonical" annotations to dataset & distribution page crawler views --- CHANGES.md | 1 + .../src/crawlerViews/commonView.ts | 21 ++++++++--- .../src/createCrawlerViewRouter.ts | 35 ++++++++++++++++--- magda-web-server/src/index.ts | 1 + .../src/shouldRenderCrawlerView.ts | 1 + 5 files changed, 49 insertions(+), 10 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index c61212b63f..480829f6aa 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,7 @@ - Increase indexer client connection idle-timeout to avoid encountering connection reset error for downloading large region files - Upgraded OpenSearch to v2.16.0 - #3556: Serves robots.txt as content-type `text/plain` instead and other sitemap & crawler view related improvements. +- #3564: Add rel="canonical" annotations to dataset & distribution page crawler views ## v4.2.3 diff --git a/magda-web-server/src/crawlerViews/commonView.ts b/magda-web-server/src/crawlerViews/commonView.ts index a99a1bbd23..4f2726f0ec 100644 --- a/magda-web-server/src/crawlerViews/commonView.ts +++ b/magda-web-server/src/crawlerViews/commonView.ts @@ -1,12 +1,14 @@ import markdownToHtml from "magda-typescript-common/src/markdownToHtml.js"; -type ContentType = { - title: string; +interface ContentType { + title?: string; __content: string; -}; + canonicalUrl: string; + sitemapUrl: string; +} const commonView = ( - { title, __content }: ContentType, + { title, __content, canonicalUrl, sitemapUrl }: ContentType, shouldShowFullVersionLink: boolean = false, fullVersionUrl: string = "" ) => { @@ -15,7 +17,16 @@ const commonView = ( ${title} - + ${ + canonicalUrl + ? `` + : "" + } + ${ + sitemapUrl + ? `` + : "" + } ${markdownToHtml(__content, true)} diff --git a/magda-web-server/src/createCrawlerViewRouter.ts b/magda-web-server/src/createCrawlerViewRouter.ts index b91f35d40f..d1f8c7a050 100644 --- a/magda-web-server/src/createCrawlerViewRouter.ts +++ b/magda-web-server/src/createCrawlerViewRouter.ts @@ -1,4 +1,5 @@ import { Router } from "express"; +import getAbsoluteUrl from "magda-typescript-common/src/getAbsoluteUrl.js"; import shouldRenderCrawlerView from "./shouldRenderCrawlerView.js"; import datasetView from "./crawlerViews/dataset.js"; import distributionView from "./crawlerViews/distribution.js"; @@ -13,6 +14,7 @@ const { safeLoadFront } = yamlFrontMatter; type OptionType = { enableDiscourseSupport: boolean; uiBaseUrl: string; + baseExternalUrl: string; registryApiBaseUrl: string; }; @@ -22,12 +24,17 @@ function getTenantIdFromReq(req: Request) { : 0; } -const createCralwerViewRouter = ({ +const createCrawlerViewRouter = ({ enableDiscourseSupport, registryApiBaseUrl, - uiBaseUrl + uiBaseUrl, + baseExternalUrl }: OptionType) => { const router: Router = Router(); + const sitemapUrl = `${getAbsoluteUrl( + uiBaseUrl, + baseExternalUrl ? baseExternalUrl : "/" + )}sitemap.xml`; async function datasetViewHandler( req: Request< @@ -70,7 +77,16 @@ const createCralwerViewRouter = ({ throw datasetData; } const content = safeLoadFront(datasetView(datasetData, uiBaseUrl)); - res.send(commonView(content as any)); + res.send( + commonView({ + ...content, + sitemapUrl, + canonicalUrl: getAbsoluteUrl( + `${uiBaseUrl}dataset/${datasetId}`, + baseExternalUrl + ) + }) + ); } catch (e) { console.warn( `Failed to producing crawler view for datasetId \`${datasetId}\`: ${ @@ -142,7 +158,16 @@ const createCralwerViewRouter = ({ const content = safeLoadFront( distributionView(distributionData, datasetData, uiBaseUrl) ); - res.send(commonView(content as any)); + res.send( + commonView({ + ...content, + sitemapUrl, + canonicalUrl: getAbsoluteUrl( + `${uiBaseUrl}dataset/${datasetId}/distribution/${distributionId}`, + baseExternalUrl + ) + }) + ); } catch (e) { console.warn( `Failed to producing crawler view for distributionId \`${distributionId}\`: ${ @@ -165,4 +190,4 @@ const createCralwerViewRouter = ({ return router; }; -export default createCralwerViewRouter; +export default createCrawlerViewRouter; diff --git a/magda-web-server/src/index.ts b/magda-web-server/src/index.ts index 3ad5c9f0da..4e0721df53 100644 --- a/magda-web-server/src/index.ts +++ b/magda-web-server/src/index.ts @@ -519,6 +519,7 @@ if (argv.enableCrawlerViews || enableDiscourseSupport) { createCrawlerViewRouter({ registryApiBaseUrl: argv.registryApiBaseUrlInternal, enableDiscourseSupport: enableDiscourseSupport, + baseExternalUrl, uiBaseUrl }) ); diff --git a/magda-web-server/src/shouldRenderCrawlerView.ts b/magda-web-server/src/shouldRenderCrawlerView.ts index b6349fefb3..bacea6f7bd 100644 --- a/magda-web-server/src/shouldRenderCrawlerView.ts +++ b/magda-web-server/src/shouldRenderCrawlerView.ts @@ -12,6 +12,7 @@ const browserNames = [ const crawlerPatterns = [ "Googlebot\\/", // Google + "Google-InspectionTool\\/", // Google inspectionTool "bingbot", // Bing "Slurp", // Yahoo "DuckDuckBot", // DuckDuckGo