diff --git a/CHANGES.md b/CHANGES.md
index c61212b63f..480829f6aa 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -6,6 +6,7 @@
- Increase indexer client connection idle-timeout to avoid encountering connection reset error for downloading large region files
- Upgraded OpenSearch to v2.16.0
- #3556: Serves robots.txt as content-type `text/plain` instead and other sitemap & crawler view related improvements.
+- #3564: Add `rel="canonical"` annotations to dataset & distribution page crawler views
## v4.2.3
diff --git a/magda-web-server/src/crawlerViews/commonView.ts b/magda-web-server/src/crawlerViews/commonView.ts
index a99a1bbd23..4f2726f0ec 100644
--- a/magda-web-server/src/crawlerViews/commonView.ts
+++ b/magda-web-server/src/crawlerViews/commonView.ts
@@ -1,12 +1,14 @@
import markdownToHtml from "magda-typescript-common/src/markdownToHtml.js";
-type ContentType = {
- title: string;
+interface ContentType {
+ title?: string;
__content: string;
-};
+ canonicalUrl: string;
+ sitemapUrl: string;
+}
const commonView = (
- { title, __content }: ContentType,
+ { title, __content, canonicalUrl, sitemapUrl }: ContentType,
shouldShowFullVersionLink: boolean = false,
fullVersionUrl: string = ""
) => {
@@ -15,7 +17,16 @@ const commonView = (
${title}
-
+ ${
+ canonicalUrl
+ ? ``
+ : ""
+ }
+ ${
+ sitemapUrl
+ ? ``
+ : ""
+ }
${markdownToHtml(__content, true)}
diff --git a/magda-web-server/src/createCrawlerViewRouter.ts b/magda-web-server/src/createCrawlerViewRouter.ts
index b91f35d40f..d1f8c7a050 100644
--- a/magda-web-server/src/createCrawlerViewRouter.ts
+++ b/magda-web-server/src/createCrawlerViewRouter.ts
@@ -1,4 +1,5 @@
import { Router } from "express";
+import getAbsoluteUrl from "magda-typescript-common/src/getAbsoluteUrl.js";
import shouldRenderCrawlerView from "./shouldRenderCrawlerView.js";
import datasetView from "./crawlerViews/dataset.js";
import distributionView from "./crawlerViews/distribution.js";
@@ -13,6 +14,7 @@ const { safeLoadFront } = yamlFrontMatter;
type OptionType = {
enableDiscourseSupport: boolean;
uiBaseUrl: string;
+ baseExternalUrl: string;
registryApiBaseUrl: string;
};
@@ -22,12 +24,17 @@ function getTenantIdFromReq(req: Request) {
: 0;
}
-const createCralwerViewRouter = ({
+const createCrawlerViewRouter = ({
enableDiscourseSupport,
registryApiBaseUrl,
- uiBaseUrl
+ uiBaseUrl,
+ baseExternalUrl
}: OptionType) => {
const router: Router = Router();
+ const sitemapUrl = `${getAbsoluteUrl(
+ uiBaseUrl,
+ baseExternalUrl ? baseExternalUrl : "/"
+ )}sitemap.xml`;
async function datasetViewHandler(
req: Request<
@@ -70,7 +77,16 @@ const createCralwerViewRouter = ({
throw datasetData;
}
const content = safeLoadFront(datasetView(datasetData, uiBaseUrl));
- res.send(commonView(content as any));
+ res.send(
+ commonView({
+ ...content,
+ sitemapUrl,
+ canonicalUrl: getAbsoluteUrl(
+ `${uiBaseUrl}dataset/${datasetId}`,
+ baseExternalUrl
+ )
+ })
+ );
} catch (e) {
console.warn(
`Failed to producing crawler view for datasetId \`${datasetId}\`: ${
@@ -142,7 +158,16 @@ const createCralwerViewRouter = ({
const content = safeLoadFront(
distributionView(distributionData, datasetData, uiBaseUrl)
);
- res.send(commonView(content as any));
+ res.send(
+ commonView({
+ ...content,
+ sitemapUrl,
+ canonicalUrl: getAbsoluteUrl(
+ `${uiBaseUrl}dataset/${datasetId}/distribution/${distributionId}`,
+ baseExternalUrl
+ )
+ })
+ );
} catch (e) {
console.warn(
`Failed to producing crawler view for distributionId \`${distributionId}\`: ${
@@ -165,4 +190,4 @@ const createCralwerViewRouter = ({
return router;
};
-export default createCralwerViewRouter;
+export default createCrawlerViewRouter;
diff --git a/magda-web-server/src/index.ts b/magda-web-server/src/index.ts
index 3ad5c9f0da..4e0721df53 100644
--- a/magda-web-server/src/index.ts
+++ b/magda-web-server/src/index.ts
@@ -519,6 +519,7 @@ if (argv.enableCrawlerViews || enableDiscourseSupport) {
createCrawlerViewRouter({
registryApiBaseUrl: argv.registryApiBaseUrlInternal,
enableDiscourseSupport: enableDiscourseSupport,
+ baseExternalUrl,
uiBaseUrl
})
);
diff --git a/magda-web-server/src/shouldRenderCrawlerView.ts b/magda-web-server/src/shouldRenderCrawlerView.ts
index b6349fefb3..bacea6f7bd 100644
--- a/magda-web-server/src/shouldRenderCrawlerView.ts
+++ b/magda-web-server/src/shouldRenderCrawlerView.ts
@@ -12,6 +12,7 @@ const browserNames = [
const crawlerPatterns = [
"Googlebot\\/", // Google
+ "Google-InspectionTool\\/", // Google inspectionTool
"bingbot", // Bing
"Slurp", // Yahoo
"DuckDuckBot", // DuckDuckGo