Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#3564: Add rel="canonical" annotations to dataset & distribution page crawler views #3565

Merged
merged 1 commit into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Increase indexer client connection idle-timeout to avoid connection reset errors when downloading large region files
- Upgraded OpenSearch to v2.16.0
- #3556: Serve robots.txt with content-type `text/plain`, plus other sitemap & crawler view related improvements.
- #3564: Add rel="canonical" annotations to dataset & distribution page crawler views

## v4.2.3

Expand Down
21 changes: 16 additions & 5 deletions magda-web-server/src/crawlerViews/commonView.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import markdownToHtml from "magda-typescript-common/src/markdownToHtml.js";

type ContentType = {
title: string;
interface ContentType {
title?: string;
__content: string;
};
canonicalUrl: string;
sitemapUrl: string;
}

const commonView = (
{ title, __content }: ContentType,
{ title, __content, canonicalUrl, sitemapUrl }: ContentType,
shouldShowFullVersionLink: boolean = false,
fullVersionUrl: string = ""
) => {
Expand All @@ -15,7 +17,16 @@ const commonView = (
<head>
<meta charset="UTF-8">
<title>${title}</title>
<style></style>
${
canonicalUrl
? `<link rel="canonical" href="${canonicalUrl}">`
: ""
}
${
sitemapUrl
? `<link rel="sitemap" type="application/xml" href="${sitemapUrl}">`
: ""
}
</head>
<body>
${markdownToHtml(__content, true)}
Expand Down
35 changes: 30 additions & 5 deletions magda-web-server/src/createCrawlerViewRouter.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Router } from "express";
import getAbsoluteUrl from "magda-typescript-common/src/getAbsoluteUrl.js";
import shouldRenderCrawlerView from "./shouldRenderCrawlerView.js";
import datasetView from "./crawlerViews/dataset.js";
import distributionView from "./crawlerViews/distribution.js";
Expand All @@ -13,6 +14,7 @@ const { safeLoadFront } = yamlFrontMatter;
/**
 * Options accepted by the crawler view router factory.
 */
type OptionType = {
    /**
     * Base path of the web UI. Used as the prefix when building dataset and
     * distribution page URLs, and as the path resolved for the sitemap link.
     */
    uiBaseUrl: string;
    /**
     * Base URL passed to `getAbsoluteUrl` when producing the canonical and
     * sitemap URLs emitted in crawler views.
     */
    baseExternalUrl: string;
    /**
     * Base URL of the registry API.
     * NOTE(review): the web server passes `argv.registryApiBaseUrlInternal`
     * here, so this is presumably the cluster-internal address; confirm.
     */
    registryApiBaseUrl: string;
    /**
     * Discourse support flag forwarded from the web server configuration.
     * NOTE(review): its effect inside the router is not visible in this
     * excerpt; verify against the router body.
     */
    enableDiscourseSupport: boolean;
};

Expand All @@ -22,12 +24,17 @@ function getTenantIdFromReq(req: Request) {
: 0;
}

const createCralwerViewRouter = ({
const createCrawlerViewRouter = ({
enableDiscourseSupport,
registryApiBaseUrl,
uiBaseUrl
uiBaseUrl,
baseExternalUrl
}: OptionType) => {
const router: Router = Router();
const sitemapUrl = `${getAbsoluteUrl(
uiBaseUrl,
baseExternalUrl ? baseExternalUrl : "/"
)}sitemap.xml`;

async function datasetViewHandler(
req: Request<
Expand Down Expand Up @@ -70,7 +77,16 @@ const createCralwerViewRouter = ({
throw datasetData;
}
const content = safeLoadFront(datasetView(datasetData, uiBaseUrl));
res.send(commonView(content as any));
res.send(
commonView({
...content,
sitemapUrl,
canonicalUrl: getAbsoluteUrl(
`${uiBaseUrl}dataset/${datasetId}`,
baseExternalUrl
)
})
);
} catch (e) {
console.warn(
`Failed to producing crawler view for datasetId \`${datasetId}\`: ${
Expand Down Expand Up @@ -142,7 +158,16 @@ const createCralwerViewRouter = ({
const content = safeLoadFront(
distributionView(distributionData, datasetData, uiBaseUrl)
);
res.send(commonView(content as any));
res.send(
commonView({
...content,
sitemapUrl,
canonicalUrl: getAbsoluteUrl(
`${uiBaseUrl}dataset/${datasetId}/distribution/${distributionId}`,
baseExternalUrl
)
})
);
} catch (e) {
console.warn(
`Failed to producing crawler view for distributionId \`${distributionId}\`: ${
Expand All @@ -165,4 +190,4 @@ const createCralwerViewRouter = ({
return router;
};

export default createCralwerViewRouter;
export default createCrawlerViewRouter;
1 change: 1 addition & 0 deletions magda-web-server/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,7 @@ if (argv.enableCrawlerViews || enableDiscourseSupport) {
createCrawlerViewRouter({
registryApiBaseUrl: argv.registryApiBaseUrlInternal,
enableDiscourseSupport: enableDiscourseSupport,
baseExternalUrl,
uiBaseUrl
})
);
Expand Down
1 change: 1 addition & 0 deletions magda-web-server/src/shouldRenderCrawlerView.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const browserNames = [

const crawlerPatterns = [
"Googlebot\\/", // Google
"Google-InspectionTool\\/", // Google inspectionTool
"bingbot", // Bing
"Slurp", // Yahoo
"DuckDuckBot", // DuckDuckGo
Expand Down