diff --git a/common/text/transform.go b/common/text/transform.go index de093af0d4c..d4666bd7465 100644 --- a/common/text/transform.go +++ b/common/text/transform.go @@ -18,6 +18,8 @@ import ( "sync" "unicode" + "github.com/alexsergivan/transliterator" + "golang.org/x/text/language" "golang.org/x/text/runes" "golang.org/x/text/transform" "golang.org/x/text/unicode/norm" @@ -76,3 +78,11 @@ func VisitLinesAfter(s string, fn func(line string)) { fn(s) } } + +// Transliterate converts s from Unicode to ASCII using the rules predefined for the +// base language of the language tag lang, e.g. the "de" rules turn "ä" into "ae". +func Transliterate(s string, lang string) string { + trans := transliterator.NewTransliterator(nil) + base, _ := language.Make(lang).Base() // second result is the match confidence, not an error + return trans.Transliterate(s, base.String()) +} diff --git a/common/text/transform_test.go b/common/text/transform_test.go index 41447715f47..661bbf74b5f 100644 --- a/common/text/transform_test.go +++ b/common/text/transform_test.go @@ -27,6 +27,15 @@ func TestRemoveAccents(t *testing.T) { c.Assert(string(RemoveAccentsString("Resumé")), qt.Equals, "Resume") } +func TestTransliterate(t *testing.T) { + c := qt.New(t) + + c.Assert(string(Transliterate("Hugo", "en")), qt.Equals, "Hugo") + c.Assert(string(Transliterate("áéíñóú", "en")), qt.Equals, "aeinou") + c.Assert(string(Transliterate("ÄÖÜäöüß", "en")), qt.Equals, "AOUaouss") + c.Assert(string(Transliterate("ÄÖÜäöüß", "de")), qt.Equals, "AeOeUeaeoeuess") +} + func TestChomp(t *testing.T) { c := qt.New(t) @@ -63,7 +72,7 @@ line 3` func BenchmarkVisitLinesAfter(b *testing.B) { const lines = `line 1 line 2 - + line 3` for i := 0; i < b.N; i++ { diff --git a/config/allconfig/allconfig.go b/config/allconfig/allconfig.go index 9f0d73ecda3..74f0efb09f2 100644 --- a/config/allconfig/allconfig.go +++ b/config/allconfig/allconfig.go @@ -541,6 +541,9 @@ type RootConfig struct { // Removes non-spacing marks from composite characters in content paths.
RemovePathAccents bool + // Converts content paths from Unicode to ASCII. + TransliteratePaths bool + // Whether to track and print unused templates during the build. PrintUnusedTemplates bool diff --git a/config/allconfig/configlanguage.go b/config/allconfig/configlanguage.go index 2c5a116f420..0be3245aade 100644 --- a/config/allconfig/configlanguage.go +++ b/config/allconfig/configlanguage.go @@ -173,6 +173,10 @@ func (c ConfigLanguage) RemovePathAccents() bool { return c.config.RemovePathAccents } +func (c ConfigLanguage) TransliteratePaths() bool { + return c.config.TransliteratePaths +} + func (c ConfigLanguage) DefaultContentLanguage() string { return c.config.DefaultContentLanguage } diff --git a/config/allconfig/load.go b/config/allconfig/load.go index 7d706c7e3d7..f0cc926fa73 100644 --- a/config/allconfig/load.go +++ b/config/allconfig/load.go @@ -180,6 +180,7 @@ func (l configLoader) applyDefaultConfig() error { "canonifyURLs": false, "relativeURLs": false, "removePathAccents": false, + "transliteratePaths": false, "titleCaseStyle": "AP", "taxonomies": maps.Params{"tag": "tags", "category": "categories"}, "permalinks": maps.Params{}, diff --git a/config/configProvider.go b/config/configProvider.go index 8e2ab033482..fd6c38c80f6 100644 --- a/config/configProvider.go +++ b/config/configProvider.go @@ -43,6 +43,7 @@ type AllProvider interface { CanonifyURLs() bool DisablePathToLower() bool RemovePathAccents() bool + TransliteratePaths() bool IsUglyURLs(section string) bool DefaultContentLanguage() string DefaultContentLanguageInSubdir() bool diff --git a/docs/content/en/functions/transform/Transliterate.md b/docs/content/en/functions/transform/Transliterate.md new file mode 100644 index 00000000000..31a53c880d8 --- /dev/null +++ b/docs/content/en/functions/transform/Transliterate.md @@ -0,0 +1,42 @@ +--- +title: transform.Transliterate +description: Returns the given string, converting Unicode to ASCII. 
+categories: [] +keywords: [] +action: + aliases: [] + related: [] + returnType: string + signatures: [transform.Transliterate INPUT] +--- + +The `transform.Transliterate` function converts a string from Unicode to ASCII using rules predefined for your site's `defaultContentLanguage`, or using default rules if language-specific rules do not exist. + +Hugo provides language-specific transliteration rules for Bosnian (bs), Bulgarian (bg), Catalan (ca), Croatian (hr), Danish (da), Esperanto (eo), German (de), Hungarian (hu), Macedonian (mk), Norwegian Bokmål (nb), Russian (ru), Serbian (sr), Slovenian (sl), Swedish (sv), and Ukrainian (uk). + +For a site with English (en) as the default content language: + +```go-html-template +{{ transform.Transliterate "Hugo" }} → Hugo +{{ transform.Transliterate "çđħłƚŧ" }} → cdhllt +{{ transform.Transliterate "áéíñóú" }} → aeinou +{{ transform.Transliterate "ÄÖÜäöüß" }} → AOUaouss +``` + +For a site with German (de) as the default content language: + +```go-html-template +{{ transform.Transliterate "ÄÖÜäöüß" }} → AeOeUeaeoeuess +``` + +If you have enabled [`transliteratePaths`] in your site configuration, you can use `transform.Transliterate` with the [`GetPage`] method to retrieve term pages: + +[`transliteratePaths`]: /getting-started/configuration/#transliteratepaths +[`GetPage`]: /methods/site/getpage/ + + +```go-html-template +{{ with .Site.GetPage (path.Join "tags" (transform.Transliterate "çđħłƚŧ äöü" | anchorize)) }} + {{ .LinkTitle }} +{{ end }} +``` diff --git a/docs/content/en/getting-started/configuration.md b/docs/content/en/getting-started/configuration.md index 5e861759b00..80fcd7f108f 100644 --- a/docs/content/en/getting-started/configuration.md +++ b/docs/content/en/getting-started/configuration.md @@ -19,7 +19,30 @@ Hugo uses the `hugo.toml`, `hugo.yaml`, or `hugo.json` (if found in the site root) as the default site configuration file.
The user can choose to override that default with one or more site configuration files using the command-line `--config` switch. +### transliteratePaths +**Default value:** false + +Converts path characters from Unicode to ASCII using rules predefined for your site's `defaultContentLanguage`, or using default rules if language-specific rules do not exist. + +Hugo provides language-specific transliteration rules for Bosnian (bs), Bulgarian (bg), Catalan (ca), Croatian (hr), Danish (da), Esperanto (eo), German (de), Hungarian (hu), Macedonian (mk), Norwegian Bokmål (nb), Russian (ru), Serbian (sr), Slovenian (sl), Swedish (sv), and Ukrainian (uk). + +Hugo ignores the `removePathAccents` setting if you enable `transliteratePaths`. + +For a site with English (en) as the default content language: + +```text +content/Hugo.md → https://example.org/hugo/ +content/çđħłƚŧ.md → https://example.org/cdhllt/ +content/áéíñóú.md → https://example.org/aeinou/ +content/ÄÖÜäöüß.md → https://example.org/aouaouss/ +``` + +For a site with German (de) as the default content language: + +```text +content/ÄÖÜäöüß.md → https://example.org/aeoeueaeoeuess/ +``` Examples: ```txt @@ -455,7 +478,7 @@ See [Configure Taxonomies](/content-management/taxonomies#configure-taxonomies). ### theme -: See [module configuration](/hugo-modules/configuration/#module-configuration-imports) for how to import a theme. +See [module configuration](/hugo-modules/configuration/#module-configuration-imports) for how to import a theme. ### themesDir @@ -483,6 +506,31 @@ Site title. See [Configure Title Case](#configure-title-case) +### transliteratePaths + +**Default value:** false + +Converts path characters from Unicode to ASCII using rules predefined for your site's `defaultContentLanguage`, or using default rules if language-specific rules do not exist.
+ +Hugo provides language-specific transliteration rules for Bosnian (bs), Bulgarian (bg), Catalan (ca), Croatian (hr), Danish (da), Esperanto (eo), German (de), Hungarian (hu), Macedonian (mk), Norwegian Bokmål (nb), Russian (ru), Serbian (sr), Slovenian (sl), Swedish (sv), and Ukrainian (uk). + +Hugo ignores the `removePathAccents` setting if you enable `transliteratePaths`. + +For a site with English (en) as the default content language: + +```text +content/Hugo.md → https://example.org/hugo/ +content/çđħłƚŧ.md → https://example.org/cdhllt/ +content/áéíñóú.md → https://example.org/aeinou/ +content/ÄÖÜäöüß.md → https://example.org/aouaouss/ +``` + +For a site with German (de) as the default content language: + +```text +content/ÄÖÜäöüß.md → https://example.org/aeoeueaeoeuess/ +``` + ### uglyURLs **Default value:** false diff --git a/docs/data/docs.yaml b/docs/data/docs.yaml index dc81ad8bcc5..9f140f89be3 100644 --- a/docs/data/docs.yaml +++ b/docs/data/docs.yaml @@ -1623,6 +1623,7 @@ config: timeout: 30s title: "" titleCaseStyle: AP + transliteratePaths: false uglyURLs: false workingDir: "" config_helpers: @@ -4367,6 +4368,16 @@ tpl: { "title": "Hello World" } + Transliterate: + Aliases: null + Args: + - s + Description: Transliterate converts Unicode to ASCII.
+ Examples: + - - '{{ "áéíñóú" | transform.Transliterate }}' + - aeinou + - - '{{ "çđħłƚŧ" | transform.Transliterate }}' + - cdhllt Unmarshal: Aliases: - unmarshal diff --git a/go.mod b/go.mod index 9f87a4a9b6a..1a11db30a0f 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ require ( github.com/BurntSushi/locker v0.0.0-20171006230638-a6e239ea1c69 github.com/PuerkitoBio/purell v1.1.1 github.com/alecthomas/chroma/v2 v2.11.1 + github.com/alexsergivan/transliterator v1.0.0 github.com/armon/go-radix v1.0.0 github.com/aws/aws-sdk-go v1.48.6 github.com/bep/clocks v0.5.0 diff --git a/go.sum b/go.sum index 49a3ea5b40f..ab68fc190dd 100644 --- a/go.sum +++ b/go.sum @@ -36,6 +36,8 @@ github.com/alecthomas/assert/v2 v2.2.1 h1:XivOgYcduV98QCahG8T5XTezV5bylXe+lBxLG2 github.com/alecthomas/chroma/v2 v2.11.1 h1:m9uUtgcdAwgfFNxuqj7AIG75jD2YmL61BBIJWtdzJPs= github.com/alecthomas/chroma/v2 v2.11.1/go.mod h1:4TQu7gdfuPjSh76j78ietmqh9LiurGF0EpseFXdKMBw= github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk= +github.com/alexsergivan/transliterator v1.0.0 h1:SAA+fkGZKLnak47h8Dr6829IE2kpSZR2Y3yTd69cIwY= +github.com/alexsergivan/transliterator v1.0.0/go.mod h1:0IrumukulURJ4PD0z6UcdJKP2job1DYDhnHAP5y+5pE= github.com/armon/go-radix v1.0.0 h1:F4z6KzEeeQIMeLFa97iZU6vupzoecKdU5TX24SNppXI= github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/aws/aws-sdk-go v1.48.6 h1:hnL/TE3eRigirDLrdRE9AWE1ALZSVLAsC4wK8TGsMqk= @@ -320,6 +322,7 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v0.0.0-20161117074351-18a02ba4a312/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod 
h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= diff --git a/helpers/integration_test.go b/helpers/integration_test.go new file mode 100644 index 00000000000..c8e54b2ed09 --- /dev/null +++ b/helpers/integration_test.go @@ -0,0 +1,53 @@ +// Copyright 2022 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package helpers_test + +import ( + "testing" + + "github.com/gohugoio/hugo/hugolib" +) + +func TestTransliterate(t *testing.T) { + t.Parallel() + + files := ` +-- config.toml -- +disableKinds = ['RSS','sitemap','taxonomy','term'] +disablePathToLower = true +removePathAccents = false +transliteratePaths = true +-- content/Hugo.md -- +--- +title: Hugo +--- +-- content/áéíñóú.md -- +--- +title: Áéíñóú +--- +-- content/çđħłƚŧ.md -- +--- +title: Çđħłƚŧ +--- +-- layouts/_default/single.html -- +{{ .Title }} + ` + + b := hugolib.Test(t, files) + + b.AssertFileContent("public/Hugo/index.html", "Hugo") + b.AssertFileContent("public/aeinou/index.html", "Áéíñóú") + b.AssertFileContent("public/cdhllt/index.html", "Çđħłƚŧ") + +} diff --git a/helpers/path.go b/helpers/path.go index 3172d345249..6068751b4fc 100644 --- a/helpers/path.go +++ b/helpers/path.go @@ -85,13 +85,19 @@ func ishex(c rune) bool { // UnicodeSanitize sanitizes string to be used in Hugo URL's, allowing only // a predefined set of special Unicode characters. -// If RemovePathAccents configuration flag is enabled, Unicode accents -// are also removed. +// If the TransliteratePaths configuration flag is enabled, Unicode characters +// will be converted to ASCII. +// If the RemovePathAccents configuration flag is enabled, non-spacing marks in +// composite characters are removed. // Hyphens in the original input are maintained. // Spaces will be replaced with a single hyphen, and sequential replacement hyphens will be reduced to one. 
func (p *PathSpec) UnicodeSanitize(s string) string { - if p.Cfg.RemovePathAccents() { - s = text.RemoveAccentsString(s) + if p.Cfg.TransliteratePaths() { + s = text.Transliterate(s, p.Cfg.DefaultContentLanguage()) + } else { + if p.Cfg.RemovePathAccents() { + s = text.RemoveAccentsString(s) + } } source := []rune(s) diff --git a/tpl/transform/init.go b/tpl/transform/init.go index e439604277a..0bbee3ae2a3 100644 --- a/tpl/transform/init.go +++ b/tpl/transform/init.go @@ -104,6 +104,14 @@ func init() { }, ) + ns.AddMethodMapping(ctx.Transliterate, + nil, + [][2]string{ + {`{{ transform.Transliterate "áéíñóú" }}`, `aeinou`}, + {`{{ transform.Transliterate "çđħłƚŧ" }}`, `cdhllt`}, + }, + ) + ns.AddMethodMapping(ctx.Unmarshal, []string{"unmarshal"}, [][2]string{ diff --git a/tpl/transform/integration_test.go b/tpl/transform/integration_test.go index 3ba65c71511..4f2a9862f43 100644 --- a/tpl/transform/integration_test.go +++ b/tpl/transform/integration_test.go @@ -97,6 +97,7 @@ disableKinds = ['page','rss','section','sitemap','taxonomy','term'] -- layouts/index.html -- {{ highlight "a" "b" 0 }} ` + b := hugolib.NewIntegrationTestBuilder( hugolib.IntegrationTestConfig{ T: t, @@ -107,3 +108,27 @@ disableKinds = ['page','rss','section','sitemap','taxonomy','term'] _, err := b.BuildE() b.Assert(err.Error(), qt.Contains, "error calling highlight: invalid Highlight option: 0") } + +func TestTransliterate(t *testing.T) { + t.Parallel() + + files := ` +-- config.toml -- +disableKinds = ['RSS','sitemap','taxonomy','term'] +defaultContentLanguage = 'de' +-- layouts/index.html -- +{{ transform.Transliterate "Hugo" }} +{{ transform.Transliterate "áéíñóú" }} +{{ transform.Transliterate "çđħłƚŧ" }} +{{ transform.Transliterate "ÄÖÜäöüß" }} + ` + + b := hugolib.Test(t, files) + + b.AssertFileContent("public/index.html", ` + Hugo + aeinou + cdhllt + AeOeUeaeoeuess + `) +} diff --git a/tpl/transform/transform.go b/tpl/transform/transform.go index 8078bc0ceac..9e2d234dea2 100644 --- 
a/tpl/transform/transform.go +++ b/tpl/transform/transform.go @@ -23,6 +23,7 @@ import ( "strings" "github.com/gohugoio/hugo/cache/namedmemcache" + "github.com/gohugoio/hugo/common/text" "github.com/gohugoio/hugo/markup/converter/hooks" "github.com/gohugoio/hugo/markup/highlight" "github.com/gohugoio/hugo/markup/highlight/chromalexers" @@ -180,6 +181,15 @@ func (ns *Namespace) Plainify(s any) (string, error) { return tpl.StripHTML(ss), nil } +// Transliterate returns the given string, converting Unicode to ASCII. +func (ns *Namespace) Transliterate(s any) (string, error) { + ss, err := cast.ToStringE(s) + if err != nil { + return "", err + } + return text.Transliterate(ss, ns.deps.Conf.DefaultContentLanguage()), nil +} + // For internal use. func (ns *Namespace) Reset() { ns.cache.Clear()