shithub: hugo

Download patch

ref: a82d2700fcc772aada15d65b8f76913ca23f7404
parent: ae816452b171b6b6aabca6a7423ed28a653baaa2
author: Bjørn Erik Pedersen <[email protected]>
date: Sat Jan 4 06:28:19 EST 2020

markup/goldmark: Make auto IDs GitHub compatible

You can turn off this behaviour:

```toml
[markup]
  [markup.goldmark]
    [markup.goldmark.parser]
      autoHeadingIDAsciiOnly = true
```
Note that `anchorize` now adapts its behaviour depending on the default Markdown handler.

Fixes #6616

--- /dev/null
+++ b/common/text/transform.go
@@ -1,0 +1,47 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package text
+
+import (
+	"sync"
+	"unicode"
+
+	"golang.org/x/text/runes"
+	"golang.org/x/text/transform"
+	"golang.org/x/text/unicode/norm"
+)
+
+var accentTransformerPool = &sync.Pool{
+	New: func() interface{} {
+		return transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+	},
+}
+
+// RemoveAccents removes all accents from b.
+func RemoveAccents(b []byte) []byte {
+	t := accentTransformerPool.Get().(transform.Transformer)
+	b, _, _ = transform.Bytes(t, b)
+	t.Reset()
+	accentTransformerPool.Put(t)
+	return b
+}
+
+// RemoveAccentsString removes all accents from s.
+func RemoveAccentsString(s string) string {
+	t := accentTransformerPool.Get().(transform.Transformer)
+	s, _, _ = transform.String(t, s)
+	t.Reset()
+	accentTransformerPool.Put(t)
+	return s
+}
--- /dev/null
+++ b/common/text/transform_test.go
@@ -1,0 +1,29 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package text
+
+import (
+	"testing"
+
+	qt "github.com/frankban/quicktest"
+)
+
+func TestRemoveAccents(t *testing.T) {
+	c := qt.New(t)
+
+	c.Assert(string(RemoveAccents([]byte("Resumé"))), qt.Equals, "Resume")
+	c.Assert(string(RemoveAccents([]byte("Hugo Rocks!"))), qt.Equals, "Hugo Rocks!")
+	c.Assert(string(RemoveAccentsString("Resumé")), qt.Equals, "Resume")
+
+}
--- a/helpers/content.go
+++ b/helpers/content.go
@@ -48,8 +48,9 @@
 
 // ContentSpec provides functionality to render markdown content.
 type ContentSpec struct {
-	Converters       markup.ConverterProvider
-	MardownConverter converter.Converter // Markdown converter with no document context
+	Converters          markup.ConverterProvider
+	MardownConverter    converter.Converter // Markdown converter with no document context
+	anchorNameSanitizer converter.AnchorNameSanitizer
 
 	// SummaryLength is the length of the summary that Hugo extracts from a content.
 	summaryLength int
@@ -91,6 +92,17 @@
 		return nil, err
 	}
 	spec.MardownConverter = conv
+	if as, ok := conv.(converter.AnchorNameSanitizer); ok {
+		spec.anchorNameSanitizer = as
+	} else {
+		// Use Goldmark's sanitizer
+		p := converterProvider.Get("goldmark")
+		conv, err := p.New(converter.DocumentContext{})
+		if err != nil {
+			return nil, err
+		}
+		spec.anchorNameSanitizer = conv.(converter.AnchorNameSanitizer)
+	}
 
 	return spec, nil
 }
@@ -190,6 +202,10 @@
 		return nil, err
 	}
 	return b.Bytes(), nil
+}
+
+func (c *ContentSpec) SanitizeAnchorName(s string) string {
+	return c.anchorNameSanitizer.SanitizeAnchorName(s)
 }
 
 func (c *ContentSpec) ResolveMarkup(in string) string {
--- a/helpers/path.go
+++ b/helpers/path.go
@@ -24,6 +24,8 @@
 	"strings"
 	"unicode"
 
+	"github.com/gohugoio/hugo/common/text"
+
 	"github.com/gohugoio/hugo/config"
 
 	"github.com/gohugoio/hugo/hugofs"
@@ -31,9 +33,6 @@
 	"github.com/gohugoio/hugo/common/hugio"
 	_errors "github.com/pkg/errors"
 	"github.com/spf13/afero"
-	"golang.org/x/text/runes"
-	"golang.org/x/text/transform"
-	"golang.org/x/text/unicode/norm"
 )
 
 var (
@@ -134,6 +133,10 @@
 // are also removed.
 // Spaces will be replaced with a single hyphen, and sequential hyphens will be reduced to one.
 func (p *PathSpec) UnicodeSanitize(s string) string {
+	if p.RemovePathAccents {
+		s = text.RemoveAccentsString(s)
+	}
+
 	source := []rune(s)
 	target := make([]rune, 0, len(source))
 	var prependHyphen bool
@@ -154,17 +157,7 @@
 		}
 	}
 
-	var result string
-
-	if p.RemovePathAccents {
-		// remove accents - see https://blog.golang.org/normalization
-		t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
-		result, _, _ = transform.String(t, string(target))
-	} else {
-		result = string(target)
-	}
-
-	return result
+	return string(target)
 }
 
 // ReplaceExtension takes a path and an extension, strips the old extension
--- a/markup/blackfriday/convert.go
+++ b/markup/blackfriday/convert.go
@@ -60,6 +60,10 @@
 	cfg        converter.ProviderConfig
 }
 
+func (c *blackfridayConverter) SanitizeAnchorName(s string) string {
+	return blackfriday.SanitizedAnchorName(s)
+}
+
 func (c *blackfridayConverter) AnchorSuffix() string {
 	if c.bf.PlainIDAnchors {
 		return ""
@@ -204,5 +208,6 @@
 }
 
 var (
-	_ converter.DocumentInfo = (*blackfridayConverter)(nil)
+	_ converter.DocumentInfo        = (*blackfridayConverter)(nil)
+	_ converter.AnchorNameSanitizer = (*blackfridayConverter)(nil)
 )
--- a/markup/converter/converter.go
+++ b/markup/converter/converter.go
@@ -87,6 +87,11 @@
 	TableOfContents() tableofcontents.Root
 }
 
+// AnchorNameSanitizer tells how a converter sanitizes anchor names.
+type AnchorNameSanitizer interface {
+	SanitizeAnchorName(s string) string
+}
+
 // Bytes holds a byte slice and implements the Result interface.
 type Bytes []byte
 
--- /dev/null
+++ b/markup/goldmark/autoid.go
@@ -1,0 +1,125 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goldmark
+
+import (
+	"bytes"
+	"strconv"
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/gohugoio/hugo/common/text"
+
+	"github.com/yuin/goldmark/ast"
+	"github.com/yuin/goldmark/parser"
+	"github.com/yuin/goldmark/util"
+
+	bp "github.com/gohugoio/hugo/bufferpool"
+)
+
+func sanitizeAnchorNameString(s string, asciiOnly bool) string {
+	return string(sanitizeAnchorName([]byte(s), asciiOnly))
+}
+
+func sanitizeAnchorName(b []byte, asciiOnly bool) []byte {
+	return sanitizeAnchorNameWithHook(b, asciiOnly, nil)
+}
+
+func sanitizeAnchorNameWithHook(b []byte, asciiOnly bool, hook func(buf *bytes.Buffer)) []byte {
+	buf := bp.GetBuffer()
+
+	if asciiOnly {
+		// Strip accents first so that accented letters keep their ASCII base letter.
+		b = text.RemoveAccents(b)
+	}
+
+	for len(b) > 0 {
+		r, size := utf8.DecodeRune(b)
+		switch {
+		case asciiOnly && size != 1:
+		case isSpace(r):
+			buf.WriteString("-")
+		case r == '-' || isAlphaNumeric(r):
+			buf.WriteRune(unicode.ToLower(r))
+		default:
+		}
+
+		b = b[size:]
+	}
+
+	if hook != nil {
+		hook(buf)
+	}
+
+	result := make([]byte, buf.Len())
+	copy(result, buf.Bytes())
+
+	bp.PutBuffer(buf)
+
+	return result
+}
+
+func isAlphaNumeric(r rune) bool {
+	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
+}
+
+func isSpace(r rune) bool {
+	return r == ' ' || r == '\t'
+}
+
+var _ parser.IDs = (*idFactory)(nil)
+
+type idFactory struct {
+	asciiOnly bool
+	vals      map[string]struct{}
+}
+
+func newIDFactory(asciiOnly bool) *idFactory {
+	return &idFactory{
+		vals:      make(map[string]struct{}),
+		asciiOnly: asciiOnly,
+	}
+}
+
+func (ids *idFactory) Generate(value []byte, kind ast.NodeKind) []byte {
+	return sanitizeAnchorNameWithHook(value, ids.asciiOnly, func(buf *bytes.Buffer) {
+		if buf.Len() == 0 {
+			if kind == ast.KindHeading {
+				buf.WriteString("heading")
+			} else {
+				buf.WriteString("id")
+			}
+		}
+
+		if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; found {
+			// Append a hyphen and a number, starting with 1.
+			buf.WriteRune('-')
+			pos := buf.Len()
+			for i := 1; ; i++ {
+				buf.WriteString(strconv.Itoa(i))
+				if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; !found {
+					break
+				}
+				buf.Truncate(pos)
+			}
+		}
+
+		ids.vals[buf.String()] = struct{}{}
+
+	})
+}
+
+func (ids *idFactory) Put(value []byte) {
+	ids.vals[util.BytesToReadOnlyString(value)] = struct{}{}
+}
--- /dev/null
+++ b/markup/goldmark/autoid_test.go
@@ -1,0 +1,121 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goldmark
+
+import (
+	"strings"
+	"testing"
+
+	qt "github.com/frankban/quicktest"
+)
+
+func TestSanitizeAnchorName(t *testing.T) {
+	c := qt.New(t)
+
+	// Tests generated manually on github.com
+	tests := `
+God is good: 神真美好
+Number 32
+Question?
+1+2=3
+Special !"#$%&(parens)=?´* chars
+Resumé
+One-Hyphen
+Multiple--Hyphens
+Trailing hyphen-
+Many   spaces  here
+Forward/slash
+Backward\slash
+Under_score
+`
+
+	expect := `
+god-is-good-神真美好
+number-32
+question
+123
+special-parens-chars
+resumé
+one-hyphen
+multiple--hyphens
+trailing-hyphen-
+many---spaces--here
+forwardslash
+backwardslash
+under_score
+`
+
+	tests, expect = strings.TrimSpace(tests), strings.TrimSpace(expect)
+
+	testlines, expectlines := strings.Split(tests, "\n"), strings.Split(expect, "\n")
+
+	if len(testlines) != len(expectlines) {
+		panic("test setup failed")
+	}
+
+	for i, input := range testlines {
+		input := input
+		expect := expectlines[i]
+		c.Run(input, func(c *qt.C) {
+			b := []byte(input)
+			got := string(sanitizeAnchorName(b, false))
+			c.Assert(got, qt.Equals, expect)
+			c.Assert(sanitizeAnchorNameString(input, false), qt.Equals, expect)
+			c.Assert(string(b), qt.Equals, input)
+		})
+	}
+}
+
+func TestSanitizeAnchorNameAsciiOnly(t *testing.T) {
+	c := qt.New(t)
+
+	c.Assert(sanitizeAnchorNameString("god is神真美好 good", true), qt.Equals, "god-is-good")
+	c.Assert(sanitizeAnchorNameString("Resumé", true), qt.Equals, "resume")
+
+}
+
+func BenchmarkSanitizeAnchorName(b *testing.B) {
+	input := []byte("God is good: 神真美好")
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		result := sanitizeAnchorName(input, false)
+		if len(result) != 24 {
+			b.Fatalf("got %d", len(result))
+
+		}
+	}
+}
+
+func BenchmarkSanitizeAnchorNameAsciiOnly(b *testing.B) {
+	input := []byte("God is good: 神真美好")
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		result := sanitizeAnchorName(input, true)
+		if len(result) != 12 {
+			b.Fatalf("got %d", len(result))
+
+		}
+	}
+}
+
+func BenchmarkSanitizeAnchorNameString(b *testing.B) {
+	input := "God is good: 神真美好"
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		result := sanitizeAnchorNameString(input, false)
+		if len(result) != 24 {
+			b.Fatalf("got %d", len(result))
+		}
+	}
+}
--- a/markup/goldmark/convert.go
+++ b/markup/goldmark/convert.go
@@ -50,21 +50,35 @@
 
 func (p provide) New(cfg converter.ProviderConfig) (converter.Provider, error) {
 	md := newMarkdown(cfg)
+
 	return converter.NewProvider("goldmark", func(ctx converter.DocumentContext) (converter.Converter, error) {
 		return &goldmarkConverter{
 			ctx: ctx,
 			cfg: cfg,
 			md:  md,
+			sanitizeAnchorName: func(s string) string {
+				return sanitizeAnchorNameString(s, cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly)
+			},
 		}, nil
 	}), nil
 }
 
+var (
+	_ converter.AnchorNameSanitizer = (*goldmarkConverter)(nil)
+)
+
 type goldmarkConverter struct {
 	md  goldmark.Markdown
 	ctx converter.DocumentContext
 	cfg converter.ProviderConfig
+
+	sanitizeAnchorName func(s string) string
 }
 
+func (c *goldmarkConverter) SanitizeAnchorName(s string) string {
+	return c.sanitizeAnchorName(s)
+}
+
 func newMarkdown(pcfg converter.ProviderConfig) goldmark.Markdown {
 	mcfg := pcfg.MarkupConfig
 	cfg := pcfg.MarkupConfig.Goldmark
@@ -226,7 +240,7 @@
 
 	buf := &bufWriter{Buffer: &bytes.Buffer{}}
 	result = buf
-	pctx := newParserContext(ctx)
+	pctx := c.newParserContext(ctx)
 	reader := text.NewReader(ctx.Src)
 
 	doc := c.md.Parser().Parse(
@@ -265,8 +279,8 @@
 	return featureSet[feature.GetIdentity()]
 }
 
-func newParserContext(rctx converter.RenderContext) *parserContext {
-	ctx := parser.NewContext()
+func (c *goldmarkConverter) newParserContext(rctx converter.RenderContext) *parserContext {
+	ctx := parser.NewContext(parser.WithIDs(newIDFactory(c.cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly)))
 	ctx.Set(tocEnableKey, rctx.RenderTOC)
 	return &parserContext{
 		Context: ctx,
--- a/markup/goldmark/convert_test.go
+++ b/markup/goldmark/convert_test.go
@@ -28,6 +28,23 @@
 	qt "github.com/frankban/quicktest"
 )
 
+func convert(c *qt.C, mconf markup_config.Config, content string) converter.Result {
+
+	p, err := Provider.New(
+		converter.ProviderConfig{
+			MarkupConfig: mconf,
+			Logger:       loggers.NewErrorLogger(),
+		},
+	)
+	c.Assert(err, qt.IsNil)
+	conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"})
+	c.Assert(err, qt.IsNil)
+	b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)})
+	c.Assert(err, qt.IsNil)
+
+	return b
+}
+
 func TestConvert(t *testing.T) {
 	c := qt.New(t)
 
@@ -92,6 +109,12 @@
 : the description for the content.
 
 
+## 神真美好
+
+## 神真美好
+
+## 神真美好
+
 [^1]: And that's the footnote.
 
 `
@@ -98,23 +121,11 @@
 
 	// Code fences
 	content = strings.Replace(content, "§§§", "```", -1)
-
 	mconf := markup_config.Default
 	mconf.Highlight.NoClasses = false
 	mconf.Goldmark.Renderer.Unsafe = true
 
-	p, err := Provider.New(
-		converter.ProviderConfig{
-			MarkupConfig: mconf,
-			Logger:       loggers.NewErrorLogger(),
-		},
-	)
-	c.Assert(err, qt.IsNil)
-	conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"})
-	c.Assert(err, qt.IsNil)
-	b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)})
-	c.Assert(err, qt.IsNil)
-
+	b := convert(c, mconf, content)
 	got := string(b.Bytes())
 
 	// Links
@@ -123,6 +134,9 @@
 	// Header IDs
 	c.Assert(got, qt.Contains, `<h2 id="custom">Custom ID</h2>`, qt.Commentf(got))
 	c.Assert(got, qt.Contains, `<h2 id="auto-id">Auto ID</h2>`, qt.Commentf(got))
+	c.Assert(got, qt.Contains, `<h2 id="神真美好">神真美好</h2>`, qt.Commentf(got))
+	c.Assert(got, qt.Contains, `<h2 id="神真美好-1">神真美好</h2>`, qt.Commentf(got))
+	c.Assert(got, qt.Contains, `<h2 id="神真美好-2">神真美好</h2>`, qt.Commentf(got))
 
 	// Code fences
 	c.Assert(got, qt.Contains, "<div class=\"highlight\"><pre class=\"chroma\"><code class=\"language-bash\" data-lang=\"bash\">LINE1\n</code></pre></div>")
@@ -146,6 +160,20 @@
 	tocHTML := toc.TableOfContents().ToHTML(1, 2, false)
 	c.Assert(tocHTML, qt.Contains, "TableOfContents")
 
+}
+
+func TestConvertAutoIDAsciiOnly(t *testing.T) {
+	c := qt.New(t)
+
+	content := `
+## God is Good: 神真美好
+`
+	mconf := markup_config.Default
+	mconf.Goldmark.Parser.AutoHeadingIDAsciiOnly = true
+	b := convert(c, mconf, content)
+	got := string(b.Bytes())
+
+	c.Assert(got, qt.Contains, "<h2 id=\"god-is-good-\">")
 }
 
 func TestCodeFence(t *testing.T) {
--- a/markup/goldmark/goldmark_config/config.go
+++ b/markup/goldmark/goldmark_config/config.go
@@ -69,6 +69,10 @@
 	// auto generated heading ids.
 	AutoHeadingID bool
 
+	// When AutoHeadingID is enabled this will generate IDs with ASCII
+	// characters only.
+	AutoHeadingIDAsciiOnly bool
+
 	// Enables custom attributes.
 	Attribute bool
 }
--- a/tpl/urls/urls.go
+++ b/tpl/urls/urls.go
@@ -25,7 +25,6 @@
 	"github.com/gohugoio/hugo/common/urls"
 	"github.com/gohugoio/hugo/deps"
 	_errors "github.com/pkg/errors"
-	"github.com/russross/blackfriday"
 	"github.com/spf13/cast"
 )
 
@@ -90,7 +89,7 @@
 	if err != nil {
 		return "", nil
 	}
-	return blackfriday.SanitizedAnchorName(s), nil
+	return ns.deps.ContentSpec.SanitizeAnchorName(s), nil
 }
 
 // Ref returns the absolute URL path to a given content item.