ref: 9cd54cab20a03475e34ca462bd943069111481ae
parent: a8853f1c5ace30ae8d256ad374bdb280c95d4228
author: Bjørn Erik Pedersen <[email protected]>
date: Mon Dec 17 16:03:23 EST 2018
Move the emoji parsing to pageparser. This avoids double parsing the page content when `enableEmoji=true`. This commit also adds some general improvements to the parser, making it much faster in general: ```bash benchmark old ns/op new ns/op delta BenchmarkShortcodeLexer-4 90258 101730 +12.71% BenchmarkParse-4 148940 15037 -89.90% benchmark old allocs new allocs delta BenchmarkShortcodeLexer-4 456 700 +53.51% BenchmarkParse-4 28 33 +17.86% benchmark old bytes new bytes delta BenchmarkShortcodeLexer-4 69875 81014 +15.94% BenchmarkParse-4 8128 8304 +2.17% ``` Running some site benchmarks with Emoji support turned on: ```bash benchmark old ns/op new ns/op delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 924556797 818115620 -11.51% benchmark old allocs new allocs delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 4112613 4133787 +0.51% benchmark old bytes new bytes delta BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4 426982864 424363832 -0.61% ``` Fixes #5534
--- a/commands/convert.go
+++ b/commands/convert.go
@@ -215,7 +215,7 @@
func parseContentFile(r io.Reader) (parsedFile, error) {
var pf parsedFile
- psr, err := pageparser.Parse(r)
+ psr, err := pageparser.Parse(r, pageparser.Config{})
if err != nil {
return pf, err
}
--- a/helpers/emoji.go
+++ b/helpers/emoji.go
@@ -30,6 +30,12 @@
emojiMaxSize int
)
+// Emoji returns the emoji for the given key, e.g. ":smile:", or nil if not found.
+func Emoji(key string) []byte {
+ emojiInit.Do(initEmoji)
+ return emojis[key]
+}
+
// Emojify "emojifies" the input source.
// Note that the input byte slice will be modified if needed.
// See http://www.emoji-cheat-sheet.com/
--- a/hugolib/page_content.go
+++ b/hugolib/page_content.go
@@ -17,6 +17,8 @@
"bytes"
"io"
+ "github.com/gohugoio/hugo/helpers"
+
errors "github.com/pkg/errors"
bp "github.com/gohugoio/hugo/bufferpool"
@@ -149,6 +151,12 @@
result.WriteString(placeHolder)
ordinal++
s.shortcodes.Add(placeHolder, currShortcode)
+ case it.Type == pageparser.TypeEmoji:
+ if emoji := helpers.Emoji(it.ValStr()); emoji != nil {
+ result.Write(emoji)
+ } else {
+ result.Write(it.Val)
+ }
case it.IsEOF():
break Loop
case it.IsError():
@@ -170,7 +178,10 @@
func (p *Page) parse(reader io.Reader) error {
- parseResult, err := pageparser.Parse(reader)
+ parseResult, err := pageparser.Parse(
+ reader,
+ pageparser.Config{EnableEmoji: p.s.Cfg.GetBool("enableEmoji")},
+ )
if err != nil {
return err
}
--- a/hugolib/page_test.go
+++ b/hugolib/page_test.go
@@ -1497,6 +1497,45 @@
checkPageTitle(t, p, "Simple")
}
+func TestPageWithEmoji(t *testing.T) {
+ for _, enableEmoji := range []bool{true, false} {
+ v := viper.New()
+ v.Set("enableEmoji", enableEmoji)
+ b := newTestSitesBuilder(t)
+ b.WithViper(v)
+
+ b.WithSimpleConfigFile()
+
+ b.WithContent("page-emoji.md", `---
+title: "Hugo Smile"
+---
+This is a :smile:.
+<!--more-->
+
+Another :smile: This is :not: an emoji.
+
+`)
+
+ b.CreateSites().Build(BuildCfg{})
+
+ if enableEmoji {
+ b.AssertFileContent("public/page-emoji/index.html",
+ "This is a 😄",
+ "Another 😄",
+ "This is :not: an emoji",
+ )
+ } else {
+ b.AssertFileContent("public/page-emoji/index.html",
+ "This is a :smile:",
+ "Another :smile:",
+ "This is :not: an emoji",
+ )
+ }
+
+ }
+
+}
+
// https://github.com/gohugoio/hugo/issues/5381
func TestPageManualSummary(t *testing.T) {
b := newTestSitesBuilder(t)
--- a/hugolib/pagebundler_handlers.go
+++ b/hugolib/pagebundler_handlers.go
@@ -272,10 +272,6 @@
p := ctx.currentPage
- if c.s.Cfg.GetBool("enableEmoji") {
- p.workContent = helpers.Emojify(p.workContent)
- }
-
p.workContent = p.renderContent(p.workContent)
tmpContent, tmpTableOfContents := helpers.ExtractTOC(p.workContent)
--- a/hugolib/shortcode.go
+++ b/hugolib/shortcode.go
@@ -177,6 +177,16 @@
pos int // the position in bytes in the source file
}
+func (s shortcode) innerString() string {
+ var sb strings.Builder
+
+ for _, inner := range s.inner {
+ sb.WriteString(inner.(string))
+ }
+
+ return sb.String()
+}
+
func (sc shortcode) String() string {
// for testing (mostly), so any change here will break tests!
var params interface{}
@@ -363,7 +373,7 @@
if sc.isInline {
templName := path.Join("_inline_shortcode", p.Path(), sc.name)
if sc.isClosing {
- templStr := sc.inner[0].(string)
+ templStr := sc.innerString()
var err error
tmpl, err = p.s.TextTmpl.Parse(templName, templStr)
--- a/parser/pageparser/item.go
+++ b/parser/pageparser/item.go
@@ -113,6 +113,7 @@
TypeFrontMatterTOML
TypeFrontMatterJSON
TypeFrontMatterORG
+ TypeEmoji
TypeIgnore // // The BOM Unicode byte order marker and possibly others
// shortcode items
--- a/parser/pageparser/pagelexer.go
+++ b/parser/pageparser/pagelexer.go
@@ -37,6 +37,12 @@
start int // item start position
width int // width of last element
+ // Contains lexers for shortcodes and other main section
+ // elements.
+ sectionHandlers *sectionHandlers
+
+ cfg Config
+
// The summary divider to look for.
summaryDivider []byte
// Set when we have parsed any summary divider
@@ -60,13 +66,17 @@
}
+type Config struct {
+ EnableEmoji bool
+}
+
// note: the input position here is normally 0 (start), but
// can be set if position of first shortcode is known
-func newPageLexer(input []byte, inputPosition int, stateStart stateFunc) *pageLexer {
+func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
lexer := &pageLexer{
input: input,
- pos: inputPosition,
stateStart: stateStart,
+ cfg: cfg,
lexerShortcodeState: lexerShortcodeState{
currLeftDelimItem: tLeftDelimScNoMarkup,
currRightDelimItem: tRightDelimScNoMarkup,
@@ -75,6 +85,8 @@
items: make([]Item, 0, 5),
}
+ lexer.sectionHandlers = createSectionHandlers(lexer)
+
return lexer
}
@@ -100,6 +112,8 @@
delimOrg = []byte("#+")
htmlCommentStart = []byte("<!--")
htmlCommentEnd = []byte("-->")
+
+ emojiDelim = byte(':')
)
func (l *pageLexer) next() rune {
@@ -132,6 +146,10 @@
l.start = l.pos
}
+func (l *pageLexer) isEOF() bool {
+ return l.pos >= len(l.input)
+}
+
// special case, do not send '\\' back to client
func (l *pageLexer) ignoreEscapesAndEmit(t ItemType) {
val := bytes.Map(func(r rune) rune {
@@ -193,30 +211,80 @@
}
}
-func lexMainSection(l *pageLexer) stateFunc {
- if l.isInHTMLComment {
- return lexEndFromtMatterHTMLComment
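+// lexEmoji lexes an emoji candidate starting at ":"; it emits TypeEmoji if a closing ":" follows one or more characters accepted by isAlphaNumeric, otherwise it emits the lone ":" as text.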
+func lexEmoji(l *pageLexer) stateFunc {
+ pos := l.pos + 1
+ valid := false
+
+ for i := pos; i < len(l.input); i++ {
+ if i > pos && l.input[i] == emojiDelim {
+ pos = i + 1
+ valid = true
+ break
+ }
+ r, _ := utf8.DecodeRune(l.input[i:])
+ if !isAlphaNumeric(r) {
+ break
+ }
}
- // Fast forward as far as possible.
- var l1, l2 int
+ if valid {
+ l.pos = pos
+ l.emit(TypeEmoji)
+ } else {
+ l.pos++
+ l.emit(tText)
+ }
- if !l.summaryDividerChecked && l.summaryDivider != nil {
- l1 = l.index(l.summaryDivider)
- if l1 == -1 {
- l.summaryDividerChecked = true
+ return lexMainSection
+}
+
+type sectionHandlers struct {
+ l *pageLexer
+
+ // Set when none of the sections are found so we
+ // can safely stop looking and skip to the end.
+ skipAll bool
+
+ handlers []*sectionHandler
+ skipIndexes []int
+}
+
+func (s *sectionHandlers) skip() int {
+ if s.skipAll {
+ return -1
+ }
+
+ s.skipIndexes = s.skipIndexes[:0]
+ var shouldSkip bool
+ for _, skipper := range s.handlers {
+ idx := skipper.skip()
+ if idx != -1 {
+ shouldSkip = true
+ s.skipIndexes = append(s.skipIndexes, idx)
}
}
- l2 = l.index(leftDelimSc)
- skip := minIndex(l1, l2)
-
- if skip > 0 {
- l.pos += skip
+ if !shouldSkip {
+ s.skipAll = true
+ return -1
}
- for {
- if l.isShortCodeStart() {
+ return minIndex(s.skipIndexes...)
+}
+
+func createSectionHandlers(l *pageLexer) *sectionHandlers {
+
+ shortCodeHandler := §ionHandler{
+ l: l,
+ skipFunc: func(l *pageLexer) int {
+ return l.index(leftDelimSc)
+ },
+ lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
+ if !l.isShortCodeStart() {
+ return origin, false
+ }
+
if l.isInline {
// If we're inside an inline shortcode, the only valid shortcode markup is
// the markup which closes it.
@@ -225,14 +293,11 @@
if end != len(l.input)-1 {
b = bytes.TrimSpace(b[end+1:])
if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) {
- return l.errorf("inline shortcodes do not support nesting")
+ return l.errorf("inline shortcodes do not support nesting"), true
}
}
}
- if l.pos > l.start {
- l.emit(tText)
- }
if l.hasPrefix(leftDelimScWithMarkup) {
l.currLeftDelimItem = tLeftDelimScWithMarkup
l.currRightDelimItem = tRightDelimScWithMarkup
@@ -240,32 +305,139 @@
l.currLeftDelimItem = tLeftDelimScNoMarkup
l.currRightDelimItem = tRightDelimScNoMarkup
}
- return lexShortcodeLeftDelim
- }
- if !l.summaryDividerChecked && l.summaryDivider != nil {
- if l.hasPrefix(l.summaryDivider) {
- if l.pos > l.start {
- l.emit(tText)
- }
- l.summaryDividerChecked = true
- l.pos += len(l.summaryDivider)
- // This makes it a little easier to reason about later.
- l.consumeSpace()
- l.emit(TypeLeadSummaryDivider)
+ return lexShortcodeLeftDelim, true
+ },
+ }
- // We have already moved to the next.
- continue
+ summaryDividerHandler := §ionHandler{
+ l: l,
+ skipFunc: func(l *pageLexer) int {
+ if l.summaryDividerChecked || l.summaryDivider == nil {
+ return -1
+
}
+ return l.index(l.summaryDivider)
+ },
+ lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
+ if !l.hasPrefix(l.summaryDivider) {
+ return origin, false
+ }
+
+ l.summaryDividerChecked = true
+ l.pos += len(l.summaryDivider)
+ // This makes it a little easier to reason about later.
+ l.consumeSpace()
+ l.emit(TypeLeadSummaryDivider)
+
+ return origin, true
+
+ },
+ }
+
+ handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler}
+
+ if l.cfg.EnableEmoji {
+ emojiHandler := §ionHandler{
+ l: l,
+ skipFunc: func(l *pageLexer) int {
+ return l.indexByte(emojiDelim)
+ },
+ lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
+ return lexEmoji, true
+ },
}
- r := l.next()
- if r == eof {
- break
+ handlers = append(handlers, emojiHandler)
+ }
+
+ return §ionHandlers{
+ l: l,
+ handlers: handlers,
+ skipIndexes: make([]int, len(handlers)),
+ }
+}
+
+func (s *sectionHandlers) lex(origin stateFunc) stateFunc {
+ if s.skipAll {
+ return nil
+ }
+
+ if s.l.pos > s.l.start {
+ s.l.emit(tText)
+ }
+
+ for _, handler := range s.handlers {
+ if handler.skipAll {
+ continue
}
+ next, handled := handler.lexFunc(origin, handler.l)
+ if next == nil || handled {
+ return next
+ }
}
+ // Not handled by the above.
+ s.l.pos++
+
+ return origin
+}
+
+type sectionHandler struct {
+ l *pageLexer
+
+ // No more sections of this type.
+ skipAll bool
+
+ // Returns the index of the next match, -1 if none found.
+ skipFunc func(l *pageLexer) int
+
+ // lexFunc lexes the current section and returns the next state func and
+ // a bool telling if this section was handled.
+ // Note that returning nil as the next state will terminate the
+ // lexer.
+ lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool)
+}
+
+func (s *sectionHandler) skip() int {
+ if s.skipAll {
+ return -1
+ }
+
+ idx := s.skipFunc(s.l)
+ if idx == -1 {
+ s.skipAll = true
+ }
+ return idx
+}
+
+func lexMainSection(l *pageLexer) stateFunc {
+
+ if l.isEOF() {
+ return lexDone
+ }
+
+ if l.isInHTMLComment {
+ return lexEndFromtMatterHTMLComment
+ }
+
+ // Fast forward as far as possible.
+ skip := l.sectionHandlers.skip()
+
+ if skip == -1 {
+ l.pos = len(l.input)
+ return lexDone
+ } else if skip > 0 {
+ l.pos += skip
+ }
+
+ next := l.sectionHandlers.lex(lexMainSection)
+ if next != nil {
+ return next
+ }
+
+ l.pos = len(l.input)
return lexDone
}
@@ -297,8 +469,20 @@
return bytes.Index(l.input[l.pos:], sep)
}
+func (l *pageLexer) indexByte(sep byte) int {
+ return bytes.IndexByte(l.input[l.pos:], sep)
+}
+
func (l *pageLexer) hasPrefix(prefix []byte) bool {
return bytes.HasPrefix(l.input[l.pos:], prefix)
+}
+
+func (l *pageLexer) hasPrefixByte(prefix byte) bool {
+ b := l.input[l.pos:]
+ if len(b) == 0 {
+ return false
+ }
+ return b[0] == prefix
}
// helper functions
--- a/parser/pageparser/pageparser.go
+++ b/parser/pageparser/pageparser.go
@@ -27,7 +27,7 @@
// Result holds the parse result.
type Result interface {
- // Iterator returns a new Iterator positioned at the benning of the parse tree.
+ // Iterator returns a new Iterator positioned at the beginning of the parse tree.
Iterator() *Iterator
// Input returns the input to Parse.
Input() []byte
@@ -35,25 +35,19 @@
var _ Result = (*pageLexer)(nil)
-// Parse parses the page in the given reader.
-func Parse(r io.Reader) (Result, error) {
+// Parse parses the page in the given reader according to the given Config.
+func Parse(r io.Reader, cfg Config) (Result, error) {
b, err := ioutil.ReadAll(r)
if err != nil {
return nil, errors.Wrap(err, "failed to read page content")
}
- return parseBytes(b)
+ return parseBytes(b, cfg)
}
-func parseBytes(b []byte) (Result, error) {
- lexer := newPageLexer(b, 0, lexIntroSection)
+func parseBytes(b []byte, cfg Config) (Result, error) {
+ lexer := newPageLexer(b, lexIntroSection, cfg)
lexer.run()
return lexer, nil
-}
-
-func parseMainSection(input []byte, from int) Result {
- lexer := newPageLexer(input, from, lexMainSection)
- lexer.run()
- return lexer
}
// An Iterator has methods to iterate a parsed page with support going back
--- a/parser/pageparser/pageparser_intro_test.go
+++ b/parser/pageparser/pageparser_intro_test.go
@@ -88,8 +88,8 @@
}
}
-func collect(input []byte, skipFrontMatter bool, stateStart stateFunc) (items []Item) {
- l := newPageLexer(input, 0, stateStart)
+func collectWithConfig(input []byte, skipFrontMatter bool, stateStart stateFunc, cfg Config) (items []Item) {
+ l := newPageLexer(input, stateStart, cfg)
l.run()
t := l.newIterator()
@@ -101,6 +101,13 @@
}
}
return
+}
+
+func collect(input []byte, skipFrontMatter bool, stateStart stateFunc) (items []Item) {
+ var cfg Config
+
+ return collectWithConfig(input, skipFrontMatter, stateStart, cfg)
+
}
// no positional checking, for now ...
--- /dev/null
+++ b/parser/pageparser/pageparser_main_test.go
@@ -1,0 +1,40 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pageparser
+
+import (
+ "fmt"
+ "testing"
+)
+
+func TestMain(t *testing.T) {
+ t.Parallel()
+
+ var mainTests = []lexerTest{
+ {"emoji #1", "Some text with :emoji:", []Item{nti(tText, "Some text with "), nti(TypeEmoji, ":emoji:"), tstEOF}},
+ {"emoji #2", "Some text with :emoji: and some text.", []Item{nti(tText, "Some text with "), nti(TypeEmoji, ":emoji:"), nti(tText, " and some text."), tstEOF}},
+ {"looks like an emoji #1", "Some text and then :emoji", []Item{nti(tText, "Some text and then "), nti(tText, ":"), nti(tText, "emoji"), tstEOF}},
+ {"looks like an emoji #2", "Some text and then ::", []Item{nti(tText, "Some text and then "), nti(tText, ":"), nti(tText, ":"), tstEOF}},
+ {"looks like an emoji #3", ":Some :text", []Item{nti(tText, ":"), nti(tText, "Some "), nti(tText, ":"), nti(tText, "text"), tstEOF}},
+ }
+
+ for i, test := range mainTests {
+ items := collectWithConfig([]byte(test.input), false, lexMainSection, Config{EnableEmoji: true})
+ if !equal(items, test.items) {
+ got := crLfReplacer.Replace(fmt.Sprint(items))
+ expected := crLfReplacer.Replace(fmt.Sprint(test.items))
+ t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, got, expected)
+ }
+ }
+}
--- a/parser/pageparser/pageparser_shortcode_test.go
+++ b/parser/pageparser/pageparser_shortcode_test.go
@@ -152,7 +152,8 @@
{"basic inline", `{{< sc1.inline >}}Hello World{{< /sc1.inline >}}`, []Item{tstLeftNoMD, tstSC1Inline, tstRightNoMD, tstText, tstLeftNoMD, tstSCClose, tstSC1Inline, tstRightNoMD, tstEOF}},
{"basic inline with space", `{{< sc1.inline >}}Hello World{{< / sc1.inline >}}`, []Item{tstLeftNoMD, tstSC1Inline, tstRightNoMD, tstText, tstLeftNoMD, tstSCClose, tstSC1Inline, tstRightNoMD, tstEOF}},
{"inline self closing", `{{< sc1.inline >}}Hello World{{< /sc1.inline >}}Hello World{{< sc1.inline />}}`, []Item{tstLeftNoMD, tstSC1Inline, tstRightNoMD, tstText, tstLeftNoMD, tstSCClose, tstSC1Inline, tstRightNoMD, tstText, tstLeftNoMD, tstSC1Inline, tstSCClose, tstRightNoMD, tstEOF}},
- {"inline with nested shortcode (not supported)", `{{< sc1.inline >}}Hello World{{< sc1 >}}{{< /sc1.inline >}}`, []Item{tstLeftNoMD, tstSC1Inline, tstRightNoMD, nti(tError, "inline shortcodes do not support nesting")}},
+ {"inline with template syntax", `{{< sc1.inline >}}{{ .Get 0 }}{{ .Get 1 }}{{< /sc1.inline >}}`, []Item{tstLeftNoMD, tstSC1Inline, tstRightNoMD, nti(tText, "{{ .Get 0 }}"), nti(tText, "{{ .Get 1 }}"), tstLeftNoMD, tstSCClose, tstSC1Inline, tstRightNoMD, tstEOF}},
+ {"inline with nested shortcode (not supported)", `{{< sc1.inline >}}Hello World{{< sc1 >}}{{< /sc1.inline >}}`, []Item{tstLeftNoMD, tstSC1Inline, tstRightNoMD, tstText, nti(tError, "inline shortcodes do not support nesting")}},
{"inline case mismatch", `{{< sc1.Inline >}}Hello World{{< /sc1.Inline >}}`, []Item{tstLeftNoMD, nti(tError, "period in shortcode name only allowed for inline identifiers")}},
}
@@ -171,10 +172,11 @@
for i, input := range shortCodeLexerTests {
testInputs[i] = []byte(input.input)
}
+ var cfg Config
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, input := range testInputs {
- items := collect(input, true, lexMainSection)
+ items := collectWithConfig(input, true, lexMainSection, cfg)
if len(items) == 0 {
}
--- a/parser/pageparser/pageparser_test.go
+++ b/parser/pageparser/pageparser_test.go
@@ -34,10 +34,37 @@
`
input := []byte(start + strings.Repeat(strings.Repeat("this is text", 30)+"{{< myshortcode >}}This is some inner content.{{< /myshortcode >}}", 10))
+ cfg := Config{EnableEmoji: false}
b.ResetTimer()
for i := 0; i < b.N; i++ {
- if _, err := parseBytes(input); err != nil {
+ if _, err := parseBytes(input, cfg); err != nil {
+ b.Fatal(err)
+ }
+ }
+}
+
+func BenchmarkParseWithEmoji(b *testing.B) {
+ start := `
+
+
+---
+title: "Front Matters"
+description: "It really does"
+---
+
+This is some summary. This is some summary. This is some summary. This is some summary.
+
+ <!--more-->
+
+
+`
+ input := []byte(start + strings.Repeat("this is not emoji: ", 50) + strings.Repeat("some text ", 70) + strings.Repeat("this is not: ", 50) + strings.Repeat("but this is a :smile: ", 3) + strings.Repeat("some text ", 70))
+ cfg := Config{EnableEmoji: true}
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ if _, err := parseBytes(input, cfg); err != nil {
b.Fatal(err)
}
}