shithub: hugo

Download patch

ref: bcd434794a28ff75a6e6504c6c3bada554ba88ce
parent: 74ffb45fbe2c121881b2386fc3210f8b1c6bd952
author: Bjørn Erik Pedersen <[email protected]>
date: Tue Aug 16 18:50:15 EDT 2016

Avoid splitting words for summary

For people using autogenerated summaries, this is one of the hot spots in the memory department.

We don't need to split all the content into words to do proper summary truncation.

This is obviously more efficient:

```
BenchmarkTestTruncateWordsToWholeSentence-4            300000          4720 ns/op           0 B/op           0 allocs/op
BenchmarkTestTruncateWordsToWholeSentenceOld-4         100000         17699 ns/op        3072 B/op           3 allocs/op
```

--- a/helpers/content.go
+++ b/helpers/content.go
@@ -21,6 +21,7 @@
 	"bytes"
 	"html/template"
 	"os/exec"
+	"unicode"
 	"unicode/utf8"
 
 	"github.com/miekg/mmark"
@@ -424,10 +425,55 @@
 	return strings.Join(words, " "), false
 }
 
-// TruncateWordsToWholeSentence takes content and an int
-// and returns entire sentences from content, delimited by the int
-// and whether it's truncated or not.
-func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
+// TruncateWordsToWholeSentence takes content and truncates to whole sentence
+// limited by max number of words. It also returns whether it is truncated.
+func TruncateWordsToWholeSentence(s string, max int) (string, bool) {
+
+	var (
+		wordCount     = 0
+		lastWordIndex = -1
+	)
+
+	for i, r := range s {
+		if unicode.IsSpace(r) {
+			wordCount++
+			lastWordIndex = i
+
+			if wordCount >= max {
+				break
+			}
+
+		}
+	}
+
+	if lastWordIndex == -1 {
+		return s, false
+	}
+
+	endIndex := -1
+
+	for j, r := range s[lastWordIndex:] {
+		if isEndOfSentence(r) {
+			endIndex = j + lastWordIndex + utf8.RuneLen(r)
+			break
+		}
+	}
+
+	if endIndex == -1 {
+		return s, false
+	}
+
+	return strings.TrimSpace(s[:endIndex]), endIndex < len(s)
+}
+
+func isEndOfSentence(r rune) bool {
+	return r == '.' || r == '?' || r == '!' || r == '"' || r == '\n'
+}
+
+// Kept only for benchmark.
+func truncateWordsToWholeSentenceOld(content string, max int) (string, bool) {
+	words := strings.Fields(content)
+
 	if max >= len(words) {
 		return strings.Join(words, " "), false
 	}
--- a/helpers/content_test.go
+++ b/helpers/content_test.go
@@ -64,6 +64,22 @@
 	assert.Equal(t, template.HTML("dobedobedo"), BytesToHTML([]byte("dobedobedo")))
 }
 
+var benchmarkTruncateString = strings.Repeat("This is a sentence about nothing.", 20)
+
+func BenchmarkTestTruncateWordsToWholeSentence(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		TruncateWordsToWholeSentence(benchmarkTruncateString, SummaryLength)
+	}
+}
+
+func BenchmarkTestTruncateWordsToWholeSentenceOld(b *testing.B) {
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		truncateWordsToWholeSentenceOld(benchmarkTruncateString, SummaryLength)
+	}
+}
+
 func TestTruncateWordsToWholeSentence(t *testing.T) {
 	type test struct {
 		input, expected string
@@ -77,10 +93,11 @@
 		{"This is a sentence.", "This is a sentence.", 5, false},
 		{"This is also a sentence!", "This is also a sentence!", 1, false},
 		{"To be. Or not to be. That's the question.", "To be.", 1, true},
-		{" \nThis is not a sentence\n ", "This is not a", 4, true},
+		{" \nThis is not a sentence\nAnd this is another", "This is not a sentence", 4, true},
+		{"", "", 10, false},
 	}
 	for i, d := range data {
-		output, truncated := TruncateWordsToWholeSentence(strings.Fields(d.input), d.max)
+		output, truncated := TruncateWordsToWholeSentence(d.input, d.max)
 		if d.expected != output {
 			t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
 		}
--- a/hugolib/page.go
+++ b/hugolib/page.go
@@ -89,6 +89,7 @@
 	plain               string // TODO should be []byte
 	plainWords          []string
 	plainInit           sync.Once
+	plainWordsInit      sync.Once
 	renderingConfig     *helpers.Blackfriday
 	renderingConfigInit sync.Once
 	pageMenus           PageMenus
@@ -147,7 +148,7 @@
 }
 
 func (p *Page) PlainWords() []string {
-	p.initPlain()
+	p.initPlainWords()
 	return p.plainWords
 }
 
@@ -154,11 +155,17 @@
 func (p *Page) initPlain() {
 	p.plainInit.Do(func() {
 		p.plain = helpers.StripHTML(string(p.Content))
-		p.plainWords = strings.Fields(p.plain)
 		return
 	})
 }
 
+func (p *Page) initPlainWords() {
+	p.plainWordsInit.Do(func() {
+		p.plainWords = strings.Fields(p.Plain())
+		return
+	})
+}
+
 func (p *Page) IsNode() bool {
 	return false
 }
@@ -335,7 +342,7 @@
 	if p.isCJKLanguage {
 		summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
 	} else {
-		summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
+		summary, truncated = helpers.TruncateWordsToWholeSentence(p.Plain(), helpers.SummaryLength)
 	}
 	p.Summary = template.HTML(summary)
 	p.Truncated = truncated
@@ -479,6 +486,10 @@
 }
 
 func (p *Page) analyzePage() {
+	// TODO(bep)
+	if true {
+		return
+	}
 	if p.isCJKLanguage {
 		p.WordCount = 0
 		for _, word := range p.PlainWords() {