shithub: hugo

Download patch

ref: 823334875d396bdc15770c335c2029a01a7ef2ce
parent: 2c045ac449fbdca33daae828813a3b4a08224ef7
author: coderzh <[email protected]>
date: Thu Sep 3 14:22:20 EDT 2015

WordCount and Summary support CJK Language

 * Add a global `hasCJKLanguage` flag; when true, each page's content is auto-detected as CJK or not
 * Add an `isCJKLanguage` front matter key that, when set, overrides auto-detection for that page
 * For .Summary: If isCJKLanguage is true, use the runes as basis for truncation, else keep as today.
 * For WordCount: If isCJKLanguage is true, use the runes as basis for calculation, else keep as today.
 * Remove the exported RuneCount method; rune-based counting now feeds WordCount for CJK pages

Fixes #1377

--- a/commands/hugo.go
+++ b/commands/hugo.go
@@ -168,6 +168,7 @@
 	viper.SetDefault("RSSUri", "index.xml")
 	viper.SetDefault("SectionPagesMenu", "")
 	viper.SetDefault("DisablePathToLower", false)
+	viper.SetDefault("HasCJKLanguage", false)
 }
 
 // InitializeConfig initializes a config file with sensible default configuration flags.
--- a/helpers/content.go
+++ b/helpers/content.go
@@ -19,9 +19,9 @@
 
 import (
 	"bytes"
-	"unicode/utf8"
 	"html/template"
 	"os/exec"
+	"unicode/utf8"
 
 	"github.com/miekg/mmark"
 	"github.com/russross/blackfriday"
@@ -178,7 +178,6 @@
 	}
 }
 
-
 func getMarkdownExtensions(ctx *RenderingContext) int {
 	flags := 0 | blackfriday.EXTENSION_NO_INTRA_EMPHASIS |
 		blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE |
@@ -385,61 +384,58 @@
 	return strings.Join(words[:max], " ")
 }
 
-// TruncateWordsToWholeSentence takes content and an int
-// and returns entire sentences from content, delimited by the int
-// and whether it's truncated or not.
-func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
+// TruncateWordsByRune joins words with single spaces, stopping once max
+// units have been counted, and reports whether anything was cut off.
+// A word whose byte length equals its rune count (only single-byte
+// characters) counts as one unit; any other word counts one unit per rune
+// and may be cut in the middle. The words slice itself is never modified.
+func TruncateWordsByRune(words []string, max int) (string, bool) {
 	count := 0
-	index, word := 0, ""
-	truncated := false
-	
-	for index, word = range words {
+	for index, word := range words {
+		if count >= max {
+			return strings.Join(words[:index], " "), true
+		}
 		runeCount := utf8.RuneCountInString(word)
 		if len(word) == runeCount {
-			count++;
+			count++
+		} else if count+runeCount < max {
+			count += runeCount
 		} else {
-			if count + runeCount <= max {
-				count += runeCount
-			} else {
-				offset := 0
-				for count < max {
-					_, width := utf8.DecodeRuneInString(word[offset:])
-			        offset += width
+			for ri := range word {
+				if count >= max {
+					// words[:index:index] caps the capacity so append copies
+					// instead of overwriting words[index] in the caller's slice.
+					truncatedWords := append(words[:index:index], word[:ri])
+					return strings.Join(truncatedWords, " "), true
+				} else {
 					count++
 				}
-				words[index] = word[:offset]
-				truncated = true
 			}
 		}
-		
-		if count >= max {
-			if index < len(words) - 1 {
-				truncated = true	
-			}
-			break
-		}
 	}
-	
-	index += 1
-	
-	if index < len(words) {
-		for counter, word := range words[index:] {
-			if len(word) != utf8.RuneCountInString(word) {
-				break
-			}
-			if strings.HasSuffix(word, ".") ||
-				strings.HasSuffix(word, "?") ||
-				strings.HasSuffix(word, ".\"") ||
-				strings.HasSuffix(word, "!") {
-				upper := index + counter + 1
-				return strings.Join(words[:upper], " "), (upper < len(words))
-			}
+
+	return strings.Join(words, " "), false
+}
+
+// TruncateWordsToWholeSentence takes content and an int
+// and returns entire sentences from content, delimited by the int
+// and whether it's truncated or not.
+func TruncateWordsToWholeSentence(words []string, max int) (string, bool) {
+	if max >= len(words) {
+		return strings.Join(words, " "), false
+	}
+
+	for counter, word := range words[max:] {
+		if strings.HasSuffix(word, ".") ||
+			strings.HasSuffix(word, "?") ||
+			strings.HasSuffix(word, ".\"") ||
+			strings.HasSuffix(word, "!") {
+			upper := max + counter + 1
+			return strings.Join(words[:upper], " "), (upper < len(words))
 		}
-	} else if index > len(words) {
-		return strings.Join(words, " "), truncated
 	}
-	
-	return strings.Join(words[:index], " "), truncated
+
+	return strings.Join(words[:max], " "), true
 }
 
 // GetAsciidocContent calls asciidoctor or asciidoc as an external helper
--- a/helpers/content_test.go
+++ b/helpers/content_test.go
@@ -1,10 +1,11 @@
 package helpers
 
 import (
-	"github.com/stretchr/testify/assert"
 	"html/template"
 	"strings"
 	"testing"
+
+	"github.com/stretchr/testify/assert"
 )
 
 const tstHTMLContent = "<!DOCTYPE html><html><head><script src=\"http://two/foobar.js\"></script></head><body><nav><ul><li hugo-nav=\"section_0\"></li><li hugo-nav=\"section_1\"></li></ul></nav><article>content <a href=\"http://two/foobar\">foobar</a>. Follow up</article><p>This is some text.<br>And some more.</p></body></html>"
@@ -54,8 +55,6 @@
 		{"a b c", "a b c", 12, false},
 		{"a b c", "a b c", 3, false},
 		{"a", "a", 1, false},
-		{"Hello 中国", "Hello 中", 2, true},
-		{"Hello 中国", "Hello 中国", 3, false},
 		{"This is a sentence.", "This is a sentence.", 5, false},
 		{"This is also a sentence!", "This is also a sentence!", 1, false},
 		{"To be. Or not to be. That's the question.", "To be.", 1, true},
@@ -63,6 +62,39 @@
 	}
 	for i, d := range data {
 		output, truncated := TruncateWordsToWholeSentence(strings.Fields(d.input), d.max)
+		if d.expected != output {
+			t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
+		}
+
+		if d.truncated != truncated {
+			t.Errorf("Test %d failed. Expected truncated=%t got %t", i, d.truncated, truncated)
+		}
+	}
+}
+
+func TestTruncateWordsByRune(t *testing.T) {
+	type test struct {
+		input, expected string
+		max             int
+		truncated       bool
+	}
+	data := []test{
+		{"", "", 1, false},
+		{"a b c", "a b c", 12, false},
+		{"a b c", "a b c", 3, false},
+		{"a", "a", 1, false},
+		{"Hello 中国", "", 0, true},
+		{"这是中文,全中文。", "这是中文,", 5, true},
+		{"Hello 中国", "Hello 中", 2, true},
+		{"Hello 中国", "Hello 中国", 3, false},
+		{"Hello中国 Good 好的", "Hello中国 Good 好", 9, true},
+		{"This is a sentence.", "This is", 2, true},
+		{"This is also a sentence!", "This", 1, true},
+		{"To be. Or not to be. That's the question.", "To be. Or not", 4, true},
+		{" \nThis is    not a sentence\n ", "This is not", 3, true},
+	}
+	for i, d := range data {
+		output, truncated := TruncateWordsByRune(strings.Fields(d.input), d.max)
 		if d.expected != output {
 			t.Errorf("Test %d failed. Expected %q got %q", i, d.expected, output)
 		}
--- a/hugolib/page.go
+++ b/hugolib/page.go
@@ -28,6 +28,7 @@
 	"net/url"
 	"path"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -42,6 +43,10 @@
 	"github.com/spf13/viper"
 )
 
+var (
+	cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
+)
+
 type Page struct {
 	Params          map[string]interface{}
 	Content         template.HTML
@@ -67,7 +72,6 @@
 	contentShortCodes   map[string]string
 	plain               string // TODO should be []byte
 	plainWords          []string
-	plainRuneCount      int
 	plainInit           sync.Once
 	plainSecondaryInit  sync.Once
 	renderingConfig     *helpers.Blackfriday
@@ -78,6 +82,7 @@
 	Node
 	pageMenus     PageMenus
 	pageMenusInit sync.Once
+	isCJKLanguage bool
 }
 
 type Source struct {
@@ -111,12 +116,6 @@
 	return p.plainWords
 }
 
-// RuneCount returns the rune count, excluding any whitespace, of the plain content.
-func (p *Page) RuneCount() int {
-	p.initPlainSecondary()
-	return p.plainRuneCount
-}
-
 func (p *Page) initPlain() {
 	p.plainInit.Do(func() {
 		p.plain = helpers.StripHTML(string(p.Content))
@@ -125,20 +124,6 @@
 	})
 }
 
-func (p *Page) initPlainSecondary() {
-	p.plainSecondaryInit.Do(func() {
-		p.initPlain()
-		runeCount := 0
-		for _, r := range p.plain {
-			if !helpers.IsWhitespace(r) {
-				runeCount++
-			}
-		}
-		p.plainRuneCount = runeCount
-		return
-	})
-}
-
 func (p *Page) IsNode() bool {
 	return false
 }
@@ -218,7 +203,13 @@
 	} else {
 		// If hugo defines split:
 		// render, strip html, then split
-		summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
+		var summary string
+		var truncated bool
+		if p.isCJKLanguage {
+			summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
+		} else {
+			summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
+		}
 		p.Summary = template.HTML(summary)
 		p.Truncated = truncated
 
@@ -363,18 +354,27 @@
 }
 
 func (p *Page) analyzePage() {
-	p.WordCount = 0
-	for _, word := range p.PlainWords() {
-		runeCount := utf8.RuneCountInString(word)
-		if len(word) == runeCount {
-			p.WordCount++	
-		} else {
-			p.WordCount += runeCount
+	if p.isCJKLanguage {
+		p.WordCount = 0
+		for _, word := range p.PlainWords() {
+			runeCount := utf8.RuneCountInString(word)
+			if len(word) == runeCount {
+				p.WordCount++
+			} else {
+				p.WordCount += runeCount
+			}
 		}
+	} else {
+		p.WordCount = len(p.PlainWords())
 	}
-	
+
 	p.FuzzyWordCount = int((p.WordCount+100)/100) * 100
-	p.ReadingTime = int((p.WordCount + 212) / 213)
+
+	if p.isCJKLanguage {
+		p.ReadingTime = int((p.WordCount + 500) / 501)
+	} else {
+		p.ReadingTime = int((p.WordCount + 212) / 213)
+	}
 }
 
 func (p *Page) permalink() (*url.URL, error) {
@@ -481,7 +481,7 @@
 	}
 	m := f.(map[string]interface{})
 	var err error
-	var draft, published *bool
+	var draft, published, isCJKLanguage *bool
 	for k, v := range m {
 		loki := strings.ToLower(k)
 		switch loki {
@@ -542,6 +542,9 @@
 			p.Status = cast.ToString(v)
 		case "sitemap":
 			p.Sitemap = parseSitemap(cast.ToStringMap(v))
+		case "iscjklanguage":
+			isCJKLanguage = new(bool)
+			*isCJKLanguage = cast.ToBool(v)
 		default:
 			// If not one of the explicit values, store in Params
 			switch vv := v.(type) {
@@ -596,6 +599,16 @@
 		p.Lastmod = p.Date
 	}
 
+	if isCJKLanguage != nil {
+		p.isCJKLanguage = *isCJKLanguage
+	} else if viper.GetBool("HasCJKLanguage") {
+		if cjk.Match(p.rawContent) {
+			p.isCJKLanguage = true
+		} else {
+			p.isCJKLanguage = false
+		}
+	}
+
 	return nil
 
 }
@@ -766,6 +779,8 @@
 
 	p.renderable = psr.IsRenderable()
 	p.frontmatter = psr.FrontMatter()
+	p.rawContent = psr.Content()
+
 	meta, err := psr.Metadata()
 	if meta != nil {
 		if err != nil {
@@ -777,8 +792,6 @@
 			return err
 		}
 	}
-
-	p.rawContent = psr.Content()
 
 	return nil
 }
--- a/hugolib/page_test.go
+++ b/hugolib/page_test.go
@@ -146,16 +146,67 @@
 Some more text
 `
 
-	SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES = `---
+	SIMPLE_PAGE_WITH_ALL_CJK_RUNES = `---
 title: Simple
 ---
 
 
 € € € € €
+你好
+도형이
+カテゴリー
 
 
 `
 
+	SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES = `---
+title: Simple
+---
+
+
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+In Chinese, 好 means good.  In Chinese, 好 means good.
+More then 70 words.
+
+
+`
+	SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY = "In Chinese, 好 means good. In Chinese, 好 means good. " +
+		"In Chinese, 好 means good. In Chinese, 好 means good. " +
+		"In Chinese, 好 means good. In Chinese, 好 means good. " +
+		"In Chinese, 好 means good. In Chinese, 好 means good. " +
+		"In Chinese, 好 means good. In Chinese, 好 means good. " +
+		"In Chinese, 好 means good. In Chinese, 好 means good. " +
+		"In Chinese, 好 means good. In Chinese, 好 means good."
+
+	SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE = `---
+title: Simple
+isCJKLanguage: false
+---
+
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good.  In Chinese, 好的呀呀 means good enough.
+More then 70 words.
+
+
+`
+	SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY = "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+		"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+		"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+		"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+		"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+		"In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+		"In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough."
+
 	SIMPLE_PAGE_WITH_LONG_CONTENT = `---
 title: Simple
 ---
@@ -584,9 +635,11 @@
 	checkPageDate(t, p, d)
 }
 
-func TestRuneCount(t *testing.T) {
+func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
+	viper.Reset()
+
 	p, _ := NewPage("simple.md")
-	_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES))
+	_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
 	p.Convert()
 	p.analyzePage()
 	if err != nil {
@@ -593,9 +646,75 @@
 		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
 	}
 
-	if p.RuneCount() != 5 {
-		t.Fatalf("incorrect rune count for content '%s'. expected %v, got %v", p.plain, 5, p.RuneCount())
+	if p.WordCount != 8 {
+		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 8, p.WordCount)
+	}
+}
 
+func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
+	viper.Reset()
+	defer viper.Reset()
+
+	viper.Set("HasCJKLanguage", true)
+
+	p, _ := NewPage("simple.md")
+	_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
+	p.Convert()
+	p.analyzePage()
+	if err != nil {
+		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+	}
+
+	if p.WordCount != 15 {
+		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 15, p.WordCount)
+	}
+}
+
+func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
+	viper.Reset()
+	defer viper.Reset()
+
+	viper.Set("HasCJKLanguage", true)
+
+	p, _ := NewPage("simple.md")
+	_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES))
+	p.Convert()
+	p.analyzePage()
+	if err != nil {
+		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+	}
+
+	if p.WordCount != 74 {
+		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 74, p.WordCount)
+	}
+
+	if p.Summary != SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY {
+		t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
+			SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY, p.Summary)
+	}
+}
+
+func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
+	viper.Reset()
+	defer viper.Reset()
+
+	viper.Set("HasCJKLanguage", true)
+
+	p, _ := NewPage("simple.md")
+	_, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE))
+	p.Convert()
+	p.analyzePage()
+	if err != nil {
+		t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+	}
+
+	if p.WordCount != 75 {
+		t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 75, p.WordCount)
+	}
+
+	if p.Summary != SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY {
+		t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
+			SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY, p.Summary)
 	}
 }