ref: bee6a69846f476013e78c32d55f6c508ed8f1a99
parent: be6482603ee8ac3516ae19ceebb35662f038b25e
author: bep <[email protected]>
date: Sun May 3 15:54:17 EDT 2015
canonifyurls in srcset Speed is about the same as before, uses slightly less memory: ``` benchmark old ns/op new ns/op delta BenchmarkAbsURL 17302 17713 +2.38% BenchmarkXMLAbsURL 9463 9470 +0.07% benchmark old allocs new allocs delta BenchmarkAbsURL 28 24 -14.29% BenchmarkXMLAbsURL 14 12 -14.29% benchmark old bytes new bytes delta BenchmarkAbsURL 3422 3144 -8.12% BenchmarkXMLAbsURL 1985 1864 -6.10% ``` Fixes #1059
--- a/transform/absurlreplacer.go
+++ b/transform/absurlreplacer.go
@@ -29,119 +29,207 @@
start int // item start position
width int // width of last element
- matchers []absURLMatcher
- state stateFunc
- prefixLookup *prefixes
+ matchers []absURLMatcher
+ state stateFunc
+ ms matchState
+ matches [3]bool // track matches of the 3 prefixes
+ i int // last index in matches checked
+
w io.Writer
}
type stateFunc func(*contentlexer) stateFunc
-type prefixRunes []rune
+type prefix struct {
+ r []rune
+ f func(l *contentlexer)
+}
-type prefixes struct {
- pr []prefixRunes
- curr prefixRunes // current prefix lookup table
- i int // current index
+var prefixes = []*prefix{
+ &prefix{r: []rune{'s', 'r', 'c', '='}, f: checkCandidateSrc},
+ &prefix{r: []rune{'s', 'r', 'c', 's', 'e', 't', '='}, f: checkCandidateSrcset},
+ &prefix{r: []rune{'h', 'r', 'e', 'f', '='}, f: checkCandidateHref}}
- // first rune in potential match
- first rune
-
- // match-state:
- // none, whitespace, partial, full
- ms matchState
+type absURLMatcher struct {
+ prefix int
+ match []byte
+ quote []byte
+ replacementURL []byte
}
-// match returns partial and full match for the prefix in play
-// - it's a full match if all prefix runes has checked out in row
-// - it's a partial match if it's on its way towards a full match
func (l *contentlexer) match(r rune) {
- p := l.prefixLookup
- if p.curr == nil {
- // assumes prefixes all start off on a different rune
- // works in this special case: href, src
- p.i = 0
- for _, pr := range p.pr {
- if pr[p.i] == r {
- fullMatch := len(p.pr) == 1
- p.first = r
- if !fullMatch {
- p.curr = pr
- l.prefixLookup.ms = matchStatePartial
- } else {
- l.prefixLookup.ms = matchStateFull
+
+ var found bool
+
+ // Note: several prefixes can share a common start,
+ // e.g. src and srcset.
+ if l.ms == matchStateWhitespace {
+ l.i = 0
+ for j, p := range prefixes {
+ if r == p.r[l.i] {
+ l.matches[j] = true
+ found = true
+ if l.checkMatchState(r, j) {
+ return
}
- return
+ } else {
+ l.matches[j] = false
}
}
- } else {
- p.i++
- if p.curr[p.i] == r {
- fullMatch := len(p.curr) == p.i+1
- if fullMatch {
- p.curr = nil
- l.prefixLookup.ms = matchStateFull
+
+ if !found {
+ l.ms = matchStateNone
+ }
+
+ return
+ }
+
+ l.i++
+ for j, m := range l.matches {
+ // still a match?
+ if m {
+ if prefixes[j].r[l.i] == r {
+ found = true
+ if l.checkMatchState(r, j) {
+ return
+ }
} else {
- l.prefixLookup.ms = matchStatePartial
+ l.matches[j] = false
}
- return
}
+ }
- p.curr = nil
+ if found {
+ return
}
- l.prefixLookup.ms = matchStateNone
+ l.ms = matchStateNone
}
+func (l *contentlexer) checkMatchState(r rune, idx int) bool {
+ if r == '=' {
+ l.ms = matchStateFull
+ for k := range l.matches {
+ if k != idx {
+ l.matches[k] = false
+ }
+ }
+ return true
+ }
+
+ l.ms = matchStatePartial
+
+ return false
+}
+
func (l *contentlexer) emit() {
l.w.Write(l.content[l.start:l.pos])
l.start = l.pos
}
-var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
+func (a absURLMatcher) isSourceType() bool {
+ return a.prefix == matchPrefixSrc
+}
-type absURLMatcher struct {
- prefix int
- match []byte
- replacement []byte
+func checkCandidateSrc(l *contentlexer) {
+ for _, m := range l.matchers {
+ if !m.isSourceType() {
+ continue
+ }
+ l.replaceSimple(m)
+ }
}
-func (a absURLMatcher) isSourceType() bool {
- return a.prefix == matchPrefixSrc
+func checkCandidateHref(l *contentlexer) {
+ for _, m := range l.matchers {
+ if m.isSourceType() {
+ continue
+ }
+ l.replaceSimple(m)
+ }
}
-func checkCandidate(l *contentlexer) {
- isSource := l.prefixLookup.first == 's'
+func checkCandidateSrcset(l *contentlexer) {
+ // special case, not frequent (I think)
for _, m := range l.matchers {
+ if m.isSourceType() {
+ continue
+ }
- if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
+ if !bytes.HasPrefix(l.content[l.pos:], m.match) {
continue
}
- if bytes.HasPrefix(l.content[l.pos:], m.match) {
- // check for schemaless URLs
- posAfter := l.pos + len(m.match)
- if posAfter >= len(l.content) {
- return
- }
- r, _ := utf8.DecodeRune(l.content[posAfter:])
- if r == '/' {
- // schemaless: skip
- return
- }
- if l.pos > l.start {
- l.emit()
- }
- l.pos += len(m.match)
- l.w.Write(m.replacement)
- l.start = l.pos
+ // check for schemaless URLs
+ posAfter := l.pos + len(m.match)
+ if posAfter >= len(l.content) {
return
+ }
+ r, _ := utf8.DecodeRune(l.content[posAfter:])
+ if r == '/' {
+ // schemaless: skip
+ continue
+ }
+ posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
+
+ // safeguard: bail out if no closing quote is found nearby
+ if posLastQuote < 0 || posLastQuote > 2000 {
+ return
}
+
+ if l.pos > l.start {
+ l.emit()
+ }
+
+ section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
+
+ fields := bytes.Fields(section)
+ l.w.Write([]byte(m.quote))
+ for i, f := range fields {
+ if f[0] == '/' {
+ l.w.Write(m.replacementURL)
+ l.w.Write(f[1:])
+
+ } else {
+ l.w.Write(f)
+ }
+
+ if i < len(fields)-1 {
+ l.w.Write([]byte(" "))
+ }
+ }
+
+ l.w.Write(m.quote)
+ l.pos += len(section) + (len(m.quote) * 2)
+ l.start = l.pos
}
}
+func (l *contentlexer) replaceSimple(m absURLMatcher) {
+ if !bytes.HasPrefix(l.content[l.pos:], m.match) {
+ return
+ }
+ // check for schemaless URLs
+ posAfter := l.pos + len(m.match)
+ if posAfter >= len(l.content) {
+ return
+ }
+ r, _ := utf8.DecodeRune(l.content[posAfter:])
+ if r == '/' {
+ // schemaless: skip
+ return
+ }
+ if l.pos > l.start {
+ l.emit()
+ }
+ l.pos += len(m.match)
+ l.w.Write(m.quote)
+ l.w.Write(m.replacementURL)
+ l.start = l.pos
+}
+
func (l *contentlexer) replace() {
contentLength := len(l.content)
var r rune
@@ -152,7 +240,7 @@
break
}
- var width int = 1
+ var width = 1
r = rune(l.content[l.pos])
if r >= utf8.RuneSelf {
r, width = utf8.DecodeRune(l.content[l.pos:])
@@ -160,14 +248,24 @@
l.width = width
l.pos += l.width
if r == ' ' {
- l.prefixLookup.ms = matchStateWhitespace
- } else if l.prefixLookup.ms != matchStateNone {
+ l.ms = matchStateWhitespace
+ } else if l.ms != matchStateNone {
l.match(r)
- if l.prefixLookup.ms == matchStateFull {
- checkCandidate(l)
+ if l.ms == matchStateFull {
+ var p *prefix
+ for i, m := range l.matches {
+ if m {
+ p = prefixes[i]
+ }
+ l.matches[i] = false
+ }
+ if p == nil {
+ panic("illegal state: curr is nil when state is full")
+ }
+ l.ms = matchStateNone
+ p.f(l)
}
}
-
}
// Done!
@@ -177,15 +275,12 @@
}
func doReplace(ct contentTransformer, matchers []absURLMatcher) {
-
lexer := &contentlexer{
- content: ct.Content(),
- w: ct,
- prefixLookup: &prefixes{pr: mainPrefixRunes},
- matchers: matchers}
+ content: ct.Content(),
+ w: ct,
+ matchers: matchers}
lexer.replace()
-
}
type absURLReplacer struct {
@@ -195,7 +290,7 @@
func newAbsURLReplacer(baseURL string) *absURLReplacer {
u, _ := url.Parse(baseURL)
- base := strings.TrimRight(u.String(), "/")
+ base := []byte(strings.TrimRight(u.String(), "/") + "/")
// HTML
dqHTMLMatch := []byte("\"/")
@@ -205,23 +300,23 @@
dqXMLMatch := []byte(""/")
sqXMLMatch := []byte("'/")
- dqHTML := []byte("\"" + base + "/")
- sqHTML := []byte("'" + base + "/")
+ dqHTML := []byte("\"")
+ sqHTML := []byte("'")
- dqXML := []byte(""" + base + "/")
- sqXML := []byte("'" + base + "/")
+ dqXML := []byte(""")
+ sqXML := []byte("'")
return &absURLReplacer{
htmlMatchers: []absURLMatcher{
- {matchPrefixSrc, dqHTMLMatch, dqHTML},
- {matchPrefixSrc, sqHTMLMatch, sqHTML},
- {matchPrefixHref, dqHTMLMatch, dqHTML},
- {matchPrefixHref, sqHTMLMatch, sqHTML}},
+ {matchPrefixSrc, dqHTMLMatch, dqHTML, base},
+ {matchPrefixSrc, sqHTMLMatch, sqHTML, base},
+ {matchPrefixHref, dqHTMLMatch, dqHTML, base},
+ {matchPrefixHref, sqHTMLMatch, sqHTML, base}},
xmlMatchers: []absURLMatcher{
- {matchPrefixSrc, dqXMLMatch, dqXML},
- {matchPrefixSrc, sqXMLMatch, sqXML},
- {matchPrefixHref, dqXMLMatch, dqXML},
- {matchPrefixHref, sqXMLMatch, sqXML},
+ {matchPrefixSrc, dqXMLMatch, dqXML, base},
+ {matchPrefixSrc, sqXMLMatch, sqXML, base},
+ {matchPrefixHref, dqXMLMatch, dqXML, base},
+ {matchPrefixHref, sqXMLMatch, sqXML, base},
}}
}
--- a/transform/chain_test.go
+++ b/transform/chain_test.go
@@ -25,9 +25,43 @@
// Issue: 816, schemaless links combined with others
const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal' <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML = `Pre. src="//schemaless" src="/normal" <a href='//schemaless'>Schemaless</a>. <a href='/normal'>normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src="//schemaless" src="http://base/normal" <a href='//schemaless'>Schemaless</a>. <a href='http://base/normal'>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML = `Pre. src='//schemaless' src='/normal' <a href='//schemaless'>Schemaless</a>. <a href='/normal'>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href='//schemaless'>Schemaless</a>. <a href='http://base/normal'>normal</a>. Post.`
+// srcset=
+const SRCSET_BASIC = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_BASIC_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_SINGLE_QUOTE = `Pre. <img srcset='/img/small.jpg 200w /img/big.jpg 700w' alt="text" src="/img/foo.jpg"> POST.`
+const SRCSET_SINGLE_QUOTE_CORRECT = `Pre. <img srcset='http://base/img/small.jpg 200w http://base/img/big.jpg 700w' alt="text" src="http://base/img/foo.jpg"> POST.`
+const SRCSET_XML_BASIC = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_XML_BASIC_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_XML_SINGLE_QUOTE = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_XML_SINGLE_QUOTE_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_VARIATIONS = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='/img/foo.jpg'> FOO.
+<img srcset='/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_VARIATIONS_CORRECT = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='http://base/img/foo.jpg'> FOO.
+<img srcset='http://base/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_XML_VARIATIONS = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='/img/foo.jpg'> FOO.
+<img srcset='/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_XML_VARIATIONS_CORRECT = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='http://base/img/foo.jpg'> FOO.
+<img srcset='http://base/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+
var abs_url_bench_tests = []test{
{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
{H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
@@ -46,6 +80,12 @@
var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}
var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)
+var srcset_tests = []test{{SRCSET_BASIC, SRCSET_BASIC_CORRECT}, {SRCSET_SINGLE_QUOTE, SRCSET_SINGLE_QUOTE_CORRECT}, {SRCSET_VARIATIONS, SRCSET_VARIATIONS_CORRECT}}
+var srcset_xml_tests = []test{
+ {SRCSET_XML_BASIC, SRCSET_XML_BASIC_CORRECT},
+ {SRCSET_XML_SINGLE_QUOTE, SRCSET_XML_SINGLE_QUOTE_CORRECT},
+ {SRCSET_XML_VARIATIONS, SRCSET_XML_VARIATIONS_CORRECT}}
+
func TestChainZeroTransformers(t *testing.T) {
tr := NewChain()
in := new(bytes.Buffer)
@@ -99,6 +139,21 @@
tr := NewChain(absURL...)
apply(t.Errorf, tr, abs_url_tests)
+
+}
+
+func TestAbsURLSrcSet(t *testing.T) {
+ absURL, _ := absURLFromURL("http://base")
+ tr := NewChain(absURL...)
+
+ apply(t.Errorf, tr, srcset_tests)
+}
+
+func TestAbsXMLURLSrcSet(t *testing.T) {
+ absURLInXML, _ := absURLInXMLFromURL("http://base")
+ tr := NewChain(absURLInXML...)
+
+ apply(t.Errorf, tr, srcset_xml_tests)
}