shithub: hugo

Download patch

ref: e50a8c7a142487d88fe0780c24873c1b95a2283c
parent: 7e76a6fd3bc78363ed31d712c63e6b17734797d7
author: Bjørn Erik Pedersen <[email protected]>
date: Wed Dec 27 14:31:42 EST 2017

resource: Use MD5 to identify image files

But only a set of byte chunks spread around in the image file to calculate the fingerprint, which is much faster than reading the whole file:

```bash
BenchmarkMD5FromFileFast/full=false-4         	  300000	      4356 ns/op	     240 B/op	       5 allocs/op
BenchmarkMD5FromFileFast/full=true-4          	   30000	     42899 ns/op	   32944 B/op	       5 allocs/op
```

Fixes #4186

--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -240,8 +240,8 @@
     ".",
     "mem"
   ]
-  revision = "8d919cbe7e2627e417f3e45c3c0e489a5b7e2536"
-  version = "v1.0.0"
+  revision = "ec3a3111d1e1bdff38a61e09d5a5f5e974905611"
+  version = "v1.0.1"
 
 [[projects]]
   name = "github.com/spf13/cast"
@@ -369,6 +369,6 @@
 [solve-meta]
   analyzer-name = "dep"
   analyzer-version = 1
-  inputs-digest = "2d9c34c260bc26814a0635c93009daeb9d8ffa56c29c0cff6827ae2d3e9ef96d"
+  inputs-digest = "7259b4caf8e75db0b809f06d4897dc870261252e3aecd68ea1348c87a5da9d50"
   solver-name = "gps-cdcl"
   solver-version = 1
--- a/Gopkg.toml
+++ b/Gopkg.toml
@@ -82,7 +82,7 @@
 
 [[constraint]]
   name = "github.com/spf13/afero"
-  version = "1.0.0"
+  version = "1.0.1"
 
 [[constraint]]
   name = "github.com/spf13/cast"
--- a/helpers/general.go
+++ b/helpers/general.go
@@ -26,6 +26,8 @@
 	"unicode"
 	"unicode/utf8"
 
+	"github.com/spf13/afero"
+
 	"github.com/jdkato/prose/transform"
 
 	bp "github.com/gohugoio/hugo/bufferpool"
@@ -370,6 +372,57 @@
 	h := md5.New()
 	h.Write([]byte(f))
 	return hex.EncodeToString(h.Sum([]byte{}))
+}
+
+// MD5FromFileFast creates a MD5 hash from the given file. It only reads parts of
+// the file for speed, so don't use it if the files are very subtly different.
+// It will not close the file.
+func MD5FromFileFast(f afero.File) (string, error) {
+	const (
+		// Do not change once set in stone!
+		maxChunks = 8
+		peekSize  = 64
+		seek      = 2048
+	)
+
+	h := md5.New()
+	buff := make([]byte, peekSize)
+
+	for i := 0; i < maxChunks; i++ {
+		if i > 0 {
+			_, err := f.Seek(seek, 0)
+			if err != nil {
+				if err == io.EOF {
+					break
+				}
+				return "", err
+			}
+		}
+
+		_, err := io.ReadAtLeast(f, buff, peekSize)
+		if err != nil {
+			if err == io.EOF || err == io.ErrUnexpectedEOF {
+				h.Write(buff)
+				break
+			}
+			return "", err
+		}
+		h.Write(buff)
+	}
+
+	h.Write(buff)
+
+	return hex.EncodeToString(h.Sum(nil)), nil
+}
+
+// MD5FromFile creates a MD5 hash from the given file.
+// It will not close the file.
+func MD5FromFile(f afero.File) (string, error) {
+	h := md5.New()
+	if _, err := io.Copy(h, f); err != nil {
+		return "", nil
+	}
+	return hex.EncodeToString(h.Sum(nil)), nil
 }
 
 // IsWhitespace determines if the given rune is whitespace.
--- a/helpers/general_test.go
+++ b/helpers/general_test.go
@@ -14,10 +14,12 @@
 package helpers
 
 import (
+	"fmt"
 	"reflect"
 	"strings"
 	"testing"
 
+	"github.com/spf13/afero"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -269,4 +271,92 @@
 			t.Errorf("[%d] Expected\n%#v, got\n%#v\n", i, test.expected, test.input)
 		}
 	}
+}
+
+func TestFastMD5FromFile(t *testing.T) {
+	fs := afero.NewMemMapFs()
+
+	if err := afero.WriteFile(fs, "small.txt", []byte("abc"), 0777); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := afero.WriteFile(fs, "small2.txt", []byte("abd"), 0777); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := afero.WriteFile(fs, "bigger.txt", []byte(strings.Repeat("a bc d e", 100)), 0777); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := afero.WriteFile(fs, "bigger2.txt", []byte(strings.Repeat("c d e f g", 100)), 0777); err != nil {
+		t.Fatal(err)
+	}
+
+	req := require.New(t)
+
+	sf1, err := fs.Open("small.txt")
+	req.NoError(err)
+	sf2, err := fs.Open("small2.txt")
+	req.NoError(err)
+
+	bf1, err := fs.Open("bigger.txt")
+	req.NoError(err)
+	bf2, err := fs.Open("bigger2.txt")
+	req.NoError(err)
+
+	defer sf1.Close()
+	defer sf2.Close()
+	defer bf1.Close()
+	defer bf2.Close()
+
+	m1, err := MD5FromFileFast(sf1)
+	req.NoError(err)
+	req.Equal("308d8a1127b46524b51507424071c22c", m1)
+
+	m2, err := MD5FromFileFast(sf2)
+	req.NoError(err)
+	req.NotEqual(m1, m2)
+
+	m3, err := MD5FromFileFast(bf1)
+	req.NoError(err)
+	req.NotEqual(m2, m3)
+
+	m4, err := MD5FromFileFast(bf2)
+	req.NoError(err)
+	req.NotEqual(m3, m4)
+
+	m5, err := MD5FromFile(bf2)
+	req.NoError(err)
+	req.NotEqual(m4, m5)
+}
+
+func BenchmarkMD5FromFileFast(b *testing.B) {
+	fs := afero.NewMemMapFs()
+
+	for _, full := range []bool{false, true} {
+		b.Run(fmt.Sprintf("full=%t", full), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				b.StopTimer()
+				if err := afero.WriteFile(fs, "file.txt", []byte(strings.Repeat("1234567890", 2000)), 0777); err != nil {
+					b.Fatal(err)
+				}
+				f, err := fs.Open("file.txt")
+				if err != nil {
+					b.Fatal(err)
+				}
+				b.StartTimer()
+				if full {
+					if _, err := MD5FromFile(f); err != nil {
+						b.Fatal(err)
+					}
+				} else {
+					if _, err := MD5FromFileFast(f); err != nil {
+						b.Fatal(err)
+					}
+				}
+				f.Close()
+			}
+		})
+	}
+
 }
--- a/resource/image.go
+++ b/resource/image.go
@@ -112,6 +112,8 @@
 
 	imaging *Imaging
 
+	hash string
+
 	*genericResource
 }
 
@@ -129,6 +131,7 @@
 func (i *Image) WithNewBase(base string) Resource {
 	return &Image{
 		imaging:         i.imaging,
+		hash:            i.hash,
 		genericResource: i.genericResource.WithNewBase(base).(*genericResource)}
 }
 
@@ -490,6 +493,7 @@
 
 	return &Image{
 		imaging:         i.imaging,
+		hash:            i.hash,
 		genericResource: &g}
 }
 
@@ -497,20 +501,11 @@
 	i.rel = i.filenameFromConfig(conf)
 }
 
-// We need to set this to something static during tests.
-var fiModTimeFunc = func(fi os.FileInfo) int64 {
-	return fi.ModTime().Unix()
-}
-
 func (i *Image) filenameFromConfig(conf imageConfig) string {
 	p1, p2 := helpers.FileAndExt(i.rel)
-	sizeModeStr := fmt.Sprintf("_S%d_T%d", i.osFileInfo.Size(), fiModTimeFunc(i.osFileInfo))
-	// On scaling an already scaled image, we get the file info from the original.
-	// Repeating the same info in the filename makes it stuttery for no good reason.
-	if strings.Contains(p1, sizeModeStr) {
-		sizeModeStr = ""
-	}
+	idStr := fmt.Sprintf("_H%s_%d", i.hash, i.osFileInfo.Size())
 
+	// Do not change for no good reason.
 	const md5Threshold = 100
 
 	key := conf.key()
@@ -518,12 +513,16 @@
 	// It is useful to have the key in clear text, but when nesting transforms, it
 	// can easily be too long to read, and maybe even too long
 	// for the different OSes to handle.
-	if len(p1)+len(sizeModeStr)+len(p2) > md5Threshold {
+	if len(p1)+len(idStr)+len(p2) > md5Threshold {
 		key = helpers.MD5String(p1 + key + p2)
-		p1 = p1[:strings.Index(p1, "_S")]
+		p1 = p1[:strings.Index(p1, "_H")]
+	} else if strings.Contains(p1, idStr) {
+		// On scaling an already scaled image, we get the file info from the original.
+		// Repeating the same info in the filename makes it stuttery for no good reason.
+		idStr = ""
 	}
 
-	return fmt.Sprintf("%s%s_%s%s", p1, sizeModeStr, key, p2)
+	return fmt.Sprintf("%s%s_%s%s", p1, idStr, key, p2)
 }
 
 func decodeImaging(m map[string]interface{}) (Imaging, error) {
--- a/resource/image_test.go
+++ b/resource/image_test.go
@@ -15,7 +15,6 @@
 
 import (
 	"fmt"
-	"os"
 	"testing"
 
 	"github.com/stretchr/testify/require"
@@ -52,9 +51,6 @@
 }
 
 func TestImageTransform(t *testing.T) {
-	fiModTimeFunc = func(fi os.FileInfo) int64 {
-		return int64(10111213)
-	}
 
 	assert := require.New(t)
 
@@ -86,13 +82,13 @@
 	assert.Equal(200, resizedAndRotated.Height())
 	assertFileCache(assert, image.spec.Fs, resizedAndRotated.RelPermalink(), 125, 200)
 
-	assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center.jpg", resized.RelPermalink())
+	assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_300x200_resize_q75_box_center.jpg", resized.RelPermalink())
 	assert.Equal(300, resized.Width())
 	assert.Equal(200, resized.Height())
 
 	fitted, err := resized.Fit("50x50")
 	assert.NoError(err)
-	assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center_50x50_fit_q75_box_center.jpg", fitted.RelPermalink())
+	assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9b37eba4e4e6ea0cc56a59bb5aa98143.jpg", fitted.RelPermalink())
 	assert.Equal(50, fitted.Width())
 	assert.Equal(31, fitted.Height())
 
@@ -100,13 +96,13 @@
 	fittedAgain, _ := fitted.Fit("10x20")
 	fittedAgain, err = fittedAgain.Fit("10x20")
 	assert.NoError(err)
-	assert.Equal("/a/sunset_f1fb715a17c42d5d4602a1870424d590.jpg", fittedAgain.RelPermalink())
+	assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9a8be1402216c385e0dfd73e267c6827.jpg", fittedAgain.RelPermalink())
 	assert.Equal(10, fittedAgain.Width())
 	assert.Equal(6, fittedAgain.Height())
 
 	filled, err := image.Fill("200x100 bottomLeft")
 	assert.NoError(err)
-	assert.Equal("/a/sunset_S90587_T10111213_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink())
+	assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink())
 	assert.Equal(200, filled.Width())
 	assert.Equal(100, filled.Height())
 	assertFileCache(assert, image.spec.Fs, filled.RelPermalink(), 200, 100)
--- a/resource/resource.go
+++ b/resource/resource.go
@@ -153,7 +153,19 @@
 	gr := r.newGenericResource(linker, fi, absPublishDir, absSourceFilename, filepath.ToSlash(relTargetFilename), mimeType)
 
 	if mimeType == "image" {
+		f, err := r.Fs.Source.Open(absSourceFilename)
+		if err != nil {
+			return nil, err
+		}
+		defer f.Close()
+
+		hash, err := helpers.MD5FromFileFast(f)
+		if err != nil {
+			return nil, err
+		}
+
 		return &Image{
+			hash:            hash,
 			imaging:         r.imaging,
 			genericResource: gr}, nil
 	}