New Upstream Release - golang-github-pierrec-xxhash

Ready changes

Summary

Merged new upstream version: 0.1.5 (was: 0.1.1).

Resulting package

Built on 2022-03-14T02:10 (took 2m26s)

The resulting binary package can be installed (if you have the apt repository enabled) by running:

apt install -t fresh-releases golang-github-pierrec-xxhash-dev

Lintian Result

Diff

diff --git a/debian/changelog b/debian/changelog
index 8d6e5d6..f473aef 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+golang-github-pierrec-xxhash (0.1.5-1) UNRELEASED; urgency=low
+
+  * New upstream release.
+
+ -- Debian Janitor <janitor@jelmer.uk>  Mon, 14 Mar 2022 02:07:52 -0000
+
 golang-github-pierrec-xxhash (0.1.1-4) unstable; urgency=medium
 
   * Vcs-* urls: pkg-go-team -> go-team.
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..9e691af
--- /dev/null
+++ b/go.mod
@@ -0,0 +1 @@
+module github.com/pierrec/xxHash
diff --git a/xxHash32/xxHash32.go b/xxHash32/xxHash32.go
index 411504e..ff58256 100644
--- a/xxHash32/xxHash32.go
+++ b/xxHash32/xxHash32.go
@@ -79,36 +79,24 @@ func (xxh *xxHash) Write(input []byte) (int, error) {
 		xxh.bufused += len(input) - r
 
 		// fast rotl(13)
-		p32 := xxh.v1 + (uint32(xxh.buf[p+3])<<24|uint32(xxh.buf[p+2])<<16|uint32(xxh.buf[p+1])<<8|uint32(xxh.buf[p]))*prime32_2
-		xxh.v1 = (p32<<13 | p32>>19) * prime32_1
-		p += 4
-		p32 = xxh.v2 + (uint32(xxh.buf[p+3])<<24|uint32(xxh.buf[p+2])<<16|uint32(xxh.buf[p+1])<<8|uint32(xxh.buf[p]))*prime32_2
-		xxh.v2 = (p32<<13 | p32>>19) * prime32_1
-		p += 4
-		p32 = xxh.v3 + (uint32(xxh.buf[p+3])<<24|uint32(xxh.buf[p+2])<<16|uint32(xxh.buf[p+1])<<8|uint32(xxh.buf[p]))*prime32_2
-		xxh.v3 = (p32<<13 | p32>>19) * prime32_1
-		p += 4
-		p32 = xxh.v4 + (uint32(xxh.buf[p+3])<<24|uint32(xxh.buf[p+2])<<16|uint32(xxh.buf[p+1])<<8|uint32(xxh.buf[p]))*prime32_2
-		xxh.v4 = (p32<<13 | p32>>19) * prime32_1
-
+		xxh.v1 = rol13(xxh.v1+u32(xxh.buf[:])*prime32_2) * prime32_1
+		xxh.v2 = rol13(xxh.v2+u32(xxh.buf[4:])*prime32_2) * prime32_1
+		xxh.v3 = rol13(xxh.v3+u32(xxh.buf[8:])*prime32_2) * prime32_1
+		xxh.v4 = rol13(xxh.v4+u32(xxh.buf[12:])*prime32_2) * prime32_1
 		p = r
 		xxh.bufused = 0
 	}
 
-	for n := n - 16; p <= n; {
-		p32 := xxh.v1 + (uint32(input[p+3])<<24|uint32(input[p+2])<<16|uint32(input[p+1])<<8|uint32(input[p]))*prime32_2
-		xxh.v1 = (p32<<13 | p32>>19) * prime32_1
-		p += 4
-		p32 = xxh.v2 + (uint32(input[p+3])<<24|uint32(input[p+2])<<16|uint32(input[p+1])<<8|uint32(input[p]))*prime32_2
-		xxh.v2 = (p32<<13 | p32>>19) * prime32_1
-		p += 4
-		p32 = xxh.v3 + (uint32(input[p+3])<<24|uint32(input[p+2])<<16|uint32(input[p+1])<<8|uint32(input[p]))*prime32_2
-		xxh.v3 = (p32<<13 | p32>>19) * prime32_1
-		p += 4
-		p32 = xxh.v4 + (uint32(input[p+3])<<24|uint32(input[p+2])<<16|uint32(input[p+1])<<8|uint32(input[p]))*prime32_2
-		xxh.v4 = (p32<<13 | p32>>19) * prime32_1
-		p += 4
+	// Causes compiler to work directly from registers instead of stack:
+	v1, v2, v3, v4 := xxh.v1, xxh.v2, xxh.v3, xxh.v4
+	for n := n - 16; p <= n; p += 16 {
+		sub := input[p:][:16] //BCE hint for compiler
+		v1 = rol13(v1+u32(sub[:])*prime32_2) * prime32_1
+		v2 = rol13(v2+u32(sub[4:])*prime32_2) * prime32_1
+		v3 = rol13(v3+u32(sub[8:])*prime32_2) * prime32_1
+		v4 = rol13(v4+u32(sub[12:])*prime32_2) * prime32_1
 	}
+	xxh.v1, xxh.v2, xxh.v3, xxh.v4 = v1, v2, v3, v4
 
 	copy(xxh.buf[xxh.bufused:], input[p:])
 	xxh.bufused += len(input) - p
@@ -120,10 +108,7 @@ func (xxh *xxHash) Write(input []byte) (int, error) {
 func (xxh *xxHash) Sum32() uint32 {
 	h32 := uint32(xxh.totalLen)
 	if xxh.totalLen >= 16 {
-		h32 += ((xxh.v1 << 1) | (xxh.v1 >> 31)) +
-			((xxh.v2 << 7) | (xxh.v2 >> 25)) +
-			((xxh.v3 << 12) | (xxh.v3 >> 20)) +
-			((xxh.v4 << 18) | (xxh.v4 >> 14))
+		h32 += rol1(xxh.v1) + rol7(xxh.v2) + rol12(xxh.v3) + rol18(xxh.v4)
 	} else {
 		h32 += xxh.seed + prime32_5
 	}
@@ -131,12 +116,12 @@ func (xxh *xxHash) Sum32() uint32 {
 	p := 0
 	n := xxh.bufused
 	for n := n - 4; p <= n; p += 4 {
-		h32 += (uint32(xxh.buf[p+3])<<24 | uint32(xxh.buf[p+2])<<16 | uint32(xxh.buf[p+1])<<8 | uint32(xxh.buf[p])) * prime32_3
-		h32 = ((h32 << 17) | (h32 >> 15)) * prime32_4
+		h32 += u32(xxh.buf[p:p+4]) * prime32_3
+		h32 = rol17(h32) * prime32_4
 	}
 	for ; p < n; p++ {
 		h32 += uint32(xxh.buf[p]) * prime32_5
-		h32 = ((h32 << 11) | (h32 >> 21)) * prime32_1
+		h32 = rol11(h32) * prime32_1
 	}
 
 	h32 ^= h32 >> 15
@@ -161,37 +146,26 @@ func Checksum(input []byte, seed uint32) uint32 {
 		v3 := seed
 		v4 := seed - prime32_1
 		p := 0
-		for p <= n-16 {
-			v1 += (uint32(input[p+3])<<24 | uint32(input[p+2])<<16 | uint32(input[p+1])<<8 | uint32(input[p])) * prime32_2
-			v1 = (v1<<13 | v1>>19) * prime32_1
-			p += 4
-			v2 += (uint32(input[p+3])<<24 | uint32(input[p+2])<<16 | uint32(input[p+1])<<8 | uint32(input[p])) * prime32_2
-			v2 = (v2<<13 | v2>>19) * prime32_1
-			p += 4
-			v3 += (uint32(input[p+3])<<24 | uint32(input[p+2])<<16 | uint32(input[p+1])<<8 | uint32(input[p])) * prime32_2
-			v3 = (v3<<13 | v3>>19) * prime32_1
-			p += 4
-			v4 += (uint32(input[p+3])<<24 | uint32(input[p+2])<<16 | uint32(input[p+1])<<8 | uint32(input[p])) * prime32_2
-			v4 = (v4<<13 | v4>>19) * prime32_1
-			p += 4
+		for n := n - 16; p <= n; p += 16 {
+			sub := input[p:][:16] //BCE hint for compiler
+			v1 = rol13(v1+u32(sub[:])*prime32_2) * prime32_1
+			v2 = rol13(v2+u32(sub[4:])*prime32_2) * prime32_1
+			v3 = rol13(v3+u32(sub[8:])*prime32_2) * prime32_1
+			v4 = rol13(v4+u32(sub[12:])*prime32_2) * prime32_1
 		}
 		input = input[p:]
 		n -= p
-		h32 += ((v1 << 1) | (v1 >> 31)) +
-			((v2 << 7) | (v2 >> 25)) +
-			((v3 << 12) | (v3 >> 20)) +
-			((v4 << 18) | (v4 >> 14))
+		h32 += rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
 	}
 
 	p := 0
-	for p <= n-4 {
-		h32 += (uint32(input[p+3])<<24 | uint32(input[p+2])<<16 | uint32(input[p+1])<<8 | uint32(input[p])) * prime32_3
-		h32 = ((h32 << 17) | (h32 >> 15)) * prime32_4
-		p += 4
+	for n := n - 4; p <= n; p += 4 {
+		h32 += u32(input[p:p+4]) * prime32_3
+		h32 = rol17(h32) * prime32_4
 	}
 	for p < n {
 		h32 += uint32(input[p]) * prime32_5
-		h32 = ((h32 << 11) | (h32 >> 21)) * prime32_1
+		h32 = rol11(h32) * prime32_1
 		p++
 	}
 
@@ -203,3 +177,36 @@ func Checksum(input []byte, seed uint32) uint32 {
 
 	return h32
 }
+
+func u32(buf []byte) uint32 {
+	// go compiler recognizes this pattern and optimizes it on little endian platforms
+	return uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+}
+
+func rol1(u uint32) uint32 {
+	return u<<1 | u>>31
+}
+
+func rol7(u uint32) uint32 {
+	return u<<7 | u>>25
+}
+
+func rol11(u uint32) uint32 {
+	return u<<11 | u>>21
+}
+
+func rol12(u uint32) uint32 {
+	return u<<12 | u>>20
+}
+
+func rol13(u uint32) uint32 {
+	return u<<13 | u>>19
+}
+
+func rol17(u uint32) uint32 {
+	return u<<17 | u>>15
+}
+
+func rol18(u uint32) uint32 {
+	return u<<18 | u>>14
+}
diff --git a/xxHash64/xxHash64.go b/xxHash64/xxHash64.go
index 2788e95..8186fb1 100644
--- a/xxHash64/xxHash64.go
+++ b/xxHash64/xxHash64.go
@@ -79,36 +79,24 @@ func (xxh *xxHash) Write(input []byte) (int, error) {
 		xxh.bufused += len(input) - r
 
 		// fast rotl(31)
-		p64 := xxh.v1 + (uint64(xxh.buf[p+7])<<56|uint64(xxh.buf[p+6])<<48|uint64(xxh.buf[p+5])<<40|uint64(xxh.buf[p+4])<<32|uint64(xxh.buf[p+3])<<24|uint64(xxh.buf[p+2])<<16|uint64(xxh.buf[p+1])<<8|uint64(xxh.buf[p]))*prime64_2
-		xxh.v1 = (p64<<31 | p64>>33) * prime64_1
-		p += 8
-		p64 = xxh.v2 + (uint64(xxh.buf[p+7])<<56|uint64(xxh.buf[p+6])<<48|uint64(xxh.buf[p+5])<<40|uint64(xxh.buf[p+4])<<32|uint64(xxh.buf[p+3])<<24|uint64(xxh.buf[p+2])<<16|uint64(xxh.buf[p+1])<<8|uint64(xxh.buf[p]))*prime64_2
-		xxh.v2 = (p64<<31 | p64>>33) * prime64_1
-		p += 8
-		p64 = xxh.v3 + (uint64(xxh.buf[p+7])<<56|uint64(xxh.buf[p+6])<<48|uint64(xxh.buf[p+5])<<40|uint64(xxh.buf[p+4])<<32|uint64(xxh.buf[p+3])<<24|uint64(xxh.buf[p+2])<<16|uint64(xxh.buf[p+1])<<8|uint64(xxh.buf[p]))*prime64_2
-		xxh.v3 = (p64<<31 | p64>>33) * prime64_1
-		p += 8
-		p64 = xxh.v4 + (uint64(xxh.buf[p+7])<<56|uint64(xxh.buf[p+6])<<48|uint64(xxh.buf[p+5])<<40|uint64(xxh.buf[p+4])<<32|uint64(xxh.buf[p+3])<<24|uint64(xxh.buf[p+2])<<16|uint64(xxh.buf[p+1])<<8|uint64(xxh.buf[p]))*prime64_2
-		xxh.v4 = (p64<<31 | p64>>33) * prime64_1
-
+		xxh.v1 = rol31(xxh.v1+u64(xxh.buf[:])*prime64_2) * prime64_1
+		xxh.v2 = rol31(xxh.v2+u64(xxh.buf[8:])*prime64_2) * prime64_1
+		xxh.v3 = rol31(xxh.v3+u64(xxh.buf[16:])*prime64_2) * prime64_1
+		xxh.v4 = rol31(xxh.v4+u64(xxh.buf[24:])*prime64_2) * prime64_1
 		p = r
 		xxh.bufused = 0
 	}
 
-	for n := n - 32; p <= n; {
-		p64 := xxh.v1 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-		xxh.v1 = (p64<<31 | p64>>33) * prime64_1
-		p += 8
-		p64 = xxh.v2 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-		xxh.v2 = (p64<<31 | p64>>33) * prime64_1
-		p += 8
-		p64 = xxh.v3 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-		xxh.v3 = (p64<<31 | p64>>33) * prime64_1
-		p += 8
-		p64 = xxh.v4 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-		xxh.v4 = (p64<<31 | p64>>33) * prime64_1
-		p += 8
+	// Causes compiler to work directly from registers instead of stack:
+	v1, v2, v3, v4 := xxh.v1, xxh.v2, xxh.v3, xxh.v4
+	for n := n - 32; p <= n; p += 32 {
+		sub := input[p:][:32] //BCE hint for compiler
+		v1 = rol31(v1+u64(sub[:])*prime64_2) * prime64_1
+		v2 = rol31(v2+u64(sub[8:])*prime64_2) * prime64_1
+		v3 = rol31(v3+u64(sub[16:])*prime64_2) * prime64_1
+		v4 = rol31(v4+u64(sub[24:])*prime64_2) * prime64_1
 	}
+	xxh.v1, xxh.v2, xxh.v3, xxh.v4 = v1, v2, v3, v4
 
 	copy(xxh.buf[xxh.bufused:], input[p:])
 	xxh.bufused += len(input) - p
@@ -120,26 +108,19 @@ func (xxh *xxHash) Write(input []byte) (int, error) {
 func (xxh *xxHash) Sum64() uint64 {
 	var h64 uint64
 	if xxh.totalLen >= 32 {
-		h64 = ((xxh.v1 << 1) | (xxh.v1 >> 63)) +
-			((xxh.v2 << 7) | (xxh.v2 >> 57)) +
-			((xxh.v3 << 12) | (xxh.v3 >> 52)) +
-			((xxh.v4 << 18) | (xxh.v4 >> 46))
+		h64 = rol1(xxh.v1) + rol7(xxh.v2) + rol12(xxh.v3) + rol18(xxh.v4)
 
 		xxh.v1 *= prime64_2
-		h64 ^= ((xxh.v1 << 31) | (xxh.v1 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4
-
 		xxh.v2 *= prime64_2
-		h64 ^= ((xxh.v2 << 31) | (xxh.v2 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4
-
 		xxh.v3 *= prime64_2
-		h64 ^= ((xxh.v3 << 31) | (xxh.v3 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4
-
 		xxh.v4 *= prime64_2
-		h64 ^= ((xxh.v4 << 31) | (xxh.v4 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4 + xxh.totalLen
+
+		h64 = (h64^(rol31(xxh.v1)*prime64_1))*prime64_1 + prime64_4
+		h64 = (h64^(rol31(xxh.v2)*prime64_1))*prime64_1 + prime64_4
+		h64 = (h64^(rol31(xxh.v3)*prime64_1))*prime64_1 + prime64_4
+		h64 = (h64^(rol31(xxh.v4)*prime64_1))*prime64_1 + prime64_4
+
+		h64 += xxh.totalLen
 	} else {
 		h64 = xxh.seed + prime64_5 + xxh.totalLen
 	}
@@ -147,18 +128,18 @@ func (xxh *xxHash) Sum64() uint64 {
 	p := 0
 	n := xxh.bufused
 	for n := n - 8; p <= n; p += 8 {
-		p64 := (uint64(xxh.buf[p+7])<<56 | uint64(xxh.buf[p+6])<<48 | uint64(xxh.buf[p+5])<<40 | uint64(xxh.buf[p+4])<<32 | uint64(xxh.buf[p+3])<<24 | uint64(xxh.buf[p+2])<<16 | uint64(xxh.buf[p+1])<<8 | uint64(xxh.buf[p])) * prime64_2
-		h64 ^= ((p64 << 31) | (p64 >> 33)) * prime64_1
-		h64 = ((h64<<27)|(h64>>37))*prime64_1 + prime64_4
+		h64 ^= rol31(u64(xxh.buf[p:p+8])*prime64_2) * prime64_1
+		h64 = rol27(h64)*prime64_1 + prime64_4
 	}
 	if p+4 <= n {
-		h64 ^= (uint64(xxh.buf[p+3])<<24 | uint64(xxh.buf[p+2])<<16 | uint64(xxh.buf[p+1])<<8 | uint64(xxh.buf[p])) * prime64_1
-		h64 = ((h64<<23)|(h64>>41))*prime64_2 + prime64_3
+		sub := xxh.buf[p : p+4]
+		h64 ^= uint64(u32(sub)) * prime64_1
+		h64 = rol23(h64)*prime64_2 + prime64_3
 		p += 4
 	}
 	for ; p < n; p++ {
 		h64 ^= uint64(xxh.buf[p]) * prime64_5
-		h64 = ((h64 << 11) | (h64 >> 53)) * prime64_1
+		h64 = rol11(h64) * prime64_1
 	}
 
 	h64 ^= h64 >> 33
@@ -181,41 +162,27 @@ func Checksum(input []byte, seed uint64) uint64 {
 		v3 := seed
 		v4 := seed - prime64_1
 		p := 0
-		for n := n - 32; p <= n; {
-			p64 := v1 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-			v1 = (p64<<31 | p64>>33) * prime64_1
-			p += 8
-			p64 = v2 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-			v2 = (p64<<31 | p64>>33) * prime64_1
-			p += 8
-			p64 = v3 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-			v3 = (p64<<31 | p64>>33) * prime64_1
-			p += 8
-			p64 = v4 + (uint64(input[p+7])<<56|uint64(input[p+6])<<48|uint64(input[p+5])<<40|uint64(input[p+4])<<32|uint64(input[p+3])<<24|uint64(input[p+2])<<16|uint64(input[p+1])<<8|uint64(input[p]))*prime64_2
-			v4 = (p64<<31 | p64>>33) * prime64_1
-			p += 8
+		for n := n - 32; p <= n; p += 32 {
+			sub := input[p:][:32] //BCE hint for compiler
+			v1 = rol31(v1+u64(sub[:])*prime64_2) * prime64_1
+			v2 = rol31(v2+u64(sub[8:])*prime64_2) * prime64_1
+			v3 = rol31(v3+u64(sub[16:])*prime64_2) * prime64_1
+			v4 = rol31(v4+u64(sub[24:])*prime64_2) * prime64_1
 		}
 
-		h64 = ((v1 << 1) | (v1 >> 63)) +
-			((v2 << 7) | (v2 >> 57)) +
-			((v3 << 12) | (v3 >> 52)) +
-			((v4 << 18) | (v4 >> 46))
+		h64 = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
 
 		v1 *= prime64_2
-		h64 ^= ((v1 << 31) | (v1 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4
-
 		v2 *= prime64_2
-		h64 ^= ((v2 << 31) | (v2 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4
-
 		v3 *= prime64_2
-		h64 ^= ((v3 << 31) | (v3 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4
-
 		v4 *= prime64_2
-		h64 ^= ((v4 << 31) | (v4 >> 33)) * prime64_1
-		h64 = h64*prime64_1 + prime64_4 + uint64(n)
+
+		h64 = (h64^(rol31(v1)*prime64_1))*prime64_1 + prime64_4
+		h64 = (h64^(rol31(v2)*prime64_1))*prime64_1 + prime64_4
+		h64 = (h64^(rol31(v3)*prime64_1))*prime64_1 + prime64_4
+		h64 = (h64^(rol31(v4)*prime64_1))*prime64_1 + prime64_4
+
+		h64 += uint64(n)
 
 		input = input[p:]
 		n -= p
@@ -225,18 +192,19 @@ func Checksum(input []byte, seed uint64) uint64 {
 
 	p := 0
 	for n := n - 8; p <= n; p += 8 {
-		p64 := (uint64(input[p+7])<<56 | uint64(input[p+6])<<48 | uint64(input[p+5])<<40 | uint64(input[p+4])<<32 | uint64(input[p+3])<<24 | uint64(input[p+2])<<16 | uint64(input[p+1])<<8 | uint64(input[p])) * prime64_2
-		h64 ^= ((p64 << 31) | (p64 >> 33)) * prime64_1
-		h64 = ((h64<<27)|(h64>>37))*prime64_1 + prime64_4
+		sub := input[p : p+8]
+		h64 ^= rol31(u64(sub)*prime64_2) * prime64_1
+		h64 = rol27(h64)*prime64_1 + prime64_4
 	}
 	if p+4 <= n {
-		h64 ^= (uint64(input[p+3])<<24 | uint64(input[p+2])<<16 | uint64(input[p+1])<<8 | uint64(input[p])) * prime64_1
-		h64 = ((h64<<23)|(h64>>41))*prime64_2 + prime64_3
+		sub := input[p : p+4]
+		h64 ^= uint64(u32(sub)) * prime64_1
+		h64 = rol23(h64)*prime64_2 + prime64_3
 		p += 4
 	}
 	for ; p < n; p++ {
 		h64 ^= uint64(input[p]) * prime64_5
-		h64 = ((h64 << 11) | (h64 >> 53)) * prime64_1
+		h64 = rol11(h64) * prime64_1
 	}
 
 	h64 ^= h64 >> 33
@@ -247,3 +215,43 @@ func Checksum(input []byte, seed uint64) uint64 {
 
 	return h64
 }
+
+func u64(buf []byte) uint64 {
+	// go compiler recognizes this pattern and optimizes it on little endian platforms
+	return uint64(buf[0]) | uint64(buf[1])<<8 | uint64(buf[2])<<16 | uint64(buf[3])<<24 | uint64(buf[4])<<32 | uint64(buf[5])<<40 | uint64(buf[6])<<48 | uint64(buf[7])<<56
+}
+
+func u32(buf []byte) uint32 {
+	return uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+}
+
+func rol1(u uint64) uint64 {
+	return u<<1 | u>>63
+}
+
+func rol7(u uint64) uint64 {
+	return u<<7 | u>>57
+}
+
+func rol11(u uint64) uint64 {
+	return u<<11 | u>>53
+}
+
+func rol12(u uint64) uint64 {
+	return u<<12 | u>>52
+}
+
+func rol18(u uint64) uint64 {
+	return u<<18 | u>>46
+}
+
+func rol23(u uint64) uint64 {
+	return u<<23 | u>>41
+}
+
+func rol27(u uint64) uint64 {
+	return u<<27 | u>>37
+}
+func rol31(u uint64) uint64 {
+	return u<<31 | u>>33
+}
diff --git a/xxhsum/main.go b/xxhsum/main.go
index b73158b..f4103a8 100644
--- a/xxhsum/main.go
+++ b/xxhsum/main.go
@@ -18,7 +18,7 @@ import (
 )
 
 func main() {
-	seed := flag.Uint64("seed", 0, "seed value")
+	seed := flag.Uint64("seed", 0, "uint32 or uint64 `seed` based on the selected mode (default 0)")
 	mode := flag.Int("mode", 1, "hash mode: 0=32bits, 1=64bits")
 	flag.Parse()
 
@@ -29,6 +29,25 @@ func main() {
 		xxh = xxHash64.New(*seed)
 	}
 
+	print := func(s string) {
+		h := xxh.Sum(nil)
+		n := len(h)
+		j := n - 1
+		for i := 0; i < n/2; {
+			h[i], h[j] = h[j], h[i]
+			i++
+			j--
+		}
+		fmt.Printf("%x %s\n", h, s)
+	}
+
+	if len(flag.Args()) == 0 {
+		if _, err := io.Copy(xxh, os.Stdin); err == nil {
+			print("stdin")
+		}
+		return
+	}
+
 	// Process each file in sequence
 	for _, filename := range flag.Args() {
 		inputFile, err := os.Open(filename)

More details

Full run details