New Upstream Release - golang-siphash-dev

Ready changes

Summary

Merged new upstream version: 1.2.3 (was: 1.0.0).

Diff

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0e259d4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,121 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
diff --git a/README b/README
deleted file mode 100644
index 9c9ca20..0000000
--- a/README
+++ /dev/null
@@ -1,49 +0,0 @@
-Go implementation of SipHash-2-4, a fast short-input PRF created by
-Jean-Philippe Aumasson and Daniel J. Bernstein (http://131002.net/siphash/).
-
-INSTALLATION
-
-    $ go get github.com/dchest/siphash
-
-USAGE
-
-    import "github.com/dchest/siphash"
-
-There are two ways to use this package.
-The slower one is to use the standard hash.Hash64 interface:
-
-    h := siphash.New(key)
-    h.Write([]byte("Hello"))
-    sum := h.Sum(nil) // returns 8-byte []byte
-
-or
-
-    sum64 := h.Sum64() // returns uint64
-
-The faster one is to use Hash() function, which takes two uint64 parts of
-16-byte key and a byte slice, and returns uint64 hash:
-
-    sum64 := siphash.Hash(key0, key1, []byte("Hello"))
-
-The keys and output are little-endian.
-
-FUNCTIONS
-
-func Hash(k0, k1 uint64, p []byte) uint64
-
-    Hash returns the 64-bit SipHash-2-4 of the given byte slice with two
-    64-bit parts of 128-bit key: k0 and k1.
-
-func New(key []byte) hash.Hash64
-
-    New returns a new hash.Hash64 computing SipHash-2-4 with 16-byte key.
-
-
-PUBLIC DOMAIN DEDICATION
-
-Written in 2012 by Dmitry Chestnykh.
-
-To the extent possible under law, the author have dedicated all copyright
-and related and neighboring rights to this software to the public domain
-worldwide. This software is distributed without any warranty.
-http://creativecommons.org/publicdomain/zero/1.0/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5745ded
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+SipHash (Go)
+============
+
+Go implementation of SipHash-2-4, a fast short-input PRF created by
+Jean-Philippe Aumasson and Daniel J. Bernstein (http://131002.net/siphash/).
+
+
+## Installation
+
+    $ go get github.com/dchest/siphash
+
+## Usage
+
+    import "github.com/dchest/siphash"
+
+There are two ways to use this package.
+The slower one is to use the standard hash.Hash64 interface:
+
+    h := siphash.New(key)
+    h.Write([]byte("Hello"))
+    sum := h.Sum(nil) // returns 8-byte []byte
+
+or
+
+    sum64 := h.Sum64() // returns uint64
+
+The faster one is to use Hash() function, which takes two uint64 parts of
+16-byte key and a byte slice, and returns uint64 hash:
+
+    sum64 := siphash.Hash(key0, key1, []byte("Hello"))
+
+The keys and output are little-endian.
+
+
+## Functions
+
+### func Hash(k0, k1 uint64, p []byte) uint64
+
+Hash returns the 64-bit SipHash-2-4 of the given byte slice with two
+64-bit parts of 128-bit key: k0 and k1.
+
+### func Hash128(k0, k1 uint64, p []byte) (uint64, uint64)
+
+Hash128 returns the 128-bit SipHash-2-4 of the given byte slice with two
+64-bit parts of 128-bit key: k0 and k1.
+
+Note that 128-bit SipHash is considered experimental by SipHash authors at this time.
+
+### func New(key []byte) hash.Hash64
+
+New returns a new hash.Hash64 computing SipHash-2-4 with 16-byte key.
+
+### func New128(key []byte) hash.Hash
+
+New128 returns a new hash.Hash computing SipHash-2-4 with 16-byte key and 16-byte output.
+
+Note that 16-byte output is considered experimental by SipHash authors at this time.
+
+
+## Public domain dedication
+
+Written by Dmitry Chestnykh and Damian Gryski.
+
+To the extent possible under law, the authors have dedicated all copyright
+and related and neighboring rights to this software to the public domain
+worldwide. This software is distributed without any warranty.
+http://creativecommons.org/publicdomain/zero/1.0/
diff --git a/blocks.go b/blocks.go
new file mode 100644
index 0000000..14e763c
--- /dev/null
+++ b/blocks.go
@@ -0,0 +1,149 @@
+//go:build (!arm && !amd64) || appengine || gccgo
+// +build !arm,!amd64 appengine gccgo
+
+package siphash
+
+func once(d *digest) {
+	blocks(d, d.x[:])
+}
+
+func finalize(d *digest) uint64 {
+	d0 := *d
+	once(&d0)
+
+	v0, v1, v2, v3 := d0.v0, d0.v1, d0.v2, d0.v3
+	v2 ^= 0xff
+
+	// Round 1.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 2.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 3.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 4.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	return v0 ^ v1 ^ v2 ^ v3
+}
+
+func blocks(d *digest, p []uint8) {
+	v0, v1, v2, v3 := d.v0, d.v1, d.v2, d.v3
+
+	for len(p) >= BlockSize {
+		m := uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 |
+			uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56
+
+		v3 ^= m
+
+		// Round 1.
+		v0 += v1
+		v1 = v1<<13 | v1>>(64-13)
+		v1 ^= v0
+		v0 = v0<<32 | v0>>(64-32)
+
+		v2 += v3
+		v3 = v3<<16 | v3>>(64-16)
+		v3 ^= v2
+
+		v0 += v3
+		v3 = v3<<21 | v3>>(64-21)
+		v3 ^= v0
+
+		v2 += v1
+		v1 = v1<<17 | v1>>(64-17)
+		v1 ^= v2
+		v2 = v2<<32 | v2>>(64-32)
+
+		// Round 2.
+		v0 += v1
+		v1 = v1<<13 | v1>>(64-13)
+		v1 ^= v0
+		v0 = v0<<32 | v0>>(64-32)
+
+		v2 += v3
+		v3 = v3<<16 | v3>>(64-16)
+		v3 ^= v2
+
+		v0 += v3
+		v3 = v3<<21 | v3>>(64-21)
+		v3 ^= v0
+
+		v2 += v1
+		v1 = v1<<17 | v1>>(64-17)
+		v1 ^= v2
+		v2 = v2<<32 | v2>>(64-32)
+
+		v0 ^= m
+
+		p = p[BlockSize:]
+	}
+
+	d.v0, d.v1, d.v2, d.v3 = v0, v1, v2, v3
+}
diff --git a/blocks_amd64.s b/blocks_amd64.s
new file mode 100644
index 0000000..2327866
--- /dev/null
+++ b/blocks_amd64.s
@@ -0,0 +1,87 @@
+//go:build amd64 && !appengine && !gccgo
+// +build amd64,!appengine,!gccgo
+
+#define ROUND(v0, v1, v2, v3) \
+	ADDQ v1, v0; \
+	RORQ $51, v1; \
+	ADDQ v3, v2; \
+	XORQ v0, v1; \
+	RORQ $48, v3; \
+	RORQ $32, v0; \
+	XORQ v2, v3; \
+	ADDQ v1, v2; \
+	ADDQ v3, v0; \
+	RORQ $43, v3; \
+	RORQ $47, v1; \
+	XORQ v0, v3; \
+	XORQ v2, v1; \
+	RORQ $32, v2
+
+// blocks(d *digest, data []uint8)
+TEXT ·blocks(SB),4,$0-32
+	MOVQ d+0(FP), BX
+	MOVQ 0(BX), R9		// R9 = v0
+	MOVQ 8(BX), R10		// R10 = v1
+	MOVQ 16(BX), R11	// R11 = v2
+	MOVQ 24(BX), R12	// R12 = v3
+	MOVQ p_base+8(FP), DI	// DI = *uint64
+	MOVQ p_len+16(FP), SI	// SI = nblocks
+	XORL DX, DX		// DX = index (0)
+	SHRQ $3, SI 		// SI /= 8
+body:
+	CMPQ DX, SI
+	JGE  end
+	MOVQ 0(DI)(DX*8), CX	// CX = m
+	XORQ CX, R12
+	ROUND(R9, R10, R11, R12)
+	ROUND(R9, R10, R11, R12)
+	XORQ CX, R9
+	ADDQ $1, DX
+	JMP  body
+end:
+	MOVQ R9, 0(BX)
+	MOVQ R10, 8(BX)
+	MOVQ R11, 16(BX)
+	MOVQ R12, 24(BX)
+	RET
+
+// once(d *digest)
+TEXT ·once(SB),4,$0-8
+	MOVQ d+0(FP), BX
+	MOVQ 0(BX), R9		// R9 = v0
+	MOVQ 8(BX), R10		// R10 = v1
+	MOVQ 16(BX), R11	// R11 = v2
+	MOVQ 24(BX), R12	// R12 = v3
+	MOVQ 48(BX), CX		// CX = d.x[:]
+	XORQ CX, R12
+	ROUND(R9, R10, R11, R12)
+	ROUND(R9, R10, R11, R12)
+	XORQ CX, R9
+	MOVQ R9, 0(BX)
+	MOVQ R10, 8(BX)
+	MOVQ R11, 16(BX)
+	MOVQ R12, 24(BX)
+	RET
+
+// finalize(d *digest) uint64
+TEXT ·finalize(SB),4,$0-16
+	MOVQ d+0(FP), BX
+	MOVQ 0(BX), R9		// R9 = v0
+	MOVQ 8(BX), R10		// R10 = v1
+	MOVQ 16(BX), R11	// R11 = v2
+	MOVQ 24(BX), R12	// R12 = v3
+	MOVQ 48(BX), CX		// CX = d.x[:]
+	XORQ CX, R12
+	ROUND(R9, R10, R11, R12)
+	ROUND(R9, R10, R11, R12)
+	XORQ CX, R9
+	NOTB R11
+	ROUND(R9, R10, R11, R12)
+	ROUND(R9, R10, R11, R12)
+	ROUND(R9, R10, R11, R12)
+	ROUND(R9, R10, R11, R12)
+	XORQ R12, R11
+	XORQ R10, R9
+	XORQ R11, R9
+	MOVQ R9, ret+8(FP)
+	RET
diff --git a/blocks_arm.s b/blocks_arm.s
new file mode 100644
index 0000000..adf5d67
--- /dev/null
+++ b/blocks_arm.s
@@ -0,0 +1,140 @@
+#include "textflag.h"
+
+#define ROUND()\
+	ADD.S	R2,R0,R0;\
+	ADC	R3,R1,R1;\
+	EOR	R2<<13,R0,R8;\
+	EOR	R3>>19,R8,R8;\
+	EOR	R2>>19,R1,R11;\
+	EOR	R3<<13,R11,R11;\
+	ADD.S	R6,R4,R4;\
+	ADC	R7,R5,R5;\
+	EOR	R6<<16,R4,R2;\
+	EOR	R7>>16,R2,R2;\
+	EOR	R6>>16,R5,R3;\
+	EOR	R7<<16,R3,R3;\
+	ADD.S	R2,R1,R1;\
+	ADC	R3,R0,R0;\
+	EOR	R2<<21,R1,R6;\
+	EOR	R3>>11,R6,R6;\
+	EOR	R2>>11,R0,R7;\
+	EOR	R3<<21,R7,R7;\
+	ADD.S	R8,R4,R4;\
+	ADC	R11,R5,R5;\
+	EOR	R8<<17,R4,R2;\
+	EOR	R11>>15,R2,R2;\
+	EOR	R8>>15,R5,R3;\
+	EOR	R11<<17,R3,R3;\
+	ADD.S	R2,R1,R1;\
+	ADC	R3,R0,R0;\
+	EOR	R2<<13,R1,R8;\
+	EOR	R3>>19,R8,R8;\
+	EOR	R2>>19,R0,R11;\
+	EOR	R3<<13,R11,R11;\
+	ADD.S	R6,R5,R5;\
+	ADC	R7,R4,R4;\
+	EOR	R6<<16,R5,R2;\
+	EOR	R7>>16,R2,R2;\
+	EOR	R6>>16,R4,R3;\
+	EOR	R7<<16,R3,R3;\
+	ADD.S	R2,R0,R0;\
+	ADC	R3,R1,R1;\
+	EOR	R2<<21,R0,R6;\
+	EOR	R3>>11,R6,R6;\
+	EOR	R2>>11,R1,R7;\
+	EOR	R3<<21,R7,R7;\
+	ADD.S	R8,R5,R5;\
+	ADC	R11,R4,R4;\
+	EOR	R8<<17,R5,R2;\
+	EOR	R11>>15,R2,R2;\
+	EOR	R8>>15,R4,R3;\
+	EOR	R11<<17,R3,R3;
+
+// once(d *digest)
+TEXT ·once(SB),NOSPLIT,$4-4
+	MOVW	d+0(FP),R8
+	MOVM.IA	(R8),[R0,R1,R2,R3,R4,R5,R6,R7]
+	MOVW	48(R8),R12
+	MOVW	52(R8),R14
+	EOR	R12,R6,R6
+	EOR	R14,R7,R7
+	ROUND()
+	EOR	R12,R0,R0
+	EOR	R14,R1,R1
+	MOVW	d+0(FP),R8
+	MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8)
+	RET
+
+// finalize(d *digest) uint64
+TEXT ·finalize(SB),NOSPLIT,$4-12
+	MOVW	d+0(FP),R8
+	MOVM.IA	(R8),[R0,R1,R2,R3,R4,R5,R6,R7]
+	MOVW	48(R8),R12
+	MOVW	52(R8),R14
+	EOR	R12,R6,R6
+	EOR	R14,R7,R7
+	ROUND()
+	EOR	R12,R0,R0
+	EOR	R14,R1,R1
+	EOR	$255,R4
+	ROUND()
+	ROUND()
+	EOR	R2,R0,R0
+	EOR	R3,R1,R1
+	EOR	R6,R4,R4
+	EOR	R7,R5,R5
+	EOR	R4,R0,R0
+	EOR	R5,R1,R1
+	MOVW	R0,ret_lo+4(FP)
+	MOVW	R1,ret_hi+8(FP)
+	RET
+
+// blocks(d *digest, data []uint8)
+TEXT ·blocks(SB),NOSPLIT,$4-16
+	MOVW	d+0(FP),R8
+	MOVM.IA	(R8),[R0,R1,R2,R3,R4,R5,R6,R7]
+	MOVW	p+4(FP),R9
+	MOVW	p_len+8(FP),R11
+	ADD	R9,R11,R11
+	MOVW	R11,endp-4(SP)
+	AND.S   $3,R9,R8
+	BNE     blocksunaligned
+blocksloop:
+	MOVM.IA.W (R9),[R12,R14]
+	EOR	R12,R6,R6
+	EOR	R14,R7,R7
+	ROUND()
+	EOR	R12,R0,R0
+	EOR	R14,R1,R1
+	MOVW	endp-4(SP),R11
+	CMP	R11,R9
+	BLO	blocksloop
+	MOVW	d+0(FP),R8
+	MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8)
+	RET
+blocksunaligned:
+	MOVBU.P   8(R9),R12
+	MOVBU    -7(R9),R11
+	ORR     R11<<8,R12,R12
+	MOVBU    -6(R9),R11
+	ORR     R11<<16,R12,R12
+	MOVBU    -5(R9),R11
+	ORR     R11<<24,R12,R12
+	MOVBU    -4(R9),R14
+	MOVBU    -3(R9),R11
+	ORR     R11<<8,R14,R14
+	MOVBU    -2(R9),R11
+	ORR     R11<<16,R14,R14
+	MOVBU    -1(R9),R11
+	ORR     R11<<24,R14,R14
+	EOR     R12,R6,R6
+	EOR     R14,R7,R7
+	ROUND()
+	EOR     R12,R0,R0
+	EOR     R14,R1,R1
+	MOVW    endp-4(SP),R11
+	CMP     R11,R9
+	BLO     blocksunaligned
+	MOVW    d+0(FP),R8
+	MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8)
+	RET
diff --git a/blocks_asm.go b/blocks_asm.go
new file mode 100644
index 0000000..c14b35d
--- /dev/null
+++ b/blocks_asm.go
@@ -0,0 +1,22 @@
+//go:build arm || (amd64 && !appengine && !gccgo)
+// +build arm amd64,!appengine,!gccgo
+
+// Written in 2012 by Dmitry Chestnykh.
+//
+// To the extent possible under law, the author have dedicated all copyright
+// and related and neighboring rights to this software to the public domain
+// worldwide. This software is distributed without any warranty.
+// http://creativecommons.org/publicdomain/zero/1.0/
+
+// This file contains function declarations for the assembly implementations of blocks(), finalize() and once().
+
+package siphash
+
+//go:noescape
+func blocks(d *digest, p []uint8)
+
+//go:noescape
+func finalize(d *digest) uint64
+
+//go:noescape
+func once(d *digest)
diff --git a/debian/changelog b/debian/changelog
index 5fd8bfb..9bcf3ab 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+golang-siphash-dev (1.2.3-1) UNRELEASED; urgency=low
+
+  * New upstream release.
+
+ -- Debian Janitor <janitor@jelmer.uk>  Tue, 27 Jun 2023 02:00:50 -0000
+
 golang-siphash-dev (1.0.0-2) unstable; urgency=medium
 
   [ Ximin Luo ]
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..3783991
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/dchest/siphash
+
+go 1.16
diff --git a/hash.go b/hash.go
index 4e8027d..8754416 100644
--- a/hash.go
+++ b/hash.go
@@ -1,3 +1,6 @@
+//go:build (!arm && !amd64) || appengine || gccgo
+// +build !arm,!amd64 appengine gccgo
+
 // Written in 2012 by Dmitry Chestnykh.
 //
 // To the extent possible under law, the author have dedicated all copyright
diff --git a/hash128.go b/hash128.go
new file mode 100644
index 0000000..f92f773
--- /dev/null
+++ b/hash128.go
@@ -0,0 +1,304 @@
+//go:build (!arm && !amd64) || appengine || gccgo
+// +build !arm,!amd64 appengine gccgo
+
+// Written in 2012 by Dmitry Chestnykh.
+// Modifications 2014 for 128-bit hash function by Damian Gryski.
+//
+// To the extent possible under law, the authors have dedicated all copyright
+// and related and neighboring rights to this software to the public domain
+// worldwide. This software is distributed without any warranty.
+// http://creativecommons.org/publicdomain/zero/1.0/
+
+package siphash
+
+// Hash128 returns the 128-bit SipHash-2-4 of the given byte slice with two 64-bit
+// parts of 128-bit key: k0 and k1.
+//
+// Note that 128-bit SipHash is considered experimental by SipHash authors at this time.
+func Hash128(k0, k1 uint64, p []byte) (uint64, uint64) {
+	// Initialization.
+	v0 := k0 ^ 0x736f6d6570736575
+	v1 := k1 ^ 0x646f72616e646f6d
+	v2 := k0 ^ 0x6c7967656e657261
+	v3 := k1 ^ 0x7465646279746573
+	t := uint64(len(p)) << 56
+
+	v1 ^= 0xee
+
+	// Compression.
+	for len(p) >= BlockSize {
+		m := uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 |
+			uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56
+		v3 ^= m
+
+		// Round 1.
+		v0 += v1
+		v1 = v1<<13 | v1>>(64-13)
+		v1 ^= v0
+		v0 = v0<<32 | v0>>(64-32)
+
+		v2 += v3
+		v3 = v3<<16 | v3>>(64-16)
+		v3 ^= v2
+
+		v0 += v3
+		v3 = v3<<21 | v3>>(64-21)
+		v3 ^= v0
+
+		v2 += v1
+		v1 = v1<<17 | v1>>(64-17)
+		v1 ^= v2
+		v2 = v2<<32 | v2>>(64-32)
+
+		// Round 2.
+		v0 += v1
+		v1 = v1<<13 | v1>>(64-13)
+		v1 ^= v0
+		v0 = v0<<32 | v0>>(64-32)
+
+		v2 += v3
+		v3 = v3<<16 | v3>>(64-16)
+		v3 ^= v2
+
+		v0 += v3
+		v3 = v3<<21 | v3>>(64-21)
+		v3 ^= v0
+
+		v2 += v1
+		v1 = v1<<17 | v1>>(64-17)
+		v1 ^= v2
+		v2 = v2<<32 | v2>>(64-32)
+
+		v0 ^= m
+		p = p[BlockSize:]
+	}
+
+	// Compress last block.
+	switch len(p) {
+	case 7:
+		t |= uint64(p[6]) << 48
+		fallthrough
+	case 6:
+		t |= uint64(p[5]) << 40
+		fallthrough
+	case 5:
+		t |= uint64(p[4]) << 32
+		fallthrough
+	case 4:
+		t |= uint64(p[3]) << 24
+		fallthrough
+	case 3:
+		t |= uint64(p[2]) << 16
+		fallthrough
+	case 2:
+		t |= uint64(p[1]) << 8
+		fallthrough
+	case 1:
+		t |= uint64(p[0])
+	}
+
+	v3 ^= t
+
+	// Round 1.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 2.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	v0 ^= t
+
+	// Finalization.
+	v2 ^= 0xee
+
+	// Round 1.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 2.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 3.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 4.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	r0 := v0 ^ v1 ^ v2 ^ v3
+
+	v1 ^= 0xdd
+
+	// Round 1.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 2.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 3.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 4.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	r1 := v0 ^ v1 ^ v2 ^ v3
+
+	return r0, r1
+}
diff --git a/hash128_amd64.s b/hash128_amd64.s
new file mode 100644
index 0000000..230cdf2
--- /dev/null
+++ b/hash128_amd64.s
@@ -0,0 +1,288 @@
+//go:build amd64 && !appengine && !gccgo
+// +build amd64,!appengine,!gccgo
+
+// This is a translation of the gcc output of FloodyBerry's pure-C public
+// domain siphash implementation at https://github.com/floodyberry/siphash
+
+// This assembly code has been modified from the 64-bit output to the experimental 128-bit output.
+
+// SI = v0
+// AX = v1
+// CX = v2
+// DX = v3
+
+// func Hash128(k0, k1 uint64, b []byte) (r0 uint64, r1 uint64)
+TEXT	·Hash128(SB),4,$0-56
+	MOVQ	k0+0(FP),CX
+	MOVQ	$0x736F6D6570736575,R9
+	MOVQ	k1+8(FP),DI
+	MOVQ	$0x6C7967656E657261,BX
+	MOVQ	$0x646F72616E646F6D,AX
+	MOVQ	b_len+24(FP),DX
+	XORQ	$0xEE,AX
+	MOVQ	DX,R11
+	MOVQ	DX,R10
+	XORQ	CX,R9
+	XORQ	CX,BX
+	MOVQ	$0x7465646279746573,CX
+	XORQ	DI,AX
+	XORQ	DI,CX
+	SHLQ	$0x38,R11
+	XORQ	DI,DI
+	MOVQ	b_base+16(FP),SI
+	ANDQ	$0xFFFFFFFFFFFFFFF8,R10
+	JE	afterLoop
+	XCHGQ	AX,AX
+loopBody:
+	MOVQ	0(SI)(DI*1),R8
+	ADDQ	AX,R9
+	RORQ	$0x33,AX
+	XORQ	R9,AX
+	RORQ	$0x20,R9
+	ADDQ	$0x8,DI
+	XORQ	R8,CX
+	ADDQ	CX,BX
+	RORQ	$0x30,CX
+	XORQ	BX,CX
+	ADDQ	AX,BX
+	RORQ	$0x2F,AX
+	ADDQ	CX,R9
+	RORQ	$0x2B,CX
+	XORQ	BX,AX
+	XORQ	R9,CX
+	RORQ	$0x20,BX
+	ADDQ	AX,R9
+	ADDQ	CX,BX
+	RORQ	$0x33,AX
+	RORQ	$0x30,CX
+	XORQ	R9,AX
+	XORQ	BX,CX
+	RORQ	$0x20,R9
+	ADDQ	AX,BX
+	ADDQ	CX,R9
+	RORQ	$0x2F,AX
+	RORQ	$0x2B,CX
+	XORQ	BX,AX
+	RORQ	$0x20,BX
+	XORQ	R9,CX
+	XORQ	R8,R9
+	CMPQ	R10,DI
+	JA	loopBody
+afterLoop:
+	ANDL	$7, DX
+	JZ	afterSwitch
+
+	// no support for jump tables
+
+	CMPQ	DX,$0x7
+	JE	sw7
+
+	CMPQ	DX,$0x6
+	JE	sw6
+
+	CMPQ	DX,$0x5
+	JE	sw5
+
+	CMPQ	DX,$0x4
+	JE	sw4
+
+	CMPQ	DX,$0x3
+	JE	sw3
+
+	CMPQ	DX,$0x2
+	JE	sw2
+
+	JMP	sw1
+
+sw7:	MOVBQZX	6(SI)(DI*1),DX
+	SHLQ	$0x30,DX
+	ORQ	DX,R11
+sw6:	MOVBQZX	0x5(SI)(DI*1),DX
+	SHLQ	$0x28,DX
+	ORQ	DX,R11
+sw5:	MOVBQZX	0x4(SI)(DI*1),DX
+	SHLQ	$0x20,DX
+	ORQ	DX,R11
+sw4:	MOVBQZX	0x3(SI)(DI*1),DX
+	SHLQ	$0x18,DX
+	ORQ	DX,R11
+sw3:	MOVBQZX	0x2(SI)(DI*1),DX
+	SHLQ	$0x10,DX
+	ORQ	DX,R11
+sw2:	MOVBQZX	0x1(SI)(DI*1),DX
+	SHLQ	$0x8,DX
+	ORQ	DX,R11
+sw1:	MOVBQZX	0(SI)(DI*1),DX
+	ORQ	DX,R11
+afterSwitch:
+	LEAQ	(AX)(R9*1),SI
+	XORQ	R11,CX
+	RORQ	$0x33,AX
+	ADDQ	CX,BX
+	MOVQ	CX,DX
+	XORQ	SI,AX
+	RORQ	$0x30,DX
+	RORQ	$0x20,SI
+	LEAQ	0(BX)(AX*1),CX
+	XORQ	BX,DX
+	RORQ	$0x2F,AX
+	ADDQ	DX,SI
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	XORQ	SI,AX
+	RORQ	$0x30,DX
+	RORQ	$0x20,SI
+	XORQ	CX,DX
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	ADDQ	DX,SI
+	XORQ	CX,AX
+	RORQ	$0x2B,DX
+	RORQ	$0x20,CX
+	XORQ	SI,DX
+	XORQ	R11,SI
+	XORB	$0xEE,CL
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	XORQ	CX,DX
+	RORQ	$0x20,SI
+	ADDQ	AX,CX
+	ADDQ	DX,SI
+	RORQ	$0x2F,AX
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	ADDQ	DX,CX
+	RORQ	$0x33,AX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	RORQ	$0x20,SI
+	XORQ	CX,DX
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	ADDQ	DX,SI
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	ADDQ	DX,CX
+	RORQ	$0x33,AX
+	RORQ	$0x30,DX
+	XORQ	CX,DX
+	XORQ	SI,AX
+	RORQ	$0x20,SI
+	ADDQ	DX,SI
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	XORQ	CX,AX
+	RORQ	$0x2B,DX
+	RORQ	$0x20,CX
+	XORQ	SI,DX
+
+	// gcc optimized the tail end of this function differently.  However,
+	// we need to preserve our registers to carry out the second stage of
+	// the finalization.  This is a duplicate of an earlier finalization
+	// round.
+
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	XORQ	CX,DX
+	RORQ	$0x20,SI
+	ADDQ	AX,CX
+	ADDQ	DX,SI
+	RORQ	$0x2F,AX
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+
+	// Stuff the result into BX instead of AX as gcc had done
+
+	MOVQ	SI,BX
+	XORQ	AX,BX
+	XORQ	DX,BX
+	XORQ	CX,BX
+	MOVQ	BX,ret+40(FP)
+
+	// Start the second finalization round
+
+	XORB	$0xDD,AL
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	XORQ	CX,DX
+	RORQ	$0x20,SI
+	ADDQ	AX,CX
+	ADDQ	DX,SI
+	RORQ	$0x2F,AX
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	ADDQ	DX,CX
+	RORQ	$0x33,AX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	RORQ	$0x20,SI
+	XORQ	CX,DX
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	ADDQ	DX,SI
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	ADDQ	DX,CX
+	RORQ	$0x33,AX
+	RORQ	$0x30,DX
+	XORQ	CX,DX
+	XORQ	SI,AX
+	RORQ	$0x20,SI
+	ADDQ	DX,SI
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	XORQ	CX,AX
+	RORQ	$0x2B,DX
+	RORQ	$0x20,CX
+	XORQ	SI,DX
+
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	XORQ	CX,DX
+	RORQ	$0x20,SI
+	ADDQ	AX,CX
+	ADDQ	DX,SI
+	RORQ	$0x2F,AX
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+
+	MOVQ	SI,BX
+	XORQ	AX,BX
+	XORQ	DX,BX
+	XORQ	CX,BX
+	MOVQ	BX,ret1+48(FP)
+
+	RET
diff --git a/hash_amd64.s b/hash_amd64.s
new file mode 100644
index 0000000..b3b87ee
--- /dev/null
+++ b/hash_amd64.s
@@ -0,0 +1,197 @@
+//go:build amd64 && !appengine && !gccgo
+// +build amd64,!appengine,!gccgo
+
+// This is a translation of the gcc output of FloodyBerry's pure-C public
+// domain siphash implementation at https://github.com/floodyberry/siphash
+// func Hash(k0, k1 uint64, b []byte) uint64
+TEXT	·Hash(SB),4,$0-48
+	MOVQ	k0+0(FP),CX
+	MOVQ	$0x736F6D6570736575,R9
+	MOVQ	k1+8(FP),DI
+	MOVQ	$0x6C7967656E657261,BX
+	MOVQ	$0x646F72616E646F6D,AX
+	MOVQ	b_len+24(FP),DX
+	MOVQ	DX,R11
+	MOVQ	DX,R10
+	XORQ	CX,R9
+	XORQ	CX,BX
+	MOVQ	$0x7465646279746573,CX
+	XORQ	DI,AX
+	XORQ	DI,CX
+	SHLQ	$0x38,R11
+	XORQ	DI,DI
+	MOVQ	b_base+16(FP),SI
+	ANDQ	$0xFFFFFFFFFFFFFFF8,R10
+	JE	afterLoop
+	XCHGQ	AX,AX
+loopBody:
+	MOVQ	0(SI)(DI*1),R8
+	ADDQ	AX,R9
+	RORQ	$0x33,AX
+	XORQ	R9,AX
+	RORQ	$0x20,R9
+	ADDQ	$0x8,DI
+	XORQ	R8,CX
+	ADDQ	CX,BX
+	RORQ	$0x30,CX
+	XORQ	BX,CX
+	ADDQ	AX,BX
+	RORQ	$0x2F,AX
+	ADDQ	CX,R9
+	RORQ	$0x2B,CX
+	XORQ	BX,AX
+	XORQ	R9,CX
+	RORQ	$0x20,BX
+	ADDQ	AX,R9
+	ADDQ	CX,BX
+	RORQ	$0x33,AX
+	RORQ	$0x30,CX
+	XORQ	R9,AX
+	XORQ	BX,CX
+	RORQ	$0x20,R9
+	ADDQ	AX,BX
+	ADDQ	CX,R9
+	RORQ	$0x2F,AX
+	RORQ	$0x2B,CX
+	XORQ	BX,AX
+	RORQ	$0x20,BX
+	XORQ	R9,CX
+	XORQ	R8,R9
+	CMPQ	R10,DI
+	JA	loopBody
+afterLoop:
+	ANDL	$7, DX
+	JZ	afterSwitch
+
+	// no support for jump tables
+
+	CMPQ	DX,$0x7
+	JE	sw7
+
+	CMPQ	DX,$0x6
+	JE	sw6
+
+	CMPQ	DX,$0x5
+	JE	sw5
+
+	CMPQ	DX,$0x4
+	JE	sw4
+
+	CMPQ	DX,$0x3
+	JE	sw3
+
+	CMPQ	DX,$0x2
+	JE	sw2
+
+	JMP	sw1
+
+sw7:	MOVBQZX	6(SI)(DI*1),DX
+	SHLQ	$0x30,DX
+	ORQ	DX,R11
+sw6:	MOVBQZX	0x5(SI)(DI*1),DX
+	SHLQ	$0x28,DX
+	ORQ	DX,R11
+sw5:	MOVBQZX	0x4(SI)(DI*1),DX
+	SHLQ	$0x20,DX
+	ORQ	DX,R11
+sw4:	MOVBQZX	0x3(SI)(DI*1),DX
+	SHLQ	$0x18,DX
+	ORQ	DX,R11
+sw3:	MOVBQZX	0x2(SI)(DI*1),DX
+	SHLQ	$0x10,DX
+	ORQ	DX,R11
+sw2:	MOVBQZX	0x1(SI)(DI*1),DX
+	SHLQ	$0x8,DX
+	ORQ	DX,R11
+sw1:	MOVBQZX	0(SI)(DI*1),DX
+	ORQ	DX,R11
+afterSwitch:
+	LEAQ	(AX)(R9*1),SI
+	XORQ	R11,CX
+	RORQ	$0x33,AX
+	ADDQ	CX,BX
+	MOVQ	CX,DX
+	XORQ	SI,AX
+	RORQ	$0x30,DX
+	RORQ	$0x20,SI
+	LEAQ	0(BX)(AX*1),CX
+	XORQ	BX,DX
+	RORQ	$0x2F,AX
+	ADDQ	DX,SI
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	XORQ	SI,AX
+	RORQ	$0x30,DX
+	RORQ	$0x20,SI
+	XORQ	CX,DX
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	ADDQ	DX,SI
+	XORQ	CX,AX
+	RORQ	$0x2B,DX
+	RORQ	$0x20,CX
+	XORQ	SI,DX
+	XORQ	R11,SI
+	XORB	$0xFF,CL
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	XORQ	CX,DX
+	RORQ	$0x20,SI
+	ADDQ	AX,CX
+	ADDQ	DX,SI
+	RORQ	$0x2F,AX
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	ADDQ	DX,CX
+	RORQ	$0x33,AX
+	RORQ	$0x30,DX
+	XORQ	SI,AX
+	RORQ	$0x20,SI
+	XORQ	CX,DX
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	ADDQ	DX,SI
+	RORQ	$0x2B,DX
+	XORQ	CX,AX
+	XORQ	SI,DX
+	RORQ	$0x20,CX
+	ADDQ	AX,SI
+	ADDQ	DX,CX
+	RORQ	$0x33,AX
+	RORQ	$0x30,DX
+	XORQ	CX,DX
+	XORQ	SI,AX
+	RORQ	$0x20,SI
+	ADDQ	DX,SI
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	XORQ	CX,AX
+	RORQ	$0x2B,DX
+	RORQ	$0x20,CX
+	XORQ	SI,DX
+	ADDQ	AX,SI
+	RORQ	$0x33,AX
+	ADDQ	DX,CX
+	XORQ	SI,AX
+	RORQ	$0x30,DX
+	XORQ	CX,DX
+	ADDQ	AX,CX
+	RORQ	$0x2F,AX
+	XORQ	CX,AX
+	RORQ	$0x2B,DX
+	RORQ	$0x20,CX
+	XORQ	DX,AX
+	XORQ	CX,AX
+	MOVQ	AX,ret+40(FP)
+	RET
diff --git a/hash_arm.go b/hash_arm.go
new file mode 100644
index 0000000..e52f86c
--- /dev/null
+++ b/hash_arm.go
@@ -0,0 +1,28 @@
+//go:build arm
+// +build arm
+
+package siphash
+
+// NB: the ARM implementation forgoes extra speed for Hash()
+// and Hash128() by simply reusing the same blocks() implementation
+// in assembly used by the streaming hash.
+
+func Hash(k0, k1 uint64, p []byte) uint64 {
+	var d digest
+	d.size = Size
+	d.k0 = k0
+	d.k1 = k1
+	d.Reset()
+	d.Write(p)
+	return d.Sum64()
+}
+
+func Hash128(k0, k1 uint64, p []byte) (uint64, uint64) {
+	var d digest
+	d.size = Size128
+	d.k0 = k0
+	d.k1 = k1
+	d.Reset()
+	d.Write(p)
+	return d.sum128()
+}
diff --git a/hash_asm.go b/hash_asm.go
new file mode 100644
index 0000000..c29fb49
--- /dev/null
+++ b/hash_asm.go
@@ -0,0 +1,25 @@
+//go:build amd64 && !appengine && !gccgo
+// +build amd64,!appengine,!gccgo
+
+// Written in 2012 by Dmitry Chestnykh.
+//
+// To the extent possible under law, the author have dedicated all copyright
+// and related and neighboring rights to this software to the public domain
+// worldwide. This software is distributed without any warranty.
+// http://creativecommons.org/publicdomain/zero/1.0/
+
+// This file contains a function definition for use with assembly implementations of Hash()
+
+package siphash
+
+//go:noescape
+
+// Hash returns the 64-bit SipHash-2-4 of the byte slice b, keyed with k0 and k1,
+// the two 64-bit parts of the 128-bit key. The body is provided in assembly.
+func Hash(k0, k1 uint64, b []byte) uint64
+
+//go:noescape
+
+// Hash128 returns the 128-bit SipHash-2-4 of the byte slice b as two 64-bit words,
+// keyed with k0 and k1, the two 64-bit parts of the 128-bit key.
+func Hash128(k0, k1 uint64, b []byte) (uint64, uint64)
diff --git a/siphash.go b/siphash.go
index 89d40ad..4a3cb49 100644
--- a/siphash.go
+++ b/siphash.go
@@ -1,4 +1,4 @@
-// Written in 2012 by Dmitry Chestnykh.
+// Written in 2012-2014 by Dmitry Chestnykh.
 //
 // To the extent possible under law, the author have dedicated all copyright
 // and related and neighboring rights to this software to the public domain
@@ -12,34 +12,52 @@ package siphash
 import "hash"
 
 const (
-	// The block size of hash algorithm in bytes.
+	// BlockSize is the block size of hash algorithm in bytes.
 	BlockSize = 8
-	// The size of hash output in bytes.
+
+	// Size is the size of hash output in bytes.
 	Size = 8
+
+	// Size128 is the size of 128-bit hash output in bytes.
+	Size128 = 16
 )
 
 type digest struct {
 	v0, v1, v2, v3 uint64  // state
 	k0, k1         uint64  // two parts of key
-	t              uint8   // message bytes counter (mod 256)
-	nx             int     // number of bytes in buffer x
 	x              [8]byte // buffer for unprocessed bytes
+	nx             int     // number of bytes in buffer x
+	size           int     // output size in bytes (8 or 16)
+	t              uint8   // message bytes counter (mod 256)
 }
 
-// New returns a new hash.Hash64 computing SipHash-2-4 with 16-byte key.
-func New(key []byte) hash.Hash64 {
+// newDigest returns a new digest with the given output size in bytes (must be 8 or 16).
+func newDigest(size int, key []byte) *digest {
+	if size != Size && size != Size128 {
+		panic("size must be 8 or 16")
+	}
 	d := new(digest)
-
 	d.k0 = uint64(key[0]) | uint64(key[1])<<8 | uint64(key[2])<<16 | uint64(key[3])<<24 |
 		uint64(key[4])<<32 | uint64(key[5])<<40 | uint64(key[6])<<48 | uint64(key[7])<<56
-
 	d.k1 = uint64(key[8]) | uint64(key[9])<<8 | uint64(key[10])<<16 | uint64(key[11])<<24 |
 		uint64(key[12])<<32 | uint64(key[13])<<40 | uint64(key[14])<<48 | uint64(key[15])<<56
-
+	d.size = size
 	d.Reset()
 	return d
 }
 
+// New returns a new hash.Hash64 computing SipHash-2-4 with 8-byte output; the first 16 bytes of key are used and a shorter key panics.
+func New(key []byte) hash.Hash64 {
+	return newDigest(Size, key)
+}
+
+// New128 returns a new hash.Hash computing SipHash-2-4 with 16-byte output; the first 16 bytes of key are used and a shorter key panics.
+//
+// Note that 16-byte output is considered experimental by SipHash authors at this time.
+func New128(key []byte) hash.Hash {
+	return newDigest(Size128, key)
+}
+
 func (d *digest) Reset() {
 	d.v0 = d.k0 ^ 0x736f6d6570736575
 	d.v1 = d.k1 ^ 0x646f72616e646f6d
@@ -47,67 +65,15 @@ func (d *digest) Reset() {
 	d.v3 = d.k1 ^ 0x7465646279746573
 	d.t = 0
 	d.nx = 0
+	if d.size == Size128 {
+		d.v1 ^= 0xee
+	}
 }
 
-func (d *digest) Size() int { return Size }
+func (d *digest) Size() int { return d.size }
 
 func (d *digest) BlockSize() int { return BlockSize }
 
-func blocks(d *digest, p []uint8) {
-	v0, v1, v2, v3 := d.v0, d.v1, d.v2, d.v3
-
-	for len(p) >= BlockSize {
-		m := uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 |
-			uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56
-
-		v3 ^= m
-
-		// Round 1.
-		v0 += v1
-		v1 = v1<<13 | v1>>(64-13)
-		v1 ^= v0
-		v0 = v0<<32 | v0>>(64-32)
-
-		v2 += v3
-		v3 = v3<<16 | v3>>(64-16)
-		v3 ^= v2
-
-		v0 += v3
-		v3 = v3<<21 | v3>>(64-21)
-		v3 ^= v0
-
-		v2 += v1
-		v1 = v1<<17 | v1>>(64-17)
-		v1 ^= v2
-		v2 = v2<<32 | v2>>(64-32)
-
-		// Round 2.
-		v0 += v1
-		v1 = v1<<13 | v1>>(64-13)
-		v1 ^= v0
-		v0 = v0<<32 | v0>>(64-32)
-
-		v2 += v3
-		v3 = v3<<16 | v3>>(64-16)
-		v3 ^= v2
-
-		v0 += v3
-		v3 = v3<<21 | v3>>(64-21)
-		v3 ^= v0
-
-		v2 += v1
-		v1 = v1<<17 | v1>>(64-17)
-		v1 ^= v2
-		v2 = v2<<32 | v2>>(64-32)
-
-		v0 ^= m
-
-		p = p[BlockSize:]
-	}
-
-	d.v0, d.v1, d.v2, d.v3 = v0, v1, v2, v3
-}
-
 func (d *digest) Write(p []byte) (nn int, err error) {
 	nn = len(p)
 	d.t += uint8(nn)
@@ -118,7 +84,7 @@ func (d *digest) Write(p []byte) (nn int, err error) {
 		}
 		d.nx += copy(d.x[d.nx:], p)
 		if d.nx == BlockSize {
-			blocks(d, d.x[:])
+			once(d)
 			d.nx = 0
 		}
 		p = p[n:]
@@ -134,7 +100,15 @@ func (d *digest) Write(p []byte) (nn int, err error) {
 	return
 }
 
-func (d0 *digest) Sum64() uint64 {
+func (d *digest) Sum64() uint64 {
+	for i := BlockSize - 2; i >= d.nx; i-- { // zero-pad the unused tail of the buffer
+		d.x[i] = 0
+	}
+	d.x[BlockSize-1] = d.t // last byte carries the message byte counter (mod 256)
+	return finalize(d)
+}
+
+func (d0 *digest) sum128() (r0, r1 uint64) {
 	// Make a copy of d0 so that caller can keep writing and summing.
 	d := *d0
 
@@ -145,7 +119,7 @@ func (d0 *digest) Sum64() uint64 {
 	blocks(&d, d.x[:])
 
 	v0, v1, v2, v3 := d.v0, d.v1, d.v2, d.v3
-	v2 ^= 0xff
+	v2 ^= 0xee
 
 	// Round 1.
 	v0 += v1
@@ -223,12 +197,122 @@ func (d0 *digest) Sum64() uint64 {
 	v1 ^= v2
 	v2 = v2<<32 | v2>>(64-32)
 
-	return v0 ^ v1 ^ v2 ^ v3
+	r0 = v0 ^ v1 ^ v2 ^ v3
+
+	v1 ^= 0xdd
+
+	// Round 1.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 2.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 3.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	// Round 4.
+	v0 += v1
+	v1 = v1<<13 | v1>>(64-13)
+	v1 ^= v0
+	v0 = v0<<32 | v0>>(64-32)
+
+	v2 += v3
+	v3 = v3<<16 | v3>>(64-16)
+	v3 ^= v2
+
+	v0 += v3
+	v3 = v3<<21 | v3>>(64-21)
+	v3 ^= v0
+
+	v2 += v1
+	v1 = v1<<17 | v1>>(64-17)
+	v1 ^= v2
+	v2 = v2<<32 | v2>>(64-32)
+
+	r1 = v0 ^ v1 ^ v2 ^ v3
+
+	return r0, r1
 }
 
 func (d *digest) Sum(in []byte) []byte {
-	v := d.Sum64()
-	in = append(in, byte(v), byte(v>>8), byte(v>>16), byte(v>>24),
-		byte(v>>32), byte(v>>40), byte(v>>48), byte(v>>56))
+	if d.size == Size {
+		r := d.Sum64()
+		in = append(in,
+			byte(r),
+			byte(r>>8),
+			byte(r>>16),
+			byte(r>>24),
+			byte(r>>32),
+			byte(r>>40),
+			byte(r>>48),
+			byte(r>>56))
+	} else {
+		r0, r1 := d.sum128()
+		in = append(in,
+			byte(r0),
+			byte(r0>>8),
+			byte(r0>>16),
+			byte(r0>>24),
+			byte(r0>>32),
+			byte(r0>>40),
+			byte(r0>>48),
+			byte(r0>>56),
+			byte(r1),
+			byte(r1>>8),
+			byte(r1>>16),
+			byte(r1>>24),
+			byte(r1>>32),
+			byte(r1>>40),
+			byte(r1>>48),
+			byte(r1>>56))
+	}
 	return in
 }
diff --git a/siphash_test.go b/siphash_test.go
index 5e01542..843b051 100644
--- a/siphash_test.go
+++ b/siphash_test.go
@@ -10,6 +10,7 @@ package siphash
 import (
 	"bytes"
 	"encoding/binary"
+	"encoding/hex"
 	"testing"
 )
 
@@ -130,6 +131,73 @@ var goldenRef = [][]byte{
 	{0x72, 0x45, 0x06, 0xeb, 0x4c, 0x32, 0x8a, 0x95},
 }
 
+var goldenRef128 = [][]byte{
+	{0xa3, 0x81, 0x7f, 0x04, 0xba, 0x25, 0xa8, 0xe6, 0x6d, 0xf6, 0x72, 0x14, 0xc7, 0x55, 0x02, 0x93},
+	{0xda, 0x87, 0xc1, 0xd8, 0x6b, 0x99, 0xaf, 0x44, 0x34, 0x76, 0x59, 0x11, 0x9b, 0x22, 0xfc, 0x45},
+	{0x81, 0x77, 0x22, 0x8d, 0xa4, 0xa4, 0x5d, 0xc7, 0xfc, 0xa3, 0x8b, 0xde, 0xf6, 0x0a, 0xff, 0xe4},
+	{0x9c, 0x70, 0xb6, 0x0c, 0x52, 0x67, 0xa9, 0x4e, 0x5f, 0x33, 0xb6, 0xb0, 0x29, 0x85, 0xed, 0x51},
+	{0xf8, 0x81, 0x64, 0xc1, 0x2d, 0x9c, 0x8f, 0xaf, 0x7d, 0x0f, 0x6e, 0x7c, 0x7b, 0xcd, 0x55, 0x79},
+	{0x13, 0x68, 0x87, 0x59, 0x80, 0x77, 0x6f, 0x88, 0x54, 0x52, 0x7a, 0x07, 0x69, 0x0e, 0x96, 0x27},
+	{0x14, 0xee, 0xca, 0x33, 0x8b, 0x20, 0x86, 0x13, 0x48, 0x5e, 0xa0, 0x30, 0x8f, 0xd7, 0xa1, 0x5e},
+	{0xa1, 0xf1, 0xeb, 0xbe, 0xd8, 0xdb, 0xc1, 0x53, 0xc0, 0xb8, 0x4a, 0xa6, 0x1f, 0xf0, 0x82, 0x39},
+	{0x3b, 0x62, 0xa9, 0xba, 0x62, 0x58, 0xf5, 0x61, 0x0f, 0x83, 0xe2, 0x64, 0xf3, 0x14, 0x97, 0xb4},
+	{0x26, 0x44, 0x99, 0x06, 0x0a, 0xd9, 0xba, 0xab, 0xc4, 0x7f, 0x8b, 0x02, 0xbb, 0x6d, 0x71, 0xed},
+	{0x00, 0x11, 0x0d, 0xc3, 0x78, 0x14, 0x69, 0x56, 0xc9, 0x54, 0x47, 0xd3, 0xf3, 0xd0, 0xfb, 0xba},
+	{0x01, 0x51, 0xc5, 0x68, 0x38, 0x6b, 0x66, 0x77, 0xa2, 0xb4, 0xdc, 0x6f, 0x81, 0xe5, 0xdc, 0x18},
+	{0xd6, 0x26, 0xb2, 0x66, 0x90, 0x5e, 0xf3, 0x58, 0x82, 0x63, 0x4d, 0xf6, 0x85, 0x32, 0xc1, 0x25},
+	{0x98, 0x69, 0xe2, 0x47, 0xe9, 0xc0, 0x8b, 0x10, 0xd0, 0x29, 0x93, 0x4f, 0xc4, 0xb9, 0x52, 0xf7},
+	{0x31, 0xfc, 0xef, 0xac, 0x66, 0xd7, 0xde, 0x9c, 0x7e, 0xc7, 0x48, 0x5f, 0xe4, 0x49, 0x49, 0x02},
+	{0x54, 0x93, 0xe9, 0x99, 0x33, 0xb0, 0xa8, 0x11, 0x7e, 0x08, 0xec, 0x0f, 0x97, 0xcf, 0xc3, 0xd9},
+	{0x6e, 0xe2, 0xa4, 0xca, 0x67, 0xb0, 0x54, 0xbb, 0xfd, 0x33, 0x15, 0xbf, 0x85, 0x23, 0x05, 0x77},
+	{0x47, 0x3d, 0x06, 0xe8, 0x73, 0x8d, 0xb8, 0x98, 0x54, 0xc0, 0x66, 0xc4, 0x7a, 0xe4, 0x77, 0x40},
+	{0xa4, 0x26, 0xe5, 0xe4, 0x23, 0xbf, 0x48, 0x85, 0x29, 0x4d, 0xa4, 0x81, 0xfe, 0xae, 0xf7, 0x23},
+	{0x78, 0x01, 0x77, 0x31, 0xcf, 0x65, 0xfa, 0xb0, 0x74, 0xd5, 0x20, 0x89, 0x52, 0x51, 0x2e, 0xb1},
+	{0x9e, 0x25, 0xfc, 0x83, 0x3f, 0x22, 0x90, 0x73, 0x3e, 0x93, 0x44, 0xa5, 0xe8, 0x38, 0x39, 0xeb},
+	{0x56, 0x8e, 0x49, 0x5a, 0xbe, 0x52, 0x5a, 0x21, 0x8a, 0x22, 0x14, 0xcd, 0x3e, 0x07, 0x1d, 0x12},
+	{0x4a, 0x29, 0xb5, 0x45, 0x52, 0xd1, 0x6b, 0x9a, 0x46, 0x9c, 0x10, 0x52, 0x8e, 0xff, 0x0a, 0xae},
+	{0xc9, 0xd1, 0x84, 0xdd, 0xd5, 0xa9, 0xf5, 0xe0, 0xcf, 0x8c, 0xe2, 0x9a, 0x9a, 0xbf, 0x69, 0x1c},
+	{0x2d, 0xb4, 0x79, 0xae, 0x78, 0xbd, 0x50, 0xd8, 0x88, 0x2a, 0x8a, 0x17, 0x8a, 0x61, 0x32, 0xad},
+	{0x8e, 0xce, 0x5f, 0x04, 0x2d, 0x5e, 0x44, 0x7b, 0x50, 0x51, 0xb9, 0xea, 0xcb, 0x8d, 0x8f, 0x6f},
+	{0x9c, 0x0b, 0x53, 0xb4, 0xb3, 0xc3, 0x07, 0xe8, 0x7e, 0xae, 0xe0, 0x86, 0x78, 0x14, 0x1f, 0x66},
+	{0xab, 0xf2, 0x48, 0xaf, 0x69, 0xa6, 0xea, 0xe4, 0xbf, 0xd3, 0xeb, 0x2f, 0x12, 0x9e, 0xeb, 0x94},
+	{0x06, 0x64, 0xda, 0x16, 0x68, 0x57, 0x4b, 0x88, 0xb9, 0x35, 0xf3, 0x02, 0x73, 0x58, 0xae, 0xf4},
+	{0xaa, 0x4b, 0x9d, 0xc4, 0xbf, 0x33, 0x7d, 0xe9, 0x0c, 0xd4, 0xfd, 0x3c, 0x46, 0x7c, 0x6a, 0xb7},
+	{0xea, 0x5c, 0x7f, 0x47, 0x1f, 0xaf, 0x6b, 0xde, 0x2b, 0x1a, 0xd7, 0xd4, 0x68, 0x6d, 0x22, 0x87},
+	{0x29, 0x39, 0xb0, 0x18, 0x32, 0x23, 0xfa, 0xfc, 0x17, 0x23, 0xde, 0x4f, 0x52, 0xc4, 0x3d, 0x35},
+	{0x7c, 0x39, 0x56, 0xca, 0x5e, 0xea, 0xfc, 0x3e, 0x36, 0x3e, 0x9d, 0x55, 0x65, 0x46, 0xeb, 0x68},
+	{0x77, 0xc6, 0x07, 0x71, 0x46, 0xf0, 0x1c, 0x32, 0xb6, 0xb6, 0x9d, 0x5f, 0x4e, 0xa9, 0xff, 0xcf},
+	{0x37, 0xa6, 0x98, 0x6c, 0xb8, 0x84, 0x7e, 0xdf, 0x09, 0x25, 0xf0, 0xf1, 0x30, 0x9b, 0x54, 0xde},
+	{0xa7, 0x05, 0xf0, 0xe6, 0x9d, 0xa9, 0xa8, 0xf9, 0x07, 0x24, 0x1a, 0x2e, 0x92, 0x3c, 0x8c, 0xc8},
+	{0x3d, 0xc4, 0x7d, 0x1f, 0x29, 0xc4, 0x48, 0x46, 0x1e, 0x9e, 0x76, 0xed, 0x90, 0x4f, 0x67, 0x11},
+	{0x0d, 0x62, 0xbf, 0x01, 0xe6, 0xfc, 0x0e, 0x1a, 0x0d, 0x3c, 0x47, 0x51, 0xc5, 0xd3, 0x69, 0x2b},
+	{0x8c, 0x03, 0x46, 0x8b, 0xca, 0x7c, 0x66, 0x9e, 0xe4, 0xfd, 0x5e, 0x08, 0x4b, 0xbe, 0xe7, 0xb5},
+	{0x52, 0x8a, 0x5b, 0xb9, 0x3b, 0xaf, 0x2c, 0x9c, 0x44, 0x73, 0xcc, 0xe5, 0xd0, 0xd2, 0x2b, 0xd9},
+	{0xdf, 0x6a, 0x30, 0x1e, 0x95, 0xc9, 0x5d, 0xad, 0x97, 0xae, 0x0c, 0xc8, 0xc6, 0x91, 0x3b, 0xd8},
+	{0x80, 0x11, 0x89, 0x90, 0x2c, 0x85, 0x7f, 0x39, 0xe7, 0x35, 0x91, 0x28, 0x5e, 0x70, 0xb6, 0xdb},
+	{0xe6, 0x17, 0x34, 0x6a, 0xc9, 0xc2, 0x31, 0xbb, 0x36, 0x50, 0xae, 0x34, 0xcc, 0xca, 0x0c, 0x5b},
+	{0x27, 0xd9, 0x34, 0x37, 0xef, 0xb7, 0x21, 0xaa, 0x40, 0x18, 0x21, 0xdc, 0xec, 0x5a, 0xdf, 0x89},
+	{0x89, 0x23, 0x7d, 0x9d, 0xed, 0x9c, 0x5e, 0x78, 0xd8, 0xb1, 0xc9, 0xb1, 0x66, 0xcc, 0x73, 0x42},
+	{0x4a, 0x6d, 0x80, 0x91, 0xbf, 0x5e, 0x7d, 0x65, 0x11, 0x89, 0xfa, 0x94, 0xa2, 0x50, 0xb1, 0x4c},
+	{0x0e, 0x33, 0xf9, 0x60, 0x55, 0xe7, 0xae, 0x89, 0x3f, 0xfc, 0x0e, 0x3d, 0xcf, 0x49, 0x29, 0x02},
+	{0xe6, 0x1c, 0x43, 0x2b, 0x72, 0x0b, 0x19, 0xd1, 0x8e, 0xc8, 0xd8, 0x4b, 0xdc, 0x63, 0x15, 0x1b},
+	{0xf7, 0xe5, 0xae, 0xf5, 0x49, 0xf7, 0x82, 0xcf, 0x37, 0x90, 0x55, 0xa6, 0x08, 0x26, 0x9b, 0x16},
+	{0x43, 0x8d, 0x03, 0x0f, 0xd0, 0xb7, 0xa5, 0x4f, 0xa8, 0x37, 0xf2, 0xad, 0x20, 0x1a, 0x64, 0x03},
+	{0xa5, 0x90, 0xd3, 0xee, 0x4f, 0xbf, 0x04, 0xe3, 0x24, 0x7e, 0x0d, 0x27, 0xf2, 0x86, 0x42, 0x3f},
+	{0x5f, 0xe2, 0xc1, 0xa1, 0x72, 0xfe, 0x93, 0xc4, 0xb1, 0x5c, 0xd3, 0x7c, 0xae, 0xf9, 0xf5, 0x38},
+	{0x2c, 0x97, 0x32, 0x5c, 0xbd, 0x06, 0xb3, 0x6e, 0xb2, 0x13, 0x3d, 0xd0, 0x8b, 0x3a, 0x01, 0x7c},
+	{0x92, 0xc8, 0x14, 0x22, 0x7a, 0x6b, 0xca, 0x94, 0x9f, 0xf0, 0x65, 0x9f, 0x00, 0x2a, 0xd3, 0x9e},
+	{0xdc, 0xe8, 0x50, 0x11, 0x0b, 0xd8, 0x32, 0x8c, 0xfb, 0xd5, 0x08, 0x41, 0xd6, 0x91, 0x1d, 0x87},
+	{0x67, 0xf1, 0x49, 0x84, 0xc7, 0xda, 0x79, 0x12, 0x48, 0xe3, 0x2b, 0xb5, 0x92, 0x25, 0x83, 0xda},
+	{0x19, 0x38, 0xf2, 0xcf, 0x72, 0xd5, 0x4e, 0xe9, 0x7e, 0x94, 0x16, 0x6f, 0xa9, 0x1d, 0x2a, 0x36},
+	{0x74, 0x48, 0x1e, 0x96, 0x46, 0xed, 0x49, 0xfe, 0x0f, 0x62, 0x24, 0x30, 0x16, 0x04, 0x69, 0x8e},
+	{0x57, 0xfc, 0xa5, 0xde, 0x98, 0xa9, 0xd6, 0xd8, 0x00, 0x64, 0x38, 0xd0, 0x58, 0x3d, 0x8a, 0x1d},
+	{0x9f, 0xec, 0xde, 0x1c, 0xef, 0xdc, 0x1c, 0xbe, 0xd4, 0x76, 0x36, 0x74, 0xd9, 0x57, 0x53, 0x59},
+	{0xe3, 0x04, 0x0c, 0x00, 0xeb, 0x28, 0xf1, 0x53, 0x66, 0xca, 0x73, 0xcb, 0xd8, 0x72, 0xe7, 0x40},
+	{0x76, 0x97, 0x00, 0x9a, 0x6a, 0x83, 0x1d, 0xfe, 0xcc, 0xa9, 0x1c, 0x59, 0x93, 0x67, 0x0f, 0x7a},
+	{0x58, 0x53, 0x54, 0x23, 0x21, 0xf5, 0x67, 0xa0, 0x05, 0xd5, 0x47, 0xa4, 0xf0, 0x47, 0x59, 0xbd},
+	{0x51, 0x50, 0xd1, 0x77, 0x2f, 0x50, 0x83, 0x4a, 0x50, 0x3e, 0x06, 0x9a, 0x97, 0x3f, 0xbd, 0x7c},
+}
+
 func TestSum64(t *testing.T) {
 	for i, v := range golden {
 		h := New(v.k)
@@ -167,6 +235,43 @@ func TestSum(t *testing.T) {
 	}
 }
 
+func TestSumUnaligned(t *testing.T) { // streaming digest must match goldenRef when the input starts at offsets 1..7 of a buffer
+	const align = 8 // offsets below this are tried; also the spare room reserved in the input buffer
+	var k [16]byte
+	var in [64 + align]byte
+	for i := range k {
+		k[i] = byte(i) // key bytes are 0, 1, ..., 15
+	}
+
+	for a := 1; a < align; a++ { // a is the offset of the message inside in
+		for i := 0; i < 64; i++ { // i is the message length in bytes
+			in[a+i] = byte(i) // fills one byte ahead; the slice below covers values 0..i-1
+			h := New(k[:])
+			h.Write(in[a : a+i])
+			if sum := h.Sum(nil); !bytes.Equal(sum, goldenRef[i]) {
+				t.Errorf(`%d: expected "%x", got "%x"`, i, goldenRef[i], sum)
+			}
+		}
+	}
+}
+
+func TestSum128(t *testing.T) { // New128 digests must match goldenRef128 for message lengths 0..63
+	var k [16]byte
+	var in [64]byte
+	for i := range k {
+		k[i] = byte(i) // key bytes are 0, 1, ..., 15
+	}
+
+	for i := 0; i < 64; i++ {
+		in[i] = byte(i)
+		h := New128(k[:])
+		h.Write(in[:i]) // message is the i bytes 0, 1, ..., i-1
+		if sum := h.Sum(nil); !bytes.Equal(sum, goldenRef128[i]) {
+			t.Errorf(`%d: expected "%x", got "%x"`, i, goldenRef128[i], sum)
+		}
+	}
+}
+
 func TestHash(t *testing.T) {
 	var k0, k1 uint64
 	for i, v := range golden {
@@ -194,10 +299,118 @@ func TestHash(t *testing.T) {
 	}
 }
 
-var key = zeroKey
-var key0, key1 uint64
-var bench = New(key)
-var buf = make([]byte, 8<<10)
+func TestHashUnaligned(t *testing.T) { // Hash must match goldenRef when the input starts at offsets 1..7 of a buffer
+	const align = 8 // offsets below this are tried; also the spare room reserved in the input buffer
+	var k0, k1 uint64
+	var k [16]byte
+	var in [64 + align]byte
+
+	for i := range k {
+		k[i] = byte(i) // key bytes are 0, 1, ..., 15
+	}
+	k0 = binary.LittleEndian.Uint64(k[0:8]) // key halves are read as little-endian words
+	k1 = binary.LittleEndian.Uint64(k[8:16])
+
+	for a := 1; a < align; a++ { // a is the offset of the message inside in
+		for i := 0; i < 64; i++ { // i is the message length in bytes
+			in[a+i] = byte(i)
+			ref := binary.LittleEndian.Uint64(goldenRef[i]) // expected digest as a little-endian word
+			if sum := Hash(k0, k1, in[a:a+i]); sum != ref {
+				t.Errorf(`%d: expected "%x", got "%x"`, i, ref, sum)
+			}
+		}
+	}
+}
+
+func TestHash128(t *testing.T) { // Hash128 must match goldenRef128 for message lengths 0..63
+	var k0, k1 uint64
+
+	var k [16]byte
+	var in [64]byte
+	for i := range k {
+		k[i] = byte(i) // key bytes are 0, 1, ..., 15
+	}
+	k0 = binary.LittleEndian.Uint64(k[0:8]) // key halves are read as little-endian words
+	k1 = binary.LittleEndian.Uint64(k[8:16])
+
+	for i := 0; i < 64; i++ {
+		in[i] = byte(i)
+		ref0 := binary.LittleEndian.Uint64(goldenRef128[i][0:]) // first 8 expected bytes
+		ref1 := binary.LittleEndian.Uint64(goldenRef128[i][8:]) // last 8 expected bytes
+		if sum0, sum1 := Hash128(k0, k1, in[:i]); sum0 != ref0 || sum1 != ref1 {
+			t.Errorf(`%d: expected "%x, %x", got "%x, %x"`, i, ref0, ref1, sum0, sum1)
+		}
+	}
+}
+
+func TestAlign(t *testing.T) { // one-shot and streaming hashes must agree with fixed values for d[0:], d[1:], d[2:], d[3:]
+	data := "0076a9143219adce9b6f0a21fd53cb17e2fd9b2b4fac40b388ac"
+	k0 := uint64(316665572293978160)
+	k1 := uint64(8573005253291875333)
+
+	want := []uint64{ // expected 64-bit hash of d[i:], indexed by i
+		16380727507974277821,
+		16770526497674945769,
+		11373998677292870540,
+		10374222295991299613,
+	}
+	want128 := []uint64{ // expected 128-bit hash of d[i:], stored as pairs at 2*i and 2*i+1
+		14802151199638645495,
+		13251497035884452880,
+		7034723853391616289,
+		16742813562040528752,
+		10468120447644272532,
+		10941274532208162335,
+		11293904790559355408,
+		15432350433573653068,
+	}
+
+	d, err := hex.DecodeString(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var k [16]byte // same key as k0/k1, serialised little-endian for newDigest
+	binary.LittleEndian.PutUint64(k[0:], k0)
+	binary.LittleEndian.PutUint64(k[8:], k1)
+
+	for i := range want { // i is the number of leading bytes dropped from d
+		res := Hash(k0, k1, d[i:])
+		if res != want[i] {
+			t.Fatalf("Expected %v got %v", want[i], res)
+		}
+		reslo, reshi := Hash128(k0, k1, d[i:])
+		if reslo != want128[i*2] {
+			t.Fatalf("Expected %v got %v", want128[i*2], reslo)
+		}
+		if reshi != want128[i*2+1] {
+			t.Fatalf("Expected %v got %v", want128[i*2+1], reshi)
+		}
+		dig := newDigest(Size, k[:]) // repeat the 64-bit check through the streaming digest
+		dig.Write(d[i:])
+		res = dig.Sum64()
+		if res != want[i] {
+			t.Fatalf("Expected %v got %v", want[i], res)
+		}
+		dig128 := newDigest(Size128, k[:]) // repeat the 128-bit check through the streaming digest
+		dig128.Write(d[i:])
+		reslo, reshi = dig128.sum128()
+		if reslo != want128[i*2] {
+			t.Fatalf("Expected %v got %v", want128[i*2], reslo)
+		}
+		if reshi != want128[i*2+1] {
+			t.Fatalf("Expected %v got %v", want128[i*2+1], reshi)
+		}
+	}
+}
+
+var ( // shared fixtures for the benchmarks below
+	key        = zeroKey // key handed to New and New128 for the streaming benchmarks
+	key0, key1 uint64 // key halves passed to Hash and Hash128 by the benchmarks
+	bench      = New(key) // 64-bit streaming digest reused by the benchmarks
+	bench128   = New128(key) // 128-bit streaming digest reused by the benchmarks
+	buf        = make([]byte, 8<<10) // 8 KiB input buffer that the benchmarks slice to size
+)
 
 func BenchmarkHash8(b *testing.B) {
 	b.SetBytes(8)
@@ -241,6 +454,13 @@ func BenchmarkHash1K(b *testing.B) {
 	}
 }
 
+func BenchmarkHash1Kunaligned(b *testing.B) { // Hash over 1024 bytes that start one byte into buf
+	b.SetBytes(1024) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash(key0, key1, buf[1:1025]) // offset 1 shifts the input start relative to buf
+	}
+}
+
 func BenchmarkHash8K(b *testing.B) {
 	b.SetBytes(int64(len(buf)))
 	for i := 0; i < b.N; i++ {
@@ -248,6 +468,55 @@ func BenchmarkHash8K(b *testing.B) {
 	}
 }
 
+func BenchmarkHash128_8(b *testing.B) { // one-shot Hash128 over an 8-byte input
+	b.SetBytes(8) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash128(key0, key1, buf[:8])
+	}
+}
+
+func BenchmarkHash128_16(b *testing.B) { // one-shot Hash128 over a 16-byte input
+	b.SetBytes(16) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash128(key0, key1, buf[:16])
+	}
+}
+
+func BenchmarkHash128_40(b *testing.B) { // one-shot Hash128 over a 40-byte input
+	b.SetBytes(40) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash128(key0, key1, buf[:40])
+	}
+}
+
+func BenchmarkHash128_64(b *testing.B) { // one-shot Hash128 over a 64-byte input
+	b.SetBytes(64) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash128(key0, key1, buf[:64])
+	}
+}
+
+func BenchmarkHash128_128(b *testing.B) { // one-shot Hash128 over a 128-byte input
+	b.SetBytes(128) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash128(key0, key1, buf[:128])
+	}
+}
+
+func BenchmarkHash128_1K(b *testing.B) { // one-shot Hash128 over a 1024-byte input
+	b.SetBytes(1024) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash128(key0, key1, buf[:1024])
+	}
+}
+
+func BenchmarkHash128_8K(b *testing.B) { // one-shot Hash128 over the whole of buf
+	b.SetBytes(int64(len(buf))) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		Hash128(key0, key1, buf)
+	}
+}
+
 func BenchmarkFull8(b *testing.B) {
 	b.SetBytes(8)
 	for i := 0; i < b.N; i++ {
@@ -302,6 +571,15 @@ func BenchmarkFull1K(b *testing.B) {
 	}
 }
 
+func BenchmarkFull1Kunaligned(b *testing.B) { // streaming Reset/Write/Sum64 over 1024 bytes that start one byte into buf
+	b.SetBytes(1024) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		bench.Reset()
+		bench.Write(buf[1:1025]) // offset 1 shifts the input start relative to buf
+		bench.Sum64()
+	}
+}
+
 func BenchmarkFull8K(b *testing.B) {
 	b.SetBytes(int64(len(buf)))
 	for i := 0; i < b.N; i++ {
@@ -310,3 +588,66 @@ func BenchmarkFull8K(b *testing.B) {
 		bench.Sum64()
 	}
 }
+
+func BenchmarkFull128_8(b *testing.B) { // streaming 128-bit Reset/Write/Sum over an 8-byte input
+	b.SetBytes(8) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		bench128.Reset()
+		bench128.Write(buf[:8])
+		bench128.Sum(nil)
+	}
+}
+
+func BenchmarkFull128_16(b *testing.B) { // streaming 128-bit Reset/Write/Sum over a 16-byte input
+	b.SetBytes(16) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		bench128.Reset()
+		bench128.Write(buf[:16])
+		bench128.Sum(nil)
+	}
+}
+
+func BenchmarkFull128_40(b *testing.B) { // streaming 128-bit Reset/Write/Sum over a 40-byte input
+	b.SetBytes(40) // must equal the number of bytes written below
+	for i := 0; i < b.N; i++ {
+		bench128.Reset()
+		bench128.Write(buf[:40])
+		bench128.Sum(nil)
+	}
+}
+
+func BenchmarkFull128_64(b *testing.B) { // streaming 128-bit Reset/Write/Sum over a 64-byte input
+	b.SetBytes(64) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		bench128.Reset()
+		bench128.Write(buf[:64])
+		bench128.Sum(nil)
+	}
+}
+
+func BenchmarkFull128_128(b *testing.B) { // streaming 128-bit Reset/Write/Sum over a 128-byte input
+	b.SetBytes(128) // must equal the number of bytes written below
+	for i := 0; i < b.N; i++ {
+		bench128.Reset()
+		bench128.Write(buf[:128])
+		bench128.Sum(nil)
+	}
+}
+
+func BenchmarkFull128_1K(b *testing.B) { // streaming 128-bit Reset/Write/Sum over a 1024-byte input
+	b.SetBytes(1024) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		bench128.Reset()
+		bench128.Write(buf[:1024])
+		bench128.Sum(nil)
+	}
+}
+
+func BenchmarkFull128_8K(b *testing.B) { // streaming 128-bit Reset/Write/Sum over the whole of buf
+	b.SetBytes(int64(len(buf))) // bytes processed per iteration
+	for i := 0; i < b.N; i++ {
+		bench128.Reset()
+		bench128.Write(buf)
+		bench128.Sum(nil)
+	}
+}

More details

Full run details

Historical runs