diff --git a/.travis.yml b/.travis.yml
index 00c2bea..744e64c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,3 +21,4 @@
 script:
 - diff -au <(gofmt -d .) <(printf "")
 - go test -race -v ./...
+- go tool vet -asmdecl .
diff --git a/sha256blockAvx2_amd64.s b/sha256blockAvx2_amd64.s
index 1427f3a..079d6b9 100644
--- a/sha256blockAvx2_amd64.s
+++ b/sha256blockAvx2_amd64.s
@@ -31,8 +31,6 @@
 // github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
 // equivalents
 //
-
-#include "textflag.h"
 
 DATA K256<>+0x000(SB)/8, $0x71374491428a2f98
 DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf
@@ -114,16 +112,25 @@
 
 GLOBL K256<>(SB), 8, $608
 
+// We need 0x220 bytes of stack space aligned on a 512-byte boundary, so for
+// the worst-case aligned SP we need twice this amount, i.e. 1088 (=0x440) bytes
+//
+// SP        aligned   end-aligned  stacksize
+// 100013d0  10001400  10001620     592
+// 100013d8  10001400  10001620     584
+// 100013e0  10001600  10001820     1088
+// 100013e8  10001600  10001820     1080
+
 // func blockAvx2(h []uint32, message []uint8)
-TEXT ·blockAvx2(SB), 7, $0
-
-	MOVQ ctx+0(FP), DI           // DI: &h
-	MOVQ inp+24(FP), SI          // SI: &message
-	MOVQ inplength+32(FP), DX    // len(message)
+TEXT ·blockAvx2(SB),$1088-48
+
+	MOVQ h+0(FP), DI             // DI: &h
+	MOVQ message_base+24(FP), SI // SI: &message
+	MOVQ message_len+32(FP), DX  // len(message)
 	ADDQ SI, DX                  // end pointer of input
 	MOVQ SP, R11                 // copy stack pointer
-	SUBQ $0x220, SP              // sp -= 0x220
-	ANDQ $0xfffffffffffffc00, SP // align stack frame
+	ADDQ $0x220, SP              // sp += 0x220
+	ANDQ $0xfffffffffffffe00, SP // align stack frame
 	ADDQ $0x1c0, SP
 	MOVQ DI, 0x40(SP)            // save ctx
 	MOVQ SI, 0x48(SP)            // save input
@@ -1435,7 +1442,7 @@
 
 done:
 	MOVQ BP, SP
-	MOVQ 0x58(SP), SP
+	MOVQ 0x58(SP), SP        // restore saved stack pointer
 	WORD $0xf8c5; BYTE $0x77 // vzeroupper
 
 	RET
diff --git a/sha256blockAvx512_amd64.s b/sha256blockAvx512_amd64.s
index 0ac97b2..14ae8a2 100644
--- a/sha256blockAvx512_amd64.s
+++ b/sha256blockAvx512_amd64.s
@@ -2,7 +2,7 @@
 	MOVQ  digests+0(FP), DI
 	MOVQ  scratch+8(FP), R12
 	MOVQ  mask_len+32(FP), SI
-	MOVQ  r14+24(FP), R13
+	MOVQ  mask_base+24(FP), R13
 	MOVQ  (R13), R14
 	LONG  $0x92fbc1c4; BYTE $0xce
 	LEAQ  inputs+48(FP), AX
diff --git a/sha256blockAvx_amd64.s b/sha256blockAvx_amd64.s
index 6645519..4a6b28d 100644
--- a/sha256blockAvx_amd64.s
+++ b/sha256blockAvx_amd64.s
@@ -232,15 +232,15 @@
 	ADDL R13, h                // h = h + S1 + CH + k + w + S0 + MAJ
 
 // func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
-TEXT ·blockAvx(SB), 7, $0
-
-	MOVQ h+0(FP), SI           // SI: &h
-	MOVQ message+24(FP), R8    // &message
-	MOVQ lenmessage+32(FP), R9 // length of message
+TEXT ·blockAvx(SB), 7, $0-80
+
+	MOVQ h+0(FP), SI             // SI: &h
+	MOVQ message_base+24(FP), R8 // &message
+	MOVQ message_len+32(FP), R9  // length of message
 	CMPQ R9, $0
 	JEQ  done_hash
 	ADDQ R8, R9
-	MOVQ R9, _inp_end+64(FP)   // store end of message
+	MOVQ R9, reserved2+64(FP)    // store end of message
 
 	// Register definition
 	//  a -->  eax
@@ -269,7 +269,7 @@
 	MOVOU shuf00BA<>(SB), X10  // shuffle xBxA -> 00BA
 	MOVOU shufDC00<>(SB), X12  // shuffle xDxC -> DC00
 
-	MOVQ message+24(FP), SI // SI: &message
+	MOVQ message_base+24(FP), SI // SI: &message
 
 loop0:
 	LEAQ constants<>(SB), BP
@@ -284,25 +284,25 @@
 	MOVOU 3*16(SI), X7
 	LONG  $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13
 
-	MOVQ SI, _inp+72(FP)
+	MOVQ SI, reserved3+72(FP)
 	MOVD $0x3, DI
 
 	// schedule 48 input dwords, by doing 3 rounds of 16 each
 loop1:
 	LONG  $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP]   /* Add 1st constant to first part of message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
 
 	LONG  $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP]   /* Add 2nd constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
 
 	LONG  $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP]   /* Add 3rd constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
 
 	LONG  $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP]   /* Add 4th constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	ADDQ  $64, BP
 	FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
 
@@ -313,14 +313,14 @@
 
 loop2:
 	LONG  $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP]   /* Add 1st constant to first part of message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	DO_ROUND( AX,  BX,  CX,  R8,  DX,  R9, R10, R11, 48)
 	DO_ROUND(R11,  AX,  BX,  CX,  R8,  DX,  R9, R10, 52)
 	DO_ROUND(R10, R11,  AX,  BX,  CX,  R8,  DX,  R9, 56)
 	DO_ROUND( R9, R10, R11,  AX,  BX,  CX,  R8,  DX, 60)
 
 	LONG  $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP]   /* Add 2nd constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	ADDQ  $32, BP
 	DO_ROUND( DX,  R9, R10, R11,  AX,  BX,  CX,  R8, 48)
 	DO_ROUND( R8,  DX,  R9, R10, R11,  AX,  BX,  CX, 52)
@@ -351,9 +351,9 @@
 	ADDL (7*4)(SI), R11 // H7 = h + H7
 	MOVL R11, (7*4)(SI)
 
-	MOVQ _inp+72(FP), SI
+	MOVQ reserved3+72(FP), SI
 	ADDQ $64, SI
-	CMPQ _inp_end+64(FP), SI
+	CMPQ reserved2+64(FP), SI
 	JNE  loop0
 
 done_hash:
diff --git a/sha256blockSsse_amd64.s b/sha256blockSsse_amd64.s
index ac68d9b..71666fc 100644
--- a/sha256blockSsse_amd64.s
+++ b/sha256blockSsse_amd64.s
@@ -244,15 +244,15 @@
 	ADDL R13, h                // h = h + S1 + CH + k + w + S0 + MAJ
 
 // func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
-TEXT ·blockSsse(SB), 7, $0
-
-	MOVQ h+0(FP), SI           // SI: &h
-	MOVQ message+24(FP), R8    // &message
-	MOVQ lenmessage+32(FP), R9 // length of message
+TEXT ·blockSsse(SB), 7, $0-80
+
+	MOVQ h+0(FP), SI             // SI: &h
+	MOVQ message_base+24(FP), R8 // &message
+	MOVQ message_len+32(FP), R9  // length of message
 	CMPQ R9, $0
 	JEQ  done_hash
 	ADDQ R8, R9
-	MOVQ R9, _inp_end+64(FP)   // store end of message
+	MOVQ R9, reserved2+64(FP)    // store end of message
 
 	// Register definition
 	//  a -->  eax
@@ -281,7 +281,7 @@
 	MOVOU shuf00BA<>(SB), X10  // shuffle xBxA -> 00BA
 	MOVOU shufDC00<>(SB), X12  // shuffle xDxC -> DC00
 
-	MOVQ message+24(FP), SI // SI: &message
+	MOVQ message_base+24(FP), SI // SI: &message
 
 loop0:
 	LEAQ constants<>(SB), BP
@@ -296,7 +296,7 @@
 	MOVOU 3*16(SI), X7
 	LONG  $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
 
-	MOVQ SI, _inp+72(FP)
+	MOVQ SI, reserved3+72(FP)
 	MOVD $0x3, DI
 
 	// Align
@@ -306,22 +306,22 @@
 loop1:
 	MOVOU X4, X9
 	LONG  $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP]   /* Add 1st constant to first part of message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
 
 	MOVOU X4, X9
 	LONG  $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP]   /* Add 2nd constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
 
 	MOVOU X4, X9
 	LONG  $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP]   /* Add 3rd constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
 
 	MOVOU X4, X9
 	LONG  $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP]   /* Add 4th constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	ADDQ  $64, BP
 	FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
 
@@ -333,7 +333,7 @@
 loop2:
 	MOVOU X4, X9
 	LONG  $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP]   /* Add 1st constant to first part of message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	DO_ROUND( AX,  BX,  CX,  R8,  DX,  R9, R10, R11, 48)
 	DO_ROUND(R11,  AX,  BX,  CX,  R8,  DX,  R9, R10, 52)
 	DO_ROUND(R10, R11,  AX,  BX,  CX,  R8,  DX,  R9, 56)
@@ -341,7 +341,7 @@
 
 	MOVOU X5, X9
 	LONG  $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP]   /* Add 2nd constant to message */
-	MOVOU X9, _xfer+48(FP)
+	MOVOU X9, reserved0+48(FP)
 	ADDQ  $32, BP
 	DO_ROUND( DX,  R9, R10, R11,  AX,  BX,  CX,  R8, 48)
 	DO_ROUND( R8,  DX,  R9, R10, R11,  AX,  BX,  CX, 52)
@@ -372,9 +372,9 @@
 	ADDL (7*4)(SI), R11 // H7 = h + H7
 	MOVL R11, (7*4)(SI)
 
-	MOVQ _inp+72(FP), SI
+	MOVQ reserved3+72(FP), SI
 	ADDQ $64, SI
-	CMPQ _inp_end+64(FP), SI
+	CMPQ reserved2+64(FP), SI
 	JNE  loop0
 
 done_hash: