diff --git a/.travis.yml b/.travis.yml index 00c2bea..744e64c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,3 +21,4 @@ script: - diff -au <(gofmt -d .) <(printf "") - go test -race -v ./... +- go tool vet -asmdecl . diff --git a/sha256blockAvx2_amd64.s b/sha256blockAvx2_amd64.s index 1427f3a..079d6b9 100644 --- a/sha256blockAvx2_amd64.s +++ b/sha256blockAvx2_amd64.s @@ -31,8 +31,6 @@ // github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9 // equivalents // - -#include "textflag.h" DATA K256<>+0x000(SB)/8, $0x71374491428a2f98 DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf @@ -114,16 +112,25 @@ GLOBL K256<>(SB), 8, $608 +// We need 0x220 stack space aligned on a 512 boundary, so for the +// worstcase-aligned SP we need twice this amount, being 1088 (=0x440) +// +// SP aligned end-aligned stacksize +// 100013d0 10001400 10001620 592 +// 100013d8 10001400 10001620 584 +// 100013e0 10001600 10001820 1088 +// 100013e8 10001600 10001820 1080 + // func blockAvx2(h []uint32, message []uint8) -TEXT ·blockAvx2(SB), 7, $0 - - MOVQ ctx+0(FP), DI // DI: &h - MOVQ inp+24(FP), SI // SI: &message - MOVQ inplength+32(FP), DX // len(message) +TEXT ·blockAvx2(SB),$1088-48 + + MOVQ h+0(FP), DI // DI: &h + MOVQ message_base+24(FP), SI // SI: &message + MOVQ message_len+32(FP), DX // len(message) ADDQ SI, DX // end pointer of input MOVQ SP, R11 // copy stack pointer - SUBQ $0x220, SP // sp -= 0x220 - ANDQ $0xfffffffffffffc00, SP // align stack frame + ADDQ $0x220, SP // sp += 0x220 + ANDQ $0xfffffffffffffe00, SP // align stack frame ADDQ $0x1c0, SP MOVQ DI, 0x40(SP) // save ctx MOVQ SI, 0x48(SP) // save input @@ -1435,7 +1442,7 @@ done: MOVQ BP, SP - MOVQ 0x58(SP), SP + MOVQ 0x58(SP), SP // restore saved stack pointer WORD $0xf8c5; BYTE $0x77 // vzeroupper RET diff --git a/sha256blockAvx512_amd64.s b/sha256blockAvx512_amd64.s index 0ac97b2..14ae8a2 100644 --- a/sha256blockAvx512_amd64.s +++ b/sha256blockAvx512_amd64.s @@ -2,7 +2,7 @@ MOVQ digests+0(FP), DI MOVQ scratch+8(FP), R12 MOVQ mask_len+32(FP), SI - MOVQ r14+24(FP), R13 + MOVQ mask_base+24(FP), R13 MOVQ (R13), R14 LONG $0x92fbc1c4; BYTE $0xce LEAQ inputs+48(FP), AX diff --git a/sha256blockAvx_amd64.s b/sha256blockAvx_amd64.s index 6645519..4a6b28d 100644 --- a/sha256blockAvx_amd64.s +++ b/sha256blockAvx_amd64.s @@ -232,15 +232,15 @@ ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ // func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) -TEXT ·blockAvx(SB), 7, $0 - - MOVQ h+0(FP), SI // SI: &h - MOVQ message+24(FP), R8 // &message - MOVQ lenmessage+32(FP), R9 // length of message +TEXT ·blockAvx(SB), 7, $0-80 + + MOVQ h+0(FP), SI // SI: &h + MOVQ message_base+24(FP), R8 // &message + MOVQ message_len+32(FP), R9 // length of message CMPQ R9, $0 JEQ done_hash ADDQ R8, R9 - MOVQ R9, _inp_end+64(FP) // store end of message + MOVQ R9, reserved2+64(FP) // store end of message // Register definition // a --> eax @@ -269,7 +269,7 @@ MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00 - MOVQ message+24(FP), SI // SI: &message + MOVQ message_base+24(FP), SI // SI: &message loop0: LEAQ constants<>(SB), BP @@ -284,25 +284,25 @@ MOVOU 3*16(SI), X7 LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13 - MOVQ SI, _inp+72(FP) + MOVQ SI, reserved3+72(FP) MOVD $0x3, DI // schedule 48 input dwords, by doing 3 rounds of 16 each loop1: LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) ADDQ $64, BP FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) @@ -313,14 +313,14 @@ loop2: LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48) DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52) DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56) DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60) LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) ADDQ $32, BP DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48) DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52) @@ -351,9 +351,9 @@ ADDL (7*4)(SI), R11 // H7 = h + H7 MOVL R11, (7*4)(SI) - MOVQ _inp+72(FP), SI + MOVQ reserved3+72(FP), SI ADDQ $64, SI - CMPQ _inp_end+64(FP), SI + CMPQ reserved2+64(FP), SI JNE loop0 done_hash: diff --git a/sha256blockSsse_amd64.s b/sha256blockSsse_amd64.s index ac68d9b..71666fc 100644 --- a/sha256blockSsse_amd64.s +++ b/sha256blockSsse_amd64.s @@ -244,15 +244,15 @@ ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ // func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) -TEXT ·blockSsse(SB), 7, $0 - - MOVQ h+0(FP), SI // SI: &h - MOVQ message+24(FP), R8 // &message - MOVQ lenmessage+32(FP), R9 // length of message +TEXT ·blockSsse(SB), 7, $0-80 + + MOVQ h+0(FP), SI // SI: &h + MOVQ message_base+24(FP), R8 // &message + MOVQ message_len+32(FP), R9 // length of message CMPQ R9, $0 JEQ done_hash ADDQ R8, R9 - MOVQ R9, _inp_end+64(FP) // store end of message + MOVQ R9, reserved2+64(FP) // store end of message // Register definition // a --> eax @@ -281,7 +281,7 @@ MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00 - MOVQ message+24(FP), SI // SI: &message + MOVQ message_base+24(FP), SI // SI: &message loop0: LEAQ constants<>(SB), BP @@ -296,7 +296,7 @@ MOVOU 3*16(SI), X7 LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13 - MOVQ SI, _inp+72(FP) + MOVQ SI, reserved3+72(FP) MOVD $0x3, DI // Align @@ -306,22 +306,22 @@ loop1: MOVOU X4, X9 LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) MOVOU X4, X9 LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) MOVOU X4, X9 LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) MOVOU X4, X9 LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) ADDQ $64, BP FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) @@ -333,7 +333,7 @@ loop2: MOVOU X4, X9 LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48) DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52) DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56) @@ -341,7 +341,7 @@ MOVOU X5, X9 LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, _xfer+48(FP) + MOVOU X9, reserved0+48(FP) ADDQ $32, BP DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48) DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52) @@ -372,9 +372,9 @@ ADDL (7*4)(SI), R11 // H7 = h + H7 MOVL R11, (7*4)(SI) - MOVQ _inp+72(FP), SI + MOVQ reserved3+72(FP), SI ADDQ $64, SI - CMPQ _inp_end+64(FP), SI + CMPQ reserved2+64(FP), SI JNE loop0 done_hash: