Package list golang-github-minio-sha256-simd / 66352e7
Fix asmdecl errors and reserve stack space for AVX2 block function (#38) Frank Wessels authored 2 years ago Harshavardhana committed 2 years ago
5 changed file(s) with 51 addition(s) and 43 deletion(s). Raw diff Collapse all Expand all
2020 script:
2121 - diff -au <(gofmt -d .) <(printf "")
2222 - go test -race -v ./...
23 - go tool vet -asmdecl .
3030 // github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
3131 // equivalents
3232 //
33
34 #include "textflag.h"
3533
3634 DATA K256<>+0x000(SB)/8, $0x71374491428a2f98
3735 DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf
113111
114112 GLOBL K256<>(SB), 8, $608
115113
114 // We need 0x220 (544) bytes of stack space aligned on a 512-byte boundary, so for
115 // the worst-case-aligned SP we need twice this amount, namely 1088 (=0x440) bytes
116 //
117 // SP aligned end-aligned stacksize
118 // 100013d0 10001400 10001620 592
119 // 100013d8 10001400 10001620 584
120 // 100013e0 10001600 10001820 1088
121 // 100013e8 10001600 10001820 1080
122
116123 // func blockAvx2(h []uint32, message []uint8)
117 TEXT ·blockAvx2(SB), 7, $0
118
119 MOVQ ctx+0(FP), DI // DI: &h
120 MOVQ inp+24(FP), SI // SI: &message
121 MOVQ inplength+32(FP), DX // len(message)
124 TEXT ·blockAvx2(SB),$1088-48
125
126 MOVQ h+0(FP), DI // DI: &h
127 MOVQ message_base+24(FP), SI // SI: &message
128 MOVQ message_len+32(FP), DX // len(message)
122129 ADDQ SI, DX // end pointer of input
123130 MOVQ SP, R11 // copy stack pointer
124 SUBQ $0x220, SP // sp -= 0x220
125 ANDQ $0xfffffffffffffc00, SP // align stack frame
131 ADDQ $0x220, SP // sp += 0x220
132 ANDQ $0xfffffffffffffe00, SP // align stack frame
126133 ADDQ $0x1c0, SP
127134 MOVQ DI, 0x40(SP) // save ctx
128135 MOVQ SI, 0x48(SP) // save input
14341441
14351442 done:
14361443 MOVQ BP, SP
1437 MOVQ 0x58(SP), SP
1444 MOVQ 0x58(SP), SP // restore saved stack pointer
14381445 WORD $0xf8c5; BYTE $0x77 // vzeroupper
14391446
14401447 RET
11 MOVQ digests+0(FP), DI
22 MOVQ scratch+8(FP), R12
33 MOVQ mask_len+32(FP), SI
4 MOVQ r14+24(FP), R13
4 MOVQ mask_base+24(FP), R13
55 MOVQ (R13), R14
66 LONG $0x92fbc1c4; BYTE $0xce
77 LEAQ inputs+48(FP), AX
231231 ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
232232
233233 // func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
234 TEXT ·blockAvx(SB), 7, $0
235
236 MOVQ h+0(FP), SI // SI: &h
237 MOVQ message+24(FP), R8 // &message
238 MOVQ lenmessage+32(FP), R9 // length of message
234 TEXT ·blockAvx(SB), 7, $0-80
235
236 MOVQ h+0(FP), SI // SI: &h
237 MOVQ message_base+24(FP), R8 // &message
238 MOVQ message_len+32(FP), R9 // length of message
239239 CMPQ R9, $0
240240 JEQ done_hash
241241 ADDQ R8, R9
242 MOVQ R9, _inp_end+64(FP) // store end of message
242 MOVQ R9, reserved2+64(FP) // store end of message
243243
244244 // Register definition
245245 // a --> eax
268268 MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
269269 MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
270270
271 MOVQ message+24(FP), SI // SI: &message
271 MOVQ message_base+24(FP), SI // SI: &message
272272
273273 loop0:
274274 LEAQ constants<>(SB), BP
283283 MOVOU 3*16(SI), X7
284284 LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13
285285
286 MOVQ SI, _inp+72(FP)
286 MOVQ SI, reserved3+72(FP)
287287 MOVD $0x3, DI
288288
289289 // schedule 48 input dwords, by doing 3 rounds of 16 each
290290 loop1:
291291 LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
292 MOVOU X9, _xfer+48(FP)
292 MOVOU X9, reserved0+48(FP)
293293 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
294294
295295 LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */
296 MOVOU X9, _xfer+48(FP)
296 MOVOU X9, reserved0+48(FP)
297297 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
298298
299299 LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */
300 MOVOU X9, _xfer+48(FP)
300 MOVOU X9, reserved0+48(FP)
301301 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
302302
303303 LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */
304 MOVOU X9, _xfer+48(FP)
304 MOVOU X9, reserved0+48(FP)
305305 ADDQ $64, BP
306306 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
307307
312312
313313 loop2:
314314 LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
315 MOVOU X9, _xfer+48(FP)
315 MOVOU X9, reserved0+48(FP)
316316 DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
317317 DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
318318 DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
319319 DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
320320
321321 LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */
322 MOVOU X9, _xfer+48(FP)
322 MOVOU X9, reserved0+48(FP)
323323 ADDQ $32, BP
324324 DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
325325 DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
350350 ADDL (7*4)(SI), R11 // H7 = h + H7
351351 MOVL R11, (7*4)(SI)
352352
353 MOVQ _inp+72(FP), SI
353 MOVQ reserved3+72(FP), SI
354354 ADDQ $64, SI
355 CMPQ _inp_end+64(FP), SI
355 CMPQ reserved2+64(FP), SI
356356 JNE loop0
357357
358358 done_hash:
243243 ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
244244
245245 // func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
246 TEXT ·blockSsse(SB), 7, $0
247
248 MOVQ h+0(FP), SI // SI: &h
249 MOVQ message+24(FP), R8 // &message
250 MOVQ lenmessage+32(FP), R9 // length of message
246 TEXT ·blockSsse(SB), 7, $0-80
247
248 MOVQ h+0(FP), SI // SI: &h
249 MOVQ message_base+24(FP), R8 // &message
250 MOVQ message_len+32(FP), R9 // length of message
251251 CMPQ R9, $0
252252 JEQ done_hash
253253 ADDQ R8, R9
254 MOVQ R9, _inp_end+64(FP) // store end of message
254 MOVQ R9, reserved2+64(FP) // store end of message
255255
256256 // Register definition
257257 // a --> eax
280280 MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
281281 MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
282282
283 MOVQ message+24(FP), SI // SI: &message
283 MOVQ message_base+24(FP), SI // SI: &message
284284
285285 loop0:
286286 LEAQ constants<>(SB), BP
295295 MOVOU 3*16(SI), X7
296296 LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
297297
298 MOVQ SI, _inp+72(FP)
298 MOVQ SI, reserved3+72(FP)
299299 MOVD $0x3, DI
300300
301301 // Align
305305 loop1:
306306 MOVOU X4, X9
307307 LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
308 MOVOU X9, _xfer+48(FP)
308 MOVOU X9, reserved0+48(FP)
309309 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
310310
311311 MOVOU X4, X9
312312 LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
313 MOVOU X9, _xfer+48(FP)
313 MOVOU X9, reserved0+48(FP)
314314 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
315315
316316 MOVOU X4, X9
317317 LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */
318 MOVOU X9, _xfer+48(FP)
318 MOVOU X9, reserved0+48(FP)
319319 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
320320
321321 MOVOU X4, X9
322322 LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */
323 MOVOU X9, _xfer+48(FP)
323 MOVOU X9, reserved0+48(FP)
324324 ADDQ $64, BP
325325 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
326326
332332 loop2:
333333 MOVOU X4, X9
334334 LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
335 MOVOU X9, _xfer+48(FP)
335 MOVOU X9, reserved0+48(FP)
336336 DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
337337 DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
338338 DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
340340
341341 MOVOU X5, X9
342342 LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
343 MOVOU X9, _xfer+48(FP)
343 MOVOU X9, reserved0+48(FP)
344344 ADDQ $32, BP
345345 DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
346346 DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
371371 ADDL (7*4)(SI), R11 // H7 = h + H7
372372 MOVL R11, (7*4)(SI)
373373
374 MOVQ _inp+72(FP), SI
374 MOVQ reserved3+72(FP), SI
375375 ADDQ $64, SI
376 CMPQ _inp_end+64(FP), SI
376 CMPQ reserved2+64(FP), SI
377377 JNE loop0
378378
379379 done_hash: