Package list golang-github-minio-sha256-simd / e529fa1
Support SHA* intrinsics on Intel CPU (#37) * Support SHA* intrinsics on Intel CPU - optimise: select block function at init - added dedicated padding function, optimised endian conversion - add assembly for Intel SHA extensions - update benchmarks - streamline checksum function - cleanup of sha assembly code * Cleanup code to be idiomatic Go Harshavardhana authored 2 years ago Frank Wessels committed 2 years ago
22 changed file(s) with 2845 addition(s) and 2257 deletion(s). Raw diff Collapse all Expand all
0 *.test
66
77 go:
88 - tip
9 - 1.11
10 - 1.10
9 - 1.11.x
1110
1211 env:
1312 - ARCH=x86_64
1515 package sha256
1616
1717 // True when SIMD instructions are available.
18 var avx512 = haveAVX512()
19 var avx2 = haveAVX2()
20 var avx = haveAVX()
21 var ssse3 = haveSSSE3()
18 var avx512 bool
19 var avx2 bool
20 var avx bool
21 var sse bool
22 var sse2 bool
23 var sse3 bool
24 var ssse3 bool
25 var sse41 bool
26 var sse42 bool
27 var popcnt bool
28 var sha bool
2229 var armSha = haveArmSha()
2330
24 // haveAVX returns true when there is AVX support
25 func haveAVX() bool {
26 _, _, c, _ := cpuid(1)
31 func init() {
32 var _xsave bool
33 var _osxsave bool
34 var _avx bool
35 var _avx2 bool
36 var _avx512f bool
37 var _avx512dq bool
38 // var _avx512pf bool
39 // var _avx512er bool
40 // var _avx512cd bool
41 var _avx512bw bool
42 var _avx512vl bool
43 var _sseState bool
44 var _avxState bool
45 var _opmaskState bool
46 var _zmmHI256State bool
47 var _hi16ZmmState bool
2748
28 // Check XSAVE, OSXSAVE and AVX bits
29 if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
30 // Check for OS support
31 eax, _ := xgetbv(0)
32 return (eax & 0x6) == 0x6
33 }
34 return false
35 }
36
37 // haveAVX2 returns true when there is AVX2 support
38 func haveAVX2() bool {
3949 mfi, _, _, _ := cpuid(0)
4050
41 // Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
42 if mfi >= 7 && haveAVX() {
43 _, ebx, _, _ := cpuidex(7, 0)
44 return (ebx & 0x00000020) != 0
51 if mfi >= 1 {
52 _, _, c, d := cpuid(1)
53
54 sse = (d & (1 << 25)) != 0
55 sse2 = (d & (1 << 26)) != 0
56 sse3 = (c & (1 << 0)) != 0
57 ssse3 = (c & (1 << 9)) != 0
58 sse41 = (c & (1 << 19)) != 0
59 sse42 = (c & (1 << 20)) != 0
60 popcnt = (c & (1 << 23)) != 0
61 _xsave = (c & (1 << 26)) != 0
62 _osxsave = (c & (1 << 27)) != 0
63 _avx = (c & (1 << 28)) != 0
4564 }
46 return false
65
66 if mfi >= 7 {
67 _, b, _, _ := cpuid(7)
68
69 _avx2 = (b & (1 << 5)) != 0
70 _avx512f = (b & (1 << 16)) != 0
71 _avx512dq = (b & (1 << 17)) != 0
72 // _avx512pf = (b & (1 << 26)) != 0
73 // _avx512er = (b & (1 << 27)) != 0
74 // _avx512cd = (b & (1 << 28)) != 0
75 _avx512bw = (b & (1 << 30)) != 0
76 _avx512vl = (b & (1 << 31)) != 0
77 sha = (b & (1 << 29)) != 0
78 }
79
80 // Stop here if XSAVE unsupported or not enabled
81 if !_xsave || !_osxsave {
82 return
83 }
84
85 if _xsave && _osxsave {
86 a, _ := xgetbv(0)
87
88 _sseState = (a & (1 << 1)) != 0
89 _avxState = (a & (1 << 2)) != 0
90 _opmaskState = (a & (1 << 5)) != 0
91 _zmmHI256State = (a & (1 << 6)) != 0
92 _hi16ZmmState = (a & (1 << 7)) != 0
93 } else {
94 _sseState = true
95 }
96
97 // Very unlikely that OS would enable XSAVE and then disable SSE
98 if !_sseState {
99 sse = false
100 sse2 = false
101 sse3 = false
102 ssse3 = false
103 sse41 = false
104 sse42 = false
105 }
106
107 if _avxState {
108 avx = _avx
109 avx2 = _avx2
110 }
111
112 if _opmaskState && _zmmHI256State && _hi16ZmmState {
113 avx512 = (_avx512f &&
114 _avx512dq &&
115 _avx512bw &&
116 _avx512vl)
117 }
47118 }
48
49 // haveAVX512 returns true when there is AVX512 support
50 func haveAVX512() bool {
51 mfi, _, _, _ := cpuid(0)
52
53 // AVX-512 feature bits are reported by CPUID leaf 7, so the maximum function index must be at least 7.
54 if mfi >= 7 {
55 _, _, c, _ := cpuid(1)
56
57 // Only detect AVX-512 features if XGETBV is supported
58 if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
59 // Check for OS support
60 eax, _ := xgetbv(0)
61 _, ebx, _, _ := cpuidex(7, 0)
62
63 // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
64 // ZMM16-ZMM31 state are enabled by OS)
65 // and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS).
66 if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
67 if ebx&(1<<16) == 0 {
68 return false // no AVX512F
69 }
70 if ebx&(1<<17) == 0 {
71 return false // no AVX512DQ
72 }
73 if ebx&(1<<30) == 0 {
74 return false // no AVX512BW
75 }
76 if ebx&(1<<31) == 0 {
77 return false // no AVX512VL
78 }
79 return true
80 }
81 }
82 }
83 return false
84 }
85
86 // haveSSSE3 returns true when there is SSSE3 support
87 func haveSSSE3() bool {
88
89 _, _, c, _ := cpuid(1)
90
91 return (c & 0x00000200) != 0
92 }
2323
2424 // func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
2525 TEXT ·cpuid(SB), 7, $0
26 XORL CX, CX
27 MOVL op+0(FP), AX
28 CPUID
29 MOVL AX, eax+4(FP)
30 MOVL BX, ebx+8(FP)
31 MOVL CX, ecx+12(FP)
32 MOVL DX, edx+16(FP)
33 RET
26 XORL CX, CX
27 MOVL op+0(FP), AX
28 CPUID
29 MOVL AX, eax+4(FP)
30 MOVL BX, ebx+8(FP)
31 MOVL CX, ecx+12(FP)
32 MOVL DX, edx+16(FP)
33 RET
3434
3535 // func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
3636 TEXT ·cpuidex(SB), 7, $0
37 MOVL op+0(FP), AX
38 MOVL op2+4(FP), CX
39 CPUID
40 MOVL AX, eax+8(FP)
41 MOVL BX, ebx+12(FP)
42 MOVL CX, ecx+16(FP)
43 MOVL DX, edx+20(FP)
44 RET
37 MOVL op+0(FP), AX
38 MOVL op2+4(FP), CX
39 CPUID
40 MOVL AX, eax+8(FP)
41 MOVL BX, ebx+12(FP)
42 MOVL CX, ecx+16(FP)
43 MOVL DX, edx+20(FP)
44 RET
4545
4646 // func xgetbv(index uint32) (eax, edx uint32)
4747 TEXT ·xgetbv(SB), 7, $0
48 MOVL index+0(FP), CX
49 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
50 MOVL AX, eax+4(FP)
51 MOVL DX, edx+8(FP)
52 RET
48 MOVL index+0(FP), CX
49 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
50 MOVL AX, eax+4(FP)
51 MOVL DX, edx+8(FP)
52 RET
2323
2424 // func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
2525 TEXT ·cpuid(SB), 7, $0
26 XORQ CX, CX
27 MOVL op+0(FP), AX
28 CPUID
29 MOVL AX, eax+8(FP)
30 MOVL BX, ebx+12(FP)
31 MOVL CX, ecx+16(FP)
32 MOVL DX, edx+20(FP)
33 RET
34
26 XORQ CX, CX
27 MOVL op+0(FP), AX
28 CPUID
29 MOVL AX, eax+8(FP)
30 MOVL BX, ebx+12(FP)
31 MOVL CX, ecx+16(FP)
32 MOVL DX, edx+20(FP)
33 RET
3534
3635 // func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
3736 TEXT ·cpuidex(SB), 7, $0
38 MOVL op+0(FP), AX
39 MOVL op2+4(FP), CX
40 CPUID
41 MOVL AX, eax+8(FP)
42 MOVL BX, ebx+12(FP)
43 MOVL CX, ecx+16(FP)
44 MOVL DX, edx+20(FP)
45 RET
37 MOVL op+0(FP), AX
38 MOVL op2+4(FP), CX
39 CPUID
40 MOVL AX, eax+8(FP)
41 MOVL BX, ebx+12(FP)
42 MOVL CX, ecx+16(FP)
43 MOVL DX, edx+20(FP)
44 RET
4645
4746 // func xgetbv(index uint32) (eax, edx uint32)
4847 TEXT ·xgetbv(SB), 7, $0
49 MOVL index+0(FP), CX
50 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
51 MOVL AX, eax+8(FP)
52 MOVL DX, edx+12(FP)
53 RET
48 MOVL index+0(FP), CX
49 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
50 MOVL AX, eax+8(FP)
51 MOVL DX, edx+12(FP)
52 RET
1717
1818 import (
1919 "crypto/sha256"
20 "encoding/binary"
2021 "hash"
2122 "runtime"
2223 )
2829 const BlockSize = 64
2930
3031 const (
31 chunk = 64
32 chunk = BlockSize
3233 init0 = 0x6A09E667
3334 init1 = 0xBB67AE85
3435 init2 = 0x3C6EF372
6162 d.len = 0
6263 }
6364
65 type blockfuncType int
66
67 const (
68 blockfuncGeneric blockfuncType = iota
69 blockfuncAvx512 blockfuncType = iota
70 blockfuncAvx2 blockfuncType = iota
71 blockfuncAvx blockfuncType = iota
72 blockfuncSsse blockfuncType = iota
73 blockfuncSha blockfuncType = iota
74 blockfuncArm blockfuncType = iota
75 )
76
77 var blockfunc blockfuncType
78
6479 func block(dig *digest, p []byte) {
80 if blockfunc == blockfuncSha {
81 blockShaGo(dig, p)
82 } else if blockfunc == blockfuncAvx2 {
83 blockAvx2Go(dig, p)
84 } else if blockfunc == blockfuncAvx {
85 blockAvxGo(dig, p)
86 } else if blockfunc == blockfuncSsse {
87 blockSsseGo(dig, p)
88 } else if blockfunc == blockfuncArm {
89 blockArmGo(dig, p)
90 } else if blockfunc == blockfuncGeneric {
91 blockGeneric(dig, p)
92 }
93 }
94
95 func init() {
6596 is386bit := runtime.GOARCH == "386"
6697 isARM := runtime.GOARCH == "arm"
67 if is386bit || isARM {
68 blockGeneric(dig, p)
69 }
70 switch !is386bit && !isARM {
98 switch {
99 case is386bit || isARM:
100 blockfunc = blockfuncGeneric
101 case sha && ssse3 && sse41:
102 blockfunc = blockfuncSha
71103 case avx2:
72 blockAvx2Go(dig, p)
104 blockfunc = blockfuncAvx2
73105 case avx:
74 blockAvxGo(dig, p)
106 blockfunc = blockfuncAvx
75107 case ssse3:
76 blockSsseGo(dig, p)
108 blockfunc = blockfuncSsse
77109 case armSha:
78 blockArmGo(dig, p)
110 blockfunc = blockfuncArm
79111 default:
80 blockGeneric(dig, p)
112 blockfunc = blockfuncGeneric
81113 }
82114 }
83115
84116 // New returns a new hash.Hash computing the SHA256 checksum.
85117 func New() hash.Hash {
86 if avx2 || avx || ssse3 || armSha {
118 if blockfunc != blockfuncGeneric {
87119 d := new(digest)
88120 d.Reset()
89121 return d
94126 }
95127
96128 // Sum256 - single caller sha256 helper
97 func Sum256(data []byte) [Size]byte {
129 func Sum256(data []byte) (result [Size]byte) {
98130 var d digest
99131 d.Reset()
100132 d.Write(data)
101 return d.checkSum()
133 result = d.checkSum()
134 return
102135 }
103136
104137 // Return size of checksum
140173 }
141174
142175 // Intermediate checksum function
143 func (d *digest) checkSum() [Size]byte {
144 len := d.len
145 // Padding. Add a 1 bit and 0 bits until 56 bytes mod 64.
146 var tmp [64]byte
147 tmp[0] = 0x80
148 if len%64 < 56 {
149 d.Write(tmp[0 : 56-len%64])
150 } else {
151 d.Write(tmp[0 : 64+56-len%64])
152 }
153
154 // Length in bits.
155 len <<= 3
156 for i := uint(0); i < 8; i++ {
157 tmp[i] = byte(len >> (56 - 8*i))
158 }
159 d.Write(tmp[0:8])
160
161 if d.nx != 0 {
162 panic("d.nx != 0")
163 }
164
165 h := d.h[:]
166
167 var digest [Size]byte
168 for i, s := range h {
169 digest[i*4] = byte(s >> 24)
170 digest[i*4+1] = byte(s >> 16)
171 digest[i*4+2] = byte(s >> 8)
172 digest[i*4+3] = byte(s)
173 }
174
175 return digest
176 }
176 func (d *digest) checkSum() (digest [Size]byte) {
177 n := d.nx
178
179 var k [64]byte
180 copy(k[:], d.x[:n])
181
182 k[n] = 0x80
183
184 if n >= 56 {
185 block(d, k[:])
186
187 // clear block buffer - go compiles this to optimal 1x xorps + 4x movups
188 // unfortunately expressing this more succinctly results in much worse code
189 k[0] = 0
190 k[1] = 0
191 k[2] = 0
192 k[3] = 0
193 k[4] = 0
194 k[5] = 0
195 k[6] = 0
196 k[7] = 0
197 k[8] = 0
198 k[9] = 0
199 k[10] = 0
200 k[11] = 0
201 k[12] = 0
202 k[13] = 0
203 k[14] = 0
204 k[15] = 0
205 k[16] = 0
206 k[17] = 0
207 k[18] = 0
208 k[19] = 0
209 k[20] = 0
210 k[21] = 0
211 k[22] = 0
212 k[23] = 0
213 k[24] = 0
214 k[25] = 0
215 k[26] = 0
216 k[27] = 0
217 k[28] = 0
218 k[29] = 0
219 k[30] = 0
220 k[31] = 0
221 k[32] = 0
222 k[33] = 0
223 k[34] = 0
224 k[35] = 0
225 k[36] = 0
226 k[37] = 0
227 k[38] = 0
228 k[39] = 0
229 k[40] = 0
230 k[41] = 0
231 k[42] = 0
232 k[43] = 0
233 k[44] = 0
234 k[45] = 0
235 k[46] = 0
236 k[47] = 0
237 k[48] = 0
238 k[49] = 0
239 k[50] = 0
240 k[51] = 0
241 k[52] = 0
242 k[53] = 0
243 k[54] = 0
244 k[55] = 0
245 k[56] = 0
246 k[57] = 0
247 k[58] = 0
248 k[59] = 0
249 k[60] = 0
250 k[61] = 0
251 k[62] = 0
252 k[63] = 0
253 }
254 binary.BigEndian.PutUint64(k[56:64], uint64(d.len)<<3)
255 block(d, k[:])
256
257 {
258 const i = 0
259 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
260 }
261 {
262 const i = 1
263 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
264 }
265 {
266 const i = 2
267 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
268 }
269 {
270 const i = 3
271 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
272 }
273 {
274 const i = 4
275 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
276 }
277 {
278 const i = 5
279 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
280 }
281 {
282 const i = 6
283 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
284 }
285 {
286 const i = 7
287 binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
288 }
289
290 return
291 }
22072207 }
22082208
22092209 func TestGolden(t *testing.T) {
2210 blockfuncSaved := blockfunc
2211
2212 if sha && ssse3 && sse41 {
2213 blockfunc = blockfuncSha
2214 for _, g := range golden {
2215 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
2216 if Sum256([]byte(g.in)) != g.out {
2217 t.Fatalf("SHA: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
2218 }
2219 }
2220 }
22102221 if avx2 {
2222 blockfunc = blockfuncAvx2
22112223 for _, g := range golden {
22122224 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
22132225 if Sum256([]byte(g.in)) != g.out {
22142226 t.Fatalf("AVX2: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
22152227 }
22162228 }
2217 avx2 = false
22182229 }
22192230 if avx {
2231 blockfunc = blockfuncAvx
22202232 for _, g := range golden {
22212233 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
22222234 if Sum256([]byte(g.in)) != g.out {
22232235 t.Fatalf("AVX: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
22242236 }
22252237 }
2226 avx = false
22272238 }
22282239 if ssse3 {
2240 blockfunc = blockfuncSsse
22292241 for _, g := range golden {
22302242 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
22312243 if Sum256([]byte(g.in)) != g.out {
22332245 }
22342246 }
22352247 }
2248 if true {
2249 blockfunc = blockfuncGeneric
2250 for _, g := range golden {
2251 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
2252 if Sum256([]byte(g.in)) != g.out {
2253 t.Fatalf("Generic: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
2254 }
2255 }
2256 }
2257
2258 blockfunc = blockfuncSaved
22362259 }
22372260
22382261 func TestSize(t *testing.T) {
22542277 var buf = make([]byte, size)
22552278 b.SetBytes(int64(size))
22562279 sum := make([]byte, bench.Size())
2280 b.ResetTimer()
22572281 for i := 0; i < b.N; i++ {
22582282 bench.Reset()
22592283 bench.Write(buf[:size])
22612285 }
22622286 }
22632287
2264 func BenchmarkHash8Bytes(b *testing.B) { benchmarkSize(b, 8) }
2265 func BenchmarkHash1K(b *testing.B) { benchmarkSize(b, 1024) }
2266 func BenchmarkHash8K(b *testing.B) { benchmarkSize(b, 8192) }
2267 func BenchmarkHash1MAvx2(b *testing.B) { benchmarkSize(b, 1024*1024) }
2268 func BenchmarkHash5MAvx2(b *testing.B) { benchmarkSize(b, 5*1024*1024) }
2269 func BenchmarkHash10MAvx2(b *testing.B) { benchmarkSize(b, 10*1024*1024) }
2288 func BenchmarkHash(b *testing.B) {
2289 algos := []struct {
2290 n string
2291 t blockfuncType
2292 f bool
2293 }{
2294 {"SHA_", blockfuncSha, sha && sse41 && ssse3},
2295 {"AVX2", blockfuncAvx2, avx2},
2296 {"AVX_", blockfuncAvx, avx},
2297 {"SSSE", blockfuncSsse, ssse3},
2298 {"GEN_", blockfuncGeneric, true},
2299 }
2300
2301 sizes := []struct {
2302 n string
2303 f func(*testing.B, int)
2304 s int
2305 }{
2306 {"8Bytes", benchmarkSize, 1 << 3},
2307 {"1K", benchmarkSize, 1 << 10},
2308 {"8K", benchmarkSize, 1 << 13},
2309 {"1M", benchmarkSize, 1 << 20},
2310 {"5M", benchmarkSize, 5 << 20},
2311 {"10M", benchmarkSize, 5 << 21},
2312 }
2313
2314 for _, a := range algos {
2315 if a.f {
2316 blockfuncSaved := blockfunc
2317 blockfunc = a.t
2318 for _, y := range sizes {
2319 s := a.n + "/" + y.n
2320 b.Run(s, func(b *testing.B) { y.f(b, y.s) })
2321 }
2322 blockfunc = blockfuncSaved
2323 }
2324 }
2325 }
116116 // func blockAvx2(h []uint32, message []uint8)
117117 TEXT ·blockAvx2(SB), 7, $0
118118
119 MOVQ ctx+0(FP), DI // DI: &h
120 MOVQ inp+24(FP), SI // SI: &message
121 MOVQ inplength+32(FP), DX // len(message)
122 ADDQ SI, DX // end pointer of input
123 MOVQ SP, R11 // copy stack pointer
124 SUBQ $0x220, SP // sp -= 0x220
125 ANDQ $0xfffffffffffffc00, SP // align stack frame
126 ADDQ $0x1c0, SP
127 MOVQ DI, 0x40(SP) // save ctx
128 MOVQ SI, 0x48(SP) // save input
129 MOVQ DX, 0x50(SP) // save end pointer
130 MOVQ R11, 0x58(SP) // save copy of stack pointer
131
132 WORD $0xf8c5; BYTE $0x77 // vzeroupper
133 ADDQ $0x40, SI // input++
134 MOVL (DI), AX
135 MOVQ SI, R12 // borrow $T1
136 MOVL 4(DI), BX
137 CMPQ SI, DX // $_end
138 MOVL 8(DI), CX
139 LONG $0xe4440f4c // cmove r12,rsp /* next block or random data */
140 MOVL 12(DI), DX
141 MOVL 16(DI), R8
142 MOVL 20(DI), R9
143 MOVL 24(DI), R10
144 MOVL 28(DI), R11
145
146 LEAQ K256<>(SB), BP
147 LONG $0x856f7dc5; LONG $0x00000220 // VMOVDQA YMM8, 0x220[rbp] /* vmovdqa ymm8,YMMWORD PTR [rip+0x220] */
148 LONG $0x8d6f7dc5; LONG $0x00000240 // VMOVDQA YMM9, 0x240[rbp] /* vmovdqa ymm9,YMMWORD PTR [rip+0x240] */
149 LONG $0x956f7dc5; LONG $0x00000200 // VMOVDQA YMM10, 0x200[rbp] /* vmovdqa ymm10,YMMWORD PTR [rip+0x200] */
119 MOVQ ctx+0(FP), DI // DI: &h
120 MOVQ inp+24(FP), SI // SI: &message
121 MOVQ inplength+32(FP), DX // len(message)
122 ADDQ SI, DX // end pointer of input
123 MOVQ SP, R11 // copy stack pointer
124 SUBQ $0x220, SP // sp -= 0x220
125 ANDQ $0xfffffffffffffc00, SP // align stack frame
126 ADDQ $0x1c0, SP
127 MOVQ DI, 0x40(SP) // save ctx
128 MOVQ SI, 0x48(SP) // save input
129 MOVQ DX, 0x50(SP) // save end pointer
130 MOVQ R11, 0x58(SP) // save copy of stack pointer
131
132 WORD $0xf8c5; BYTE $0x77 // vzeroupper
133 ADDQ $0x40, SI // input++
134 MOVL (DI), AX
135 MOVQ SI, R12 // borrow $T1
136 MOVL 4(DI), BX
137 CMPQ SI, DX // $_end
138 MOVL 8(DI), CX
139 LONG $0xe4440f4c // cmove r12,rsp /* next block or random data */
140 MOVL 12(DI), DX
141 MOVL 16(DI), R8
142 MOVL 20(DI), R9
143 MOVL 24(DI), R10
144 MOVL 28(DI), R11
145
146 LEAQ K256<>(SB), BP
147 LONG $0x856f7dc5; LONG $0x00000220 // VMOVDQA YMM8, 0x220[rbp] /* vmovdqa ymm8,YMMWORD PTR [rip+0x220] */
148 LONG $0x8d6f7dc5; LONG $0x00000240 // VMOVDQA YMM9, 0x240[rbp] /* vmovdqa ymm9,YMMWORD PTR [rip+0x240] */
149 LONG $0x956f7dc5; LONG $0x00000200 // VMOVDQA YMM10, 0x200[rbp] /* vmovdqa ymm10,YMMWORD PTR [rip+0x200] */
150150
151151 loop0:
152 LONG $0x6f7dc1c4; BYTE $0xfa // VMOVDQA YMM7, YMM10
153
154 // Load first 16 dwords from two blocks
155 MOVOU -64(SI), X0 // vmovdqu xmm0,XMMWORD PTR [rsi-0x40]
156 MOVOU -48(SI), X1 // vmovdqu xmm1,XMMWORD PTR [rsi-0x30]
157 MOVOU -32(SI), X2 // vmovdqu xmm2,XMMWORD PTR [rsi-0x20]
158 MOVOU -16(SI), X3 // vmovdqu xmm3,XMMWORD PTR [rsi-0x10]
159
160 // Byte swap data and transpose data into high/low
161 LONG $0x387dc3c4; WORD $0x2404; BYTE $0x01 // vinserti128 ymm0,ymm0,[r12],0x1
162 LONG $0x3875c3c4; LONG $0x0110244c // vinserti128 ymm1,ymm1,0x10[r12],0x1
163 LONG $0x007de2c4; BYTE $0xc7 // vpshufb ymm0,ymm0,ymm7
164 LONG $0x386dc3c4; LONG $0x01202454 // vinserti128 ymm2,ymm2,0x20[r12],0x1
165 LONG $0x0075e2c4; BYTE $0xcf // vpshufb ymm1,ymm1,ymm7
166 LONG $0x3865c3c4; LONG $0x0130245c // vinserti128 ymm3,ymm3,0x30[r12],0x1
167
168 LEAQ K256<>(SB), BP
169 LONG $0x006de2c4; BYTE $0xd7 // vpshufb ymm2,ymm2,ymm7
170 LONG $0x65fefdc5; BYTE $0x00 // vpaddd ymm4,ymm0,[rbp]
171 LONG $0x0065e2c4; BYTE $0xdf // vpshufb ymm3,ymm3,ymm7
172 LONG $0x6dfef5c5; BYTE $0x20 // vpaddd ymm5,ymm1,0x20[rbp]
173 LONG $0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,0x40[rbp]
174 LONG $0x7dfee5c5; BYTE $0x60 // vpaddd ymm7,ymm3,0x60[rbp]
175
176 LONG $0x247ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm4
177 XORQ R14, R14
178 LONG $0x6c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm5
179
180 ADDQ $-0x40, SP
181 MOVQ BX, DI
182 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
183 XORQ CX, DI // magic
184 LONG $0x7c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm7
185 MOVQ R9, R12
186 ADDQ $0x80,BP
152 LONG $0x6f7dc1c4; BYTE $0xfa // VMOVDQA YMM7, YMM10
153
154 // Load first 16 dwords from two blocks
155 MOVOU -64(SI), X0 // vmovdqu xmm0,XMMWORD PTR [rsi-0x40]
156 MOVOU -48(SI), X1 // vmovdqu xmm1,XMMWORD PTR [rsi-0x30]
157 MOVOU -32(SI), X2 // vmovdqu xmm2,XMMWORD PTR [rsi-0x20]
158 MOVOU -16(SI), X3 // vmovdqu xmm3,XMMWORD PTR [rsi-0x10]
159
160 // Byte swap data and transpose data into high/low
161 LONG $0x387dc3c4; WORD $0x2404; BYTE $0x01 // vinserti128 ymm0,ymm0,[r12],0x1
162 LONG $0x3875c3c4; LONG $0x0110244c // vinserti128 ymm1,ymm1,0x10[r12],0x1
163 LONG $0x007de2c4; BYTE $0xc7 // vpshufb ymm0,ymm0,ymm7
164 LONG $0x386dc3c4; LONG $0x01202454 // vinserti128 ymm2,ymm2,0x20[r12],0x1
165 LONG $0x0075e2c4; BYTE $0xcf // vpshufb ymm1,ymm1,ymm7
166 LONG $0x3865c3c4; LONG $0x0130245c // vinserti128 ymm3,ymm3,0x30[r12],0x1
167
168 LEAQ K256<>(SB), BP
169 LONG $0x006de2c4; BYTE $0xd7 // vpshufb ymm2,ymm2,ymm7
170 LONG $0x65fefdc5; BYTE $0x00 // vpaddd ymm4,ymm0,[rbp]
171 LONG $0x0065e2c4; BYTE $0xdf // vpshufb ymm3,ymm3,ymm7
172 LONG $0x6dfef5c5; BYTE $0x20 // vpaddd ymm5,ymm1,0x20[rbp]
173 LONG $0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,0x40[rbp]
174 LONG $0x7dfee5c5; BYTE $0x60 // vpaddd ymm7,ymm3,0x60[rbp]
175
176 LONG $0x247ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm4
177 XORQ R14, R14
178 LONG $0x6c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm5
179
180 ADDQ $-0x40, SP
181 MOVQ BX, DI
182 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
183 XORQ CX, DI // magic
184 LONG $0x7c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm7
185 MOVQ R9, R12
186 ADDQ $0x80, BP
187187
188188 loop1:
189 // Schedule 48 input dwords, by doing 3 rounds of 12 each
190 // Note: SIMD instructions are interleaved with the SHA calculations
191 ADDQ $-0x40, SP
192 LONG $0x0f75e3c4; WORD $0x04e0 // vpalignr ymm4,ymm1,ymm0,0x4
193
194 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
195 LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80]
196 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
197 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
198 LONG $0x0f65e3c4; WORD $0x04fa // vpalignr ymm7,ymm3,ymm2,0x4
199 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
200 LONG $0x30048d42 // lea eax,[rax+r14*1]
201 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
202 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
203 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
204 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
205 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
206 LONG $0xc7fefdc5 // vpaddd ymm0,ymm0,ymm7
207 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
208 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
209 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
210 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
211 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
212 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
213 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
214 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
215 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
216 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
217 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
218 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
219 WORD $0x2144; BYTE $0xff // and edi,r15d
220 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
221 WORD $0xdf31 // xor edi,ebx
222 LONG $0xfb70fdc5; BYTE $0xfa // vpshufd ymm7,ymm3,0xfa
223 WORD $0x3145; BYTE $0xee // xor r14d,r13d
224 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
225 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
226 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
227
228 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
229 LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84]
230 WORD $0x2141; BYTE $0xd4 // and r12d,edx
231 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
232 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
233 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
234 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
235 LONG $0x22148d47 // lea r10d,[r10+r12*1]
236 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
237 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
238 WORD $0x3141; BYTE $0xfd // xor r13d,edi
239 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
240 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
241 LONG $0x22148d47 // lea r10d,[r10+r12*1]
242 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
243 WORD $0x8944; BYTE $0xdf // mov edi,r11d
244 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
245 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
246 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
247 WORD $0xc731 // xor edi,eax
248 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
249 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
250 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
251 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
252 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
253 WORD $0x2141; BYTE $0xff // and r15d,edi
254 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
255 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
256 LONG $0xc4fefdc5 // vpaddd ymm0,ymm0,ymm4
257 WORD $0x3145; BYTE $0xee // xor r14d,r13d
258 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
259 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
260 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
261
262 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
263 LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88]
264 WORD $0x2141; BYTE $0xcc // and r12d,ecx
265 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
266 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
267 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
268 LONG $0x32148d47 // lea r10d,[r10+r14*1]
269 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
270 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
271 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
272 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
273 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
274 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
275 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
276 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
277 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
278 LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6
279 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
280 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
281 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
282 LONG $0xf870fdc5; BYTE $0x50 // vpshufd ymm7,ymm0,0x50
283 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
284 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
285 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
286 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
287 WORD $0x2144; BYTE $0xff // and edi,r15d
288 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
289 WORD $0x3144; BYTE $0xdf // xor edi,r11d
290 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
291 WORD $0x3145; BYTE $0xee // xor r14d,r13d
292 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
293 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
294 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
295
296 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
297 LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c]
298 WORD $0x2141; BYTE $0xdc // and r12d,ebx
299 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
300 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
301 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
302 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
303 LONG $0x20048d47 // lea r8d,[r8+r12*1]
304 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
305 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
306 WORD $0x3141; BYTE $0xfd // xor r13d,edi
307 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
308 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
309 LONG $0x20048d47 // lea r8d,[r8+r12*1]
310 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
311 WORD $0x8944; BYTE $0xcf // mov edi,r9d
312 LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6
313 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
314 LONG $0x28048d47 // lea r8d,[r8+r13*1]
315 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
316 LONG $0x75fefdc5; BYTE $0x00 // vpaddd ymm6,ymm0,[rbp+0x0]
317 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
318 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
319 LONG $0x00048d42 // lea eax,[rax+r8*1]
320 WORD $0x2141; BYTE $0xff // and r15d,edi
321 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
322 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
323 WORD $0x3145; BYTE $0xee // xor r14d,r13d
324 LONG $0x38048d47 // lea r8d,[r8+r15*1]
325 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
326
327 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
328 LONG $0x0f6de3c4; WORD $0x04e1 // vpalignr ymm4,ymm2,ymm1,0x4
329
330 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
331 LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0]
332 WORD $0x2141; BYTE $0xc4 // and r12d,eax
333 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
334 LONG $0x0f7de3c4; WORD $0x04fb // vpalignr ymm7,ymm0,ymm3,0x4
335 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
336 LONG $0x30048d47 // lea r8d,[r8+r14*1]
337 LONG $0x22148d42 // lea edx,[rdx+r12*1]
338 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
339 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
340 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
341 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
342 LONG $0xcffef5c5 // vpaddd ymm1,ymm1,ymm7
343 LONG $0x22148d42 // lea edx,[rdx+r12*1]
344 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
345 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
346 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
347 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
348 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
349 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
350 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
351 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
352 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
353 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
354 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
355 WORD $0x2144; BYTE $0xff // and edi,r15d
356 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
357 WORD $0x3144; BYTE $0xcf // xor edi,r9d
358 LONG $0xf870fdc5; BYTE $0xfa // vpshufd ymm7,ymm0,0xfa
359 WORD $0x3145; BYTE $0xee // xor r14d,r13d
360 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
361 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
362 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
363
364 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
365 LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4]
366 WORD $0x2145; BYTE $0xdc // and r12d,r11d
367 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
368 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
369 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
370 LONG $0x32148d42 // lea edx,[rdx+r14*1]
371 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
372 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
373 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
374 WORD $0x3141; BYTE $0xfd // xor r13d,edi
375 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
376 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
377 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
378 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
379 WORD $0xd789 // mov edi,edx
380 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
381 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
382 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
383 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
384 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
385 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
386 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
387 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
388 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
389 WORD $0x2141; BYTE $0xff // and r15d,edi
390 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
391 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
392 LONG $0xccfef5c5 // vpaddd ymm1,ymm1,ymm4
393 WORD $0x3145; BYTE $0xee // xor r14d,r13d
394 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
395 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
396 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
397
398 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
399 LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8]
400 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
401 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
402 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
403 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
404 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
405 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
406 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
407 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
408 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
409 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
410 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
411 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
412 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
413 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
414 LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6
415 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
416 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
417 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
418 LONG $0xf970fdc5; BYTE $0x50 // vpshufd ymm7,ymm1,0x50
419 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
420 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
421 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
422 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
423 WORD $0x2144; BYTE $0xff // and edi,r15d
424 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
425 WORD $0xd731 // xor edi,edx
426 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
427 WORD $0x3145; BYTE $0xee // xor r14d,r13d
428 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
429 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
430 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
431
432 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
433 LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac]
434 WORD $0x2145; BYTE $0xcc // and r12d,r9d
435 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
436 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
437 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
438 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
439 LONG $0x20048d42 // lea eax,[rax+r12*1]
440 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
441 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
442 WORD $0x3141; BYTE $0xfd // xor r13d,edi
443 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
444 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
445 LONG $0x20048d42 // lea eax,[rax+r12*1]
446 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
447 WORD $0xdf89 // mov edi,ebx
448 LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6
449 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
450 LONG $0x28048d42 // lea eax,[rax+r13*1]
451 WORD $0xcf31 // xor edi,ecx
452 LONG $0x75fef5c5; BYTE $0x20 // vpaddd ymm6,ymm1,[rbp+0x20]
453 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
454 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
455 LONG $0x00048d45 // lea r8d,[r8+rax*1]
456 WORD $0x2141; BYTE $0xff // and r15d,edi
457 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
458 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
459 WORD $0x3145; BYTE $0xee // xor r14d,r13d
460 LONG $0x38048d42 // lea eax,[rax+r15*1]
461 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
462
463 LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
464
465 LONG $0x24648d48; BYTE $0xc0 // lea rsp,[rsp-0x40]
466 LONG $0x0f65e3c4; WORD $0x04e2 // vpalignr ymm4,ymm3,ymm2,0x4
467
468 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
469 LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80]
470 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
471 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
472 LONG $0x0f75e3c4; WORD $0x04f8 // vpalignr ymm7,ymm1,ymm0,0x4
473 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
474 LONG $0x30048d42 // lea eax,[rax+r14*1]
475 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
476 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
477 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
478 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
479 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
480 LONG $0xd7feedc5 // vpaddd ymm2,ymm2,ymm7
481 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
482 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
483 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
484 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
485 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
486 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
487 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
488 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
489 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
490 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
491 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
492 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
493 WORD $0x2144; BYTE $0xff // and edi,r15d
494 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
495 WORD $0xdf31 // xor edi,ebx
496 LONG $0xf970fdc5; BYTE $0xfa // vpshufd ymm7,ymm1,0xfa
497 WORD $0x3145; BYTE $0xee // xor r14d,r13d
498 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
499 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
500 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
501
502 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
503 LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84]
504 WORD $0x2141; BYTE $0xd4 // and r12d,edx
505 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
506 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
507 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
508 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
509 LONG $0x22148d47 // lea r10d,[r10+r12*1]
510 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
511 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
512 WORD $0x3141; BYTE $0xfd // xor r13d,edi
513 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
514 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
515 LONG $0x22148d47 // lea r10d,[r10+r12*1]
516 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
517 WORD $0x8944; BYTE $0xdf // mov edi,r11d
518 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
519 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
520 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
521 WORD $0xc731 // xor edi,eax
522 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
523 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
524 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
525 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
526 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
527 WORD $0x2141; BYTE $0xff // and r15d,edi
528 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
529 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
530 LONG $0xd4feedc5 // vpaddd ymm2,ymm2,ymm4
531 WORD $0x3145; BYTE $0xee // xor r14d,r13d
532 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
533 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
534 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
535
536 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
537 LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88]
538 WORD $0x2141; BYTE $0xcc // and r12d,ecx
539 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
540 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
541 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
542 LONG $0x32148d47 // lea r10d,[r10+r14*1]
543 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
544 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
545 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
546 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
547 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
548 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
549 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
550 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
551 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
552 LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6
553 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
554 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
555 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
556 LONG $0xfa70fdc5; BYTE $0x50 // vpshufd ymm7,ymm2,0x50
557 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
558 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
559 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
560 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
561 WORD $0x2144; BYTE $0xff // and edi,r15d
562 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
563 WORD $0x3144; BYTE $0xdf // xor edi,r11d
564 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
565 WORD $0x3145; BYTE $0xee // xor r14d,r13d
566 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
567 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
568 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
569
570 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
571 LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c]
572 WORD $0x2141; BYTE $0xdc // and r12d,ebx
573 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
574 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
575 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
576 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
577 LONG $0x20048d47 // lea r8d,[r8+r12*1]
578 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
579 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
580 WORD $0x3141; BYTE $0xfd // xor r13d,edi
581 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
582 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
583 LONG $0x20048d47 // lea r8d,[r8+r12*1]
584 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
585 WORD $0x8944; BYTE $0xcf // mov edi,r9d
586 LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6
587 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
588 LONG $0x28048d47 // lea r8d,[r8+r13*1]
589 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
590 LONG $0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,[rbp+0x40]
591 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
592 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
593 LONG $0x00048d42 // lea eax,[rax+r8*1]
594 WORD $0x2141; BYTE $0xff // and r15d,edi
595 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
596 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
597 WORD $0x3145; BYTE $0xee // xor r14d,r13d
598 LONG $0x38048d47 // lea r8d,[r8+r15*1]
599 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
600
601 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
602 LONG $0x0f7de3c4; WORD $0x04e3 // vpalignr ymm4,ymm0,ymm3,0x4
603
604 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
605 LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0]
606 WORD $0x2141; BYTE $0xc4 // and r12d,eax
607 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
608 LONG $0x0f6de3c4; WORD $0x04f9 // vpalignr ymm7,ymm2,ymm1,0x4
609 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
610 LONG $0x30048d47 // lea r8d,[r8+r14*1]
611 LONG $0x22148d42 // lea edx,[rdx+r12*1]
612 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
613 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
614 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
615 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
616 LONG $0xdffee5c5 // vpaddd ymm3,ymm3,ymm7
617 LONG $0x22148d42 // lea edx,[rdx+r12*1]
618 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
619 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
620 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
621 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
622 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
623 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
624 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
625 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
626 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
627 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
628 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
629 WORD $0x2144; BYTE $0xff // and edi,r15d
630 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
631 WORD $0x3144; BYTE $0xcf // xor edi,r9d
632 LONG $0xfa70fdc5; BYTE $0xfa // vpshufd ymm7,ymm2,0xfa
633 WORD $0x3145; BYTE $0xee // xor r14d,r13d
634 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
635 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
636 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
637
638 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
639 LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4]
640 WORD $0x2145; BYTE $0xdc // and r12d,r11d
641 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
642 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
643 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
644 LONG $0x32148d42 // lea edx,[rdx+r14*1]
645 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
646 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
647 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
648 WORD $0x3141; BYTE $0xfd // xor r13d,edi
649 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
650 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
651 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
652 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
653 WORD $0xd789 // mov edi,edx
654 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
655 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
656 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
657 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
658 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
659 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
660 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
661 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
662 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
663 WORD $0x2141; BYTE $0xff // and r15d,edi
664 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
665 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
666 LONG $0xdcfee5c5 // vpaddd ymm3,ymm3,ymm4
667 WORD $0x3145; BYTE $0xee // xor r14d,r13d
668 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
669 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
670 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
671
672 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
673 LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8]
674 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
675 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
676 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
677 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
678 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
679 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
680 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
681 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
682 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
683 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
684 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
685 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
686 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
687 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
688 LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6
689 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
690 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
691 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
692 LONG $0xfb70fdc5; BYTE $0x50 // vpshufd ymm7,ymm3,0x50
693 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
694 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
695 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
696 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
697 WORD $0x2144; BYTE $0xff // and edi,r15d
698 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
699 WORD $0xd731 // xor edi,edx
700 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
701 WORD $0x3145; BYTE $0xee // xor r14d,r13d
702 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
703 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
704 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
705
706 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
707 LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac]
708 WORD $0x2145; BYTE $0xcc // and r12d,r9d
709 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
710 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
711 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
712 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
713 LONG $0x20048d42 // lea eax,[rax+r12*1]
714 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
715 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
716 WORD $0x3141; BYTE $0xfd // xor r13d,edi
717 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
718 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
719 LONG $0x20048d42 // lea eax,[rax+r12*1]
720 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
721 WORD $0xdf89 // mov edi,ebx
722 LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6
723 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
724 LONG $0x28048d42 // lea eax,[rax+r13*1]
725 WORD $0xcf31 // xor edi,ecx
726 LONG $0x75fee5c5; BYTE $0x60 // vpaddd ymm6,ymm3,[rbp+0x60]
727 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
728 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
729 LONG $0x00048d45 // lea r8d,[r8+rax*1]
730 WORD $0x2141; BYTE $0xff // and r15d,edi
731 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
732 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
733 WORD $0x3145; BYTE $0xee // xor r14d,r13d
734 LONG $0x38048d42 // lea eax,[rax+r15*1]
735 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
736
737 LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
738 ADDQ $0x80, BP
739
740 CMPB 0x3(BP),$0x0
741 JNE loop1
742
743 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x40)
744 LONG $0x245c0344; BYTE $0x40 // add r11d,[rsp+0x40]
745 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
746 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
747 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
748 LONG $0x30048d42 // lea eax,[rax+r14*1]
749 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
750 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
751 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
752 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
753 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
754 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
755 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
756 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
757 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
758 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
759 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
760 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
761 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
762 WORD $0x2144; BYTE $0xff // and edi,r15d
763 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
764 WORD $0xdf31 // xor edi,ebx
765 WORD $0x3145; BYTE $0xee // xor r14d,r13d
766 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
767 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
768
769 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x44)
770 LONG $0x24540344; BYTE $0x44 // add r10d,[rsp+0x44]
771 WORD $0x2141; BYTE $0xd4 // and r12d,edx
772 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
773 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
774 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
775 LONG $0x22148d47 // lea r10d,[r10+r12*1]
776 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
777 WORD $0x3141; BYTE $0xfd // xor r13d,edi
778 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
779 LONG $0x22148d47 // lea r10d,[r10+r12*1]
780 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
781 WORD $0x8944; BYTE $0xdf // mov edi,r11d
782 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
783 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
784 WORD $0xc731 // xor edi,eax
785 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
786 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
787 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
788 WORD $0x2141; BYTE $0xff // and r15d,edi
789 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
790 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
791 WORD $0x3145; BYTE $0xee // xor r14d,r13d
792 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
793 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
794
795 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x48)
796 LONG $0x244c0344; BYTE $0x48 // add r9d,[rsp+0x48]
797 WORD $0x2141; BYTE $0xcc // and r12d,ecx
798 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
799 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
800 LONG $0x32148d47 // lea r10d,[r10+r14*1]
801 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
802 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
803 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
804 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
805 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
806 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
807 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
808 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
809 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
810 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
811 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
812 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
813 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
814 WORD $0x2144; BYTE $0xff // and edi,r15d
815 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
816 WORD $0x3144; BYTE $0xdf // xor edi,r11d
817 WORD $0x3145; BYTE $0xee // xor r14d,r13d
818 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
819 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
820
821 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x4c)
822 LONG $0x24440344; BYTE $0x4c // add r8d,[rsp+0x4c]
823 WORD $0x2141; BYTE $0xdc // and r12d,ebx
824 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
825 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
826 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
827 LONG $0x20048d47 // lea r8d,[r8+r12*1]
828 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
829 WORD $0x3141; BYTE $0xfd // xor r13d,edi
830 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
831 LONG $0x20048d47 // lea r8d,[r8+r12*1]
832 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
833 WORD $0x8944; BYTE $0xcf // mov edi,r9d
834 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
835 LONG $0x28048d47 // lea r8d,[r8+r13*1]
836 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
837 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
838 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
839 LONG $0x00048d42 // lea eax,[rax+r8*1]
840 WORD $0x2141; BYTE $0xff // and r15d,edi
841 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
842 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
843 WORD $0x3145; BYTE $0xee // xor r14d,r13d
844 LONG $0x38048d47 // lea r8d,[r8+r15*1]
845 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
846
847 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x60)
848 LONG $0x60245403 // add edx,[rsp+0x60]
849 WORD $0x2141; BYTE $0xc4 // and r12d,eax
850 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
851 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
852 LONG $0x30048d47 // lea r8d,[r8+r14*1]
853 LONG $0x22148d42 // lea edx,[rdx+r12*1]
854 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
855 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
856 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
857 LONG $0x22148d42 // lea edx,[rdx+r12*1]
858 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
859 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
860 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
861 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
862 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
863 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
864 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
865 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
866 WORD $0x2144; BYTE $0xff // and edi,r15d
867 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
868 WORD $0x3144; BYTE $0xcf // xor edi,r9d
869 WORD $0x3145; BYTE $0xee // xor r14d,r13d
870 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
871 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
872
873 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x64)
874 LONG $0x64244c03 // add ecx,[rsp+0x64]
875 WORD $0x2145; BYTE $0xdc // and r12d,r11d
876 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
877 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
878 LONG $0x32148d42 // lea edx,[rdx+r14*1]
879 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
880 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
881 WORD $0x3141; BYTE $0xfd // xor r13d,edi
882 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
883 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
884 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
885 WORD $0xd789 // mov edi,edx
886 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
887 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
888 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
889 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
890 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
891 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
892 WORD $0x2141; BYTE $0xff // and r15d,edi
893 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
894 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
895 WORD $0x3145; BYTE $0xee // xor r14d,r13d
896 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
897 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
898
899 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x68)
900 LONG $0x68245c03 // add ebx,[rsp+0x68]
901 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
902 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
903 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
904 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
905 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
906 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
907 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
908 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
909 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
910 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
911 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
912 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
913 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
914 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
915 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
916 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
917 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
918 WORD $0x2144; BYTE $0xff // and edi,r15d
919 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
920 WORD $0xd731 // xor edi,edx
921 WORD $0x3145; BYTE $0xee // xor r14d,r13d
922 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
923 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
924
925 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x6c)
926 LONG $0x6c244403 // add eax,[rsp+0x6c]
927 WORD $0x2145; BYTE $0xcc // and r12d,r9d
928 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
929 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
930 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
931 LONG $0x20048d42 // lea eax,[rax+r12*1]
932 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
933 WORD $0x3141; BYTE $0xfd // xor r13d,edi
934 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
935 LONG $0x20048d42 // lea eax,[rax+r12*1]
936 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
937 WORD $0xdf89 // mov edi,ebx
938 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
939 LONG $0x28048d42 // lea eax,[rax+r13*1]
940 WORD $0xcf31 // xor edi,ecx
941 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
942 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
943 LONG $0x00048d45 // lea r8d,[r8+rax*1]
944 WORD $0x2141; BYTE $0xff // and r15d,edi
945 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
946 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
947 WORD $0x3145; BYTE $0xee // xor r14d,r13d
948 LONG $0x38048d42 // lea eax,[rax+r15*1]
949 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
950
951 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x00)
952 LONG $0x241c0344 // add r11d,[rsp]
953 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
954 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
955 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
956 LONG $0x30048d42 // lea eax,[rax+r14*1]
957 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
958 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
959 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
960 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
961 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
962 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
963 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
964 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
965 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
966 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
967 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
968 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
969 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
970 WORD $0x2144; BYTE $0xff // and edi,r15d
971 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
972 WORD $0xdf31 // xor edi,ebx
973 WORD $0x3145; BYTE $0xee // xor r14d,r13d
974 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
975 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
976
977 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x04)
978 LONG $0x24540344; BYTE $0x04 // add r10d,[rsp+0x4]
979 WORD $0x2141; BYTE $0xd4 // and r12d,edx
980 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
981 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
982 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
983 LONG $0x22148d47 // lea r10d,[r10+r12*1]
984 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
985 WORD $0x3141; BYTE $0xfd // xor r13d,edi
986 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
987 LONG $0x22148d47 // lea r10d,[r10+r12*1]
988 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
989 WORD $0x8944; BYTE $0xdf // mov edi,r11d
990 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
991 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
992 WORD $0xc731 // xor edi,eax
993 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
994 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
995 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
996 WORD $0x2141; BYTE $0xff // and r15d,edi
997 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
998 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
999 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1000 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
1001 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
1002
1003 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x08)
1004 LONG $0x244c0344; BYTE $0x08 // add r9d,[rsp+0x8]
1005 WORD $0x2141; BYTE $0xcc // and r12d,ecx
1006 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
1007 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
1008 LONG $0x32148d47 // lea r10d,[r10+r14*1]
1009 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1010 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
1011 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1012 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
1013 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1014 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1015 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
1016 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
1017 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
1018 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
1019 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
1020 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
1021 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
1022 WORD $0x2144; BYTE $0xff // and edi,r15d
1023 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1024 WORD $0x3144; BYTE $0xdf // xor edi,r11d
1025 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1026 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
1027 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
1028
1029 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x0c)
1030 LONG $0x24440344; BYTE $0x0c // add r8d,[rsp+0xc]
1031 WORD $0x2141; BYTE $0xdc // and r12d,ebx
1032 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
1033 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
1034 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
1035 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1036 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
1037 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1038 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
1039 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1040 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1041 WORD $0x8944; BYTE $0xcf // mov edi,r9d
1042 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
1043 LONG $0x28048d47 // lea r8d,[r8+r13*1]
1044 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
1045 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
1046 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
1047 LONG $0x00048d42 // lea eax,[rax+r8*1]
1048 WORD $0x2141; BYTE $0xff // and r15d,edi
1049 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1050 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
1051 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1052 LONG $0x38048d47 // lea r8d,[r8+r15*1]
1053 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
1054
1055 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x20)
1056 LONG $0x20245403 // add edx,[rsp+0x20]
1057 WORD $0x2141; BYTE $0xc4 // and r12d,eax
1058 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
1059 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
1060 LONG $0x30048d47 // lea r8d,[r8+r14*1]
1061 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1062 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
1063 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1064 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
1065 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1066 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1067 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
1068 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
1069 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
1070 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
1071 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
1072 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
1073 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
1074 WORD $0x2144; BYTE $0xff // and edi,r15d
1075 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1076 WORD $0x3144; BYTE $0xcf // xor edi,r9d
1077 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1078 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
1079 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
1080
1081 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x24)
1082 LONG $0x24244c03 // add ecx,[rsp+0x24]
1083 WORD $0x2145; BYTE $0xdc // and r12d,r11d
1084 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
1085 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
1086 LONG $0x32148d42 // lea edx,[rdx+r14*1]
1087 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1088 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
1089 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1090 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
1091 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1092 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1093 WORD $0xd789 // mov edi,edx
1094 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
1095 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
1096 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
1097 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
1098 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
1099 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
1100 WORD $0x2141; BYTE $0xff // and r15d,edi
1101 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1102 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
1103 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1104 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
1105 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
1106
1107 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x28)
1108 LONG $0x28245c03 // add ebx,[rsp+0x28]
1109 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
1110 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
1111 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
1112 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
1113 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1114 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
1115 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1116 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
1117 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1118 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1119 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
1120 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
1121 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
1122 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
1123 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
1124 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
1125 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
1126 WORD $0x2144; BYTE $0xff // and edi,r15d
1127 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1128 WORD $0xd731 // xor edi,edx
1129 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1130 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
1131 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
1132
1133 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x2c)
1134 LONG $0x2c244403 // add eax,[rsp+0x2c]
1135 WORD $0x2145; BYTE $0xcc // and r12d,r9d
1136 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
1137 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
1138 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
1139 LONG $0x20048d42 // lea eax,[rax+r12*1]
1140 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
1141 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1142 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
1143 LONG $0x20048d42 // lea eax,[rax+r12*1]
1144 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1145 WORD $0xdf89 // mov edi,ebx
1146 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
1147 LONG $0x28048d42 // lea eax,[rax+r13*1]
1148 WORD $0xcf31 // xor edi,ecx
1149 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
1150 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
1151 LONG $0x00048d45 // lea r8d,[r8+rax*1]
1152 WORD $0x2141; BYTE $0xff // and r15d,edi
1153 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1154 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
1155 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1156 LONG $0x38048d42 // lea eax,[rax+r15*1]
1157 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
1158
1159 MOVQ 0x200(SP), DI // $_ctx
1160 ADDQ R14, AX
1161
1162 LEAQ 0x1c0(SP), BP
1163
1164 ADDL (DI), AX
1165 ADDL 4(DI), BX
1166 ADDL 8(DI), CX
1167 ADDL 12(DI), DX
1168 ADDL 16(DI), R8
1169 ADDL 20(DI), R9
1170 ADDL 24(DI), R10
1171 ADDL 28(DI), R11
1172
1173 MOVL AX, (DI)
1174 MOVL BX, 4(DI)
1175 MOVL CX, 8(DI)
1176 MOVL DX, 12(DI)
1177 MOVL R8, 16(DI)
1178 MOVL R9, 20(DI)
1179 MOVL R10, 24(DI)
1180 MOVL R11, 28(DI)
1181
1182 CMPQ SI, 0x50(BP) // $_end
1183 JE done
1184
1185 XORQ R14, R14
1186 MOVQ BX, DI
1187 XORQ CX, DI // magic
1188 MOVQ R9, R12
189 // Schedule 48 input dwords, by doing 3 rounds of 16 each
190 // Note: SIMD instructions are interleaved with the SHA calculations
191 ADDQ $-0x40, SP
192 LONG $0x0f75e3c4; WORD $0x04e0 // vpalignr ymm4,ymm1,ymm0,0x4
193
194 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
195 LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80]
196 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
197 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
198 LONG $0x0f65e3c4; WORD $0x04fa // vpalignr ymm7,ymm3,ymm2,0x4
199 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
200 LONG $0x30048d42 // lea eax,[rax+r14*1]
201 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
202 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
203 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
204 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
205 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
206 LONG $0xc7fefdc5 // vpaddd ymm0,ymm0,ymm7
207 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
208 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
209 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
210 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
211 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
212 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
213 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
214 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
215 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
216 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
217 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
218 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
219 WORD $0x2144; BYTE $0xff // and edi,r15d
220 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
221 WORD $0xdf31 // xor edi,ebx
222 LONG $0xfb70fdc5; BYTE $0xfa // vpshufd ymm7,ymm3,0xfa
223 WORD $0x3145; BYTE $0xee // xor r14d,r13d
224 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
225 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
226 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
227
228 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
229 LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84]
230 WORD $0x2141; BYTE $0xd4 // and r12d,edx
231 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
232 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
233 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
234 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
235 LONG $0x22148d47 // lea r10d,[r10+r12*1]
236 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
237 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
238 WORD $0x3141; BYTE $0xfd // xor r13d,edi
239 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
240 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
241 LONG $0x22148d47 // lea r10d,[r10+r12*1]
242 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
243 WORD $0x8944; BYTE $0xdf // mov edi,r11d
244 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
245 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
246 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
247 WORD $0xc731 // xor edi,eax
248 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
249 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
250 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
251 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
252 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
253 WORD $0x2141; BYTE $0xff // and r15d,edi
254 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
255 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
256 LONG $0xc4fefdc5 // vpaddd ymm0,ymm0,ymm4
257 WORD $0x3145; BYTE $0xee // xor r14d,r13d
258 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
259 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
260 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
261
262 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
263 LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88]
264 WORD $0x2141; BYTE $0xcc // and r12d,ecx
265 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
266 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
267 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
268 LONG $0x32148d47 // lea r10d,[r10+r14*1]
269 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
270 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
271 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
272 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
273 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
274 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
275 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
276 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
277 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
278 LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6
279 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
280 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
281 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
282 LONG $0xf870fdc5; BYTE $0x50 // vpshufd ymm7,ymm0,0x50
283 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
284 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
285 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
286 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
287 WORD $0x2144; BYTE $0xff // and edi,r15d
288 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
289 WORD $0x3144; BYTE $0xdf // xor edi,r11d
290 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
291 WORD $0x3145; BYTE $0xee // xor r14d,r13d
292 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
293 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
294 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
295
296 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
297 LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c]
298 WORD $0x2141; BYTE $0xdc // and r12d,ebx
299 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
300 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
301 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
302 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
303 LONG $0x20048d47 // lea r8d,[r8+r12*1]
304 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
305 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
306 WORD $0x3141; BYTE $0xfd // xor r13d,edi
307 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
308 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
309 LONG $0x20048d47 // lea r8d,[r8+r12*1]
310 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
311 WORD $0x8944; BYTE $0xcf // mov edi,r9d
312 LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6
313 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
314 LONG $0x28048d47 // lea r8d,[r8+r13*1]
315 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
316 LONG $0x75fefdc5; BYTE $0x00 // vpaddd ymm6,ymm0,[rbp+0x0]
317 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
318 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
319 LONG $0x00048d42 // lea eax,[rax+r8*1]
320 WORD $0x2141; BYTE $0xff // and r15d,edi
321 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
322 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
323 WORD $0x3145; BYTE $0xee // xor r14d,r13d
324 LONG $0x38048d47 // lea r8d,[r8+r15*1]
325 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
326
327 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
328 LONG $0x0f6de3c4; WORD $0x04e1 // vpalignr ymm4,ymm2,ymm1,0x4
329
330 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
331 LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0]
332 WORD $0x2141; BYTE $0xc4 // and r12d,eax
333 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
334 LONG $0x0f7de3c4; WORD $0x04fb // vpalignr ymm7,ymm0,ymm3,0x4
335 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
336 LONG $0x30048d47 // lea r8d,[r8+r14*1]
337 LONG $0x22148d42 // lea edx,[rdx+r12*1]
338 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
339 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
340 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
341 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
342 LONG $0xcffef5c5 // vpaddd ymm1,ymm1,ymm7
343 LONG $0x22148d42 // lea edx,[rdx+r12*1]
344 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
345 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
346 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
347 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
348 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
349 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
350 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
351 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
352 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
353 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
354 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
355 WORD $0x2144; BYTE $0xff // and edi,r15d
356 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
357 WORD $0x3144; BYTE $0xcf // xor edi,r9d
358 LONG $0xf870fdc5; BYTE $0xfa // vpshufd ymm7,ymm0,0xfa
359 WORD $0x3145; BYTE $0xee // xor r14d,r13d
360 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
361 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
362 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
363
364 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
365 LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4]
366 WORD $0x2145; BYTE $0xdc // and r12d,r11d
367 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
368 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
369 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
370 LONG $0x32148d42 // lea edx,[rdx+r14*1]
371 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
372 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
373 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
374 WORD $0x3141; BYTE $0xfd // xor r13d,edi
375 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
376 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
377 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
378 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
379 WORD $0xd789 // mov edi,edx
380 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
381 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
382 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
383 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
384 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
385 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
386 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
387 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
388 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
389 WORD $0x2141; BYTE $0xff // and r15d,edi
390 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
391 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
392 LONG $0xccfef5c5 // vpaddd ymm1,ymm1,ymm4
393 WORD $0x3145; BYTE $0xee // xor r14d,r13d
394 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
395 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
396 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
397
398 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
399 LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8]
400 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
401 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
402 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
403 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
404 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
405 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
406 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
407 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
408 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
409 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
410 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
411 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
412 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
413 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
414 LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6
415 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
416 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
417 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
418 LONG $0xf970fdc5; BYTE $0x50 // vpshufd ymm7,ymm1,0x50
419 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
420 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
421 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
422 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
423 WORD $0x2144; BYTE $0xff // and edi,r15d
424 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
425 WORD $0xd731 // xor edi,edx
426 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
427 WORD $0x3145; BYTE $0xee // xor r14d,r13d
428 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
429 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
430 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
431
432 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
433 LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac]
434 WORD $0x2145; BYTE $0xcc // and r12d,r9d
435 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
436 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
437 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
438 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
439 LONG $0x20048d42 // lea eax,[rax+r12*1]
440 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
441 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
442 WORD $0x3141; BYTE $0xfd // xor r13d,edi
443 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
444 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
445 LONG $0x20048d42 // lea eax,[rax+r12*1]
446 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
447 WORD $0xdf89 // mov edi,ebx
448 LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6
449 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
450 LONG $0x28048d42 // lea eax,[rax+r13*1]
451 WORD $0xcf31 // xor edi,ecx
452 LONG $0x75fef5c5; BYTE $0x20 // vpaddd ymm6,ymm1,[rbp+0x20]
453 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
454 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
455 LONG $0x00048d45 // lea r8d,[r8+rax*1]
456 WORD $0x2141; BYTE $0xff // and r15d,edi
457 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
458 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
459 WORD $0x3145; BYTE $0xee // xor r14d,r13d
460 LONG $0x38048d42 // lea eax,[rax+r15*1]
461 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
462
463 LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
464
465 LONG $0x24648d48; BYTE $0xc0 // lea rsp,[rsp-0x40]
466 LONG $0x0f65e3c4; WORD $0x04e2 // vpalignr ymm4,ymm3,ymm2,0x4
467
468 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
469 LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80]
470 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
471 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
472 LONG $0x0f75e3c4; WORD $0x04f8 // vpalignr ymm7,ymm1,ymm0,0x4
473 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
474 LONG $0x30048d42 // lea eax,[rax+r14*1]
475 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
476 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
477 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
478 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
479 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
480 LONG $0xd7feedc5 // vpaddd ymm2,ymm2,ymm7
481 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
482 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
483 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
484 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
485 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
486 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
487 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
488 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
489 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
490 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
491 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
492 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
493 WORD $0x2144; BYTE $0xff // and edi,r15d
494 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
495 WORD $0xdf31 // xor edi,ebx
496 LONG $0xf970fdc5; BYTE $0xfa // vpshufd ymm7,ymm1,0xfa
497 WORD $0x3145; BYTE $0xee // xor r14d,r13d
498 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
499 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
500 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
501
502 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
503 LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84]
504 WORD $0x2141; BYTE $0xd4 // and r12d,edx
505 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
506 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
507 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
508 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
509 LONG $0x22148d47 // lea r10d,[r10+r12*1]
510 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
511 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
512 WORD $0x3141; BYTE $0xfd // xor r13d,edi
513 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
514 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
515 LONG $0x22148d47 // lea r10d,[r10+r12*1]
516 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
517 WORD $0x8944; BYTE $0xdf // mov edi,r11d
518 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
519 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
520 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
521 WORD $0xc731 // xor edi,eax
522 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
523 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
524 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
525 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
526 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
527 WORD $0x2141; BYTE $0xff // and r15d,edi
528 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
529 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
530 LONG $0xd4feedc5 // vpaddd ymm2,ymm2,ymm4
531 WORD $0x3145; BYTE $0xee // xor r14d,r13d
532 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
533 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
534 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
535
536 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
537 LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88]
538 WORD $0x2141; BYTE $0xcc // and r12d,ecx
539 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
540 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
541 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
542 LONG $0x32148d47 // lea r10d,[r10+r14*1]
543 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
544 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
545 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
546 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
547 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
548 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
549 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
550 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
551 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
552 LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6
553 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
554 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
555 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
556 LONG $0xfa70fdc5; BYTE $0x50 // vpshufd ymm7,ymm2,0x50
557 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
558 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
559 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
560 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
561 WORD $0x2144; BYTE $0xff // and edi,r15d
562 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
563 WORD $0x3144; BYTE $0xdf // xor edi,r11d
564 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
565 WORD $0x3145; BYTE $0xee // xor r14d,r13d
566 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
567 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
568 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
569
570 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
571 LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c]
572 WORD $0x2141; BYTE $0xdc // and r12d,ebx
573 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
574 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
575 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
576 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
577 LONG $0x20048d47 // lea r8d,[r8+r12*1]
578 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
579 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
580 WORD $0x3141; BYTE $0xfd // xor r13d,edi
581 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
582 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
583 LONG $0x20048d47 // lea r8d,[r8+r12*1]
584 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
585 WORD $0x8944; BYTE $0xcf // mov edi,r9d
586 LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6
587 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
588 LONG $0x28048d47 // lea r8d,[r8+r13*1]
589 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
590 LONG $0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,[rbp+0x40]
591 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
592 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
593 LONG $0x00048d42 // lea eax,[rax+r8*1]
594 WORD $0x2141; BYTE $0xff // and r15d,edi
595 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
596 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
597 WORD $0x3145; BYTE $0xee // xor r14d,r13d
598 LONG $0x38048d47 // lea r8d,[r8+r15*1]
599 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
600
601 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
602 LONG $0x0f7de3c4; WORD $0x04e3 // vpalignr ymm4,ymm0,ymm3,0x4
603
604 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
605 LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0]
606 WORD $0x2141; BYTE $0xc4 // and r12d,eax
607 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
608 LONG $0x0f6de3c4; WORD $0x04f9 // vpalignr ymm7,ymm2,ymm1,0x4
609 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
610 LONG $0x30048d47 // lea r8d,[r8+r14*1]
611 LONG $0x22148d42 // lea edx,[rdx+r12*1]
612 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
613 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
614 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
615 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
616 LONG $0xdffee5c5 // vpaddd ymm3,ymm3,ymm7
617 LONG $0x22148d42 // lea edx,[rdx+r12*1]
618 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
619 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
620 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
621 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
622 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
623 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
624 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
625 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
626 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
627 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
628 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
629 WORD $0x2144; BYTE $0xff // and edi,r15d
630 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
631 WORD $0x3144; BYTE $0xcf // xor edi,r9d
632 LONG $0xfa70fdc5; BYTE $0xfa // vpshufd ymm7,ymm2,0xfa
633 WORD $0x3145; BYTE $0xee // xor r14d,r13d
634 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
635 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
636 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
637
638 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
639 LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4]
640 WORD $0x2145; BYTE $0xdc // and r12d,r11d
641 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
642 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
643 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
644 LONG $0x32148d42 // lea edx,[rdx+r14*1]
645 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
646 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
647 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
648 WORD $0x3141; BYTE $0xfd // xor r13d,edi
649 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
650 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
651 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
652 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
653 WORD $0xd789 // mov edi,edx
654 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
655 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
656 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
657 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
658 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
659 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
660 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
661 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
662 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
663 WORD $0x2141; BYTE $0xff // and r15d,edi
664 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
665 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
666 LONG $0xdcfee5c5 // vpaddd ymm3,ymm3,ymm4
667 WORD $0x3145; BYTE $0xee // xor r14d,r13d
668 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
669 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
670 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
671
672 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
673 LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8]
674 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
675 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
676 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
677 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
678 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
679 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
680 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
681 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
682 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
683 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
684 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
685 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
686 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
687 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
688 LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6
689 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
690 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
691 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
692 LONG $0xfb70fdc5; BYTE $0x50 // vpshufd ymm7,ymm3,0x50
693 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
694 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
695 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
696 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
697 WORD $0x2144; BYTE $0xff // and edi,r15d
698 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
699 WORD $0xd731 // xor edi,edx
700 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
701 WORD $0x3145; BYTE $0xee // xor r14d,r13d
702 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
703 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
704 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
705
706 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
707 LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac]
708 WORD $0x2145; BYTE $0xcc // and r12d,r9d
709 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
710 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
711 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
712 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
713 LONG $0x20048d42 // lea eax,[rax+r12*1]
714 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
715 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
716 WORD $0x3141; BYTE $0xfd // xor r13d,edi
717 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
718 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
719 LONG $0x20048d42 // lea eax,[rax+r12*1]
720 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
721 WORD $0xdf89 // mov edi,ebx
722 LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6
723 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
724 LONG $0x28048d42 // lea eax,[rax+r13*1]
725 WORD $0xcf31 // xor edi,ecx
726 LONG $0x75fee5c5; BYTE $0x60 // vpaddd ymm6,ymm3,[rbp+0x60]
727 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
728 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
729 LONG $0x00048d45 // lea r8d,[r8+rax*1]
730 WORD $0x2141; BYTE $0xff // and r15d,edi
731 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
732 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
733 WORD $0x3145; BYTE $0xee // xor r14d,r13d
734 LONG $0x38048d42 // lea eax,[rax+r15*1]
735 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
736
737 LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
738 ADDQ $0x80, BP
739
740 CMPB 0x3(BP), $0x0
741 JNE loop1
742
743 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x40)
744 LONG $0x245c0344; BYTE $0x40 // add r11d,[rsp+0x40]
745 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
746 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
747 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
748 LONG $0x30048d42 // lea eax,[rax+r14*1]
749 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
750 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
751 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
752 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
753 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
754 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
755 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
756 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
757 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
758 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
759 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
760 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
761 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
762 WORD $0x2144; BYTE $0xff // and edi,r15d
763 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
764 WORD $0xdf31 // xor edi,ebx
765 WORD $0x3145; BYTE $0xee // xor r14d,r13d
766 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
767 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
768
769 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x44)
770 LONG $0x24540344; BYTE $0x44 // add r10d,[rsp+0x44]
771 WORD $0x2141; BYTE $0xd4 // and r12d,edx
772 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
773 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
774 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
775 LONG $0x22148d47 // lea r10d,[r10+r12*1]
776 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
777 WORD $0x3141; BYTE $0xfd // xor r13d,edi
778 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
779 LONG $0x22148d47 // lea r10d,[r10+r12*1]
780 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
781 WORD $0x8944; BYTE $0xdf // mov edi,r11d
782 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
783 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
784 WORD $0xc731 // xor edi,eax
785 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
786 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
787 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
788 WORD $0x2141; BYTE $0xff // and r15d,edi
789 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
790 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
791 WORD $0x3145; BYTE $0xee // xor r14d,r13d
792 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
793 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
794
795 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x48)
796 LONG $0x244c0344; BYTE $0x48 // add r9d,[rsp+0x48]
797 WORD $0x2141; BYTE $0xcc // and r12d,ecx
798 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
799 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
800 LONG $0x32148d47 // lea r10d,[r10+r14*1]
801 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
802 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
803 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
804 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
805 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
806 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
807 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
808 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
809 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
810 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
811 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
812 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
813 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
814 WORD $0x2144; BYTE $0xff // and edi,r15d
815 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
816 WORD $0x3144; BYTE $0xdf // xor edi,r11d
817 WORD $0x3145; BYTE $0xee // xor r14d,r13d
818 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
819 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
820
821 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x4c)
822 LONG $0x24440344; BYTE $0x4c // add r8d,[rsp+0x4c]
823 WORD $0x2141; BYTE $0xdc // and r12d,ebx
824 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
825 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
826 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
827 LONG $0x20048d47 // lea r8d,[r8+r12*1]
828 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
829 WORD $0x3141; BYTE $0xfd // xor r13d,edi
830 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
831 LONG $0x20048d47 // lea r8d,[r8+r12*1]
832 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
833 WORD $0x8944; BYTE $0xcf // mov edi,r9d
834 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
835 LONG $0x28048d47 // lea r8d,[r8+r13*1]
836 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
837 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
838 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
839 LONG $0x00048d42 // lea eax,[rax+r8*1]
840 WORD $0x2141; BYTE $0xff // and r15d,edi
841 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
842 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
843 WORD $0x3145; BYTE $0xee // xor r14d,r13d
844 LONG $0x38048d47 // lea r8d,[r8+r15*1]
845 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
846
847 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x60)
848 LONG $0x60245403 // add edx,[rsp+0x60]
849 WORD $0x2141; BYTE $0xc4 // and r12d,eax
850 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
851 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
852 LONG $0x30048d47 // lea r8d,[r8+r14*1]
853 LONG $0x22148d42 // lea edx,[rdx+r12*1]
854 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
855 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
856 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
857 LONG $0x22148d42 // lea edx,[rdx+r12*1]
858 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
859 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
860 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
861 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
862 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
863 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
864 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
865 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
866 WORD $0x2144; BYTE $0xff // and edi,r15d
867 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
868 WORD $0x3144; BYTE $0xcf // xor edi,r9d
869 WORD $0x3145; BYTE $0xee // xor r14d,r13d
870 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
871 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
872
873 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x64)
874 LONG $0x64244c03 // add ecx,[rsp+0x64]
875 WORD $0x2145; BYTE $0xdc // and r12d,r11d
876 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
877 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
878 LONG $0x32148d42 // lea edx,[rdx+r14*1]
879 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
880 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
881 WORD $0x3141; BYTE $0xfd // xor r13d,edi
882 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
883 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
884 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
885 WORD $0xd789 // mov edi,edx
886 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
887 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
888 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
889 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
890 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
891 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
892 WORD $0x2141; BYTE $0xff // and r15d,edi
893 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
894 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
895 WORD $0x3145; BYTE $0xee // xor r14d,r13d
896 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
897 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
898
899 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x68)
900 LONG $0x68245c03 // add ebx,[rsp+0x68]
901 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
902 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
903 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
904 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
905 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
906 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
907 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
908 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
909 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
910 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
911 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
912 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
913 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
914 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
915 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
916 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
917 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
918 WORD $0x2144; BYTE $0xff // and edi,r15d
919 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
920 WORD $0xd731 // xor edi,edx
921 WORD $0x3145; BYTE $0xee // xor r14d,r13d
922 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
923 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
924
925 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x6c)
926 LONG $0x6c244403 // add eax,[rsp+0x6c]
927 WORD $0x2145; BYTE $0xcc // and r12d,r9d
928 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
929 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
930 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
931 LONG $0x20048d42 // lea eax,[rax+r12*1]
932 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
933 WORD $0x3141; BYTE $0xfd // xor r13d,edi
934 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
935 LONG $0x20048d42 // lea eax,[rax+r12*1]
936 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
937 WORD $0xdf89 // mov edi,ebx
938 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
939 LONG $0x28048d42 // lea eax,[rax+r13*1]
940 WORD $0xcf31 // xor edi,ecx
941 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
942 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
943 LONG $0x00048d45 // lea r8d,[r8+rax*1]
944 WORD $0x2141; BYTE $0xff // and r15d,edi
945 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
946 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
947 WORD $0x3145; BYTE $0xee // xor r14d,r13d
948 LONG $0x38048d42 // lea eax,[rax+r15*1]
949 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
950
951 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x00)
952 LONG $0x241c0344 // add r11d,[rsp]
953 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
954 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
955 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
956 LONG $0x30048d42 // lea eax,[rax+r14*1]
957 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
958 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
959 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
960 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
961 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
962 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
963 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
964 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
965 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
966 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
967 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
968 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
969 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
970 WORD $0x2144; BYTE $0xff // and edi,r15d
971 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
972 WORD $0xdf31 // xor edi,ebx
973 WORD $0x3145; BYTE $0xee // xor r14d,r13d
974 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
975 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
976
977 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x04)
978 LONG $0x24540344; BYTE $0x04 // add r10d,[rsp+0x4]
979 WORD $0x2141; BYTE $0xd4 // and r12d,edx
980 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
981 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
982 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
983 LONG $0x22148d47 // lea r10d,[r10+r12*1]
984 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
985 WORD $0x3141; BYTE $0xfd // xor r13d,edi
986 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
987 LONG $0x22148d47 // lea r10d,[r10+r12*1]
988 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
989 WORD $0x8944; BYTE $0xdf // mov edi,r11d
990 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
991 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
992 WORD $0xc731 // xor edi,eax
993 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
994 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
995 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
996 WORD $0x2141; BYTE $0xff // and r15d,edi
997 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
998 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
999 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1000 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
1001 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
1002
1003 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x08)
1004 LONG $0x244c0344; BYTE $0x08 // add r9d,[rsp+0x8]
1005 WORD $0x2141; BYTE $0xcc // and r12d,ecx
1006 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
1007 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
1008 LONG $0x32148d47 // lea r10d,[r10+r14*1]
1009 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1010 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
1011 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1012 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
1013 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1014 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1015 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
1016 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
1017 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
1018 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
1019 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
1020 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
1021 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
1022 WORD $0x2144; BYTE $0xff // and edi,r15d
1023 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1024 WORD $0x3144; BYTE $0xdf // xor edi,r11d
1025 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1026 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
1027 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
1028
1029 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x0c)
1030 LONG $0x24440344; BYTE $0x0c // add r8d,[rsp+0xc]
1031 WORD $0x2141; BYTE $0xdc // and r12d,ebx
1032 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
1033 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
1034 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
1035 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1036 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
1037 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1038 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
1039 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1040 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1041 WORD $0x8944; BYTE $0xcf // mov edi,r9d
1042 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
1043 LONG $0x28048d47 // lea r8d,[r8+r13*1]
1044 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
1045 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
1046 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
1047 LONG $0x00048d42 // lea eax,[rax+r8*1]
1048 WORD $0x2141; BYTE $0xff // and r15d,edi
1049 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1050 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
1051 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1052 LONG $0x38048d47 // lea r8d,[r8+r15*1]
1053 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
1054
1055 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x20)
1056 LONG $0x20245403 // add edx,[rsp+0x20]
1057 WORD $0x2141; BYTE $0xc4 // and r12d,eax
1058 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
1059 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
1060 LONG $0x30048d47 // lea r8d,[r8+r14*1]
1061 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1062 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
1063 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1064 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
1065 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1066 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1067 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
1068 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
1069 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
1070 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
1071 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
1072 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
1073 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
1074 WORD $0x2144; BYTE $0xff // and edi,r15d
1075 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1076 WORD $0x3144; BYTE $0xcf // xor edi,r9d
1077 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1078 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
1079 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
1080
1081 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x24)
1082 LONG $0x24244c03 // add ecx,[rsp+0x24]
1083 WORD $0x2145; BYTE $0xdc // and r12d,r11d
1084 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
1085 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
1086 LONG $0x32148d42 // lea edx,[rdx+r14*1]
1087 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1088 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
1089 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1090 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
1091 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1092 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1093 WORD $0xd789 // mov edi,edx
1094 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
1095 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
1096 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
1097 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
1098 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
1099 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
1100 WORD $0x2141; BYTE $0xff // and r15d,edi
1101 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1102 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
1103 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1104 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
1105 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
1106
1107 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x28)
1108 LONG $0x28245c03 // add ebx,[rsp+0x28]
1109 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
1110 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
1111 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
1112 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
1113 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1114 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
1115 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1116 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
1117 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1118 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1119 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
1120 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
1121 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
1122 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
1123 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
1124 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
1125 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
1126 WORD $0x2144; BYTE $0xff // and edi,r15d
1127 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1128 WORD $0xd731 // xor edi,edx
1129 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1130 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
1131 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
1132
1133 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x2c)
1134 LONG $0x2c244403 // add eax,[rsp+0x2c]
1135 WORD $0x2145; BYTE $0xcc // and r12d,r9d
1136 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
1137 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
1138 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
1139 LONG $0x20048d42 // lea eax,[rax+r12*1]
1140 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
1141 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1142 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
1143 LONG $0x20048d42 // lea eax,[rax+r12*1]
1144 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1145 WORD $0xdf89 // mov edi,ebx
1146 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
1147 LONG $0x28048d42 // lea eax,[rax+r13*1]
1148 WORD $0xcf31 // xor edi,ecx
1149 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
1150 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
1151 LONG $0x00048d45 // lea r8d,[r8+rax*1]
1152 WORD $0x2141; BYTE $0xff // and r15d,edi
1153 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1154 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
1155 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1156 LONG $0x38048d42 // lea eax,[rax+r15*1]
1157 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
1158
1159 MOVQ 0x200(SP), DI // $_ctx
1160 ADDQ R14, AX
1161
1162 LEAQ 0x1c0(SP), BP
1163
1164 ADDL (DI), AX
1165 ADDL 4(DI), BX
1166 ADDL 8(DI), CX
1167 ADDL 12(DI), DX
1168 ADDL 16(DI), R8
1169 ADDL 20(DI), R9
1170 ADDL 24(DI), R10
1171 ADDL 28(DI), R11
1172
1173 MOVL AX, (DI)
1174 MOVL BX, 4(DI)
1175 MOVL CX, 8(DI)
1176 MOVL DX, 12(DI)
1177 MOVL R8, 16(DI)
1178 MOVL R9, 20(DI)
1179 MOVL R10, 24(DI)
1180 MOVL R11, 28(DI)
1181
1182 CMPQ SI, 0x50(BP) // $_end
1183 JE done
1184
1185 XORQ R14, R14
1186 MOVQ BX, DI
1187 XORQ CX, DI // magic
1188 MOVQ R9, R12
11891189
11901190 loop2:
1191 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, BP, 0x10)
1192 LONG $0x105d0344 // add r11d,[rbp+0x10]
1193 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
1194 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
1195 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
1196 LONG $0x30048d42 // lea eax,[rax+r14*1]
1197 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
1198 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
1199 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1200 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
1201 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
1202 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1203 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
1204 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
1205 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
1206 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
1207 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
1208 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
1209 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
1210 WORD $0x2144; BYTE $0xff // and edi,r15d
1211 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1212 WORD $0xdf31 // xor edi,ebx
1213 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1214 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
1215 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
1216
1217 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, BP, 0x14)
1218 LONG $0x14550344 // add r10d,[rbp+0x14]
1219 WORD $0x2141; BYTE $0xd4 // and r12d,edx
1220 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
1221 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
1222 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
1223 LONG $0x22148d47 // lea r10d,[r10+r12*1]
1224 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
1225 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1226 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
1227 LONG $0x22148d47 // lea r10d,[r10+r12*1]
1228 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1229 WORD $0x8944; BYTE $0xdf // mov edi,r11d
1230 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
1231 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
1232 WORD $0xc731 // xor edi,eax
1233 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
1234 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
1235 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
1236 WORD $0x2141; BYTE $0xff // and r15d,edi
1237 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1238 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
1239 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1240 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
1241 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
1242
1243 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, BP, 0x18)
1244 LONG $0x184d0344 // add r9d,[rbp+0x18]
1245 WORD $0x2141; BYTE $0xcc // and r12d,ecx
1246 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
1247 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
1248 LONG $0x32148d47 // lea r10d,[r10+r14*1]
1249 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1250 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
1251 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1252 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
1253 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1254 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1255 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
1256 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
1257 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
1258 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
1259 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
1260 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
1261 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
1262 WORD $0x2144; BYTE $0xff // and edi,r15d
1263 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1264 WORD $0x3144; BYTE $0xdf // xor edi,r11d
1265 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1266 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
1267 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
1268
1269 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, BP, 0x1c)
1270 LONG $0x1c450344 // add r8d,[rbp+0x1c]
1271 WORD $0x2141; BYTE $0xdc // and r12d,ebx
1272 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
1273 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
1274 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
1275 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1276 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
1277 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1278 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
1279 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1280 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1281 WORD $0x8944; BYTE $0xcf // mov edi,r9d
1282 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
1283 LONG $0x28048d47 // lea r8d,[r8+r13*1]
1284 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
1285 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
1286 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
1287 LONG $0x00048d42 // lea eax,[rax+r8*1]
1288 WORD $0x2141; BYTE $0xff // and r15d,edi
1289 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1290 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
1291 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1292 LONG $0x38048d47 // lea r8d,[r8+r15*1]
1293 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
1294
1295 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, BP, 0x30)
1296 WORD $0x5503; BYTE $0x30 // add edx,[rbp+0x30]
1297 WORD $0x2141; BYTE $0xc4 // and r12d,eax
1298 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
1299 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
1300 LONG $0x30048d47 // lea r8d,[r8+r14*1]
1301 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1302 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
1303 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1304 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
1305 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1306 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1307 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
1308 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
1309 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
1310 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
1311 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
1312 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
1313 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
1314 WORD $0x2144; BYTE $0xff // and edi,r15d
1315 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1316 WORD $0x3144; BYTE $0xcf // xor edi,r9d
1317 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1318 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
1319 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
1320
1321 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, BP, 0x34)
1322 WORD $0x4d03; BYTE $0x34 // add ecx,[rbp+0x34]
1323 WORD $0x2145; BYTE $0xdc // and r12d,r11d
1324 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
1325 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
1326 LONG $0x32148d42 // lea edx,[rdx+r14*1]
1327 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1328 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
1329 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1330 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
1331 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1332 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1333 WORD $0xd789 // mov edi,edx
1334 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
1335 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
1336 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
1337 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
1338 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
1339 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
1340 WORD $0x2141; BYTE $0xff // and r15d,edi
1341 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1342 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
1343 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1344 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
1345 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
1346
1347 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, BP, 0x38)
1348 WORD $0x5d03; BYTE $0x38 // add ebx,[rbp+0x38]
1349 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
1350 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
1351 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
1352 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
1353 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1354 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
1355 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1356 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
1357 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1358 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1359 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
1360 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
1361 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
1362 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
1363 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
1364 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
1365 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
1366 WORD $0x2144; BYTE $0xff // and edi,r15d
1367 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1368 WORD $0xd731 // xor edi,edx
1369 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1370 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
1371 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
1372
1373 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, BP, 0x3c)
1374 WORD $0x4503; BYTE $0x3c // add eax,[rbp+0x3c]
1375 WORD $0x2145; BYTE $0xcc // and r12d,r9d
1376 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
1377 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
1378 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
1379 LONG $0x20048d42 // lea eax,[rax+r12*1]
1380 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
1381 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1382 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
1383 LONG $0x20048d42 // lea eax,[rax+r12*1]
1384 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1385 WORD $0xdf89 // mov edi,ebx
1386 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
1387 LONG $0x28048d42 // lea eax,[rax+r13*1]
1388 WORD $0xcf31 // xor edi,ecx
1389 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
1390 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
1391 LONG $0x00048d45 // lea r8d,[r8+rax*1]
1392 WORD $0x2141; BYTE $0xff // and r15d,edi
1393 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1394 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
1395 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1396 LONG $0x38048d42 // lea eax,[rax+r15*1]
1397 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
1398
1399 ADDQ $-0x40, BP
1400 CMPQ BP, SP
1401 JAE loop2
1402
1403 MOVQ 0x200(SP), DI // $_ctx
1404 ADDQ R14, AX
1405
1406 ADDQ $0x1c0, SP
1407
1408 ADDL (DI), AX
1409 ADDL 4(DI), BX
1410 ADDL 8(DI), CX
1411 ADDL 12(DI), DX
1412 ADDL 16(DI), R8
1413 ADDL 20(DI), R9
1414
1415 ADDQ $0x80, SI // input += 2
1416 ADDL 24(DI), R10
1417 MOVQ SI, R12
1418 ADDL 28(DI), R11
1419 CMPQ SI, 0x50(SP) // input == _end
1420
1421 MOVL AX, (DI)
1422 LONG $0xe4440f4c // cmove r12,rsp /* next block or stale data */
1423 MOVL AX, (DI)
1424 MOVL BX, 4(DI)
1425 MOVL CX, 8(DI)
1426 MOVL DX, 12(DI)
1427 MOVL R8, 16(DI)
1428 MOVL R9, 20(DI)
1429 MOVL R10, 24(DI)
1430 MOVL R11, 28(DI)
1431
1432 JBE loop0
1433 LEAQ (SP), BP
1191 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, BP, 0x10)
1192 LONG $0x105d0344 // add r11d,[rbp+0x10]
1193 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
1194 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
1195 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
1196 LONG $0x30048d42 // lea eax,[rax+r14*1]
1197 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
1198 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
1199 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1200 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
1201 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
1202 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1203 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
1204 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
1205 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
1206 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
1207 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
1208 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
1209 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
1210 WORD $0x2144; BYTE $0xff // and edi,r15d
1211 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1212 WORD $0xdf31 // xor edi,ebx
1213 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1214 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
1215 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
1216
1217 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, BP, 0x14)
1218 LONG $0x14550344 // add r10d,[rbp+0x14]
1219 WORD $0x2141; BYTE $0xd4 // and r12d,edx
1220 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
1221 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
1222 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
1223 LONG $0x22148d47 // lea r10d,[r10+r12*1]
1224 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
1225 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1226 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
1227 LONG $0x22148d47 // lea r10d,[r10+r12*1]
1228 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1229 WORD $0x8944; BYTE $0xdf // mov edi,r11d
1230 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
1231 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
1232 WORD $0xc731 // xor edi,eax
1233 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
1234 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
1235 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
1236 WORD $0x2141; BYTE $0xff // and r15d,edi
1237 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1238 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
1239 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1240 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
1241 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
1242
1243 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, BP, 0x18)
1244 LONG $0x184d0344 // add r9d,[rbp+0x18]
1245 WORD $0x2141; BYTE $0xcc // and r12d,ecx
1246 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
1247 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
1248 LONG $0x32148d47 // lea r10d,[r10+r14*1]
1249 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1250 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
1251 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1252 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
1253 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1254 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1255 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
1256 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
1257 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
1258 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
1259 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
1260 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
1261 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
1262 WORD $0x2144; BYTE $0xff // and edi,r15d
1263 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1264 WORD $0x3144; BYTE $0xdf // xor edi,r11d
1265 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1266 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
1267 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
1268
1269 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, BP, 0x1c)
1270 LONG $0x1c450344 // add r8d,[rbp+0x1c]
1271 WORD $0x2141; BYTE $0xdc // and r12d,ebx
1272 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
1273 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
1274 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
1275 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1276 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
1277 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1278 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
1279 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1280 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1281 WORD $0x8944; BYTE $0xcf // mov edi,r9d
1282 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
1283 LONG $0x28048d47 // lea r8d,[r8+r13*1]
1284 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
1285 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
1286 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
1287 LONG $0x00048d42 // lea eax,[rax+r8*1]
1288 WORD $0x2141; BYTE $0xff // and r15d,edi
1289 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1290 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
1291 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1292 LONG $0x38048d47 // lea r8d,[r8+r15*1]
1293 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
1294
1295 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, BP, 0x30)
1296 WORD $0x5503; BYTE $0x30 // add edx,[rbp+0x30]
1297 WORD $0x2141; BYTE $0xc4 // and r12d,eax
1298 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
1299 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
1300 LONG $0x30048d47 // lea r8d,[r8+r14*1]
1301 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1302 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
1303 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1304 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
1305 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1306 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1307 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
1308 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
1309 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
1310 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
1311 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
1312 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
1313 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
1314 WORD $0x2144; BYTE $0xff // and edi,r15d
1315 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1316 WORD $0x3144; BYTE $0xcf // xor edi,r9d
1317 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1318 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
1319 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
1320
1321 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, BP, 0x34)
1322 WORD $0x4d03; BYTE $0x34 // add ecx,[rbp+0x34]
1323 WORD $0x2145; BYTE $0xdc // and r12d,r11d
1324 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
1325 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
1326 LONG $0x32148d42 // lea edx,[rdx+r14*1]
1327 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1328 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
1329 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1330 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
1331 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1332 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1333 WORD $0xd789 // mov edi,edx
1334 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
1335 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
1336 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
1337 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
1338 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
1339 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
1340 WORD $0x2141; BYTE $0xff // and r15d,edi
1341 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1342 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
1343 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1344 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
1345 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
1346
1347 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, BP, 0x38)
1348 WORD $0x5d03; BYTE $0x38 // add ebx,[rbp+0x38]
1349 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
1350 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
1351 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
1352 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
1353 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1354 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
1355 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1356 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
1357 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1358 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1359 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
1360 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
1361 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
1362 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
1363 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
1364 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
1365 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
1366 WORD $0x2144; BYTE $0xff // and edi,r15d
1367 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1368 WORD $0xd731 // xor edi,edx
1369 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1370 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
1371 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
1372
1373 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, BP, 0x3c)
1374 WORD $0x4503; BYTE $0x3c // add eax,[rbp+0x3c]
1375 WORD $0x2145; BYTE $0xcc // and r12d,r9d
1376 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
1377 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
1378 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
1379 LONG $0x20048