Codebase list golang-github-minio-sha256-simd / 83f22cf7-9dc9-4c54-94b8-f266a3d606c0/main
New upstream snapshot. Debian Janitor 7 months ago
33 changed file(s) with 139 addition(s) and 2892 deletion(s). Raw diff Collapse all Expand all
0 name: Go
1
2 on:
3 pull_request:
4 branches:
5 - master
6 push:
7 branches:
8 - master
9
10 jobs:
11 build:
12 name: Test on Go ${{ matrix.go-version }} and ${{ matrix.os }}
13 runs-on: ${{ matrix.os }}
14 strategy:
15 max-parallel: 4
16 matrix:
17 go-version: [1.16.x, 1.15.x, 1.14.x]
18 os: [ubuntu-latest, windows-latest, macos-latest]
19 steps:
20 - name: Set up Go ${{ matrix.go-version }}
21 uses: actions/setup-go@v1
22 with:
23 go-version: ${{ matrix.go-version }}
24 id: go
25
26 - name: Check out code into the Go module directory
27 uses: actions/checkout@v1
28
29 - name: Build on ${{ matrix.os }}
30 if: matrix.os == 'windows-latest'
31 run: go test -race -v ./...
32 - name: Build on ${{ matrix.os }}
33 if: matrix.os == 'macos-latest'
34 run: go test -race -v ./...
35 - name: Build on ${{ matrix.os }}
36 if: matrix.os == 'ubuntu-latest'
37 run: |
38 diff -au <(gofmt -d .) <(printf "")
39 go test -race -v ./...
40 go vet -asmdecl .
41 ./test-architectures.sh
+0
-25
.travis.yml less more
0 sudo: required
1 dist: trusty
2 language: go
3
4 os:
5 - linux
6
7 go:
8 - tip
9 - 1.12.x
10
11 env:
12 - ARCH=x86_64
13 - ARCH=i686
14
15 matrix:
16 fast_finish: true
17 allow_failures:
18 - go: tip
19
20 script:
21 - diff -au <(gofmt -d .) <(printf "")
22 - go test -race -v ./...
23 - go vet -asmdecl .
24 - ./test-architectures.sh
00 # sha256-simd
11
2 Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions and AVX2 for Intel and ARM64 for ARM. On AVX512 it provides an up to 8x improvement (over 3 GB/s per core) in comparison to AVX2. SHA Extensions give a performance boost of close to 4x over AVX2.
2 Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions for x86 and ARM64 for ARM.
3 On AVX512 it provides up to an 8x improvement (over 3 GB/s per core).
4 SHA Extensions give a performance boost of close to 4x over native.
35
46 ## Introduction
57
6 This package is designed as a replacement for `crypto/sha256`. For Intel CPUs it has two flavors for AVX512 and AVX2 (AVX/SSE are also supported). For ARM CPUs with the Cryptography Extensions, advantage is taken of the SHA2 instructions resulting in a massive performance improvement.
8 This package is designed as a replacement for `crypto/sha256`.
9 For ARM CPUs with the Cryptography Extensions, advantage is taken of the SHA2 instructions resulting in a massive performance improvement.
710
8 This package uses Golang assembly. The AVX512 version is based on the Intel's "multi-buffer crypto library for IPSec" whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al.
11 This package uses Golang assembly.
12 The AVX512 version is based on Intel's "multi-buffer crypto library for IPSec" whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al.
913
10 ## New: Support for Intel SHA Extensions
14 ## Support for Intel SHA Extensions
1115
1216 Support for the Intel SHA Extensions has been added by Kristofer Peterson (@svenski123), originally developed for spacemeshos [here](https://github.com/spacemeshos/POET/issues/23). On CPUs that support it (known thus far Intel Celeron J3455 and AMD Ryzen) it gives a significant boost in performance (with thanks to @AudriusButkevicius for reporting the results; full results [here](https://github.com/minio/sha256-simd/pull/37#issuecomment-451607827)).
1317
1721 BenchmarkHash5M 514.40 1975.17 3.84x
1822 ```
1923
20 Thanks to Kristofer Peterson, we also added additional performance changes such as optimized padding, endian conversions which sped up all implementations i.e. Intel SHA alone while doubled performance for small sizes, the other changes increased everything roughly 50%.
24 Thanks to Kristofer Peterson, we also added further performance improvements such as optimized padding and
25 endian conversions, which sped up all implementations: Intel SHA alone doubled performance for small sizes, while
26 the other changes increased everything by roughly 50%.
2127
2228 ## Support for AVX512
2329
5763
5864 ## Drop-In Replacement
5965
60 The following code snippet shows how you can use `github.com/minio/sha256-simd`. This will automatically select the fastest method for the architecture on which it will be executed.
66 The following code snippet shows how you can use `github.com/minio/sha256-simd`.
67 This will automatically select the fastest method for the architecture on which it will be executed.
6168
6269 ```go
6370 import "github.com/minio/sha256-simd"
7986 | 3.0 GHz Intel Xeon Platinum 8124M | AVX512 | 3498 |
8087 | 3.7 GHz AMD Ryzen 7 2700X | SHA Ext | 1979 |
8188 | 1.2 GHz ARM Cortex-A53 | ARM64 | 638 |
82 | 3.0 GHz Intel Xeon Platinum 8124M | AVX2 | 449 |
83 | 3.1 GHz Intel Core i7 | AVX | 362 |
84 | 3.1 GHz Intel Core i7 | SSE | 299 |
8589
8690 ## asm2plan9s
8791
+0
-32
appveyor.yml less more
0 # version format
1 version: "{build}"
2
3 # Operating system (build VM template)
4 os: Windows Server 2012 R2
5
6 # Platform.
7 platform: x64
8
9 clone_folder: c:\gopath\src\github.com\minio\sha256-simd
10
11 # environment variables
12 environment:
13 GOPATH: c:\gopath
14 GO15VENDOREXPERIMENT: 1
15
16 # scripts that run after cloning repository
17 install:
18 - set PATH=%GOPATH%\bin;c:\go\bin;%PATH%
19 - go version
20 - go env
21
22 # to run your custom scripts instead of automatic MSBuild
23 build_script:
24 - go test .
25 - go test -race .
26
27 # to disable automatic tests
28 test: off
29
30 # to disable deployment
31 deploy: off
+0
-119
cpuid.go less more
0 // Minio Cloud Storage, (C) 2016 Minio, Inc.
1 //
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14
15 package sha256
16
17 // True when SIMD instructions are available.
18 var avx512 bool
19 var avx2 bool
20 var avx bool
21 var sse bool
22 var sse2 bool
23 var sse3 bool
24 var ssse3 bool
25 var sse41 bool
26 var sse42 bool
27 var popcnt bool
28 var sha bool
29 var armSha = haveArmSha()
30
31 func init() {
32 var _xsave bool
33 var _osxsave bool
34 var _avx bool
35 var _avx2 bool
36 var _avx512f bool
37 var _avx512dq bool
38 // var _avx512pf bool
39 // var _avx512er bool
40 // var _avx512cd bool
41 var _avx512bw bool
42 var _avx512vl bool
43 var _sseState bool
44 var _avxState bool
45 var _opmaskState bool
46 var _zmmHI256State bool
47 var _hi16ZmmState bool
48
49 mfi, _, _, _ := cpuid(0)
50
51 if mfi >= 1 {
52 _, _, c, d := cpuid(1)
53
54 sse = (d & (1 << 25)) != 0
55 sse2 = (d & (1 << 26)) != 0
56 sse3 = (c & (1 << 0)) != 0
57 ssse3 = (c & (1 << 9)) != 0
58 sse41 = (c & (1 << 19)) != 0
59 sse42 = (c & (1 << 20)) != 0
60 popcnt = (c & (1 << 23)) != 0
61 _xsave = (c & (1 << 26)) != 0
62 _osxsave = (c & (1 << 27)) != 0
63 _avx = (c & (1 << 28)) != 0
64 }
65
66 if mfi >= 7 {
67 _, b, _, _ := cpuid(7)
68
69 _avx2 = (b & (1 << 5)) != 0
70 _avx512f = (b & (1 << 16)) != 0
71 _avx512dq = (b & (1 << 17)) != 0
72 // _avx512pf = (b & (1 << 26)) != 0
73 // _avx512er = (b & (1 << 27)) != 0
74 // _avx512cd = (b & (1 << 28)) != 0
75 _avx512bw = (b & (1 << 30)) != 0
76 _avx512vl = (b & (1 << 31)) != 0
77 sha = (b & (1 << 29)) != 0
78 }
79
80 // Stop here if XSAVE unsupported or not enabled
81 if !_xsave || !_osxsave {
82 return
83 }
84
85 if _xsave && _osxsave {
86 a, _ := xgetbv(0)
87
88 _sseState = (a & (1 << 1)) != 0
89 _avxState = (a & (1 << 2)) != 0
90 _opmaskState = (a & (1 << 5)) != 0
91 _zmmHI256State = (a & (1 << 6)) != 0
92 _hi16ZmmState = (a & (1 << 7)) != 0
93 } else {
94 _sseState = true
95 }
96
97 // Very unlikely that OS would enable XSAVE and then disable SSE
98 if !_sseState {
99 sse = false
100 sse2 = false
101 sse3 = false
102 ssse3 = false
103 sse41 = false
104 sse42 = false
105 }
106
107 if _avxState {
108 avx = _avx
109 avx2 = _avx2
110 }
111
112 if _opmaskState && _zmmHI256State && _hi16ZmmState {
113 avx512 = (_avx512f &&
114 _avx512dq &&
115 _avx512bw &&
116 _avx512vl)
117 }
118 }
+0
-24
cpuid_386.go less more
0 // Minio Cloud Storage, (C) 2016 Minio, Inc.
1 //
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14
15 package sha256
16
17 func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
18 func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
19 func xgetbv(index uint32) (eax, edx uint32)
20
21 func haveArmSha() bool {
22 return false
23 }
+0
-53
cpuid_386.s less more
0 // The MIT License (MIT)
1 //
2 // Copyright (c) 2015 Klaus Post
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in all
12 // copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 // SOFTWARE.
21
22 // +build 386,!gccgo
23
24 // func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
25 TEXT ·cpuid(SB), 7, $0
26 XORL CX, CX
27 MOVL op+0(FP), AX
28 CPUID
29 MOVL AX, eax+4(FP)
30 MOVL BX, ebx+8(FP)
31 MOVL CX, ecx+12(FP)
32 MOVL DX, edx+16(FP)
33 RET
34
35 // func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
36 TEXT ·cpuidex(SB), 7, $0
37 MOVL op+0(FP), AX
38 MOVL op2+4(FP), CX
39 CPUID
40 MOVL AX, eax+8(FP)
41 MOVL BX, ebx+12(FP)
42 MOVL CX, ecx+16(FP)
43 MOVL DX, edx+20(FP)
44 RET
45
46 // func xgetbv(index uint32) (eax, edx uint32)
47 TEXT ·xgetbv(SB), 7, $0
48 MOVL index+0(FP), CX
49 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
50 MOVL AX, eax+4(FP)
51 MOVL DX, edx+8(FP)
52 RET
+0
-24
cpuid_amd64.go less more
0 // Minio Cloud Storage, (C) 2016 Minio, Inc.
1 //
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14
15 package sha256
16
17 func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
18 func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
19 func xgetbv(index uint32) (eax, edx uint32)
20
21 func haveArmSha() bool {
22 return false
23 }
+0
-53
cpuid_amd64.s less more
0 // The MIT License (MIT)
1 //
2 // Copyright (c) 2015 Klaus Post
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in all
12 // copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 // SOFTWARE.
21
22 // +build amd64,!gccgo
23
24 // func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
25 TEXT ·cpuid(SB), 7, $0
26 XORQ CX, CX
27 MOVL op+0(FP), AX
28 CPUID
29 MOVL AX, eax+8(FP)
30 MOVL BX, ebx+12(FP)
31 MOVL CX, ecx+16(FP)
32 MOVL DX, edx+20(FP)
33 RET
34
35 // func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
36 TEXT ·cpuidex(SB), 7, $0
37 MOVL op+0(FP), AX
38 MOVL op2+4(FP), CX
39 CPUID
40 MOVL AX, eax+8(FP)
41 MOVL BX, ebx+12(FP)
42 MOVL CX, ecx+16(FP)
43 MOVL DX, edx+20(FP)
44 RET
45
46 // func xgetbv(index uint32) (eax, edx uint32)
47 TEXT ·xgetbv(SB), 7, $0
48 MOVL index+0(FP), CX
49 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
50 MOVL AX, eax+8(FP)
51 MOVL DX, edx+12(FP)
52 RET
+0
-32
cpuid_arm.go less more
0 // Minio Cloud Storage, (C) 2016 Minio, Inc.
1 //
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
14
15 package sha256
16
17 func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
18 return 0, 0, 0, 0
19 }
20
21 func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
22 return 0, 0, 0, 0
23 }
24
25 func xgetbv(index uint32) (eax, edx uint32) {
26 return 0, 0
27 }
28
29 func haveArmSha() bool {
30 return false
31 }
+0
-49
cpuid_linux_arm64.go less more
0 // +build arm64,linux
1
2 // Minio Cloud Storage, (C) 2016 Minio, Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 package sha256
18
19 import (
20 "bytes"
21 "io/ioutil"
22 )
23
24 func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
25 return 0, 0, 0, 0
26 }
27
28 func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
29 return 0, 0, 0, 0
30 }
31
32 func xgetbv(index uint32) (eax, edx uint32) {
33 return 0, 0
34 }
35
36 // File to check for cpu capabilities.
37 const procCPUInfo = "/proc/cpuinfo"
38
39 // Feature to check for.
40 const sha256Feature = "sha2"
41
42 func haveArmSha() bool {
43 cpuInfo, err := ioutil.ReadFile(procCPUInfo)
44 if err != nil {
45 return false
46 }
47 return bytes.Contains(cpuInfo, []byte(sha256Feature))
48 }
0 // Minio Cloud Storage, (C) 2016 Minio, Inc.
0 // Minio Cloud Storage, (C) 2021 Minio, Inc.
11 //
22 // Licensed under the Apache License, Version 2.0 (the "License");
33 // you may not use this file except in compliance with the License.
1212 // limitations under the License.
1313 //
1414
15 // +build !386,!amd64,!arm,!arm64 arm64,!linux
16
1715 package sha256
1816
19 func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
20 return 0, 0, 0, 0
17 import (
18 "bytes"
19 "io/ioutil"
20 "runtime"
21
22 "github.com/klauspost/cpuid/v2"
23 )
24
25 func hasArmSha2() bool {
26 if cpuid.CPU.Has(cpuid.SHA2) {
27 return true
28 }
29 if runtime.GOARCH != "arm64" || runtime.GOOS != "linux" {
30 return false
31 }
32
33 // Fall back to hacky cpuinfo parsing...
34 const procCPUInfo = "/proc/cpuinfo"
35
36 // Feature to check for.
37 const sha256Feature = "sha2"
38
39 cpuInfo, err := ioutil.ReadFile(procCPUInfo)
40 if err != nil {
41 return false
42 }
43 return bytes.Contains(cpuInfo, []byte(sha256Feature))
44
2145 }
22
23 func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
24 return 0, 0, 0, 0
25 }
26
27 func xgetbv(index uint32) (eax, edx uint32) {
28 return 0, 0
29 }
30
31 func haveArmSha() bool {
32 return false
33 }
0 golang-github-minio-sha256-simd (1.0.0+git20210617.1.99e45fa-1) UNRELEASED; urgency=low
1
2 * New upstream snapshot.
3
4 -- Debian Janitor <janitor@jelmer.uk> Fri, 01 Apr 2022 03:51:13 -0000
5
06 golang-github-minio-sha256-simd (0.1.1-1) unstable; urgency=medium
17
28 * New upstream release.
00 module github.com/minio/sha256-simd
11
2 go 1.12
2 go 1.13
3
4 require github.com/klauspost/cpuid/v2 v2.0.6
0 github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI=
1 github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
2020 "encoding/binary"
2121 "hash"
2222 "runtime"
23
24 "github.com/klauspost/cpuid/v2"
2325 )
2426
2527 // Size - The size of a SHA256 checksum in bytes.
6668
6769 const (
6870 blockfuncGeneric blockfuncType = iota
69 blockfuncAvx512 blockfuncType = iota
70 blockfuncAvx2 blockfuncType = iota
71 blockfuncAvx blockfuncType = iota
72 blockfuncSsse blockfuncType = iota
7371 blockfuncSha blockfuncType = iota
7472 blockfuncArm blockfuncType = iota
7573 )
7775 var blockfunc blockfuncType
7876
7977 func init() {
80 is386bit := runtime.GOARCH == "386"
81 isARM := runtime.GOARCH == "arm"
78 blockfunc = blockfuncGeneric
8279 switch {
83 case is386bit || isARM:
84 blockfunc = blockfuncGeneric
85 case sha && ssse3 && sse41:
80 case hasSHAExtensions():
8681 blockfunc = blockfuncSha
87 case avx2:
88 blockfunc = blockfuncAvx2
89 case avx:
90 blockfunc = blockfuncAvx
91 case ssse3:
92 blockfunc = blockfuncSsse
93 case armSha:
82 case hasArmSha2():
9483 blockfunc = blockfuncArm
9584 default:
9685 blockfunc = blockfuncGeneric
9786 }
87 }
88
89 var avx512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ, cpuid.AVX512BW, cpuid.AVX512VL)
90
91 // hasSHAExtensions reports whether the CPU supports SHA extensions.
92 func hasSHAExtensions() bool {
93 return cpuid.CPU.Supports(cpuid.SHA, cpuid.SSSE3, cpuid.SSE4) && runtime.GOARCH == "amd64"
9894 }
9995
10096 // New returns a new hash.Hash computing the SHA256 checksum.
277273 func block(dig *digest, p []byte) {
278274 if blockfunc == blockfuncSha {
279275 blockShaGo(dig, p)
280 } else if blockfunc == blockfuncAvx2 {
281 blockAvx2Go(dig, p)
282 } else if blockfunc == blockfuncAvx {
283 blockAvxGo(dig, p)
284 } else if blockfunc == blockfuncSsse {
285 blockSsseGo(dig, p)
286276 } else if blockfunc == blockfuncArm {
287277 blockArmGo(dig, p)
288278 } else if blockfunc == blockfuncGeneric {
5151 import (
5252 "encoding/hex"
5353 "fmt"
54 "runtime"
5554 "strings"
5655 "testing"
56
57 "github.com/klauspost/cpuid/v2"
5758 )
5859
5960 type sha256Test struct {
22242225 }
22252226 }
22262227
2227 if runtime.GOARCH == "386" || runtime.GOARCH == "arm" {
2228 // doesn't support anything but the generic version.
2229 return
2230 }
2231
2232 if sha && ssse3 && sse41 {
2228 if cpuid.CPU.Supports(cpuid.SHA, cpuid.SSSE3, cpuid.SSE4) {
22332229 blockfunc = blockfuncSha
22342230 for _, g := range golden {
22352231 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
22382234 }
22392235 }
22402236 }
2241 if avx2 {
2242 blockfunc = blockfuncAvx2
2237
2238 if hasArmSha2() {
2239 blockfunc = blockfuncArm
22432240 for _, g := range golden {
22442241 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
22452242 if Sum256([]byte(g.in)) != g.out {
2246 t.Fatalf("AVX2: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
2247 }
2248 }
2249 }
2250 if avx {
2251 blockfunc = blockfuncAvx
2252 for _, g := range golden {
2253 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
2254 if Sum256([]byte(g.in)) != g.out {
2255 t.Fatalf("AVX: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
2256 }
2257 }
2258 }
2259 if ssse3 {
2260 blockfunc = blockfuncSsse
2261 for _, g := range golden {
2262 s := fmt.Sprintf("%x", Sum256([]byte(g.in)))
2263 if Sum256([]byte(g.in)) != g.out {
2264 t.Fatalf("SSSE3: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
2243 t.Fatalf("ARM: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
22652244 }
22662245 }
22672246 }
23002279 t blockfuncType
23012280 f bool
23022281 }{
2303 {"SHA_", blockfuncSha, sha && sse41 && ssse3},
2304 {"AVX2", blockfuncAvx2, avx2},
2305 {"AVX_", blockfuncAvx, avx},
2306 {"SSSE", blockfuncSsse, ssse3},
2282 {"SHA_", blockfuncSha, hasSHAExtensions()},
23072283 {"GEN_", blockfuncGeneric, true},
23082284 }
23092285
+0
-22
sha256blockAvx2_amd64.go less more
0 //+build !noasm,!appengine
1
2 /*
3 * Minio Cloud Storage, (C) 2016 Minio, Inc.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package sha256
19
20 //go:noescape
21 func blockAvx2(h []uint32, message []uint8)
+0
-1449
sha256blockAvx2_amd64.s less more
0 //+build !noasm,!appengine
1
2 // SHA256 implementation for AVX2
3
4 //
5 // Minio Cloud Storage, (C) 2016 Minio, Inc.
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19
20 //
21 // This code is based on an Intel White-Paper:
22 // "Fast SHA-256 Implementations on Intel Architecture Processors"
23 //
24 // together with the reference implementation from the following authors:
25 // James Guilford <james.guilford@intel.com>
26 // Kirk Yap <kirk.s.yap@intel.com>
27 // Tim Chen <tim.c.chen@linux.intel.com>
28 //
29 // For Golang it has been converted to Plan 9 assembly with the help of
30 // github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
31 // equivalents
32 //
33
34 DATA K256<>+0x000(SB)/8, $0x71374491428a2f98
35 DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf
36 DATA K256<>+0x010(SB)/8, $0x71374491428a2f98
37 DATA K256<>+0x018(SB)/8, $0xe9b5dba5b5c0fbcf
38 DATA K256<>+0x020(SB)/8, $0x59f111f13956c25b
39 DATA K256<>+0x028(SB)/8, $0xab1c5ed5923f82a4
40 DATA K256<>+0x030(SB)/8, $0x59f111f13956c25b
41 DATA K256<>+0x038(SB)/8, $0xab1c5ed5923f82a4
42 DATA K256<>+0x040(SB)/8, $0x12835b01d807aa98
43 DATA K256<>+0x048(SB)/8, $0x550c7dc3243185be
44 DATA K256<>+0x050(SB)/8, $0x12835b01d807aa98
45 DATA K256<>+0x058(SB)/8, $0x550c7dc3243185be
46 DATA K256<>+0x060(SB)/8, $0x80deb1fe72be5d74
47 DATA K256<>+0x068(SB)/8, $0xc19bf1749bdc06a7
48 DATA K256<>+0x070(SB)/8, $0x80deb1fe72be5d74
49 DATA K256<>+0x078(SB)/8, $0xc19bf1749bdc06a7
50 DATA K256<>+0x080(SB)/8, $0xefbe4786e49b69c1
51 DATA K256<>+0x088(SB)/8, $0x240ca1cc0fc19dc6
52 DATA K256<>+0x090(SB)/8, $0xefbe4786e49b69c1
53 DATA K256<>+0x098(SB)/8, $0x240ca1cc0fc19dc6
54 DATA K256<>+0x0a0(SB)/8, $0x4a7484aa2de92c6f
55 DATA K256<>+0x0a8(SB)/8, $0x76f988da5cb0a9dc
56 DATA K256<>+0x0b0(SB)/8, $0x4a7484aa2de92c6f
57 DATA K256<>+0x0b8(SB)/8, $0x76f988da5cb0a9dc
58 DATA K256<>+0x0c0(SB)/8, $0xa831c66d983e5152
59 DATA K256<>+0x0c8(SB)/8, $0xbf597fc7b00327c8
60 DATA K256<>+0x0d0(SB)/8, $0xa831c66d983e5152
61 DATA K256<>+0x0d8(SB)/8, $0xbf597fc7b00327c8
62 DATA K256<>+0x0e0(SB)/8, $0xd5a79147c6e00bf3
63 DATA K256<>+0x0e8(SB)/8, $0x1429296706ca6351
64 DATA K256<>+0x0f0(SB)/8, $0xd5a79147c6e00bf3
65 DATA K256<>+0x0f8(SB)/8, $0x1429296706ca6351
66 DATA K256<>+0x100(SB)/8, $0x2e1b213827b70a85
67 DATA K256<>+0x108(SB)/8, $0x53380d134d2c6dfc
68 DATA K256<>+0x110(SB)/8, $0x2e1b213827b70a85
69 DATA K256<>+0x118(SB)/8, $0x53380d134d2c6dfc
70 DATA K256<>+0x120(SB)/8, $0x766a0abb650a7354
71 DATA K256<>+0x128(SB)/8, $0x92722c8581c2c92e
72 DATA K256<>+0x130(SB)/8, $0x766a0abb650a7354
73 DATA K256<>+0x138(SB)/8, $0x92722c8581c2c92e
74 DATA K256<>+0x140(SB)/8, $0xa81a664ba2bfe8a1
75 DATA K256<>+0x148(SB)/8, $0xc76c51a3c24b8b70
76 DATA K256<>+0x150(SB)/8, $0xa81a664ba2bfe8a1
77 DATA K256<>+0x158(SB)/8, $0xc76c51a3c24b8b70
78 DATA K256<>+0x160(SB)/8, $0xd6990624d192e819
79 DATA K256<>+0x168(SB)/8, $0x106aa070f40e3585
80 DATA K256<>+0x170(SB)/8, $0xd6990624d192e819
81 DATA K256<>+0x178(SB)/8, $0x106aa070f40e3585
82 DATA K256<>+0x180(SB)/8, $0x1e376c0819a4c116
83 DATA K256<>+0x188(SB)/8, $0x34b0bcb52748774c
84 DATA K256<>+0x190(SB)/8, $0x1e376c0819a4c116
85 DATA K256<>+0x198(SB)/8, $0x34b0bcb52748774c
86 DATA K256<>+0x1a0(SB)/8, $0x4ed8aa4a391c0cb3
87 DATA K256<>+0x1a8(SB)/8, $0x682e6ff35b9cca4f
88 DATA K256<>+0x1b0(SB)/8, $0x4ed8aa4a391c0cb3
89 DATA K256<>+0x1b8(SB)/8, $0x682e6ff35b9cca4f
90 DATA K256<>+0x1c0(SB)/8, $0x78a5636f748f82ee
91 DATA K256<>+0x1c8(SB)/8, $0x8cc7020884c87814
92 DATA K256<>+0x1d0(SB)/8, $0x78a5636f748f82ee
93 DATA K256<>+0x1d8(SB)/8, $0x8cc7020884c87814
94 DATA K256<>+0x1e0(SB)/8, $0xa4506ceb90befffa
95 DATA K256<>+0x1e8(SB)/8, $0xc67178f2bef9a3f7
96 DATA K256<>+0x1f0(SB)/8, $0xa4506ceb90befffa
97 DATA K256<>+0x1f8(SB)/8, $0xc67178f2bef9a3f7
98
99 DATA K256<>+0x200(SB)/8, $0x0405060700010203
100 DATA K256<>+0x208(SB)/8, $0x0c0d0e0f08090a0b
101 DATA K256<>+0x210(SB)/8, $0x0405060700010203
102 DATA K256<>+0x218(SB)/8, $0x0c0d0e0f08090a0b
103 DATA K256<>+0x220(SB)/8, $0x0b0a090803020100
104 DATA K256<>+0x228(SB)/8, $0xffffffffffffffff
105 DATA K256<>+0x230(SB)/8, $0x0b0a090803020100
106 DATA K256<>+0x238(SB)/8, $0xffffffffffffffff
107 DATA K256<>+0x240(SB)/8, $0xffffffffffffffff
108 DATA K256<>+0x248(SB)/8, $0x0b0a090803020100
109 DATA K256<>+0x250(SB)/8, $0xffffffffffffffff
110 DATA K256<>+0x258(SB)/8, $0x0b0a090803020100
111
112 GLOBL K256<>(SB), 8, $608
113
114 // We need 0x220 stack space aligned on a 512 boundary, so for the
115 // worstcase-aligned SP we need twice this amount, being 1088 (=0x440)
116 //
117 // SP aligned end-aligned stacksize
118 // 100013d0 10001400 10001620 592
119 // 100013d8 10001400 10001620 584
120 // 100013e0 10001600 10001820 1088
121 // 100013e8 10001600 10001820 1080
122
123 // func blockAvx2(h []uint32, message []uint8)
124 TEXT ·blockAvx2(SB),$1088-48
125
126 MOVQ h+0(FP), DI // DI: &h
127 MOVQ message_base+24(FP), SI // SI: &message
128 MOVQ message_len+32(FP), DX // len(message)
129 ADDQ SI, DX // end pointer of input
130 MOVQ SP, R11 // copy stack pointer
131 ADDQ $0x220, SP // sp += 0x220
132 ANDQ $0xfffffffffffffe00, SP // align stack frame
133 ADDQ $0x1c0, SP
134 MOVQ DI, 0x40(SP) // save ctx
135 MOVQ SI, 0x48(SP) // save input
136 MOVQ DX, 0x50(SP) // save end pointer
137 MOVQ R11, 0x58(SP) // save copy of stack pointer
138
139 WORD $0xf8c5; BYTE $0x77 // vzeroupper
140 ADDQ $0x40, SI // input++
141 MOVL (DI), AX
142 MOVQ SI, R12 // borrow $T1
143 MOVL 4(DI), BX
144 CMPQ SI, DX // $_end
145 MOVL 8(DI), CX
146 LONG $0xe4440f4c // cmove r12,rsp /* next block or random data */
147 MOVL 12(DI), DX
148 MOVL 16(DI), R8
149 MOVL 20(DI), R9
150 MOVL 24(DI), R10
151 MOVL 28(DI), R11
152
153 LEAQ K256<>(SB), BP
154 LONG $0x856f7dc5; LONG $0x00000220 // VMOVDQA YMM8, 0x220[rbp] /* vmovdqa ymm8,YMMWORD PTR [rip+0x220] */
155 LONG $0x8d6f7dc5; LONG $0x00000240 // VMOVDQA YMM9, 0x240[rbp] /* vmovdqa ymm9,YMMWORD PTR [rip+0x240] */
156 LONG $0x956f7dc5; LONG $0x00000200 // VMOVDQA YMM10, 0x200[rbp] /* vmovdqa ymm7,YMMWORD PTR [rip+0x200] */
157
158 loop0:
159 LONG $0x6f7dc1c4; BYTE $0xfa // VMOVDQA YMM7, YMM10
160
161 // Load first 16 dwords from two blocks
162 MOVOU -64(SI), X0 // vmovdqu xmm0,XMMWORD PTR [rsi-0x40]
163 MOVOU -48(SI), X1 // vmovdqu xmm1,XMMWORD PTR [rsi-0x30]
164 MOVOU -32(SI), X2 // vmovdqu xmm2,XMMWORD PTR [rsi-0x20]
165 MOVOU -16(SI), X3 // vmovdqu xmm3,XMMWORD PTR [rsi-0x10]
166
167 // Byte swap data and transpose data into high/low
168 LONG $0x387dc3c4; WORD $0x2404; BYTE $0x01 // vinserti128 ymm0,ymm0,[r12],0x1
169 LONG $0x3875c3c4; LONG $0x0110244c // vinserti128 ymm1,ymm1,0x10[r12],0x1
170 LONG $0x007de2c4; BYTE $0xc7 // vpshufb ymm0,ymm0,ymm7
171 LONG $0x386dc3c4; LONG $0x01202454 // vinserti128 ymm2,ymm2,0x20[r12],0x1
172 LONG $0x0075e2c4; BYTE $0xcf // vpshufb ymm1,ymm1,ymm7
173 LONG $0x3865c3c4; LONG $0x0130245c // vinserti128 ymm3,ymm3,0x30[r12],0x1
174
175 LEAQ K256<>(SB), BP
176 LONG $0x006de2c4; BYTE $0xd7 // vpshufb ymm2,ymm2,ymm7
177 LONG $0x65fefdc5; BYTE $0x00 // vpaddd ymm4,ymm0,[rbp]
178 LONG $0x0065e2c4; BYTE $0xdf // vpshufb ymm3,ymm3,ymm7
179 LONG $0x6dfef5c5; BYTE $0x20 // vpaddd ymm5,ymm1,0x20[rbp]
180 LONG $0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,0x40[rbp]
181 LONG $0x7dfee5c5; BYTE $0x60 // vpaddd ymm7,ymm3,0x60[rbp]
182
183 LONG $0x247ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm4
184 XORQ R14, R14
185 LONG $0x6c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm5
186
187 ADDQ $-0x40, SP
188 MOVQ BX, DI
189 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
190 XORQ CX, DI // magic
191 LONG $0x7c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm7
192 MOVQ R9, R12
193 ADDQ $0x80, BP
194
195 loop1:
196 // Schedule 48 input dwords, by doing 3 rounds of 12 each
197 // Note: SIMD instructions are interleaved with the SHA calculations
198 ADDQ $-0x40, SP
199 LONG $0x0f75e3c4; WORD $0x04e0 // vpalignr ymm4,ymm1,ymm0,0x4
200
201 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
202 LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80]
203 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
204 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
205 LONG $0x0f65e3c4; WORD $0x04fa // vpalignr ymm7,ymm3,ymm2,0x4
206 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
207 LONG $0x30048d42 // lea eax,[rax+r14*1]
208 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
209 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
210 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
211 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
212 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
213 LONG $0xc7fefdc5 // vpaddd ymm0,ymm0,ymm7
214 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
215 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
216 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
217 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
218 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
219 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
220 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
221 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
222 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
223 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
224 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
225 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
226 WORD $0x2144; BYTE $0xff // and edi,r15d
227 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
228 WORD $0xdf31 // xor edi,ebx
229 LONG $0xfb70fdc5; BYTE $0xfa // vpshufd ymm7,ymm3,0xfa
230 WORD $0x3145; BYTE $0xee // xor r14d,r13d
231 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
232 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
233 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
234
235 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
236 LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84]
237 WORD $0x2141; BYTE $0xd4 // and r12d,edx
238 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
239 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
240 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
241 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
242 LONG $0x22148d47 // lea r10d,[r10+r12*1]
243 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
244 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
245 WORD $0x3141; BYTE $0xfd // xor r13d,edi
246 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
247 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
248 LONG $0x22148d47 // lea r10d,[r10+r12*1]
249 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
250 WORD $0x8944; BYTE $0xdf // mov edi,r11d
251 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
252 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
253 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
254 WORD $0xc731 // xor edi,eax
255 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
256 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
257 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
258 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
259 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
260 WORD $0x2141; BYTE $0xff // and r15d,edi
261 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
262 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
263 LONG $0xc4fefdc5 // vpaddd ymm0,ymm0,ymm4
264 WORD $0x3145; BYTE $0xee // xor r14d,r13d
265 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
266 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
267 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
268
269 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
270 LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88]
271 WORD $0x2141; BYTE $0xcc // and r12d,ecx
272 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
273 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
274 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
275 LONG $0x32148d47 // lea r10d,[r10+r14*1]
276 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
277 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
278 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
279 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
280 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
281 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
282 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
283 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
284 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
285 LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6
286 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
287 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
288 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
289 LONG $0xf870fdc5; BYTE $0x50 // vpshufd ymm7,ymm0,0x50
290 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
291 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
292 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
293 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
294 WORD $0x2144; BYTE $0xff // and edi,r15d
295 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
296 WORD $0x3144; BYTE $0xdf // xor edi,r11d
297 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
298 WORD $0x3145; BYTE $0xee // xor r14d,r13d
299 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
300 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
301 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
302
303 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
304 LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c]
305 WORD $0x2141; BYTE $0xdc // and r12d,ebx
306 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
307 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
308 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
309 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
310 LONG $0x20048d47 // lea r8d,[r8+r12*1]
311 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
312 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
313 WORD $0x3141; BYTE $0xfd // xor r13d,edi
314 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
315 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
316 LONG $0x20048d47 // lea r8d,[r8+r12*1]
317 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
318 WORD $0x8944; BYTE $0xcf // mov edi,r9d
319 LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6
320 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
321 LONG $0x28048d47 // lea r8d,[r8+r13*1]
322 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
323 LONG $0x75fefdc5; BYTE $0x00 // vpaddd ymm6,ymm0,[rbp+0x0]
324 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
325 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
326 LONG $0x00048d42 // lea eax,[rax+r8*1]
327 WORD $0x2141; BYTE $0xff // and r15d,edi
328 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
329 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
330 WORD $0x3145; BYTE $0xee // xor r14d,r13d
331 LONG $0x38048d47 // lea r8d,[r8+r15*1]
332 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
333
334 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
335 LONG $0x0f6de3c4; WORD $0x04e1 // vpalignr ymm4,ymm2,ymm1,0x4
336
337 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
338 LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0]
339 WORD $0x2141; BYTE $0xc4 // and r12d,eax
340 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
341 LONG $0x0f7de3c4; WORD $0x04fb // vpalignr ymm7,ymm0,ymm3,0x4
342 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
343 LONG $0x30048d47 // lea r8d,[r8+r14*1]
344 LONG $0x22148d42 // lea edx,[rdx+r12*1]
345 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
346 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
347 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
348 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
349 LONG $0xcffef5c5 // vpaddd ymm1,ymm1,ymm7
350 LONG $0x22148d42 // lea edx,[rdx+r12*1]
351 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
352 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
353 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
354 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
355 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
356 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
357 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
358 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
359 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
360 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
361 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
362 WORD $0x2144; BYTE $0xff // and edi,r15d
363 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
364 WORD $0x3144; BYTE $0xcf // xor edi,r9d
365 LONG $0xf870fdc5; BYTE $0xfa // vpshufd ymm7,ymm0,0xfa
366 WORD $0x3145; BYTE $0xee // xor r14d,r13d
367 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
368 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
369 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
370
371 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
372 LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4]
373 WORD $0x2145; BYTE $0xdc // and r12d,r11d
374 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
375 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
376 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
377 LONG $0x32148d42 // lea edx,[rdx+r14*1]
378 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
379 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
380 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
381 WORD $0x3141; BYTE $0xfd // xor r13d,edi
382 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
383 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
384 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
385 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
386 WORD $0xd789 // mov edi,edx
387 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
388 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
389 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
390 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
391 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
392 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
393 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
394 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
395 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
396 WORD $0x2141; BYTE $0xff // and r15d,edi
397 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
398 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
399 LONG $0xccfef5c5 // vpaddd ymm1,ymm1,ymm4
400 WORD $0x3145; BYTE $0xee // xor r14d,r13d
401 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
402 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
403 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
404
405 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
406 LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8]
407 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
408 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
409 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
410 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
411 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
412 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
413 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
414 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
415 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
416 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
417 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
418 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
419 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
420 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
421 LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6
422 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
423 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
424 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
425 LONG $0xf970fdc5; BYTE $0x50 // vpshufd ymm7,ymm1,0x50
426 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
427 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
428 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
429 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
430 WORD $0x2144; BYTE $0xff // and edi,r15d
431 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
432 WORD $0xd731 // xor edi,edx
433 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
434 WORD $0x3145; BYTE $0xee // xor r14d,r13d
435 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
436 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
437 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
438
439 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
440 LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac]
441 WORD $0x2145; BYTE $0xcc // and r12d,r9d
442 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
443 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
444 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
445 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
446 LONG $0x20048d42 // lea eax,[rax+r12*1]
447 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
448 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
449 WORD $0x3141; BYTE $0xfd // xor r13d,edi
450 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
451 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
452 LONG $0x20048d42 // lea eax,[rax+r12*1]
453 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
454 WORD $0xdf89 // mov edi,ebx
455 LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6
456 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
457 LONG $0x28048d42 // lea eax,[rax+r13*1]
458 WORD $0xcf31 // xor edi,ecx
459 LONG $0x75fef5c5; BYTE $0x20 // vpaddd ymm6,ymm1,[rbp+0x20]
460 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
461 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
462 LONG $0x00048d45 // lea r8d,[r8+rax*1]
463 WORD $0x2141; BYTE $0xff // and r15d,edi
464 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
465 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
466 WORD $0x3145; BYTE $0xee // xor r14d,r13d
467 LONG $0x38048d42 // lea eax,[rax+r15*1]
468 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
469
470 LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
471
472 LONG $0x24648d48; BYTE $0xc0 // lea rsp,[rsp-0x40]
473 LONG $0x0f65e3c4; WORD $0x04e2 // vpalignr ymm4,ymm3,ymm2,0x4
474
475 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
476 LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80]
477 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
478 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
479 LONG $0x0f75e3c4; WORD $0x04f8 // vpalignr ymm7,ymm1,ymm0,0x4
480 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
481 LONG $0x30048d42 // lea eax,[rax+r14*1]
482 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
483 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
484 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
485 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
486 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
487 LONG $0xd7feedc5 // vpaddd ymm2,ymm2,ymm7
488 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
489 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
490 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
491 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
492 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
493 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
494 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
495 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
496 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
497 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
498 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
499 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
500 WORD $0x2144; BYTE $0xff // and edi,r15d
501 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
502 WORD $0xdf31 // xor edi,ebx
503 LONG $0xf970fdc5; BYTE $0xfa // vpshufd ymm7,ymm1,0xfa
504 WORD $0x3145; BYTE $0xee // xor r14d,r13d
505 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
506 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
507 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
508
509 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
510 LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84]
511 WORD $0x2141; BYTE $0xd4 // and r12d,edx
512 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
513 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
514 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
515 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
516 LONG $0x22148d47 // lea r10d,[r10+r12*1]
517 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
518 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
519 WORD $0x3141; BYTE $0xfd // xor r13d,edi
520 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
521 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
522 LONG $0x22148d47 // lea r10d,[r10+r12*1]
523 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
524 WORD $0x8944; BYTE $0xdf // mov edi,r11d
525 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
526 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
527 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
528 WORD $0xc731 // xor edi,eax
529 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
530 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
531 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
532 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
533 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
534 WORD $0x2141; BYTE $0xff // and r15d,edi
535 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
536 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
537 LONG $0xd4feedc5 // vpaddd ymm2,ymm2,ymm4
538 WORD $0x3145; BYTE $0xee // xor r14d,r13d
539 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
540 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
541 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
542
543 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
544 LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88]
545 WORD $0x2141; BYTE $0xcc // and r12d,ecx
546 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
547 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
548 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
549 LONG $0x32148d47 // lea r10d,[r10+r14*1]
550 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
551 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
552 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
553 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
554 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
555 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
556 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
557 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
558 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
559 LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6
560 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
561 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
562 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
563 LONG $0xfa70fdc5; BYTE $0x50 // vpshufd ymm7,ymm2,0x50
564 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
565 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
566 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
567 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
568 WORD $0x2144; BYTE $0xff // and edi,r15d
569 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
570 WORD $0x3144; BYTE $0xdf // xor edi,r11d
571 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
572 WORD $0x3145; BYTE $0xee // xor r14d,r13d
573 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
574 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
575 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
576
577 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
578 LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c]
579 WORD $0x2141; BYTE $0xdc // and r12d,ebx
580 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
581 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
582 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
583 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
584 LONG $0x20048d47 // lea r8d,[r8+r12*1]
585 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
586 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
587 WORD $0x3141; BYTE $0xfd // xor r13d,edi
588 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
589 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
590 LONG $0x20048d47 // lea r8d,[r8+r12*1]
591 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
592 WORD $0x8944; BYTE $0xcf // mov edi,r9d
593 LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6
594 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
595 LONG $0x28048d47 // lea r8d,[r8+r13*1]
596 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
597 LONG $0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,[rbp+0x40]
598 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
599 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
600 LONG $0x00048d42 // lea eax,[rax+r8*1]
601 WORD $0x2141; BYTE $0xff // and r15d,edi
602 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
603 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
604 WORD $0x3145; BYTE $0xee // xor r14d,r13d
605 LONG $0x38048d47 // lea r8d,[r8+r15*1]
606 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
607
608 LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
609 LONG $0x0f7de3c4; WORD $0x04e3 // vpalignr ymm4,ymm0,ymm3,0x4
610
611 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
612 LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0]
613 WORD $0x2141; BYTE $0xc4 // and r12d,eax
614 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
615 LONG $0x0f6de3c4; WORD $0x04f9 // vpalignr ymm7,ymm2,ymm1,0x4
616 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
617 LONG $0x30048d47 // lea r8d,[r8+r14*1]
618 LONG $0x22148d42 // lea edx,[rdx+r12*1]
619 LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7
620 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
621 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
622 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
623 LONG $0xdffee5c5 // vpaddd ymm3,ymm3,ymm7
624 LONG $0x22148d42 // lea edx,[rdx+r12*1]
625 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
626 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
627 LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3
628 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
629 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
630 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
631 LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe
632 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
633 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
634 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
635 LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6
636 WORD $0x2144; BYTE $0xff // and edi,r15d
637 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
638 WORD $0x3144; BYTE $0xcf // xor edi,r9d
639 LONG $0xfa70fdc5; BYTE $0xfa // vpshufd ymm7,ymm2,0xfa
640 WORD $0x3145; BYTE $0xee // xor r14d,r13d
641 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
642 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
643 LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb
644
645 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
646 LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4]
647 WORD $0x2145; BYTE $0xdc // and r12d,r11d
648 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
649 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
650 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
651 LONG $0x32148d42 // lea edx,[rdx+r14*1]
652 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
653 LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb
654 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
655 WORD $0x3141; BYTE $0xfd // xor r13d,edi
656 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
657 LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6
658 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
659 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
660 WORD $0xd789 // mov edi,edx
661 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
662 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
663 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
664 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
665 LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5
666 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
667 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
668 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
669 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
670 WORD $0x2141; BYTE $0xff // and r15d,edi
671 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
672 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
673 LONG $0xdcfee5c5 // vpaddd ymm3,ymm3,ymm4
674 WORD $0x3145; BYTE $0xee // xor r14d,r13d
675 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
676 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
677 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
678
679 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
680 LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8]
681 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
682 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
683 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
684 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
685 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
686 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
687 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
688 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
689 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
690 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
691 LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8
692 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
693 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
694 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
695 LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6
696 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
697 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
698 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
699 LONG $0xfb70fdc5; BYTE $0x50 // vpshufd ymm7,ymm3,0x50
700 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
701 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
702 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
703 LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa
704 WORD $0x2144; BYTE $0xff // and edi,r15d
705 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
706 WORD $0xd731 // xor edi,edx
707 LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11
708 WORD $0x3145; BYTE $0xee // xor r14d,r13d
709 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
710 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
711 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
712
713 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
714 LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac]
715 WORD $0x2145; BYTE $0xcc // and r12d,r9d
716 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
717 LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2
718 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
719 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
720 LONG $0x20048d42 // lea eax,[rax+r12*1]
721 LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7
722 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
723 WORD $0x3141; BYTE $0xfd // xor r13d,edi
724 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
725 LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9
726 LONG $0x20048d42 // lea eax,[rax+r12*1]
727 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
728 WORD $0xdf89 // mov edi,ebx
729 LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6
730 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
731 LONG $0x28048d42 // lea eax,[rax+r13*1]
732 WORD $0xcf31 // xor edi,ecx
733 LONG $0x75fee5c5; BYTE $0x60 // vpaddd ymm6,ymm3,[rbp+0x60]
734 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
735 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
736 LONG $0x00048d45 // lea r8d,[r8+rax*1]
737 WORD $0x2141; BYTE $0xff // and r15d,edi
738 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
739 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
740 WORD $0x3145; BYTE $0xee // xor r14d,r13d
741 LONG $0x38048d42 // lea eax,[rax+r15*1]
742 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
743
744 LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
745 ADDQ $0x80, BP
746
747 CMPB 0x3(BP), $0x0
748 JNE loop1
749
750 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x40)
751 LONG $0x245c0344; BYTE $0x40 // add r11d,[rsp+0x40]
752 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
753 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
754 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
755 LONG $0x30048d42 // lea eax,[rax+r14*1]
756 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
757 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
758 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
759 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
760 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
761 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
762 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
763 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
764 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
765 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
766 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
767 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
768 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
769 WORD $0x2144; BYTE $0xff // and edi,r15d
770 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
771 WORD $0xdf31 // xor edi,ebx
772 WORD $0x3145; BYTE $0xee // xor r14d,r13d
773 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
774 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
775
776 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x44)
777 LONG $0x24540344; BYTE $0x44 // add r10d,[rsp+0x44]
778 WORD $0x2141; BYTE $0xd4 // and r12d,edx
779 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
780 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
781 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
782 LONG $0x22148d47 // lea r10d,[r10+r12*1]
783 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
784 WORD $0x3141; BYTE $0xfd // xor r13d,edi
785 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
786 LONG $0x22148d47 // lea r10d,[r10+r12*1]
787 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
788 WORD $0x8944; BYTE $0xdf // mov edi,r11d
789 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
790 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
791 WORD $0xc731 // xor edi,eax
792 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
793 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
794 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
795 WORD $0x2141; BYTE $0xff // and r15d,edi
796 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
797 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
798 WORD $0x3145; BYTE $0xee // xor r14d,r13d
799 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
800 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
801
802 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x48)
803 LONG $0x244c0344; BYTE $0x48 // add r9d,[rsp+0x48]
804 WORD $0x2141; BYTE $0xcc // and r12d,ecx
805 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
806 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
807 LONG $0x32148d47 // lea r10d,[r10+r14*1]
808 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
809 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
810 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
811 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
812 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
813 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
814 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
815 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
816 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
817 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
818 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
819 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
820 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
821 WORD $0x2144; BYTE $0xff // and edi,r15d
822 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
823 WORD $0x3144; BYTE $0xdf // xor edi,r11d
824 WORD $0x3145; BYTE $0xee // xor r14d,r13d
825 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
826 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
827
828 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x4c)
829 LONG $0x24440344; BYTE $0x4c // add r8d,[rsp+0x4c]
830 WORD $0x2141; BYTE $0xdc // and r12d,ebx
831 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
832 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
833 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
834 LONG $0x20048d47 // lea r8d,[r8+r12*1]
835 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
836 WORD $0x3141; BYTE $0xfd // xor r13d,edi
837 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
838 LONG $0x20048d47 // lea r8d,[r8+r12*1]
839 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
840 WORD $0x8944; BYTE $0xcf // mov edi,r9d
841 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
842 LONG $0x28048d47 // lea r8d,[r8+r13*1]
843 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
844 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
845 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
846 LONG $0x00048d42 // lea eax,[rax+r8*1]
847 WORD $0x2141; BYTE $0xff // and r15d,edi
848 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
849 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
850 WORD $0x3145; BYTE $0xee // xor r14d,r13d
851 LONG $0x38048d47 // lea r8d,[r8+r15*1]
852 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
853
854 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x60)
855 LONG $0x60245403 // add edx,[rsp+0x60]
856 WORD $0x2141; BYTE $0xc4 // and r12d,eax
857 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
858 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
859 LONG $0x30048d47 // lea r8d,[r8+r14*1]
860 LONG $0x22148d42 // lea edx,[rdx+r12*1]
861 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
862 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
863 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
864 LONG $0x22148d42 // lea edx,[rdx+r12*1]
865 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
866 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
867 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
868 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
869 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
870 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
871 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
872 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
873 WORD $0x2144; BYTE $0xff // and edi,r15d
874 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
875 WORD $0x3144; BYTE $0xcf // xor edi,r9d
876 WORD $0x3145; BYTE $0xee // xor r14d,r13d
877 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
878 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
879
880 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x64)
881 LONG $0x64244c03 // add ecx,[rsp+0x64]
882 WORD $0x2145; BYTE $0xdc // and r12d,r11d
883 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
884 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
885 LONG $0x32148d42 // lea edx,[rdx+r14*1]
886 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
887 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
888 WORD $0x3141; BYTE $0xfd // xor r13d,edi
889 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
890 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
891 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
892 WORD $0xd789 // mov edi,edx
893 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
894 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
895 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
896 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
897 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
898 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
899 WORD $0x2141; BYTE $0xff // and r15d,edi
900 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
901 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
902 WORD $0x3145; BYTE $0xee // xor r14d,r13d
903 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
904 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
905
906 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x68)
907 LONG $0x68245c03 // add ebx,[rsp+0x68]
908 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
909 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
910 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
911 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
912 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
913 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
914 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
915 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
916 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
917 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
918 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
919 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
920 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
921 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
922 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
923 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
924 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
925 WORD $0x2144; BYTE $0xff // and edi,r15d
926 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
927 WORD $0xd731 // xor edi,edx
928 WORD $0x3145; BYTE $0xee // xor r14d,r13d
929 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
930 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
931
932 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x6c)
933 LONG $0x6c244403 // add eax,[rsp+0x6c]
934 WORD $0x2145; BYTE $0xcc // and r12d,r9d
935 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
936 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
937 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
938 LONG $0x20048d42 // lea eax,[rax+r12*1]
939 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
940 WORD $0x3141; BYTE $0xfd // xor r13d,edi
941 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
942 LONG $0x20048d42 // lea eax,[rax+r12*1]
943 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
944 WORD $0xdf89 // mov edi,ebx
945 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
946 LONG $0x28048d42 // lea eax,[rax+r13*1]
947 WORD $0xcf31 // xor edi,ecx
948 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
949 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
950 LONG $0x00048d45 // lea r8d,[r8+rax*1]
951 WORD $0x2141; BYTE $0xff // and r15d,edi
952 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
953 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
954 WORD $0x3145; BYTE $0xee // xor r14d,r13d
955 LONG $0x38048d42 // lea eax,[rax+r15*1]
956 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
957
958 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x00)
959 LONG $0x241c0344 // add r11d,[rsp]
960 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
961 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
962 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
963 LONG $0x30048d42 // lea eax,[rax+r14*1]
964 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
965 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
966 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
967 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
968 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
969 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
970 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
971 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
972 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
973 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
974 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
975 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
976 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
977 WORD $0x2144; BYTE $0xff // and edi,r15d
978 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
979 WORD $0xdf31 // xor edi,ebx
980 WORD $0x3145; BYTE $0xee // xor r14d,r13d
981 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
982 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
983
984 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x04)
985 LONG $0x24540344; BYTE $0x04 // add r10d,[rsp+0x4]
986 WORD $0x2141; BYTE $0xd4 // and r12d,edx
987 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
988 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
989 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
990 LONG $0x22148d47 // lea r10d,[r10+r12*1]
991 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
992 WORD $0x3141; BYTE $0xfd // xor r13d,edi
993 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
994 LONG $0x22148d47 // lea r10d,[r10+r12*1]
995 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
996 WORD $0x8944; BYTE $0xdf // mov edi,r11d
997 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
998 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
999 WORD $0xc731 // xor edi,eax
1000 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
1001 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
1002 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
1003 WORD $0x2141; BYTE $0xff // and r15d,edi
1004 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1005 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
1006 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1007 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
1008 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
1009
1010 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x08)
1011 LONG $0x244c0344; BYTE $0x08 // add r9d,[rsp+0x8]
1012 WORD $0x2141; BYTE $0xcc // and r12d,ecx
1013 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
1014 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
1015 LONG $0x32148d47 // lea r10d,[r10+r14*1]
1016 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1017 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
1018 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1019 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
1020 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1021 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1022 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
1023 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
1024 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
1025 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
1026 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
1027 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
1028 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
1029 WORD $0x2144; BYTE $0xff // and edi,r15d
1030 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1031 WORD $0x3144; BYTE $0xdf // xor edi,r11d
1032 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1033 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
1034 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
1035
1036 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x0c)
1037 LONG $0x24440344; BYTE $0x0c // add r8d,[rsp+0xc]
1038 WORD $0x2141; BYTE $0xdc // and r12d,ebx
1039 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
1040 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
1041 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
1042 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1043 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
1044 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1045 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
1046 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1047 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1048 WORD $0x8944; BYTE $0xcf // mov edi,r9d
1049 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
1050 LONG $0x28048d47 // lea r8d,[r8+r13*1]
1051 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
1052 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
1053 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
1054 LONG $0x00048d42 // lea eax,[rax+r8*1]
1055 WORD $0x2141; BYTE $0xff // and r15d,edi
1056 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1057 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
1058 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1059 LONG $0x38048d47 // lea r8d,[r8+r15*1]
1060 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
1061
1062 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x20)
1063 LONG $0x20245403 // add edx,[rsp+0x20]
1064 WORD $0x2141; BYTE $0xc4 // and r12d,eax
1065 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
1066 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
1067 LONG $0x30048d47 // lea r8d,[r8+r14*1]
1068 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1069 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
1070 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1071 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
1072 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1073 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1074 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
1075 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
1076 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
1077 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
1078 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
1079 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
1080 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
1081 WORD $0x2144; BYTE $0xff // and edi,r15d
1082 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1083 WORD $0x3144; BYTE $0xcf // xor edi,r9d
1084 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1085 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
1086 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
1087
1088 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x24)
1089 LONG $0x24244c03 // add ecx,[rsp+0x24]
1090 WORD $0x2145; BYTE $0xdc // and r12d,r11d
1091 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
1092 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
1093 LONG $0x32148d42 // lea edx,[rdx+r14*1]
1094 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1095 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
1096 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1097 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
1098 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1099 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1100 WORD $0xd789 // mov edi,edx
1101 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
1102 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
1103 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
1104 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
1105 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
1106 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
1107 WORD $0x2141; BYTE $0xff // and r15d,edi
1108 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1109 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
1110 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1111 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
1112 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
1113
1114 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x28)
1115 LONG $0x28245c03 // add ebx,[rsp+0x28]
1116 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
1117 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
1118 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
1119 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
1120 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1121 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
1122 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1123 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
1124 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1125 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1126 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
1127 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
1128 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
1129 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
1130 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
1131 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
1132 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
1133 WORD $0x2144; BYTE $0xff // and edi,r15d
1134 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1135 WORD $0xd731 // xor edi,edx
1136 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1137 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
1138 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
1139
1140 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x2c)
1141 LONG $0x2c244403 // add eax,[rsp+0x2c]
1142 WORD $0x2145; BYTE $0xcc // and r12d,r9d
1143 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
1144 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
1145 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
1146 LONG $0x20048d42 // lea eax,[rax+r12*1]
1147 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
1148 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1149 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
1150 LONG $0x20048d42 // lea eax,[rax+r12*1]
1151 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1152 WORD $0xdf89 // mov edi,ebx
1153 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
1154 LONG $0x28048d42 // lea eax,[rax+r13*1]
1155 WORD $0xcf31 // xor edi,ecx
1156 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
1157 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
1158 LONG $0x00048d45 // lea r8d,[r8+rax*1]
1159 WORD $0x2141; BYTE $0xff // and r15d,edi
1160 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1161 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
1162 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1163 LONG $0x38048d42 // lea eax,[rax+r15*1]
1164 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
1165
1166 MOVQ 0x200(SP), DI // $_ctx
1167 ADDQ R14, AX
1168
1169 LEAQ 0x1c0(SP), BP
1170
1171 ADDL (DI), AX
1172 ADDL 4(DI), BX
1173 ADDL 8(DI), CX
1174 ADDL 12(DI), DX
1175 ADDL 16(DI), R8
1176 ADDL 20(DI), R9
1177 ADDL 24(DI), R10
1178 ADDL 28(DI), R11
1179
1180 MOVL AX, (DI)
1181 MOVL BX, 4(DI)
1182 MOVL CX, 8(DI)
1183 MOVL DX, 12(DI)
1184 MOVL R8, 16(DI)
1185 MOVL R9, 20(DI)
1186 MOVL R10, 24(DI)
1187 MOVL R11, 28(DI)
1188
1189 CMPQ SI, 0x50(BP) // $_end
1190 JE done
1191
1192 XORQ R14, R14
1193 MOVQ BX, DI
1194 XORQ CX, DI // magic
1195 MOVQ R9, R12
1196
1197 loop2:
1198 // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, BP, 0x10)
1199 LONG $0x105d0344 // add r11d,[rbp+0x10]
1200 WORD $0x2145; BYTE $0xc4 // and r12d,r8d
1201 LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19
1202 LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb
1203 LONG $0x30048d42 // lea eax,[rax+r14*1]
1204 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
1205 LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d
1206 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1207 LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6
1208 LONG $0x231c8d47 // lea r11d,[r11+r12*1]
1209 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1210 WORD $0x8941; BYTE $0xc7 // mov r15d,eax
1211 LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16
1212 LONG $0x2b1c8d47 // lea r11d,[r11+r13*1]
1213 WORD $0x3141; BYTE $0xdf // xor r15d,ebx
1214 LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd
1215 LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2
1216 LONG $0x1a148d42 // lea edx,[rdx+r11*1]
1217 WORD $0x2144; BYTE $0xff // and edi,r15d
1218 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1219 WORD $0xdf31 // xor edi,ebx
1220 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1221 LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1]
1222 WORD $0x8945; BYTE $0xc4 // mov r12d,r8d
1223
1224 // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, BP, 0x14)
1225 LONG $0x14550344 // add r10d,[rbp+0x14]
1226 WORD $0x2141; BYTE $0xd4 // and r12d,edx
1227 LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19
1228 LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb
1229 LONG $0x331c8d47 // lea r11d,[r11+r14*1]
1230 LONG $0x22148d47 // lea r10d,[r10+r12*1]
1231 LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d
1232 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1233 LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6
1234 LONG $0x22148d47 // lea r10d,[r10+r12*1]
1235 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1236 WORD $0x8944; BYTE $0xdf // mov edi,r11d
1237 LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16
1238 LONG $0x2a148d47 // lea r10d,[r10+r13*1]
1239 WORD $0xc731 // xor edi,eax
1240 LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd
1241 LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2
1242 LONG $0x110c8d42 // lea ecx,[rcx+r10*1]
1243 WORD $0x2141; BYTE $0xff // and r15d,edi
1244 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1245 WORD $0x3141; BYTE $0xc7 // xor r15d,eax
1246 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1247 LONG $0x3a148d47 // lea r10d,[r10+r15*1]
1248 WORD $0x8941; BYTE $0xd4 // mov r12d,edx
1249
1250 // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, BP, 0x18)
1251 LONG $0x184d0344 // add r9d,[rbp+0x18]
1252 WORD $0x2141; BYTE $0xcc // and r12d,ecx
1253 LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19
1254 LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb
1255 LONG $0x32148d47 // lea r10d,[r10+r14*1]
1256 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1257 LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d
1258 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1259 LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6
1260 LONG $0x210c8d47 // lea r9d,[r9+r12*1]
1261 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1262 WORD $0x8945; BYTE $0xd7 // mov r15d,r10d
1263 LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16
1264 LONG $0x290c8d47 // lea r9d,[r9+r13*1]
1265 WORD $0x3145; BYTE $0xdf // xor r15d,r11d
1266 LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd
1267 LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2
1268 LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1]
1269 WORD $0x2144; BYTE $0xff // and edi,r15d
1270 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1271 WORD $0x3144; BYTE $0xdf // xor edi,r11d
1272 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1273 LONG $0x390c8d45 // lea r9d,[r9+rdi*1]
1274 WORD $0x8941; BYTE $0xcc // mov r12d,ecx
1275
1276 // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, BP, 0x1c)
1277 LONG $0x1c450344 // add r8d,[rbp+0x1c]
1278 WORD $0x2141; BYTE $0xdc // and r12d,ebx
1279 LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19
1280 LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb
1281 LONG $0x310c8d47 // lea r9d,[r9+r14*1]
1282 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1283 LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx
1284 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1285 LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6
1286 LONG $0x20048d47 // lea r8d,[r8+r12*1]
1287 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1288 WORD $0x8944; BYTE $0xcf // mov edi,r9d
1289 LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16
1290 LONG $0x28048d47 // lea r8d,[r8+r13*1]
1291 WORD $0x3144; BYTE $0xd7 // xor edi,r10d
1292 LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd
1293 LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2
1294 LONG $0x00048d42 // lea eax,[rax+r8*1]
1295 WORD $0x2141; BYTE $0xff // and r15d,edi
1296 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1297 WORD $0x3145; BYTE $0xd7 // xor r15d,r10d
1298 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1299 LONG $0x38048d47 // lea r8d,[r8+r15*1]
1300 WORD $0x8941; BYTE $0xdc // mov r12d,ebx
1301
1302 // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, BP, 0x30)
1303 WORD $0x5503; BYTE $0x30 // add edx,[rbp+0x30]
1304 WORD $0x2141; BYTE $0xc4 // and r12d,eax
1305 LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19
1306 LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb
1307 LONG $0x30048d47 // lea r8d,[r8+r14*1]
1308 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1309 LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx
1310 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1311 LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6
1312 LONG $0x22148d42 // lea edx,[rdx+r12*1]
1313 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1314 WORD $0x8945; BYTE $0xc7 // mov r15d,r8d
1315 LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16
1316 LONG $0x2a148d42 // lea edx,[rdx+r13*1]
1317 WORD $0x3145; BYTE $0xcf // xor r15d,r9d
1318 LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd
1319 LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2
1320 LONG $0x131c8d45 // lea r11d,[r11+rdx*1]
1321 WORD $0x2144; BYTE $0xff // and edi,r15d
1322 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1323 WORD $0x3144; BYTE $0xcf // xor edi,r9d
1324 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1325 WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1]
1326 WORD $0x8941; BYTE $0xc4 // mov r12d,eax
1327
1328 // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, BP, 0x34)
1329 WORD $0x4d03; BYTE $0x34 // add ecx,[rbp+0x34]
1330 WORD $0x2145; BYTE $0xdc // and r12d,r11d
1331 LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19
1332 LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb
1333 LONG $0x32148d42 // lea edx,[rdx+r14*1]
1334 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1335 LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx
1336 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1337 LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6
1338 LONG $0x210c8d42 // lea ecx,[rcx+r12*1]
1339 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1340 WORD $0xd789 // mov edi,edx
1341 LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16
1342 LONG $0x290c8d42 // lea ecx,[rcx+r13*1]
1343 WORD $0x3144; BYTE $0xc7 // xor edi,r8d
1344 LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd
1345 LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2
1346 LONG $0x0a148d45 // lea r10d,[r10+rcx*1]
1347 WORD $0x2141; BYTE $0xff // and r15d,edi
1348 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1349 WORD $0x3145; BYTE $0xc7 // xor r15d,r8d
1350 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1351 LONG $0x390c8d42 // lea ecx,[rcx+r15*1]
1352 WORD $0x8945; BYTE $0xdc // mov r12d,r11d
1353
1354 // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, BP, 0x38)
1355 WORD $0x5d03; BYTE $0x38 // add ebx,[rbp+0x38]
1356 WORD $0x2145; BYTE $0xd4 // and r12d,r10d
1357 LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19
1358 LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb
1359 LONG $0x310c8d42 // lea ecx,[rcx+r14*1]
1360 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1361 LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax
1362 WORD $0x3145; BYTE $0xfd // xor r13d,r15d
1363 LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6
1364 LONG $0x231c8d42 // lea ebx,[rbx+r12*1]
1365 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1366 WORD $0x8941; BYTE $0xcf // mov r15d,ecx
1367 LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16
1368 LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1]
1369 WORD $0x3141; BYTE $0xd7 // xor r15d,edx
1370 LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd
1371 LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2
1372 LONG $0x190c8d45 // lea r9d,[r9+rbx*1]
1373 WORD $0x2144; BYTE $0xff // and edi,r15d
1374 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1375 WORD $0xd731 // xor edi,edx
1376 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1377 WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1]
1378 WORD $0x8945; BYTE $0xd4 // mov r12d,r10d
1379
1380 // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, BP, 0x3c)
1381 WORD $0x4503; BYTE $0x3c // add eax,[rbp+0x3c]
1382 WORD $0x2145; BYTE $0xcc // and r12d,r9d
1383 LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19
1384 LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb
1385 LONG $0x331c8d42 // lea ebx,[rbx+r14*1]
1386 LONG $0x20048d42 // lea eax,[rax+r12*1]
1387 LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d
1388 WORD $0x3141; BYTE $0xfd // xor r13d,edi
1389 LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6
1390 LONG $0x20048d42 // lea eax,[rax+r12*1]
1391 WORD $0x3145; BYTE $0xf5 // xor r13d,r14d
1392 WORD $0xdf89 // mov edi,ebx
1393 LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16
1394 LONG $0x28048d42 // lea eax,[rax+r13*1]
1395 WORD $0xcf31 // xor edi,ecx
1396 LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd
1397 LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2
1398 LONG $0x00048d45 // lea r8d,[r8+rax*1]
1399 WORD $0x2141; BYTE $0xff // and r15d,edi
1400 WORD $0x3145; BYTE $0xe6 // xor r14d,r12d
1401 WORD $0x3141; BYTE $0xcf // xor r15d,ecx
1402 WORD $0x3145; BYTE $0xee // xor r14d,r13d
1403 LONG $0x38048d42 // lea eax,[rax+r15*1]
1404 WORD $0x8945; BYTE $0xcc // mov r12d,r9d
1405
1406 ADDQ $-0x40, BP
1407 CMPQ BP, SP
1408 JAE loop2
1409
1410 MOVQ 0x200(SP), DI // $_ctx
1411 ADDQ R14, AX
1412
1413 ADDQ $0x1c0, SP
1414
1415 ADDL (DI), AX
1416 ADDL 4(DI), BX
1417 ADDL 8(DI), CX
1418 ADDL 12(DI), DX
1419 ADDL 16(DI), R8
1420 ADDL 20(DI), R9
1421
1422 ADDQ $0x80, SI // input += 2
1423 ADDL 24(DI), R10
1424 MOVQ SI, R12
1425 ADDL 28(DI), R11
1426 CMPQ SI, 0x50(SP) // input == _end
1427
1428 MOVL AX, (DI)
1429 LONG $0xe4440f4c // cmove r12,rsp /* next block or stale data */
1430 MOVL AX, (DI)
1431 MOVL BX, 4(DI)
1432 MOVL CX, 8(DI)
1433 MOVL DX, 12(DI)
1434 MOVL R8, 16(DI)
1435 MOVL R9, 20(DI)
1436 MOVL R10, 24(DI)
1437 MOVL R11, 28(DI)
1438
1439 JBE loop0
1440 LEAQ (SP), BP
1441
1442 done:
1443 MOVQ BP, SP
1444 MOVQ 0x58(SP), SP // restore saved stack pointer
1445 WORD $0xf8c5; BYTE $0x77 // vzeroupper
1446
1447 RET
1448
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 /*
33 * Minio Cloud Storage, (C) 2017 Minio, Inc.
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 TEXT ·sha256X16Avx512(SB), 7, $0
33 MOVQ digests+0(FP), DI
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 /*
33 * Minio Cloud Storage, (C) 2017 Minio, Inc.
+0
-22
sha256blockAvx_amd64.go less more
0 //+build !noasm,!appengine
1
2 /*
3 * Minio Cloud Storage, (C) 2016 Minio, Inc.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package sha256
19
// blockAvx is declared here without a Go body, so its implementation must be
// supplied outside this file (NOTE(review): presumably the AVX assembly in
// sha256blockAvx_amd64.s - confirm the TEXT symbol there).
//
// h and message are the two slice arguments handed to that routine
// (NOTE(review): presumably the SHA-256 state words and the input bytes to
// hash - confirm expected lengths against the assembly).
//
// reserved0..reserved3 look like scratch space rather than real inputs: the
// assembly macros address _xfer+48(FP), _xfer+52(FP) and _xfer+56(FP), and on
// amd64 two 24-byte slice headers would place reserved0 at offset 48.
// TODO confirm before changing the signature or the argument order.
//
20 //go:noescape
21 func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
+0
-408
sha256blockAvx_amd64.s less more
0 //+build !noasm,!appengine
1
2 // SHA256 implementation for AVX
3
4 //
5 // Minio Cloud Storage, (C) 2016 Minio, Inc.
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19
20 //
21 // This code is based on an Intel White-Paper:
22 // "Fast SHA-256 Implementations on Intel Architecture Processors"
23 //
24 // together with the reference implementation from the following authors:
25 // James Guilford <james.guilford@intel.com>
26 // Kirk Yap <kirk.s.yap@intel.com>
27 // Tim Chen <tim.c.chen@linux.intel.com>
28 //
29 // For Golang it has been converted to Plan 9 assembly with the help of
30 // github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
31 // equivalents
32 //
33
34 #include "textflag.h"
35
// ROTATE_XS rotates the contents of X4..X7 by one register position:
// X4 <- X5, X5 <- X6, X6 <- X7, X7 <- previous X4.
// X15 is used as the temporary for the previous X4 and is overwritten.
// NOTE(review): X4..X7 presumably hold the message-schedule vectors consumed
// by FOUR_ROUNDS_AND_SCHED - confirm against the call sites.
36 #define ROTATE_XS \
37 MOVOU X4, X15 \
38 MOVOU X5, X4 \
39 MOVOU X6, X5 \
40 MOVOU X7, X6 \
41 MOVOU X15, X7
42
43 // compute s0 four at a time and s1 two at a time
44 // compute W[-16] + W[-7] 4 at a time
45 #define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
46 MOVL e, R13 \ // y0 = e
47 ROLL $18, R13 \ // y0 = e >> (25-11)
48 MOVL a, R14 \ // y1 = a
49 LONG $0x0f41e3c4; WORD $0x04c6 \ // VPALIGNR XMM0,XMM7,XMM6,0x4 /* XTMP0 = W[-7] */
50 ROLL $23, R14 \ // y1 = a >> (22-13)
51 XORL e, R13 \ // y0 = e ^ (e >> (25-11))
52 MOVL f, R15 \ // y2 = f
53 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
54 XORL a, R14 \ // y1 = a ^ (a >> (22-13)
55 XORL g, R15 \ // y2 = f^g
56 LONG $0xc4fef9c5 \ // VPADDD XMM0,XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */
57 XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6) )
58 ANDL e, R15 \ // y2 = (f^g)&e
59 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
60 \
61 \ // compute s0
62 \
63 LONG $0x0f51e3c4; WORD $0x04cc \ // VPALIGNR XMM1,XMM5,XMM4,0x4 /* XTMP1 = W[-15] */
64 XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
65 ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
66 XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
67 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
68 ADDL R13, R15 \ // y2 = S1 + CH
69 ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH
70 MOVL a, R13 \ // y0 = a
71 ADDL R15, h \ // h = h + S1 + CH + k + w
72 \ // ROTATE_ARGS
73 MOVL a, R15 \ // y2 = a
74 LONG $0xd172e9c5; BYTE $0x07 \ // VPSRLD XMM2,XMM1,0x7 /* */
75 ORL c, R13 \ // y0 = a|c
76 ADDL h, d \ // d = d + h + S1 + CH + k + w
77 ANDL c, R15 \ // y2 = a&c
78 LONG $0xf172e1c5; BYTE $0x19 \ // VPSLLD XMM3,XMM1,0x19 /* */
79 ANDL b, R13 \ // y0 = (a|c)&b
80 ADDL R14, h \ // h = h + S1 + CH + k + w + S0
81 LONG $0xdaebe1c5 \ // VPOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */
82 ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
83 ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ
84 \ // ROTATE_ARGS
85 MOVL d, R13 \ // y0 = e
86 MOVL h, R14 \ // y1 = a
87 ROLL $18, R13 \ // y0 = e >> (25-11)
88 XORL d, R13 \ // y0 = e ^ (e >> (25-11))
89 MOVL e, R15 \ // y2 = f
90 ROLL $23, R14 \ // y1 = a >> (22-13)
91 LONG $0xd172e9c5; BYTE $0x12 \ // VPSRLD XMM2,XMM1,0x12 /* */
92 XORL h, R14 \ // y1 = a ^ (a >> (22-13)
93 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
94 XORL f, R15 \ // y2 = f^g
95 LONG $0xd172b9c5; BYTE $0x03 \ // VPSRLD XMM8,XMM1,0x3 /* XTMP4 = W[-15] >> 3 */
96 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
97 XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
98 ANDL d, R15 \ // y2 = (f^g)&e
99 ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
100 LONG $0xf172f1c5; BYTE $0x0e \ // VPSLLD XMM1,XMM1,0xe /* */
101 XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
102 XORL f, R15 \ // y2 = CH = ((f^g)&e)^g
103 LONG $0xd9efe1c5 \ // VPXOR XMM3,XMM3,XMM1 /* */
104 ADDL R13, R15 \ // y2 = S1 + CH
105 ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH
106 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
107 LONG $0xdaefe1c5 \ // VPXOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR */
108 MOVL h, R13 \ // y0 = a
109 ADDL R15, g \ // h = h + S1 + CH + k + w
110 MOVL h, R15 \ // y2 = a
111 LONG $0xef61c1c4; BYTE $0xc8 \ // VPXOR XMM1,XMM3,XMM8 /* XTMP1 = s0 */
112 ORL b, R13 \ // y0 = a|c
113 ADDL g, c \ // d = d + h + S1 + CH + k + w
114 ANDL b, R15 \ // y2 = a&c
115 \
116 \ // compute low s1
117 \
118 LONG $0xd770f9c5; BYTE $0xfa \ // VPSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */
119 ANDL a, R13 \ // y0 = (a|c)&b
120 ADDL R14, g \ // h = h + S1 + CH + k + w + S0
121 LONG $0xc1fef9c5 \ // VPADDD XMM0,XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */
122 ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
123 ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ
124 \ // ROTATE_ARGS
125 MOVL c, R13 \ // y0 = e
126 MOVL g, R14 \ // y1 = a
127 ROLL $18, R13 \ // y0 = e >> (25-11)
128 XORL c, R13 \ // y0 = e ^ (e >> (25-11))
129 ROLL $23, R14 \ // y1 = a >> (22-13)
130 MOVL d, R15 \ // y2 = f
131 XORL g, R14 \ // y1 = a ^ (a >> (22-13)
132 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
133 LONG $0xd272b9c5; BYTE $0x0a \ // VPSRLD XMM8,XMM2,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */
134 XORL e, R15 \ // y2 = f^g
135 LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */
136 XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
137 ANDL c, R15 \ // y2 = (f^g)&e
138 LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */
139 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
140 XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
141 XORL e, R15 \ // y2 = CH = ((f^g)&e)^g
142 ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
143 LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* */
144 ADDL R13, R15 \ // y2 = S1 + CH
145 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
146 ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH
147 LONG $0xc2ef39c5 \ // VPXOR XMM8,XMM8,XMM2 /* XTMP4 = s1 {xBxA} */
148 MOVL g, R13 \ // y0 = a
149 ADDL R15, f \ // h = h + S1 + CH + k + w
150 MOVL g, R15 \ // y2 = a
151 LONG $0x003942c4; BYTE $0xc2 \ // VPSHUFB XMM8,XMM8,XMM10 /* XTMP4 = s1 {00BA} */
152 ORL a, R13 \ // y0 = a|c
153 ADDL f, b \ // d = d + h + S1 + CH + k + w
154 ANDL a, R15 \ // y2 = a&c
155 LONG $0xfe79c1c4; BYTE $0xc0 \ // VPADDD XMM0,XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */
156 ANDL h, R13 \ // y0 = (a|c)&b
157 ADDL R14, f \ // h = h + S1 + CH + k + w + S0
158 \
159 \ // compute high s1
160 \
161 LONG $0xd070f9c5; BYTE $0x50 \ // VPSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */
162 ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
163 ADDL R13, f \ // h = h + S1 + CH + k + w + S0 + MAJ
164 \ // ROTATE_ARGS
165 MOVL b, R13 \ // y0 = e
166 ROLL $18, R13 \ // y0 = e >> (25-11)
167 MOVL f, R14 \ // y1 = a
168 ROLL $23, R14 \ // y1 = a >> (22-13)
169 XORL b, R13 \ // y0 = e ^ (e >> (25-11))
170 MOVL c, R15 \ // y2 = f
171 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
172 LONG $0xd272a1c5; BYTE $0x0a \ // VPSRLD XMM11,XMM2,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */
173 XORL f, R14 \ // y1 = a ^ (a >> (22-13)
174 XORL d, R15 \ // y2 = f^g
175 LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */
176 XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
177 ANDL b, R15 \ // y2 = (f^g)&e
178 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
179 LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */
180 XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
181 ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
182 XORL d, R15 \ // y2 = CH = ((f^g)&e)^g
183 LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* */
184 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
185 ADDL R13, R15 \ // y2 = S1 + CH
186 ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH
187 LONG $0xdaef21c5 \ // VPXOR XMM11,XMM11,XMM2 /* XTMP5 = s1 {xDxC} */
188 MOVL f, R13 \ // y0 = a
189 ADDL R15, e \ // h = h + S1 + CH + k + w
190 MOVL f, R15 \ // y2 = a
191 LONG $0x002142c4; BYTE $0xdc \ // VPSHUFB XMM11,XMM11,XMM12 /* XTMP5 = s1 {DC00} */
192 ORL h, R13 \ // y0 = a|c
193 ADDL e, a \ // d = d + h + S1 + CH + k + w
194 ANDL h, R15 \ // y2 = a&c
195 LONG $0xe0fea1c5 \ // VPADDD XMM4,XMM11,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */
196 ANDL g, R13 \ // y0 = (a|c)&b
197 ADDL R14, e \ // h = h + S1 + CH + k + w + S0
198 ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
199 ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ
200 \ // ROTATE_ARGS
201 ROTATE_XS
202
// DO_ROUND performs one SHA-256 round on the working variables a..h without
// any message scheduling. The pre-added K+W word for the round is read from
// the frame slot _xfer+offset(FP). R13 (y0), R14 (y1) and R15 (y2) are scratch.
// Rotations are done with ROLL by 32-n, i.e. a rotate right by n, so ">>" in
// the comments below denotes a 32-bit rotate right. Only d and h are updated;
// callers rename the variables for the next round by permuting the arguments.
203 #define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
204 MOVL e, R13 \ // y0 = e
205 ROLL $18, R13 \ // y0 = e >> (25-11)
206 MOVL a, R14 \ // y1 = a
207 XORL e, R13 \ // y0 = e ^ (e >> (25-11))
208 ROLL $23, R14 \ // y1 = a >> (22-13)
209 MOVL f, R15 \ // y2 = f
210 XORL a, R14 \ // y1 = a ^ (a >> (22-13))
211 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
212 XORL g, R15 \ // y2 = f^g
213 XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
214 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
215 ANDL e, R15 \ // y2 = (f^g)&e
216 XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
217 ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
218 XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
219 ADDL R13, R15 \ // y2 = S1 + CH
220 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
221 ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH
222 MOVL a, R13 \ // y0 = a
223 ADDL R15, h \ // h = h + S1 + CH + k + w
224 MOVL a, R15 \ // y2 = a
225 ORL c, R13 \ // y0 = a|c
226 ADDL h, d \ // d = d + h + S1 + CH + k + w
227 ANDL c, R15 \ // y2 = a&c
228 ANDL b, R13 \ // y0 = (a|c)&b
229 ADDL R14, h \ // h = h + S1 + CH + k + w + S0
230 ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
231 ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
232
// blockAvx folds every 64-byte block of message into the eight 32-bit state
// words stored in h, using the FOUR_ROUNDS_AND_SCHED and DO_ROUND macros.
//
// The routine declares no local frame ($0); the four reserved arguments are
// used as scratch memory instead:
//   reserved0/reserved1, 48..63(FP): four K+W words for the rounds in flight
//                                    (read by the macros as _xfer+48..60(FP))
//   reserved2, 64(FP):               address one past the end of message
//   reserved3, 72(FP):               address of the block being processed
//
// An empty message returns immediately. The block loop exits only when the
// block address equals the end address exactly, so len(message) must be a
// multiple of 64. The TEXT flag value 7 corresponds to NOPROF|DUPOK|NOSPLIT
// in Go's textflag.h.
233 // func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
234 TEXT ·blockAvx(SB), 7, $0-80
235
236 MOVQ h+0(FP), SI // SI: &h
237 MOVQ message_base+24(FP), R8 // R8: &message[0]
238 MOVQ message_len+32(FP), R9 // R9: len(message)
239 CMPQ R9, $0
240 JEQ done_hash // nothing to do for an empty message
241 ADDQ R8, R9 // R9: address one past the last message byte
242 MOVQ R9, reserved2+64(FP) // store end of message
243
244 // Register definition
245 // a --> eax
246 // b --> ebx
247 // c --> ecx
248 // d --> r8d
249 // e --> edx
250 // f --> r9d
251 // g --> r10d
252 // h --> r11d
253 //
254 // y0 --> r13d
255 // y1 --> r14d
256 // y2 --> r15d
257
// Load the current state. R8 and R9 are reused here; the end address they
// were used to compute has already been saved in reserved2.
258 MOVL (0*4)(SI), AX // a = H0
259 MOVL (1*4)(SI), BX // b = H1
260 MOVL (2*4)(SI), CX // c = H2
261 MOVL (3*4)(SI), R8 // d = H3
262 MOVL (4*4)(SI), DX // e = H4
263 MOVL (5*4)(SI), R9 // f = H5
264 MOVL (6*4)(SI), R10 // g = H6
265 MOVL (7*4)(SI), R11 // h = H7
266
267 MOVOU bflipMask<>(SB), X13 // X13: byte-swap mask used by the VPSHUFBs below
268 MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
269 MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
270
271 MOVQ message_base+24(FP), SI // SI: &message
272
// loop0: one iteration per 64-byte block.
273 loop0:
274 LEAQ constants<>(SB), BP // BP: start of the round-constant table, reset for every block
275
276 // byte swap first 16 dwords
277 MOVOU 0*16(SI), X4
278 LONG $0x0059c2c4; BYTE $0xe5 // VPSHUFB XMM4, XMM4, XMM13
279 MOVOU 1*16(SI), X5
280 LONG $0x0051c2c4; BYTE $0xed // VPSHUFB XMM5, XMM5, XMM13
281 MOVOU 2*16(SI), X6
282 LONG $0x0049c2c4; BYTE $0xf5 // VPSHUFB XMM6, XMM6, XMM13
283 MOVOU 3*16(SI), X7
284 LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13
285
286 MOVQ SI, reserved3+72(FP) // save the block address; SI is reloaded with &h after the rounds
287 MOVD $0x3, DI // DI: loop1 pass counter
288
289 // schedule 48 input dwords, by doing 3 passes of 16 rounds each
// Every step adds four constants to XMM4 and stores the sums where the macro
// reads them. XMM4 is the source each time because FOUR_ROUNDS_AND_SCHED ends
// with ROTATE_XS (presumably rotating X4..X7; its AVX definition is above this
// routine - confirm there).
290 loop1:
291 LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
292 MOVOU X9, reserved0+48(FP)
293 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
294
295 LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */
296 MOVOU X9, reserved0+48(FP)
297 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
298
299 LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */
300 MOVOU X9, reserved0+48(FP)
301 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
302
303 LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */
304 MOVOU X9, reserved0+48(FP)
305 ADDQ $64, BP // advance to the constants of the next 16 rounds
306 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
307
308 SUBQ $1, DI
309 JNE loop1
310
311 MOVD $0x2, DI // DI: loop2 pass counter
312
// loop2: the final 16 rounds, two passes of 8 rounds each, without scheduling.
// Each pass consumes X4 and X5; X6 and X7 are moved down for the second pass.
313 loop2:
314 LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
315 MOVOU X9, reserved0+48(FP)
316 DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
317 DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
318 DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
319 DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
320
321 LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */
322 MOVOU X9, reserved0+48(FP)
323 ADDQ $32, BP // advance to the constants of the next 8 rounds
324 DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
325 DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
326 DO_ROUND( CX, R8, DX, R9, R10, R11, AX, BX, 56)
327 DO_ROUND( BX, CX, R8, DX, R9, R10, R11, AX, 60)
328
329 MOVOU X6, X4
330 MOVOU X7, X5
331
332 SUBQ $1, DI
333 JNE loop2
334
// Add the working variables to the previous state and store the result in h.
335 MOVQ h+0(FP), SI // SI: &h
336 ADDL (0*4)(SI), AX // H0 = a + H0
337 MOVL AX, (0*4)(SI)
338 ADDL (1*4)(SI), BX // H1 = b + H1
339 MOVL BX, (1*4)(SI)
340 ADDL (2*4)(SI), CX // H2 = c + H2
341 MOVL CX, (2*4)(SI)
342 ADDL (3*4)(SI), R8 // H3 = d + H3
343 MOVL R8, (3*4)(SI)
344 ADDL (4*4)(SI), DX // H4 = e + H4
345 MOVL DX, (4*4)(SI)
346 ADDL (5*4)(SI), R9 // H5 = f + H5
347 MOVL R9, (5*4)(SI)
348 ADDL (6*4)(SI), R10 // H6 = g + H6
349 MOVL R10, (6*4)(SI)
350 ADDL (7*4)(SI), R11 // H7 = h + H7
351 MOVL R11, (7*4)(SI)
352
// Step to the next block and continue until the saved end address is reached.
353 MOVQ reserved3+72(FP), SI
354 ADDQ $64, SI
355 CMPQ reserved2+64(FP), SI
356 JNE loop0
357
358 done_hash:
359 RET
360
361 // Constants table
// The 64 SHA-256 round constants. Each 8-byte entry packs two consecutive
// 32-bit constants, the lower-indexed one in the low half (for example
// 0x428a2f98 followed by 0x71374491), giving 32 entries / 256 bytes.
362 DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
363 DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
364 DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
365 DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
366 DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
367 DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
368 DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
369 DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
370 DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
371 DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
372 DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
373 DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
374 DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
375 DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
376 DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
377 DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
378 DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
379 DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
380 DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
381 DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
382 DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
383 DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
384 DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
385 DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
386 DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
387 DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
388 DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
389 DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
390 DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
391 DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
392 DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
393 DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7
394
// Byte-shuffle control that reverses the four bytes inside every 32-bit lane;
// loaded into X13 and applied to each 16-byte chunk of the message.
395 DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
396 DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
397
// Byte-shuffle control: dwords 0 and 2 of the source go to dwords 0 and 1,
// the upper half is cleared (0xFF selectors). Loaded into X10 (xBxA -> 00BA).
398 DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
399 DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
400
// Byte-shuffle control: dwords 0 and 2 of the source go to dwords 2 and 3,
// the lower half is cleared (0xFF selectors). Loaded into X12 (xDxC -> DC00).
401 DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
402 DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100
403
// Symbol sizes and flags; the literal 8 on the constants table is the value
// of RODATA in Go's textflag.h.
404 GLOBL constants<>(SB), 8, $256
405 GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
406 GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
407 GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 package sha256
33
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 // SHA intrinsic version of SHA256
33
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 package sha256
33
7070 }
7171
7272 func TestSha1(t *testing.T) {
73 if sha && ssse3 && sse41 && !runTestSha(sha256hash) {
73 if hasSHAExtensions() && !runTestSha(sha256hash) {
7474 t.Errorf("FAILED")
7575 }
7676 }
+0
-22
sha256blockSsse_amd64.go less more
0 //+build !noasm,!appengine
1
2 /*
3 * Minio Cloud Storage, (C) 2016 Minio, Inc.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package sha256
19
// blockSsse is the Go declaration of an assembly routine (no body here).
// It takes the state words h, the message bytes and four uint64 values named
// reserved0..reserved3.
20 //go:noescape
21 func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
+0
-429
sha256blockSsse_amd64.s less more
0 //+build !noasm,!appengine
1
2 // SHA256 implementation for SSSE3
3
4 //
5 // Minio Cloud Storage, (C) 2016 Minio, Inc.
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19
20 //
21 // This code is based on an Intel White-Paper:
22 // "Fast SHA-256 Implementations on Intel Architecture Processors"
23 //
24 // together with the reference implementation from the following authors:
25 // James Guilford <james.guilford@intel.com>
26 // Kirk Yap <kirk.s.yap@intel.com>
27 // Tim Chen <tim.c.chen@linux.intel.com>
28 //
29 // For Golang it has been converted to Plan 9 assembly with the help of
30 // github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
31 // equivalents
32 //
33
34 #include "textflag.h"
35
// ROTATE_XS rotates the four message-schedule registers by one position:
// X4 <- X5 <- X6 <- X7 <- old X4. X15 is used as the temporary.
36 #define ROTATE_XS \
37 MOVOU X4, X15 \
38 MOVOU X5, X4 \
39 MOVOU X6, X5 \
40 MOVOU X7, X6 \
41 MOVOU X15, X7
42
// FOUR_ROUNDS_AND_SCHED interleaves four scalar SHA-256 rounds on a..h with
// the SSE computation of the next four message-schedule words.
//
// Scalar side: the four rounds read their K+W words from _xfer+48, +52, +56
// and +60(FP); the variables are renamed by hand between rounds (the second
// round treats d as e and h as a, and so on). R13 (y0), R14 (y1) and R15 (y2)
// are scratch. ">>" in the comments denotes a 32-bit rotate right, implemented
// as ROLL by 32-n.
//
// Vector side: reads X4..X7, uses X0-X3, X8 and X11 as temporaries, and
// expects the shuffle masks in X10 (xBxA -> 00BA) and X12 (xDxC -> DC00).
// The four new words are written to X4, after which ROTATE_XS rotates X4..X7
// (clobbering X15).
43 // compute s0 four at a time and s1 two at a time
44 // compute W[-16] + W[-7] 4 at a time
45 #define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
46 MOVL e, R13 \ // y0 = e
47 ROLL $18, R13 \ // y0 = e >> (25-11)
48 MOVL a, R14 \ // y1 = a
49 MOVOU X7, X0 \
50 LONG $0x0f3a0f66; WORD $0x04c6 \ // PALIGNR XMM0,XMM6,0x4 /* XTMP0 = W[-7] */
51 ROLL $23, R14 \ // y1 = a >> (22-13)
52 XORL e, R13 \ // y0 = e ^ (e >> (25-11))
53 MOVL f, R15 \ // y2 = f
54 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
55 XORL a, R14 \ // y1 = a ^ (a >> (22-13))
56 XORL g, R15 \ // y2 = f^g
57 LONG $0xc4fe0f66 \ // PADDD XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */
58 XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6) )
59 ANDL e, R15 \ // y2 = (f^g)&e
60 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
61 \
62 \ // compute s0
63 \
64 MOVOU X5, X1 \
65 LONG $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1,XMM4,0x4 /* XTMP1 = W[-15] */
66 XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
67 ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
68 XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
69 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
70 ADDL R13, R15 \ // y2 = S1 + CH
71 ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH
72 MOVL a, R13 \ // y0 = a
73 ADDL R15, h \ // h = h + S1 + CH + k + w
74 \ // ROTATE_ARGS
75 MOVL a, R15 \ // y2 = a
76 MOVOU X1, X2 \
77 LONG $0xd2720f66; BYTE $0x07 \ // PSRLD XMM2,0x7 /* */
78 ORL c, R13 \ // y0 = a|c
79 ADDL h, d \ // d = d + h + S1 + CH + k + w
80 ANDL c, R15 \ // y2 = a&c
81 MOVOU X1, X3 \
82 LONG $0xf3720f66; BYTE $0x19 \ // PSLLD XMM3,0x19 /* */
83 ANDL b, R13 \ // y0 = (a|c)&b
84 ADDL R14, h \ // h = h + S1 + CH + k + w + S0
85 LONG $0xdaeb0f66 \ // POR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */
86 ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
87 ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ
88 \ // ROTATE_ARGS
89 MOVL d, R13 \ // y0 = e
90 MOVL h, R14 \ // y1 = a
91 ROLL $18, R13 \ // y0 = e >> (25-11)
92 XORL d, R13 \ // y0 = e ^ (e >> (25-11))
93 MOVL e, R15 \ // y2 = f
94 ROLL $23, R14 \ // y1 = a >> (22-13)
95 MOVOU X1, X2 \
96 LONG $0xd2720f66; BYTE $0x12 \ // PSRLD XMM2,0x12 /* */
97 XORL h, R14 \ // y1 = a ^ (a >> (22-13))
98 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
99 XORL f, R15 \ // y2 = f^g
100 MOVOU X1, X8 \
101 LONG $0x720f4166; WORD $0x03d0 \ // PSRLD XMM8,0x3 /* XTMP4 = W[-15] >> 3 */
102 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
103 XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
104 ANDL d, R15 \ // y2 = (f^g)&e
105 ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
106 LONG $0xf1720f66; BYTE $0x0e \ // PSLLD XMM1,0xe /* */
107 XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
108 XORL f, R15 \ // y2 = CH = ((f^g)&e)^g
109 LONG $0xd9ef0f66 \ // PXOR XMM3,XMM1 /* */
110 ADDL R13, R15 \ // y2 = S1 + CH
111 ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH
112 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
113 LONG $0xdaef0f66 \ // PXOR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */
114 MOVL h, R13 \ // y0 = a
115 ADDL R15, g \ // h = h + S1 + CH + k + w
116 MOVL h, R15 \ // y2 = a
117 MOVOU X3, X1 \
118 LONG $0xef0f4166; BYTE $0xc8 \ // PXOR XMM1,XMM8 /* XTMP1 = s0 */
119 ORL b, R13 \ // y0 = a|c
120 ADDL g, c \ // d = d + h + S1 + CH + k + w
121 ANDL b, R15 \ // y2 = a&c
122 \
123 \ // compute low s1
124 \
125 LONG $0xd7700f66; BYTE $0xfa \ // PSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */
126 ANDL a, R13 \ // y0 = (a|c)&b
127 ADDL R14, g \ // h = h + S1 + CH + k + w + S0
128 LONG $0xc1fe0f66 \ // PADDD XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */
129 ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
130 ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ
131 \ // ROTATE_ARGS
132 MOVL c, R13 \ // y0 = e
133 MOVL g, R14 \ // y1 = a
134 ROLL $18, R13 \ // y0 = e >> (25-11)
135 XORL c, R13 \ // y0 = e ^ (e >> (25-11))
136 ROLL $23, R14 \ // y1 = a >> (22-13)
137 MOVL d, R15 \ // y2 = f
138 XORL g, R14 \ // y1 = a ^ (a >> (22-13))
139 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
140 MOVOU X2, X8 \
141 LONG $0x720f4166; WORD $0x0ad0 \ // PSRLD XMM8,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */
142 XORL e, R15 \ // y2 = f^g
143 MOVOU X2, X3 \
144 LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */
145 XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
146 ANDL c, R15 \ // y2 = (f^g)&e
147 LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */
148 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
149 XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
150 XORL e, R15 \ // y2 = CH = ((f^g)&e)^g
151 ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
152 LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* */
153 ADDL R13, R15 \ // y2 = S1 + CH
154 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
155 ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH
156 LONG $0xef0f4466; BYTE $0xc2 \ // PXOR XMM8,XMM2 /* XTMP4 = s1 {xBxA} */
157 MOVL g, R13 \ // y0 = a
158 ADDL R15, f \ // h = h + S1 + CH + k + w
159 MOVL g, R15 \ // y2 = a
160 LONG $0x380f4566; WORD $0xc200 \ // PSHUFB XMM8,XMM10 /* XTMP4 = s1 {00BA} */
161 ORL a, R13 \ // y0 = a|c
162 ADDL f, b \ // d = d + h + S1 + CH + k + w
163 ANDL a, R15 \ // y2 = a&c
164 LONG $0xfe0f4166; BYTE $0xc0 \ // PADDD XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */
165 ANDL h, R13 \ // y0 = (a|c)&b
166 ADDL R14, f \ // h = h + S1 + CH + k + w + S0
167 \
168 \ // compute high s1
169 \
170 LONG $0xd0700f66; BYTE $0x50 \ // PSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */
171 ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
172 ADDL R13, f \ // h = h + S1 + CH + k + w + S0 + MAJ
173 \ // ROTATE_ARGS
174 MOVL b, R13 \ // y0 = e
175 ROLL $18, R13 \ // y0 = e >> (25-11)
176 MOVL f, R14 \ // y1 = a
177 ROLL $23, R14 \ // y1 = a >> (22-13)
178 XORL b, R13 \ // y0 = e ^ (e >> (25-11))
179 MOVL c, R15 \ // y2 = f
180 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
181 MOVOU X2, X11 \
182 LONG $0x720f4166; WORD $0x0ad3 \ // PSRLD XMM11,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */
183 XORL f, R14 \ // y1 = a ^ (a >> (22-13))
184 XORL d, R15 \ // y2 = f^g
185 MOVOU X2, X3 \
186 LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */
187 XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
188 ANDL b, R15 \ // y2 = (f^g)&e
189 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
190 LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */
191 XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
192 ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
193 XORL d, R15 \ // y2 = CH = ((f^g)&e)^g
194 LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* */
195 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
196 ADDL R13, R15 \ // y2 = S1 + CH
197 ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH
198 LONG $0xef0f4466; BYTE $0xda \ // PXOR XMM11,XMM2 /* XTMP5 = s1 {xDxC} */
199 MOVL f, R13 \ // y0 = a
200 ADDL R15, e \ // h = h + S1 + CH + k + w
201 MOVL f, R15 \ // y2 = a
202 LONG $0x380f4566; WORD $0xdc00 \ // PSHUFB XMM11,XMM12 /* XTMP5 = s1 {DC00} */
203 ORL h, R13 \ // y0 = a|c
204 ADDL e, a \ // d = d + h + S1 + CH + k + w
205 ANDL h, R15 \ // y2 = a&c
206 MOVOU X11, X4 \
207 LONG $0xe0fe0f66 \ // PADDD XMM4,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */
208 ANDL g, R13 \ // y0 = (a|c)&b
209 ADDL R14, e \ // h = h + S1 + CH + k + w + S0
210 ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
211 ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ
212 \ // ROTATE_ARGS
213 ROTATE_XS
214
// DO_ROUND performs one SHA-256 round on the working variables a..h without
// any message scheduling. The pre-added K+W word for the round is read from
// the frame slot _xfer+offset(FP). R13 (y0), R14 (y1) and R15 (y2) are scratch.
// Rotations are done with ROLL by 32-n, i.e. a rotate right by n, so ">>" in
// the comments below denotes a 32-bit rotate right. Only d and h are updated;
// callers rename the variables for the next round by permuting the arguments.
215 #define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
216 MOVL e, R13 \ // y0 = e
217 ROLL $18, R13 \ // y0 = e >> (25-11)
218 MOVL a, R14 \ // y1 = a
219 XORL e, R13 \ // y0 = e ^ (e >> (25-11))
220 ROLL $23, R14 \ // y1 = a >> (22-13)
221 MOVL f, R15 \ // y2 = f
222 XORL a, R14 \ // y1 = a ^ (a >> (22-13))
223 ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
224 XORL g, R15 \ // y2 = f^g
225 XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
226 ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
227 ANDL e, R15 \ // y2 = (f^g)&e
228 XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
229 ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
230 XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
231 ADDL R13, R15 \ // y2 = S1 + CH
232 ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
233 ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH
234 MOVL a, R13 \ // y0 = a
235 ADDL R15, h \ // h = h + S1 + CH + k + w
236 MOVL a, R15 \ // y2 = a
237 ORL c, R13 \ // y0 = a|c
238 ADDL h, d \ // d = d + h + S1 + CH + k + w
239 ANDL c, R15 \ // y2 = a&c
240 ANDL b, R13 \ // y0 = (a|c)&b
241 ADDL R14, h \ // h = h + S1 + CH + k + w + S0
242 ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
243 ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
244
// blockSsse folds every 64-byte block of message into the eight 32-bit state
// words stored in h, using the FOUR_ROUNDS_AND_SCHED and DO_ROUND macros.
//
// The routine declares no local frame ($0); the four reserved arguments are
// used as scratch memory instead:
//   reserved0/reserved1, 48..63(FP): four K+W words for the rounds in flight
//                                    (read by the macros as _xfer+48..60(FP))
//   reserved2, 64(FP):               address one past the end of message
//   reserved3, 72(FP):               address of the block being processed
//
// An empty message returns immediately. The block loop exits only when the
// block address equals the end address exactly, so len(message) must be a
// multiple of 64. The TEXT flag value 7 corresponds to NOPROF|DUPOK|NOSPLIT
// in Go's textflag.h.
245 // func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
246 TEXT ·blockSsse(SB), 7, $0-80
247
248 MOVQ h+0(FP), SI // SI: &h
249 MOVQ message_base+24(FP), R8 // R8: &message[0]
250 MOVQ message_len+32(FP), R9 // R9: len(message)
251 CMPQ R9, $0
252 JEQ done_hash // nothing to do for an empty message
253 ADDQ R8, R9 // R9: address one past the last message byte
254 MOVQ R9, reserved2+64(FP) // store end of message
255
256 // Register definition
257 // a --> eax
258 // b --> ebx
259 // c --> ecx
260 // d --> r8d
261 // e --> edx
262 // f --> r9d
263 // g --> r10d
264 // h --> r11d
265 //
266 // y0 --> r13d
267 // y1 --> r14d
268 // y2 --> r15d
269
// Load the current state. R8 and R9 are reused here; the end address they
// were used to compute has already been saved in reserved2.
270 MOVL (0*4)(SI), AX // a = H0
271 MOVL (1*4)(SI), BX // b = H1
272 MOVL (2*4)(SI), CX // c = H2
273 MOVL (3*4)(SI), R8 // d = H3
274 MOVL (4*4)(SI), DX // e = H4
275 MOVL (5*4)(SI), R9 // f = H5
276 MOVL (6*4)(SI), R10 // g = H6
277 MOVL (7*4)(SI), R11 // h = H7
278
279 MOVOU bflipMask<>(SB), X13 // X13: byte-swap mask used by the PSHUFBs below
280 MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
281 MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
282
283 MOVQ message_base+24(FP), SI // SI: &message
284
// loop0: one iteration per 64-byte block.
285 loop0:
286 LEAQ constants<>(SB), BP // BP: start of the round-constant table, reset for every block
287
288 // byte swap first 16 dwords
289 MOVOU 0*16(SI), X4
290 LONG $0x380f4166; WORD $0xe500 // PSHUFB XMM4, XMM13
291 MOVOU 1*16(SI), X5
292 LONG $0x380f4166; WORD $0xed00 // PSHUFB XMM5, XMM13
293 MOVOU 2*16(SI), X6
294 LONG $0x380f4166; WORD $0xf500 // PSHUFB XMM6, XMM13
295 MOVOU 3*16(SI), X7
296 LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
297
298 MOVQ SI, reserved3+72(FP) // save the block address; SI is reloaded with &h after the rounds
299 MOVD $0x3, DI // DI: loop1 pass counter
300
301 // Align
302 // nop WORD PTR [rax+rax*1+0x0]
303
304 // schedule 48 input dwords, by doing 3 passes of 16 rounds each
// Every step copies X4, adds four constants and stores the sums where the
// macro reads them. X4 is the source each time because FOUR_ROUNDS_AND_SCHED
// ends with ROTATE_XS, which rotates X4..X7.
305 loop1:
306 MOVOU X4, X9
307 LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
308 MOVOU X9, reserved0+48(FP)
309 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
310
311 MOVOU X4, X9
312 LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
313 MOVOU X9, reserved0+48(FP)
314 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
315
316 MOVOU X4, X9
317 LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */
318 MOVOU X9, reserved0+48(FP)
319 FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
320
321 MOVOU X4, X9
322 LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */
323 MOVOU X9, reserved0+48(FP)
324 ADDQ $64, BP // advance to the constants of the next 16 rounds
325 FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
326
327 SUBQ $1, DI
328 JNE loop1
329
330 MOVD $0x2, DI // DI: loop2 pass counter
331
// loop2: the final 16 rounds, two passes of 8 rounds each, without scheduling.
// Each pass consumes X4 and X5; X6 and X7 are moved down for the second pass.
332 loop2:
333 MOVOU X4, X9
334 LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
335 MOVOU X9, reserved0+48(FP)
336 DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
337 DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
338 DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
339 DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
340
341 MOVOU X5, X9
342 LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
343 MOVOU X9, reserved0+48(FP)
344 ADDQ $32, BP // advance to the constants of the next 8 rounds
345 DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
346 DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
347 DO_ROUND( CX, R8, DX, R9, R10, R11, AX, BX, 56)
348 DO_ROUND( BX, CX, R8, DX, R9, R10, R11, AX, 60)
349
350 MOVOU X6, X4
351 MOVOU X7, X5
352
353 SUBQ $1, DI
354 JNE loop2
355
// Add the working variables to the previous state and store the result in h.
356 MOVQ h+0(FP), SI // SI: &h
357 ADDL (0*4)(SI), AX // H0 = a + H0
358 MOVL AX, (0*4)(SI)
359 ADDL (1*4)(SI), BX // H1 = b + H1
360 MOVL BX, (1*4)(SI)
361 ADDL (2*4)(SI), CX // H2 = c + H2
362 MOVL CX, (2*4)(SI)
363 ADDL (3*4)(SI), R8 // H3 = d + H3
364 MOVL R8, (3*4)(SI)
365 ADDL (4*4)(SI), DX // H4 = e + H4
366 MOVL DX, (4*4)(SI)
367 ADDL (5*4)(SI), R9 // H5 = f + H5
368 MOVL R9, (5*4)(SI)
369 ADDL (6*4)(SI), R10 // H6 = g + H6
370 MOVL R10, (6*4)(SI)
371 ADDL (7*4)(SI), R11 // H7 = h + H7
372 MOVL R11, (7*4)(SI)
373
// Step to the next block and continue until the saved end address is reached.
374 MOVQ reserved3+72(FP), SI
375 ADDQ $64, SI
376 CMPQ reserved2+64(FP), SI
377 JNE loop0
378
379 done_hash:
380 RET
381
382 // Constants table
// The 64 SHA-256 round constants. Each 8-byte entry packs two consecutive
// 32-bit constants, the lower-indexed one in the low half (for example
// 0x428a2f98 followed by 0x71374491), giving 32 entries / 256 bytes.
383 DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
384 DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
385 DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
386 DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
387 DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
388 DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
389 DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
390 DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
391 DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
392 DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
393 DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
394 DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
395 DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
396 DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
397 DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
398 DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
399 DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
400 DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
401 DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
402 DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
403 DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
404 DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
405 DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
406 DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
407 DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
408 DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
409 DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
410 DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
411 DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
412 DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
413 DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
414 DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7
415
// Byte-shuffle control that reverses the four bytes inside every 32-bit lane;
// loaded into X13 and applied to each 16-byte chunk of the message.
416 DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
417 DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
418
// Byte-shuffle control: dwords 0 and 2 of the source go to dwords 0 and 1,
// the upper half is cleared (0xFF selectors). Loaded into X10 (xBxA -> 00BA).
419 DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
420 DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
421
// Byte-shuffle control: dwords 0 and 2 of the source go to dwords 2 and 3,
// the lower half is cleared (0xFF selectors). Loaded into X12 (xDxC -> DC00).
422 DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
423 DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100
424
// Symbol sizes and flags; the literal 8 on the constants table is the value
// of RODATA in Go's textflag.h.
425 GLOBL constants<>(SB), 8, $256
426 GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
427 GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
428 GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 /*
33 * Minio Cloud Storage, (C) 2016 Minio, Inc.
1717
1818 package sha256
1919
20 func blockArmGo(dig *digest, p []byte) {}
21
// blockAvxGo adapts blockAvx to the digest type: it copies the eight state
// words of dig.h into a temporary slice, runs blockAvx over p with the four
// reserved arguments set to zero, and copies the updated words back.
22 func blockAvxGo(dig *digest, p []byte) {
23
24 h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
25
26 blockAvx(h[:], p[:], 0, 0, 0, 0)
27
28 dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
29 }
30
// blockAvx2Go adapts blockAvx2 to the digest type: it copies the eight state
// words of dig.h into a temporary slice, runs blockAvx2 over p, and copies the
// updated words back.
31 func blockAvx2Go(dig *digest, p []byte) {
32
33 h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
34
35 blockAvx2(h[:], p[:])
36
37 dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
38 }
39
40 func blockSsseGo(dig *digest, p []byte) {
41
42 h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
43
44 blockSsse(h[:], p[:], 0, 0, 0, 0)
45
46 dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
// blockArmGo is a stub: it ignores its arguments and panics unconditionally,
// so an unexpected call is reported instead of silently doing nothing.
20 func blockArmGo(dig *digest, p []byte) {
21 panic("blockArmGo called unexpectedly")
4722 }
4823
// blockShaGo forwards to blockSha, passing a pointer to the digest state
// dig.h together with the input bytes p.
4924 func blockShaGo(dig *digest, p []byte) {
50
5125 blockSha(&dig.h, p)
5226 }
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 /*
33 * Minio Cloud Storage, (C) 2016 Minio, Inc.
1717
1818 package sha256
1919
20 func blockAvx2Go(dig *digest, p []byte) {}
21 func blockAvxGo(dig *digest, p []byte) {}
22 func blockSsseGo(dig *digest, p []byte) {}
23 func blockShaGo(dig *digest, p []byte) {}
// blockShaGo is a stub: it ignores its arguments and panics unconditionally,
// so an unexpected call is reported instead of silently doing nothing.
// The panic message names this function exactly (it previously read
// "blockShaGoc", which matches no identifier in the package).
20 func blockShaGo(dig *digest, p []byte) {
21 panic("blockShaGo called unexpectedly")
22 }
2423
2524 //go:noescape
2625 func blockArm(h []uint32, message []uint8)
0 //+build !noasm,!appengine
0 //+build !noasm,!appengine,gc
11
22 // ARM64 version of SHA256
33
0 //+build appengine noasm !amd64,!arm64
0 //+build appengine noasm !amd64,!arm64 !gc
11
22 /*
33 * Minio Cloud Storage, (C) 2019 Minio, Inc.
1717
1818 package sha256
1919
20 func blockAvx2Go(dig *digest, p []byte) {}
21 func blockAvxGo(dig *digest, p []byte) {}
22 func blockSsseGo(dig *digest, p []byte) {}
23 func blockShaGo(dig *digest, p []byte) {}
24 func blockArmGo(dig *digest, p []byte) {}
// blockShaGo is a stub: it ignores its arguments and panics unconditionally,
// so an unexpected call is reported instead of silently doing nothing.
20 func blockShaGo(dig *digest, p []byte) {
21 panic("blockShaGo called unexpectedly")
22
23 }
24
// blockArmGo is a stub: it ignores its arguments and panics unconditionally,
// so an unexpected call is reported instead of silently doing nothing.
25 func blockArmGo(dig *digest, p []byte) {
26 panic("blockArmGo called unexpectedly")
27 }