diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 0000000..20a639a --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,42 @@ +name: Go + +on: + pull_request: + branches: + - master + push: + branches: + - master + +jobs: + build: + name: Test on Go ${{ matrix.go-version }} and ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + max-parallel: 4 + matrix: + go-version: [1.16.x, 1.15.x, 1.14.x] + os: [ubuntu-latest, windows-latest, macos-latest] + steps: + - name: Set up Go ${{ matrix.go-version }} + uses: actions/setup-go@v1 + with: + go-version: ${{ matrix.go-version }} + id: go + + - name: Check out code into the Go module directory + uses: actions/checkout@v1 + + - name: Build on ${{ matrix.os }} + if: matrix.os == 'windows-latest' + run: go test -race -v ./... + - name: Build on ${{ matrix.os }} + if: matrix.os == 'macos-latest' + run: go test -race -v ./... + - name: Build on ${{ matrix.os }} + if: matrix.os == 'ubuntu-latest' + run: | + diff -au <(gofmt -d .) <(printf "") + go test -race -v ./... + go vet -asmdecl . + ./test-architectures.sh diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 4f85db5..0000000 --- a/.travis.yml +++ /dev/null @@ -1,25 +0,0 @@ -sudo: required -dist: trusty -language: go - -os: -- linux - -go: -- tip -- 1.12.x - -env: -- ARCH=x86_64 -- ARCH=i686 - -matrix: - fast_finish: true - allow_failures: - - go: tip - -script: -- diff -au <(gofmt -d .) <(printf "") -- go test -race -v ./... -- go vet -asmdecl . -- ./test-architectures.sh diff --git a/README.md b/README.md index 5282d83..6117488 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,18 @@ # sha256-simd -Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions and AVX2 for Intel and ARM64 for ARM. On AVX512 it provides an up to 8x improvement (over 3 GB/s per core) in comparison to AVX2. SHA Extensions give a performance boost of close to 4x over AVX2. 
+Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions for x86 and ARM64 for ARM. +On AVX512 it provides up to an 8x improvement (over 3 GB/s per core). +SHA Extensions give a performance boost of close to 4x over native. ## Introduction -This package is designed as a replacement for `crypto/sha256`. For Intel CPUs it has two flavors for AVX512 and AVX2 (AVX/SSE are also supported). For ARM CPUs with the Cryptography Extensions, advantage is taken of the SHA2 instructions resulting in a massive performance improvement. +This package is designed as a replacement for `crypto/sha256`. +For ARM CPUs with the Cryptography Extensions, advantage is taken of the SHA2 instructions, resulting in a massive performance improvement. -This package uses Golang assembly. The AVX512 version is based on the Intel's "multi-buffer crypto library for IPSec" whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al. +This package uses Golang assembly. +The AVX512 version is based on Intel's "multi-buffer crypto library for IPSec" whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al. -## New: Support for Intel SHA Extensions +## Support for Intel SHA Extensions Support for the Intel SHA Extensions has been added by Kristofer Peterson (@svenski123), originally developed for spacemeshos [here](https://github.com/spacemeshos/POET/issues/23). On CPUs that support it (known thus far Intel Celeron J3455 and AMD Ryzen) it gives a significant boost in performance (with thanks to @AudriusButkevicius for reporting the results; full results [here](https://github.com/minio/sha256-simd/pull/37#issuecomment-451607827)). 
@@ -18,7 +22,9 @@ BenchmarkHash5M 514.40 1975.17 3.84x ``` -Thanks to Kristofer Peterson, we also added additional performance changes such as optimized padding, endian conversions which sped up all implementations i.e. Intel SHA alone while doubled performance for small sizes, the other changes increased everything roughly 50%. +Thanks to Kristofer Peterson, we also added performance changes such as optimized padding and +endian conversions, which sped up all implementations: Intel SHA alone doubled performance for small sizes, +while the other changes increased everything by roughly 50%. ## Support for AVX512 @@ -58,7 +64,8 @@ ## Drop-In Replacement -The following code snippet shows how you can use `github.com/minio/sha256-simd`. This will automatically select the fastest method for the architecture on which it will be executed. +The following code snippet shows how you can use `github.com/minio/sha256-simd`. +This will automatically select the fastest method for the architecture on which it will be executed. ```go import "github.com/minio/sha256-simd" @@ -80,9 +87,6 @@ | 3.0 GHz Intel Xeon Platinum 8124M | AVX512 | 3498 | | 3.7 GHz AMD Ryzen 7 2700X | SHA Ext | 1979 | | 1.2 GHz ARM Cortex-A53 | ARM64 | 638 | -| 3.0 GHz Intel Xeon Platinum 8124M | AVX2 | 449 | -| 3.1 GHz Intel Core i7 | AVX | 362 | -| 3.1 GHz Intel Core i7 | SSE | 299 | ## asm2plan9s diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index a66bfa9..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,32 +0,0 @@ -# version format -version: "{build}" - -# Operating system (build VM template) -os: Windows Server 2012 R2 - -# Platform. 
-platform: x64 - -clone_folder: c:\gopath\src\github.com\minio\sha256-simd - -# environment variables -environment: - GOPATH: c:\gopath - GO15VENDOREXPERIMENT: 1 - -# scripts that run after cloning repository -install: - - set PATH=%GOPATH%\bin;c:\go\bin;%PATH% - - go version - - go env - -# to run your custom scripts instead of automatic MSBuild -build_script: - - go test . - - go test -race . - -# to disable automatic tests -test: off - -# to disable deployment -deploy: off diff --git a/cpuid.go b/cpuid.go deleted file mode 100644 index 878ad46..0000000 --- a/cpuid.go +++ /dev/null @@ -1,119 +0,0 @@ -// Minio Cloud Storage, (C) 2016 Minio, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -package sha256 - -// True when SIMD instructions are available. 
-var avx512 bool -var avx2 bool -var avx bool -var sse bool -var sse2 bool -var sse3 bool -var ssse3 bool -var sse41 bool -var sse42 bool -var popcnt bool -var sha bool -var armSha = haveArmSha() - -func init() { - var _xsave bool - var _osxsave bool - var _avx bool - var _avx2 bool - var _avx512f bool - var _avx512dq bool - // var _avx512pf bool - // var _avx512er bool - // var _avx512cd bool - var _avx512bw bool - var _avx512vl bool - var _sseState bool - var _avxState bool - var _opmaskState bool - var _zmmHI256State bool - var _hi16ZmmState bool - - mfi, _, _, _ := cpuid(0) - - if mfi >= 1 { - _, _, c, d := cpuid(1) - - sse = (d & (1 << 25)) != 0 - sse2 = (d & (1 << 26)) != 0 - sse3 = (c & (1 << 0)) != 0 - ssse3 = (c & (1 << 9)) != 0 - sse41 = (c & (1 << 19)) != 0 - sse42 = (c & (1 << 20)) != 0 - popcnt = (c & (1 << 23)) != 0 - _xsave = (c & (1 << 26)) != 0 - _osxsave = (c & (1 << 27)) != 0 - _avx = (c & (1 << 28)) != 0 - } - - if mfi >= 7 { - _, b, _, _ := cpuid(7) - - _avx2 = (b & (1 << 5)) != 0 - _avx512f = (b & (1 << 16)) != 0 - _avx512dq = (b & (1 << 17)) != 0 - // _avx512pf = (b & (1 << 26)) != 0 - // _avx512er = (b & (1 << 27)) != 0 - // _avx512cd = (b & (1 << 28)) != 0 - _avx512bw = (b & (1 << 30)) != 0 - _avx512vl = (b & (1 << 31)) != 0 - sha = (b & (1 << 29)) != 0 - } - - // Stop here if XSAVE unsupported or not enabled - if !_xsave || !_osxsave { - return - } - - if _xsave && _osxsave { - a, _ := xgetbv(0) - - _sseState = (a & (1 << 1)) != 0 - _avxState = (a & (1 << 2)) != 0 - _opmaskState = (a & (1 << 5)) != 0 - _zmmHI256State = (a & (1 << 6)) != 0 - _hi16ZmmState = (a & (1 << 7)) != 0 - } else { - _sseState = true - } - - // Very unlikely that OS would enable XSAVE and then disable SSE - if !_sseState { - sse = false - sse2 = false - sse3 = false - ssse3 = false - sse41 = false - sse42 = false - } - - if _avxState { - avx = _avx - avx2 = _avx2 - } - - if _opmaskState && _zmmHI256State && _hi16ZmmState { - avx512 = (_avx512f && - _avx512dq && - 
_avx512bw && - _avx512vl) - } -} diff --git a/cpuid_386.go b/cpuid_386.go deleted file mode 100644 index c9890be..0000000 --- a/cpuid_386.go +++ /dev/null @@ -1,24 +0,0 @@ -// Minio Cloud Storage, (C) 2016 Minio, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -package sha256 - -func cpuid(op uint32) (eax, ebx, ecx, edx uint32) -func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -func xgetbv(index uint32) (eax, edx uint32) - -func haveArmSha() bool { - return false -} diff --git a/cpuid_386.s b/cpuid_386.s deleted file mode 100644 index 1511cd6..0000000 --- a/cpuid_386.s +++ /dev/null @@ -1,53 +0,0 @@ -// The MIT License (MIT) -// -// Copyright (c) 2015 Klaus Post -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -// +build 386,!gccgo - -// func cpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·cpuid(SB), 7, $0 - XORL CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+4(FP) - MOVL BX, ebx+8(FP) - MOVL CX, ecx+12(FP) - MOVL DX, edx+16(FP) - RET - -// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·cpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func xgetbv(index uint32) (eax, edx uint32) -TEXT ·xgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+4(FP) - MOVL DX, edx+8(FP) - RET diff --git a/cpuid_amd64.go b/cpuid_amd64.go deleted file mode 100644 index c9890be..0000000 --- a/cpuid_amd64.go +++ /dev/null @@ -1,24 +0,0 @@ -// Minio Cloud Storage, (C) 2016 Minio, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -package sha256 - -func cpuid(op uint32) (eax, ebx, ecx, edx uint32) -func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -func xgetbv(index uint32) (eax, edx uint32) - -func haveArmSha() bool { - return false -} diff --git a/cpuid_amd64.s b/cpuid_amd64.s deleted file mode 100644 index b0f4147..0000000 --- a/cpuid_amd64.s +++ /dev/null @@ -1,53 +0,0 @@ -// The MIT License (MIT) -// -// Copyright (c) 2015 Klaus Post -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -// +build amd64,!gccgo - -// func cpuid(op uint32) (eax, ebx, ecx, edx uint32) -TEXT ·cpuid(SB), 7, $0 - XORQ CX, CX - MOVL op+0(FP), AX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) -TEXT ·cpuidex(SB), 7, $0 - MOVL op+0(FP), AX - MOVL op2+4(FP), CX - CPUID - MOVL AX, eax+8(FP) - MOVL BX, ebx+12(FP) - MOVL CX, ecx+16(FP) - MOVL DX, edx+20(FP) - RET - -// func xgetbv(index uint32) (eax, edx uint32) -TEXT ·xgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+8(FP) - MOVL DX, edx+12(FP) - RET diff --git a/cpuid_arm.go b/cpuid_arm.go deleted file mode 100644 index 351dff4..0000000 --- a/cpuid_arm.go +++ /dev/null @@ -1,32 +0,0 @@ -// Minio Cloud Storage, (C) 2016 Minio, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -package sha256 - -func cpuid(op uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 -} - -func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 -} - -func xgetbv(index uint32) (eax, edx uint32) { - return 0, 0 -} - -func haveArmSha() bool { - return false -} diff --git a/cpuid_linux_arm64.go b/cpuid_linux_arm64.go deleted file mode 100644 index e739996..0000000 --- a/cpuid_linux_arm64.go +++ /dev/null @@ -1,49 +0,0 @@ -// +build arm64,linux - -// Minio Cloud Storage, (C) 2016 Minio, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -package sha256 - -import ( - "bytes" - "io/ioutil" -) - -func cpuid(op uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 -} - -func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 -} - -func xgetbv(index uint32) (eax, edx uint32) { - return 0, 0 -} - -// File to check for cpu capabilities. -const procCPUInfo = "/proc/cpuinfo" - -// Feature to check for. -const sha256Feature = "sha2" - -func haveArmSha() bool { - cpuInfo, err := ioutil.ReadFile(procCPUInfo) - if err != nil { - return false - } - return bytes.Contains(cpuInfo, []byte(sha256Feature)) -} diff --git a/cpuid_other.go b/cpuid_other.go index 3e44158..cd9fbf2 100644 --- a/cpuid_other.go +++ b/cpuid_other.go @@ -1,4 +1,4 @@ -// Minio Cloud Storage, (C) 2016 Minio, Inc. +// Minio Cloud Storage, (C) 2021 Minio, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,22 +13,34 @@ // limitations under the License. 
// -// +build !386,!amd64,!arm,!arm64 arm64,!linux - package sha256 -func cpuid(op uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 +import ( + "bytes" + "io/ioutil" + "runtime" + + "github.com/klauspost/cpuid/v2" +) + +func hasArmSha2() bool { + if cpuid.CPU.Has(cpuid.SHA2) { + return true + } + if runtime.GOARCH != "arm64" || runtime.GOOS != "linux" { + return false + } + + // Fall back to hacky cpuinfo parsing... + const procCPUInfo = "/proc/cpuinfo" + + // Feature to check for. + const sha256Feature = "sha2" + + cpuInfo, err := ioutil.ReadFile(procCPUInfo) + if err != nil { + return false + } + return bytes.Contains(cpuInfo, []byte(sha256Feature)) + } - -func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) { - return 0, 0, 0, 0 -} - -func xgetbv(index uint32) (eax, edx uint32) { - return 0, 0 -} - -func haveArmSha() bool { - return false -} diff --git a/go.mod b/go.mod index 4451e9e..b254fa3 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/minio/sha256-simd -go 1.12 +go 1.13 + +require github.com/klauspost/cpuid/v2 v2.0.6 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..5b8b0f4 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI= +github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= diff --git a/sha256.go b/sha256.go index 4e1f6d2..b137ead 100644 --- a/sha256.go +++ b/sha256.go @@ -21,6 +21,8 @@ "encoding/binary" "hash" "runtime" + + "github.com/klauspost/cpuid/v2" ) // Size - The size of a SHA256 checksum in bytes. 
@@ -67,10 +69,6 @@ const ( blockfuncGeneric blockfuncType = iota - blockfuncAvx512 blockfuncType = iota - blockfuncAvx2 blockfuncType = iota - blockfuncAvx blockfuncType = iota - blockfuncSsse blockfuncType = iota blockfuncSha blockfuncType = iota blockfuncArm blockfuncType = iota ) @@ -78,24 +76,22 @@ var blockfunc blockfuncType func init() { - is386bit := runtime.GOARCH == "386" - isARM := runtime.GOARCH == "arm" + blockfunc = blockfuncGeneric switch { - case is386bit || isARM: - blockfunc = blockfuncGeneric - case sha && ssse3 && sse41: + case hasSHAExtensions(): blockfunc = blockfuncSha - case avx2: - blockfunc = blockfuncAvx2 - case avx: - blockfunc = blockfuncAvx - case ssse3: - blockfunc = blockfuncSsse - case armSha: + case hasArmSha2(): blockfunc = blockfuncArm default: blockfunc = blockfuncGeneric } +} + +var avx512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ, cpuid.AVX512BW, cpuid.AVX512VL) + +// hasSHAExtensions return whether the cpu supports SHA extensions. +func hasSHAExtensions() bool { + return cpuid.CPU.Supports(cpuid.SHA, cpuid.SSSE3, cpuid.SSE4) && runtime.GOARCH == "amd64" } // New returns a new hash.Hash computing the SHA256 checksum. @@ -278,12 +274,6 @@ func block(dig *digest, p []byte) { if blockfunc == blockfuncSha { blockShaGo(dig, p) - } else if blockfunc == blockfuncAvx2 { - blockAvx2Go(dig, p) - } else if blockfunc == blockfuncAvx { - blockAvxGo(dig, p) - } else if blockfunc == blockfuncSsse { - blockSsseGo(dig, p) } else if blockfunc == blockfuncArm { blockArmGo(dig, p) } else if blockfunc == blockfuncGeneric { diff --git a/sha256_test.go b/sha256_test.go index 89499e3..602a298 100644 --- a/sha256_test.go +++ b/sha256_test.go @@ -52,9 +52,10 @@ import ( "encoding/hex" "fmt" - "runtime" "strings" "testing" + + "github.com/klauspost/cpuid/v2" ) type sha256Test struct { @@ -2225,12 +2226,7 @@ } } - if runtime.GOARCH == "386" || runtime.GOARCH == "arm" { - // doesn't support anything but the generic version. 
- return - } - - if sha && ssse3 && sse41 { + if cpuid.CPU.Supports(cpuid.SHA, cpuid.SSSE3, cpuid.SSE4) { blockfunc = blockfuncSha for _, g := range golden { s := fmt.Sprintf("%x", Sum256([]byte(g.in))) @@ -2239,30 +2235,13 @@ } } } - if avx2 { - blockfunc = blockfuncAvx2 + + if hasArmSha2() { + blockfunc = blockfuncArm for _, g := range golden { s := fmt.Sprintf("%x", Sum256([]byte(g.in))) if Sum256([]byte(g.in)) != g.out { - t.Fatalf("AVX2: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:])) - } - } - } - if avx { - blockfunc = blockfuncAvx - for _, g := range golden { - s := fmt.Sprintf("%x", Sum256([]byte(g.in))) - if Sum256([]byte(g.in)) != g.out { - t.Fatalf("AVX: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:])) - } - } - } - if ssse3 { - blockfunc = blockfuncSsse - for _, g := range golden { - s := fmt.Sprintf("%x", Sum256([]byte(g.in))) - if Sum256([]byte(g.in)) != g.out { - t.Fatalf("SSSE3: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:])) + t.Fatalf("ARM: Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:])) } } } @@ -2301,10 +2280,7 @@ t blockfuncType f bool }{ - {"SHA_", blockfuncSha, sha && sse41 && ssse3}, - {"AVX2", blockfuncAvx2, avx2}, - {"AVX_", blockfuncAvx, avx}, - {"SSSE", blockfuncSsse, ssse3}, + {"SHA_", blockfuncSha, hasSHAExtensions()}, {"GEN_", blockfuncGeneric, true}, } diff --git a/sha256blockAvx2_amd64.go b/sha256blockAvx2_amd64.go deleted file mode 100644 index 52fcaee..0000000 --- a/sha256blockAvx2_amd64.go +++ /dev/null @@ -1,22 +0,0 @@ -//+build !noasm,!appengine - -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package sha256 - -//go:noescape -func blockAvx2(h []uint32, message []uint8) diff --git a/sha256blockAvx2_amd64.s b/sha256blockAvx2_amd64.s deleted file mode 100644 index 80b0b73..0000000 --- a/sha256blockAvx2_amd64.s +++ /dev/null @@ -1,1449 +0,0 @@ -//+build !noasm,!appengine - -// SHA256 implementation for AVX2 - -// -// Minio Cloud Storage, (C) 2016 Minio, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -// -// This code is based on an Intel White-Paper: -// "Fast SHA-256 Implementations on Intel Architecture Processors" -// -// together with the reference implementation from the following authors: -// James Guilford -// Kirk Yap -// Tim Chen -// -// For Golang it has been converted to Plan 9 assembly with the help of -// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9 -// equivalents -// - -DATA K256<>+0x000(SB)/8, $0x71374491428a2f98 -DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf -DATA K256<>+0x010(SB)/8, $0x71374491428a2f98 -DATA K256<>+0x018(SB)/8, $0xe9b5dba5b5c0fbcf -DATA K256<>+0x020(SB)/8, $0x59f111f13956c25b -DATA K256<>+0x028(SB)/8, $0xab1c5ed5923f82a4 -DATA K256<>+0x030(SB)/8, $0x59f111f13956c25b -DATA K256<>+0x038(SB)/8, $0xab1c5ed5923f82a4 -DATA K256<>+0x040(SB)/8, $0x12835b01d807aa98 -DATA K256<>+0x048(SB)/8, $0x550c7dc3243185be -DATA K256<>+0x050(SB)/8, $0x12835b01d807aa98 -DATA K256<>+0x058(SB)/8, $0x550c7dc3243185be -DATA K256<>+0x060(SB)/8, $0x80deb1fe72be5d74 -DATA K256<>+0x068(SB)/8, $0xc19bf1749bdc06a7 -DATA K256<>+0x070(SB)/8, $0x80deb1fe72be5d74 -DATA K256<>+0x078(SB)/8, $0xc19bf1749bdc06a7 -DATA K256<>+0x080(SB)/8, $0xefbe4786e49b69c1 -DATA K256<>+0x088(SB)/8, $0x240ca1cc0fc19dc6 -DATA K256<>+0x090(SB)/8, $0xefbe4786e49b69c1 -DATA K256<>+0x098(SB)/8, $0x240ca1cc0fc19dc6 -DATA K256<>+0x0a0(SB)/8, $0x4a7484aa2de92c6f -DATA K256<>+0x0a8(SB)/8, $0x76f988da5cb0a9dc -DATA K256<>+0x0b0(SB)/8, $0x4a7484aa2de92c6f -DATA K256<>+0x0b8(SB)/8, $0x76f988da5cb0a9dc -DATA K256<>+0x0c0(SB)/8, $0xa831c66d983e5152 -DATA K256<>+0x0c8(SB)/8, $0xbf597fc7b00327c8 -DATA K256<>+0x0d0(SB)/8, $0xa831c66d983e5152 -DATA K256<>+0x0d8(SB)/8, $0xbf597fc7b00327c8 -DATA K256<>+0x0e0(SB)/8, $0xd5a79147c6e00bf3 -DATA K256<>+0x0e8(SB)/8, $0x1429296706ca6351 -DATA K256<>+0x0f0(SB)/8, $0xd5a79147c6e00bf3 -DATA K256<>+0x0f8(SB)/8, $0x1429296706ca6351 -DATA K256<>+0x100(SB)/8, $0x2e1b213827b70a85 -DATA K256<>+0x108(SB)/8, $0x53380d134d2c6dfc -DATA 
K256<>+0x110(SB)/8, $0x2e1b213827b70a85 -DATA K256<>+0x118(SB)/8, $0x53380d134d2c6dfc -DATA K256<>+0x120(SB)/8, $0x766a0abb650a7354 -DATA K256<>+0x128(SB)/8, $0x92722c8581c2c92e -DATA K256<>+0x130(SB)/8, $0x766a0abb650a7354 -DATA K256<>+0x138(SB)/8, $0x92722c8581c2c92e -DATA K256<>+0x140(SB)/8, $0xa81a664ba2bfe8a1 -DATA K256<>+0x148(SB)/8, $0xc76c51a3c24b8b70 -DATA K256<>+0x150(SB)/8, $0xa81a664ba2bfe8a1 -DATA K256<>+0x158(SB)/8, $0xc76c51a3c24b8b70 -DATA K256<>+0x160(SB)/8, $0xd6990624d192e819 -DATA K256<>+0x168(SB)/8, $0x106aa070f40e3585 -DATA K256<>+0x170(SB)/8, $0xd6990624d192e819 -DATA K256<>+0x178(SB)/8, $0x106aa070f40e3585 -DATA K256<>+0x180(SB)/8, $0x1e376c0819a4c116 -DATA K256<>+0x188(SB)/8, $0x34b0bcb52748774c -DATA K256<>+0x190(SB)/8, $0x1e376c0819a4c116 -DATA K256<>+0x198(SB)/8, $0x34b0bcb52748774c -DATA K256<>+0x1a0(SB)/8, $0x4ed8aa4a391c0cb3 -DATA K256<>+0x1a8(SB)/8, $0x682e6ff35b9cca4f -DATA K256<>+0x1b0(SB)/8, $0x4ed8aa4a391c0cb3 -DATA K256<>+0x1b8(SB)/8, $0x682e6ff35b9cca4f -DATA K256<>+0x1c0(SB)/8, $0x78a5636f748f82ee -DATA K256<>+0x1c8(SB)/8, $0x8cc7020884c87814 -DATA K256<>+0x1d0(SB)/8, $0x78a5636f748f82ee -DATA K256<>+0x1d8(SB)/8, $0x8cc7020884c87814 -DATA K256<>+0x1e0(SB)/8, $0xa4506ceb90befffa -DATA K256<>+0x1e8(SB)/8, $0xc67178f2bef9a3f7 -DATA K256<>+0x1f0(SB)/8, $0xa4506ceb90befffa -DATA K256<>+0x1f8(SB)/8, $0xc67178f2bef9a3f7 - -DATA K256<>+0x200(SB)/8, $0x0405060700010203 -DATA K256<>+0x208(SB)/8, $0x0c0d0e0f08090a0b -DATA K256<>+0x210(SB)/8, $0x0405060700010203 -DATA K256<>+0x218(SB)/8, $0x0c0d0e0f08090a0b -DATA K256<>+0x220(SB)/8, $0x0b0a090803020100 -DATA K256<>+0x228(SB)/8, $0xffffffffffffffff -DATA K256<>+0x230(SB)/8, $0x0b0a090803020100 -DATA K256<>+0x238(SB)/8, $0xffffffffffffffff -DATA K256<>+0x240(SB)/8, $0xffffffffffffffff -DATA K256<>+0x248(SB)/8, $0x0b0a090803020100 -DATA K256<>+0x250(SB)/8, $0xffffffffffffffff -DATA K256<>+0x258(SB)/8, $0x0b0a090803020100 - -GLOBL K256<>(SB), 8, $608 - -// We need 0x220 stack space aligned on 
a 512 boundary, so for the -// worstcase-aligned SP we need twice this amount, being 1088 (=0x440) -// -// SP aligned end-aligned stacksize -// 100013d0 10001400 10001620 592 -// 100013d8 10001400 10001620 584 -// 100013e0 10001600 10001820 1088 -// 100013e8 10001600 10001820 1080 - -// func blockAvx2(h []uint32, message []uint8) -TEXT ·blockAvx2(SB),$1088-48 - - MOVQ h+0(FP), DI // DI: &h - MOVQ message_base+24(FP), SI // SI: &message - MOVQ message_len+32(FP), DX // len(message) - ADDQ SI, DX // end pointer of input - MOVQ SP, R11 // copy stack pointer - ADDQ $0x220, SP // sp += 0x220 - ANDQ $0xfffffffffffffe00, SP // align stack frame - ADDQ $0x1c0, SP - MOVQ DI, 0x40(SP) // save ctx - MOVQ SI, 0x48(SP) // save input - MOVQ DX, 0x50(SP) // save end pointer - MOVQ R11, 0x58(SP) // save copy of stack pointer - - WORD $0xf8c5; BYTE $0x77 // vzeroupper - ADDQ $0x40, SI // input++ - MOVL (DI), AX - MOVQ SI, R12 // borrow $T1 - MOVL 4(DI), BX - CMPQ SI, DX // $_end - MOVL 8(DI), CX - LONG $0xe4440f4c // cmove r12,rsp /* next block or random data */ - MOVL 12(DI), DX - MOVL 16(DI), R8 - MOVL 20(DI), R9 - MOVL 24(DI), R10 - MOVL 28(DI), R11 - - LEAQ K256<>(SB), BP - LONG $0x856f7dc5; LONG $0x00000220 // VMOVDQA YMM8, 0x220[rbp] /* vmovdqa ymm8,YMMWORD PTR [rip+0x220] */ - LONG $0x8d6f7dc5; LONG $0x00000240 // VMOVDQA YMM9, 0x240[rbp] /* vmovdqa ymm9,YMMWORD PTR [rip+0x240] */ - LONG $0x956f7dc5; LONG $0x00000200 // VMOVDQA YMM10, 0x200[rbp] /* vmovdqa ymm7,YMMWORD PTR [rip+0x200] */ - -loop0: - LONG $0x6f7dc1c4; BYTE $0xfa // VMOVDQA YMM7, YMM10 - - // Load first 16 dwords from two blocks - MOVOU -64(SI), X0 // vmovdqu xmm0,XMMWORD PTR [rsi-0x40] - MOVOU -48(SI), X1 // vmovdqu xmm1,XMMWORD PTR [rsi-0x30] - MOVOU -32(SI), X2 // vmovdqu xmm2,XMMWORD PTR [rsi-0x20] - MOVOU -16(SI), X3 // vmovdqu xmm3,XMMWORD PTR [rsi-0x10] - - // Byte swap data and transpose data into high/low - LONG $0x387dc3c4; WORD $0x2404; BYTE $0x01 // vinserti128 ymm0,ymm0,[r12],0x1 - LONG 
$0x3875c3c4; LONG $0x0110244c // vinserti128 ymm1,ymm1,0x10[r12],0x1 - LONG $0x007de2c4; BYTE $0xc7 // vpshufb ymm0,ymm0,ymm7 - LONG $0x386dc3c4; LONG $0x01202454 // vinserti128 ymm2,ymm2,0x20[r12],0x1 - LONG $0x0075e2c4; BYTE $0xcf // vpshufb ymm1,ymm1,ymm7 - LONG $0x3865c3c4; LONG $0x0130245c // vinserti128 ymm3,ymm3,0x30[r12],0x1 - - LEAQ K256<>(SB), BP - LONG $0x006de2c4; BYTE $0xd7 // vpshufb ymm2,ymm2,ymm7 - LONG $0x65fefdc5; BYTE $0x00 // vpaddd ymm4,ymm0,[rbp] - LONG $0x0065e2c4; BYTE $0xdf // vpshufb ymm3,ymm3,ymm7 - LONG $0x6dfef5c5; BYTE $0x20 // vpaddd ymm5,ymm1,0x20[rbp] - LONG $0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,0x40[rbp] - LONG $0x7dfee5c5; BYTE $0x60 // vpaddd ymm7,ymm3,0x60[rbp] - - LONG $0x247ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm4 - XORQ R14, R14 - LONG $0x6c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm5 - - ADDQ $-0x40, SP - MOVQ BX, DI - LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6 - XORQ CX, DI // magic - LONG $0x7c7ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm7 - MOVQ R9, R12 - ADDQ $0x80, BP - -loop1: - // Schedule 48 input dwords, by doing 3 rounds of 12 each - // Note: SIMD instructions are interleaved with the SHA calculations - ADDQ $-0x40, SP - LONG $0x0f75e3c4; WORD $0x04e0 // vpalignr ymm4,ymm1,ymm0,0x4 - - // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80) - LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80] - WORD $0x2145; BYTE $0xc4 // and r12d,r8d - LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19 - LONG $0x0f65e3c4; WORD $0x04fa // vpalignr ymm7,ymm3,ymm2,0x4 - LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb - LONG $0x30048d42 // lea eax,[rax+r14*1] - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7 - LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6 - LONG $0xc7fefdc5 // vpaddd ymm0,ymm0,ymm7 - LONG $0x231c8d47 // lea 
r11d,[r11+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xc7 // mov r15d,eax - LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3 - LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16 - LONG $0x2b1c8d47 // lea r11d,[r11+r13*1] - WORD $0x3141; BYTE $0xdf // xor r15d,ebx - LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe - LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd - LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2 - LONG $0x1a148d42 // lea edx,[rdx+r11*1] - LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6 - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xdf31 // xor edi,ebx - LONG $0xfb70fdc5; BYTE $0xfa // vpshufd ymm7,ymm3,0xfa - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1] - WORD $0x8945; BYTE $0xc4 // mov r12d,r8d - LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb - - // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84) - LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84] - WORD $0x2141; BYTE $0xd4 // and r12d,edx - LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19 - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb - LONG $0x331c8d47 // lea r11d,[r11+r14*1] - LONG $0x22148d47 // lea r10d,[r10+r12*1] - LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb - LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6 - LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6 - LONG $0x22148d47 // lea r10d,[r10+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xdf // mov edi,r11d - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16 - LONG $0x2a148d47 // lea r10d,[r10+r13*1] - WORD $0xc731 // xor edi,eax - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd - LONG 
$0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2 - LONG $0x110c8d42 // lea ecx,[rcx+r10*1] - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xc7 // xor r15d,eax - LONG $0xc4fefdc5 // vpaddd ymm0,ymm0,ymm4 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3a148d47 // lea r10d,[r10+r15*1] - WORD $0x8941; BYTE $0xd4 // mov r12d,edx - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88) - LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88] - WORD $0x2141; BYTE $0xcc // and r12d,ecx - LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb - LONG $0x32148d47 // lea r10d,[r10+r14*1] - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6 - LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8 - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xd7 // mov r15d,r10d - LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6 - LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16 - LONG $0x290c8d47 // lea r9d,[r9+r13*1] - WORD $0x3145; BYTE $0xdf // xor r15d,r11d - LONG $0xf870fdc5; BYTE $0x50 // vpshufd ymm7,ymm0,0x50 - LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd - LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2 - LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1] - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xdf // xor edi,r11d - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d45 
// lea r9d,[r9+rdi*1] - WORD $0x8941; BYTE $0xcc // mov r12d,ecx - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c) - LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c] - WORD $0x2141; BYTE $0xdc // and r12d,ebx - LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb - LONG $0x310c8d47 // lea r9d,[r9+r14*1] - LONG $0x20048d47 // lea r8d,[r8+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6 - LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9 - LONG $0x20048d47 // lea r8d,[r8+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xcf // mov edi,r9d - LONG $0xc6fefdc5 // vpaddd ymm0,ymm0,ymm6 - LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16 - LONG $0x28048d47 // lea r8d,[r8+r13*1] - WORD $0x3144; BYTE $0xd7 // xor edi,r10d - LONG $0x75fefdc5; BYTE $0x00 // vpaddd ymm6,ymm0,[rbp+0x0] - LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd - LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2 - LONG $0x00048d42 // lea eax,[rax+r8*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xd7 // xor r15d,r10d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d47 // lea r8d,[r8+r15*1] - WORD $0x8941; BYTE $0xdc // mov r12d,ebx - - LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6 - LONG $0x0f6de3c4; WORD $0x04e1 // vpalignr ymm4,ymm2,ymm1,0x4 - - // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0) - LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0] - WORD $0x2141; BYTE $0xc4 // and r12d,eax - LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19 - LONG $0x0f7de3c4; WORD $0x04fb // vpalignr ymm7,ymm0,ymm3,0x4 - LONG 
$0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb - LONG $0x30048d47 // lea r8d,[r8+r14*1] - LONG $0x22148d42 // lea edx,[rdx+r12*1] - LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7 - LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6 - LONG $0xcffef5c5 // vpaddd ymm1,ymm1,ymm7 - LONG $0x22148d42 // lea edx,[rdx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xc7 // mov r15d,r8d - LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3 - LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16 - LONG $0x2a148d42 // lea edx,[rdx+r13*1] - WORD $0x3145; BYTE $0xcf // xor r15d,r9d - LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe - LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd - LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2 - LONG $0x131c8d45 // lea r11d,[r11+rdx*1] - LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6 - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xcf // xor edi,r9d - LONG $0xf870fdc5; BYTE $0xfa // vpshufd ymm7,ymm0,0xfa - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1] - WORD $0x8941; BYTE $0xc4 // mov r12d,eax - LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb - - // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4) - LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4] - WORD $0x2145; BYTE $0xdc // and r12d,r11d - LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19 - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb - LONG $0x32148d42 // lea edx,[rdx+r14*1] - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb - LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6 - LONG $0xe6efddc5 // vpxor 
ymm4,ymm4,ymm6 - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xd789 // mov edi,edx - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16 - LONG $0x290c8d42 // lea ecx,[rcx+r13*1] - WORD $0x3144; BYTE $0xc7 // xor edi,r8d - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd - LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2 - LONG $0x0a148d45 // lea r10d,[r10+rcx*1] - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xc7 // xor r15d,r8d - LONG $0xccfef5c5 // vpaddd ymm1,ymm1,ymm4 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d42 // lea ecx,[rcx+r15*1] - WORD $0x8945; BYTE $0xdc // mov r12d,r11d - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8) - LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8] - WORD $0x2145; BYTE $0xd4 // and r12d,r10d - LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb - LONG $0x310c8d42 // lea ecx,[rcx+r14*1] - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6 - LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8 - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xcf // mov r15d,ecx - LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6 - LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16 - LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1] - WORD $0x3141; BYTE $0xd7 // xor r15d,edx - LONG $0xf970fdc5; BYTE $0x50 // vpshufd ymm7,ymm1,0x50 - LONG 
$0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd - LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2 - LONG $0x190c8d45 // lea r9d,[r9+rbx*1] - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xd731 // xor edi,edx - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1] - WORD $0x8945; BYTE $0xd4 // mov r12d,r10d - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac) - LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac] - WORD $0x2145; BYTE $0xcc // and r12d,r9d - LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb - LONG $0x331c8d42 // lea ebx,[rbx+r14*1] - LONG $0x20048d42 // lea eax,[rax+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6 - LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9 - LONG $0x20048d42 // lea eax,[rax+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xdf89 // mov edi,ebx - LONG $0xcefef5c5 // vpaddd ymm1,ymm1,ymm6 - LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16 - LONG $0x28048d42 // lea eax,[rax+r13*1] - WORD $0xcf31 // xor edi,ecx - LONG $0x75fef5c5; BYTE $0x20 // vpaddd ymm6,ymm1,[rbp+0x20] - LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd - LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2 - LONG $0x00048d45 // lea r8d,[r8+rax*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xcf // xor r15d,ecx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d42 // lea eax,[rax+r15*1] - WORD $0x8945; BYTE $0xcc // mov r12d,r9d - - 
LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6 - - LONG $0x24648d48; BYTE $0xc0 // lea rsp,[rsp-0x40] - LONG $0x0f65e3c4; WORD $0x04e2 // vpalignr ymm4,ymm3,ymm2,0x4 - - // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80) - LONG $0x249c0344; LONG $0x00000080 // add r11d,[rsp+0x80] - WORD $0x2145; BYTE $0xc4 // and r12d,r8d - LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19 - LONG $0x0f75e3c4; WORD $0x04f8 // vpalignr ymm7,ymm1,ymm0,0x4 - LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb - LONG $0x30048d42 // lea eax,[rax+r14*1] - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7 - LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6 - LONG $0xd7feedc5 // vpaddd ymm2,ymm2,ymm7 - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xc7 // mov r15d,eax - LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3 - LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16 - LONG $0x2b1c8d47 // lea r11d,[r11+r13*1] - WORD $0x3141; BYTE $0xdf // xor r15d,ebx - LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe - LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd - LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2 - LONG $0x1a148d42 // lea edx,[rdx+r11*1] - LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6 - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xdf31 // xor edi,ebx - LONG $0xf970fdc5; BYTE $0xfa // vpshufd ymm7,ymm1,0xfa - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1] - WORD $0x8945; BYTE $0xc4 // mov r12d,r8d - LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb - - // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84) - LONG $0x24940344; LONG $0x00000084 // add r10d,[rsp+0x84] - WORD $0x2141; BYTE $0xd4 // and r12d,edx - LONG 
$0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19 - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb - LONG $0x331c8d47 // lea r11d,[r11+r14*1] - LONG $0x22148d47 // lea r10d,[r10+r12*1] - LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb - LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6 - LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6 - LONG $0x22148d47 // lea r10d,[r10+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xdf // mov edi,r11d - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16 - LONG $0x2a148d47 // lea r10d,[r10+r13*1] - WORD $0xc731 // xor edi,eax - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd - LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2 - LONG $0x110c8d42 // lea ecx,[rcx+r10*1] - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xc7 // xor r15d,eax - LONG $0xd4feedc5 // vpaddd ymm2,ymm2,ymm4 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3a148d47 // lea r10d,[r10+r15*1] - WORD $0x8941; BYTE $0xd4 // mov r12d,edx - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88) - LONG $0x248c0344; LONG $0x00000088 // add r9d,[rsp+0x88] - WORD $0x2141; BYTE $0xcc // and r12d,ecx - LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb - LONG $0x32148d47 // lea r10d,[r10+r14*1] - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f1 // rorx 
r14d,ecx,0x6 - LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8 - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xd7 // mov r15d,r10d - LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6 - LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16 - LONG $0x290c8d47 // lea r9d,[r9+r13*1] - WORD $0x3145; BYTE $0xdf // xor r15d,r11d - LONG $0xfa70fdc5; BYTE $0x50 // vpshufd ymm7,ymm2,0x50 - LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd - LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2 - LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1] - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xdf // xor edi,r11d - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d45 // lea r9d,[r9+rdi*1] - WORD $0x8941; BYTE $0xcc // mov r12d,ecx - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c) - LONG $0x24840344; LONG $0x0000008c // add r8d,[rsp+0x8c] - WORD $0x2141; BYTE $0xdc // and r12d,ebx - LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb - LONG $0x310c8d47 // lea r9d,[r9+r14*1] - LONG $0x20048d47 // lea r8d,[r8+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6 - LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9 - LONG $0x20048d47 // lea r8d,[r8+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xcf // mov edi,r9d - LONG $0xd6feedc5 // vpaddd ymm2,ymm2,ymm6 - LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16 - LONG $0x28048d47 // lea r8d,[r8+r13*1] - WORD $0x3144; BYTE $0xd7 // xor edi,r10d - LONG 
$0x75feedc5; BYTE $0x40 // vpaddd ymm6,ymm2,[rbp+0x40] - LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd - LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2 - LONG $0x00048d42 // lea eax,[rax+r8*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xd7 // xor r15d,r10d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d47 // lea r8d,[r8+r15*1] - WORD $0x8941; BYTE $0xdc // mov r12d,ebx - - LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6 - LONG $0x0f7de3c4; WORD $0x04e3 // vpalignr ymm4,ymm0,ymm3,0x4 - - // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0) - LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add edx,[rsp+0xa0] - WORD $0x2141; BYTE $0xc4 // and r12d,eax - LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19 - LONG $0x0f6de3c4; WORD $0x04f9 // vpalignr ymm7,ymm2,ymm1,0x4 - LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb - LONG $0x30048d47 // lea r8d,[r8+r14*1] - LONG $0x22148d42 // lea edx,[rdx+r12*1] - LONG $0xd472cdc5; BYTE $0x07 // vpsrld ymm6,ymm4,0x7 - LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6 - LONG $0xdffee5c5 // vpaddd ymm3,ymm3,ymm7 - LONG $0x22148d42 // lea edx,[rdx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xc7 // mov r15d,r8d - LONG $0xd472c5c5; BYTE $0x03 // vpsrld ymm7,ymm4,0x3 - LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16 - LONG $0x2a148d42 // lea edx,[rdx+r13*1] - WORD $0x3145; BYTE $0xcf // xor r15d,r9d - LONG $0xf472d5c5; BYTE $0x0e // vpslld ymm5,ymm4,0xe - LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd - LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2 - LONG $0x131c8d45 // lea r11d,[r11+rdx*1] - LONG $0xe6efc5c5 // vpxor ymm4,ymm7,ymm6 - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xcf // xor edi,r9d - LONG $0xfa70fdc5; 
BYTE $0xfa // vpshufd ymm7,ymm2,0xfa - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1] - WORD $0x8941; BYTE $0xc4 // mov r12d,eax - LONG $0xd672cdc5; BYTE $0x0b // vpsrld ymm6,ymm6,0xb - - // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4) - LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add ecx,[rsp+0xa4] - WORD $0x2145; BYTE $0xdc // and r12d,r11d - LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19 - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb - LONG $0x32148d42 // lea edx,[rdx+r14*1] - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - LONG $0xf572d5c5; BYTE $0x0b // vpslld ymm5,ymm5,0xb - LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6 - LONG $0xe6efddc5 // vpxor ymm4,ymm4,ymm6 - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xd789 // mov edi,edx - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16 - LONG $0x290c8d42 // lea ecx,[rcx+r13*1] - WORD $0x3144; BYTE $0xc7 // xor edi,r8d - LONG $0xe5efddc5 // vpxor ymm4,ymm4,ymm5 - LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd - LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2 - LONG $0x0a148d45 // lea r10d,[r10+rcx*1] - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xc7 // xor r15d,r8d - LONG $0xdcfee5c5 // vpaddd ymm3,ymm3,ymm4 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d42 // lea ecx,[rcx+r15*1] - WORD $0x8945; BYTE $0xdc // mov r12d,r11d - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8) - LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add ebx,[rsp+0xa8] - WORD $0x2145; BYTE $0xd4 // and 
r12d,r10d - LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb - LONG $0x310c8d42 // lea ecx,[rcx+r14*1] - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6 - LONG $0x004dc2c4; BYTE $0xf0 // vpshufb ymm6,ymm6,ymm8 - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xcf // mov r15d,ecx - LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6 - LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16 - LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1] - WORD $0x3141; BYTE $0xd7 // xor r15d,edx - LONG $0xfb70fdc5; BYTE $0x50 // vpshufd ymm7,ymm3,0x50 - LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd - LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2 - LONG $0x190c8d45 // lea r9d,[r9+rbx*1] - LONG $0xd772cdc5; BYTE $0x0a // vpsrld ymm6,ymm7,0xa - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xd731 // xor edi,edx - LONG $0xd773c5c5; BYTE $0x11 // vpsrlq ymm7,ymm7,0x11 - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1] - WORD $0x8945; BYTE $0xd4 // mov r12d,r10d - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - - // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac) - LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add eax,[rsp+0xac] - WORD $0x2145; BYTE $0xcc // and r12d,r9d - LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19 - LONG $0xd773c5c5; BYTE $0x02 // vpsrlq ymm7,ymm7,0x2 - LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb - LONG $0x331c8d42 // lea ebx,[rbx+r14*1] - LONG $0x20048d42 // lea eax,[rax+r12*1] - LONG $0xf7efcdc5 // vpxor ymm6,ymm6,ymm7 - LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d - WORD $0x3141; BYTE 
$0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6 - LONG $0x004dc2c4; BYTE $0xf1 // vpshufb ymm6,ymm6,ymm9 - LONG $0x20048d42 // lea eax,[rax+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xdf89 // mov edi,ebx - LONG $0xdefee5c5 // vpaddd ymm3,ymm3,ymm6 - LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16 - LONG $0x28048d42 // lea eax,[rax+r13*1] - WORD $0xcf31 // xor edi,ecx - LONG $0x75fee5c5; BYTE $0x60 // vpaddd ymm6,ymm3,[rbp+0x60] - LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd - LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2 - LONG $0x00048d45 // lea r8d,[r8+rax*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xcf // xor r15d,ecx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d42 // lea eax,[rax+r15*1] - WORD $0x8945; BYTE $0xcc // mov r12d,r9d - - LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6 - ADDQ $0x80, BP - - CMPB 0x3(BP), $0x0 - JNE loop1 - - // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x40) - LONG $0x245c0344; BYTE $0x40 // add r11d,[rsp+0x40] - WORD $0x2145; BYTE $0xc4 // and r12d,r8d - LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19 - LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb - LONG $0x30048d42 // lea eax,[rax+r14*1] - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6 - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xc7 // mov r15d,eax - LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16 - LONG $0x2b1c8d47 // lea r11d,[r11+r13*1] - WORD $0x3141; BYTE $0xdf // xor r15d,ebx - LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd - LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2 - LONG $0x1a148d42 // lea edx,[rdx+r11*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD 
$0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xdf31 // xor edi,ebx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1] - WORD $0x8945; BYTE $0xc4 // mov r12d,r8d - - // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x44) - LONG $0x24540344; BYTE $0x44 // add r10d,[rsp+0x44] - WORD $0x2141; BYTE $0xd4 // and r12d,edx - LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19 - LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb - LONG $0x331c8d47 // lea r11d,[r11+r14*1] - LONG $0x22148d47 // lea r10d,[r10+r12*1] - LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6 - LONG $0x22148d47 // lea r10d,[r10+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xdf // mov edi,r11d - LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16 - LONG $0x2a148d47 // lea r10d,[r10+r13*1] - WORD $0xc731 // xor edi,eax - LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd - LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2 - LONG $0x110c8d42 // lea ecx,[rcx+r10*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xc7 // xor r15d,eax - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3a148d47 // lea r10d,[r10+r15*1] - WORD $0x8941; BYTE $0xd4 // mov r12d,edx - - // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x48) - LONG $0x244c0344; BYTE $0x48 // add r9d,[rsp+0x48] - WORD $0x2141; BYTE $0xcc // and r12d,ecx - LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19 - LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb - LONG $0x32148d47 // lea r10d,[r10+r14*1] - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6 - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - 
WORD $0x8945; BYTE $0xd7 // mov r15d,r10d - LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16 - LONG $0x290c8d47 // lea r9d,[r9+r13*1] - WORD $0x3145; BYTE $0xdf // xor r15d,r11d - LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd - LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2 - LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xdf // xor edi,r11d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d45 // lea r9d,[r9+rdi*1] - WORD $0x8941; BYTE $0xcc // mov r12d,ecx - - // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x4c) - LONG $0x24440344; BYTE $0x4c // add r8d,[rsp+0x4c] - WORD $0x2141; BYTE $0xdc // and r12d,ebx - LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19 - LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb - LONG $0x310c8d47 // lea r9d,[r9+r14*1] - LONG $0x20048d47 // lea r8d,[r8+r12*1] - LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6 - LONG $0x20048d47 // lea r8d,[r8+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xcf // mov edi,r9d - LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16 - LONG $0x28048d47 // lea r8d,[r8+r13*1] - WORD $0x3144; BYTE $0xd7 // xor edi,r10d - LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd - LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2 - LONG $0x00048d42 // lea eax,[rax+r8*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xd7 // xor r15d,r10d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d47 // lea r8d,[r8+r15*1] - WORD $0x8941; BYTE $0xdc // mov r12d,ebx - - // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x60) - LONG $0x60245403 // add edx,[rsp+0x60] - WORD $0x2141; BYTE $0xc4 // and r12d,eax - LONG $0xf07b63c4; WORD $0x19e8 // rorx 
r13d,eax,0x19 - LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb - LONG $0x30048d47 // lea r8d,[r8+r14*1] - LONG $0x22148d42 // lea edx,[rdx+r12*1] - LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6 - LONG $0x22148d42 // lea edx,[rdx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xc7 // mov r15d,r8d - LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16 - LONG $0x2a148d42 // lea edx,[rdx+r13*1] - WORD $0x3145; BYTE $0xcf // xor r15d,r9d - LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd - LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2 - LONG $0x131c8d45 // lea r11d,[r11+rdx*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xcf // xor edi,r9d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1] - WORD $0x8941; BYTE $0xc4 // mov r12d,eax - - // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x64) - LONG $0x64244c03 // add ecx,[rsp+0x64] - WORD $0x2145; BYTE $0xdc // and r12d,r11d - LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19 - LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb - LONG $0x32148d42 // lea edx,[rdx+r14*1] - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6 - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xd789 // mov edi,edx - LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16 - LONG $0x290c8d42 // lea ecx,[rcx+r13*1] - WORD $0x3144; BYTE $0xc7 // xor edi,r8d - LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd - LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2 - LONG $0x0a148d45 // lea r10d,[r10+rcx*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD 
$0x3145; BYTE $0xc7 // xor r15d,r8d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d42 // lea ecx,[rcx+r15*1] - WORD $0x8945; BYTE $0xdc // mov r12d,r11d - - // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x68) - LONG $0x68245c03 // add ebx,[rsp+0x68] - WORD $0x2145; BYTE $0xd4 // and r12d,r10d - LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19 - LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb - LONG $0x310c8d42 // lea ecx,[rcx+r14*1] - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6 - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xcf // mov r15d,ecx - LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16 - LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1] - WORD $0x3141; BYTE $0xd7 // xor r15d,edx - LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd - LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2 - LONG $0x190c8d45 // lea r9d,[r9+rbx*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xd731 // xor edi,edx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1] - WORD $0x8945; BYTE $0xd4 // mov r12d,r10d - - // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x6c) - LONG $0x6c244403 // add eax,[rsp+0x6c] - WORD $0x2145; BYTE $0xcc // and r12d,r9d - LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19 - LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb - LONG $0x331c8d42 // lea ebx,[rbx+r14*1] - LONG $0x20048d42 // lea eax,[rax+r12*1] - LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6 - LONG $0x20048d42 // lea eax,[rax+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xdf89 // mov edi,ebx - LONG $0xf07b63c4; WORD 
$0x16e3 // rorx r12d,ebx,0x16 - LONG $0x28048d42 // lea eax,[rax+r13*1] - WORD $0xcf31 // xor edi,ecx - LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd - LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2 - LONG $0x00048d45 // lea r8d,[r8+rax*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xcf // xor r15d,ecx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d42 // lea eax,[rax+r15*1] - WORD $0x8945; BYTE $0xcc // mov r12d,r9d - - // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x00) - LONG $0x241c0344 // add r11d,[rsp] - WORD $0x2145; BYTE $0xc4 // and r12d,r8d - LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19 - LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb - LONG $0x30048d42 // lea eax,[rax+r14*1] - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6 - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xc7 // mov r15d,eax - LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16 - LONG $0x2b1c8d47 // lea r11d,[r11+r13*1] - WORD $0x3141; BYTE $0xdf // xor r15d,ebx - LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd - LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2 - LONG $0x1a148d42 // lea edx,[rdx+r11*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xdf31 // xor edi,ebx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1] - WORD $0x8945; BYTE $0xc4 // mov r12d,r8d - - // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x04) - LONG $0x24540344; BYTE $0x04 // add r10d,[rsp+0x4] - WORD $0x2141; BYTE $0xd4 // and r12d,edx - LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19 - LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb - LONG $0x331c8d47 // lea 
r11d,[r11+r14*1] - LONG $0x22148d47 // lea r10d,[r10+r12*1] - LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6 - LONG $0x22148d47 // lea r10d,[r10+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xdf // mov edi,r11d - LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16 - LONG $0x2a148d47 // lea r10d,[r10+r13*1] - WORD $0xc731 // xor edi,eax - LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd - LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2 - LONG $0x110c8d42 // lea ecx,[rcx+r10*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xc7 // xor r15d,eax - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3a148d47 // lea r10d,[r10+r15*1] - WORD $0x8941; BYTE $0xd4 // mov r12d,edx - - // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x08) - LONG $0x244c0344; BYTE $0x08 // add r9d,[rsp+0x8] - WORD $0x2141; BYTE $0xcc // and r12d,ecx - LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19 - LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb - LONG $0x32148d47 // lea r10d,[r10+r14*1] - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6 - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xd7 // mov r15d,r10d - LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16 - LONG $0x290c8d47 // lea r9d,[r9+r13*1] - WORD $0x3145; BYTE $0xdf // xor r15d,r11d - LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd - LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2 - LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xdf // xor edi,r11d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - 
LONG $0x390c8d45 // lea r9d,[r9+rdi*1] - WORD $0x8941; BYTE $0xcc // mov r12d,ecx - - // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x0c) - LONG $0x24440344; BYTE $0x0c // add r8d,[rsp+0xc] - WORD $0x2141; BYTE $0xdc // and r12d,ebx - LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19 - LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb - LONG $0x310c8d47 // lea r9d,[r9+r14*1] - LONG $0x20048d47 // lea r8d,[r8+r12*1] - LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6 - LONG $0x20048d47 // lea r8d,[r8+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xcf // mov edi,r9d - LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16 - LONG $0x28048d47 // lea r8d,[r8+r13*1] - WORD $0x3144; BYTE $0xd7 // xor edi,r10d - LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd - LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2 - LONG $0x00048d42 // lea eax,[rax+r8*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xd7 // xor r15d,r10d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d47 // lea r8d,[r8+r15*1] - WORD $0x8941; BYTE $0xdc // mov r12d,ebx - - // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x20) - LONG $0x20245403 // add edx,[rsp+0x20] - WORD $0x2141; BYTE $0xc4 // and r12d,eax - LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19 - LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb - LONG $0x30048d47 // lea r8d,[r8+r14*1] - LONG $0x22148d42 // lea edx,[rdx+r12*1] - LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6 - LONG $0x22148d42 // lea edx,[rdx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xc7 // mov r15d,r8d - LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16 - LONG $0x2a148d42 // lea 
edx,[rdx+r13*1] - WORD $0x3145; BYTE $0xcf // xor r15d,r9d - LONG $0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd - LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2 - LONG $0x131c8d45 // lea r11d,[r11+rdx*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xcf // xor edi,r9d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1] - WORD $0x8941; BYTE $0xc4 // mov r12d,eax - - // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x24) - LONG $0x24244c03 // add ecx,[rsp+0x24] - WORD $0x2145; BYTE $0xdc // and r12d,r11d - LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19 - LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb - LONG $0x32148d42 // lea edx,[rdx+r14*1] - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6 - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xd789 // mov edi,edx - LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16 - LONG $0x290c8d42 // lea ecx,[rcx+r13*1] - WORD $0x3144; BYTE $0xc7 // xor edi,r8d - LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd - LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2 - LONG $0x0a148d45 // lea r10d,[r10+rcx*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xc7 // xor r15d,r8d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d42 // lea ecx,[rcx+r15*1] - WORD $0x8945; BYTE $0xdc // mov r12d,r11d - - // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x28) - LONG $0x28245c03 // add ebx,[rsp+0x28] - WORD $0x2145; BYTE $0xd4 // and r12d,r10d - LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19 - LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb - LONG $0x310c8d42 // lea ecx,[rcx+r14*1] - LONG $0x231c8d42 // lea 
ebx,[rbx+r12*1] - LONG $0xf22862c4; BYTE $0xe0 // andn r12d,r10d,eax - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6 - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xcf // mov r15d,ecx - LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16 - LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1] - WORD $0x3141; BYTE $0xd7 // xor r15d,edx - LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd - LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2 - LONG $0x190c8d45 // lea r9d,[r9+rbx*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xd731 // xor edi,edx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1] - WORD $0x8945; BYTE $0xd4 // mov r12d,r10d - - // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x2c) - LONG $0x2c244403 // add eax,[rsp+0x2c] - WORD $0x2145; BYTE $0xcc // and r12d,r9d - LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19 - LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb - LONG $0x331c8d42 // lea ebx,[rbx+r14*1] - LONG $0x20048d42 // lea eax,[rax+r12*1] - LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6 - LONG $0x20048d42 // lea eax,[rax+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xdf89 // mov edi,ebx - LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16 - LONG $0x28048d42 // lea eax,[rax+r13*1] - WORD $0xcf31 // xor edi,ecx - LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd - LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2 - LONG $0x00048d45 // lea r8d,[r8+rax*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xcf // xor r15d,ecx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d42 // lea eax,[rax+r15*1] - WORD $0x8945; BYTE $0xcc // mov r12d,r9d 
- - MOVQ 0x200(SP), DI // $_ctx - ADDQ R14, AX - - LEAQ 0x1c0(SP), BP - - ADDL (DI), AX - ADDL 4(DI), BX - ADDL 8(DI), CX - ADDL 12(DI), DX - ADDL 16(DI), R8 - ADDL 20(DI), R9 - ADDL 24(DI), R10 - ADDL 28(DI), R11 - - MOVL AX, (DI) - MOVL BX, 4(DI) - MOVL CX, 8(DI) - MOVL DX, 12(DI) - MOVL R8, 16(DI) - MOVL R9, 20(DI) - MOVL R10, 24(DI) - MOVL R11, 28(DI) - - CMPQ SI, 0x50(BP) // $_end - JE done - - XORQ R14, R14 - MOVQ BX, DI - XORQ CX, DI // magic - MOVQ R9, R12 - -loop2: - // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, BP, 0x10) - LONG $0x105d0344 // add r11d,[rbp+0x10] - WORD $0x2145; BYTE $0xc4 // and r12d,r8d - LONG $0xf07b43c4; WORD $0x19e8 // rorx r13d,r8d,0x19 - LONG $0xf07b43c4; WORD $0x0bf8 // rorx r15d,r8d,0xb - LONG $0x30048d42 // lea eax,[rax+r14*1] - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - LONG $0xf23842c4; BYTE $0xe2 // andn r12d,r8d,r10d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f0 // rorx r14d,r8d,0x6 - LONG $0x231c8d47 // lea r11d,[r11+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xc7 // mov r15d,eax - LONG $0xf07b63c4; WORD $0x16e0 // rorx r12d,eax,0x16 - LONG $0x2b1c8d47 // lea r11d,[r11+r13*1] - WORD $0x3141; BYTE $0xdf // xor r15d,ebx - LONG $0xf07b63c4; WORD $0x0df0 // rorx r14d,eax,0xd - LONG $0xf07b63c4; WORD $0x02e8 // rorx r13d,eax,0x2 - LONG $0x1a148d42 // lea edx,[rdx+r11*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xdf31 // xor edi,ebx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3b1c8d45 // lea r11d,[r11+rdi*1] - WORD $0x8945; BYTE $0xc4 // mov r12d,r8d - - // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, BP, 0x14) - LONG $0x14550344 // add r10d,[rbp+0x14] - WORD $0x2141; BYTE $0xd4 // and r12d,edx - LONG $0xf07b63c4; WORD $0x19ea // rorx r13d,edx,0x19 - LONG $0xf07be3c4; WORD $0x0bfa // rorx edi,edx,0xb - LONG $0x331c8d47 // lea r11d,[r11+r14*1] - LONG $0x22148d47 // lea 
r10d,[r10+r12*1] - LONG $0xf26842c4; BYTE $0xe1 // andn r12d,edx,r9d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f2 // rorx r14d,edx,0x6 - LONG $0x22148d47 // lea r10d,[r10+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xdf // mov edi,r11d - LONG $0xf07b43c4; WORD $0x16e3 // rorx r12d,r11d,0x16 - LONG $0x2a148d47 // lea r10d,[r10+r13*1] - WORD $0xc731 // xor edi,eax - LONG $0xf07b43c4; WORD $0x0df3 // rorx r14d,r11d,0xd - LONG $0xf07b43c4; WORD $0x02eb // rorx r13d,r11d,0x2 - LONG $0x110c8d42 // lea ecx,[rcx+r10*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xc7 // xor r15d,eax - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x3a148d47 // lea r10d,[r10+r15*1] - WORD $0x8941; BYTE $0xd4 // mov r12d,edx - - // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, BP, 0x18) - LONG $0x184d0344 // add r9d,[rbp+0x18] - WORD $0x2141; BYTE $0xcc // and r12d,ecx - LONG $0xf07b63c4; WORD $0x19e9 // rorx r13d,ecx,0x19 - LONG $0xf07b63c4; WORD $0x0bf9 // rorx r15d,ecx,0xb - LONG $0x32148d47 // lea r10d,[r10+r14*1] - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - LONG $0xf27042c4; BYTE $0xe0 // andn r12d,ecx,r8d - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f1 // rorx r14d,ecx,0x6 - LONG $0x210c8d47 // lea r9d,[r9+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xd7 // mov r15d,r10d - LONG $0xf07b43c4; WORD $0x16e2 // rorx r12d,r10d,0x16 - LONG $0x290c8d47 // lea r9d,[r9+r13*1] - WORD $0x3145; BYTE $0xdf // xor r15d,r11d - LONG $0xf07b43c4; WORD $0x0df2 // rorx r14d,r10d,0xd - LONG $0xf07b43c4; WORD $0x02ea // rorx r13d,r10d,0x2 - LONG $0x0b1c8d42 // lea ebx,[rbx+r9*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xdf // xor edi,r11d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d45 // lea r9d,[r9+rdi*1] - WORD $0x8941; 
BYTE $0xcc // mov r12d,ecx - - // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, BP, 0x1c) - LONG $0x1c450344 // add r8d,[rbp+0x1c] - WORD $0x2141; BYTE $0xdc // and r12d,ebx - LONG $0xf07b63c4; WORD $0x19eb // rorx r13d,ebx,0x19 - LONG $0xf07be3c4; WORD $0x0bfb // rorx edi,ebx,0xb - LONG $0x310c8d47 // lea r9d,[r9+r14*1] - LONG $0x20048d47 // lea r8d,[r8+r12*1] - LONG $0xf26062c4; BYTE $0xe2 // andn r12d,ebx,edx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b63c4; WORD $0x06f3 // rorx r14d,ebx,0x6 - LONG $0x20048d47 // lea r8d,[r8+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8944; BYTE $0xcf // mov edi,r9d - LONG $0xf07b43c4; WORD $0x16e1 // rorx r12d,r9d,0x16 - LONG $0x28048d47 // lea r8d,[r8+r13*1] - WORD $0x3144; BYTE $0xd7 // xor edi,r10d - LONG $0xf07b43c4; WORD $0x0df1 // rorx r14d,r9d,0xd - LONG $0xf07b43c4; WORD $0x02e9 // rorx r13d,r9d,0x2 - LONG $0x00048d42 // lea eax,[rax+r8*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xd7 // xor r15d,r10d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d47 // lea r8d,[r8+r15*1] - WORD $0x8941; BYTE $0xdc // mov r12d,ebx - - // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, BP, 0x30) - WORD $0x5503; BYTE $0x30 // add edx,[rbp+0x30] - WORD $0x2141; BYTE $0xc4 // and r12d,eax - LONG $0xf07b63c4; WORD $0x19e8 // rorx r13d,eax,0x19 - LONG $0xf07b63c4; WORD $0x0bf8 // rorx r15d,eax,0xb - LONG $0x30048d47 // lea r8d,[r8+r14*1] - LONG $0x22148d42 // lea edx,[rdx+r12*1] - LONG $0xf27862c4; BYTE $0xe1 // andn r12d,eax,ecx - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b63c4; WORD $0x06f0 // rorx r14d,eax,0x6 - LONG $0x22148d42 // lea edx,[rdx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8945; BYTE $0xc7 // mov r15d,r8d - LONG $0xf07b43c4; WORD $0x16e0 // rorx r12d,r8d,0x16 - LONG $0x2a148d42 // lea edx,[rdx+r13*1] - WORD $0x3145; BYTE $0xcf // xor r15d,r9d - LONG 
$0xf07b43c4; WORD $0x0df0 // rorx r14d,r8d,0xd - LONG $0xf07b43c4; WORD $0x02e8 // rorx r13d,r8d,0x2 - LONG $0x131c8d45 // lea r11d,[r11+rdx*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3144; BYTE $0xcf // xor edi,r9d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x148d; BYTE $0x3a // lea edx,[rdx+rdi*1] - WORD $0x8941; BYTE $0xc4 // mov r12d,eax - - // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, BP, 0x34) - WORD $0x4d03; BYTE $0x34 // add ecx,[rbp+0x34] - WORD $0x2145; BYTE $0xdc // and r12d,r11d - LONG $0xf07b43c4; WORD $0x19eb // rorx r13d,r11d,0x19 - LONG $0xf07bc3c4; WORD $0x0bfb // rorx edi,r11d,0xb - LONG $0x32148d42 // lea edx,[rdx+r14*1] - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - LONG $0xf22062c4; BYTE $0xe3 // andn r12d,r11d,ebx - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f3 // rorx r14d,r11d,0x6 - LONG $0x210c8d42 // lea ecx,[rcx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xd789 // mov edi,edx - LONG $0xf07b63c4; WORD $0x16e2 // rorx r12d,edx,0x16 - LONG $0x290c8d42 // lea ecx,[rcx+r13*1] - WORD $0x3144; BYTE $0xc7 // xor edi,r8d - LONG $0xf07b63c4; WORD $0x0df2 // rorx r14d,edx,0xd - LONG $0xf07b63c4; WORD $0x02ea // rorx r13d,edx,0x2 - LONG $0x0a148d45 // lea r10d,[r10+rcx*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3145; BYTE $0xc7 // xor r15d,r8d - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x390c8d42 // lea ecx,[rcx+r15*1] - WORD $0x8945; BYTE $0xdc // mov r12d,r11d - - // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, BP, 0x38) - WORD $0x5d03; BYTE $0x38 // add ebx,[rbp+0x38] - WORD $0x2145; BYTE $0xd4 // and r12d,r10d - LONG $0xf07b43c4; WORD $0x19ea // rorx r13d,r10d,0x19 - LONG $0xf07b43c4; WORD $0x0bfa // rorx r15d,r10d,0xb - LONG $0x310c8d42 // lea ecx,[rcx+r14*1] - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - LONG $0xf22862c4; BYTE $0xe0 // 
andn r12d,r10d,eax - WORD $0x3145; BYTE $0xfd // xor r13d,r15d - LONG $0xf07b43c4; WORD $0x06f2 // rorx r14d,r10d,0x6 - LONG $0x231c8d42 // lea ebx,[rbx+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0x8941; BYTE $0xcf // mov r15d,ecx - LONG $0xf07b63c4; WORD $0x16e1 // rorx r12d,ecx,0x16 - LONG $0x2b1c8d42 // lea ebx,[rbx+r13*1] - WORD $0x3141; BYTE $0xd7 // xor r15d,edx - LONG $0xf07b63c4; WORD $0x0df1 // rorx r14d,ecx,0xd - LONG $0xf07b63c4; WORD $0x02e9 // rorx r13d,ecx,0x2 - LONG $0x190c8d45 // lea r9d,[r9+rbx*1] - WORD $0x2144; BYTE $0xff // and edi,r15d - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0xd731 // xor edi,edx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - WORD $0x1c8d; BYTE $0x3b // lea ebx,[rbx+rdi*1] - WORD $0x8945; BYTE $0xd4 // mov r12d,r10d - - // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, BP, 0x3c) - WORD $0x4503; BYTE $0x3c // add eax,[rbp+0x3c] - WORD $0x2145; BYTE $0xcc // and r12d,r9d - LONG $0xf07b43c4; WORD $0x19e9 // rorx r13d,r9d,0x19 - LONG $0xf07bc3c4; WORD $0x0bf9 // rorx edi,r9d,0xb - LONG $0x331c8d42 // lea ebx,[rbx+r14*1] - LONG $0x20048d42 // lea eax,[rax+r12*1] - LONG $0xf23042c4; BYTE $0xe3 // andn r12d,r9d,r11d - WORD $0x3141; BYTE $0xfd // xor r13d,edi - LONG $0xf07b43c4; WORD $0x06f1 // rorx r14d,r9d,0x6 - LONG $0x20048d42 // lea eax,[rax+r12*1] - WORD $0x3145; BYTE $0xf5 // xor r13d,r14d - WORD $0xdf89 // mov edi,ebx - LONG $0xf07b63c4; WORD $0x16e3 // rorx r12d,ebx,0x16 - LONG $0x28048d42 // lea eax,[rax+r13*1] - WORD $0xcf31 // xor edi,ecx - LONG $0xf07b63c4; WORD $0x0df3 // rorx r14d,ebx,0xd - LONG $0xf07b63c4; WORD $0x02eb // rorx r13d,ebx,0x2 - LONG $0x00048d45 // lea r8d,[r8+rax*1] - WORD $0x2141; BYTE $0xff // and r15d,edi - WORD $0x3145; BYTE $0xe6 // xor r14d,r12d - WORD $0x3141; BYTE $0xcf // xor r15d,ecx - WORD $0x3145; BYTE $0xee // xor r14d,r13d - LONG $0x38048d42 // lea eax,[rax+r15*1] - WORD $0x8945; BYTE $0xcc // mov r12d,r9d - - ADDQ $-0x40, BP - CMPQ BP, SP - JAE 
loop2 - - MOVQ 0x200(SP), DI // $_ctx - ADDQ R14, AX - - ADDQ $0x1c0, SP - - ADDL (DI), AX - ADDL 4(DI), BX - ADDL 8(DI), CX - ADDL 12(DI), DX - ADDL 16(DI), R8 - ADDL 20(DI), R9 - - ADDQ $0x80, SI // input += 2 - ADDL 24(DI), R10 - MOVQ SI, R12 - ADDL 28(DI), R11 - CMPQ SI, 0x50(SP) // input == _end - - MOVL AX, (DI) - LONG $0xe4440f4c // cmove r12,rsp /* next block or stale data */ - MOVL AX, (DI) - MOVL BX, 4(DI) - MOVL CX, 8(DI) - MOVL DX, 12(DI) - MOVL R8, 16(DI) - MOVL R9, 20(DI) - MOVL R10, 24(DI) - MOVL R11, 28(DI) - - JBE loop0 - LEAQ (SP), BP - -done: - MOVQ BP, SP - MOVQ 0x58(SP), SP // restore saved stack pointer - WORD $0xf8c5; BYTE $0x77 // vzeroupper - - RET - diff --git a/sha256blockAvx512_amd64.go b/sha256blockAvx512_amd64.go index db8e48d..b7d7c16 100644 --- a/sha256blockAvx512_amd64.go +++ b/sha256blockAvx512_amd64.go @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc /* * Minio Cloud Storage, (C) 2017 Minio, Inc. diff --git a/sha256blockAvx512_amd64.s b/sha256blockAvx512_amd64.s index 275bcac..cca534e 100644 --- a/sha256blockAvx512_amd64.s +++ b/sha256blockAvx512_amd64.s @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc TEXT ·sha256X16Avx512(SB), 7, $0 MOVQ digests+0(FP), DI diff --git a/sha256blockAvx512_amd64_test.go b/sha256blockAvx512_amd64_test.go index bab089c..f4eab62 100644 --- a/sha256blockAvx512_amd64_test.go +++ b/sha256blockAvx512_amd64_test.go @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc /* * Minio Cloud Storage, (C) 2017 Minio, Inc. diff --git a/sha256blockAvx_amd64.go b/sha256blockAvx_amd64.go deleted file mode 100644 index c2f7118..0000000 --- a/sha256blockAvx_amd64.go +++ /dev/null @@ -1,22 +0,0 @@ -//+build !noasm,!appengine - -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package sha256 - -//go:noescape -func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) diff --git a/sha256blockAvx_amd64.s b/sha256blockAvx_amd64.s deleted file mode 100644 index 9f444d4..0000000 --- a/sha256blockAvx_amd64.s +++ /dev/null @@ -1,408 +0,0 @@ -//+build !noasm,!appengine - -// SHA256 implementation for AVX - -// -// Minio Cloud Storage, (C) 2016 Minio, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -// -// This code is based on an Intel White-Paper: -// "Fast SHA-256 Implementations on Intel Architecture Processors" -// -// together with the reference implementation from the following authors: -// James Guilford -// Kirk Yap -// Tim Chen -// -// For Golang it has been converted to Plan 9 assembly with the help of -// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9 -// equivalents -// - -#include "textflag.h" - -#define ROTATE_XS \ - MOVOU X4, X15 \ - MOVOU X5, X4 \ - MOVOU X6, X5 \ - MOVOU X7, X6 \ - MOVOU X15, X7 - -// compute s0 four at a time and s1 two at a time -// compute W[-16] + W[-7] 4 at a time -#define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \ - MOVL e, R13 \ // y0 = e - ROLL $18, R13 \ // y0 = e >> (25-11) - MOVL a, R14 \ // y1 = a - LONG $0x0f41e3c4; WORD $0x04c6 \ // VPALIGNR XMM0,XMM7,XMM6,0x4 /* XTMP0 = W[-7] */ - ROLL $23, R14 \ // y1 = a >> (22-13) - XORL e, R13 \ // y0 = e ^ (e >> (25-11)) - MOVL f, R15 \ // y2 = f - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - XORL a, R14 \ // y1 = a ^ (a >> (22-13) - XORL g, R15 \ // y2 = f^g - LONG $0xc4fef9c5 \ // VPADDD XMM0,XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */ - XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6) ) - ANDL e, R15 \ // y2 = (f^g)&e - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - \ - \ // compute s0 - \ - LONG $0x0f51e3c4; WORD $0x04cc \ // VPALIGNR XMM1,XMM5,XMM4,0x4 /* XTMP1 = W[-15] */ - XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - XORL g, R15 \ // y2 = CH = ((f^g)&e)^g - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL R13, R15 \ // y2 = S1 + CH - ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH - MOVL a, R13 \ // y0 = a - ADDL R15, h \ // h = h + S1 + CH + k + w - \ // ROTATE_ARGS - MOVL a, R15 \ // y2 = a - LONG $0xd172e9c5; BYTE $0x07 \ // VPSRLD XMM2,XMM1,0x7 /* */ - ORL c, R13 \ // y0 = a|c - ADDL h, d \ // d = d + h + S1 + CH + k + 
w - ANDL c, R15 \ // y2 = a&c - LONG $0xf172e1c5; BYTE $0x19 \ // VPSLLD XMM3,XMM1,0x19 /* */ - ANDL b, R13 \ // y0 = (a|c)&b - ADDL R14, h \ // h = h + S1 + CH + k + w + S0 - LONG $0xdaebe1c5 \ // VPOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */ - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ - \ // ROTATE_ARGS - MOVL d, R13 \ // y0 = e - MOVL h, R14 \ // y1 = a - ROLL $18, R13 \ // y0 = e >> (25-11) - XORL d, R13 \ // y0 = e ^ (e >> (25-11)) - MOVL e, R15 \ // y2 = f - ROLL $23, R14 \ // y1 = a >> (22-13) - LONG $0xd172e9c5; BYTE $0x12 \ // VPSRLD XMM2,XMM1,0x12 /* */ - XORL h, R14 \ // y1 = a ^ (a >> (22-13) - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - XORL f, R15 \ // y2 = f^g - LONG $0xd172b9c5; BYTE $0x03 \ // VPSRLD XMM8,XMM1,0x3 /* XTMP4 = W[-15] >> 3 */ - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ANDL d, R15 \ // y2 = (f^g)&e - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - LONG $0xf172f1c5; BYTE $0x0e \ // VPSLLD XMM1,XMM1,0xe /* */ - XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - XORL f, R15 \ // y2 = CH = ((f^g)&e)^g - LONG $0xd9efe1c5 \ // VPXOR XMM3,XMM3,XMM1 /* */ - ADDL R13, R15 \ // y2 = S1 + CH - ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - LONG $0xdaefe1c5 \ // VPXOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR */ - MOVL h, R13 \ // y0 = a - ADDL R15, g \ // h = h + S1 + CH + k + w - MOVL h, R15 \ // y2 = a - LONG $0xef61c1c4; BYTE $0xc8 \ // VPXOR XMM1,XMM3,XMM8 /* XTMP1 = s0 */ - ORL b, R13 \ // y0 = a|c - ADDL g, c \ // d = d + h + S1 + CH + k + w - ANDL b, R15 \ // y2 = a&c - \ - \ // compute low s1 - \ - LONG $0xd770f9c5; BYTE $0xfa \ // VPSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */ - ANDL a, R13 \ // y0 = (a|c)&b - ADDL R14, g \ // h = h + S1 + CH + k + w + S0 - LONG $0xc1fef9c5 \ // VPADDD 
XMM0,XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */ - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ - \ // ROTATE_ARGS - MOVL c, R13 \ // y0 = e - MOVL g, R14 \ // y1 = a - ROLL $18, R13 \ // y0 = e >> (25-11) - XORL c, R13 \ // y0 = e ^ (e >> (25-11)) - ROLL $23, R14 \ // y1 = a >> (22-13) - MOVL d, R15 \ // y2 = f - XORL g, R14 \ // y1 = a ^ (a >> (22-13) - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - LONG $0xd272b9c5; BYTE $0x0a \ // VPSRLD XMM8,XMM2,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */ - XORL e, R15 \ // y2 = f^g - LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */ - XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ANDL c, R15 \ // y2 = (f^g)&e - LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */ - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - XORL e, R15 \ // y2 = CH = ((f^g)&e)^g - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* */ - ADDL R13, R15 \ // y2 = S1 + CH - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH - LONG $0xc2ef39c5 \ // VPXOR XMM8,XMM8,XMM2 /* XTMP4 = s1 {xBxA} */ - MOVL g, R13 \ // y0 = a - ADDL R15, f \ // h = h + S1 + CH + k + w - MOVL g, R15 \ // y2 = a - LONG $0x003942c4; BYTE $0xc2 \ // VPSHUFB XMM8,XMM8,XMM10 /* XTMP4 = s1 {00BA} */ - ORL a, R13 \ // y0 = a|c - ADDL f, b \ // d = d + h + S1 + CH + k + w - ANDL a, R15 \ // y2 = a&c - LONG $0xfe79c1c4; BYTE $0xc0 \ // VPADDD XMM0,XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */ - ANDL h, R13 \ // y0 = (a|c)&b - ADDL R14, f \ // h = h + S1 + CH + k + w + S0 - \ - \ // compute high s1 - \ - LONG $0xd070f9c5; BYTE $0x50 \ // VPSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */ - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, f \ // h = h + S1 + CH + k + w 
+ S0 + MAJ - \ // ROTATE_ARGS - MOVL b, R13 \ // y0 = e - ROLL $18, R13 \ // y0 = e >> (25-11) - MOVL f, R14 \ // y1 = a - ROLL $23, R14 \ // y1 = a >> (22-13) - XORL b, R13 \ // y0 = e ^ (e >> (25-11)) - MOVL c, R15 \ // y2 = f - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - LONG $0xd272a1c5; BYTE $0x0a \ // VPSRLD XMM11,XMM2,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */ - XORL f, R14 \ // y1 = a ^ (a >> (22-13) - XORL d, R15 \ // y2 = f^g - LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ - XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ANDL b, R15 \ // y2 = (f^g)&e - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */ - XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - XORL d, R15 \ // y2 = CH = ((f^g)&e)^g - LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* */ - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL R13, R15 \ // y2 = S1 + CH - ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH - LONG $0xdaef21c5 \ // VPXOR XMM11,XMM11,XMM2 /* XTMP5 = s1 {xDxC} */ - MOVL f, R13 \ // y0 = a - ADDL R15, e \ // h = h + S1 + CH + k + w - MOVL f, R15 \ // y2 = a - LONG $0x002142c4; BYTE $0xdc \ // VPSHUFB XMM11,XMM11,XMM12 /* XTMP5 = s1 {DC00} */ - ORL h, R13 \ // y0 = a|c - ADDL e, a \ // d = d + h + S1 + CH + k + w - ANDL h, R15 \ // y2 = a&c - LONG $0xe0fea1c5 \ // VPADDD XMM4,XMM11,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */ - ANDL g, R13 \ // y0 = (a|c)&b - ADDL R14, e \ // h = h + S1 + CH + k + w + S0 - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ - \ // ROTATE_ARGS - ROTATE_XS - -#define DO_ROUND(a, b, c, d, e, f, g, h, offset) \ - MOVL e, R13 \ // y0 = e - ROLL $18, R13 \ // y0 = e >> (25-11) - MOVL a, R14 \ // y1 = a - XORL e, R13 \ // y0 = e ^ (e >> (25-11)) - ROLL $23, R14 \ // y1 = a >> 
(22-13) - MOVL f, R15 \ // y2 = f - XORL a, R14 \ // y1 = a ^ (a >> (22-13) - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - XORL g, R15 \ // y2 = f^g - XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - ANDL e, R15 \ // y2 = (f^g)&e - XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - XORL g, R15 \ // y2 = CH = ((f^g)&e)^g - ADDL R13, R15 \ // y2 = S1 + CH - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH - MOVL a, R13 \ // y0 = a - ADDL R15, h \ // h = h + S1 + CH + k + w - MOVL a, R15 \ // y2 = a - ORL c, R13 \ // y0 = a|c - ADDL h, d \ // d = d + h + S1 + CH + k + w - ANDL c, R15 \ // y2 = a&c - ANDL b, R13 \ // y0 = (a|c)&b - ADDL R14, h \ // h = h + S1 + CH + k + w + S0 - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ - -// func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) -TEXT ·blockAvx(SB), 7, $0-80 - - MOVQ h+0(FP), SI // SI: &h - MOVQ message_base+24(FP), R8 // &message - MOVQ message_len+32(FP), R9 // length of message - CMPQ R9, $0 - JEQ done_hash - ADDQ R8, R9 - MOVQ R9, reserved2+64(FP) // store end of message - - // Register definition - // a --> eax - // b --> ebx - // c --> ecx - // d --> r8d - // e --> edx - // f --> r9d - // g --> r10d - // h --> r11d - // - // y0 --> r13d - // y1 --> r14d - // y2 --> r15d - - MOVL (0*4)(SI), AX // a = H0 - MOVL (1*4)(SI), BX // b = H1 - MOVL (2*4)(SI), CX // c = H2 - MOVL (3*4)(SI), R8 // d = H3 - MOVL (4*4)(SI), DX // e = H4 - MOVL (5*4)(SI), R9 // f = H5 - MOVL (6*4)(SI), R10 // g = H6 - MOVL (7*4)(SI), R11 // h = H7 - - MOVOU bflipMask<>(SB), X13 - MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA - MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00 - - MOVQ message_base+24(FP), SI // SI: &message - -loop0: - LEAQ 
constants<>(SB), BP - - // byte swap first 16 dwords - MOVOU 0*16(SI), X4 - LONG $0x0059c2c4; BYTE $0xe5 // VPSHUFB XMM4, XMM4, XMM13 - MOVOU 1*16(SI), X5 - LONG $0x0051c2c4; BYTE $0xed // VPSHUFB XMM5, XMM5, XMM13 - MOVOU 2*16(SI), X6 - LONG $0x0049c2c4; BYTE $0xf5 // VPSHUFB XMM6, XMM6, XMM13 - MOVOU 3*16(SI), X7 - LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13 - - MOVQ SI, reserved3+72(FP) - MOVD $0x3, DI - - // schedule 48 input dwords, by doing 3 rounds of 16 each -loop1: - LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, reserved0+48(FP) - FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) - - LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, reserved0+48(FP) - FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) - - LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */ - MOVOU X9, reserved0+48(FP) - FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) - - LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */ - MOVOU X9, reserved0+48(FP) - ADDQ $64, BP - FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) - - SUBQ $1, DI - JNE loop1 - - MOVD $0x2, DI - -loop2: - LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, reserved0+48(FP) - DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48) - DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52) - DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56) - DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60) - - LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, reserved0+48(FP) - ADDQ $32, BP - DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48) - DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52) - DO_ROUND( CX, R8, DX, R9, R10, R11, AX, BX, 56) - DO_ROUND( BX, CX, R8, DX, R9, R10, R11, AX, 60) - - MOVOU 
X6, X4 - MOVOU X7, X5 - - SUBQ $1, DI - JNE loop2 - - MOVQ h+0(FP), SI // SI: &h - ADDL (0*4)(SI), AX // H0 = a + H0 - MOVL AX, (0*4)(SI) - ADDL (1*4)(SI), BX // H1 = b + H1 - MOVL BX, (1*4)(SI) - ADDL (2*4)(SI), CX // H2 = c + H2 - MOVL CX, (2*4)(SI) - ADDL (3*4)(SI), R8 // H3 = d + H3 - MOVL R8, (3*4)(SI) - ADDL (4*4)(SI), DX // H4 = e + H4 - MOVL DX, (4*4)(SI) - ADDL (5*4)(SI), R9 // H5 = f + H5 - MOVL R9, (5*4)(SI) - ADDL (6*4)(SI), R10 // H6 = g + H6 - MOVL R10, (6*4)(SI) - ADDL (7*4)(SI), R11 // H7 = h + H7 - MOVL R11, (7*4)(SI) - - MOVQ reserved3+72(FP), SI - ADDQ $64, SI - CMPQ reserved2+64(FP), SI - JNE loop0 - -done_hash: - RET - -// Constants table -DATA constants<>+0x0(SB)/8, $0x71374491428a2f98 -DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf -DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b -DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4 -DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98 -DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be -DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74 -DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7 -DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1 -DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6 -DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f -DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc -DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152 -DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8 -DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3 -DATA constants<>+0x78(SB)/8, $0x1429296706ca6351 -DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85 -DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc -DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354 -DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e -DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1 -DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70 -DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819 -DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585 -DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116 -DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c -DATA constants<>+0xd0(SB)/8, 
$0x4ed8aa4a391c0cb3 -DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f -DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee -DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814 -DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa -DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7 - -DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203 -DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b - -DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100 -DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100 - -GLOBL constants<>(SB), 8, $256 -GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16 -GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16 -GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16 diff --git a/sha256blockSha_amd64.go b/sha256blockSha_amd64.go index 483689e..bef9494 100644 --- a/sha256blockSha_amd64.go +++ b/sha256blockSha_amd64.go @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc package sha256 diff --git a/sha256blockSha_amd64.s b/sha256blockSha_amd64.s index 909fc0e..14cf2c6 100644 --- a/sha256blockSha_amd64.s +++ b/sha256blockSha_amd64.s @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc // SHA intrinsic version of SHA256 diff --git a/sha256blockSha_amd64_test.go b/sha256blockSha_amd64_test.go index c43202e..1ca144f 100644 --- a/sha256blockSha_amd64_test.go +++ b/sha256blockSha_amd64_test.go @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc package sha256 @@ -71,7 +71,7 @@ } func TestSha1(t *testing.T) { - if sha && ssse3 && sse41 && !runTestSha(sha256hash) { + if hasSHAExtensions() && !runTestSha(sha256hash) { t.Errorf("FAILED") } } diff --git a/sha256blockSsse_amd64.go b/sha256blockSsse_amd64.go deleted file mode 100644 index 1ae2320..0000000 --- a/sha256blockSsse_amd64.go +++ /dev/null @@ -1,22 +0,0 @@ -//+build !noasm,!appengine - -/* - * Minio Cloud Storage, (C) 2016 Minio, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package sha256 - -//go:noescape -func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) diff --git a/sha256blockSsse_amd64.s b/sha256blockSsse_amd64.s deleted file mode 100644 index 7afb45c..0000000 --- a/sha256blockSsse_amd64.s +++ /dev/null @@ -1,429 +0,0 @@ -//+build !noasm,!appengine - -// SHA256 implementation for SSSE3 - -// -// Minio Cloud Storage, (C) 2016 Minio, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -// -// This code is based on an Intel White-Paper: -// "Fast SHA-256 Implementations on Intel Architecture Processors" -// -// together with the reference implementation from the following authors: -// James Guilford -// Kirk Yap -// Tim Chen -// -// For Golang it has been converted to Plan 9 assembly with the help of -// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9 -// equivalents -// - -#include "textflag.h" - -#define ROTATE_XS \ - MOVOU X4, X15 \ - MOVOU X5, X4 \ - MOVOU X6, X5 \ - MOVOU X7, X6 \ - MOVOU X15, X7 - -// compute s0 four at a time and s1 two at a time -// compute W[-16] + W[-7] 4 at a time -#define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \ - MOVL e, R13 \ // y0 = e - ROLL $18, R13 \ // y0 = e >> (25-11) - MOVL a, R14 \ // y1 = a - MOVOU X7, X0 \ - LONG $0x0f3a0f66; WORD $0x04c6 \ // PALIGNR XMM0,XMM6,0x4 /* XTMP0 = W[-7] */ - ROLL $23, R14 \ // y1 = a >> (22-13) - XORL e, R13 \ // y0 = e ^ (e >> (25-11)) - MOVL f, R15 \ // y2 = f - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - XORL a, R14 \ // y1 = a ^ (a >> (22-13) - XORL g, R15 \ // y2 = f^g - LONG $0xc4fe0f66 \ // PADDD XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */ - XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6) ) - ANDL e, R15 \ // y2 = (f^g)&e - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - \ - \ // compute s0 - \ - MOVOU X5, X1 \ - LONG $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1,XMM4,0x4 /* XTMP1 = W[-15] */ - XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - XORL g, R15 \ // y2 = CH = ((f^g)&e)^g - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL R13, R15 \ // y2 = S1 + CH - ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH - MOVL a, R13 \ // y0 = a - ADDL R15, h \ // h = h + S1 + CH + k + w - \ // ROTATE_ARGS - MOVL a, R15 \ // y2 = a - MOVOU X1, X2 \ - LONG $0xd2720f66; BYTE $0x07 \ // PSRLD XMM2,0x7 /* */ - ORL c, R13 \ // y0 = a|c - ADDL h, d \ 
// d = d + h + S1 + CH + k + w - ANDL c, R15 \ // y2 = a&c - MOVOU X1, X3 \ - LONG $0xf3720f66; BYTE $0x19 \ // PSLLD XMM3,0x19 /* */ - ANDL b, R13 \ // y0 = (a|c)&b - ADDL R14, h \ // h = h + S1 + CH + k + w + S0 - LONG $0xdaeb0f66 \ // POR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */ - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ - \ // ROTATE_ARGS - MOVL d, R13 \ // y0 = e - MOVL h, R14 \ // y1 = a - ROLL $18, R13 \ // y0 = e >> (25-11) - XORL d, R13 \ // y0 = e ^ (e >> (25-11)) - MOVL e, R15 \ // y2 = f - ROLL $23, R14 \ // y1 = a >> (22-13) - MOVOU X1, X2 \ - LONG $0xd2720f66; BYTE $0x12 \ // PSRLD XMM2,0x12 /* */ - XORL h, R14 \ // y1 = a ^ (a >> (22-13) - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - XORL f, R15 \ // y2 = f^g - MOVOU X1, X8 \ - LONG $0x720f4166; WORD $0x03d0 \ // PSRLD XMM8,0x3 /* XTMP4 = W[-15] >> 3 */ - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ANDL d, R15 \ // y2 = (f^g)&e - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - LONG $0xf1720f66; BYTE $0x0e \ // PSLLD XMM1,0xe /* */ - XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - XORL f, R15 \ // y2 = CH = ((f^g)&e)^g - LONG $0xd9ef0f66 \ // PXOR XMM3,XMM1 /* */ - ADDL R13, R15 \ // y2 = S1 + CH - ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - LONG $0xdaef0f66 \ // PXOR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR */ - MOVL h, R13 \ // y0 = a - ADDL R15, g \ // h = h + S1 + CH + k + w - MOVL h, R15 \ // y2 = a - MOVOU X3, X1 \ - LONG $0xef0f4166; BYTE $0xc8 \ // PXOR XMM1,XMM8 /* XTMP1 = s0 */ - ORL b, R13 \ // y0 = a|c - ADDL g, c \ // d = d + h + S1 + CH + k + w - ANDL b, R15 \ // y2 = a&c - \ - \ // compute low s1 - \ - LONG $0xd7700f66; BYTE $0xfa \ // PSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */ - ANDL a, R13 \ // y0 = (a|c)&b - ADDL R14, g \ // h = h + S1 + CH + k + w 
+ S0 - LONG $0xc1fe0f66 \ // PADDD XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */ - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ - \ // ROTATE_ARGS - MOVL c, R13 \ // y0 = e - MOVL g, R14 \ // y1 = a - ROLL $18, R13 \ // y0 = e >> (25-11) - XORL c, R13 \ // y0 = e ^ (e >> (25-11)) - ROLL $23, R14 \ // y1 = a >> (22-13) - MOVL d, R15 \ // y2 = f - XORL g, R14 \ // y1 = a ^ (a >> (22-13) - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - MOVOU X2, X8 \ - LONG $0x720f4166; WORD $0x0ad0 \ // PSRLD XMM8,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */ - XORL e, R15 \ // y2 = f^g - MOVOU X2, X3 \ - LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */ - XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ANDL c, R15 \ // y2 = (f^g)&e - LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */ - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - XORL e, R15 \ // y2 = CH = ((f^g)&e)^g - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* */ - ADDL R13, R15 \ // y2 = S1 + CH - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH - LONG $0xef0f4466; BYTE $0xc2 \ // PXOR XMM8,XMM2 /* XTMP4 = s1 {xBxA} */ - MOVL g, R13 \ // y0 = a - ADDL R15, f \ // h = h + S1 + CH + k + w - MOVL g, R15 \ // y2 = a - LONG $0x380f4566; WORD $0xc200 \ // PSHUFB XMM8,XMM10 /* XTMP4 = s1 {00BA} */ - ORL a, R13 \ // y0 = a|c - ADDL f, b \ // d = d + h + S1 + CH + k + w - ANDL a, R15 \ // y2 = a&c - LONG $0xfe0f4166; BYTE $0xc0 \ // PADDD XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */ - ANDL h, R13 \ // y0 = (a|c)&b - ADDL R14, f \ // h = h + S1 + CH + k + w + S0 - \ - \ // compute high s1 - \ - LONG $0xd0700f66; BYTE $0x50 \ // PSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */ - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL 
R13, f \ // h = h + S1 + CH + k + w + S0 + MAJ - \ // ROTATE_ARGS - MOVL b, R13 \ // y0 = e - ROLL $18, R13 \ // y0 = e >> (25-11) - MOVL f, R14 \ // y1 = a - ROLL $23, R14 \ // y1 = a >> (22-13) - XORL b, R13 \ // y0 = e ^ (e >> (25-11)) - MOVL c, R15 \ // y2 = f - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - MOVOU X2, X11 \ - LONG $0x720f4166; WORD $0x0ad3 \ // PSRLD XMM11,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */ - XORL f, R14 \ // y1 = a ^ (a >> (22-13) - XORL d, R15 \ // y2 = f^g - MOVOU X2, X3 \ - LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ - XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ANDL b, R15 \ // y2 = (f^g)&e - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */ - XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - XORL d, R15 \ // y2 = CH = ((f^g)&e)^g - LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* */ - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL R13, R15 \ // y2 = S1 + CH - ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH - LONG $0xef0f4466; BYTE $0xda \ // PXOR XMM11,XMM2 /* XTMP5 = s1 {xDxC} */ - MOVL f, R13 \ // y0 = a - ADDL R15, e \ // h = h + S1 + CH + k + w - MOVL f, R15 \ // y2 = a - LONG $0x380f4566; WORD $0xdc00 \ // PSHUFB XMM11,XMM12 /* XTMP5 = s1 {DC00} */ - ORL h, R13 \ // y0 = a|c - ADDL e, a \ // d = d + h + S1 + CH + k + w - ANDL h, R15 \ // y2 = a&c - MOVOU X11, X4 \ - LONG $0xe0fe0f66 \ // PADDD XMM4,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */ - ANDL g, R13 \ // y0 = (a|c)&b - ADDL R14, e \ // h = h + S1 + CH + k + w + S0 - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ - \ // ROTATE_ARGS - ROTATE_XS - -#define DO_ROUND(a, b, c, d, e, f, g, h, offset) \ - MOVL e, R13 \ // y0 = e - ROLL $18, R13 \ // y0 = e >> (25-11) - MOVL a, R14 \ // y1 = a - XORL e, R13 \ 
// y0 = e ^ (e >> (25-11)) - ROLL $23, R14 \ // y1 = a >> (22-13) - MOVL f, R15 \ // y2 = f - XORL a, R14 \ // y1 = a ^ (a >> (22-13) - ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6)) - XORL g, R15 \ // y2 = f^g - XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2)) - ANDL e, R15 \ // y2 = (f^g)&e - XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - XORL g, R15 \ // y2 = CH = ((f^g)&e)^g - ADDL R13, R15 \ // y2 = S1 + CH - ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH - MOVL a, R13 \ // y0 = a - ADDL R15, h \ // h = h + S1 + CH + k + w - MOVL a, R15 \ // y2 = a - ORL c, R13 \ // y0 = a|c - ADDL h, d \ // d = d + h + S1 + CH + k + w - ANDL c, R15 \ // y2 = a&c - ANDL b, R13 \ // y0 = (a|c)&b - ADDL R14, h \ // h = h + S1 + CH + k + w + S0 - ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c) - ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ - -// func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) -TEXT ·blockSsse(SB), 7, $0-80 - - MOVQ h+0(FP), SI // SI: &h - MOVQ message_base+24(FP), R8 // &message - MOVQ message_len+32(FP), R9 // length of message - CMPQ R9, $0 - JEQ done_hash - ADDQ R8, R9 - MOVQ R9, reserved2+64(FP) // store end of message - - // Register definition - // a --> eax - // b --> ebx - // c --> ecx - // d --> r8d - // e --> edx - // f --> r9d - // g --> r10d - // h --> r11d - // - // y0 --> r13d - // y1 --> r14d - // y2 --> r15d - - MOVL (0*4)(SI), AX // a = H0 - MOVL (1*4)(SI), BX // b = H1 - MOVL (2*4)(SI), CX // c = H2 - MOVL (3*4)(SI), R8 // d = H3 - MOVL (4*4)(SI), DX // e = H4 - MOVL (5*4)(SI), R9 // f = H5 - MOVL (6*4)(SI), R10 // g = H6 - MOVL (7*4)(SI), R11 // h = H7 - - MOVOU bflipMask<>(SB), X13 - MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA - MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00 - - MOVQ 
message_base+24(FP), SI // SI: &message - -loop0: - LEAQ constants<>(SB), BP - - // byte swap first 16 dwords - MOVOU 0*16(SI), X4 - LONG $0x380f4166; WORD $0xe500 // PSHUFB XMM4, XMM13 - MOVOU 1*16(SI), X5 - LONG $0x380f4166; WORD $0xed00 // PSHUFB XMM5, XMM13 - MOVOU 2*16(SI), X6 - LONG $0x380f4166; WORD $0xf500 // PSHUFB XMM6, XMM13 - MOVOU 3*16(SI), X7 - LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13 - - MOVQ SI, reserved3+72(FP) - MOVD $0x3, DI - - // Align - // nop WORD PTR [rax+rax*1+0x0] - - // schedule 48 input dwords, by doing 3 rounds of 16 each -loop1: - MOVOU X4, X9 - LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, reserved0+48(FP) - FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) - - MOVOU X4, X9 - LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, reserved0+48(FP) - FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) - - MOVOU X4, X9 - LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */ - MOVOU X9, reserved0+48(FP) - FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) - - MOVOU X4, X9 - LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */ - MOVOU X9, reserved0+48(FP) - ADDQ $64, BP - FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) - - SUBQ $1, DI - JNE loop1 - - MOVD $0x2, DI - -loop2: - MOVOU X4, X9 - LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */ - MOVOU X9, reserved0+48(FP) - DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48) - DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52) - DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56) - DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60) - - MOVOU X5, X9 - LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */ - MOVOU X9, reserved0+48(FP) - ADDQ $32, BP - DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48) - DO_ROUND( R8, 
DX, R9, R10, R11, AX, BX, CX, 52) - DO_ROUND( CX, R8, DX, R9, R10, R11, AX, BX, 56) - DO_ROUND( BX, CX, R8, DX, R9, R10, R11, AX, 60) - - MOVOU X6, X4 - MOVOU X7, X5 - - SUBQ $1, DI - JNE loop2 - - MOVQ h+0(FP), SI // SI: &h - ADDL (0*4)(SI), AX // H0 = a + H0 - MOVL AX, (0*4)(SI) - ADDL (1*4)(SI), BX // H1 = b + H1 - MOVL BX, (1*4)(SI) - ADDL (2*4)(SI), CX // H2 = c + H2 - MOVL CX, (2*4)(SI) - ADDL (3*4)(SI), R8 // H3 = d + H3 - MOVL R8, (3*4)(SI) - ADDL (4*4)(SI), DX // H4 = e + H4 - MOVL DX, (4*4)(SI) - ADDL (5*4)(SI), R9 // H5 = f + H5 - MOVL R9, (5*4)(SI) - ADDL (6*4)(SI), R10 // H6 = g + H6 - MOVL R10, (6*4)(SI) - ADDL (7*4)(SI), R11 // H7 = h + H7 - MOVL R11, (7*4)(SI) - - MOVQ reserved3+72(FP), SI - ADDQ $64, SI - CMPQ reserved2+64(FP), SI - JNE loop0 - -done_hash: - RET - -// Constants table -DATA constants<>+0x0(SB)/8, $0x71374491428a2f98 -DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf -DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b -DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4 -DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98 -DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be -DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74 -DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7 -DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1 -DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6 -DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f -DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc -DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152 -DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8 -DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3 -DATA constants<>+0x78(SB)/8, $0x1429296706ca6351 -DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85 -DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc -DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354 -DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e -DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1 -DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70 -DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819 -DATA constants<>+0xb8(SB)/8, 
$0x106aa070f40e3585 -DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116 -DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c -DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3 -DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f -DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee -DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814 -DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa -DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7 - -DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203 -DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b - -DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100 -DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100 - -GLOBL constants<>(SB), 8, $256 -GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16 -GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16 -GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16 diff --git a/sha256block_amd64.go b/sha256block_amd64.go index 1c4d97f..0c48d45 100644 --- a/sha256block_amd64.go +++ b/sha256block_amd64.go @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc /* * Minio Cloud Storage, (C) 2016 Minio, Inc. 
@@ -18,36 +18,10 @@ package sha256 -func blockArmGo(dig *digest, p []byte) {} - -func blockAvxGo(dig *digest, p []byte) { - - h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]} - - blockAvx(h[:], p[:], 0, 0, 0, 0) - - dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7] -} - -func blockAvx2Go(dig *digest, p []byte) { - - h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]} - - blockAvx2(h[:], p[:]) - - dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7] -} - -func blockSsseGo(dig *digest, p []byte) { - - h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]} - - blockSsse(h[:], p[:], 0, 0, 0, 0) - - dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7] +func blockArmGo(dig *digest, p []byte) { + panic("blockArmGo called unexpectedly") } func blockShaGo(dig *digest, p []byte) { - blockSha(&dig.h, p) } diff --git a/sha256block_arm64.go b/sha256block_arm64.go index 0979c20..58ccf6e 100644 --- a/sha256block_arm64.go +++ b/sha256block_arm64.go @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc /* * Minio Cloud Storage, (C) 2016 Minio, Inc. 
@@ -18,10 +18,10 @@ package sha256 -func blockAvx2Go(dig *digest, p []byte) {} -func blockAvxGo(dig *digest, p []byte) {} -func blockSsseGo(dig *digest, p []byte) {} -func blockShaGo(dig *digest, p []byte) {} +// blockShaGo is a placeholder on arm64 (the SHA-extensions path it names is the amd64 one); it is not expected to be called here and panics if it is. +func blockShaGo(dig *digest, p []byte) { + panic("blockShaGo called unexpectedly") +} //go:noescape func blockArm(h []uint32, message []uint8) diff --git a/sha256block_arm64.s b/sha256block_arm64.s index c6ddb37..d85170d 100644 --- a/sha256block_arm64.s +++ b/sha256block_arm64.s @@ -1,4 +1,4 @@ -//+build !noasm,!appengine +//+build !noasm,!appengine,gc // ARM64 version of SHA256 diff --git a/sha256block_other.go b/sha256block_other.go index 0187c95..ec586c0 100644 --- a/sha256block_other.go +++ b/sha256block_other.go @@ -1,4 +1,4 @@ -//+build appengine noasm !amd64,!arm64 +//+build appengine noasm !amd64,!arm64 !gc /* * Minio Cloud Storage, (C) 2019 Minio, Inc. @@ -18,8 +18,12 @@ package sha256 -func blockAvx2Go(dig *digest, p []byte) {} -func blockAvxGo(dig *digest, p []byte) {} -func blockSsseGo(dig *digest, p []byte) {} -func blockShaGo(dig *digest, p []byte) {} -func blockArmGo(dig *digest, p []byte) {} +// blockShaGo is a placeholder for builds without the assembly implementations; it is not expected to be called and panics if it is. +func blockShaGo(dig *digest, p []byte) { + panic("blockShaGo called unexpectedly") +} + +// blockArmGo is a placeholder for builds without the assembly implementations; it is not expected to be called and panics if it is. +func blockArmGo(dig *digest, p []byte) { + panic("blockArmGo called unexpectedly") +}