Codebase list golang-github-minio-sha256-simd / 6898e4f
Feature/avx512 support (#30) * Initial implementation of 16x parallel support for SHA256 hashing on AVX512 * Updated tests * Add support for detection of AVX512 capabilities * Add Write support for arbitrary blocks and proper length adding for Sum * Fix test and remove formatting * Remove old comments * Cache final digest on client * Updated version with more optimized assembly listing/formatting Frank Wessels authored 6 years ago GitHub committed 6 years ago
5 changed file(s) with 2071 addition(s) and 51 deletion(s). Raw diff Collapse all Expand all
1515 package sha256
1616
1717 // True when SIMD instructions are available.
18 var avx512 = haveAVX512()
1819 var avx2 = haveAVX2()
1920 var avx = haveAVX()
2021 var ssse3 = haveSSSE3()
4546 return false
4647 }
4748
49 // haveAVX512 returns true when there is AVX512 support
50 func haveAVX512() bool {
51 mfi, _, _, _ := cpuid(0)
52
53 // Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
54 if mfi >= 7 {
55 _, _, c, _ := cpuid(1)
56
57 // Only detect AVX-512 features if XGETBV is supported
58 if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
59 // Check for OS support
60 eax, _ := xgetbv(0)
61 _, ebx, _, _ := cpuidex(7, 0)
62
63 // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
64 // ZMM16-ZMM31 state are enabled by OS)
65 /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS).
66 if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
67 if ebx&(1<<16) == 0 {
68 return false // no AVX512F
69 }
70 if ebx&(1<<17) == 0 {
71 return false // no AVX512DQ
72 }
73 if ebx&(1<<30) == 0 {
74 return false // no AVX512BW
75 }
76 if ebx&(1<<31) == 0 {
77 return false // no AVX512VL
78 }
79 return true
80 }
81 }
82 }
83 return false
84 }
85
4886 // haveSSSE3 returns true when there is SSSE3 support
4987 func haveSSSE3() bool {
5088
4949 package sha256
5050
5151 import (
52 "bytes"
53 "encoding/binary"
5254 "encoding/hex"
5355 "fmt"
56 "hash"
57 "reflect"
58 "strings"
59 "sync"
5460 "testing"
5561 )
5662
159165 (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
160166 (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128)},
161167 "ab"},
162 {[32]byte{(0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128),
163 (0 * 1) + (0 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (0 * 128),
164 (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
165 (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128),
166 (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (1 * 128),
167 (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
168 (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (1 * 128),
169 (0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (1 * 128),
170 (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (0 * 128),
171 (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (0 * 128),
172 (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (0 * 128),
173 (0 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (0 * 32) + (1 * 64) + (1 * 128),
174 (1 * 1) + (0 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (0 * 32) + (1 * 64) + (0 * 128),
175 (0 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (1 * 128),
176 (0 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (0 * 128),
177 (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (0 * 128),
178 (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128),
179 (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
180 (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (0 * 128),
181 (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (1 * 128),
182 (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (1 * 128),
183 (1 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
184 (0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (0 * 128),
185 (0 * 1) + (0 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (1 * 128),
186 (0 * 1) + (0 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128),
187 (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
188 (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (1 * 128),
189 (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (0 * 128),
190 (0 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (1 * 128),
191 (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
192 (1 * 1) + (0 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
193 (1 * 1) + (0 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (1 * 128)},
194 "abc"},
195168 {[32]byte{(0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128),
196169 (0 * 1) + (0 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (0 * 128),
197170 (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128),
11161089 (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (1 * 128),
11171090 (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (1 * 128)},
11181091 "How can you write a big system without C++? -Paul Glick"},
1092 // $ echo -n "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123" | sha256sum
1093 // 13d8b6bf5cc79c03c07c719c48597bd33b79677e65098589b1580fca7f22bb22
1094 {[32]byte{0x13, 0xd8, 0xb6, 0xbf, 0x5c, 0xc7, 0x9c, 0x03,
1095 0xc0, 0x7c, 0x71, 0x9c, 0x48, 0x59, 0x7b, 0xd3,
1096 0x3b, 0x79, 0x67, 0x7e, 0x65, 0x09, 0x85, 0x89,
1097 0xb1, 0x58, 0x0f, 0xca, 0x7f, 0x22, 0xbb, 0x22},
1098 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123"},
1099 // $ echo -n "BCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234" | sha256sum
1100 // 624ddef3009879c6874da2dd771d54f7330781b60e1955ceff5f9dce8bf4ea43
1101 {[32]byte{0x62, 0x4d, 0xde, 0xf3, 0x00, 0x98, 0x79, 0xc6,
1102 0x87, 0x4d, 0xa2, 0xdd, 0x77, 0x1d, 0x54, 0xf7,
1103 0x33, 0x07, 0x81, 0xb6, 0x0e, 0x19, 0x55, 0xce,
1104 0xff, 0x5f, 0x9d, 0xce, 0x8b, 0xf4, 0xea, 0x43},
1105 "BCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234"},
1106 // $ echo -n "CDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345" | sha256sum
1107 // cc031589b70dd4b24dc6def2121835ef1aa8074ff6952cdd3f81b5099a93c58d
1108 {[32]byte{0xcc, 0x03, 0x15, 0x89, 0xb7, 0x0d, 0xd4, 0xb2,
1109 0x4d, 0xc6, 0xde, 0xf2, 0x12, 0x18, 0x35, 0xef,
1110 0x1a, 0xa8, 0x07, 0x4f, 0xf6, 0x95, 0x2c, 0xdd,
1111 0x3f, 0x81, 0xb5, 0x09, 0x9a, 0x93, 0xc5, 0x8d},
1112 "CDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345"},
1113 // $ echo -n "DEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456" | sha256sum
1114 // d354abb6d538402db3d73daf95537a255ebaf3a943c80205be163e044fc46a70
1115 {[32]byte{0xd3, 0x54, 0xab, 0xb6, 0xd5, 0x38, 0x40, 0x2d,
1116 0xb3, 0xd7, 0x3d, 0xaf, 0x95, 0x53, 0x7a, 0x25,
1117 0x5e, 0xba, 0xf3, 0xa9, 0x43, 0xc8, 0x02, 0x05,
1118 0xbe, 0x16, 0x3e, 0x04, 0x4f, 0xc4, 0x6a, 0x70},
1119 "DEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456"},
1120 // $ echo -n "EFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567" | sha256sum
1121 // f78410b90a20b521afb28f41d6388482afab7265ff8884aa6290cc9f9ada30d3
1122 {[32]byte{0xf7, 0x84, 0x10, 0xb9, 0x0a, 0x20, 0xb5, 0x21,
1123 0xaf, 0xb2, 0x8f, 0x41, 0xd6, 0x38, 0x84, 0x82,
1124 0xaf, 0xab, 0x72, 0x65, 0xff, 0x88, 0x84, 0xaa,
1125 0x62, 0x90, 0xcc, 0x9f, 0x9a, 0xda, 0x30, 0xd3},
1126 "EFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567"},
1127 // $ echo -n "FGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345678" | sha256sum
1128 // c93a8cb7ed80166b15b79c8617410ca69e46fa1e3c1d14876699d3ce6090384f
1129 {[32]byte{0xc9, 0x3a, 0x8c, 0xb7, 0xed, 0x80, 0x16, 0x6b,
1130 0x15, 0xb7, 0x9c, 0x86, 0x17, 0x41, 0x0c, 0xa6,
1131 0x9e, 0x46, 0xfa, 0x1e, 0x3c, 0x1d, 0x14, 0x87,
1132 0x66, 0x99, 0xd3, 0xce, 0x60, 0x90, 0x38, 0x4f},
1133 "FGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345678"},
1134 // $ echo -n "GHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456789" | sha256sum
1135 // 6cb808e9a7fb53fa680824f08554b660d29a4afc9a101f990b4bae3a12b7fbd8
1136 {[32]byte{0x6c, 0xb8, 0x08, 0xe9, 0xa7, 0xfb, 0x53, 0xfa,
1137 0x68, 0x08, 0x24, 0xf0, 0x85, 0x54, 0xb6, 0x60,
1138 0xd2, 0x9a, 0x4a, 0xfc, 0x9a, 0x10, 0x1f, 0x99,
1139 0x0b, 0x4b, 0xae, 0x3a, 0x12, 0xb7, 0xfb, 0xd8},
1140 "GHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456789"},
1141 // $ echo -n "HIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890" | sha256sum
1142 // 84e8dd1afa78db222860ed40b6fcfc7a269469365f81f5712fb589555bdb01fe
1143 {[32]byte{0x84, 0xe8, 0xdd, 0x1a, 0xfa, 0x78, 0xdb, 0x22,
1144 0x28, 0x60, 0xed, 0x40, 0xb6, 0xfc, 0xfc, 0x7a,
1145 0x26, 0x94, 0x69, 0x36, 0x5f, 0x81, 0xf5, 0x71,
1146 0x2f, 0xb5, 0x89, 0x55, 0x5b, 0xdb, 0x01, 0xfe},
1147 "HIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"},
1148 // $ echo -n "IJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890A" | sha256sum
1149 // accab8e85b6bd178e975aaaa354aed8258bcd6af3e61bd4f12267635856cab0b
1150 {[32]byte{0xac, 0xca, 0xb8, 0xe8, 0x5b, 0x6b, 0xd1, 0x78,
1151 0xe9, 0x75, 0xaa, 0xaa, 0x35, 0x4a, 0xed, 0x82,
1152 0x58, 0xbc, 0xd6, 0xaf, 0x3e, 0x61, 0xbd, 0x4f,
1153 0x12, 0x26, 0x76, 0x35, 0x85, 0x6c, 0xab, 0x0b},
1154 "IJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890A"},
1155 // $ echo -n "JKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890AB" | sha256sum
1156 // 107f5ad8bc5d427246fc5f9c581134b61d8ba447e877df56cddad2bf53789172
1157 {[32]byte{0x10, 0x7f, 0x5a, 0xd8, 0xbc, 0x5d, 0x42, 0x72,
1158 0x46, 0xfc, 0x5f, 0x9c, 0x58, 0x11, 0x34, 0xb6,
1159 0x1d, 0x8b, 0xa4, 0x47, 0xe8, 0x77, 0xdf, 0x56,
1160 0xcd, 0xda, 0xd2, 0xbf, 0x53, 0x78, 0x91, 0x72},
1161 "JKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890AB"},
1162 // $ echo -n "KLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABC" | sha256sum
1163 // 7666f65b234f78aa537c8d098b181091ce8b7866a0285b52e6bf31b6f21ca9bb
1164 {[32]byte{0x76, 0x66, 0xf6, 0x5b, 0x23, 0x4f, 0x78, 0xaa,
1165 0x53, 0x7c, 0x8d, 0x09, 0x8b, 0x18, 0x10, 0x91,
1166 0xce, 0x8b, 0x78, 0x66, 0xa0, 0x28, 0x5b, 0x52,
1167 0xe6, 0xbf, 0x31, 0xb6, 0xf2, 0x1c, 0xa9, 0xbb},
1168 "KLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABC"},
1169 // $ echo -n "LMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCD" | sha256sum
1170 // 4eba948ccee7289ab1f01628a1ab756dee39a6894aed217edc9a91a8b35e50ca
1171 {[32]byte{0x4e, 0xba, 0x94, 0x8c, 0xce, 0xe7, 0x28, 0x9a,
1172 0xb1, 0xf0, 0x16, 0x28, 0xa1, 0xab, 0x75, 0x6d,
1173 0xee, 0x39, 0xa6, 0x89, 0x4a, 0xed, 0x21, 0x7e,
1174 0xdc, 0x9a, 0x91, 0xa8, 0xb3, 0x5e, 0x50, 0xca},
1175 "LMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCD"},
1176 // $ echo -n "MNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDE" | sha256sum
1177 // 5011218873e7ca84871668d26461e449e7033b7959d69cfb5c2fee773c3d432d
1178 {[32]byte{0x50, 0x11, 0x21, 0x88, 0x73, 0xe7, 0xca, 0x84,
1179 0x87, 0x16, 0x68, 0xd2, 0x64, 0x61, 0xe4, 0x49,
1180 0xe7, 0x03, 0x3b, 0x79, 0x59, 0xd6, 0x9c, 0xfb,
1181 0x5c, 0x2f, 0xee, 0x77, 0x3c, 0x3d, 0x43, 0x2d},
1182 "MNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDE"},
1183 // $ echo -n "NOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEF" | sha256sum
1184 // 6932b4ddaf3696e5d5270739bdbe6ab120bb8034b877bd3a8e5a5d5ca263e1c5
1185 {[32]byte{0x69, 0x32, 0xb4, 0xdd, 0xaf, 0x36, 0x96, 0xe5,
1186 0xd5, 0x27, 0x07, 0x39, 0xbd, 0xbe, 0x6a, 0xb1,
1187 0x20, 0xbb, 0x80, 0x34, 0xb8, 0x77, 0xbd, 0x3a,
1188 0x8e, 0x5a, 0x5d, 0x5c, 0xa2, 0x63, 0xe1, 0xc5},
1189 "NOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEF"},
1190 // $ echo -n "OPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFG" | sha256sum
1191 // 91bb1bcbfcb4c093aab255a0b8c8b5b93605e2f51dd6b0898b70b9f3c10fc1f9
1192 {[32]byte{0x91, 0xbb, 0x1b, 0xcb, 0xfc, 0xb4, 0xc0, 0x93,
1193 0xaa, 0xb2, 0x55, 0xa0, 0xb8, 0xc8, 0xb5, 0xb9,
1194 0x36, 0x05, 0xe2, 0xf5, 0x1d, 0xd6, 0xb0, 0x89,
1195 0x8b, 0x70, 0xb9, 0xf3, 0xc1, 0x0f, 0xc1, 0xf9},
1196 "OPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFG"},
1197 // $ echo -n "PQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFGH" | sha256sum
1198 // 0d1fa5355388e361c4591bd49c004e3d99044be274db43e91036611365aead02
1199 {[32]byte{0x0d, 0x1f, 0xa5, 0x35, 0x53, 0x88, 0xe3, 0x61,
1200 0xc4, 0x59, 0x1b, 0xd4, 0x9c, 0x00, 0x4e, 0x3d,
1201 0x99, 0x04, 0x4b, 0xe2, 0x74, 0xdb, 0x43, 0xe9,
1202 0x10, 0x36, 0x61, 0x13, 0x65, 0xae, 0xad, 0x02},
1203 "PQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFGH"},
1204 // $ echo -n "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" | sha256sum
1205 // b6ac3cc10386331c765f04f041c147d0f278f2aed8eaa021e2d0057fc6f6ff9e
1206 {[32]byte{0xb6, 0xac, 0x3c, 0xc1, 0x03, 0x86, 0x33, 0x1c,
1207 0x76, 0x5f, 0x04, 0xf0, 0x41, 0xc1, 0x47, 0xd0,
1208 0xf2, 0x78, 0xf2, 0xae, 0xd8, 0xea, 0xa0, 0x21,
1209 0xe2, 0xd0, 0x05, 0x7f, 0xc6, 0xf6, 0xff, 0x9e},
1210 strings.Repeat("A", 128)},
1211 // $ echo -n "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB" | sha256sum
1212 // 7abaa701a6f4bb8d9ea3872a315597eb6f2ccfd03392d8d10560837f6136d06a
1213 {[32]byte{0x7a, 0xba, 0xa7, 0x01, 0xa6, 0xf4, 0xbb, 0x8d,
1214 0x9e, 0xa3, 0x87, 0x2a, 0x31, 0x55, 0x97, 0xeb,
1215 0x6f, 0x2c, 0xcf, 0xd0, 0x33, 0x92, 0xd8, 0xd1,
1216 0x05, 0x60, 0x83, 0x7f, 0x61, 0x36, 0xd0, 0x6a},
1217 strings.Repeat("B", 128)},
1218 // $ echo -n "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" | sha256sum
1219 // 6e8b9325f779dba60c4c148dee5ded43b19ed20d25d66e338abec53b99174fe8
1220 {[32]byte{0x6e, 0x8b, 0x93, 0x25, 0xf7, 0x79, 0xdb, 0xa6,
1221 0x0c, 0x4c, 0x14, 0x8d, 0xee, 0x5d, 0xed, 0x43,
1222 0xb1, 0x9e, 0xd2, 0x0d, 0x25, 0xd6, 0x6e, 0x33,
1223 0x8a, 0xbe, 0xc5, 0x3b, 0x99, 0x17, 0x4f, 0xe8},
1224 strings.Repeat("C", 128)},
1225 // $ echo -n "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD" | sha256sum
1226 // 7aa020c91ac4d32e17efd9b64648b92e375987e0eae7d0a58544ca1e4fc32c3c
1227 {[32]byte{0x7a, 0xa0, 0x20, 0xc9, 0x1a, 0xc4, 0xd3, 0x2e,
1228 0x17, 0xef, 0xd9, 0xb6, 0x46, 0x48, 0xb9, 0x2e,
1229 0x37, 0x59, 0x87, 0xe0, 0xea, 0xe7, 0xd0, 0xa5,
1230 0x85, 0x44, 0xca, 0x1e, 0x4f, 0xc3, 0x2c, 0x3c},
1231 strings.Repeat("D", 128)},
1232 // $ echo -n "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE" | sha256sum
1233 // 997f6a2fc44f1400e9f64d7eac11fe99e21f4b7a3fc2ff3ec95c2ef016abb9e5
1234 {[32]byte{0x99, 0x7f, 0x6a, 0x2f, 0xc4, 0x4f, 0x14, 0x00,
1235 0xe9, 0xf6, 0x4d, 0x7e, 0xac, 0x11, 0xfe, 0x99,
1236 0xe2, 0x1f, 0x4b, 0x7a, 0x3f, 0xc2, 0xff, 0x3e,
1237 0xc9, 0x5c, 0x2e, 0xf0, 0x16, 0xab, 0xb9, 0xe5},
1238 strings.Repeat("E", 128)},
1239 // $ echo -n "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" | sha256sum
1240 // 5c6cdeb9ccaa1d9c57662605ab738ec4ecf0467f576d4c2d7fae48710215582a
1241 {[32]byte{0x5c, 0x6c, 0xde, 0xb9, 0xcc, 0xaa, 0x1d, 0x9c,
1242 0x57, 0x66, 0x26, 0x05, 0xab, 0x73, 0x8e, 0xc4,
1243 0xec, 0xf0, 0x46, 0x7f, 0x57, 0x6d, 0x4c, 0x2d,
1244 0x7f, 0xae, 0x48, 0x71, 0x02, 0x15, 0x58, 0x2a},
1245 strings.Repeat("F", 128)},
1246 // $ echo -n "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG" | sha256sum
1247 // 394394b5f0e91a21d1e932f9ed55e098c8b05f3668f77134eeee843fef1d1758
1248 {[32]byte{0x39, 0x43, 0x94, 0xb5, 0xf0, 0xe9, 0x1a, 0x21,
1249 0xd1, 0xe9, 0x32, 0xf9, 0xed, 0x55, 0xe0, 0x98,
1250 0xc8, 0xb0, 0x5f, 0x36, 0x68, 0xf7, 0x71, 0x34,
1251 0xee, 0xee, 0x84, 0x3f, 0xef, 0x1d, 0x17, 0x58},
1252 strings.Repeat("G", 128)},
1253 // $ echo -n "HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH" | sha256sum
1254 // cab546612de68eaa849487342baadbac2561df6380ddac66137ef649e0cdfd0a
1255 {[32]byte{0xca, 0xb5, 0x46, 0x61, 0x2d, 0xe6, 0x8e, 0xaa,
1256 0x84, 0x94, 0x87, 0x34, 0x2b, 0xaa, 0xdb, 0xac,
1257 0x25, 0x61, 0xdf, 0x63, 0x80, 0xdd, 0xac, 0x66,
1258 0x13, 0x7e, 0xf6, 0x49, 0xe0, 0xcd, 0xfd, 0x0a},
1259 strings.Repeat("H", 128)},
1260 // $ echo -n "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII" | sha256sum
1261 // 2be96cc28445876429be3005db465d1b9c8ed1432e3ac6f1514b6e9eee725ad8
1262 {[32]byte{0x2b, 0xe9, 0x6c, 0xc2, 0x84, 0x45, 0x87, 0x64,
1263 0x29, 0xbe, 0x30, 0x05, 0xdb, 0x46, 0x5d, 0x1b,
1264 0x9c, 0x8e, 0xd1, 0x43, 0x2e, 0x3a, 0xc6, 0xf1,
1265 0x51, 0x4b, 0x6e, 0x9e, 0xee, 0x72, 0x5a, 0xd8},
1266 strings.Repeat("I", 128)},
1267 // $ echo -n "JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ" | sha256sum
1268 // 238e5f81d54f2af58049b944c4a1b9516a36c2ef1e20887450b3482045714444
1269 {[32]byte{0x23, 0x8e, 0x5f, 0x81, 0xd5, 0x4f, 0x2a, 0xf5,
1270 0x80, 0x49, 0xb9, 0x44, 0xc4, 0xa1, 0xb9, 0x51,
1271 0x6a, 0x36, 0xc2, 0xef, 0x1e, 0x20, 0x88, 0x74,
1272 0x50, 0xb3, 0x48, 0x20, 0x45, 0x71, 0x44, 0x44},
1273 strings.Repeat("J", 128)},
1274 // $ echo -n "KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK" | sha256sum
1275 // f3a5b826c64951661ce22dc67f0f79d13f633f0601aca2f5e1cf1a9f17dffd4f
1276 {[32]byte{0xf3, 0xa5, 0xb8, 0x26, 0xc6, 0x49, 0x51, 0x66,
1277 0x1c, 0xe2, 0x2d, 0xc6, 0x7f, 0x0f, 0x79, 0xd1,
1278 0x3f, 0x63, 0x3f, 0x06, 0x01, 0xac, 0xa2, 0xf5,
1279 0xe1, 0xcf, 0x1a, 0x9f, 0x17, 0xdf, 0xfd, 0x4f},
1280 strings.Repeat("K", 128)},
1281 // $ echo -n "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL" | sha256sum
1282 // 1e90c05bedd24dc3e297d5b8fb215b95d8b7f4a040ee912069614c7a3382725d
1283 {[32]byte{0x1e, 0x90, 0xc0, 0x5b, 0xed, 0xd2, 0x4d, 0xc3,
1284 0xe2, 0x97, 0xd5, 0xb8, 0xfb, 0x21, 0x5b, 0x95,
1285 0xd8, 0xb7, 0xf4, 0xa0, 0x40, 0xee, 0x91, 0x20,
1286 0x69, 0x61, 0x4c, 0x7a, 0x33, 0x82, 0x72, 0x5d},
1287 strings.Repeat("L", 128)},
1288 // $ echo -n "MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM" | sha256sum
1289 // 96239ac6fb99822797308f18d8455778fb5885103aa5ff59afe2219df657df99
1290 {[32]byte{0x96, 0x23, 0x9a, 0xc6, 0xfb, 0x99, 0x82, 0x27,
1291 0x97, 0x30, 0x8f, 0x18, 0xd8, 0x45, 0x57, 0x78,
1292 0xfb, 0x58, 0x85, 0x10, 0x3a, 0xa5, 0xff, 0x59,
1293 0xaf, 0xe2, 0x21, 0x9d, 0xf6, 0x57, 0xdf, 0x99},
1294 strings.Repeat("M", 128)},
1295 // $ echo -n "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" | sha256sum
1296 // 11e7f5a6f15a4addba9b6b21bc4f8ecbdd969e179335269fc68d3a05f0f3da4a
1297 {[32]byte{0x11, 0xe7, 0xf5, 0xa6, 0xf1, 0x5a, 0x4a, 0xdd,
1298 0xba, 0x9b, 0x6b, 0x21, 0xbc, 0x4f, 0x8e, 0xcb,
1299 0xdd, 0x96, 0x9e, 0x17, 0x93, 0x35, 0x26, 0x9f,
1300 0xc6, 0x8d, 0x3a, 0x05, 0xf0, 0xf3, 0xda, 0x4a},
1301 strings.Repeat("N", 128)},
1302 // $ echo -n "OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO" | sha256sum
1303 // ae843b7e4e00afeb972bf948a345b319cca8bd0bcaa1428c1c67c88ea663c1e0
1304 {[32]byte{0xae, 0x84, 0x3b, 0x7e, 0x4e, 0x00, 0xaf, 0xeb,
1305 0x97, 0x2b, 0xf9, 0x48, 0xa3, 0x45, 0xb3, 0x19,
1306 0xcc, 0xa8, 0xbd, 0x0b, 0xca, 0xa1, 0x42, 0x8c,
1307 0x1c, 0x67, 0xc8, 0x8e, 0xa6, 0x63, 0xc1, 0xe0},
1308 strings.Repeat("O", 128)},
1309 // $ echo -n "PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP" | sha256sum
1310 // f16ef3e254ffb74b7e3c97d99486ef8c549e4c80bc6dfed7fe8c5e7e76f4fbcd
1311 {[32]byte{0xf1, 0x6e, 0xf3, 0xe2, 0x54, 0xff, 0xb7, 0x4b,
1312 0x7e, 0x3c, 0x97, 0xd9, 0x94, 0x86, 0xef, 0x8c,
1313 0x54, 0x9e, 0x4c, 0x80, 0xbc, 0x6d, 0xfe, 0xd7,
1314 0xfe, 0x8c, 0x5e, 0x7e, 0x76, 0xf4, 0xfb, 0xcd},
1315 strings.Repeat("P", 128)},
11191316 }
11201317
11211318 func TestGolden(t *testing.T) {
11271324 }
11281325 }
11291326
1327 func TestGoldenAVX512(t *testing.T) {
1328
1329 if !avx512 {
1330 t.SkipNow()
1331 return
1332 }
1333
1334 server := NewAvx512Server()
1335 h512 := NewAvx512(server)
1336
1337 for _, g := range golden {
1338 h512.Reset()
1339 h512.Write([]byte(g.in))
1340 digest := h512.Sum([]byte{})
1341 s := fmt.Sprintf("%x", digest)
1342 if !reflect.DeepEqual(digest, g.out[:]) {
1343 t.Fatalf("Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:]))
1344 }
1345 }
1346 }
1347
11301348 func TestSize(t *testing.T) {
11311349 c := New()
11321350 if got := c.Size(); got != Size {
11411359 }
11421360 }
11431361
1144 var bench = New()
1145 var buf = make([]byte, 1024*1024)
1146
11471362 func benchmarkSize(b *testing.B, size int) {
1363 var bench = New()
1364 var buf = make([]byte, size)
11481365 b.SetBytes(int64(size))
11491366 sum := make([]byte, bench.Size())
11501367 for i := 0; i < b.N; i++ {
11541371 }
11551372 }
11561373
1157 func BenchmarkHash8Bytes(b *testing.B) {
1158 benchmarkSize(b, 8)
1159 }
1160
1161 func BenchmarkHash1K(b *testing.B) {
1162 benchmarkSize(b, 1024)
1163 }
1164
1165 func BenchmarkHash8K(b *testing.B) {
1166 benchmarkSize(b, 8192)
1167 }
1168
1169 func BenchmarkHash1M(b *testing.B) {
1170 benchmarkSize(b, 1024*1024)
1171 }
1374 func BenchmarkHash8Bytes(b *testing.B) { benchmarkSize(b, 8) }
1375 func BenchmarkHash1K(b *testing.B) { benchmarkSize(b, 1024) }
1376 func BenchmarkHash8K(b *testing.B) { benchmarkSize(b, 8192) }
1377 func BenchmarkHash1MAvx2(b *testing.B) { benchmarkSize(b, 1024*1024) }
1378 func BenchmarkHash5MAvx2(b *testing.B) { benchmarkSize(b, 5*1024*1024) }
1379 func BenchmarkHash10MAvx2(b *testing.B) { benchmarkSize(b, 10*1024*1024) }
1380
// createInputs returns 16 separately allocated, zero-filled buffers of size
// bytes each.
func createInputs(size int) [16][]byte {
	var lanes [16][]byte
	for lane := range lanes {
		lanes[lane] = make([]byte, size)
	}
	return lanes
}
1388
1389 func initDigests() *[512]byte {
1390 digests := [512]byte{}
1391 for i := 0; i < 16; i++ {
1392 binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0)
1393 binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1)
1394 binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2)
1395 binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3)
1396 binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4)
1397 binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5)
1398 binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6)
1399 binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7)
1400 }
1401 return &digests
1402 }
1403
1404 func testSha256Avx512(t *testing.T, offset, padding int) [16][]byte {
1405
1406 if !avx512 {
1407 t.SkipNow()
1408 return [16][]byte{}
1409 }
1410
1411 l := uint(len(golden[offset].in))
1412 extraBlock := uint(0)
1413 if padding == 0 {
1414 extraBlock += 9
1415 } else {
1416 extraBlock += 64
1417 }
1418 input := createInputs(int(l + extraBlock))
1419 for i := 0; i < 16; i++ {
1420 copy(input[i], golden[offset+i].in)
1421 input[i][l] = 0x80
1422 copy(input[i][l+1:], bytes.Repeat([]byte{0}, padding))
1423
1424 // Length in bits.
1425 len := uint64(l)
1426 len <<= 3
1427 for ii := uint(0); ii < 8; ii++ {
1428 input[i][l+1+uint(padding)+ii] = byte(len >> (56 - 8*ii))
1429 }
1430 }
1431 mask := make([]uint64, len(input[0])>>6)
1432 for m := range mask {
1433 mask[m] = 0xffff
1434 }
1435 output := blockAvx512(initDigests(), input, mask)
1436 for i := 0; i < 16; i++ {
1437 if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 {
1438 t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[i][:]), hex.EncodeToString(golden[offset+i].out[:]))
1439 }
1440 }
1441 return input
1442 }
1443
1444 func TestAvx512_1Block(t *testing.T) { testSha256Avx512(t, 31, 0) }
1445 func TestAvx512_3Blocks(t *testing.T) { testSha256Avx512(t, 47, 55) }
1446
1447 func TestAvx512_MixedBlocks(t *testing.T) {
1448
1449 if !avx512 {
1450 t.SkipNow()
1451 return
1452 }
1453
1454 inputSingleBlock := testSha256Avx512(t, 31, 0)
1455 inputMultiBlock := testSha256Avx512(t, 47, 55)
1456
1457 input := [16][]byte{}
1458
1459 for i := range input {
1460 if i%2 == 0 {
1461 input[i] = inputMultiBlock[i]
1462 } else {
1463 input[i] = inputSingleBlock[i]
1464 }
1465 }
1466
1467 mask := [3]uint64{0xffff, 0x5555, 0x5555}
1468 output := blockAvx512(initDigests(), input, mask[:])
1469 var offset int
1470 for i := 0; i < len(output); i++ {
1471 if i%2 == 0 {
1472 offset = 47
1473 } else {
1474 offset = 31
1475 }
1476 if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 {
1477 t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[i][:]), hex.EncodeToString(golden[offset+i].out[:]))
1478 }
1479 }
1480 }
1481
1482 func TestAvx512_MixedWithNilBlocks(t *testing.T) {
1483
1484 if !avx512 {
1485 t.SkipNow()
1486 return
1487 }
1488
1489 inputSingleBlock := testSha256Avx512(t, 31, 0)
1490 inputMultiBlock := testSha256Avx512(t, 47, 55)
1491
1492 input := [16][]byte{}
1493
1494 for i := range input {
1495 if i%3 == 0 {
1496 input[i] = inputMultiBlock[i]
1497 } else if i%3 == 1 {
1498 input[i] = inputSingleBlock[i]
1499 } else {
1500 input[i] = nil
1501 }
1502 }
1503
1504 mask := [3]uint64{0xb6db, 0x9249, 0x9249}
1505 output := blockAvx512(initDigests(), input, mask[:])
1506 var offset int
1507 for i := 0; i < len(output); i++ {
1508 if i%3 == 2 { // for nil inputs
1509 initvec := [32]byte{0x6a, 0x09, 0xe6, 0x67, 0xbb, 0x67, 0xae, 0x85,
1510 0x3c, 0x6e, 0xf3, 0x72, 0xa5, 0x4f, 0xf5, 0x3a,
1511 0x51, 0x0e, 0x52, 0x7f, 0x9b, 0x05, 0x68, 0x8c,
1512 0x1f, 0x83, 0xd9, 0xab, 0x5b, 0xe0, 0xcd, 0x19}
1513 if bytes.Compare(output[i][:], initvec[:]) != 0 {
1514 t.Fatalf("Sum256 function: sha256 for nil vector = %s want %s", hex.EncodeToString(output[i][:]), hex.EncodeToString(initvec[:]))
1515 }
1516 continue
1517 }
1518 if i%3 == 0 {
1519 offset = 47
1520 } else {
1521 offset = 31
1522 }
1523 if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 {
1524 t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[i][:]), hex.EncodeToString(golden[offset+i].out[:]))
1525 }
1526 }
1527 }
1528
1529 func TestAvx512Server(t *testing.T) {
1530
1531 if !avx512 {
1532 t.SkipNow()
1533 return
1534 }
1535
1536 const offset = 31 + 16
1537 server := NewAvx512Server()
1538
1539 // First block of 64 bytes
1540 for i := 0; i < 16; i++ {
1541 input := make([]byte, 64)
1542 copy(input, golden[offset+i].in)
1543 server.Write(uint64(Avx512ServerUid+i), input)
1544 }
1545
1546 // Second block of 64 bytes
1547 for i := 0; i < 16; i++ {
1548 input := make([]byte, 64)
1549 copy(input, golden[offset+i].in[64:])
1550 server.Write(uint64(Avx512ServerUid+i), input)
1551 }
1552
1553 wg := sync.WaitGroup{}
1554 wg.Add(16)
1555
1556 // Third and final block
1557 for i := 0; i < 16; i++ {
1558 input := make([]byte, 64)
1559 input[0] = 0x80
1560 copy(input[1:], bytes.Repeat([]byte{0}, 63-8))
1561
1562 // Length in bits.
1563 len := uint64(128)
1564 len <<= 3
1565 for ii := uint(0); ii < 8; ii++ {
1566 input[63-8+1+ii] = byte(len >> (56 - 8*ii))
1567 }
1568 go func(i int, uid uint64, input []byte) {
1569 output := server.Sum(uid, input)
1570 if bytes.Compare(output[:], golden[offset+i].out[:]) != 0 {
1571 t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[:]), hex.EncodeToString(golden[offset+i].out[:]))
1572 }
1573 wg.Done()
1574 }(i, uint64(Avx512ServerUid+i), input)
1575 }
1576
1577 wg.Wait()
1578 }
1579
1580 func TestAvx512Digest(t *testing.T) {
1581
1582 if !avx512 {
1583 t.SkipNow()
1584 return
1585 }
1586
1587 server := NewAvx512Server()
1588
1589 const tests = 16
1590 h512 := [16]hash.Hash{}
1591 for i := 0; i < tests; i++ {
1592 h512[i] = NewAvx512(server)
1593 }
1594
1595 const offset = 31 + 16
1596 for i := 0; i < tests; i++ {
1597 input := make([]byte, 64)
1598 copy(input, golden[offset+i].in)
1599 h512[i].Write(input)
1600 }
1601 for i := 0; i < tests; i++ {
1602 input := make([]byte, 64)
1603 copy(input, golden[offset+i].in[64:])
1604 h512[i].Write(input)
1605 }
1606 for i := 0; i < tests; i++ {
1607 output := h512[i].Sum([]byte{})
1608 if bytes.Compare(output[:], golden[offset+i].out[:]) != 0 {
1609 t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[:]), hex.EncodeToString(golden[offset+i].out[:]))
1610 }
1611 }
1612 }
1613
// benchmarkAvx512SingleCore writes body to every hash in h512 and only then
// calls Sum on each of them; the sums are discarded.
func benchmarkAvx512SingleCore(h512 []hash.Hash, body []byte) {
	for _, h := range h512 {
		h.Write(body)
	}
	for _, h := range h512 {
		_ = h.Sum([]byte{})
	}
}
1623
1624 func benchmarkAvx512(b *testing.B, size int) {
1625
1626 if !avx512 {
1627 b.SkipNow()
1628 return
1629 }
1630
1631 server := NewAvx512Server()
1632
1633 const tests = 16
1634 body := make([]byte, size)
1635
1636 b.SetBytes(int64(len(body) * tests))
1637 b.ResetTimer()
1638
1639 for i := 0; i < b.N; i++ {
1640 h512 := make([]hash.Hash, tests)
1641 for i := 0; i < tests; i++ {
1642 h512[i] = NewAvx512(server)
1643 }
1644
1645 benchmarkAvx512SingleCore(h512, body)
1646 }
1647 }
1648
1649 func BenchmarkAvx512_05M(b *testing.B) { benchmarkAvx512(b, 512*1024) }
1650 func BenchmarkAvx512_1M(b *testing.B) { benchmarkAvx512(b, 1*1024*1024) }
1651 func BenchmarkAvx512_5M(b *testing.B) { benchmarkAvx512(b, 5*1024*1024) }
1652 func BenchmarkAvx512_10M(b *testing.B) { benchmarkAvx512(b, 10*1024*1024) }
1653
1654 func benchmarkAvx512MultiCore(b *testing.B, size, cores int) {
1655
1656 if !avx512 {
1657 b.SkipNow()
1658 return
1659 }
1660
1661 servers := make([]*Avx512Server, cores)
1662 for c := 0; c < cores; c++ {
1663 servers[c] = NewAvx512Server()
1664 }
1665
1666 const tests = 16
1667
1668 body := make([]byte, size)
1669
1670 h512 := make([]hash.Hash, tests*cores)
1671 for i := 0; i < tests*cores; i++ {
1672 h512[i] = NewAvx512(servers[i>>4])
1673 }
1674
1675 b.SetBytes(int64(size * 16 * cores))
1676 b.ResetTimer()
1677
1678 var wg sync.WaitGroup
1679
1680 for i := 0; i < b.N; i++ {
1681 wg.Add(cores)
1682 for c := 0; c < cores; c++ {
1683 go func(c int) { benchmarkAvx512SingleCore(h512[c*tests:(c+1)*tests], body); wg.Done() }(c)
1684 }
1685 wg.Wait()
1686 }
1687 }
1688
1689 func BenchmarkAvx512_5M_2Cores(b *testing.B) { benchmarkAvx512MultiCore(b, 5*1024*1024, 2) }
1690 func BenchmarkAvx512_5M_4Cores(b *testing.B) { benchmarkAvx512MultiCore(b, 5*1024*1024, 4) }
1691 func BenchmarkAvx512_5M_6Cores(b *testing.B) { benchmarkAvx512MultiCore(b, 5*1024*1024, 6) }
1692
1693 type maskTest struct {
1694 in [16]int
1695 out [16]maskRounds
1696 }
1697
// goldenMask lists the test vectors used by TestMaskGen. Each entry gives the
// byte length of the input for every one of the 16 lanes and the expected
// sequence of mask rounds.
// Judging from the vectors, a mask round pairs a 16-bit lane mask (bit i set
// while lane i still has data) with the number of consecutive 64-byte blocks
// for which that mask applies; unused trailing rounds keep their zero value.
1698 var goldenMask = []maskTest{
1699 {[16]int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, [16]maskRounds{}},
1700 {[16]int{64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0}, [16]maskRounds{{0x5555, 1}}},
1701 {[16]int{0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64}, [16]maskRounds{{0xaaaa, 1}}},
1702 {[16]int{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, [16]maskRounds{{0xffff, 1}}},
1703 {[16]int{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, [16]maskRounds{{0xffff, 2}}},
1704 {[16]int{64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128}, [16]maskRounds{{0xffff, 1}, {0xaaaa, 1}}},
1705 {[16]int{128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64}, [16]maskRounds{{0xffff, 1}, {0x5555, 1}}},
1706 {[16]int{64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192}, [16]maskRounds{{0xffff, 1}, {0xaaaa, 2}}},
1707 // Bit patterns of the two masks expected for the next vector (most significant nibble first):
1708 // >= 64 0110=6 1011=b 1101=d 0110=6
1709 // >=128 0100=4 0010=2 1001=9 0100=4
1710 {[16]int{0, 64, 128, 0, 64, 128, 0, 64, 128, 0, 64, 128, 0, 64, 128, 0}, [16]maskRounds{{0x6db6, 1}, {0x4924, 1}}},
1711 {[16]int{1 * 64, 2 * 64, 3 * 64, 4 * 64, 5 * 64, 6 * 64, 7 * 64, 8 * 64, 9 * 64, 10 * 64, 11 * 64, 12 * 64, 13 * 64, 14 * 64, 15 * 64, 16 * 64},
1712 [16]maskRounds{{0xffff, 1}, {0xfffe, 1}, {0xfffc, 1}, {0xfff8, 1}, {0xfff0, 1}, {0xffe0, 1}, {0xffc0, 1}, {0xff80, 1},
1713 {0xff00, 1}, {0xfe00, 1}, {0xfc00, 1}, {0xf800, 1}, {0xf000, 1}, {0xe000, 1}, {0xc000, 1}, {0x8000, 1}}},
1714 {[16]int{2 * 64, 1 * 64, 3 * 64, 4 * 64, 5 * 64, 6 * 64, 7 * 64, 8 * 64, 9 * 64, 10 * 64, 11 * 64, 12 * 64, 13 * 64, 14 * 64, 15 * 64, 16 * 64},
1715 [16]maskRounds{{0xffff, 1}, {0xfffd, 1}, {0xfffc, 1}, {0xfff8, 1}, {0xfff0, 1}, {0xffe0, 1}, {0xffc0, 1}, {0xff80, 1},
1716 {0xff00, 1}, {0xfe00, 1}, {0xfc00, 1}, {0xf800, 1}, {0xf000, 1}, {0xe000, 1}, {0xc000, 1}, {0x8000, 1}}},
1717 {[16]int{10 * 64, 20 * 64, 30 * 64, 40 * 64, 50 * 64, 60 * 64, 70 * 64, 80 * 64, 90 * 64, 100 * 64, 110 * 64, 120 * 64, 130 * 64, 140 * 64, 150 * 64, 160 * 64},
1718 [16]maskRounds{{0xffff, 10}, {0xfffe, 10}, {0xfffc, 10}, {0xfff8, 10}, {0xfff0, 10}, {0xffe0, 10}, {0xffc0, 10}, {0xff80, 10},
1719 {0xff00, 10}, {0xfe00, 10}, {0xfc00, 10}, {0xf800, 10}, {0xf000, 10}, {0xe000, 10}, {0xc000, 10}, {0x8000, 10}}},
1720 {[16]int{10 * 64, 19 * 64, 27 * 64, 34 * 64, 40 * 64, 45 * 64, 49 * 64, 52 * 64, 54 * 64, 55 * 64, 57 * 64, 60 * 64, 64 * 64, 69 * 64, 75 * 64, 82 * 64},
1721 [16]maskRounds{{0xffff, 10}, {0xfffe, 9}, {0xfffc, 8}, {0xfff8, 7}, {0xfff0, 6}, {0xffe0, 5}, {0xffc0, 4}, {0xff80, 3},
1722 {0xff00, 2}, {0xfe00, 1}, {0xfc00, 2}, {0xf800, 3}, {0xf000, 4}, {0xe000, 5}, {0xc000, 6}, {0x8000, 7}}},
1723 }
1724
1725 func TestMaskGen(t *testing.T) {
1726 input := [16][]byte{}
1727 for gcase, g := range goldenMask {
1728 for i, l := range g.in {
1729 buf := make([]byte, l)
1730 input[i] = buf[:]
1731 }
1732
1733 mr := genMask(input)
1734
1735 if !reflect.DeepEqual(mr, g.out) {
1736 t.Fatalf("case %d: got %04x\n want %04x", gcase, mr, g.out)
1737 }
1738 }
1739 }
0
1 // 16x Parallel implementation of SHA256 for AVX512
2
3 //
4 // Minio Cloud Storage, (C) 2017 Minio, Inc.
5 //
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 //
10 // http://www.apache.org/licenses/LICENSE-2.0
11 //
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17
18 //
19 // This code is based on the Intel Multi-Buffer Crypto for IPSec library
20 // and more specifically the following implementation:
21 // https://github.com/intel/intel-ipsec-mb/blob/master/avx512/sha256_x16_avx512.asm
22 //
23 // For Golang it has been converted into Plan 9 assembly with the help of
24 // github.com/minio/asm2plan9s to assemble the AVX512 instructions
25 //
26
27 // Copyright (c) 2017, Intel Corporation
28 //
29 // Redistribution and use in source and binary forms, with or without
30 // modification, are permitted provided that the following conditions are met:
31 //
32 // * Redistributions of source code must retain the above copyright notice,
33 // this list of conditions and the following disclaimer.
34 // * Redistributions in binary form must reproduce the above copyright
35 // notice, this list of conditions and the following disclaimer in the
36 // documentation and/or other materials provided with the distribution.
37 // * Neither the name of Intel Corporation nor the names of its contributors
38 // may be used to endorse or promote products derived from this software
39 // without specific prior written permission.
40 //
41 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
42 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
44 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
45 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
47 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
48 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51
// Size in bytes of one digest row. The state is loaded and stored as eight
// consecutive rows (A..H), each row being one 64-byte ZMM register.
52 #define SHA256_DIGEST_ROW_SIZE 64
53
// General purpose registers. Several registers are named twice: the lower-case
// Intel spelling is used inside the AVX512 instructions, the *_P9 spelling is
// used by instructions written in Go assembler syntax (MOVQ, LEAQ, ...).
54 // arg1: pointer to the digest rows
55 #define STATE rdi
56 #define STATE_P9 DI
57 // arg2: number of blocks left to process
58 #define INP_SIZE rsi
59 #define INP_SIZE_P9 SI
60
// IDX is the byte offset of the current block within every lane's input.
// TBL points to the table currently being read.
61 #define IDX rcx
62 #define TBL rdx
63 #define TBL_P9 DX
64
// INPUT holds the address of the inputs argument (array of slice headers).
65 #define INPUT rax
66 #define INPUT_P9 AX
67
// inp0 receives the data pointer of the lane being loaded, SCRATCH points to the
// area where the digests are saved, maskp points to the current mask word and
// mask holds its value.
68 #define inp0 r9
69 #define SCRATCH_P9 R12
70 #define SCRATCH r12
71 #define maskp r13
72 #define MASKP_P9 R13
73 #define mask r14
74 #define MASK_P9 R14
75
// Working state A..H (one 32-bit word per lane) and temporaries.
76 #define A zmm0
77 #define B zmm1
78 #define C zmm2
79 #define D zmm3
80 #define E zmm4
81 #define F zmm5
82 #define G zmm6
83 #define H zmm7
84 #define T1 zmm8
85 #define TMP0 zmm9
86 #define TMP1 zmm10
87 #define TMP2 zmm11
88 #define TMP3 zmm12
89 #define TMP4 zmm13
90 #define TMP5 zmm14
91 #define TMP6 zmm15
92
// Message schedule: a rolling window of 16 words, one register per word.
93 #define W0 zmm16
94 #define W1 zmm17
95 #define W2 zmm18
96 #define W3 zmm19
97 #define W4 zmm20
98 #define W5 zmm21
99 #define W6 zmm22
100 #define W7 zmm23
101 #define W8 zmm24
102 #define W9 zmm25
103 #define W10 zmm26
104 #define W11 zmm27
105 #define W12 zmm28
106 #define W13 zmm29
107 #define W14 zmm30
108 #define W15 zmm31
109
110
// TRANSPOSE16 transposes a 16x16 matrix of 32-bit words held in _r0.._r15, so
// that on exit register _rN holds word N of each of the 16 input rows (see the
// input/output diagrams below). _t0 and _t1 are temporaries. The macro also
// overwrites BX and R8, which it uses to address the two permutation tables.
111 #define TRANSPOSE16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _r10, _r11, _r12, _r13, _r14, _r15, _t0, _t1) \
112 \
113 \ // input r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
114 \ // r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
115 \ // r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
116 \ // r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
117 \ // r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
118 \ // r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
119 \ // r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
120 \ // r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
121 \ // r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
122 \ // r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
123 \ // r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
124 \ // r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
125 \ // r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
126 \ // r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
127 \ // r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
128 \ // r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
129 \
130 \ // output r0 = { p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
131 \ // r1 = { p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
132 \ // r2 = { p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
133 \ // r3 = { p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
134 \ // r4 = { p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
135 \ // r5 = { p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
136 \ // r6 = { p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
137 \ // r7 = { p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
138 \ // r8 = { p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
139 \ // r9 = { p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
140 \ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
141 \ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
142 \ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
143 \ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
144 \ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
145 \ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
146 \
147 \ // process top half
148 vshufps _t0, _r0, _r1, 0x44 \ // t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
149 vshufps _r0, _r0, _r1, 0xEE \ // r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
150 vshufps _t1, _r2, _r3, 0x44 \ // t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
151 vshufps _r2, _r2, _r3, 0xEE \ // r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
152 \
153 vshufps _r3, _t0, _t1, 0xDD \ // r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
154 vshufps _r1, _r0, _r2, 0x88 \ // r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
155 vshufps _r0, _r0, _r2, 0xDD \ // r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
156 vshufps _t0, _t0, _t1, 0x88 \ // t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
157 \
158 \ // use r2 in place of t0
159 vshufps _r2, _r4, _r5, 0x44 \ // r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
160 vshufps _r4, _r4, _r5, 0xEE \ // r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
161 vshufps _t1, _r6, _r7, 0x44 \ // t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
162 vshufps _r6, _r6, _r7, 0xEE \ // r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
163 \
164 vshufps _r7, _r2, _t1, 0xDD \ // r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
165 vshufps _r5, _r4, _r6, 0x88 \ // r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
166 vshufps _r4, _r4, _r6, 0xDD \ // r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
167 vshufps _r2, _r2, _t1, 0x88 \ // r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
168 \
169 \ // use r6 in place of t0
170 vshufps _r6, _r8, _r9, 0x44 \ // r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
171 vshufps _r8, _r8, _r9, 0xEE \ // r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
172 vshufps _t1, _r10, _r11, 0x44 \ // t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
173 vshufps _r10, _r10, _r11, 0xEE \ // r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
174 \
175 vshufps _r11, _r6, _t1, 0xDD \ // r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
176 vshufps _r9, _r8, _r10, 0x88 \ // r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
177 vshufps _r8, _r8, _r10, 0xDD \ // r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
178 vshufps _r6, _r6, _t1, 0x88 \ // r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
179 \
180 \ // use r10 in place of t0
181 vshufps _r10, _r12, _r13, 0x44 \ // r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
182 vshufps _r12, _r12, _r13, 0xEE \ // r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
183 vshufps _t1, _r14, _r15, 0x44 \ // t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
184 vshufps _r14, _r14, _r15, 0xEE \ // r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
185 \
186 vshufps _r15, _r10, _t1, 0xDD \ // r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
187 vshufps _r13, _r12, _r14, 0x88 \ // r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
188 vshufps _r12, _r12, _r14, 0xDD \ // r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
189 vshufps _r10, _r10, _t1, 0x88 \ // r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
190 \
191 \ // At this point, the registers that contain interesting data are:
192 \ // t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
193 \ // Can use t1 and r14 as scratch registers
194 LEAQ PSHUFFLE_TRANSPOSE16_MASK1<>(SB), BX \
195 LEAQ PSHUFFLE_TRANSPOSE16_MASK2<>(SB), R8 \
196 \
197 vmovdqu32 _r14, [rbx] \
198 vpermi2q _r14, _t0, _r2 \ // r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
199 vmovdqu32 _t1, [r8] \
200 vpermi2q _t1, _t0, _r2 \ // t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
201 \
202 vmovdqu32 _r2, [rbx] \
203 vpermi2q _r2, _r3, _r7 \ // r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
204 vmovdqu32 _t0, [r8] \
205 vpermi2q _t0, _r3, _r7 \ // t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
206 \
207 vmovdqu32 _r3, [rbx] \
208 vpermi2q _r3, _r1, _r5 \ // r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
209 vmovdqu32 _r7, [r8] \
210 vpermi2q _r7, _r1, _r5 \ // r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
211 \
212 vmovdqu32 _r1, [rbx] \
213 vpermi2q _r1, _r0, _r4 \ // r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
214 vmovdqu32 _r5, [r8] \
215 vpermi2q _r5, _r0, _r4 \ // r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
216 \
217 vmovdqu32 _r0, [rbx] \
218 vpermi2q _r0, _r6, _r10 \ // r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
219 vmovdqu32 _r4, [r8] \
220 vpermi2q _r4, _r6, _r10 \ // r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
221 \
222 vmovdqu32 _r6, [rbx] \
223 vpermi2q _r6, _r11, _r15 \ // r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
224 vmovdqu32 _r10, [r8] \
225 vpermi2q _r10, _r11, _r15 \ // r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
226 \
227 vmovdqu32 _r11, [rbx] \
228 vpermi2q _r11, _r9, _r13 \ // r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
229 vmovdqu32 _r15, [r8] \
230 vpermi2q _r15, _r9, _r13 \ // r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
231 \
232 vmovdqu32 _r9, [rbx] \
233 vpermi2q _r9, _r8, _r12 \ // r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
234 vmovdqu32 _r13, [r8] \
235 vpermi2q _r13, _r8, _r12 \ // r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
236 \
237 \ // At this point r8 and r12 can be used as scratch registers
238 vshuff64x2 _r8, _r14, _r0, 0xEE \ // r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
239 vshuff64x2 _r0, _r14, _r0, 0x44 \ // r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
240 \
241 vshuff64x2 _r12, _t1, _r4, 0xEE \ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
242 vshuff64x2 _r4, _t1, _r4, 0x44 \ // r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
243 \
244 vshuff64x2 _r14, _r7, _r15, 0xEE \ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
245 vshuff64x2 _t1, _r7, _r15, 0x44 \ // t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
246 \
247 vshuff64x2 _r15, _r5, _r13, 0xEE \ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
248 vshuff64x2 _r7, _r5, _r13, 0x44 \ // r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
249 \
250 vshuff64x2 _r13, _t0, _r10, 0xEE \ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
251 vshuff64x2 _r5, _t0, _r10, 0x44 \ // r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
252 \
253 vshuff64x2 _r10, _r3, _r11, 0xEE \ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
254 vshuff64x2 _t0, _r3, _r11, 0x44 \ // t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
255 \
256 vshuff64x2 _r11, _r1, _r9, 0xEE \ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
257 vshuff64x2 _r3, _r1, _r9, 0x44 \ // r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
258 \
259 vshuff64x2 _r9, _r2, _r6, 0xEE \ // r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
260 vshuff64x2 _r1, _r2, _r6, 0x44 \ // r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
261 \
262 vmovdqu32 _r2, _t0 \ // r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
263 vmovdqu32 _r6, _t1 \ // r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
264
265
266 // CH(E, F, G) = (E&F) ^ (~E&G)
267 // MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
268 // SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
269 // SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
270 // sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
271 // sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
272
273 // Main processing loop per round
// PROCESS_LOOP performs one SHA256 round on all 16 lanes.
//   _WT      message schedule word for this round
//   _ROUND   round number, used only to address the next round constant
//   _A.._H   registers currently holding the working variables a..h
// On entry TMP3 must hold the round constant Kt for this round; on exit TMP3
// holds the constant for round _ROUND+1 (for _ROUND = 63 this reads the row
// following the last constant, and the loaded value is not used). The macro
// rewrites _D (which becomes the new e) and _H (which becomes the new a); the
// caller rotates the register arguments instead of moving data. T1 and
// TMP0..TMP3 are clobbered.
274 #define PROCESS_LOOP(_WT, _ROUND, _A, _B, _C, _D, _E, _F, _G, _H) \
275 \ // T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
276 \ // T2 = SIGMA0(A) + MAJ(A, B, C)
277 \ // H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
278 \
279 \ // H becomes T2, then add T1 for A
280 \ // D becomes D + T1 for E
281 \
282 vpaddd T1, _H, TMP3 \ // T1 = H + Kt
283 vmovdqu32 TMP0, _E \
284 vprord TMP1, _E, 6 \ // ROR_6(E)
285 vprord TMP2, _E, 11 \ // ROR_11(E)
286 vprord TMP3, _E, 25 \ // ROR_25(E)
287 vpternlogd TMP0, _F, _G, 0xCA \ // TMP0 = CH(E,F,G)
288 vpaddd T1, T1, _WT \ // T1 = T1 + Wt
289 vpternlogd TMP1, TMP2, TMP3, 0x96 \ // TMP1 = SIGMA1(E)
290 vpaddd T1, T1, TMP0 \ // T1 = T1 + CH(E,F,G)
291 vpaddd T1, T1, TMP1 \ // T1 = T1 + SIGMA1(E)
292 vpaddd _D, _D, T1 \ // D = D + T1
293 \
294 vprord _H, _A, 2 \ // ROR_2(A)
295 vprord TMP2, _A, 13 \ // ROR_13(A)
296 vprord TMP3, _A, 22 \ // ROR_22(A)
297 vmovdqu32 TMP0, _A \
298 vpternlogd TMP0, _B, _C, 0xE8 \ // TMP0 = MAJ(A,B,C)
299 vpternlogd _H, TMP2, TMP3, 0x96 \ // H(T2) = SIGMA0(A)
300 vpaddd _H, _H, TMP0 \ // H(T2) = SIGMA0(A) + MAJ(A,B,C)
301 vpaddd _H, _H, T1 \ // H(A) = H(T2) + T1
302 \
303 vmovdqu32 TMP3, [TBL + ((_ROUND+1)*64)] \ // Next Kt
304
305
// MSG_SCHED_ROUND_16_63 computes the next message schedule word in place.
//   _WT     holds Wt-16 on entry and the new Wt on exit
//   _WTp1   holds Wt-15
//   _WTp9   holds Wt-7
//   _WTp14  holds Wt-2
// TMP4, TMP5 and TMP6 are clobbered.
306 #define MSG_SCHED_ROUND_16_63(_WT, _WTp1, _WTp9, _WTp14) \
307 vprord TMP4, _WTp14, 17 \ // ROR_17(Wt-2)
308 vprord TMP5, _WTp14, 19 \ // ROR_19(Wt-2)
309 vpsrld TMP6, _WTp14, 10 \ // SHR_10(Wt-2)
310 vpternlogd TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma1(Wt-2)
311 \
312 vpaddd _WT, _WT, TMP4 \ // Wt = Wt-16 + sigma1(Wt-2)
313 vpaddd _WT, _WT, _WTp9 \ // Wt = Wt-16 + sigma1(Wt-2) + Wt-7
314 \
315 vprord TMP4, _WTp1, 7 \ // ROR_7(Wt-15)
316 vprord TMP5, _WTp1, 18 \ // ROR_18(Wt-15)
317 vpsrld TMP6, _WTp1, 3 \ // SHR_3(Wt-15)
318 vpternlogd TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma0(Wt-15)
319 \
320 vpaddd _WT, _WT, TMP4 \ // Wt = Wt-16 + sigma1(Wt-2) +
321 \ // Wt-7 + sigma0(Wt-15)
322
323
324 // Note this is reading in a block of data for one lane
325 // When all 16 are read, the data must be transposed to build msg schedule
//
// MSG_SCHED_ROUND_00_15 loads the next 64-byte block of lane OFFSET into _WT.
// If bit OFFSET of the current mask word is clear the load is skipped and _WT
// is left unchanged. Otherwise the data pointer is read from the lane's slice
// header (OFFSET*24 bytes into the inputs array) into R9 (inp0), and 64 bytes
// are loaded from that pointer plus IDX. LABEL must be unique per expansion.
326 #define MSG_SCHED_ROUND_00_15(_WT, OFFSET, LABEL) \
327 TESTQ $(1<<OFFSET), MASK_P9 \
328 JE LABEL \
329 MOVQ OFFSET*24(INPUT_P9), R9 \
330 vmovups _WT, [inp0+IDX] \
331 LABEL: \
332
// MASKED_LOAD loads the 64-byte block at offset IDX of lane OFFSET into _WT,
// unless bit OFFSET of the current mask word is clear, in which case _WT is left
// unchanged. The lane's data pointer is fetched from its slice header (24 bytes
// per lane in the inputs array) into R9 (inp0). LABEL must be unique per
// expansion.
333 #define MASKED_LOAD(_WT, OFFSET, LABEL) \
334 TESTQ $(1<<OFFSET), MASK_P9 \
335 JE LABEL \
336 MOVQ OFFSET*24(INPUT_P9), R9 \
337 vmovups _WT,[inp0+IDX] \
338 LABEL: \
339
// sha256_x16_avx512 runs the SHA256 block function over up to 16 lanes at once.
//
// Arguments, as read from the frame below:
//   digests+0(FP)    pointer to the 8 x 64-byte digest rows; read on entry, written back on exit
//   scratch+8(FP)    pointer to 8 x 64 bytes used to save the digests while a block is processed
//   table+16(FP)     pointer to the round constants, one 64-byte row per round
//   mask+24(FP)      pointer to the lane masks, one 64-bit word per block
//   mask_len+32(FP)  number of blocks to process
//   inputs+48(FP)    16 slice headers (24 bytes each), one input per lane
//
// For every block, bit i of the block's mask word decides whether lane i loads
// new input and whether the new state is added to lane i's saved digest; lanes
// with a clear bit keep their saved digest.
340 TEXT ·sha256_x16_avx512(SB), 7, $0
341 MOVQ digests+0(FP), STATE_P9 // pointer to the digest rows
342 MOVQ scratch+8(FP), SCRATCH_P9
343 MOVQ mask_len+32(FP), INP_SIZE_P9 // number of blocks to process
344 MOVQ mask+24(FP), MASKP_P9
345 MOVQ (MASKP_P9), MASK_P9
346 kmovq k1, mask
347 LEAQ inputs+48(FP), INPUT_P9
348
349 // Initialize digests
350 vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
351 vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
352 vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
353 vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
354 vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
355 vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
356 vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
357 vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE]
358
359 MOVQ table+16(FP), TBL_P9
360
361 xor IDX, IDX
362
363 // Read in first block of input data
364 MASKED_LOAD( W0, 0, skipInput0)
365 MASKED_LOAD( W1, 1, skipInput1)
366 MASKED_LOAD( W2, 2, skipInput2)
367 MASKED_LOAD( W3, 3, skipInput3)
368 MASKED_LOAD( W4, 4, skipInput4)
369 MASKED_LOAD( W5, 5, skipInput5)
370 MASKED_LOAD( W6, 6, skipInput6)
371 MASKED_LOAD( W7, 7, skipInput7)
372 MASKED_LOAD( W8, 8, skipInput8)
373 MASKED_LOAD( W9, 9, skipInput9)
374 MASKED_LOAD(W10, 10, skipInput10)
375 MASKED_LOAD(W11, 11, skipInput11)
376 MASKED_LOAD(W12, 12, skipInput12)
377 MASKED_LOAD(W13, 13, skipInput13)
378 MASKED_LOAD(W14, 14, skipInput14)
379 MASKED_LOAD(W15, 15, skipInput15)
380
381 lloop:
// TMP2 = byte shuffle mask applied to the input words after the transpose
382 LEAQ PSHUFFLE_BYTE_FLIP_MASK<>(SB), TBL_P9
383 vmovdqu32 TMP2, [TBL]
384
385 // Get first K from table
386 MOVQ table+16(FP), TBL_P9
387 vmovdqu32 TMP3, [TBL]
388
389 // Save digests for later addition
390 vmovdqu32 [SCRATCH + 64*0], A
391 vmovdqu32 [SCRATCH + 64*1], B
392 vmovdqu32 [SCRATCH + 64*2], C
393 vmovdqu32 [SCRATCH + 64*3], D
394 vmovdqu32 [SCRATCH + 64*4], E
395 vmovdqu32 [SCRATCH + 64*5], F
396 vmovdqu32 [SCRATCH + 64*6], G
397 vmovdqu32 [SCRATCH + 64*7], H
398
// Advance the input offset to the following block
399 add IDX, 64
400
401 // Transpose input data
402 TRANSPOSE16(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1)
403
404 vpshufb W0, W0, TMP2
405 vpshufb W1, W1, TMP2
406 vpshufb W2, W2, TMP2
407 vpshufb W3, W3, TMP2
408 vpshufb W4, W4, TMP2
409 vpshufb W5, W5, TMP2
410 vpshufb W6, W6, TMP2
411 vpshufb W7, W7, TMP2
412 vpshufb W8, W8, TMP2
413 vpshufb W9, W9, TMP2
414 vpshufb W10, W10, TMP2
415 vpshufb W11, W11, TMP2
416 vpshufb W12, W12, TMP2
417 vpshufb W13, W13, TMP2
418 vpshufb W14, W14, TMP2
419 vpshufb W15, W15, TMP2
420
421 // MSG Schedule for W0-W15 is now complete in registers
422 // Process first 48 rounds
423 // Calculate next Wt+16 after processing is complete and Wt is unneeded
424
425 PROCESS_LOOP( W0, 0, A, B, C, D, E, F, G, H)
426 MSG_SCHED_ROUND_16_63( W0, W1, W9, W14)
427 PROCESS_LOOP( W1, 1, H, A, B, C, D, E, F, G)
428 MSG_SCHED_ROUND_16_63( W1, W2, W10, W15)
429 PROCESS_LOOP( W2, 2, G, H, A, B, C, D, E, F)
430 MSG_SCHED_ROUND_16_63( W2, W3, W11, W0)
431 PROCESS_LOOP( W3, 3, F, G, H, A, B, C, D, E)
432 MSG_SCHED_ROUND_16_63( W3, W4, W12, W1)
433 PROCESS_LOOP( W4, 4, E, F, G, H, A, B, C, D)
434 MSG_SCHED_ROUND_16_63( W4, W5, W13, W2)
435 PROCESS_LOOP( W5, 5, D, E, F, G, H, A, B, C)
436 MSG_SCHED_ROUND_16_63( W5, W6, W14, W3)
437 PROCESS_LOOP( W6, 6, C, D, E, F, G, H, A, B)
438 MSG_SCHED_ROUND_16_63( W6, W7, W15, W4)
439 PROCESS_LOOP( W7, 7, B, C, D, E, F, G, H, A)
440 MSG_SCHED_ROUND_16_63( W7, W8, W0, W5)
441 PROCESS_LOOP( W8, 8, A, B, C, D, E, F, G, H)
442 MSG_SCHED_ROUND_16_63( W8, W9, W1, W6)
443 PROCESS_LOOP( W9, 9, H, A, B, C, D, E, F, G)
444 MSG_SCHED_ROUND_16_63( W9, W10, W2, W7)
445 PROCESS_LOOP(W10, 10, G, H, A, B, C, D, E, F)
446 MSG_SCHED_ROUND_16_63(W10, W11, W3, W8)
447 PROCESS_LOOP(W11, 11, F, G, H, A, B, C, D, E)
448 MSG_SCHED_ROUND_16_63(W11, W12, W4, W9)
449 PROCESS_LOOP(W12, 12, E, F, G, H, A, B, C, D)
450 MSG_SCHED_ROUND_16_63(W12, W13, W5, W10)
451 PROCESS_LOOP(W13, 13, D, E, F, G, H, A, B, C)
452 MSG_SCHED_ROUND_16_63(W13, W14, W6, W11)
453 PROCESS_LOOP(W14, 14, C, D, E, F, G, H, A, B)
454 MSG_SCHED_ROUND_16_63(W14, W15, W7, W12)
455 PROCESS_LOOP(W15, 15, B, C, D, E, F, G, H, A)
456 MSG_SCHED_ROUND_16_63(W15, W0, W8, W13)
457 PROCESS_LOOP( W0, 16, A, B, C, D, E, F, G, H)
458 MSG_SCHED_ROUND_16_63( W0, W1, W9, W14)
459 PROCESS_LOOP( W1, 17, H, A, B, C, D, E, F, G)
460 MSG_SCHED_ROUND_16_63( W1, W2, W10, W15)
461 PROCESS_LOOP( W2, 18, G, H, A, B, C, D, E, F)
462 MSG_SCHED_ROUND_16_63( W2, W3, W11, W0)
463 PROCESS_LOOP( W3, 19, F, G, H, A, B, C, D, E)
464 MSG_SCHED_ROUND_16_63( W3, W4, W12, W1)
465 PROCESS_LOOP( W4, 20, E, F, G, H, A, B, C, D)
466 MSG_SCHED_ROUND_16_63( W4, W5, W13, W2)
467 PROCESS_LOOP( W5, 21, D, E, F, G, H, A, B, C)
468 MSG_SCHED_ROUND_16_63( W5, W6, W14, W3)
469 PROCESS_LOOP( W6, 22, C, D, E, F, G, H, A, B)
470 MSG_SCHED_ROUND_16_63( W6, W7, W15, W4)
471 PROCESS_LOOP( W7, 23, B, C, D, E, F, G, H, A)
472 MSG_SCHED_ROUND_16_63( W7, W8, W0, W5)
473 PROCESS_LOOP( W8, 24, A, B, C, D, E, F, G, H)
474 MSG_SCHED_ROUND_16_63( W8, W9, W1, W6)
475 PROCESS_LOOP( W9, 25, H, A, B, C, D, E, F, G)
476 MSG_SCHED_ROUND_16_63( W9, W10, W2, W7)
477 PROCESS_LOOP(W10, 26, G, H, A, B, C, D, E, F)
478 MSG_SCHED_ROUND_16_63(W10, W11, W3, W8)
479 PROCESS_LOOP(W11, 27, F, G, H, A, B, C, D, E)
480 MSG_SCHED_ROUND_16_63(W11, W12, W4, W9)
481 PROCESS_LOOP(W12, 28, E, F, G, H, A, B, C, D)
482 MSG_SCHED_ROUND_16_63(W12, W13, W5, W10)
483 PROCESS_LOOP(W13, 29, D, E, F, G, H, A, B, C)
484 MSG_SCHED_ROUND_16_63(W13, W14, W6, W11)
485 PROCESS_LOOP(W14, 30, C, D, E, F, G, H, A, B)
486 MSG_SCHED_ROUND_16_63(W14, W15, W7, W12)
487 PROCESS_LOOP(W15, 31, B, C, D, E, F, G, H, A)
488 MSG_SCHED_ROUND_16_63(W15, W0, W8, W13)
489 PROCESS_LOOP( W0, 32, A, B, C, D, E, F, G, H)
490 MSG_SCHED_ROUND_16_63( W0, W1, W9, W14)
491 PROCESS_LOOP( W1, 33, H, A, B, C, D, E, F, G)
492 MSG_SCHED_ROUND_16_63( W1, W2, W10, W15)
493 PROCESS_LOOP( W2, 34, G, H, A, B, C, D, E, F)
494 MSG_SCHED_ROUND_16_63( W2, W3, W11, W0)
495 PROCESS_LOOP( W3, 35, F, G, H, A, B, C, D, E)
496 MSG_SCHED_ROUND_16_63( W3, W4, W12, W1)
497 PROCESS_LOOP( W4, 36, E, F, G, H, A, B, C, D)
498 MSG_SCHED_ROUND_16_63( W4, W5, W13, W2)
499 PROCESS_LOOP( W5, 37, D, E, F, G, H, A, B, C)
500 MSG_SCHED_ROUND_16_63( W5, W6, W14, W3)
501 PROCESS_LOOP( W6, 38, C, D, E, F, G, H, A, B)
502 MSG_SCHED_ROUND_16_63( W6, W7, W15, W4)
503 PROCESS_LOOP( W7, 39, B, C, D, E, F, G, H, A)
504 MSG_SCHED_ROUND_16_63( W7, W8, W0, W5)
505 PROCESS_LOOP( W8, 40, A, B, C, D, E, F, G, H)
506 MSG_SCHED_ROUND_16_63( W8, W9, W1, W6)
507 PROCESS_LOOP( W9, 41, H, A, B, C, D, E, F, G)
508 MSG_SCHED_ROUND_16_63( W9, W10, W2, W7)
509 PROCESS_LOOP(W10, 42, G, H, A, B, C, D, E, F)
510 MSG_SCHED_ROUND_16_63(W10, W11, W3, W8)
511 PROCESS_LOOP(W11, 43, F, G, H, A, B, C, D, E)
512 MSG_SCHED_ROUND_16_63(W11, W12, W4, W9)
513 PROCESS_LOOP(W12, 44, E, F, G, H, A, B, C, D)
514 MSG_SCHED_ROUND_16_63(W12, W13, W5, W10)
515 PROCESS_LOOP(W13, 45, D, E, F, G, H, A, B, C)
516 MSG_SCHED_ROUND_16_63(W13, W14, W6, W11)
517 PROCESS_LOOP(W14, 46, C, D, E, F, G, H, A, B)
518 MSG_SCHED_ROUND_16_63(W14, W15, W7, W12)
519 PROCESS_LOOP(W15, 47, B, C, D, E, F, G, H, A)
520 MSG_SCHED_ROUND_16_63(W15, W0, W8, W13)
521
522 // Check if this is the last block
523 sub INP_SIZE, 1
524 JE lastLoop
525
526 // Load next mask for inputs
527 ADDQ $8, MASKP_P9
528 MOVQ (MASKP_P9), MASK_P9
529
530 // Process last 16 rounds
531 // Read in next block msg data for use in first 16 words of msg sched
532
533 PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
534 MSG_SCHED_ROUND_00_15( W0, 0, skipNext0)
535 PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
536 MSG_SCHED_ROUND_00_15( W1, 1, skipNext1)
537 PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
538 MSG_SCHED_ROUND_00_15( W2, 2, skipNext2)
539 PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
540 MSG_SCHED_ROUND_00_15( W3, 3, skipNext3)
541 PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
542 MSG_SCHED_ROUND_00_15( W4, 4, skipNext4)
543 PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
544 MSG_SCHED_ROUND_00_15( W5, 5, skipNext5)
545 PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
546 MSG_SCHED_ROUND_00_15( W6, 6, skipNext6)
547 PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
548 MSG_SCHED_ROUND_00_15( W7, 7, skipNext7)
549 PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
550 MSG_SCHED_ROUND_00_15( W8, 8, skipNext8)
551 PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
552 MSG_SCHED_ROUND_00_15( W9, 9, skipNext9)
553 PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
554 MSG_SCHED_ROUND_00_15(W10, 10, skipNext10)
555 PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
556 MSG_SCHED_ROUND_00_15(W11, 11, skipNext11)
557 PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
558 MSG_SCHED_ROUND_00_15(W12, 12, skipNext12)
559 PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
560 MSG_SCHED_ROUND_00_15(W13, 13, skipNext13)
561 PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
562 MSG_SCHED_ROUND_00_15(W14, 14, skipNext14)
563 PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)
564 MSG_SCHED_ROUND_00_15(W15, 15, skipNext15)
565
566 // Add old digest
// k1 still holds the mask of the block just processed: lanes selected by k1 get
// saved digest + new state, all other lanes keep the saved digest.
567 vmovdqu32 TMP2, A
568 vmovdqu32 A, [SCRATCH + 64*0]
569 vpaddd A{k1}, A, TMP2
570 vmovdqu32 TMP2, B
571 vmovdqu32 B, [SCRATCH + 64*1]
572 vpaddd B{k1}, B, TMP2
573 vmovdqu32 TMP2, C
574 vmovdqu32 C, [SCRATCH + 64*2]
575 vpaddd C{k1}, C, TMP2
576 vmovdqu32 TMP2, D
577 vmovdqu32 D, [SCRATCH + 64*3]
578 vpaddd D{k1}, D, TMP2
579 vmovdqu32 TMP2, E
580 vmovdqu32 E, [SCRATCH + 64*4]
581 vpaddd E{k1}, E, TMP2
582 vmovdqu32 TMP2, F
583 vmovdqu32 F, [SCRATCH + 64*5]
584 vpaddd F{k1}, F, TMP2
585 vmovdqu32 TMP2, G
586 vmovdqu32 G, [SCRATCH + 64*6]
587 vpaddd G{k1}, G, TMP2
588 vmovdqu32 TMP2, H
589 vmovdqu32 H, [SCRATCH + 64*7]
590 vpaddd H{k1}, H, TMP2
591
// Switch k1 to the mask of the block that was just loaded
592 kmovq k1, mask
593 JMP lloop
594
595 lastLoop:
596 // Process last 16 rounds
597 PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
598 PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
599 PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
600 PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
601 PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
602 PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
603 PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
604 PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
605 PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
606 PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
607 PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
608 PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
609 PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
610 PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
611 PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
612 PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)
613
614 // Add old digest
615 vmovdqu32 TMP2, A
616 vmovdqu32 A, [SCRATCH + 64*0]
617 vpaddd A{k1}, A, TMP2
618 vmovdqu32 TMP2, B
619 vmovdqu32 B, [SCRATCH + 64*1]
620 vpaddd B{k1}, B, TMP2
621 vmovdqu32 TMP2, C
622 vmovdqu32 C, [SCRATCH + 64*2]
623 vpaddd C{k1}, C, TMP2
624 vmovdqu32 TMP2, D
625 vmovdqu32 D, [SCRATCH + 64*3]
626 vpaddd D{k1}, D, TMP2
627 vmovdqu32 TMP2, E
628 vmovdqu32 E, [SCRATCH + 64*4]
629 vpaddd E{k1}, E, TMP2
630 vmovdqu32 TMP2, F
631 vmovdqu32 F, [SCRATCH + 64*5]
632 vpaddd F{k1}, F, TMP2
633 vmovdqu32 TMP2, G
634 vmovdqu32 G, [SCRATCH + 64*6]
635 vpaddd G{k1}, G, TMP2
636 vmovdqu32 TMP2, H
637 vmovdqu32 H, [SCRATCH + 64*7]
638 vpaddd H{k1}, H, TMP2
639
640 // Write out digest
641 vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A
642 vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B
643 vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C
644 vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D
645 vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E
646 vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F
647 vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G
648 vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H
649
650 VZEROUPPER
651 RET
652
653 //
654 // Tables
655 //
656
// PSHUFFLE_BYTE_FLIP_MASK is the 64-byte vpshufb control applied to every
// message word register after the transpose. The same 16-byte pattern is
// repeated four times and reverses the byte order within each 32-bit word.
657 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x000(SB)/8, $0x0405060700010203
658 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x008(SB)/8, $0x0c0d0e0f08090a0b
659 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x010(SB)/8, $0x0405060700010203
660 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x018(SB)/8, $0x0c0d0e0f08090a0b
661 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x020(SB)/8, $0x0405060700010203
662 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x028(SB)/8, $0x0c0d0e0f08090a0b
663 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x030(SB)/8, $0x0405060700010203
664 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x038(SB)/8, $0x0c0d0e0f08090a0b
665 GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), 8, $64
666
// PSHUFFLE_TRANSPOSE16_MASK1 holds the eight 64-bit indices that TRANSPOSE16
// loads as the vpermi2q control: quadwords 0,1 and 4,5 of each of the two
// sources (indices of 8 and above select from the second source).
667 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x000(SB)/8, $0x0000000000000000
668 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x008(SB)/8, $0x0000000000000001
669 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x010(SB)/8, $0x0000000000000008
670 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x018(SB)/8, $0x0000000000000009
671 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x020(SB)/8, $0x0000000000000004
672 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x028(SB)/8, $0x0000000000000005
673 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x030(SB)/8, $0x000000000000000C
674 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x038(SB)/8, $0x000000000000000D
675 GLOBL PSHUFFLE_TRANSPOSE16_MASK1<>(SB), 8, $64
676
// PSHUFFLE_TRANSPOSE16_MASK2 is the companion vpermi2q control selecting
// quadwords 2,3 and 6,7 of each of the two sources.
677 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x000(SB)/8, $0x0000000000000002
678 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x008(SB)/8, $0x0000000000000003
679 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x010(SB)/8, $0x000000000000000A
680 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x018(SB)/8, $0x000000000000000B
681 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x020(SB)/8, $0x0000000000000006
682 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x028(SB)/8, $0x0000000000000007
683 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x030(SB)/8, $0x000000000000000E
684 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x038(SB)/8, $0x000000000000000F
685 GLOBL PSHUFFLE_TRANSPOSE16_MASK2<>(SB), 8, $64
0 //+build !noasm
1
2 /*
3 * Minio Cloud Storage, (C) 2017 Minio, Inc.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package sha256
19
20 import (
21 "encoding/binary"
22 "errors"
23 "hash"
24 "sort"
25 "sync/atomic"
26 "time"
27 )
28
// sha256_x16_avx512 has no Go body; it is the assembly routine declared as
// TEXT ·sha256_x16_avx512. blockAvx512 calls it with the 512-byte digest
// state, a 512-byte scratch buffer, the constant table, one mask word per
// round (bit i selects lane i) and the 16 per-lane input slices.
29 //go:noescape
30 func sha256_x16_avx512(digests *[512]byte, scratch *[512]byte, table *[4096]uint64, mask []uint64, inputs [16][]byte)
31
32 // Do not start at 0 but next multiple of 16 so as to be able to
33 // differentiate with default initialization value of 0
// NOTE(review): nothing in this file seeds uidCounter from this constant;
// the counter starts at its zero value, so the first uid handed out is 1.
34 const Avx512ServerUid = 16
35
// uidCounter is incremented atomically by NewAvx512 to give each digest its uid.
36 var uidCounter uint64
37
38 func NewAvx512(a512srv *Avx512Server) hash.Hash {
39 uid := atomic.AddUint64(&uidCounter, 1)
40 return &Avx512Digest{uid: uid, a512srv: a512srv}
41 }
42
43 // Avx512Digest is the type for computing SHA256 using AVX512; it forwards its input to an Avx512Server.
44 type Avx512Digest struct {
45 uid uint64 // identifier attached to every message sent to the server
46 a512srv *Avx512Server // server that receives this digest's blocks
47 x [chunk]byte // buffered input that does not yet fill a whole chunk
48 nx int // number of valid bytes in x
49 len uint64 // total number of bytes written so far
50 final bool // set by Sum once result has been computed
51 result [Size]byte // digest cached by Sum
52 }
53
54 // Return size of checksum
55 func (d *Avx512Digest) Size() int { return Size }
56
57 // Return blocksize of checksum
58 func (d Avx512Digest) BlockSize() int { return BlockSize }
59
60 func (d *Avx512Digest) Reset() {
61 d.a512srv.blocksCh <- blockInput{uid: d.uid, reset: true}
62 d.nx = 0
63 d.len = 0
64 d.final = false
65 }
66
67 // Write to digest
68 func (d *Avx512Digest) Write(p []byte) (nn int, err error) {
69
70 if d.final {
71 return 0, errors.New("Avx512Digest already finalized. Reset first before writing again.")
72 }
73
74 nn = len(p)
75 d.len += uint64(nn)
76 if d.nx > 0 {
77 n := copy(d.x[d.nx:], p)
78 d.nx += n
79 if d.nx == chunk {
80 d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: d.x[:]}
81 d.nx = 0
82 }
83 p = p[n:]
84 }
85 if len(p) >= chunk {
86 n := len(p) &^ (chunk - 1)
87 d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: p[:n]}
88 p = p[n:]
89 }
90 if len(p) > 0 {
91 d.nx = copy(d.x[:], p)
92 }
93 return
94 }
95
96 // Return sha256 sum in bytes
97 func (d *Avx512Digest) Sum(in []byte) (result []byte) {
98
99 if d.final {
100 return append(in, d.result[:]...)
101 }
102
103 trail := make([]byte, 0, 128)
104
105 len := d.len
106 // Padding. Add a 1 bit and 0 bits until 56 bytes mod 64.
107 var tmp [64]byte
108 tmp[0] = 0x80
109 if len%64 < 56 {
110 trail = append(d.x[:d.nx], tmp[0:56-len%64]...)
111 } else {
112 trail = append(d.x[:d.nx], tmp[0:64+56-len%64]...)
113 }
114 d.nx = 0
115
116 // Length in bits.
117 len <<= 3
118 for i := uint(0); i < 8; i++ {
119 tmp[i] = byte(len >> (56 - 8*i))
120 }
121 trail = append(trail, tmp[0:8]...)
122
123 sumCh := make(chan [Size]byte)
124 d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: trail, final: true, sumCh: sumCh}
125 d.result = <-sumCh
126 d.final = true
127 return append(in, d.result[:]...)
128 }
129
// table is the constant table handed to the assembly routine. For each of the
// 64 SHA-256 round constants it holds eight consecutive uint64 entries, each
// carrying the 32-bit constant in both its upper and lower half. Only the
// first 64*8 = 512 entries are populated; the rest of the array stays zero.
var table = func() (t [4096]uint64) {
	k := [64]uint32{
		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
	}
	for i, c := range k {
		// Duplicate the 32-bit constant into both halves of the 64-bit entry.
		v := uint64(c)<<32 | uint64(c)
		for j := 0; j < 8; j++ {
			t[i*8+j] = v
		}
	}
	return t
}()
259
260 // Interface function to assembly ode
261 func blockAvx512(digests *[512]byte, input [16][]byte, mask []uint64) [16][Size]byte {
262
263 scratch := [512]byte{}
264 sha256_x16_avx512(digests, &scratch, &table, mask, input)
265
266 output := [16][Size]byte{}
267 for i := 0; i < 16; i++ {
268 output[i] = getDigest(i, digests[:])
269 }
270
271 return output
272 }
273
274 func getDigest(index int, state []byte) (sum [Size]byte) {
275 for j := 0; j < 16; j += 2 {
276 for i := index*4 + j*Size; i < index*4+(j+1)*Size; i += Size {
277 binary.BigEndian.PutUint32(sum[j*2:], binary.LittleEndian.Uint32(state[i:i+4]))
278 }
279 }
280 return
281 }
282
283 // Message to send across input channel
284 type blockInput struct {
285 uid uint64 // identifier of the digest this message belongs to
286 msg []byte // data to hash
287 reset bool // when set, the server only resets its state for uid
288 final bool // when set, the resulting digest is sent back on sumCh
289 sumCh chan [Size]byte // reply channel, used together with final
290 }
291
292 // Type to implement 16x parallel handling of SHA256 invocations
// All fields are read and written by the Process goroutine and its helpers.
293 type Avx512Server struct {
294 blocksCh chan blockInput // Input channel
295 totalIn int // Total number of inputs waiting to be processed
296 lanes [16]Avx512LaneInfo // Array with info per lane (out of 16)
297 digests map[uint64][Size]byte // Map of uids to (interim) digest results
298 }
299
300 // Info for each lane
// A lane whose block is nil is treated as empty by the server.
301 type Avx512LaneInfo struct {
302 uid uint64 // unique identification for this SHA processing
303 block []byte // input block to be processed
304 outputCh chan [Size]byte // channel for output result
305 }
306
307 // Create new object for parallel processing handling
308 func NewAvx512Server() *Avx512Server {
309 a512srv := &Avx512Server{}
310 a512srv.digests = make(map[uint64][Size]byte)
311 a512srv.blocksCh = make(chan blockInput)
312
313 // Start a single thread for reading from the input channel
314 go a512srv.Process()
315 return a512srv
316 }
317
318 // Sole handler for reading from the input channel
319 func (a512srv *Avx512Server) Process() {
320 for {
321 select {
322 case block := <-a512srv.blocksCh:
323 if block.reset {
324 a512srv.reset(block.uid)
325 continue
326 }
327 index := block.uid & 0xf
328 // fmt.Println("Adding message:", block.uid, index)
329
330 if a512srv.lanes[index].block != nil { // If slot is already filled, process all inputs
331 //fmt.Println("Invoking Blocks()")
332 a512srv.blocks()
333 }
334 a512srv.totalIn++
335 a512srv.lanes[index] = Avx512LaneInfo{uid: block.uid, block: block.msg}
336 if block.final {
337 a512srv.lanes[index].outputCh = block.sumCh
338 }
339 if a512srv.totalIn == len(a512srv.lanes) {
340 // fmt.Println("Invoking Blocks() while FULL: ")
341 a512srv.blocks()
342 }
343
344 // TODO: test with larger timeout
345 case <-time.After(1 * time.Microsecond):
346 for _, lane := range a512srv.lanes {
347 if lane.block != nil { // check if there is any input to process
348 // fmt.Println("Invoking Blocks() on TIMEOUT: ")
349 a512srv.blocks()
350 break // we are done
351 }
352 }
353 }
354 }
355 }
356
357 // Do a reset for this calculation
358 func (a512srv *Avx512Server) reset(uid uint64) {
359
360 // Check if there is a message still waiting to be processed (and remove if so)
361 for i, lane := range a512srv.lanes {
362 if lane.uid == uid {
363 if lane.block != nil {
364 a512srv.lanes[i] = Avx512LaneInfo{} // clear message
365 a512srv.totalIn -= 1
366 }
367 }
368 }
369
370 // Delete entry from hash map
371 delete(a512srv.digests, uid)
372 }
373
374 // Invoke assembly and send results back
375 func (a512srv *Avx512Server) blocks() (err error) {
376
377 inputs := [16][]byte{}
378 for i := range inputs {
379 inputs[i] = a512srv.lanes[i].block
380 }
381
382 mask := expandMask(genMask(inputs))
383 outputs := blockAvx512(a512srv.getDigests(), inputs, mask)
384
385 a512srv.totalIn = 0
386 for i := 0; i < len(outputs); i++ {
387 uid, outputCh := a512srv.lanes[i].uid, a512srv.lanes[i].outputCh
388 a512srv.digests[uid] = outputs[i]
389 a512srv.lanes[i] = Avx512LaneInfo{}
390
391 if outputCh != nil {
392 // Send back result
393 outputCh <- outputs[i]
394 delete(a512srv.digests, uid) // Delete entry from hashmap
395 }
396 }
397 return
398 }
399
400 func (a512srv *Avx512Server) Write(uid uint64, p []byte) (nn int, err error) {
401 a512srv.blocksCh <- blockInput{uid: uid, msg: p}
402 return len(p), nil
403 }
404
405 func (a512srv *Avx512Server) Sum(uid uint64, p []byte) [32]byte {
406 sumCh := make(chan [32]byte)
407 a512srv.blocksCh <- blockInput{uid: uid, msg: p, final: true, sumCh: sumCh}
408 return <-sumCh
409 }
410
411 func (a512srv *Avx512Server) getDigests() *[512]byte {
412 digests := [512]byte{}
413 for i, lane := range a512srv.lanes {
414 a, ok := a512srv.digests[lane.uid]
415 if ok {
416 binary.BigEndian.PutUint32(digests[(i+0*16)*4:], binary.LittleEndian.Uint32(a[0:4]))
417 binary.BigEndian.PutUint32(digests[(i+1*16)*4:], binary.LittleEndian.Uint32(a[4:8]))
418 binary.BigEndian.PutUint32(digests[(i+2*16)*4:], binary.LittleEndian.Uint32(a[8:12]))
419 binary.BigEndian.PutUint32(digests[(i+3*16)*4:], binary.LittleEndian.Uint32(a[12:16]))
420 binary.BigEndian.PutUint32(digests[(i+4*16)*4:], binary.LittleEndian.Uint32(a[16:20]))
421 binary.BigEndian.PutUint32(digests[(i+5*16)*4:], binary.LittleEndian.Uint32(a[20:24]))
422 binary.BigEndian.PutUint32(digests[(i+6*16)*4:], binary.LittleEndian.Uint32(a[24:28]))
423 binary.BigEndian.PutUint32(digests[(i+7*16)*4:], binary.LittleEndian.Uint32(a[28:32]))
424 } else {
425 binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0)
426 binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1)
427 binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2)
428 binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3)
429 binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4)
430 binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5)
431 binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6)
432 binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7)
433 }
434 }
435 return &digests
436 }
437
// lane is a helper struct for sorting blocks based on length: len is the
// input length and pos the lane index it came from.
type lane struct {
	len, pos uint
}

// lanes implements sort.Interface, ordering by ascending len.
type lanes []lane

// Len returns the number of lanes.
func (lns lanes) Len() int {
	return len(lns)
}

// Swap exchanges the entries at i and j.
func (lns lanes) Swap(i, j int) {
	tmp := lns[i]
	lns[i] = lns[j]
	lns[j] = tmp
}

// Less reports whether entry i is strictly shorter than entry j.
func (lns lanes) Less(i, j int) bool {
	return lns[j].len > lns[i].len
}
449
450 // Helper struct for pairing a lane mask with the number of consecutive rounds it applies to
451 type maskRounds struct {
452 mask uint64 // bit i is set while lane i still has input to process
453 rounds uint64 // number of 64-byte rounds during which mask is valid
454 }
455
456 func genMask(input [16][]byte) [16]maskRounds {
457
458 // Sort on blocks length small to large
459 var sorted [16]lane
460 for c, inpt := range input {
461 sorted[c] = lane{uint(len(inpt)), uint(c)}
462 }
463 sort.Sort(lanes(sorted[:]))
464
465 // Create mask array including 'rounds' between masks
466 m, round, index := uint64(0xffff), uint64(0), 0
467 var mr [16]maskRounds
468 for _, s := range sorted {
469 if s.len > 0 {
470 if uint64(s.len)>>6 > round {
471 mr[index] = maskRounds{m, (uint64(s.len) >> 6) - round}
472 index++
473 }
474 round = uint64(s.len) >> 6
475 }
476 m = m & ^(1 << uint(s.pos))
477 }
478
479 return mr
480 }
481
482 // TODO: remove function
483 func expandMask(mr [16]maskRounds) []uint64 {
484 size := uint64(0)
485 for _, r := range mr {
486 size += r.rounds
487 }
488 result, index := make([]uint64, size), 0
489 for _, r := range mr {
490 for j := uint64(0); j < r.rounds; j++ {
491 result[index] = r.mask
492 index++
493 }
494 }
495 return result
496 }
0 TEXT ·sha256_x16_avx512(SB), 7, $0
1 MOVQ digests+0(FP), DI
2 MOVQ scratch+8(FP), R12
3 MOVQ mask_len+32(FP), SI
4 MOVQ r14+24(FP), R13
5 MOVQ (R13), R14
6 LONG $0x92fbc1c4; BYTE $0xce
7 LEAQ inputs+48(FP), AX
8 QUAD $0xf162076f487ef162; QUAD $0x7ef162014f6f487e; QUAD $0x487ef16202576f48; QUAD $0x6f487ef162035f6f; QUAD $0x6f6f487ef1620467; QUAD $0x06776f487ef16205; LONG $0x487ef162; WORD $0x7f6f; BYTE $0x07
9 MOVQ table+16(FP), DX
10 WORD $0x3148; BYTE $0xc9
11 TESTQ $(1<<0), R14
12 JE skipInput0
13 MOVQ 0*24(AX), R9
14 LONG $0x487cc162; WORD $0x0410; BYTE $0x09
15 skipInput0:
16 TESTQ $(1<<1), R14
17 JE skipInput1
18 MOVQ 1*24(AX), R9
19 LONG $0x487cc162; WORD $0x0c10; BYTE $0x09
20 skipInput1:
21 TESTQ $(1<<2), R14
22 JE skipInput2
23 MOVQ 2*24(AX), R9
24 LONG $0x487cc162; WORD $0x1410; BYTE $0x09
25 skipInput2:
26 TESTQ $(1<<3), R14
27 JE skipInput3
28 MOVQ 3*24(AX), R9
29 LONG $0x487cc162; WORD $0x1c10; BYTE $0x09
30 skipInput3:
31 TESTQ $(1<<4), R14
32 JE skipInput4
33 MOVQ 4*24(AX), R9
34 LONG $0x487cc162; WORD $0x2410; BYTE $0x09
35 skipInput4:
36 TESTQ $(1<<5), R14
37 JE skipInput5
38 MOVQ 5*24(AX), R9
39 LONG $0x487cc162; WORD $0x2c10; BYTE $0x09
40 skipInput5:
41 TESTQ $(1<<6), R14
42 JE skipInput6
43 MOVQ 6*24(AX), R9
44 LONG $0x487cc162; WORD $0x3410; BYTE $0x09
45 skipInput6:
46 TESTQ $(1<<7), R14
47 JE skipInput7
48 MOVQ 7*24(AX), R9
49 LONG $0x487cc162; WORD $0x3c10; BYTE $0x09
50 skipInput7:
51 TESTQ $(1<<8), R14
52 JE skipInput8
53 MOVQ 8*24(AX), R9
54 LONG $0x487c4162; WORD $0x0410; BYTE $0x09
55 skipInput8:
56 TESTQ $(1<<9), R14
57 JE skipInput9
58 MOVQ 9*24(AX), R9
59 LONG $0x487c4162; WORD $0x0c10; BYTE $0x09
60 skipInput9:
61 TESTQ $(1<<10), R14
62 JE skipInput10
63 MOVQ 10*24(AX), R9
64 LONG $0x487c4162; WORD $0x1410; BYTE $0x09
65 skipInput10:
66 TESTQ $(1<<11), R14
67 JE skipInput11
68 MOVQ 11*24(AX), R9
69 LONG $0x487c4162; WORD $0x1c10; BYTE $0x09
70 skipInput11:
71 TESTQ $(1<<12), R14
72 JE skipInput12
73 MOVQ 12*24(AX), R9
74 LONG $0x487c4162; WORD $0x2410; BYTE $0x09
75 skipInput12:
76 TESTQ $(1<<13), R14
77 JE skipInput13
78 MOVQ 13*24(AX), R9
79 LONG $0x487c4162; WORD $0x2c10; BYTE $0x09
80 skipInput13:
81 TESTQ $(1<<14), R14
82 JE skipInput14
83 MOVQ 14*24(AX), R9
84 LONG $0x487c4162; WORD $0x3410; BYTE $0x09
85 skipInput14:
86 TESTQ $(1<<15), R14
87 JE skipInput15
88 MOVQ 15*24(AX), R9
89 LONG $0x487c4162; WORD $0x3c10; BYTE $0x09
90 skipInput15:
91 lloop:
92 LEAQ PSHUFFLE_BYTE_FLIP_MASK<>(SB), DX
93 LONG $0x487e7162; WORD $0x1a6f
94 MOVQ table+16(FP), DX
95 QUAD $0xd162226f487e7162; QUAD $0x7ed16224047f487e; QUAD $0x7ed16201244c7f48; QUAD $0x7ed1620224547f48; QUAD $0x7ed16203245c7f48; QUAD $0x7ed1620424647f48; QUAD $0x7ed16205246c7f48; QUAD $0x7ed1620624747f48; QUAD $0xc1834807247c7f48; QUAD $0x44c9c6407c316240; QUAD $0x62eec1c6407ca162; QUAD $0xa16244d3c6406c31; QUAD $0x34c162eed3c6406c; QUAD $0x407ca162dddac648; QUAD $0xc6407ca16288cac6; QUAD $0xcac648345162ddc2; QUAD $0x44d5c6405ca16288; QUAD $0x62eee5c6405ca162; QUAD $0xa16244d7c6404c31; QUAD $0x6cc162eef7c6404c; QUAD $0x405ca162ddfac640; QUAD $0xc6405ca16288eec6; QUAD $0xd2c6406cc162dde6; QUAD $0x44f1c6403c816288; QUAD $0x62eec1c6403c0162; QUAD $0x016244d3c6402c11; QUAD $0x4c4162eed3c6402c; QUAD $0x403c0162dddac640; QUAD $0xc6403c016288cac6; QUAD $0xf2c6404cc162ddc2; QUAD $0x44d5c6401c016288; QUAD $0x62eee5c6401c0162; QUAD $0x016244d7c6400c11; QUAD $0x2c4162eef7c6400c; QUAD $0x401c0162ddfac640; QUAD $0xc6401c016288eec6; QUAD $0xd2c6402c4162dde6; BYTE $0x88
96 LEAQ PSHUFFLE_TRANSPOSE16_MASK1<>(SB), BX
97 LEAQ PSHUFFLE_TRANSPOSE16_MASK2<>(SB), R8
98 QUAD $0x2262336f487e6162; QUAD $0x487e5162f27648b5; QUAD $0xd27648b53262106f; QUAD $0xa262136f487ee162; QUAD $0x487e5162d77640e5; QUAD $0xcf7640e53262086f; QUAD $0xa2621b6f487ee162; QUAD $0x487ec162dd7640f5; QUAD $0xfd7640f5a262386f; QUAD $0xa2620b6f487ee162; QUAD $0x487ec162cc7640fd; QUAD $0xec7640fda262286f; QUAD $0x8262036f487ee162; QUAD $0x487ec162c27640cd; QUAD $0xe27640cd8262206f; QUAD $0x8262336f487ee162; QUAD $0x487e4162f77640a5; QUAD $0xd77640a50262106f; QUAD $0x02621b6f487e6162; QUAD $0x487e4162dd7640b5; QUAD $0xfd7640b50262386f; QUAD $0x02620b6f487e6162; QUAD $0x487e4162cc7640bd; QUAD $0xec7640bd0262286f; QUAD $0x62eec023408d2362; QUAD $0x236244c023408da3; QUAD $0xada362eee42348ad; QUAD $0x40c5036244e42348; QUAD $0x2340c51362eef723; QUAD $0xfd2340d5036244d7; QUAD $0x44fd2340d58362ee; QUAD $0x62eeea2348b50362; QUAD $0x036244ea2348b583; QUAD $0xe51362eed32340e5; QUAD $0x40f5036244cb2340; QUAD $0x2340f58362eed923; QUAD $0xce2340ed236244d9; QUAD $0x44ce2340eda362ee; QUAD $0xc162d16f487ec162; QUAD $0x407dc262f26f487e; QUAD $0xcb004075c262c300; QUAD $0xc262d300406dc262; QUAD $0x405dc262db004065; QUAD $0xeb004055c262e300; QUAD $0xc262f300404dc262; QUAD $0x403d4262fb004045; QUAD $0xcb0040354262c300; QUAD $0x4262d300402d4262; QUAD $0x401d4262db004025; QUAD $0xeb0040154262e300; QUAD $0x4262f300400d4262; QUAD $0x48455162fb004005; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d3162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6201626f487e7162; QUAD $0x916211c672481591; QUAD $0x05916213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe407dc16296ef25; QUAD $0x62c1fe407d8162c5; QUAD $0xb16207c1724815b1; QUAD $0x05b16212c172480d; QUAD $0x480d536203d17248; QUAD 
$0xfe407dc16296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815916202626f48; QUAD $0x72480d916211c772; QUAD $0xd7724805916213c7; QUAD $0x96ef25480d53620a; QUAD $0x8162cdfe4075c162; QUAD $0x4815b162cafe4075; QUAD $0x72480db16207c272; QUAD $0xd2724805b16212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe4075c162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c0724815b16203; QUAD $0x6213c072480db162; QUAD $0x53620ad0724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe406d8162d5fe40; QUAD $0x07c3724815b162d3; QUAD $0x6212c372480db162; QUAD $0x536203d3724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d3162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0xb16204626f487e71; QUAD $0x0db16211c1724815; QUAD $0x4805b16213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4065c16296ef; QUAD $0xb162dcfe40658162; QUAD $0x0db16207c4724815; QUAD $0x4805b16212c47248; QUAD $0x25480d536203d472; 
QUAD $0xddfe4065c16296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x724815b16205626f; QUAD $0xc272480db16211c2; QUAD $0x0ad2724805b16213; QUAD $0x6296ef25480d5362; QUAD $0x5d8162e5fe405dc1; QUAD $0x724815b162e5fe40; QUAD $0xc572480db16207c5; QUAD $0x03d5724805b16212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe405dc1; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d3162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x06626f487e7162d0; QUAD $0x6211c3724815b162; QUAD $0xb16213c372480db1; QUAD $0x0d53620ad3724805; QUAD $0x4055c16296ef2548; QUAD $0xeefe40558162edfe; QUAD $0x6207c6724815b162; QUAD $0xb16212c672480db1; QUAD $0x0d536203d6724805; QUAD $0x4055c16296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x15b16207626f487e; QUAD $0x480db16211c47248; QUAD $0x724805b16213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe404dc16296; QUAD $0x15b162f7fe404d81; QUAD $0x480db16207c77248; QUAD $0x724805b16212c772; QUAD 
$0xef25480d536203d7; QUAD $0x62f5fe404dc16296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc5724815b1620862; QUAD $0x13c572480db16211; QUAD $0x620ad5724805b162; QUAD $0xc16296ef25480d53; QUAD $0x4045a162fdfe4045; QUAD $0xc07248159162f8fe; QUAD $0x12c072480d916207; QUAD $0x6203d07248059162; QUAD $0xc16296ef25480d53; QUAD $0x48455162fdfe4045; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d1162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6209626f487e7162; QUAD $0xb16211c6724815b1; QUAD $0x05b16213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe403d416296ef25; QUAD $0x62c1fe403d2162c5; QUAD $0x916207c172481591; QUAD $0x05916212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe403d416296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815b1620a626f48; QUAD $0x72480db16211c772; QUAD $0xd7724805b16213c7; QUAD $0x96ef25480d53620a; QUAD $0x2162cdfe40354162; QUAD $0x48159162cafe4035; QUAD $0x72480d916207c272; QUAD $0xd2724805916212c2; 
QUAD $0x96ef25480d536203; QUAD $0x5162cdfe40354162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c072481591620b; QUAD $0x6213c072480d9162; QUAD $0x53620ad072480591; QUAD $0x2d416296ef25480d; QUAD $0xfe402d2162d5fe40; QUAD $0x07c37248159162d3; QUAD $0x6212c372480d9162; QUAD $0x536203d372480591; QUAD $0x2d416296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d1162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0x91620c626f487e71; QUAD $0x0d916211c1724815; QUAD $0x4805916213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4025416296ef; QUAD $0x9162dcfe40252162; QUAD $0x0d916207c4724815; QUAD $0x4805916212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4025416296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x72481591620d626f; QUAD $0xc272480d916211c2; QUAD $0x0ad2724805916213; QUAD $0x6296ef25480d5362; QUAD $0x1d2162e5fe401d41; QUAD $0x7248159162e5fe40; QUAD $0xc572480d916207c5; QUAD 
$0x03d5724805916212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe401d41; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d1162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x0e626f487e7162d0; QUAD $0x6211c37248159162; QUAD $0x916213c372480d91; QUAD $0x0d53620ad3724805; QUAD $0x4015416296ef2548; QUAD $0xeefe40152162edfe; QUAD $0x6207c67248159162; QUAD $0x916212c672480d91; QUAD $0x0d536203d6724805; QUAD $0x4015416296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x1591620f626f487e; QUAD $0x480d916211c47248; QUAD $0x724805916213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe400d416296; QUAD $0x159162f7fe400d21; QUAD $0x480d916207c77248; QUAD $0x724805916212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe400d416296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc572481591621062; QUAD $0x13c572480d916211; QUAD $0x620ad57248059162; QUAD $0x416296ef25480d53; QUAD $0x40050162fdfe4005; QUAD $0xc0724815b162f8fe; QUAD $0x12c072480db16207; 
QUAD $0x6203d0724805b162; QUAD $0x416296ef25480d53; QUAD $0x48455162fdfe4005; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d3162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6211626f487e7162; QUAD $0x916211c672481591; QUAD $0x05916213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe407dc16296ef25; QUAD $0x62c1fe407d8162c5; QUAD $0xb16207c1724815b1; QUAD $0x05b16212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe407dc16296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815916212626f48; QUAD $0x72480d916211c772; QUAD $0xd7724805916213c7; QUAD $0x96ef25480d53620a; QUAD $0x8162cdfe4075c162; QUAD $0x4815b162cafe4075; QUAD $0x72480db16207c272; QUAD $0xd2724805b16212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe4075c162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c0724815b16213; QUAD $0x6213c072480db162; QUAD $0x53620ad0724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe406d8162d5fe40; QUAD $0x07c3724815b162d3; QUAD 
$0x6212c372480db162; QUAD $0x536203d3724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d3162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0xb16214626f487e71; QUAD $0x0db16211c1724815; QUAD $0x4805b16213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4065c16296ef; QUAD $0xb162dcfe40658162; QUAD $0x0db16207c4724815; QUAD $0x4805b16212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4065c16296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x724815b16215626f; QUAD $0xc272480db16211c2; QUAD $0x0ad2724805b16213; QUAD $0x6296ef25480d5362; QUAD $0x5d8162e5fe405dc1; QUAD $0x724815b162e5fe40; QUAD $0xc572480db16207c5; QUAD $0x03d5724805b16212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe405dc1; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d3162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x16626f487e7162d0; QUAD $0x6211c3724815b162; QUAD $0xb16213c372480db1; QUAD $0x0d53620ad3724805; QUAD $0x4055c16296ef2548; QUAD $0xeefe40558162edfe; QUAD $0x6207c6724815b162; 
QUAD $0xb16212c672480db1; QUAD $0x0d536203d6724805; QUAD $0x4055c16296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x15b16217626f487e; QUAD $0x480db16211c47248; QUAD $0x724805b16213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe404dc16296; QUAD $0x15b162f7fe404d81; QUAD $0x480db16207c77248; QUAD $0x724805b16212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe404dc16296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc5724815b1621862; QUAD $0x13c572480db16211; QUAD $0x620ad5724805b162; QUAD $0xc16296ef25480d53; QUAD $0x4045a162fdfe4045; QUAD $0xc07248159162f8fe; QUAD $0x12c072480d916207; QUAD $0x6203d07248059162; QUAD $0xc16296ef25480d53; QUAD $0x48455162fdfe4045; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d1162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6219626f487e7162; QUAD $0xb16211c6724815b1; QUAD $0x05b16213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe403d416296ef25; QUAD $0x62c1fe403d2162c5; QUAD 
$0x916207c172481591; QUAD $0x05916212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe403d416296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815b1621a626f48; QUAD $0x72480db16211c772; QUAD $0xd7724805b16213c7; QUAD $0x96ef25480d53620a; QUAD $0x2162cdfe40354162; QUAD $0x48159162cafe4035; QUAD $0x72480d916207c272; QUAD $0xd2724805916212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe40354162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c072481591621b; QUAD $0x6213c072480d9162; QUAD $0x53620ad072480591; QUAD $0x2d416296ef25480d; QUAD $0xfe402d2162d5fe40; QUAD $0x07c37248159162d3; QUAD $0x6212c372480d9162; QUAD $0x536203d372480591; QUAD $0x2d416296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d1162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0x91621c626f487e71; QUAD $0x0d916211c1724815; QUAD $0x4805916213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4025416296ef; QUAD $0x9162dcfe40252162; 
QUAD $0x0d916207c4724815; QUAD $0x4805916212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4025416296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x72481591621d626f; QUAD $0xc272480d916211c2; QUAD $0x0ad2724805916213; QUAD $0x6296ef25480d5362; QUAD $0x1d2162e5fe401d41; QUAD $0x7248159162e5fe40; QUAD $0xc572480d916207c5; QUAD $0x03d5724805916212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe401d41; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d1162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x1e626f487e7162d0; QUAD $0x6211c37248159162; QUAD $0x916213c372480d91; QUAD $0x0d53620ad3724805; QUAD $0x4015416296ef2548; QUAD $0xeefe40152162edfe; QUAD $0x6207c67248159162; QUAD $0x916212c672480d91; QUAD $0x0d536203d6724805; QUAD $0x4015416296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x1591621f626f487e; QUAD $0x480d916211c47248; QUAD $0x724805916213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe400d416296; QUAD 
$0x159162f7fe400d21; QUAD $0x480d916207c77248; QUAD $0x724805916212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe400d416296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc572481591622062; QUAD $0x13c572480d916211; QUAD $0x620ad57248059162; QUAD $0x416296ef25480d53; QUAD $0x40050162fdfe4005; QUAD $0xc0724815b162f8fe; QUAD $0x12c072480db16207; QUAD $0x6203d0724805b162; QUAD $0x416296ef25480d53; QUAD $0x48455162fdfe4005; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d3162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6221626f487e7162; QUAD $0x916211c672481591; QUAD $0x05916213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe407dc16296ef25; QUAD $0x62c1fe407d8162c5; QUAD $0xb16207c1724815b1; QUAD $0x05b16212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe407dc16296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815916222626f48; QUAD $0x72480d916211c772; QUAD $0xd7724805916213c7; QUAD $0x96ef25480d53620a; QUAD $0x8162cdfe4075c162; 
QUAD $0x4815b162cafe4075; QUAD $0x72480db16207c272; QUAD $0xd2724805b16212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe4075c162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c0724815b16223; QUAD $0x6213c072480db162; QUAD $0x53620ad0724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe406d8162d5fe40; QUAD $0x07c3724815b162d3; QUAD $0x6212c372480db162; QUAD $0x536203d3724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d3162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0xb16224626f487e71; QUAD $0x0db16211c1724815; QUAD $0x4805b16213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4065c16296ef; QUAD $0xb162dcfe40658162; QUAD $0x0db16207c4724815; QUAD $0x4805b16212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4065c16296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x724815b16225626f; QUAD $0xc272480db16211c2; QUAD $0x0ad2724805b16213; QUAD $0x6296ef25480d5362; QUAD 
$0x5d8162e5fe405dc1; QUAD $0x724815b162e5fe40; QUAD $0xc572480db16207c5; QUAD $0x03d5724805b16212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe405dc1; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d3162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x26626f487e7162d0; QUAD $0x6211c3724815b162; QUAD $0xb16213c372480db1; QUAD $0x0d53620ad3724805; QUAD $0x4055c16296ef2548; QUAD $0xeefe40558162edfe; QUAD $0x6207c6724815b162; QUAD $0xb16212c672480db1; QUAD $0x0d536203d6724805; QUAD $0x4055c16296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x15b16227626f487e; QUAD $0x480db16211c47248; QUAD $0x724805b16213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe404dc16296; QUAD $0x15b162f7fe404d81; QUAD $0x480db16207c77248; QUAD $0x724805b16212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe404dc16296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc5724815b1622862; QUAD $0x13c572480db16211; QUAD $0x620ad5724805b162; QUAD $0xc16296ef25480d53; 
QUAD $0x4045a162fdfe4045; QUAD $0xc07248159162f8fe; QUAD $0x12c072480d916207; QUAD $0x6203d07248059162; QUAD $0xc16296ef25480d53; QUAD $0x48455162fdfe4045; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d1162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6229626f487e7162; QUAD $0xb16211c6724815b1; QUAD $0x05b16213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe403d416296ef25; QUAD $0x62c1fe403d2162c5; QUAD $0x916207c172481591; QUAD $0x05916212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe403d416296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815b1622a626f48; QUAD $0x72480db16211c772; QUAD $0xd7724805b16213c7; QUAD $0x96ef25480d53620a; QUAD $0x2162cdfe40354162; QUAD $0x48159162cafe4035; QUAD $0x72480d916207c272; QUAD $0xd2724805916212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe40354162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c072481591622b; QUAD $0x6213c072480d9162; QUAD $0x53620ad072480591; QUAD 
$0x2d416296ef25480d; QUAD $0xfe402d2162d5fe40; QUAD $0x07c37248159162d3; QUAD $0x6212c372480d9162; QUAD $0x536203d372480591; QUAD $0x2d416296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d1162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0x91622c626f487e71; QUAD $0x0d916211c1724815; QUAD $0x4805916213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4025416296ef; QUAD $0x9162dcfe40252162; QUAD $0x0d916207c4724815; QUAD $0x4805916212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4025416296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x72481591622d626f; QUAD $0xc272480d916211c2; QUAD $0x0ad2724805916213; QUAD $0x6296ef25480d5362; QUAD $0x1d2162e5fe401d41; QUAD $0x7248159162e5fe40; QUAD $0xc572480d916207c5; QUAD $0x03d5724805916212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe401d41; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d1162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x2e626f487e7162d0; QUAD $0x6211c37248159162; QUAD $0x916213c372480d91; QUAD $0x0d53620ad3724805; 
QUAD $0x4015416296ef2548; QUAD $0xeefe40152162edfe; QUAD $0x6207c67248159162; QUAD $0x916212c672480d91; QUAD $0x0d536203d6724805; QUAD $0x4015416296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x1591622f626f487e; QUAD $0x480d916211c47248; QUAD $0x724805916213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe400d416296; QUAD $0x159162f7fe400d21; QUAD $0x480d916207c77248; QUAD $0x724805916212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe400d416296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc572481591623062; QUAD $0x13c572480d916211; QUAD $0x620ad57248059162; QUAD $0x416296ef25480d53; QUAD $0x40050162fdfe4005; QUAD $0xc0724815b162f8fe; QUAD $0x12c072480db16207; QUAD $0x6203d0724805b162; QUAD $0x416296ef25480d53; QUAD $0x01ee8348fdfe4005
99 JE lastLoop
100 ADDQ $8, R13
101 MOVQ (R13), R14
102 QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; WORD $0x626f; BYTE $0x31
103 TESTQ $(1<<0), R14
104 JE skipNext0
105 MOVQ 0*24(AX), R9
106 LONG $0x487cc162; WORD $0x0410; BYTE $0x09
107 skipNext0:
108 QUAD $0x7162c4fe484d5162; QUAD $0x482df162cb6f487e; QUAD $0x724825f16206c372; QUAD $0xc372481df1620bc3; QUAD $0xcacd25485d736219; QUAD $0x5362c1fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d0fe486dd162c2; QUAD $0xf16202c772484df1; QUAD $0x1df1620dc7724825; QUAD $0x487e716216c77248; QUAD $0xc925487d7362cf6f; QUAD $0x96f4254825d362e8; QUAD $0xd162f1fe484dd162; QUAD $0x487e7162f0fe484d; WORD $0x626f; BYTE $0x32
109 TESTQ $(1<<1), R14
110 JE skipNext1
111 MOVQ 1*24(AX), R9
112 LONG $0x487cc162; WORD $0x0c10; BYTE $0x09
113 skipNext1:
114 QUAD $0x7162c4fe48555162; QUAD $0x482df162ca6f487e; QUAD $0x724825f16206c272; QUAD $0xc272481df1620bc2; QUAD $0xcacc254865736219; QUAD $0x5362c2fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c8fe4875d162c2; QUAD $0xf16202c6724855f1; QUAD $0x1df1620dc6724825; QUAD $0x487e716216c67248; QUAD $0xc82548457362ce6f; QUAD $0x96ec254825d362e8; QUAD $0xd162e9fe4855d162; QUAD $0x487e7162e8fe4855; WORD $0x626f; BYTE $0x33
115 TESTQ $(1<<2), R14
116 JE skipNext2
117 MOVQ 2*24(AX), R9
118 LONG $0x487cc162; WORD $0x1410; BYTE $0x09
119 skipNext2:
120 QUAD $0x7162c4fe485d5162; QUAD $0x482df162c96f487e; QUAD $0x724825f16206c172; QUAD $0xc172481df1620bc1; QUAD $0xcacb25486d736219; QUAD $0x5362c3fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c0fe487dd162c2; QUAD $0xf16202c572485df1; QUAD $0x1df1620dc5724825; QUAD $0x487e716216c57248; QUAD $0xcf25484d7362cd6f; QUAD $0x96e4254825d362e8; QUAD $0xd162e1fe485dd162; QUAD $0x487e7162e0fe485d; WORD $0x626f; BYTE $0x34
121 TESTQ $(1<<3), R14
122 JE skipNext3
123 MOVQ 3*24(AX), R9
124 LONG $0x487cc162; WORD $0x1c10; BYTE $0x09
125 skipNext3:
126 QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; WORD $0x626f; BYTE $0x35
127 TESTQ $(1<<4), R14
128 JE skipNext4
129 MOVQ 4*24(AX), R9
130 LONG $0x487cc162; WORD $0x2410; BYTE $0x09
131 skipNext4:
132 QUAD $0x7162c4fe486d5162; QUAD $0x482df162cf6f487e; QUAD $0x724825f16206c772; QUAD $0xc772481df1620bc7; QUAD $0xcac925487d736219; QUAD $0x5362c5fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f0fe484dd162c2; QUAD $0xf16202c372486df1; QUAD $0x1df1620dc3724825; QUAD $0x487e716216c37248; QUAD $0xcd25485d7362cb6f; QUAD $0x96d4254825d362e8; QUAD $0xd162d1fe486dd162; QUAD $0x487e7162d0fe486d; WORD $0x626f; BYTE $0x36
133 TESTQ $(1<<5), R14
134 JE skipNext5
135 MOVQ 5*24(AX), R9
136 LONG $0x487cc162; WORD $0x2c10; BYTE $0x09
137 skipNext5:
138 QUAD $0x7162c4fe48755162; QUAD $0x482df162ce6f487e; QUAD $0x724825f16206c672; QUAD $0xc672481df1620bc6; QUAD $0xcac8254845736219; QUAD $0x5362c6fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e8fe4855d162c2; QUAD $0xf16202c2724875f1; QUAD $0x1df1620dc2724825; QUAD $0x487e716216c27248; QUAD $0xcc2548657362ca6f; QUAD $0x96cc254825d362e8; QUAD $0xd162c9fe4875d162; QUAD $0x487e7162c8fe4875; WORD $0x626f; BYTE $0x37
139 TESTQ $(1<<6), R14
140 JE skipNext6
141 MOVQ 6*24(AX), R9
142 LONG $0x487cc162; WORD $0x3410; BYTE $0x09
143 skipNext6:
144 QUAD $0x7162c4fe487d5162; QUAD $0x482df162cd6f487e; QUAD $0x724825f16206c572; QUAD $0xc572481df1620bc5; QUAD $0xcacf25484d736219; QUAD $0x5362c7fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e0fe485dd162c2; QUAD $0xf16202c172487df1; QUAD $0x1df1620dc1724825; QUAD $0x487e716216c17248; QUAD $0xcb25486d7362c96f; QUAD $0x96c4254825d362e8; QUAD $0xd162c1fe487dd162; QUAD $0x487e7162c0fe487d; WORD $0x626f; BYTE $0x38
145 TESTQ $(1<<7), R14
146 JE skipNext7
147 MOVQ 7*24(AX), R9
148 LONG $0x487cc162; WORD $0x3c10; BYTE $0x09
149 skipNext7:
150 QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; WORD $0x626f; BYTE $0x39
151 TESTQ $(1<<8), R14
152 JE skipNext8
153 MOVQ 8*24(AX), R9
154 LONG $0x487c4162; WORD $0x0410; BYTE $0x09
155 skipNext8:
156 QUAD $0x7162c4fe484d5162; QUAD $0x482df162cb6f487e; QUAD $0x724825f16206c372; QUAD $0xc372481df1620bc3; QUAD $0xcacd25485d736219; QUAD $0x5362c1fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d0fe486dd162c2; QUAD $0xf16202c772484df1; QUAD $0x1df1620dc7724825; QUAD $0x487e716216c77248; QUAD $0xc925487d7362cf6f; QUAD $0x96f4254825d362e8; QUAD $0xd162f1fe484dd162; QUAD $0x487e7162f0fe484d; WORD $0x626f; BYTE $0x3a
157 TESTQ $(1<<9), R14
158 JE skipNext9
159 MOVQ 9*24(AX), R9
160 LONG $0x487c4162; WORD $0x0c10; BYTE $0x09
161 skipNext9:
162 QUAD $0x7162c4fe48555162; QUAD $0x482df162ca6f487e; QUAD $0x724825f16206c272; QUAD $0xc272481df1620bc2; QUAD $0xcacc254865736219; QUAD $0x5362c2fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c8fe4875d162c2; QUAD $0xf16202c6724855f1; QUAD $0x1df1620dc6724825; QUAD $0x487e716216c67248; QUAD $0xc82548457362ce6f; QUAD $0x96ec254825d362e8; QUAD $0xd162e9fe4855d162; QUAD $0x487e7162e8fe4855; WORD $0x626f; BYTE $0x3b
163 TESTQ $(1<<10), R14
164 JE skipNext10
165 MOVQ 10*24(AX), R9
166 LONG $0x487c4162; WORD $0x1410; BYTE $0x09
167 skipNext10:
168 QUAD $0x7162c4fe485d5162; QUAD $0x482df162c96f487e; QUAD $0x724825f16206c172; QUAD $0xc172481df1620bc1; QUAD $0xcacb25486d736219; QUAD $0x5362c3fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c0fe487dd162c2; QUAD $0xf16202c572485df1; QUAD $0x1df1620dc5724825; QUAD $0x487e716216c57248; QUAD $0xcf25484d7362cd6f; QUAD $0x96e4254825d362e8; QUAD $0xd162e1fe485dd162; QUAD $0x487e7162e0fe485d; WORD $0x626f; BYTE $0x3c
169 TESTQ $(1<<11), R14
170 JE skipNext11
171 MOVQ 11*24(AX), R9
172 LONG $0x487c4162; WORD $0x1c10; BYTE $0x09
173 skipNext11:
174 QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; WORD $0x626f; BYTE $0x3d
175 TESTQ $(1<<12), R14
176 JE skipNext12
177 MOVQ 12*24(AX), R9
178 LONG $0x487c4162; WORD $0x2410; BYTE $0x09
179 skipNext12:
180 QUAD $0x7162c4fe486d5162; QUAD $0x482df162cf6f487e; QUAD $0x724825f16206c772; QUAD $0xc772481df1620bc7; QUAD $0xcac925487d736219; QUAD $0x5362c5fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f0fe484dd162c2; QUAD $0xf16202c372486df1; QUAD $0x1df1620dc3724825; QUAD $0x487e716216c37248; QUAD $0xcd25485d7362cb6f; QUAD $0x96d4254825d362e8; QUAD $0xd162d1fe486dd162; QUAD $0x487e7162d0fe486d; WORD $0x626f; BYTE $0x3e
181 TESTQ $(1<<13), R14
182 JE skipNext13
183 MOVQ 13*24(AX), R9
184 LONG $0x487c4162; WORD $0x2c10; BYTE $0x09
185 skipNext13:
186 QUAD $0x7162c4fe48755162; QUAD $0x482df162ce6f487e; QUAD $0x724825f16206c672; QUAD $0xc672481df1620bc6; QUAD $0xcac8254845736219; QUAD $0x5362c6fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e8fe4855d162c2; QUAD $0xf16202c2724875f1; QUAD $0x1df1620dc2724825; QUAD $0x487e716216c27248; QUAD $0xcc2548657362ca6f; QUAD $0x96cc254825d362e8; QUAD $0xd162c9fe4875d162; QUAD $0x487e7162c8fe4875; WORD $0x626f; BYTE $0x3f
187 TESTQ $(1<<14), R14
188 JE skipNext14
189 MOVQ 14*24(AX), R9
190 LONG $0x487c4162; WORD $0x3410; BYTE $0x09
191 skipNext14:
192 QUAD $0x7162c4fe487d5162; QUAD $0x482df162cd6f487e; QUAD $0x724825f16206c572; QUAD $0xc572481df1620bc5; QUAD $0xcacf25484d736219; QUAD $0x5362c7fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e0fe485dd162c2; QUAD $0xf16202c172487df1; QUAD $0x1df1620dc1724825; QUAD $0x487e716216c17248; QUAD $0xcb25486d7362c96f; QUAD $0x96c4254825d362e8; QUAD $0xd162c1fe487dd162; QUAD $0x487e7162c0fe487d; WORD $0x626f; BYTE $0x40
193 TESTQ $(1<<15), R14
194 JE skipNext15
195 MOVQ 15*24(AX), R9
196 LONG $0x487c4162; WORD $0x3c10; BYTE $0x09
197 skipNext15:
198 QUAD $0xd162d86f487e7162; QUAD $0x7dd16224046f487e; QUAD $0x6f487e7162c3fe49; QUAD $0x244c6f487ed162d9; QUAD $0x62cbfe4975d16201; QUAD $0x7ed162da6f487e71; QUAD $0x6dd1620224546f48; QUAD $0x6f487e7162d3fe49; QUAD $0x245c6f487ed162db; QUAD $0x62dbfe4965d16203; QUAD $0x7ed162dc6f487e71; QUAD $0x5dd1620424646f48; QUAD $0x6f487e7162e3fe49; QUAD $0x246c6f487ed162dd; QUAD $0x62ebfe4955d16205; QUAD $0x7ed162de6f487e71; QUAD $0x4dd1620624746f48; QUAD $0x6f487e7162f3fe49; QUAD $0x247c6f487ed162df; QUAD $0xc4fbfe4945d16207; LONG $0xce92fbc1
199 JMP lloop
200 lastLoop:
201 QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; QUAD $0xfe484d516231626f; QUAD $0x62cb6f487e7162c4; QUAD $0xf16206c372482df1; QUAD $0x1df1620bc3724825; QUAD $0x485d736219c37248; QUAD $0xfe483d3162cacd25; QUAD $0x96d42548255362c1; QUAD $0x5162c1fe483d5162; QUAD $0x486dd162c2fe483d; QUAD $0xc772484df162d0fe; QUAD $0x0dc7724825f16202; QUAD $0x6216c772481df162; QUAD $0x7d7362cf6f487e71; QUAD $0x4825d362e8c92548; QUAD $0xfe484dd16296f425; QUAD $0x62f0fe484dd162f1; QUAD $0x516232626f487e71; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x62c4fe485d516233; QUAD $0x2df162c96f487e71; QUAD $0x4825f16206c17248; QUAD $0x72481df1620bc172; QUAD $0xcb25486d736219c1; QUAD $0x62c3fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xc0fe487dd162c2fe; QUAD $0x6202c572485df162; QUAD $0xf1620dc5724825f1; QUAD $0x7e716216c572481d; QUAD $0x25484d7362cd6f48; QUAD $0xe4254825d362e8cf; QUAD $0x62e1fe485dd16296; QUAD $0x7e7162e0fe485dd1; QUAD $0x4865516234626f48; QUAD $0xc86f487e7162c4fe; QUAD $0x6206c072482df162; QUAD $0xf1620bc0724825f1; QUAD $0x75736219c072481d; QUAD $0x483d3162caca2548; QUAD $0xd42548255362c4fe; QUAD $0x62c1fe483d516296; QUAD $0x45d162c2fe483d51; QUAD $0x724865f162f8fe48; QUAD $0xc4724825f16202c4; QUAD 
$0x16c472481df1620d; QUAD $0x7362cc6f487e7162; QUAD $0x25d362e8ce254855; QUAD $0x4865d16296dc2548; QUAD $0xd8fe4865d162d9fe; QUAD $0x6235626f487e7162; QUAD $0x7e7162c4fe486d51; QUAD $0x72482df162cf6f48; QUAD $0xc7724825f16206c7; QUAD $0x19c772481df1620b; QUAD $0x62cac925487d7362; QUAD $0x255362c5fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162f0fe484dd162; QUAD $0x25f16202c372486d; QUAD $0x481df1620dc37248; QUAD $0x6f487e716216c372; QUAD $0xe8cd25485d7362cb; QUAD $0x6296d4254825d362; QUAD $0x6dd162d1fe486dd1; QUAD $0x6f487e7162d0fe48; QUAD $0xc4fe487551623662; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x7d516237626f487e; QUAD $0x6f487e7162c4fe48; QUAD $0x06c572482df162cd; QUAD $0x620bc5724825f162; QUAD $0x736219c572481df1; QUAD $0x3d3162cacf25484d; QUAD $0x2548255362c7fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x487df162e0fe485d; QUAD $0x724825f16202c172; QUAD $0xc172481df1620dc1; QUAD $0x62c96f487e716216; QUAD $0xd362e8cb25486d73; QUAD $0x7dd16296c4254825; QUAD $0xfe487dd162c1fe48; QUAD $0x38626f487e7162c0; QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; QUAD $0xfe484d516239626f; QUAD $0x62cb6f487e7162c4; QUAD $0xf16206c372482df1; QUAD $0x1df1620bc3724825; QUAD $0x485d736219c37248; QUAD $0xfe483d1162cacd25; 
QUAD $0x96d42548255362c1; QUAD $0x5162c1fe483d5162; QUAD $0x486dd162c2fe483d; QUAD $0xc772484df162d0fe; QUAD $0x0dc7724825f16202; QUAD $0x6216c772481df162; QUAD $0x7d7362cf6f487e71; QUAD $0x4825d362e8c92548; QUAD $0xfe484dd16296f425; QUAD $0x62f0fe484dd162f1; QUAD $0x51623a626f487e71; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x62c4fe485d51623b; QUAD $0x2df162c96f487e71; QUAD $0x4825f16206c17248; QUAD $0x72481df1620bc172; QUAD $0xcb25486d736219c1; QUAD $0x62c3fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xc0fe487dd162c2fe; QUAD $0x6202c572485df162; QUAD $0xf1620dc5724825f1; QUAD $0x7e716216c572481d; QUAD $0x25484d7362cd6f48; QUAD $0xe4254825d362e8cf; QUAD $0x62e1fe485dd16296; QUAD $0x7e7162e0fe485dd1; QUAD $0x486551623c626f48; QUAD $0xc86f487e7162c4fe; QUAD $0x6206c072482df162; QUAD $0xf1620bc0724825f1; QUAD $0x75736219c072481d; QUAD $0x483d1162caca2548; QUAD $0xd42548255362c4fe; QUAD $0x62c1fe483d516296; QUAD $0x45d162c2fe483d51; QUAD $0x724865f162f8fe48; QUAD $0xc4724825f16202c4; QUAD $0x16c472481df1620d; QUAD $0x7362cc6f487e7162; QUAD $0x25d362e8ce254855; QUAD $0x4865d16296dc2548; QUAD $0xd8fe4865d162d9fe; QUAD $0x623d626f487e7162; QUAD $0x7e7162c4fe486d51; QUAD $0x72482df162cf6f48; QUAD $0xc7724825f16206c7; QUAD $0x19c772481df1620b; QUAD $0x62cac925487d7362; QUAD $0x255362c5fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162f0fe484dd162; QUAD $0x25f16202c372486d; QUAD $0x481df1620dc37248; QUAD $0x6f487e716216c372; QUAD $0xe8cd25485d7362cb; QUAD $0x6296d4254825d362; QUAD $0x6dd162d1fe486dd1; QUAD $0x6f487e7162d0fe48; QUAD 
$0xc4fe487551623e62; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x7d51623f626f487e; QUAD $0x6f487e7162c4fe48; QUAD $0x06c572482df162cd; QUAD $0x620bc5724825f162; QUAD $0x736219c572481df1; QUAD $0x3d1162cacf25484d; QUAD $0x2548255362c7fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x487df162e0fe485d; QUAD $0x724825f16202c172; QUAD $0xc172481df1620dc1; QUAD $0x62c96f487e716216; QUAD $0xd362e8cb25486d73; QUAD $0x7dd16296c4254825; QUAD $0xfe487dd162c1fe48; QUAD $0x40626f487e7162c0; QUAD $0xd162d86f487e7162; QUAD $0x7dd16224046f487e; QUAD $0x6f487e7162c3fe49; QUAD $0x244c6f487ed162d9; QUAD $0x62cbfe4975d16201; QUAD $0x7ed162da6f487e71; QUAD $0x6dd1620224546f48; QUAD $0x6f487e7162d3fe49; QUAD $0x245c6f487ed162db; QUAD $0x62dbfe4965d16203; QUAD $0x7ed162dc6f487e71; QUAD $0x5dd1620424646f48; QUAD $0x6f487e7162e3fe49; QUAD $0x246c6f487ed162dd; QUAD $0x62ebfe4955d16205; QUAD $0x7ed162de6f487e71; QUAD $0x4dd1620624746f48; QUAD $0x6f487e7162f3fe49; QUAD $0x247c6f487ed162df; QUAD $0x62fbfe4945d16207; QUAD $0x7ef162077f487ef1; QUAD $0x487ef162014f7f48; QUAD $0x7f487ef16202577f; QUAD $0x677f487ef162035f; QUAD $0x056f7f487ef16204; QUAD $0x6206777f487ef162; LONG $0x7f487ef1; WORD $0x077f
202 VZEROUPPER
203 RET
// PSHUFFLE_BYTE_FLIP_MASK is a 64-byte file-local (<>) constant table made of
// eight 8-byte words: the 16-byte pattern below repeated four times.
// Stored little-endian, each 16-byte group reads as the byte indices
//   3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
// i.e. the bytes of every 32-bit word in reverse order.
// NOTE(review): presumably used as a byte-shuffle (VPSHUFB-style) control to
// convert big-endian SHA-256 message words to host order; the consuming
// instruction is machine-encoded, so confirm by decoding it.
204 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x000(SB)/8, $0x0405060700010203
205 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x008(SB)/8, $0x0c0d0e0f08090a0b
206 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x010(SB)/8, $0x0405060700010203
207 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x018(SB)/8, $0x0c0d0e0f08090a0b
208 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x020(SB)/8, $0x0405060700010203
209 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x028(SB)/8, $0x0c0d0e0f08090a0b
210 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x030(SB)/8, $0x0405060700010203
211 DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x038(SB)/8, $0x0c0d0e0f08090a0b
212 GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), 8, $64 // flag 8 is RODATA in Go's textflag.h; size is 64 bytes
// PSHUFFLE_TRANSPOSE16_MASK1 is a 64-byte file-local (<>) constant table of
// eight 64-bit values: 0, 1, 8, 9, 4, 5, 12, 13.
// NOTE(review): these look like quadword selectors for a two-source permute
// (e.g. VPERMI2Q, where 0-7 would pick from one source and 8-15 from the
// other) used while transposing the 16 input lanes; the consuming instruction
// is machine-encoded, so confirm by decoding it.
213 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x000(SB)/8, $0x0000000000000000
214 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x008(SB)/8, $0x0000000000000001
215 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x010(SB)/8, $0x0000000000000008
216 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x018(SB)/8, $0x0000000000000009
217 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x020(SB)/8, $0x0000000000000004
218 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x028(SB)/8, $0x0000000000000005
219 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x030(SB)/8, $0x000000000000000C
220 DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x038(SB)/8, $0x000000000000000D
221 GLOBL PSHUFFLE_TRANSPOSE16_MASK1<>(SB), 8, $64 // flag 8 is RODATA in Go's textflag.h; size is 64 bytes
// PSHUFFLE_TRANSPOSE16_MASK2 is a 64-byte file-local (<>) constant table of
// eight 64-bit values: 2, 3, 10, 11, 6, 7, 14, 15 — each entry is the
// corresponding PSHUFFLE_TRANSPOSE16_MASK1 entry plus 2.
// NOTE(review): these look like quadword selectors for a two-source permute
// (e.g. VPERMI2Q) picking the complementary quadword pairs during the 16-lane
// transpose; the consuming instruction is machine-encoded, so confirm by
// decoding it.
222 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x000(SB)/8, $0x0000000000000002
223 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x008(SB)/8, $0x0000000000000003
224 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x010(SB)/8, $0x000000000000000A
225 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x018(SB)/8, $0x000000000000000B
226 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x020(SB)/8, $0x0000000000000006
227 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x028(SB)/8, $0x0000000000000007
228 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x030(SB)/8, $0x000000000000000E
229 DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x038(SB)/8, $0x000000000000000F
230 GLOBL PSHUFFLE_TRANSPOSE16_MASK2<>(SB), 8, $64 // flag 8 is RODATA in Go's textflag.h; size is 64 bytes