Feature/avx512 support (#30)
* Initial implementation of 16x parallel support for SHA256 hashing on AVX512
* Updated tests
* Add support for detection of AVX512 capabilities
* Add Write support for arbitrary blocks and proper length adding for Sum
* Fix test and remove formatting
* Remove old comments
* Cache final digest on client
* Updated version with more optimized assembly listing/formatting
Frank Wessels authored 6 years ago
GitHub committed 6 years ago
15 | 15 | package sha256 |
16 | 16 | |
17 | 17 | // True when SIMD instructions are available. |
18 | var avx512 = haveAVX512() | |
18 | 19 | var avx2 = haveAVX2() |
19 | 20 | var avx = haveAVX() |
20 | 21 | var ssse3 = haveSSSE3() |
45 | 46 | return false |
46 | 47 | } |
47 | 48 | |
49 | // haveAVX512 returns true when there is AVX512 support | |
50 | func haveAVX512() bool { | |
51 | mfi, _, _, _ := cpuid(0) | |
52 | ||
53 | // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. | |
54 | if mfi >= 7 { | |
55 | _, _, c, _ := cpuid(1) | |
56 | ||
57 | // Only detect AVX-512 features if XGETBV is supported | |
58 | if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { | |
59 | // Check for OS support | |
60 | eax, _ := xgetbv(0) | |
61 | _, ebx, _, _ := cpuidex(7, 0) | |
62 | ||
63 | // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and | |
64 | // ZMM16-ZMM31 state are enabled by OS) | |
65 | /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS). | |
66 | if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 { | |
67 | if ebx&(1<<16) == 0 { | |
68 | return false // no AVX512F | |
69 | } | |
70 | if ebx&(1<<17) == 0 { | |
71 | return false // no AVX512DQ | |
72 | } | |
73 | if ebx&(1<<30) == 0 { | |
74 | return false // no AVX512BW | |
75 | } | |
76 | if ebx&(1<<31) == 0 { | |
77 | return false // no AVX512VL | |
78 | } | |
79 | return true | |
80 | } | |
81 | } | |
82 | } | |
83 | return false | |
84 | } | |
85 | ||
48 | 86 | // haveSSSE3 returns true when there is SSSE3 support |
49 | 87 | func haveSSSE3() bool { |
50 | 88 |
49 | 49 | package sha256 |
50 | 50 | |
51 | 51 | import ( |
52 | "bytes" | |
53 | "encoding/binary" | |
52 | 54 | "encoding/hex" |
53 | 55 | "fmt" |
56 | "hash" | |
57 | "reflect" | |
58 | "strings" | |
59 | "sync" | |
54 | 60 | "testing" |
55 | 61 | ) |
56 | 62 | |
159 | 165 | (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128), |
160 | 166 | (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128)}, |
161 | 167 | "ab"}, |
162 | {[32]byte{(0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128), | |
163 | (0 * 1) + (0 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (0 * 128), | |
164 | (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128), | |
165 | (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128), | |
166 | (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (1 * 128), | |
167 | (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128), | |
168 | (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (1 * 128), | |
169 | (0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (1 * 128), | |
170 | (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (0 * 128), | |
171 | (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (0 * 128), | |
172 | (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (1 * 64) + (0 * 128), | |
173 | (0 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (0 * 32) + (1 * 64) + (1 * 128), | |
174 | (1 * 1) + (0 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (0 * 32) + (1 * 64) + (0 * 128), | |
175 | (0 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (1 * 128), | |
176 | (0 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (0 * 128), | |
177 | (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (0 * 128), | |
178 | (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128), | |
179 | (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128), | |
180 | (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (0 * 128), | |
181 | (1 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (1 * 128), | |
182 | (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (1 * 128), | |
183 | (1 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128), | |
184 | (0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (0 * 128), | |
185 | (0 * 1) + (0 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (1 * 128), | |
186 | (0 * 1) + (0 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128), | |
187 | (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128), | |
188 | (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (1 * 128), | |
189 | (1 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (0 * 128), | |
190 | (0 * 1) + (1 * 2) + (0 * 4) + (0 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (1 * 128), | |
191 | (0 * 1) + (0 * 2) + (0 * 4) + (0 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (0 * 128), | |
192 | (1 * 1) + (0 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128), | |
193 | (1 * 1) + (0 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (1 * 32) + (0 * 64) + (1 * 128)}, | |
194 | "abc"}, | |
195 | 168 | {[32]byte{(0 * 1) + (1 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (0 * 64) + (1 * 128), |
196 | 169 | (0 * 1) + (0 * 2) + (0 * 4) + (1 * 8) + (1 * 16) + (1 * 32) + (1 * 64) + (0 * 128), |
197 | 170 | (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (1 * 16) + (0 * 32) + (0 * 64) + (0 * 128), |
1116 | 1089 | (1 * 1) + (1 * 2) + (1 * 4) + (1 * 8) + (0 * 16) + (0 * 32) + (0 * 64) + (1 * 128), |
1117 | 1090 | (0 * 1) + (1 * 2) + (1 * 4) + (0 * 8) + (0 * 16) + (1 * 32) + (1 * 64) + (1 * 128)}, |
1118 | 1091 | "How can you write a big system without C++? -Paul Glick"}, |
1092 | // $ echo -n "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123" | sha256sum | |
1093 | // 13d8b6bf5cc79c03c07c719c48597bd33b79677e65098589b1580fca7f22bb22 | |
1094 | {[32]byte{0x13, 0xd8, 0xb6, 0xbf, 0x5c, 0xc7, 0x9c, 0x03, | |
1095 | 0xc0, 0x7c, 0x71, 0x9c, 0x48, 0x59, 0x7b, 0xd3, | |
1096 | 0x3b, 0x79, 0x67, 0x7e, 0x65, 0x09, 0x85, 0x89, | |
1097 | 0xb1, 0x58, 0x0f, 0xca, 0x7f, 0x22, 0xbb, 0x22}, | |
1098 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123"}, | |
1099 | // $ echo -n "BCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234" | sha256sum | |
1100 | // 624ddef3009879c6874da2dd771d54f7330781b60e1955ceff5f9dce8bf4ea43 | |
1101 | {[32]byte{0x62, 0x4d, 0xde, 0xf3, 0x00, 0x98, 0x79, 0xc6, | |
1102 | 0x87, 0x4d, 0xa2, 0xdd, 0x77, 0x1d, 0x54, 0xf7, | |
1103 | 0x33, 0x07, 0x81, 0xb6, 0x0e, 0x19, 0x55, 0xce, | |
1104 | 0xff, 0x5f, 0x9d, 0xce, 0x8b, 0xf4, 0xea, 0x43}, | |
1105 | "BCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234"}, | |
1106 | // $ echo -n "CDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345" | sha256sum | |
1107 | // cc031589b70dd4b24dc6def2121835ef1aa8074ff6952cdd3f81b5099a93c58d | |
1108 | {[32]byte{0xcc, 0x03, 0x15, 0x89, 0xb7, 0x0d, 0xd4, 0xb2, | |
1109 | 0x4d, 0xc6, 0xde, 0xf2, 0x12, 0x18, 0x35, 0xef, | |
1110 | 0x1a, 0xa8, 0x07, 0x4f, 0xf6, 0x95, 0x2c, 0xdd, | |
1111 | 0x3f, 0x81, 0xb5, 0x09, 0x9a, 0x93, 0xc5, 0x8d}, | |
1112 | "CDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345"}, | |
1113 | // $ echo -n "DEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456" | sha256sum | |
1114 | // d354abb6d538402db3d73daf95537a255ebaf3a943c80205be163e044fc46a70 | |
1115 | {[32]byte{0xd3, 0x54, 0xab, 0xb6, 0xd5, 0x38, 0x40, 0x2d, | |
1116 | 0xb3, 0xd7, 0x3d, 0xaf, 0x95, 0x53, 0x7a, 0x25, | |
1117 | 0x5e, 0xba, 0xf3, 0xa9, 0x43, 0xc8, 0x02, 0x05, | |
1118 | 0xbe, 0x16, 0x3e, 0x04, 0x4f, 0xc4, 0x6a, 0x70}, | |
1119 | "DEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456"}, | |
1120 | // $ echo -n "EFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567" | sha256sum | |
1121 | // f78410b90a20b521afb28f41d6388482afab7265ff8884aa6290cc9f9ada30d3 | |
1122 | {[32]byte{0xf7, 0x84, 0x10, 0xb9, 0x0a, 0x20, 0xb5, 0x21, | |
1123 | 0xaf, 0xb2, 0x8f, 0x41, 0xd6, 0x38, 0x84, 0x82, | |
1124 | 0xaf, 0xab, 0x72, 0x65, 0xff, 0x88, 0x84, 0xaa, | |
1125 | 0x62, 0x90, 0xcc, 0x9f, 0x9a, 0xda, 0x30, 0xd3}, | |
1126 | "EFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567"}, | |
1127 | // $ echo -n "FGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345678" | sha256sum | |
1128 | // c93a8cb7ed80166b15b79c8617410ca69e46fa1e3c1d14876699d3ce6090384f | |
1129 | {[32]byte{0xc9, 0x3a, 0x8c, 0xb7, 0xed, 0x80, 0x16, 0x6b, | |
1130 | 0x15, 0xb7, 0x9c, 0x86, 0x17, 0x41, 0x0c, 0xa6, | |
1131 | 0x9e, 0x46, 0xfa, 0x1e, 0x3c, 0x1d, 0x14, 0x87, | |
1132 | 0x66, 0x99, 0xd3, 0xce, 0x60, 0x90, 0x38, 0x4f}, | |
1133 | "FGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12345678"}, | |
1134 | // $ echo -n "GHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456789" | sha256sum | |
1135 | // 6cb808e9a7fb53fa680824f08554b660d29a4afc9a101f990b4bae3a12b7fbd8 | |
1136 | {[32]byte{0x6c, 0xb8, 0x08, 0xe9, 0xa7, 0xfb, 0x53, 0xfa, | |
1137 | 0x68, 0x08, 0x24, 0xf0, 0x85, 0x54, 0xb6, 0x60, | |
1138 | 0xd2, 0x9a, 0x4a, 0xfc, 0x9a, 0x10, 0x1f, 0x99, | |
1139 | 0x0b, 0x4b, 0xae, 0x3a, 0x12, 0xb7, 0xfb, 0xd8}, | |
1140 | "GHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456789"}, | |
1141 | // $ echo -n "HIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890" | sha256sum | |
1142 | // 84e8dd1afa78db222860ed40b6fcfc7a269469365f81f5712fb589555bdb01fe | |
1143 | {[32]byte{0x84, 0xe8, 0xdd, 0x1a, 0xfa, 0x78, 0xdb, 0x22, | |
1144 | 0x28, 0x60, 0xed, 0x40, 0xb6, 0xfc, 0xfc, 0x7a, | |
1145 | 0x26, 0x94, 0x69, 0x36, 0x5f, 0x81, 0xf5, 0x71, | |
1146 | 0x2f, 0xb5, 0x89, 0x55, 0x5b, 0xdb, 0x01, 0xfe}, | |
1147 | "HIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"}, | |
1148 | // $ echo -n "IJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890A" | sha256sum | |
1149 | // accab8e85b6bd178e975aaaa354aed8258bcd6af3e61bd4f12267635856cab0b | |
1150 | {[32]byte{0xac, 0xca, 0xb8, 0xe8, 0x5b, 0x6b, 0xd1, 0x78, | |
1151 | 0xe9, 0x75, 0xaa, 0xaa, 0x35, 0x4a, 0xed, 0x82, | |
1152 | 0x58, 0xbc, 0xd6, 0xaf, 0x3e, 0x61, 0xbd, 0x4f, | |
1153 | 0x12, 0x26, 0x76, 0x35, 0x85, 0x6c, 0xab, 0x0b}, | |
1154 | "IJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890A"}, | |
1155 | // $ echo -n "JKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890AB" | sha256sum | |
1156 | // 107f5ad8bc5d427246fc5f9c581134b61d8ba447e877df56cddad2bf53789172 | |
1157 | {[32]byte{0x10, 0x7f, 0x5a, 0xd8, 0xbc, 0x5d, 0x42, 0x72, | |
1158 | 0x46, 0xfc, 0x5f, 0x9c, 0x58, 0x11, 0x34, 0xb6, | |
1159 | 0x1d, 0x8b, 0xa4, 0x47, 0xe8, 0x77, 0xdf, 0x56, | |
1160 | 0xcd, 0xda, 0xd2, 0xbf, 0x53, 0x78, 0x91, 0x72}, | |
1161 | "JKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890AB"}, | |
1162 | // $ echo -n "KLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABC" | sha256sum | |
1163 | // 7666f65b234f78aa537c8d098b181091ce8b7866a0285b52e6bf31b6f21ca9bb | |
1164 | {[32]byte{0x76, 0x66, 0xf6, 0x5b, 0x23, 0x4f, 0x78, 0xaa, | |
1165 | 0x53, 0x7c, 0x8d, 0x09, 0x8b, 0x18, 0x10, 0x91, | |
1166 | 0xce, 0x8b, 0x78, 0x66, 0xa0, 0x28, 0x5b, 0x52, | |
1167 | 0xe6, 0xbf, 0x31, 0xb6, 0xf2, 0x1c, 0xa9, 0xbb}, | |
1168 | "KLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABC"}, | |
1169 | // $ echo -n "LMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCD" | sha256sum | |
1170 | // 4eba948ccee7289ab1f01628a1ab756dee39a6894aed217edc9a91a8b35e50ca | |
1171 | {[32]byte{0x4e, 0xba, 0x94, 0x8c, 0xce, 0xe7, 0x28, 0x9a, | |
1172 | 0xb1, 0xf0, 0x16, 0x28, 0xa1, 0xab, 0x75, 0x6d, | |
1173 | 0xee, 0x39, 0xa6, 0x89, 0x4a, 0xed, 0x21, 0x7e, | |
1174 | 0xdc, 0x9a, 0x91, 0xa8, 0xb3, 0x5e, 0x50, 0xca}, | |
1175 | "LMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCD"}, | |
1176 | // $ echo -n "MNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDE" | sha256sum | |
1177 | // 5011218873e7ca84871668d26461e449e7033b7959d69cfb5c2fee773c3d432d | |
1178 | {[32]byte{0x50, 0x11, 0x21, 0x88, 0x73, 0xe7, 0xca, 0x84, | |
1179 | 0x87, 0x16, 0x68, 0xd2, 0x64, 0x61, 0xe4, 0x49, | |
1180 | 0xe7, 0x03, 0x3b, 0x79, 0x59, 0xd6, 0x9c, 0xfb, | |
1181 | 0x5c, 0x2f, 0xee, 0x77, 0x3c, 0x3d, 0x43, 0x2d}, | |
1182 | "MNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDE"}, | |
1183 | // $ echo -n "NOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEF" | sha256sum | |
1184 | // 6932b4ddaf3696e5d5270739bdbe6ab120bb8034b877bd3a8e5a5d5ca263e1c5 | |
1185 | {[32]byte{0x69, 0x32, 0xb4, 0xdd, 0xaf, 0x36, 0x96, 0xe5, | |
1186 | 0xd5, 0x27, 0x07, 0x39, 0xbd, 0xbe, 0x6a, 0xb1, | |
1187 | 0x20, 0xbb, 0x80, 0x34, 0xb8, 0x77, 0xbd, 0x3a, | |
1188 | 0x8e, 0x5a, 0x5d, 0x5c, 0xa2, 0x63, 0xe1, 0xc5}, | |
1189 | "NOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEF"}, | |
1190 | // $ echo -n "OPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFG" | sha256sum | |
1191 | // 91bb1bcbfcb4c093aab255a0b8c8b5b93605e2f51dd6b0898b70b9f3c10fc1f9 | |
1192 | {[32]byte{0x91, 0xbb, 0x1b, 0xcb, 0xfc, 0xb4, 0xc0, 0x93, | |
1193 | 0xaa, 0xb2, 0x55, 0xa0, 0xb8, 0xc8, 0xb5, 0xb9, | |
1194 | 0x36, 0x05, 0xe2, 0xf5, 0x1d, 0xd6, 0xb0, 0x89, | |
1195 | 0x8b, 0x70, 0xb9, 0xf3, 0xc1, 0x0f, 0xc1, 0xf9}, | |
1196 | "OPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFG"}, | |
1197 | // $ echo -n "PQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFGH" | sha256sum | |
1198 | // 0d1fa5355388e361c4591bd49c004e3d99044be274db43e91036611365aead02 | |
1199 | {[32]byte{0x0d, 0x1f, 0xa5, 0x35, 0x53, 0x88, 0xe3, 0x61, | |
1200 | 0xc4, 0x59, 0x1b, 0xd4, 0x9c, 0x00, 0x4e, 0x3d, | |
1201 | 0x99, 0x04, 0x4b, 0xe2, 0x74, 0xdb, 0x43, 0xe9, | |
1202 | 0x10, 0x36, 0x61, 0x13, 0x65, 0xae, 0xad, 0x02}, | |
1203 | "PQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890ABCDEFGH"}, | |
1204 | // $ echo -n "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" | sha256sum | |
1205 | // b6ac3cc10386331c765f04f041c147d0f278f2aed8eaa021e2d0057fc6f6ff9e | |
1206 | {[32]byte{0xb6, 0xac, 0x3c, 0xc1, 0x03, 0x86, 0x33, 0x1c, | |
1207 | 0x76, 0x5f, 0x04, 0xf0, 0x41, 0xc1, 0x47, 0xd0, | |
1208 | 0xf2, 0x78, 0xf2, 0xae, 0xd8, 0xea, 0xa0, 0x21, | |
1209 | 0xe2, 0xd0, 0x05, 0x7f, 0xc6, 0xf6, 0xff, 0x9e}, | |
1210 | strings.Repeat("A", 128)}, | |
1211 | // $ echo -n "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB" | sha256sum | |
1212 | // 7abaa701a6f4bb8d9ea3872a315597eb6f2ccfd03392d8d10560837f6136d06a | |
1213 | {[32]byte{0x7a, 0xba, 0xa7, 0x01, 0xa6, 0xf4, 0xbb, 0x8d, | |
1214 | 0x9e, 0xa3, 0x87, 0x2a, 0x31, 0x55, 0x97, 0xeb, | |
1215 | 0x6f, 0x2c, 0xcf, 0xd0, 0x33, 0x92, 0xd8, 0xd1, | |
1216 | 0x05, 0x60, 0x83, 0x7f, 0x61, 0x36, 0xd0, 0x6a}, | |
1217 | strings.Repeat("B", 128)}, | |
1218 | // $ echo -n "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC" | sha256sum | |
1219 | // 6e8b9325f779dba60c4c148dee5ded43b19ed20d25d66e338abec53b99174fe8 | |
1220 | {[32]byte{0x6e, 0x8b, 0x93, 0x25, 0xf7, 0x79, 0xdb, 0xa6, | |
1221 | 0x0c, 0x4c, 0x14, 0x8d, 0xee, 0x5d, 0xed, 0x43, | |
1222 | 0xb1, 0x9e, 0xd2, 0x0d, 0x25, 0xd6, 0x6e, 0x33, | |
1223 | 0x8a, 0xbe, 0xc5, 0x3b, 0x99, 0x17, 0x4f, 0xe8}, | |
1224 | strings.Repeat("C", 128)}, | |
1225 | // $ echo -n "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD" | sha256sum | |
1226 | // 7aa020c91ac4d32e17efd9b64648b92e375987e0eae7d0a58544ca1e4fc32c3c | |
1227 | {[32]byte{0x7a, 0xa0, 0x20, 0xc9, 0x1a, 0xc4, 0xd3, 0x2e, | |
1228 | 0x17, 0xef, 0xd9, 0xb6, 0x46, 0x48, 0xb9, 0x2e, | |
1229 | 0x37, 0x59, 0x87, 0xe0, 0xea, 0xe7, 0xd0, 0xa5, | |
1230 | 0x85, 0x44, 0xca, 0x1e, 0x4f, 0xc3, 0x2c, 0x3c}, | |
1231 | strings.Repeat("D", 128)}, | |
1232 | // $ echo -n "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE" | sha256sum | |
1233 | // 997f6a2fc44f1400e9f64d7eac11fe99e21f4b7a3fc2ff3ec95c2ef016abb9e5 | |
1234 | {[32]byte{0x99, 0x7f, 0x6a, 0x2f, 0xc4, 0x4f, 0x14, 0x00, | |
1235 | 0xe9, 0xf6, 0x4d, 0x7e, 0xac, 0x11, 0xfe, 0x99, | |
1236 | 0xe2, 0x1f, 0x4b, 0x7a, 0x3f, 0xc2, 0xff, 0x3e, | |
1237 | 0xc9, 0x5c, 0x2e, 0xf0, 0x16, 0xab, 0xb9, 0xe5}, | |
1238 | strings.Repeat("E", 128)}, | |
1239 | // $ echo -n "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" | sha256sum | |
1240 | // 5c6cdeb9ccaa1d9c57662605ab738ec4ecf0467f576d4c2d7fae48710215582a | |
1241 | {[32]byte{0x5c, 0x6c, 0xde, 0xb9, 0xcc, 0xaa, 0x1d, 0x9c, | |
1242 | 0x57, 0x66, 0x26, 0x05, 0xab, 0x73, 0x8e, 0xc4, | |
1243 | 0xec, 0xf0, 0x46, 0x7f, 0x57, 0x6d, 0x4c, 0x2d, | |
1244 | 0x7f, 0xae, 0x48, 0x71, 0x02, 0x15, 0x58, 0x2a}, | |
1245 | strings.Repeat("F", 128)}, | |
1246 | // $ echo -n "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG" | sha256sum | |
1247 | // 394394b5f0e91a21d1e932f9ed55e098c8b05f3668f77134eeee843fef1d1758 | |
1248 | {[32]byte{0x39, 0x43, 0x94, 0xb5, 0xf0, 0xe9, 0x1a, 0x21, | |
1249 | 0xd1, 0xe9, 0x32, 0xf9, 0xed, 0x55, 0xe0, 0x98, | |
1250 | 0xc8, 0xb0, 0x5f, 0x36, 0x68, 0xf7, 0x71, 0x34, | |
1251 | 0xee, 0xee, 0x84, 0x3f, 0xef, 0x1d, 0x17, 0x58}, | |
1252 | strings.Repeat("G", 128)}, | |
1253 | // $ echo -n "HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH" | sha256sum | |
1254 | // cab546612de68eaa849487342baadbac2561df6380ddac66137ef649e0cdfd0a | |
1255 | {[32]byte{0xca, 0xb5, 0x46, 0x61, 0x2d, 0xe6, 0x8e, 0xaa, | |
1256 | 0x84, 0x94, 0x87, 0x34, 0x2b, 0xaa, 0xdb, 0xac, | |
1257 | 0x25, 0x61, 0xdf, 0x63, 0x80, 0xdd, 0xac, 0x66, | |
1258 | 0x13, 0x7e, 0xf6, 0x49, 0xe0, 0xcd, 0xfd, 0x0a}, | |
1259 | strings.Repeat("H", 128)}, | |
1260 | // $ echo -n "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII" | sha256sum | |
1261 | // 2be96cc28445876429be3005db465d1b9c8ed1432e3ac6f1514b6e9eee725ad8 | |
1262 | {[32]byte{0x2b, 0xe9, 0x6c, 0xc2, 0x84, 0x45, 0x87, 0x64, | |
1263 | 0x29, 0xbe, 0x30, 0x05, 0xdb, 0x46, 0x5d, 0x1b, | |
1264 | 0x9c, 0x8e, 0xd1, 0x43, 0x2e, 0x3a, 0xc6, 0xf1, | |
1265 | 0x51, 0x4b, 0x6e, 0x9e, 0xee, 0x72, 0x5a, 0xd8}, | |
1266 | strings.Repeat("I", 128)}, | |
1267 | // $ echo -n "JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ" | sha256sum | |
1268 | // 238e5f81d54f2af58049b944c4a1b9516a36c2ef1e20887450b3482045714444 | |
1269 | {[32]byte{0x23, 0x8e, 0x5f, 0x81, 0xd5, 0x4f, 0x2a, 0xf5, | |
1270 | 0x80, 0x49, 0xb9, 0x44, 0xc4, 0xa1, 0xb9, 0x51, | |
1271 | 0x6a, 0x36, 0xc2, 0xef, 0x1e, 0x20, 0x88, 0x74, | |
1272 | 0x50, 0xb3, 0x48, 0x20, 0x45, 0x71, 0x44, 0x44}, | |
1273 | strings.Repeat("J", 128)}, | |
1274 | // $ echo -n "KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK" | sha256sum | |
1275 | // f3a5b826c64951661ce22dc67f0f79d13f633f0601aca2f5e1cf1a9f17dffd4f | |
1276 | {[32]byte{0xf3, 0xa5, 0xb8, 0x26, 0xc6, 0x49, 0x51, 0x66, | |
1277 | 0x1c, 0xe2, 0x2d, 0xc6, 0x7f, 0x0f, 0x79, 0xd1, | |
1278 | 0x3f, 0x63, 0x3f, 0x06, 0x01, 0xac, 0xa2, 0xf5, | |
1279 | 0xe1, 0xcf, 0x1a, 0x9f, 0x17, 0xdf, 0xfd, 0x4f}, | |
1280 | strings.Repeat("K", 128)}, | |
1281 | // $ echo -n "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL" | sha256sum | |
1282 | // 1e90c05bedd24dc3e297d5b8fb215b95d8b7f4a040ee912069614c7a3382725d | |
1283 | {[32]byte{0x1e, 0x90, 0xc0, 0x5b, 0xed, 0xd2, 0x4d, 0xc3, | |
1284 | 0xe2, 0x97, 0xd5, 0xb8, 0xfb, 0x21, 0x5b, 0x95, | |
1285 | 0xd8, 0xb7, 0xf4, 0xa0, 0x40, 0xee, 0x91, 0x20, | |
1286 | 0x69, 0x61, 0x4c, 0x7a, 0x33, 0x82, 0x72, 0x5d}, | |
1287 | strings.Repeat("L", 128)}, | |
1288 | // $ echo -n "MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM" | sha256sum | |
1289 | // 96239ac6fb99822797308f18d8455778fb5885103aa5ff59afe2219df657df99 | |
1290 | {[32]byte{0x96, 0x23, 0x9a, 0xc6, 0xfb, 0x99, 0x82, 0x27, | |
1291 | 0x97, 0x30, 0x8f, 0x18, 0xd8, 0x45, 0x57, 0x78, | |
1292 | 0xfb, 0x58, 0x85, 0x10, 0x3a, 0xa5, 0xff, 0x59, | |
1293 | 0xaf, 0xe2, 0x21, 0x9d, 0xf6, 0x57, 0xdf, 0x99}, | |
1294 | strings.Repeat("M", 128)}, | |
1295 | // $ echo -n "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" | sha256sum | |
1296 | // 11e7f5a6f15a4addba9b6b21bc4f8ecbdd969e179335269fc68d3a05f0f3da4a | |
1297 | {[32]byte{0x11, 0xe7, 0xf5, 0xa6, 0xf1, 0x5a, 0x4a, 0xdd, | |
1298 | 0xba, 0x9b, 0x6b, 0x21, 0xbc, 0x4f, 0x8e, 0xcb, | |
1299 | 0xdd, 0x96, 0x9e, 0x17, 0x93, 0x35, 0x26, 0x9f, | |
1300 | 0xc6, 0x8d, 0x3a, 0x05, 0xf0, 0xf3, 0xda, 0x4a}, | |
1301 | strings.Repeat("N", 128)}, | |
1302 | // $ echo -n "OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO" | sha256sum | |
1303 | // ae843b7e4e00afeb972bf948a345b319cca8bd0bcaa1428c1c67c88ea663c1e0 | |
1304 | {[32]byte{0xae, 0x84, 0x3b, 0x7e, 0x4e, 0x00, 0xaf, 0xeb, | |
1305 | 0x97, 0x2b, 0xf9, 0x48, 0xa3, 0x45, 0xb3, 0x19, | |
1306 | 0xcc, 0xa8, 0xbd, 0x0b, 0xca, 0xa1, 0x42, 0x8c, | |
1307 | 0x1c, 0x67, 0xc8, 0x8e, 0xa6, 0x63, 0xc1, 0xe0}, | |
1308 | strings.Repeat("O", 128)}, | |
1309 | // $ echo -n "PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP" | sha256sum | |
1310 | // f16ef3e254ffb74b7e3c97d99486ef8c549e4c80bc6dfed7fe8c5e7e76f4fbcd | |
1311 | {[32]byte{0xf1, 0x6e, 0xf3, 0xe2, 0x54, 0xff, 0xb7, 0x4b, | |
1312 | 0x7e, 0x3c, 0x97, 0xd9, 0x94, 0x86, 0xef, 0x8c, | |
1313 | 0x54, 0x9e, 0x4c, 0x80, 0xbc, 0x6d, 0xfe, 0xd7, | |
1314 | 0xfe, 0x8c, 0x5e, 0x7e, 0x76, 0xf4, 0xfb, 0xcd}, | |
1315 | strings.Repeat("P", 128)}, | |
1119 | 1316 | } |
1120 | 1317 | |
1121 | 1318 | func TestGolden(t *testing.T) { |
1127 | 1324 | } |
1128 | 1325 | } |
1129 | 1326 | |
1327 | func TestGoldenAVX512(t *testing.T) { | |
1328 | ||
1329 | if !avx512 { | |
1330 | t.SkipNow() | |
1331 | return | |
1332 | } | |
1333 | ||
1334 | server := NewAvx512Server() | |
1335 | h512 := NewAvx512(server) | |
1336 | ||
1337 | for _, g := range golden { | |
1338 | h512.Reset() | |
1339 | h512.Write([]byte(g.in)) | |
1340 | digest := h512.Sum([]byte{}) | |
1341 | s := fmt.Sprintf("%x", digest) | |
1342 | if !reflect.DeepEqual(digest, g.out[:]) { | |
1343 | t.Fatalf("Sum256 function: sha256(%s) = %s want %s", g.in, s, hex.EncodeToString(g.out[:])) | |
1344 | } | |
1345 | } | |
1346 | } | |
1347 | ||
1130 | 1348 | func TestSize(t *testing.T) { |
1131 | 1349 | c := New() |
1132 | 1350 | if got := c.Size(); got != Size { |
1141 | 1359 | } |
1142 | 1360 | } |
1143 | 1361 | |
1144 | var bench = New() | |
1145 | var buf = make([]byte, 1024*1024) | |
1146 | ||
1147 | 1362 | func benchmarkSize(b *testing.B, size int) { |
1363 | var bench = New() | |
1364 | var buf = make([]byte, size) | |
1148 | 1365 | b.SetBytes(int64(size)) |
1149 | 1366 | sum := make([]byte, bench.Size()) |
1150 | 1367 | for i := 0; i < b.N; i++ { |
1154 | 1371 | } |
1155 | 1372 | } |
1156 | 1373 | |
1157 | func BenchmarkHash8Bytes(b *testing.B) { | |
1158 | benchmarkSize(b, 8) | |
1159 | } | |
1160 | ||
1161 | func BenchmarkHash1K(b *testing.B) { | |
1162 | benchmarkSize(b, 1024) | |
1163 | } | |
1164 | ||
1165 | func BenchmarkHash8K(b *testing.B) { | |
1166 | benchmarkSize(b, 8192) | |
1167 | } | |
1168 | ||
1169 | func BenchmarkHash1M(b *testing.B) { | |
1170 | benchmarkSize(b, 1024*1024) | |
1171 | } | |
1374 | func BenchmarkHash8Bytes(b *testing.B) { benchmarkSize(b, 8) } | |
1375 | func BenchmarkHash1K(b *testing.B) { benchmarkSize(b, 1024) } | |
1376 | func BenchmarkHash8K(b *testing.B) { benchmarkSize(b, 8192) } | |
1377 | func BenchmarkHash1MAvx2(b *testing.B) { benchmarkSize(b, 1024*1024) } | |
1378 | func BenchmarkHash5MAvx2(b *testing.B) { benchmarkSize(b, 5*1024*1024) } | |
1379 | func BenchmarkHash10MAvx2(b *testing.B) { benchmarkSize(b, 10*1024*1024) } | |
1380 | ||
// createInputs returns 16 separately allocated, zero-filled byte slices,
// each of length size.
func createInputs(size int) [16][]byte {
	var lanes [16][]byte
	for i := range lanes {
		lanes[i] = make([]byte, size)
	}
	return lanes
}
1388 | ||
1389 | func initDigests() *[512]byte { | |
1390 | digests := [512]byte{} | |
1391 | for i := 0; i < 16; i++ { | |
1392 | binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0) | |
1393 | binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1) | |
1394 | binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2) | |
1395 | binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3) | |
1396 | binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4) | |
1397 | binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5) | |
1398 | binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6) | |
1399 | binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7) | |
1400 | } | |
1401 | return &digests | |
1402 | } | |
1403 | ||
1404 | func testSha256Avx512(t *testing.T, offset, padding int) [16][]byte { | |
1405 | ||
1406 | if !avx512 { | |
1407 | t.SkipNow() | |
1408 | return [16][]byte{} | |
1409 | } | |
1410 | ||
1411 | l := uint(len(golden[offset].in)) | |
1412 | extraBlock := uint(0) | |
1413 | if padding == 0 { | |
1414 | extraBlock += 9 | |
1415 | } else { | |
1416 | extraBlock += 64 | |
1417 | } | |
1418 | input := createInputs(int(l + extraBlock)) | |
1419 | for i := 0; i < 16; i++ { | |
1420 | copy(input[i], golden[offset+i].in) | |
1421 | input[i][l] = 0x80 | |
1422 | copy(input[i][l+1:], bytes.Repeat([]byte{0}, padding)) | |
1423 | ||
1424 | // Length in bits. | |
1425 | len := uint64(l) | |
1426 | len <<= 3 | |
1427 | for ii := uint(0); ii < 8; ii++ { | |
1428 | input[i][l+1+uint(padding)+ii] = byte(len >> (56 - 8*ii)) | |
1429 | } | |
1430 | } | |
1431 | mask := make([]uint64, len(input[0])>>6) | |
1432 | for m := range mask { | |
1433 | mask[m] = 0xffff | |
1434 | } | |
1435 | output := blockAvx512(initDigests(), input, mask) | |
1436 | for i := 0; i < 16; i++ { | |
1437 | if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 { | |
1438 | t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[i][:]), hex.EncodeToString(golden[offset+i].out[:])) | |
1439 | } | |
1440 | } | |
1441 | return input | |
1442 | } | |
1443 | ||
1444 | func TestAvx512_1Block(t *testing.T) { testSha256Avx512(t, 31, 0) } | |
1445 | func TestAvx512_3Blocks(t *testing.T) { testSha256Avx512(t, 47, 55) } | |
1446 | ||
1447 | func TestAvx512_MixedBlocks(t *testing.T) { | |
1448 | ||
1449 | if !avx512 { | |
1450 | t.SkipNow() | |
1451 | return | |
1452 | } | |
1453 | ||
1454 | inputSingleBlock := testSha256Avx512(t, 31, 0) | |
1455 | inputMultiBlock := testSha256Avx512(t, 47, 55) | |
1456 | ||
1457 | input := [16][]byte{} | |
1458 | ||
1459 | for i := range input { | |
1460 | if i%2 == 0 { | |
1461 | input[i] = inputMultiBlock[i] | |
1462 | } else { | |
1463 | input[i] = inputSingleBlock[i] | |
1464 | } | |
1465 | } | |
1466 | ||
1467 | mask := [3]uint64{0xffff, 0x5555, 0x5555} | |
1468 | output := blockAvx512(initDigests(), input, mask[:]) | |
1469 | var offset int | |
1470 | for i := 0; i < len(output); i++ { | |
1471 | if i%2 == 0 { | |
1472 | offset = 47 | |
1473 | } else { | |
1474 | offset = 31 | |
1475 | } | |
1476 | if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 { | |
1477 | t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[i][:]), hex.EncodeToString(golden[offset+i].out[:])) | |
1478 | } | |
1479 | } | |
1480 | } | |
1481 | ||
1482 | func TestAvx512_MixedWithNilBlocks(t *testing.T) { | |
1483 | ||
1484 | if !avx512 { | |
1485 | t.SkipNow() | |
1486 | return | |
1487 | } | |
1488 | ||
1489 | inputSingleBlock := testSha256Avx512(t, 31, 0) | |
1490 | inputMultiBlock := testSha256Avx512(t, 47, 55) | |
1491 | ||
1492 | input := [16][]byte{} | |
1493 | ||
1494 | for i := range input { | |
1495 | if i%3 == 0 { | |
1496 | input[i] = inputMultiBlock[i] | |
1497 | } else if i%3 == 1 { | |
1498 | input[i] = inputSingleBlock[i] | |
1499 | } else { | |
1500 | input[i] = nil | |
1501 | } | |
1502 | } | |
1503 | ||
1504 | mask := [3]uint64{0xb6db, 0x9249, 0x9249} | |
1505 | output := blockAvx512(initDigests(), input, mask[:]) | |
1506 | var offset int | |
1507 | for i := 0; i < len(output); i++ { | |
1508 | if i%3 == 2 { // for nil inputs | |
1509 | initvec := [32]byte{0x6a, 0x09, 0xe6, 0x67, 0xbb, 0x67, 0xae, 0x85, | |
1510 | 0x3c, 0x6e, 0xf3, 0x72, 0xa5, 0x4f, 0xf5, 0x3a, | |
1511 | 0x51, 0x0e, 0x52, 0x7f, 0x9b, 0x05, 0x68, 0x8c, | |
1512 | 0x1f, 0x83, 0xd9, 0xab, 0x5b, 0xe0, 0xcd, 0x19} | |
1513 | if bytes.Compare(output[i][:], initvec[:]) != 0 { | |
1514 | t.Fatalf("Sum256 function: sha256 for nil vector = %s want %s", hex.EncodeToString(output[i][:]), hex.EncodeToString(initvec[:])) | |
1515 | } | |
1516 | continue | |
1517 | } | |
1518 | if i%3 == 0 { | |
1519 | offset = 47 | |
1520 | } else { | |
1521 | offset = 31 | |
1522 | } | |
1523 | if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 { | |
1524 | t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[i][:]), hex.EncodeToString(golden[offset+i].out[:])) | |
1525 | } | |
1526 | } | |
1527 | } | |
1528 | ||
1529 | func TestAvx512Server(t *testing.T) { | |
1530 | ||
1531 | if !avx512 { | |
1532 | t.SkipNow() | |
1533 | return | |
1534 | } | |
1535 | ||
1536 | const offset = 31 + 16 | |
1537 | server := NewAvx512Server() | |
1538 | ||
1539 | // First block of 64 bytes | |
1540 | for i := 0; i < 16; i++ { | |
1541 | input := make([]byte, 64) | |
1542 | copy(input, golden[offset+i].in) | |
1543 | server.Write(uint64(Avx512ServerUid+i), input) | |
1544 | } | |
1545 | ||
1546 | // Second block of 64 bytes | |
1547 | for i := 0; i < 16; i++ { | |
1548 | input := make([]byte, 64) | |
1549 | copy(input, golden[offset+i].in[64:]) | |
1550 | server.Write(uint64(Avx512ServerUid+i), input) | |
1551 | } | |
1552 | ||
1553 | wg := sync.WaitGroup{} | |
1554 | wg.Add(16) | |
1555 | ||
1556 | // Third and final block | |
1557 | for i := 0; i < 16; i++ { | |
1558 | input := make([]byte, 64) | |
1559 | input[0] = 0x80 | |
1560 | copy(input[1:], bytes.Repeat([]byte{0}, 63-8)) | |
1561 | ||
1562 | // Length in bits. | |
1563 | len := uint64(128) | |
1564 | len <<= 3 | |
1565 | for ii := uint(0); ii < 8; ii++ { | |
1566 | input[63-8+1+ii] = byte(len >> (56 - 8*ii)) | |
1567 | } | |
1568 | go func(i int, uid uint64, input []byte) { | |
1569 | output := server.Sum(uid, input) | |
1570 | if bytes.Compare(output[:], golden[offset+i].out[:]) != 0 { | |
1571 | t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[:]), hex.EncodeToString(golden[offset+i].out[:])) | |
1572 | } | |
1573 | wg.Done() | |
1574 | }(i, uint64(Avx512ServerUid+i), input) | |
1575 | } | |
1576 | ||
1577 | wg.Wait() | |
1578 | } | |
1579 | ||
1580 | func TestAvx512Digest(t *testing.T) { | |
1581 | ||
1582 | if !avx512 { | |
1583 | t.SkipNow() | |
1584 | return | |
1585 | } | |
1586 | ||
1587 | server := NewAvx512Server() | |
1588 | ||
1589 | const tests = 16 | |
1590 | h512 := [16]hash.Hash{} | |
1591 | for i := 0; i < tests; i++ { | |
1592 | h512[i] = NewAvx512(server) | |
1593 | } | |
1594 | ||
1595 | const offset = 31 + 16 | |
1596 | for i := 0; i < tests; i++ { | |
1597 | input := make([]byte, 64) | |
1598 | copy(input, golden[offset+i].in) | |
1599 | h512[i].Write(input) | |
1600 | } | |
1601 | for i := 0; i < tests; i++ { | |
1602 | input := make([]byte, 64) | |
1603 | copy(input, golden[offset+i].in[64:]) | |
1604 | h512[i].Write(input) | |
1605 | } | |
1606 | for i := 0; i < tests; i++ { | |
1607 | output := h512[i].Sum([]byte{}) | |
1608 | if bytes.Compare(output[:], golden[offset+i].out[:]) != 0 { | |
1609 | t.Fatalf("Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in, hex.EncodeToString(output[:]), hex.EncodeToString(golden[offset+i].out[:])) | |
1610 | } | |
1611 | } | |
1612 | } | |
1613 | ||
// benchmarkAvx512SingleCore writes body to every hash in h512 and then
// finalizes each of them, discarding the digests. All writes are issued
// before the first Sum.
func benchmarkAvx512SingleCore(h512 []hash.Hash, body []byte) {
	for _, h := range h512 {
		h.Write(body)
	}
	for _, h := range h512 {
		_ = h.Sum([]byte{})
	}
}
1623 | ||
1624 | func benchmarkAvx512(b *testing.B, size int) { | |
1625 | ||
1626 | if !avx512 { | |
1627 | b.SkipNow() | |
1628 | return | |
1629 | } | |
1630 | ||
1631 | server := NewAvx512Server() | |
1632 | ||
1633 | const tests = 16 | |
1634 | body := make([]byte, size) | |
1635 | ||
1636 | b.SetBytes(int64(len(body) * tests)) | |
1637 | b.ResetTimer() | |
1638 | ||
1639 | for i := 0; i < b.N; i++ { | |
1640 | h512 := make([]hash.Hash, tests) | |
1641 | for i := 0; i < tests; i++ { | |
1642 | h512[i] = NewAvx512(server) | |
1643 | } | |
1644 | ||
1645 | benchmarkAvx512SingleCore(h512, body) | |
1646 | } | |
1647 | } | |
1648 | ||
1649 | func BenchmarkAvx512_05M(b *testing.B) { benchmarkAvx512(b, 512*1024) } | |
1650 | func BenchmarkAvx512_1M(b *testing.B) { benchmarkAvx512(b, 1*1024*1024) } | |
1651 | func BenchmarkAvx512_5M(b *testing.B) { benchmarkAvx512(b, 5*1024*1024) } | |
1652 | func BenchmarkAvx512_10M(b *testing.B) { benchmarkAvx512(b, 10*1024*1024) } | |
1653 | ||
1654 | func benchmarkAvx512MultiCore(b *testing.B, size, cores int) { | |
1655 | ||
1656 | if !avx512 { | |
1657 | b.SkipNow() | |
1658 | return | |
1659 | } | |
1660 | ||
1661 | servers := make([]*Avx512Server, cores) | |
1662 | for c := 0; c < cores; c++ { | |
1663 | servers[c] = NewAvx512Server() | |
1664 | } | |
1665 | ||
1666 | const tests = 16 | |
1667 | ||
1668 | body := make([]byte, size) | |
1669 | ||
1670 | h512 := make([]hash.Hash, tests*cores) | |
1671 | for i := 0; i < tests*cores; i++ { | |
1672 | h512[i] = NewAvx512(servers[i>>4]) | |
1673 | } | |
1674 | ||
1675 | b.SetBytes(int64(size * 16 * cores)) | |
1676 | b.ResetTimer() | |
1677 | ||
1678 | var wg sync.WaitGroup | |
1679 | ||
1680 | for i := 0; i < b.N; i++ { | |
1681 | wg.Add(cores) | |
1682 | for c := 0; c < cores; c++ { | |
1683 | go func(c int) { benchmarkAvx512SingleCore(h512[c*tests:(c+1)*tests], body); wg.Done() }(c) | |
1684 | } | |
1685 | wg.Wait() | |
1686 | } | |
1687 | } | |
1688 | ||
1689 | func BenchmarkAvx512_5M_2Cores(b *testing.B) { benchmarkAvx512MultiCore(b, 5*1024*1024, 2) } | |
1690 | func BenchmarkAvx512_5M_4Cores(b *testing.B) { benchmarkAvx512MultiCore(b, 5*1024*1024, 4) } | |
1691 | func BenchmarkAvx512_5M_6Cores(b *testing.B) { benchmarkAvx512MultiCore(b, 5*1024*1024, 6) } | |
1692 | ||
1693 | type maskTest struct { | |
1694 | in [16]int | |
1695 | out [16]maskRounds | |
1696 | } | |
1697 | ||
1698 | var goldenMask = []maskTest{ | |
1699 | {[16]int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, [16]maskRounds{}}, | |
1700 | {[16]int{64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0}, [16]maskRounds{{0x5555, 1}}}, | |
1701 | {[16]int{0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64}, [16]maskRounds{{0xaaaa, 1}}}, | |
1702 | {[16]int{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, [16]maskRounds{{0xffff, 1}}}, | |
1703 | {[16]int{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, [16]maskRounds{{0xffff, 2}}}, | |
1704 | {[16]int{64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128}, [16]maskRounds{{0xffff, 1}, {0xaaaa, 1}}}, | |
1705 | {[16]int{128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64}, [16]maskRounds{{0xffff, 1}, {0x5555, 1}}}, | |
1706 | {[16]int{64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192}, [16]maskRounds{{0xffff, 1}, {0xaaaa, 2}}}, | |
1707 | // | |
1708 | // >= 64 0110=6 1011=b 1101=d 0110=6 | |
1709 | // >=128 0100=4 0010=2 1001=9 0100=4 | |
1710 | {[16]int{0, 64, 128, 0, 64, 128, 0, 64, 128, 0, 64, 128, 0, 64, 128, 0}, [16]maskRounds{{0x6db6, 1}, {0x4924, 1}}}, | |
1711 | {[16]int{1 * 64, 2 * 64, 3 * 64, 4 * 64, 5 * 64, 6 * 64, 7 * 64, 8 * 64, 9 * 64, 10 * 64, 11 * 64, 12 * 64, 13 * 64, 14 * 64, 15 * 64, 16 * 64}, | |
1712 | [16]maskRounds{{0xffff, 1}, {0xfffe, 1}, {0xfffc, 1}, {0xfff8, 1}, {0xfff0, 1}, {0xffe0, 1}, {0xffc0, 1}, {0xff80, 1}, | |
1713 | {0xff00, 1}, {0xfe00, 1}, {0xfc00, 1}, {0xf800, 1}, {0xf000, 1}, {0xe000, 1}, {0xc000, 1}, {0x8000, 1}}}, | |
1714 | {[16]int{2 * 64, 1 * 64, 3 * 64, 4 * 64, 5 * 64, 6 * 64, 7 * 64, 8 * 64, 9 * 64, 10 * 64, 11 * 64, 12 * 64, 13 * 64, 14 * 64, 15 * 64, 16 * 64}, | |
1715 | [16]maskRounds{{0xffff, 1}, {0xfffd, 1}, {0xfffc, 1}, {0xfff8, 1}, {0xfff0, 1}, {0xffe0, 1}, {0xffc0, 1}, {0xff80, 1}, | |
1716 | {0xff00, 1}, {0xfe00, 1}, {0xfc00, 1}, {0xf800, 1}, {0xf000, 1}, {0xe000, 1}, {0xc000, 1}, {0x8000, 1}}}, | |
1717 | {[16]int{10 * 64, 20 * 64, 30 * 64, 40 * 64, 50 * 64, 60 * 64, 70 * 64, 80 * 64, 90 * 64, 100 * 64, 110 * 64, 120 * 64, 130 * 64, 140 * 64, 150 * 64, 160 * 64}, | |
1718 | [16]maskRounds{{0xffff, 10}, {0xfffe, 10}, {0xfffc, 10}, {0xfff8, 10}, {0xfff0, 10}, {0xffe0, 10}, {0xffc0, 10}, {0xff80, 10}, | |
1719 | {0xff00, 10}, {0xfe00, 10}, {0xfc00, 10}, {0xf800, 10}, {0xf000, 10}, {0xe000, 10}, {0xc000, 10}, {0x8000, 10}}}, | |
1720 | {[16]int{10 * 64, 19 * 64, 27 * 64, 34 * 64, 40 * 64, 45 * 64, 49 * 64, 52 * 64, 54 * 64, 55 * 64, 57 * 64, 60 * 64, 64 * 64, 69 * 64, 75 * 64, 82 * 64}, | |
1721 | [16]maskRounds{{0xffff, 10}, {0xfffe, 9}, {0xfffc, 8}, {0xfff8, 7}, {0xfff0, 6}, {0xffe0, 5}, {0xffc0, 4}, {0xff80, 3}, | |
1722 | {0xff00, 2}, {0xfe00, 1}, {0xfc00, 2}, {0xf800, 3}, {0xf000, 4}, {0xe000, 5}, {0xc000, 6}, {0x8000, 7}}}, | |
1723 | } | |
1724 | ||
1725 | func TestMaskGen(t *testing.T) { | |
1726 | input := [16][]byte{} | |
1727 | for gcase, g := range goldenMask { | |
1728 | for i, l := range g.in { | |
1729 | buf := make([]byte, l) | |
1730 | input[i] = buf[:] | |
1731 | } | |
1732 | ||
1733 | mr := genMask(input) | |
1734 | ||
1735 | if !reflect.DeepEqual(mr, g.out) { | |
1736 | t.Fatalf("case %d: got %04x\n want %04x", gcase, mr, g.out) | |
1737 | } | |
1738 | } | |
1739 | } |
0 | ||
1 | // 16x Parallel implementation of SHA256 for AVX512 | |
2 | ||
3 | // | |
4 | // Minio Cloud Storage, (C) 2017 Minio, Inc. | |
5 | // | |
6 | // Licensed under the Apache License, Version 2.0 (the "License"); | |
7 | // you may not use this file except in compliance with the License. | |
8 | // You may obtain a copy of the License at | |
9 | // | |
10 | // http://www.apache.org/licenses/LICENSE-2.0 | |
11 | // | |
12 | // Unless required by applicable law or agreed to in writing, software | |
13 | // distributed under the License is distributed on an "AS IS" BASIS, | |
14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
15 | // See the License for the specific language governing permissions and | |
16 | // limitations under the License. | |
17 | ||
18 | // | |
19 | // This code is based on the Intel Multi-Buffer Crypto for IPSec library | |
20 | // and more specifically the following implementation: | |
21 | // https://github.com/intel/intel-ipsec-mb/blob/master/avx512/sha256_x16_avx512.asm | |
22 | // | |
23 | // For Golang it has been converted into Plan 9 assembly with the help of | |
24 | // github.com/minio/asm2plan9s to assemble the AVX512 instructions | |
25 | // | |
26 | ||
27 | // Copyright (c) 2017, Intel Corporation | |
28 | // | |
29 | // Redistribution and use in source and binary forms, with or without | |
30 | // modification, are permitted provided that the following conditions are met: | |
31 | // | |
32 | // * Redistributions of source code must retain the above copyright notice, | |
33 | // this list of conditions and the following disclaimer. | |
34 | // * Redistributions in binary form must reproduce the above copyright | |
35 | // notice, this list of conditions and the following disclaimer in the | |
36 | // documentation and/or other materials provided with the distribution. | |
37 | // * Neither the name of Intel Corporation nor the names of its contributors | |
38 | // may be used to endorse or promote products derived from this software | |
39 | // without specific prior written permission. | |
40 | // | |
41 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
42 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
43 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
44 | // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
45 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
46 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
47 | // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
48 | // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
49 | // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
50 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | ||
// Byte stride between consecutive digest rows in the state buffer: the
// working variables A..H are loaded from STATE + k*SHA256_DIGEST_ROW_SIZE for
// k = 0..7, each as one full ZMM register (64 bytes).
52 | #define SHA256_DIGEST_ROW_SIZE 64 | |
53 | ||
// General-purpose register aliases.
//
// Most registers are named twice: the lower-case x86 spelling is used in the
// Intel-syntax (AVX512) instructions, the *_P9 spelling in the Plan 9
// instructions (MOVQ, LEAQ, TESTQ, ...). Both names of a pair refer to the
// same physical register.

// STATE: pointer to the digest rows (loaded from the digests argument).
54 | // arg1 | |
55 | #define STATE rdi | |
56 | #define STATE_P9 DI | |
// INP_SIZE: number of blocks left to process; decremented once per block.
57 | // arg2 | |
58 | #define INP_SIZE rsi | |
59 | #define INP_SIZE_P9 SI | |
60 | ||
// IDX: byte offset into each lane's input, advanced by 64 per block.
61 | #define IDX rcx | |
// TBL: pointer to the round-constant table; also used briefly to address
// PSHUFFLE_BYTE_FLIP_MASK at the top of the block loop.
62 | #define TBL rdx | |
63 | #define TBL_P9 DX | |
64 | ||
// INPUT: address of the inputs argument (per-lane input descriptors).
65 | #define INPUT rax | |
66 | #define INPUT_P9 AX | |
67 | ||
// inp0: data pointer of the lane currently being loaded. The load macros
// write it through its Plan 9 name R9.
68 | #define inp0 r9 | |
// SCRATCH: buffer where A..H are saved at the start of each block.
69 | #define SCRATCH_P9 R12 | |
70 | #define SCRATCH r12 | |
// maskp: pointer to the current 8-byte lane mask; mask: the mask value itself
// (bit n set => lane n has data for the block being loaded).
71 | #define maskp r13 | |
72 | #define MASKP_P9 R13 | |
73 | #define mask r14 | |
74 | #define MASK_P9 R14 | |
75 | ||
// ZMM register allocation. Every register holds one 32-bit value per lane
// (16 lanes).

// A..H: the eight SHA-256 working variables.
76 | #define A zmm0 | |
77 | #define B zmm1 | |
78 | #define C zmm2 | |
79 | #define D zmm3 | |
80 | #define E zmm4 | |
81 | #define F zmm5 | |
82 | #define G zmm6 | |
83 | #define H zmm7 | |
// T1 and TMP0-TMP3 are used by PROCESS_LOOP (TMP3 carries the round constant
// Kt between rounds); TMP4-TMP6 are used by MSG_SCHED_ROUND_16_63. TMP0/TMP1
// also serve as the temporaries of TRANSPOSE16, and TMP2 holds the byte-flip
// shuffle mask while the input words are byte-swapped.
84 | #define T1 zmm8 | |
85 | #define TMP0 zmm9 | |
86 | #define TMP1 zmm10 | |
87 | #define TMP2 zmm11 | |
88 | #define TMP3 zmm12 | |
89 | #define TMP4 zmm13 | |
90 | #define TMP5 zmm14 | |
91 | #define TMP6 zmm15 | |
92 | ||
// W0..W15: rolling window of sixteen message-schedule words. Before the
// transpose, Wn holds the raw 64-byte block of lane n.
93 | #define W0 zmm16 | |
94 | #define W1 zmm17 | |
95 | #define W2 zmm18 | |
96 | #define W3 zmm19 | |
97 | #define W4 zmm20 | |
98 | #define W5 zmm21 | |
99 | #define W6 zmm22 | |
100 | #define W7 zmm23 | |
101 | #define W8 zmm24 | |
102 | #define W9 zmm25 | |
103 | #define W10 zmm26 | |
104 | #define W11 zmm27 | |
105 | #define W12 zmm28 | |
106 | #define W13 zmm29 | |
107 | #define W14 zmm30 | |
108 | #define W15 zmm31 | |
109 | ||
110 | ||
// TRANSPOSE16 transposes, in place, the 16x16 matrix of 32-bit words held in
// the sixteen ZMM registers _r0.._r15, using _t0 and _t1 as temporaries.
//
// It works in three stages: vshufps interleaves 32-bit words within groups of
// four rows, vpermi2q merges pairs of those groups using the index vectors
// PSHUFFLE_TRANSPOSE16_MASK1/MASK2, and vshuff64x2 combines the two halves.
//
// Side effects: general-purpose registers BX and R8 are overwritten with the
// addresses of the two index vectors. Note that `[r8]` in the body is the
// general-purpose register R8, not the macro parameter _r8.
111 | #define TRANSPOSE16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _r10, _r11, _r12, _r13, _r14, _r15, _t0, _t1) \ | |
112 | \ | |
113 | \ // input r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} | |
114 | \ // r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} | |
115 | \ // r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} | |
116 | \ // r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} | |
117 | \ // r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} | |
118 | \ // r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} | |
119 | \ // r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} | |
120 | \ // r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} | |
121 | \ // r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} | |
122 | \ // r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} | |
123 | \ // r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} | |
124 | \ // r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} | |
125 | \ // r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} | |
126 | \ // r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} | |
127 | \ // r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} | |
128 | \ // r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} | |
129 | \ | |
130 | \ // output r0 = { p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} | |
131 | \ // r1 = { p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} | |
132 | \ // r2 = { p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} | |
133 | \ // r3 = { p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} | |
134 | \ // r4 = { p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} | |
135 | \ // r5 = { p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} | |
136 | \ // r6 = { p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} | |
137 | \ // r7 = { p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} | |
138 | \ // r8 = { p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} | |
139 | \ // r9 = { p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} | |
140 | \ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} | |
141 | \ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} | |
142 | \ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} | |
143 | \ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} | |
144 | \ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} | |
145 | \ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} | |
146 | \ | |
147 | \ // process top half | |
148 | vshufps _t0, _r0, _r1, 0x44 \ // t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} | |
149 | vshufps _r0, _r0, _r1, 0xEE \ // r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} | |
150 | vshufps _t1, _r2, _r3, 0x44 \ // t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} | |
151 | vshufps _r2, _r2, _r3, 0xEE \ // r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} | |
152 | \ | |
153 | vshufps _r3, _t0, _t1, 0xDD \ // r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} | |
154 | vshufps _r1, _r0, _r2, 0x88 \ // r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} | |
155 | vshufps _r0, _r0, _r2, 0xDD \ // r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} | |
156 | vshufps _t0, _t0, _t1, 0x88 \ // t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} | |
157 | \ | |
158 | \ // use r2 in place of t0 | |
159 | vshufps _r2, _r4, _r5, 0x44 \ // r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} | |
160 | vshufps _r4, _r4, _r5, 0xEE \ // r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} | |
161 | vshufps _t1, _r6, _r7, 0x44 \ // t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} | |
162 | vshufps _r6, _r6, _r7, 0xEE \ // r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} | |
163 | \ | |
164 | vshufps _r7, _r2, _t1, 0xDD \ // r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} | |
165 | vshufps _r5, _r4, _r6, 0x88 \ // r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} | |
166 | vshufps _r4, _r4, _r6, 0xDD \ // r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} | |
167 | vshufps _r2, _r2, _t1, 0x88 \ // r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} | |
168 | \ | |
169 | \ // use r6 in place of t0 | |
170 | vshufps _r6, _r8, _r9, 0x44 \ // r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} | |
171 | vshufps _r8, _r8, _r9, 0xEE \ // r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} | |
172 | vshufps _t1, _r10, _r11, 0x44 \ // t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} | |
173 | vshufps _r10, _r10, _r11, 0xEE \ // r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} | |
174 | \ | |
175 | vshufps _r11, _r6, _t1, 0xDD \ // r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} | |
176 | vshufps _r9, _r8, _r10, 0x88 \ // r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} | |
177 | vshufps _r8, _r8, _r10, 0xDD \ // r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} | |
178 | vshufps _r6, _r6, _t1, 0x88 \ // r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} | |
179 | \ | |
180 | \ // use r10 in place of t0 | |
181 | vshufps _r10, _r12, _r13, 0x44 \ // r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0} | |
182 | vshufps _r12, _r12, _r13, 0xEE \ // r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2} | |
183 | vshufps _t1, _r14, _r15, 0x44 \ // t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0} | |
184 | vshufps _r14, _r14, _r15, 0xEE \ // r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2} | |
185 | \ | |
186 | vshufps _r15, _r10, _t1, 0xDD \ // r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1} | |
187 | vshufps _r13, _r12, _r14, 0x88 \ // r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2} | |
188 | vshufps _r12, _r12, _r14, 0xDD \ // r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3} | |
189 | vshufps _r10, _r10, _t1, 0x88 \ // r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0} | |
190 | \ | |
191 | \ // At this point, the registers that contain interesting data are: | |
192 | \ // t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 | |
193 | \ // Can use t1 and r14 as scratch registers | |
194 | LEAQ PSHUFFLE_TRANSPOSE16_MASK1<>(SB), BX \ | |
195 | LEAQ PSHUFFLE_TRANSPOSE16_MASK2<>(SB), R8 \ | |
196 | \ | |
197 | vmovdqu32 _r14, [rbx] \ | |
198 | vpermi2q _r14, _t0, _r2 \ // r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} | |
199 | vmovdqu32 _t1, [r8] \ | |
200 | vpermi2q _t1, _t0, _r2 \ // t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} | |
201 | \ | |
202 | vmovdqu32 _r2, [rbx] \ | |
203 | vpermi2q _r2, _r3, _r7 \ // r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} | |
204 | vmovdqu32 _t0, [r8] \ | |
205 | vpermi2q _t0, _r3, _r7 \ // t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} | |
206 | \ | |
207 | vmovdqu32 _r3, [rbx] \ | |
208 | vpermi2q _r3, _r1, _r5 \ // r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} | |
209 | vmovdqu32 _r7, [r8] \ | |
210 | vpermi2q _r7, _r1, _r5 \ // r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} | |
211 | \ | |
212 | vmovdqu32 _r1, [rbx] \ | |
213 | vpermi2q _r1, _r0, _r4 \ // r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} | |
214 | vmovdqu32 _r5, [r8] \ | |
215 | vpermi2q _r5, _r0, _r4 \ // r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} | |
216 | \ | |
217 | vmovdqu32 _r0, [rbx] \ | |
218 | vpermi2q _r0, _r6, _r10 \ // r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} | |
219 | vmovdqu32 _r4, [r8] \ | |
220 | vpermi2q _r4, _r6, _r10 \ // r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} | |
221 | \ | |
222 | vmovdqu32 _r6, [rbx] \ | |
223 | vpermi2q _r6, _r11, _r15 \ // r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} | |
224 | vmovdqu32 _r10, [r8] \ | |
225 | vpermi2q _r10, _r11, _r15 \ // r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} | |
226 | \ | |
227 | vmovdqu32 _r11, [rbx] \ | |
228 | vpermi2q _r11, _r9, _r13 \ // r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} | |
229 | vmovdqu32 _r15, [r8] \ | |
230 | vpermi2q _r15, _r9, _r13 \ // r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} | |
231 | \ | |
232 | vmovdqu32 _r9, [rbx] \ | |
233 | vpermi2q _r9, _r8, _r12 \ // r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} | |
234 | vmovdqu32 _r13, [r8] \ | |
235 | vpermi2q _r13, _r8, _r12 \ // r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} | |
236 | \ | |
237 | \ // At this point r8 and r12 can be used as scratch registers | |
238 | vshuff64x2 _r8, _r14, _r0, 0xEE \ // r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} | |
239 | vshuff64x2 _r0, _r14, _r0, 0x44 \ // r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} | |
240 | \ | |
241 | vshuff64x2 _r12, _t1, _r4, 0xEE \ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} | |
242 | vshuff64x2 _r4, _t1, _r4, 0x44 \ // r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} | |
243 | \ | |
244 | vshuff64x2 _r14, _r7, _r15, 0xEE \ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} | |
245 | vshuff64x2 _t1, _r7, _r15, 0x44 \ // t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} | |
246 | \ | |
247 | vshuff64x2 _r15, _r5, _r13, 0xEE \ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} | |
248 | vshuff64x2 _r7, _r5, _r13, 0x44 \ // r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} | |
249 | \ | |
250 | vshuff64x2 _r13, _t0, _r10, 0xEE \ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} | |
251 | vshuff64x2 _r5, _t0, _r10, 0x44 \ // r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} | |
252 | \ | |
253 | vshuff64x2 _r10, _r3, _r11, 0xEE \ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} | |
254 | vshuff64x2 _t0, _r3, _r11, 0x44 \ // t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} | |
255 | \ | |
256 | vshuff64x2 _r11, _r1, _r9, 0xEE \ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} | |
257 | vshuff64x2 _r3, _r1, _r9, 0x44 \ // r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} | |
258 | \ | |
259 | vshuff64x2 _r9, _r2, _r6, 0xEE \ // r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} | |
260 | vshuff64x2 _r1, _r2, _r6, 0x44 \ // r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} | |
261 | \ | |
262 | vmovdqu32 _r2, _t0 \ // r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} | |
263 | vmovdqu32 _r6, _t1 \ // r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} | |
264 | ||
265 | ||
266 | // CH(E, F, G) = (E&F) ^ (~E&G) | |
267 | // MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C) | |
268 | // SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22 | |
269 | // SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25 | |
270 | // sigma0 = ROR_7 ^ ROR_18 ^ SHR_3 | |
271 | // sigma1 = ROR_17 ^ ROR_19 ^ SHR_10 | |
272 | ||
273 | // Main processing loop per round | |
// PROCESS_LOOP performs one SHA-256 round on all 16 lanes.
//
//   _WT      message-schedule word for this round
//   _ROUND   round number, used only to address the next round constant
//   _A.._H   registers currently playing the roles of a..h
//
// On entry TMP3 must hold the round constant Kt for _ROUND; on exit it holds
// the constant for _ROUND+1, loaded from TBL + (_ROUND+1)*64 (one 64-byte
// row per constant).
//
// Results are written in place: _D becomes D + T1 (the new E) and _H becomes
// T1 + T2 (the new A). The other working variables are not moved; callers
// rotate the roles by permuting the register arguments from one round to the
// next. Clobbers T1, TMP0, TMP1, TMP2 and TMP3.
//
// NOTE(review): when invoked for the final round this still loads the row for
// _ROUND+1 — confirm the constant table has a readable row past the last K.
274 | #define PROCESS_LOOP(_WT, _ROUND, _A, _B, _C, _D, _E, _F, _G, _H) \ | |
275 | \ // T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt | |
276 | \ // T2 = SIGMA0(A) + MAJ(A, B, C) | |
277 | \ // H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 | |
278 | \ | |
279 | \ // H becomes T2, then add T1 for A | |
280 | \ // D becomes D + T1 for E | |
281 | \ | |
282 | vpaddd T1, _H, TMP3 \ // T1 = H + Kt | |
283 | vmovdqu32 TMP0, _E \ | |
284 | vprord TMP1, _E, 6 \ // ROR_6(E) | |
285 | vprord TMP2, _E, 11 \ // ROR_11(E) | |
286 | vprord TMP3, _E, 25 \ // ROR_25(E) | |
287 | vpternlogd TMP0, _F, _G, 0xCA \ // TMP0 = CH(E,F,G) | |
288 | vpaddd T1, T1, _WT \ // T1 = T1 + Wt | |
289 | vpternlogd TMP1, TMP2, TMP3, 0x96 \ // TMP1 = SIGMA1(E) | |
290 | vpaddd T1, T1, TMP0 \ // T1 = T1 + CH(E,F,G) | |
291 | vpaddd T1, T1, TMP1 \ // T1 = T1 + SIGMA1(E) | |
292 | vpaddd _D, _D, T1 \ // D = D + T1 | |
293 | \ | |
294 | vprord _H, _A, 2 \ // ROR_2(A) | |
295 | vprord TMP2, _A, 13 \ // ROR_13(A) | |
296 | vprord TMP3, _A, 22 \ // ROR_22(A) | |
297 | vmovdqu32 TMP0, _A \ | |
298 | vpternlogd TMP0, _B, _C, 0xE8 \ // TMP0 = MAJ(A,B,C) | |
299 | vpternlogd _H, TMP2, TMP3, 0x96 \ // H(T2) = SIGMA0(A) | |
300 | vpaddd _H, _H, TMP0 \ // H(T2) = SIGMA0(A) + MAJ(A,B,C) | |
301 | vpaddd _H, _H, T1 \ // H(A) = H(T2) + T1 | |
302 | \ | |
303 | vmovdqu32 TMP3, [TBL + ((_ROUND+1)*64)] \ // Next Kt | |
304 | ||
305 | ||
// MSG_SCHED_ROUND_16_63 computes the next message-schedule word in place.
//
//   _WT     on entry W[t-16]; on exit the new W[t]
//   _WTp1   W[t-15]
//   _WTp9   W[t-7]
//   _WTp14  W[t-2]
//
// Result: _WT = _WT + sigma1(_WTp14) + _WTp9 + sigma0(_WTp1), for all 16
// lanes. Clobbers TMP4, TMP5 and TMP6.
306 | #define MSG_SCHED_ROUND_16_63(_WT, _WTp1, _WTp9, _WTp14) \ | |
307 | vprord TMP4, _WTp14, 17 \ // ROR_17(Wt-2) | |
308 | vprord TMP5, _WTp14, 19 \ // ROR_19(Wt-2) | |
309 | vpsrld TMP6, _WTp14, 10 \ // SHR_10(Wt-2) | |
310 | vpternlogd TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma1(Wt-2) | |
311 | \ | |
312 | vpaddd _WT, _WT, TMP4 \ // Wt = Wt-16 + sigma1(Wt-2) | |
313 | vpaddd _WT, _WT, _WTp9 \ // Wt = Wt-16 + sigma1(Wt-2) + Wt-7 | |
314 | \ | |
315 | vprord TMP4, _WTp1, 7 \ // ROR_7(Wt-15) | |
316 | vprord TMP5, _WTp1, 18 \ // ROR_18(Wt-15) | |
317 | vpsrld TMP6, _WTp1, 3 \ // SHR_3(Wt-15) | |
318 | vpternlogd TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma0(Wt-15) | |
319 | \ | |
320 | vpaddd _WT, _WT, TMP4 \ // Wt = Wt-16 + sigma1(Wt-2) + | |
321 | \ // Wt-7 + sigma0(Wt-15) | |
322 | ||
323 | ||
324 | // Note this is reading in a block of data for one lane | |
325 | // When all 16 are read, the data must be transposed to build msg schedule | |
// MSG_SCHED_ROUND_00_15 conditionally loads the next 64-byte block of lane
// OFFSET into _WT.
//
// If bit OFFSET of the current lane mask (MASK_P9) is clear, the load is
// skipped and _WT is left unchanged. Otherwise the lane's data pointer is read
// from OFFSET*24(INPUT_P9) into R9 (alias inp0) and 64 bytes are loaded from
// inp0 + IDX.
//
// LABEL is the jump target for the skip path and must be unique per use.
// Clobbers R9 and the flags. The body is identical to MASKED_LOAD.
//
// NOTE(review): the 24-byte stride is presumably the size of a Go slice
// header (ptr, len, cap) — confirm against the Go declaration of inputs.
326 | #define MSG_SCHED_ROUND_00_15(_WT, OFFSET, LABEL) \ | |
327 | TESTQ $(1<<OFFSET), MASK_P9 \ | |
328 | JE LABEL \ | |
329 | MOVQ OFFSET*24(INPUT_P9), R9 \ | |
330 | vmovups _WT, [inp0+IDX] \ | |
331 | LABEL: \ | |
332 | ||
// MASKED_LOAD conditionally loads a 64-byte input block of lane OFFSET into
// _WT; it is used for the very first block, before the main loop.
//
// If bit OFFSET of the current lane mask (MASK_P9) is clear, the load is
// skipped and _WT is left unchanged. Otherwise the lane's data pointer is read
// from OFFSET*24(INPUT_P9) into R9 (alias inp0) and 64 bytes are loaded from
// inp0 + IDX.
//
// LABEL is the jump target for the skip path and must be unique per use.
// Clobbers R9 and the flags. The body is identical to MSG_SCHED_ROUND_00_15.
333 | #define MASKED_LOAD(_WT, OFFSET, LABEL) \ | |
334 | TESTQ $(1<<OFFSET), MASK_P9 \ | |
335 | JE LABEL \ | |
336 | MOVQ OFFSET*24(INPUT_P9), R9 \ | |
337 | vmovups _WT,[inp0+IDX] \ | |
338 | LABEL: \ | |
339 | ||
340 | TEXT ·sha256_x16_avx512(SB), 7, $0 | |
341 | MOVQ digests+0(FP), STATE_P9 // | |
342 | MOVQ scratch+8(FP), SCRATCH_P9 | |
343 | MOVQ mask_len+32(FP), INP_SIZE_P9 // number of blocks to process | |
344 | MOVQ mask+24(FP), MASKP_P9 | |
345 | MOVQ (MASKP_P9), MASK_P9 | |
346 | kmovq k1, mask | |
347 | LEAQ inputs+48(FP), INPUT_P9 | |
348 | ||
349 | // Initialize digests | |
350 | vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE] | |
351 | vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE] | |
352 | vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE] | |
353 | vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE] | |
354 | vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE] | |
355 | vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE] | |
356 | vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE] | |
357 | vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE] | |
358 | ||
359 | MOVQ table+16(FP), TBL_P9 | |
360 | ||
361 | xor IDX, IDX | |
362 | ||
363 | // Read in first block of input data | |
364 | MASKED_LOAD( W0, 0, skipInput0) | |
365 | MASKED_LOAD( W1, 1, skipInput1) | |
366 | MASKED_LOAD( W2, 2, skipInput2) | |
367 | MASKED_LOAD( W3, 3, skipInput3) | |
368 | MASKED_LOAD( W4, 4, skipInput4) | |
369 | MASKED_LOAD( W5, 5, skipInput5) | |
370 | MASKED_LOAD( W6, 6, skipInput6) | |
371 | MASKED_LOAD( W7, 7, skipInput7) | |
372 | MASKED_LOAD( W8, 8, skipInput8) | |
373 | MASKED_LOAD( W9, 9, skipInput9) | |
374 | MASKED_LOAD(W10, 10, skipInput10) | |
375 | MASKED_LOAD(W11, 11, skipInput11) | |
376 | MASKED_LOAD(W12, 12, skipInput12) | |
377 | MASKED_LOAD(W13, 13, skipInput13) | |
378 | MASKED_LOAD(W14, 14, skipInput14) | |
379 | MASKED_LOAD(W15, 15, skipInput15) | |
380 | ||
381 | lloop: | |
382 | LEAQ PSHUFFLE_BYTE_FLIP_MASK<>(SB), TBL_P9 | |
383 | vmovdqu32 TMP2, [TBL] | |
384 | ||
385 | // Get first K from table | |
386 | MOVQ table+16(FP), TBL_P9 | |
387 | vmovdqu32 TMP3, [TBL] | |
388 | ||
389 | // Save digests for later addition | |
390 | vmovdqu32 [SCRATCH + 64*0], A | |
391 | vmovdqu32 [SCRATCH + 64*1], B | |
392 | vmovdqu32 [SCRATCH + 64*2], C | |
393 | vmovdqu32 [SCRATCH + 64*3], D | |
394 | vmovdqu32 [SCRATCH + 64*4], E | |
395 | vmovdqu32 [SCRATCH + 64*5], F | |
396 | vmovdqu32 [SCRATCH + 64*6], G | |
397 | vmovdqu32 [SCRATCH + 64*7], H | |
398 | ||
399 | add IDX, 64 | |
400 | ||
401 | // Transpose input data | |
402 | TRANSPOSE16(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1) | |
403 | ||
404 | vpshufb W0, W0, TMP2 | |
405 | vpshufb W1, W1, TMP2 | |
406 | vpshufb W2, W2, TMP2 | |
407 | vpshufb W3, W3, TMP2 | |
408 | vpshufb W4, W4, TMP2 | |
409 | vpshufb W5, W5, TMP2 | |
410 | vpshufb W6, W6, TMP2 | |
411 | vpshufb W7, W7, TMP2 | |
412 | vpshufb W8, W8, TMP2 | |
413 | vpshufb W9, W9, TMP2 | |
414 | vpshufb W10, W10, TMP2 | |
415 | vpshufb W11, W11, TMP2 | |
416 | vpshufb W12, W12, TMP2 | |
417 | vpshufb W13, W13, TMP2 | |
418 | vpshufb W14, W14, TMP2 | |
419 | vpshufb W15, W15, TMP2 | |
420 | ||
421 | // MSG Schedule for W0-W15 is now complete in registers | |
422 | // Process first 48 rounds | |
423 | // Calculate next Wt+16 after processing is complete and Wt is unneeded | |
424 | ||
425 | PROCESS_LOOP( W0, 0, A, B, C, D, E, F, G, H) | |
426 | MSG_SCHED_ROUND_16_63( W0, W1, W9, W14) | |
427 | PROCESS_LOOP( W1, 1, H, A, B, C, D, E, F, G) | |
428 | MSG_SCHED_ROUND_16_63( W1, W2, W10, W15) | |
429 | PROCESS_LOOP( W2, 2, G, H, A, B, C, D, E, F) | |
430 | MSG_SCHED_ROUND_16_63( W2, W3, W11, W0) | |
431 | PROCESS_LOOP( W3, 3, F, G, H, A, B, C, D, E) | |
432 | MSG_SCHED_ROUND_16_63( W3, W4, W12, W1) | |
433 | PROCESS_LOOP( W4, 4, E, F, G, H, A, B, C, D) | |
434 | MSG_SCHED_ROUND_16_63( W4, W5, W13, W2) | |
435 | PROCESS_LOOP( W5, 5, D, E, F, G, H, A, B, C) | |
436 | MSG_SCHED_ROUND_16_63( W5, W6, W14, W3) | |
437 | PROCESS_LOOP( W6, 6, C, D, E, F, G, H, A, B) | |
438 | MSG_SCHED_ROUND_16_63( W6, W7, W15, W4) | |
439 | PROCESS_LOOP( W7, 7, B, C, D, E, F, G, H, A) | |
440 | MSG_SCHED_ROUND_16_63( W7, W8, W0, W5) | |
441 | PROCESS_LOOP( W8, 8, A, B, C, D, E, F, G, H) | |
442 | MSG_SCHED_ROUND_16_63( W8, W9, W1, W6) | |
443 | PROCESS_LOOP( W9, 9, H, A, B, C, D, E, F, G) | |
444 | MSG_SCHED_ROUND_16_63( W9, W10, W2, W7) | |
445 | PROCESS_LOOP(W10, 10, G, H, A, B, C, D, E, F) | |
446 | MSG_SCHED_ROUND_16_63(W10, W11, W3, W8) | |
447 | PROCESS_LOOP(W11, 11, F, G, H, A, B, C, D, E) | |
448 | MSG_SCHED_ROUND_16_63(W11, W12, W4, W9) | |
449 | PROCESS_LOOP(W12, 12, E, F, G, H, A, B, C, D) | |
450 | MSG_SCHED_ROUND_16_63(W12, W13, W5, W10) | |
451 | PROCESS_LOOP(W13, 13, D, E, F, G, H, A, B, C) | |
452 | MSG_SCHED_ROUND_16_63(W13, W14, W6, W11) | |
453 | PROCESS_LOOP(W14, 14, C, D, E, F, G, H, A, B) | |
454 | MSG_SCHED_ROUND_16_63(W14, W15, W7, W12) | |
455 | PROCESS_LOOP(W15, 15, B, C, D, E, F, G, H, A) | |
456 | MSG_SCHED_ROUND_16_63(W15, W0, W8, W13) | |
457 | PROCESS_LOOP( W0, 16, A, B, C, D, E, F, G, H) | |
458 | MSG_SCHED_ROUND_16_63( W0, W1, W9, W14) | |
459 | PROCESS_LOOP( W1, 17, H, A, B, C, D, E, F, G) | |
460 | MSG_SCHED_ROUND_16_63( W1, W2, W10, W15) | |
461 | PROCESS_LOOP( W2, 18, G, H, A, B, C, D, E, F) | |
462 | MSG_SCHED_ROUND_16_63( W2, W3, W11, W0) | |
463 | PROCESS_LOOP( W3, 19, F, G, H, A, B, C, D, E) | |
464 | MSG_SCHED_ROUND_16_63( W3, W4, W12, W1) | |
465 | PROCESS_LOOP( W4, 20, E, F, G, H, A, B, C, D) | |
466 | MSG_SCHED_ROUND_16_63( W4, W5, W13, W2) | |
467 | PROCESS_LOOP( W5, 21, D, E, F, G, H, A, B, C) | |
468 | MSG_SCHED_ROUND_16_63( W5, W6, W14, W3) | |
469 | PROCESS_LOOP( W6, 22, C, D, E, F, G, H, A, B) | |
470 | MSG_SCHED_ROUND_16_63( W6, W7, W15, W4) | |
471 | PROCESS_LOOP( W7, 23, B, C, D, E, F, G, H, A) | |
472 | MSG_SCHED_ROUND_16_63( W7, W8, W0, W5) | |
473 | PROCESS_LOOP( W8, 24, A, B, C, D, E, F, G, H) | |
474 | MSG_SCHED_ROUND_16_63( W8, W9, W1, W6) | |
475 | PROCESS_LOOP( W9, 25, H, A, B, C, D, E, F, G) | |
476 | MSG_SCHED_ROUND_16_63( W9, W10, W2, W7) | |
477 | PROCESS_LOOP(W10, 26, G, H, A, B, C, D, E, F) | |
478 | MSG_SCHED_ROUND_16_63(W10, W11, W3, W8) | |
479 | PROCESS_LOOP(W11, 27, F, G, H, A, B, C, D, E) | |
480 | MSG_SCHED_ROUND_16_63(W11, W12, W4, W9) | |
481 | PROCESS_LOOP(W12, 28, E, F, G, H, A, B, C, D) | |
482 | MSG_SCHED_ROUND_16_63(W12, W13, W5, W10) | |
483 | PROCESS_LOOP(W13, 29, D, E, F, G, H, A, B, C) | |
484 | MSG_SCHED_ROUND_16_63(W13, W14, W6, W11) | |
485 | PROCESS_LOOP(W14, 30, C, D, E, F, G, H, A, B) | |
486 | MSG_SCHED_ROUND_16_63(W14, W15, W7, W12) | |
487 | PROCESS_LOOP(W15, 31, B, C, D, E, F, G, H, A) | |
488 | MSG_SCHED_ROUND_16_63(W15, W0, W8, W13) | |
489 | PROCESS_LOOP( W0, 32, A, B, C, D, E, F, G, H) | |
490 | MSG_SCHED_ROUND_16_63( W0, W1, W9, W14) | |
491 | PROCESS_LOOP( W1, 33, H, A, B, C, D, E, F, G) | |
492 | MSG_SCHED_ROUND_16_63( W1, W2, W10, W15) | |
493 | PROCESS_LOOP( W2, 34, G, H, A, B, C, D, E, F) | |
494 | MSG_SCHED_ROUND_16_63( W2, W3, W11, W0) | |
495 | PROCESS_LOOP( W3, 35, F, G, H, A, B, C, D, E) | |
496 | MSG_SCHED_ROUND_16_63( W3, W4, W12, W1) | |
497 | PROCESS_LOOP( W4, 36, E, F, G, H, A, B, C, D) | |
498 | MSG_SCHED_ROUND_16_63( W4, W5, W13, W2) | |
499 | PROCESS_LOOP( W5, 37, D, E, F, G, H, A, B, C) | |
500 | MSG_SCHED_ROUND_16_63( W5, W6, W14, W3) | |
501 | PROCESS_LOOP( W6, 38, C, D, E, F, G, H, A, B) | |
502 | MSG_SCHED_ROUND_16_63( W6, W7, W15, W4) | |
503 | PROCESS_LOOP( W7, 39, B, C, D, E, F, G, H, A) | |
504 | MSG_SCHED_ROUND_16_63( W7, W8, W0, W5) | |
505 | PROCESS_LOOP( W8, 40, A, B, C, D, E, F, G, H) | |
506 | MSG_SCHED_ROUND_16_63( W8, W9, W1, W6) | |
507 | PROCESS_LOOP( W9, 41, H, A, B, C, D, E, F, G) | |
508 | MSG_SCHED_ROUND_16_63( W9, W10, W2, W7) | |
509 | PROCESS_LOOP(W10, 42, G, H, A, B, C, D, E, F) | |
510 | MSG_SCHED_ROUND_16_63(W10, W11, W3, W8) | |
511 | PROCESS_LOOP(W11, 43, F, G, H, A, B, C, D, E) | |
512 | MSG_SCHED_ROUND_16_63(W11, W12, W4, W9) | |
513 | PROCESS_LOOP(W12, 44, E, F, G, H, A, B, C, D) | |
514 | MSG_SCHED_ROUND_16_63(W12, W13, W5, W10) | |
515 | PROCESS_LOOP(W13, 45, D, E, F, G, H, A, B, C) | |
516 | MSG_SCHED_ROUND_16_63(W13, W14, W6, W11) | |
517 | PROCESS_LOOP(W14, 46, C, D, E, F, G, H, A, B) | |
518 | MSG_SCHED_ROUND_16_63(W14, W15, W7, W12) | |
519 | PROCESS_LOOP(W15, 47, B, C, D, E, F, G, H, A) | |
520 | MSG_SCHED_ROUND_16_63(W15, W0, W8, W13) | |
521 | ||
522 | // Check if this is the last block | |
523 | sub INP_SIZE, 1 | |
524 | JE lastLoop | |
525 | ||
526 | // Load next mask for inputs | |
527 | ADDQ $8, MASKP_P9 | |
528 | MOVQ (MASKP_P9), MASK_P9 | |
529 | ||
530 | // Process last 16 rounds | |
531 | // Read in next block msg data for use in first 16 words of msg sched | |
532 | ||
533 | PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H) | |
534 | MSG_SCHED_ROUND_00_15( W0, 0, skipNext0) | |
535 | PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G) | |
536 | MSG_SCHED_ROUND_00_15( W1, 1, skipNext1) | |
537 | PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F) | |
538 | MSG_SCHED_ROUND_00_15( W2, 2, skipNext2) | |
539 | PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E) | |
540 | MSG_SCHED_ROUND_00_15( W3, 3, skipNext3) | |
541 | PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D) | |
542 | MSG_SCHED_ROUND_00_15( W4, 4, skipNext4) | |
543 | PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C) | |
544 | MSG_SCHED_ROUND_00_15( W5, 5, skipNext5) | |
545 | PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B) | |
546 | MSG_SCHED_ROUND_00_15( W6, 6, skipNext6) | |
547 | PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A) | |
548 | MSG_SCHED_ROUND_00_15( W7, 7, skipNext7) | |
549 | PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H) | |
550 | MSG_SCHED_ROUND_00_15( W8, 8, skipNext8) | |
551 | PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G) | |
552 | MSG_SCHED_ROUND_00_15( W9, 9, skipNext9) | |
553 | PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F) | |
554 | MSG_SCHED_ROUND_00_15(W10, 10, skipNext10) | |
555 | PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E) | |
556 | MSG_SCHED_ROUND_00_15(W11, 11, skipNext11) | |
557 | PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D) | |
558 | MSG_SCHED_ROUND_00_15(W12, 12, skipNext12) | |
559 | PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C) | |
560 | MSG_SCHED_ROUND_00_15(W13, 13, skipNext13) | |
561 | PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B) | |
562 | MSG_SCHED_ROUND_00_15(W14, 14, skipNext14) | |
563 | PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A) | |
564 | MSG_SCHED_ROUND_00_15(W15, 15, skipNext15) | |
565 | ||
566 | // Add old digest | |
567 | vmovdqu32 TMP2, A | |
568 | vmovdqu32 A, [SCRATCH + 64*0] | |
569 | vpaddd A{k1}, A, TMP2 | |
570 | vmovdqu32 TMP2, B | |
571 | vmovdqu32 B, [SCRATCH + 64*1] | |
572 | vpaddd B{k1}, B, TMP2 | |
573 | vmovdqu32 TMP2, C | |
574 | vmovdqu32 C, [SCRATCH + 64*2] | |
575 | vpaddd C{k1}, C, TMP2 | |
576 | vmovdqu32 TMP2, D | |
577 | vmovdqu32 D, [SCRATCH + 64*3] | |
578 | vpaddd D{k1}, D, TMP2 | |
579 | vmovdqu32 TMP2, E | |
580 | vmovdqu32 E, [SCRATCH + 64*4] | |
581 | vpaddd E{k1}, E, TMP2 | |
582 | vmovdqu32 TMP2, F | |
583 | vmovdqu32 F, [SCRATCH + 64*5] | |
584 | vpaddd F{k1}, F, TMP2 | |
585 | vmovdqu32 TMP2, G | |
586 | vmovdqu32 G, [SCRATCH + 64*6] | |
587 | vpaddd G{k1}, G, TMP2 | |
588 | vmovdqu32 TMP2, H | |
589 | vmovdqu32 H, [SCRATCH + 64*7] | |
590 | vpaddd H{k1}, H, TMP2 | |
591 | ||
592 | kmovq k1, mask | |
593 | JMP lloop | |
594 | ||
595 | lastLoop: | |
596 | // Process last 16 rounds | |
597 | PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H) | |
598 | PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G) | |
599 | PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F) | |
600 | PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E) | |
601 | PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D) | |
602 | PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C) | |
603 | PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B) | |
604 | PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A) | |
605 | PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H) | |
606 | PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G) | |
607 | PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F) | |
608 | PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E) | |
609 | PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D) | |
610 | PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C) | |
611 | PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B) | |
612 | PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A) | |
613 | ||
614 | // Add old digest | |
615 | vmovdqu32 TMP2, A | |
616 | vmovdqu32 A, [SCRATCH + 64*0] | |
617 | vpaddd A{k1}, A, TMP2 | |
618 | vmovdqu32 TMP2, B | |
619 | vmovdqu32 B, [SCRATCH + 64*1] | |
620 | vpaddd B{k1}, B, TMP2 | |
621 | vmovdqu32 TMP2, C | |
622 | vmovdqu32 C, [SCRATCH + 64*2] | |
623 | vpaddd C{k1}, C, TMP2 | |
624 | vmovdqu32 TMP2, D | |
625 | vmovdqu32 D, [SCRATCH + 64*3] | |
626 | vpaddd D{k1}, D, TMP2 | |
627 | vmovdqu32 TMP2, E | |
628 | vmovdqu32 E, [SCRATCH + 64*4] | |
629 | vpaddd E{k1}, E, TMP2 | |
630 | vmovdqu32 TMP2, F | |
631 | vmovdqu32 F, [SCRATCH + 64*5] | |
632 | vpaddd F{k1}, F, TMP2 | |
633 | vmovdqu32 TMP2, G | |
634 | vmovdqu32 G, [SCRATCH + 64*6] | |
635 | vpaddd G{k1}, G, TMP2 | |
636 | vmovdqu32 TMP2, H | |
637 | vmovdqu32 H, [SCRATCH + 64*7] | |
638 | vpaddd H{k1}, H, TMP2 | |
639 | ||
640 | // Write out digest | |
641 | vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A | |
642 | vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B | |
643 | vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C | |
644 | vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D | |
645 | vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E | |
646 | vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F | |
647 | vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G | |
648 | vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H | |
649 | ||
650 | VZEROUPPER | |
651 | RET | |
652 | ||
653 | // | |
654 | // Tables | |
655 | // | |
656 | ||
// PSHUFFLE_BYTE_FLIP_MASK is a 64-byte table (GLOBL flag 8 is RODATA in textflag.h)
// that repeats one 16-byte pattern four times. Stored little-endian, the pattern
// is the byte sequence 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c, i.e. the
// byte order of every 32-bit word reversed.
// NOTE(review): presumably the control operand of a byte shuffle that converts
// big-endian message words on load; the use site is outside this excerpt.
657 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x000(SB)/8, $0x0405060700010203 | |
658 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x008(SB)/8, $0x0c0d0e0f08090a0b | |
659 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x010(SB)/8, $0x0405060700010203 | |
660 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x018(SB)/8, $0x0c0d0e0f08090a0b | |
661 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x020(SB)/8, $0x0405060700010203 | |
662 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x028(SB)/8, $0x0c0d0e0f08090a0b | |
663 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x030(SB)/8, $0x0405060700010203 | |
664 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x038(SB)/8, $0x0c0d0e0f08090a0b | |
665 | GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), 8, $64 | |
666 | ||
// PSHUFFLE_TRANSPOSE16_MASK1 is a 64-byte read-only table of eight 64-bit
// values: 0, 1, 8, 9, 4, 5, 12, 13.
// NOTE(review): presumably quadword indices for a two-source permute used by
// the 16-lane transpose; the use site is outside this excerpt - TODO confirm.
667 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x000(SB)/8, $0x0000000000000000 | |
668 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x008(SB)/8, $0x0000000000000001 | |
669 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x010(SB)/8, $0x0000000000000008 | |
670 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x018(SB)/8, $0x0000000000000009 | |
671 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x020(SB)/8, $0x0000000000000004 | |
672 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x028(SB)/8, $0x0000000000000005 | |
673 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x030(SB)/8, $0x000000000000000C | |
674 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x038(SB)/8, $0x000000000000000D | |
675 | GLOBL PSHUFFLE_TRANSPOSE16_MASK1<>(SB), 8, $64 | |
676 | ||
// PSHUFFLE_TRANSPOSE16_MASK2 is a 64-byte read-only table of eight 64-bit
// values: 2, 3, 10, 11, 6, 7, 14, 15 (each entry of MASK1 plus two).
// NOTE(review): presumably the companion index vector to MASK1 for the
// 16-lane transpose; the use site is outside this excerpt - TODO confirm.
677 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x000(SB)/8, $0x0000000000000002 | |
678 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x008(SB)/8, $0x0000000000000003 | |
679 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x010(SB)/8, $0x000000000000000A | |
680 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x018(SB)/8, $0x000000000000000B | |
681 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x020(SB)/8, $0x0000000000000006 | |
682 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x028(SB)/8, $0x0000000000000007 | |
683 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x030(SB)/8, $0x000000000000000E | |
684 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x038(SB)/8, $0x000000000000000F | |
685 | GLOBL PSHUFFLE_TRANSPOSE16_MASK2<>(SB), 8, $64 |
0 | //+build !noasm | |
1 | ||
2 | /* | |
3 | * Minio Cloud Storage, (C) 2017 Minio, Inc. | |
4 | * | |
5 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
6 | * you may not use this file except in compliance with the License. | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package sha256 | |
19 | ||
20 | import ( | |
21 | "encoding/binary" | |
22 | "errors" | |
23 | "hash" | |
24 | "sort" | |
25 | "sync/atomic" | |
26 | "time" | |
27 | ) | |
28 | ||
// sha256_x16_avx512 is implemented in assembly and has no Go body. It is called
// by blockAvx512 to hash up to 16 inputs in parallel.
//
//   - digests: the 16 lane states, laid out as 8 rows of 16 32-bit words (see
//     getDigests); getDigest reads the updated states back from it after the call.
//   - scratch: a 512-byte work area supplied zeroed by the caller.
//   - table:   the round-constant table.
//   - mask:    one entry per 64-byte round; bit i is set while lane i still has
//     data (see genMask and expandMask).
//   - inputs:  the data for each of the 16 lanes; a lane without data is nil.
29 | //go:noescape | |
30 | func sha256_x16_avx512(digests *[512]byte, scratch *[512]byte, table *[4096]uint64, mask []uint64, inputs [16][]byte) | |
31 | ||
32 | // Do not start at 0 but next multiple of 16 so as to be able to | |
33 | // differentiate with default initialization value of 0 | |
34 | const Avx512ServerUid = 16 | |
35 | ||
36 | var uidCounter uint64 | |
37 | ||
38 | func NewAvx512(a512srv *Avx512Server) hash.Hash { | |
39 | uid := atomic.AddUint64(&uidCounter, 1) | |
40 | return &Avx512Digest{uid: uid, a512srv: a512srv} | |
41 | } | |
42 | ||
43 | // Type for computing SHA256 using AVX51 | |
44 | type Avx512Digest struct { | |
45 | uid uint64 | |
46 | a512srv *Avx512Server | |
47 | x [chunk]byte | |
48 | nx int | |
49 | len uint64 | |
50 | final bool | |
51 | result [Size]byte | |
52 | } | |
53 | ||
54 | // Return size of checksum | |
55 | func (d *Avx512Digest) Size() int { return Size } | |
56 | ||
57 | // Return blocksize of checksum | |
58 | func (d Avx512Digest) BlockSize() int { return BlockSize } | |
59 | ||
60 | func (d *Avx512Digest) Reset() { | |
61 | d.a512srv.blocksCh <- blockInput{uid: d.uid, reset: true} | |
62 | d.nx = 0 | |
63 | d.len = 0 | |
64 | d.final = false | |
65 | } | |
66 | ||
67 | // Write to digest | |
68 | func (d *Avx512Digest) Write(p []byte) (nn int, err error) { | |
69 | ||
70 | if d.final { | |
71 | return 0, errors.New("Avx512Digest already finalized. Reset first before writing again.") | |
72 | } | |
73 | ||
74 | nn = len(p) | |
75 | d.len += uint64(nn) | |
76 | if d.nx > 0 { | |
77 | n := copy(d.x[d.nx:], p) | |
78 | d.nx += n | |
79 | if d.nx == chunk { | |
80 | d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: d.x[:]} | |
81 | d.nx = 0 | |
82 | } | |
83 | p = p[n:] | |
84 | } | |
85 | if len(p) >= chunk { | |
86 | n := len(p) &^ (chunk - 1) | |
87 | d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: p[:n]} | |
88 | p = p[n:] | |
89 | } | |
90 | if len(p) > 0 { | |
91 | d.nx = copy(d.x[:], p) | |
92 | } | |
93 | return | |
94 | } | |
95 | ||
96 | // Return sha256 sum in bytes | |
97 | func (d *Avx512Digest) Sum(in []byte) (result []byte) { | |
98 | ||
99 | if d.final { | |
100 | return append(in, d.result[:]...) | |
101 | } | |
102 | ||
103 | trail := make([]byte, 0, 128) | |
104 | ||
105 | len := d.len | |
106 | // Padding. Add a 1 bit and 0 bits until 56 bytes mod 64. | |
107 | var tmp [64]byte | |
108 | tmp[0] = 0x80 | |
109 | if len%64 < 56 { | |
110 | trail = append(d.x[:d.nx], tmp[0:56-len%64]...) | |
111 | } else { | |
112 | trail = append(d.x[:d.nx], tmp[0:64+56-len%64]...) | |
113 | } | |
114 | d.nx = 0 | |
115 | ||
116 | // Length in bits. | |
117 | len <<= 3 | |
118 | for i := uint(0); i < 8; i++ { | |
119 | tmp[i] = byte(len >> (56 - 8*i)) | |
120 | } | |
121 | trail = append(trail, tmp[0:8]...) | |
122 | ||
123 | sumCh := make(chan [Size]byte) | |
124 | d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: trail, final: true, sumCh: sumCh} | |
125 | d.result = <-sumCh | |
126 | d.final = true | |
127 | return append(in, d.result[:]...) | |
128 | } | |
129 | ||
// table holds the SHA256 round constants in the layout passed to the assembly
// routine: every 32-bit constant is duplicated into both halves of a uint64,
// and that uint64 is stored 8 times in a row. The 64 constants therefore fill
// the first 512 entries; the remaining entries of the array are zero.
var table = func() [4096]uint64 {
	k := [64]uint32{
		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
	}

	var t [4096]uint64
	for i, c := range k {
		pair := uint64(c)<<32 | uint64(c)
		for j := 0; j < 8; j++ {
			t[i*8+j] = pair
		}
	}
	return t
}()
259 | ||
260 | // Interface function to assembly ode | |
261 | func blockAvx512(digests *[512]byte, input [16][]byte, mask []uint64) [16][Size]byte { | |
262 | ||
263 | scratch := [512]byte{} | |
264 | sha256_x16_avx512(digests, &scratch, &table, mask, input) | |
265 | ||
266 | output := [16][Size]byte{} | |
267 | for i := 0; i < 16; i++ { | |
268 | output[i] = getDigest(i, digests[:]) | |
269 | } | |
270 | ||
271 | return output | |
272 | } | |
273 | ||
274 | func getDigest(index int, state []byte) (sum [Size]byte) { | |
275 | for j := 0; j < 16; j += 2 { | |
276 | for i := index*4 + j*Size; i < index*4+(j+1)*Size; i += Size { | |
277 | binary.BigEndian.PutUint32(sum[j*2:], binary.LittleEndian.Uint32(state[i:i+4])) | |
278 | } | |
279 | } | |
280 | return | |
281 | } | |
282 | ||
283 | // Message to send across input channel | |
284 | type blockInput struct { | |
285 | uid uint64 | |
286 | msg []byte | |
287 | reset bool | |
288 | final bool | |
289 | sumCh chan [Size]byte | |
290 | } | |
291 | ||
292 | // Type to implement 16x parallel handling of SHA256 invocations | |
293 | type Avx512Server struct { | |
294 | blocksCh chan blockInput // Input channel | |
295 | totalIn int // Total number of inputs waiting to be processed | |
296 | lanes [16]Avx512LaneInfo // Array with info per lane (out of 16) | |
297 | digests map[uint64][Size]byte // Map of uids to (interim) digest results | |
298 | } | |
299 | ||
300 | // Info for each lane | |
301 | type Avx512LaneInfo struct { | |
302 | uid uint64 // unique identification for this SHA processing | |
303 | block []byte // input block to be processed | |
304 | outputCh chan [Size]byte // channel for output result | |
305 | } | |
306 | ||
307 | // Create new object for parallel processing handling | |
308 | func NewAvx512Server() *Avx512Server { | |
309 | a512srv := &Avx512Server{} | |
310 | a512srv.digests = make(map[uint64][Size]byte) | |
311 | a512srv.blocksCh = make(chan blockInput) | |
312 | ||
313 | // Start a single thread for reading from the input channel | |
314 | go a512srv.Process() | |
315 | return a512srv | |
316 | } | |
317 | ||
318 | // Sole handler for reading from the input channel | |
319 | func (a512srv *Avx512Server) Process() { | |
320 | for { | |
321 | select { | |
322 | case block := <-a512srv.blocksCh: | |
323 | if block.reset { | |
324 | a512srv.reset(block.uid) | |
325 | continue | |
326 | } | |
327 | index := block.uid & 0xf | |
328 | // fmt.Println("Adding message:", block.uid, index) | |
329 | ||
330 | if a512srv.lanes[index].block != nil { // If slot is already filled, process all inputs | |
331 | //fmt.Println("Invoking Blocks()") | |
332 | a512srv.blocks() | |
333 | } | |
334 | a512srv.totalIn++ | |
335 | a512srv.lanes[index] = Avx512LaneInfo{uid: block.uid, block: block.msg} | |
336 | if block.final { | |
337 | a512srv.lanes[index].outputCh = block.sumCh | |
338 | } | |
339 | if a512srv.totalIn == len(a512srv.lanes) { | |
340 | // fmt.Println("Invoking Blocks() while FULL: ") | |
341 | a512srv.blocks() | |
342 | } | |
343 | ||
344 | // TODO: test with larger timeout | |
345 | case <-time.After(1 * time.Microsecond): | |
346 | for _, lane := range a512srv.lanes { | |
347 | if lane.block != nil { // check if there is any input to process | |
348 | // fmt.Println("Invoking Blocks() on TIMEOUT: ") | |
349 | a512srv.blocks() | |
350 | break // we are done | |
351 | } | |
352 | } | |
353 | } | |
354 | } | |
355 | } | |
356 | ||
357 | // Do a reset for this calculation | |
358 | func (a512srv *Avx512Server) reset(uid uint64) { | |
359 | ||
360 | // Check if there is a message still waiting to be processed (and remove if so) | |
361 | for i, lane := range a512srv.lanes { | |
362 | if lane.uid == uid { | |
363 | if lane.block != nil { | |
364 | a512srv.lanes[i] = Avx512LaneInfo{} // clear message | |
365 | a512srv.totalIn -= 1 | |
366 | } | |
367 | } | |
368 | } | |
369 | ||
370 | // Delete entry from hash map | |
371 | delete(a512srv.digests, uid) | |
372 | } | |
373 | ||
374 | // Invoke assembly and send results back | |
375 | func (a512srv *Avx512Server) blocks() (err error) { | |
376 | ||
377 | inputs := [16][]byte{} | |
378 | for i := range inputs { | |
379 | inputs[i] = a512srv.lanes[i].block | |
380 | } | |
381 | ||
382 | mask := expandMask(genMask(inputs)) | |
383 | outputs := blockAvx512(a512srv.getDigests(), inputs, mask) | |
384 | ||
385 | a512srv.totalIn = 0 | |
386 | for i := 0; i < len(outputs); i++ { | |
387 | uid, outputCh := a512srv.lanes[i].uid, a512srv.lanes[i].outputCh | |
388 | a512srv.digests[uid] = outputs[i] | |
389 | a512srv.lanes[i] = Avx512LaneInfo{} | |
390 | ||
391 | if outputCh != nil { | |
392 | // Send back result | |
393 | outputCh <- outputs[i] | |
394 | delete(a512srv.digests, uid) // Delete entry from hashmap | |
395 | } | |
396 | } | |
397 | return | |
398 | } | |
399 | ||
400 | func (a512srv *Avx512Server) Write(uid uint64, p []byte) (nn int, err error) { | |
401 | a512srv.blocksCh <- blockInput{uid: uid, msg: p} | |
402 | return len(p), nil | |
403 | } | |
404 | ||
405 | func (a512srv *Avx512Server) Sum(uid uint64, p []byte) [32]byte { | |
406 | sumCh := make(chan [32]byte) | |
407 | a512srv.blocksCh <- blockInput{uid: uid, msg: p, final: true, sumCh: sumCh} | |
408 | return <-sumCh | |
409 | } | |
410 | ||
411 | func (a512srv *Avx512Server) getDigests() *[512]byte { | |
412 | digests := [512]byte{} | |
413 | for i, lane := range a512srv.lanes { | |
414 | a, ok := a512srv.digests[lane.uid] | |
415 | if ok { | |
416 | binary.BigEndian.PutUint32(digests[(i+0*16)*4:], binary.LittleEndian.Uint32(a[0:4])) | |
417 | binary.BigEndian.PutUint32(digests[(i+1*16)*4:], binary.LittleEndian.Uint32(a[4:8])) | |
418 | binary.BigEndian.PutUint32(digests[(i+2*16)*4:], binary.LittleEndian.Uint32(a[8:12])) | |
419 | binary.BigEndian.PutUint32(digests[(i+3*16)*4:], binary.LittleEndian.Uint32(a[12:16])) | |
420 | binary.BigEndian.PutUint32(digests[(i+4*16)*4:], binary.LittleEndian.Uint32(a[16:20])) | |
421 | binary.BigEndian.PutUint32(digests[(i+5*16)*4:], binary.LittleEndian.Uint32(a[20:24])) | |
422 | binary.BigEndian.PutUint32(digests[(i+6*16)*4:], binary.LittleEndian.Uint32(a[24:28])) | |
423 | binary.BigEndian.PutUint32(digests[(i+7*16)*4:], binary.LittleEndian.Uint32(a[28:32])) | |
424 | } else { | |
425 | binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0) | |
426 | binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1) | |
427 | binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2) | |
428 | binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3) | |
429 | binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4) | |
430 | binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5) | |
431 | binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6) | |
432 | binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7) | |
433 | } | |
434 | } | |
435 | return &digests | |
436 | } | |
437 | ||
// lane pairs the length of one input with the index of the lane it sits on,
// so that inputs can be sorted by length without losing their position.
type lane struct {
	len uint // input length in bytes
	pos uint // index of the lane holding the input
}

// lanes implements sort.Interface, ordering by ascending input length.
type lanes []lane

// Len reports the number of entries.
func (lns lanes) Len() int {
	return len(lns)
}

// Swap exchanges entries i and j.
func (lns lanes) Swap(i, j int) {
	tmp := lns[i]
	lns[i] = lns[j]
	lns[j] = tmp
}

// Less reports whether entry i is shorter than entry j.
func (lns lanes) Less(i, j int) bool {
	return lns[j].len > lns[i].len
}
449 | ||
450 | // Helper struct for | |
451 | type maskRounds struct { | |
452 | mask uint64 | |
453 | rounds uint64 | |
454 | } | |
455 | ||
456 | func genMask(input [16][]byte) [16]maskRounds { | |
457 | ||
458 | // Sort on blocks length small to large | |
459 | var sorted [16]lane | |
460 | for c, inpt := range input { | |
461 | sorted[c] = lane{uint(len(inpt)), uint(c)} | |
462 | } | |
463 | sort.Sort(lanes(sorted[:])) | |
464 | ||
465 | // Create mask array including 'rounds' between masks | |
466 | m, round, index := uint64(0xffff), uint64(0), 0 | |
467 | var mr [16]maskRounds | |
468 | for _, s := range sorted { | |
469 | if s.len > 0 { | |
470 | if uint64(s.len)>>6 > round { | |
471 | mr[index] = maskRounds{m, (uint64(s.len) >> 6) - round} | |
472 | index++ | |
473 | } | |
474 | round = uint64(s.len) >> 6 | |
475 | } | |
476 | m = m & ^(1 << uint(s.pos)) | |
477 | } | |
478 | ||
479 | return mr | |
480 | } | |
481 | ||
482 | // TODO: remove function | |
483 | func expandMask(mr [16]maskRounds) []uint64 { | |
484 | size := uint64(0) | |
485 | for _, r := range mr { | |
486 | size += r.rounds | |
487 | } | |
488 | result, index := make([]uint64, size), 0 | |
489 | for _, r := range mr { | |
490 | for j := uint64(0); j < r.rounds; j++ { | |
491 | result[index] = r.mask | |
492 | index++ | |
493 | } | |
494 | } | |
495 | return result | |
496 | } |
0 | TEXT ·sha256_x16_avx512(SB), 7, $0 | |
1 | MOVQ digests+0(FP), DI | |
2 | MOVQ scratch+8(FP), R12 | |
3 | MOVQ mask_len+32(FP), SI | |
4 | MOVQ r14+24(FP), R13 | |
5 | MOVQ (R13), R14 | |
6 | LONG $0x92fbc1c4; BYTE $0xce | |
7 | LEAQ inputs+48(FP), AX | |
8 | QUAD $0xf162076f487ef162; QUAD $0x7ef162014f6f487e; QUAD $0x487ef16202576f48; QUAD $0x6f487ef162035f6f; QUAD $0x6f6f487ef1620467; QUAD $0x06776f487ef16205; LONG $0x487ef162; WORD $0x7f6f; BYTE $0x07 | |
9 | MOVQ table+16(FP), DX | |
10 | WORD $0x3148; BYTE $0xc9 | |
11 | TESTQ $(1<<0), R14 | |
12 | JE skipInput0 | |
13 | MOVQ 0*24(AX), R9 | |
14 | LONG $0x487cc162; WORD $0x0410; BYTE $0x09 | |
15 | skipInput0: | |
16 | TESTQ $(1<<1), R14 | |
17 | JE skipInput1 | |
18 | MOVQ 1*24(AX), R9 | |
19 | LONG $0x487cc162; WORD $0x0c10; BYTE $0x09 | |
20 | skipInput1: | |
21 | TESTQ $(1<<2), R14 | |
22 | JE skipInput2 | |
23 | MOVQ 2*24(AX), R9 | |
24 | LONG $0x487cc162; WORD $0x1410; BYTE $0x09 | |
25 | skipInput2: | |
26 | TESTQ $(1<<3), R14 | |
27 | JE skipInput3 | |
28 | MOVQ 3*24(AX), R9 | |
29 | LONG $0x487cc162; WORD $0x1c10; BYTE $0x09 | |
30 | skipInput3: | |
31 | TESTQ $(1<<4), R14 | |
32 | JE skipInput4 | |
33 | MOVQ 4*24(AX), R9 | |
34 | LONG $0x487cc162; WORD $0x2410; BYTE $0x09 | |
35 | skipInput4: | |
36 | TESTQ $(1<<5), R14 | |
37 | JE skipInput5 | |
38 | MOVQ 5*24(AX), R9 | |
39 | LONG $0x487cc162; WORD $0x2c10; BYTE $0x09 | |
40 | skipInput5: | |
41 | TESTQ $(1<<6), R14 | |
42 | JE skipInput6 | |
43 | MOVQ 6*24(AX), R9 | |
44 | LONG $0x487cc162; WORD $0x3410; BYTE $0x09 | |
45 | skipInput6: | |
46 | TESTQ $(1<<7), R14 | |
47 | JE skipInput7 | |
48 | MOVQ 7*24(AX), R9 | |
49 | LONG $0x487cc162; WORD $0x3c10; BYTE $0x09 | |
50 | skipInput7: | |
51 | TESTQ $(1<<8), R14 | |
52 | JE skipInput8 | |
53 | MOVQ 8*24(AX), R9 | |
54 | LONG $0x487c4162; WORD $0x0410; BYTE $0x09 | |
55 | skipInput8: | |
56 | TESTQ $(1<<9), R14 | |
57 | JE skipInput9 | |
58 | MOVQ 9*24(AX), R9 | |
59 | LONG $0x487c4162; WORD $0x0c10; BYTE $0x09 | |
60 | skipInput9: | |
61 | TESTQ $(1<<10), R14 | |
62 | JE skipInput10 | |
63 | MOVQ 10*24(AX), R9 | |
64 | LONG $0x487c4162; WORD $0x1410; BYTE $0x09 | |
65 | skipInput10: | |
66 | TESTQ $(1<<11), R14 | |
67 | JE skipInput11 | |
68 | MOVQ 11*24(AX), R9 | |
69 | LONG $0x487c4162; WORD $0x1c10; BYTE $0x09 | |
70 | skipInput11: | |
71 | TESTQ $(1<<12), R14 | |
72 | JE skipInput12 | |
73 | MOVQ 12*24(AX), R9 | |
74 | LONG $0x487c4162; WORD $0x2410; BYTE $0x09 | |
75 | skipInput12: | |
76 | TESTQ $(1<<13), R14 | |
77 | JE skipInput13 | |
78 | MOVQ 13*24(AX), R9 | |
79 | LONG $0x487c4162; WORD $0x2c10; BYTE $0x09 | |
80 | skipInput13: | |
81 | TESTQ $(1<<14), R14 | |
82 | JE skipInput14 | |
83 | MOVQ 14*24(AX), R9 | |
84 | LONG $0x487c4162; WORD $0x3410; BYTE $0x09 | |
85 | skipInput14: | |
86 | TESTQ $(1<<15), R14 | |
87 | JE skipInput15 | |
88 | MOVQ 15*24(AX), R9 | |
89 | LONG $0x487c4162; WORD $0x3c10; BYTE $0x09 | |
90 | skipInput15: | |
91 | lloop: | |
92 | LEAQ PSHUFFLE_BYTE_FLIP_MASK<>(SB), DX | |
93 | LONG $0x487e7162; WORD $0x1a6f | |
94 | MOVQ table+16(FP), DX | |
95 | QUAD $0xd162226f487e7162; QUAD $0x7ed16224047f487e; QUAD $0x7ed16201244c7f48; QUAD $0x7ed1620224547f48; QUAD $0x7ed16203245c7f48; QUAD $0x7ed1620424647f48; QUAD $0x7ed16205246c7f48; QUAD $0x7ed1620624747f48; QUAD $0xc1834807247c7f48; QUAD $0x44c9c6407c316240; QUAD $0x62eec1c6407ca162; QUAD $0xa16244d3c6406c31; QUAD $0x34c162eed3c6406c; QUAD $0x407ca162dddac648; QUAD $0xc6407ca16288cac6; QUAD $0xcac648345162ddc2; QUAD $0x44d5c6405ca16288; QUAD $0x62eee5c6405ca162; QUAD $0xa16244d7c6404c31; QUAD $0x6cc162eef7c6404c; QUAD $0x405ca162ddfac640; QUAD $0xc6405ca16288eec6; QUAD $0xd2c6406cc162dde6; QUAD $0x44f1c6403c816288; QUAD $0x62eec1c6403c0162; QUAD $0x016244d3c6402c11; QUAD $0x4c4162eed3c6402c; QUAD $0x403c0162dddac640; QUAD $0xc6403c016288cac6; QUAD $0xf2c6404cc162ddc2; QUAD $0x44d5c6401c016288; QUAD $0x62eee5c6401c0162; QUAD $0x016244d7c6400c11; QUAD $0x2c4162eef7c6400c; QUAD $0x401c0162ddfac640; QUAD $0xc6401c016288eec6; QUAD $0xd2c6402c4162dde6; BYTE $0x88 | |
96 | LEAQ PSHUFFLE_TRANSPOSE16_MASK1<>(SB), BX | |
97 | LEAQ PSHUFFLE_TRANSPOSE16_MASK2<>(SB), R8 | |
98 | QUAD $0x2262336f487e6162; QUAD $0x487e5162f27648b5; QUAD $0xd27648b53262106f; QUAD $0xa262136f487ee162; QUAD $0x487e5162d77640e5; QUAD $0xcf7640e53262086f; QUAD $0xa2621b6f487ee162; QUAD $0x487ec162dd7640f5; QUAD $0xfd7640f5a262386f; QUAD $0xa2620b6f487ee162; QUAD $0x487ec162cc7640fd; QUAD $0xec7640fda262286f; QUAD $0x8262036f487ee162; QUAD $0x487ec162c27640cd; QUAD $0xe27640cd8262206f; QUAD $0x8262336f487ee162; QUAD $0x487e4162f77640a5; QUAD $0xd77640a50262106f; QUAD $0x02621b6f487e6162; QUAD $0x487e4162dd7640b5; QUAD $0xfd7640b50262386f; QUAD $0x02620b6f487e6162; QUAD $0x487e4162cc7640bd; QUAD $0xec7640bd0262286f; QUAD $0x62eec023408d2362; QUAD $0x236244c023408da3; QUAD $0xada362eee42348ad; QUAD $0x40c5036244e42348; QUAD $0x2340c51362eef723; QUAD $0xfd2340d5036244d7; QUAD $0x44fd2340d58362ee; QUAD $0x62eeea2348b50362; QUAD $0x036244ea2348b583; QUAD $0xe51362eed32340e5; QUAD $0x40f5036244cb2340; QUAD $0x2340f58362eed923; QUAD $0xce2340ed236244d9; QUAD $0x44ce2340eda362ee; QUAD $0xc162d16f487ec162; QUAD $0x407dc262f26f487e; QUAD $0xcb004075c262c300; QUAD $0xc262d300406dc262; QUAD $0x405dc262db004065; QUAD $0xeb004055c262e300; QUAD $0xc262f300404dc262; QUAD $0x403d4262fb004045; QUAD $0xcb0040354262c300; QUAD $0x4262d300402d4262; QUAD $0x401d4262db004025; QUAD $0xeb0040154262e300; QUAD $0x4262f300400d4262; QUAD $0x48455162fb004005; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d3162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6201626f487e7162; QUAD $0x916211c672481591; QUAD $0x05916213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe407dc16296ef25; QUAD $0x62c1fe407d8162c5; QUAD $0xb16207c1724815b1; QUAD $0x05b16212c172480d; QUAD $0x480d536203d17248; QUAD 
$0xfe407dc16296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815916202626f48; QUAD $0x72480d916211c772; QUAD $0xd7724805916213c7; QUAD $0x96ef25480d53620a; QUAD $0x8162cdfe4075c162; QUAD $0x4815b162cafe4075; QUAD $0x72480db16207c272; QUAD $0xd2724805b16212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe4075c162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c0724815b16203; QUAD $0x6213c072480db162; QUAD $0x53620ad0724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe406d8162d5fe40; QUAD $0x07c3724815b162d3; QUAD $0x6212c372480db162; QUAD $0x536203d3724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d3162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0xb16204626f487e71; QUAD $0x0db16211c1724815; QUAD $0x4805b16213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4065c16296ef; QUAD $0xb162dcfe40658162; QUAD $0x0db16207c4724815; QUAD $0x4805b16212c47248; QUAD $0x25480d536203d472; 
QUAD $0xddfe4065c16296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x724815b16205626f; QUAD $0xc272480db16211c2; QUAD $0x0ad2724805b16213; QUAD $0x6296ef25480d5362; QUAD $0x5d8162e5fe405dc1; QUAD $0x724815b162e5fe40; QUAD $0xc572480db16207c5; QUAD $0x03d5724805b16212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe405dc1; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d3162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x06626f487e7162d0; QUAD $0x6211c3724815b162; QUAD $0xb16213c372480db1; QUAD $0x0d53620ad3724805; QUAD $0x4055c16296ef2548; QUAD $0xeefe40558162edfe; QUAD $0x6207c6724815b162; QUAD $0xb16212c672480db1; QUAD $0x0d536203d6724805; QUAD $0x4055c16296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x15b16207626f487e; QUAD $0x480db16211c47248; QUAD $0x724805b16213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe404dc16296; QUAD $0x15b162f7fe404d81; QUAD $0x480db16207c77248; QUAD $0x724805b16212c772; QUAD 
$0xef25480d536203d7; QUAD $0x62f5fe404dc16296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc5724815b1620862; QUAD $0x13c572480db16211; QUAD $0x620ad5724805b162; QUAD $0xc16296ef25480d53; QUAD $0x4045a162fdfe4045; QUAD $0xc07248159162f8fe; QUAD $0x12c072480d916207; QUAD $0x6203d07248059162; QUAD $0xc16296ef25480d53; QUAD $0x48455162fdfe4045; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d1162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6209626f487e7162; QUAD $0xb16211c6724815b1; QUAD $0x05b16213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe403d416296ef25; QUAD $0x62c1fe403d2162c5; QUAD $0x916207c172481591; QUAD $0x05916212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe403d416296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815b1620a626f48; QUAD $0x72480db16211c772; QUAD $0xd7724805b16213c7; QUAD $0x96ef25480d53620a; QUAD $0x2162cdfe40354162; QUAD $0x48159162cafe4035; QUAD $0x72480d916207c272; QUAD $0xd2724805916212c2; 
QUAD $0x96ef25480d536203; QUAD $0x5162cdfe40354162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c072481591620b; QUAD $0x6213c072480d9162; QUAD $0x53620ad072480591; QUAD $0x2d416296ef25480d; QUAD $0xfe402d2162d5fe40; QUAD $0x07c37248159162d3; QUAD $0x6212c372480d9162; QUAD $0x536203d372480591; QUAD $0x2d416296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d1162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0x91620c626f487e71; QUAD $0x0d916211c1724815; QUAD $0x4805916213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4025416296ef; QUAD $0x9162dcfe40252162; QUAD $0x0d916207c4724815; QUAD $0x4805916212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4025416296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x72481591620d626f; QUAD $0xc272480d916211c2; QUAD $0x0ad2724805916213; QUAD $0x6296ef25480d5362; QUAD $0x1d2162e5fe401d41; QUAD $0x7248159162e5fe40; QUAD $0xc572480d916207c5; QUAD 
$0x03d5724805916212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe401d41; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d1162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x0e626f487e7162d0; QUAD $0x6211c37248159162; QUAD $0x916213c372480d91; QUAD $0x0d53620ad3724805; QUAD $0x4015416296ef2548; QUAD $0xeefe40152162edfe; QUAD $0x6207c67248159162; QUAD $0x916212c672480d91; QUAD $0x0d536203d6724805; QUAD $0x4015416296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x1591620f626f487e; QUAD $0x480d916211c47248; QUAD $0x724805916213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe400d416296; QUAD $0x159162f7fe400d21; QUAD $0x480d916207c77248; QUAD $0x724805916212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe400d416296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc572481591621062; QUAD $0x13c572480d916211; QUAD $0x620ad57248059162; QUAD $0x416296ef25480d53; QUAD $0x40050162fdfe4005; QUAD $0xc0724815b162f8fe; QUAD $0x12c072480db16207; 
QUAD $0x6203d0724805b162; QUAD $0x416296ef25480d53; QUAD $0x48455162fdfe4005; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d3162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6211626f487e7162; QUAD $0x916211c672481591; QUAD $0x05916213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe407dc16296ef25; QUAD $0x62c1fe407d8162c5; QUAD $0xb16207c1724815b1; QUAD $0x05b16212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe407dc16296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815916212626f48; QUAD $0x72480d916211c772; QUAD $0xd7724805916213c7; QUAD $0x96ef25480d53620a; QUAD $0x8162cdfe4075c162; QUAD $0x4815b162cafe4075; QUAD $0x72480db16207c272; QUAD $0xd2724805b16212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe4075c162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c0724815b16213; QUAD $0x6213c072480db162; QUAD $0x53620ad0724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe406d8162d5fe40; QUAD $0x07c3724815b162d3; QUAD 
$0x6212c372480db162; QUAD $0x536203d3724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d3162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0xb16214626f487e71; QUAD $0x0db16211c1724815; QUAD $0x4805b16213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4065c16296ef; QUAD $0xb162dcfe40658162; QUAD $0x0db16207c4724815; QUAD $0x4805b16212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4065c16296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x724815b16215626f; QUAD $0xc272480db16211c2; QUAD $0x0ad2724805b16213; QUAD $0x6296ef25480d5362; QUAD $0x5d8162e5fe405dc1; QUAD $0x724815b162e5fe40; QUAD $0xc572480db16207c5; QUAD $0x03d5724805b16212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe405dc1; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d3162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x16626f487e7162d0; QUAD $0x6211c3724815b162; QUAD $0xb16213c372480db1; QUAD $0x0d53620ad3724805; QUAD $0x4055c16296ef2548; QUAD $0xeefe40558162edfe; QUAD $0x6207c6724815b162; 
QUAD $0xb16212c672480db1; QUAD $0x0d536203d6724805; QUAD $0x4055c16296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x15b16217626f487e; QUAD $0x480db16211c47248; QUAD $0x724805b16213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe404dc16296; QUAD $0x15b162f7fe404d81; QUAD $0x480db16207c77248; QUAD $0x724805b16212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe404dc16296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc5724815b1621862; QUAD $0x13c572480db16211; QUAD $0x620ad5724805b162; QUAD $0xc16296ef25480d53; QUAD $0x4045a162fdfe4045; QUAD $0xc07248159162f8fe; QUAD $0x12c072480d916207; QUAD $0x6203d07248059162; QUAD $0xc16296ef25480d53; QUAD $0x48455162fdfe4045; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d1162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6219626f487e7162; QUAD $0xb16211c6724815b1; QUAD $0x05b16213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe403d416296ef25; QUAD $0x62c1fe403d2162c5; QUAD 
$0x916207c172481591; QUAD $0x05916212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe403d416296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815b1621a626f48; QUAD $0x72480db16211c772; QUAD $0xd7724805b16213c7; QUAD $0x96ef25480d53620a; QUAD $0x2162cdfe40354162; QUAD $0x48159162cafe4035; QUAD $0x72480d916207c272; QUAD $0xd2724805916212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe40354162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c072481591621b; QUAD $0x6213c072480d9162; QUAD $0x53620ad072480591; QUAD $0x2d416296ef25480d; QUAD $0xfe402d2162d5fe40; QUAD $0x07c37248159162d3; QUAD $0x6212c372480d9162; QUAD $0x536203d372480591; QUAD $0x2d416296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d1162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0x91621c626f487e71; QUAD $0x0d916211c1724815; QUAD $0x4805916213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4025416296ef; QUAD $0x9162dcfe40252162; 
QUAD $0x0d916207c4724815; QUAD $0x4805916212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4025416296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x72481591621d626f; QUAD $0xc272480d916211c2; QUAD $0x0ad2724805916213; QUAD $0x6296ef25480d5362; QUAD $0x1d2162e5fe401d41; QUAD $0x7248159162e5fe40; QUAD $0xc572480d916207c5; QUAD $0x03d5724805916212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe401d41; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d1162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x1e626f487e7162d0; QUAD $0x6211c37248159162; QUAD $0x916213c372480d91; QUAD $0x0d53620ad3724805; QUAD $0x4015416296ef2548; QUAD $0xeefe40152162edfe; QUAD $0x6207c67248159162; QUAD $0x916212c672480d91; QUAD $0x0d536203d6724805; QUAD $0x4015416296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x1591621f626f487e; QUAD $0x480d916211c47248; QUAD $0x724805916213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe400d416296; QUAD 
$0x159162f7fe400d21; QUAD $0x480d916207c77248; QUAD $0x724805916212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe400d416296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc572481591622062; QUAD $0x13c572480d916211; QUAD $0x620ad57248059162; QUAD $0x416296ef25480d53; QUAD $0x40050162fdfe4005; QUAD $0xc0724815b162f8fe; QUAD $0x12c072480db16207; QUAD $0x6203d0724805b162; QUAD $0x416296ef25480d53; QUAD $0x48455162fdfe4005; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d3162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6221626f487e7162; QUAD $0x916211c672481591; QUAD $0x05916213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe407dc16296ef25; QUAD $0x62c1fe407d8162c5; QUAD $0xb16207c1724815b1; QUAD $0x05b16212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe407dc16296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815916222626f48; QUAD $0x72480d916211c772; QUAD $0xd7724805916213c7; QUAD $0x96ef25480d53620a; QUAD $0x8162cdfe4075c162; 
QUAD $0x4815b162cafe4075; QUAD $0x72480db16207c272; QUAD $0xd2724805b16212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe4075c162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c0724815b16223; QUAD $0x6213c072480db162; QUAD $0x53620ad0724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe406d8162d5fe40; QUAD $0x07c3724815b162d3; QUAD $0x6212c372480db162; QUAD $0x536203d3724805b1; QUAD $0x6dc16296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d3162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0xb16224626f487e71; QUAD $0x0db16211c1724815; QUAD $0x4805b16213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4065c16296ef; QUAD $0xb162dcfe40658162; QUAD $0x0db16207c4724815; QUAD $0x4805b16212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4065c16296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x724815b16225626f; QUAD $0xc272480db16211c2; QUAD $0x0ad2724805b16213; QUAD $0x6296ef25480d5362; QUAD 
$0x5d8162e5fe405dc1; QUAD $0x724815b162e5fe40; QUAD $0xc572480db16207c5; QUAD $0x03d5724805b16212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe405dc1; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d3162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x26626f487e7162d0; QUAD $0x6211c3724815b162; QUAD $0xb16213c372480db1; QUAD $0x0d53620ad3724805; QUAD $0x4055c16296ef2548; QUAD $0xeefe40558162edfe; QUAD $0x6207c6724815b162; QUAD $0xb16212c672480db1; QUAD $0x0d536203d6724805; QUAD $0x4055c16296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x15b16227626f487e; QUAD $0x480db16211c47248; QUAD $0x724805b16213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe404dc16296; QUAD $0x15b162f7fe404d81; QUAD $0x480db16207c77248; QUAD $0x724805b16212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe404dc16296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc5724815b1622862; QUAD $0x13c572480db16211; QUAD $0x620ad5724805b162; QUAD $0xc16296ef25480d53; 
QUAD $0x4045a162fdfe4045; QUAD $0xc07248159162f8fe; QUAD $0x12c072480d916207; QUAD $0x6203d07248059162; QUAD $0xc16296ef25480d53; QUAD $0x48455162fdfe4045; QUAD $0xcc6f487e7162c4fe; QUAD $0x6206c472482df162; QUAD $0xf1620bc4724825f1; QUAD $0x55736219c472481d; QUAD $0x483d1162cace2548; QUAD $0xd42548255362c0fe; QUAD $0x62c1fe483d516296; QUAD $0x65d162c2fe483d51; QUAD $0x724845f162d8fe48; QUAD $0xc0724825f16202c0; QUAD $0x16c072481df1620d; QUAD $0x7362c86f487e7162; QUAD $0x25d362e8ca254875; QUAD $0x4845d16296fc2548; QUAD $0xf8fe4845d162f9fe; QUAD $0x6229626f487e7162; QUAD $0xb16211c6724815b1; QUAD $0x05b16213c672480d; QUAD $0x480d53620ad67248; QUAD $0xfe403d416296ef25; QUAD $0x62c1fe403d2162c5; QUAD $0x916207c172481591; QUAD $0x05916212c172480d; QUAD $0x480d536203d17248; QUAD $0xfe403d416296ef25; QUAD $0x62c4fe484d5162c5; QUAD $0x2df162cb6f487e71; QUAD $0x4825f16206c37248; QUAD $0x72481df1620bc372; QUAD $0xcd25485d736219c3; QUAD $0x62c1fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xd0fe486dd162c2fe; QUAD $0x6202c772484df162; QUAD $0xf1620dc7724825f1; QUAD $0x7e716216c772481d; QUAD $0x25487d7362cf6f48; QUAD $0xf4254825d362e8c9; QUAD $0x62f1fe484dd16296; QUAD $0x7e7162f0fe484dd1; QUAD $0x4815b1622a626f48; QUAD $0x72480db16211c772; QUAD $0xd7724805b16213c7; QUAD $0x96ef25480d53620a; QUAD $0x2162cdfe40354162; QUAD $0x48159162cafe4035; QUAD $0x72480d916207c272; QUAD $0xd2724805916212c2; QUAD $0x96ef25480d536203; QUAD $0x5162cdfe40354162; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x11c072481591622b; QUAD $0x6213c072480d9162; QUAD $0x53620ad072480591; QUAD 
$0x2d416296ef25480d; QUAD $0xfe402d2162d5fe40; QUAD $0x07c37248159162d3; QUAD $0x6212c372480d9162; QUAD $0x536203d372480591; QUAD $0x2d416296ef25480d; QUAD $0xfe485d5162d5fe40; QUAD $0x62c96f487e7162c4; QUAD $0xf16206c172482df1; QUAD $0x1df1620bc1724825; QUAD $0x486d736219c17248; QUAD $0xfe483d1162cacb25; QUAD $0x96d42548255362c3; QUAD $0x5162c1fe483d5162; QUAD $0x487dd162c2fe483d; QUAD $0xc572485df162c0fe; QUAD $0x0dc5724825f16202; QUAD $0x6216c572481df162; QUAD $0x4d7362cd6f487e71; QUAD $0x4825d362e8cf2548; QUAD $0xfe485dd16296e425; QUAD $0x62e0fe485dd162e1; QUAD $0x91622c626f487e71; QUAD $0x0d916211c1724815; QUAD $0x4805916213c17248; QUAD $0x25480d53620ad172; QUAD $0xddfe4025416296ef; QUAD $0x9162dcfe40252162; QUAD $0x0d916207c4724815; QUAD $0x4805916212c47248; QUAD $0x25480d536203d472; QUAD $0xddfe4025416296ef; QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; QUAD $0x72481591622d626f; QUAD $0xc272480d916211c2; QUAD $0x0ad2724805916213; QUAD $0x6296ef25480d5362; QUAD $0x1d2162e5fe401d41; QUAD $0x7248159162e5fe40; QUAD $0xc572480d916207c5; QUAD $0x03d5724805916212; QUAD $0x6296ef25480d5362; QUAD $0x6d5162e5fe401d41; QUAD $0x6f487e7162c4fe48; QUAD $0x06c772482df162cf; QUAD $0x620bc7724825f162; QUAD $0x736219c772481df1; QUAD $0x3d1162cac925487d; QUAD $0x2548255362c5fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x486df162f0fe484d; QUAD $0x724825f16202c372; QUAD $0xc372481df1620dc3; QUAD $0x62cb6f487e716216; QUAD $0xd362e8cd25485d73; QUAD $0x6dd16296d4254825; QUAD $0xfe486dd162d1fe48; QUAD $0x2e626f487e7162d0; QUAD $0x6211c37248159162; QUAD $0x916213c372480d91; QUAD $0x0d53620ad3724805; 
QUAD $0x4015416296ef2548; QUAD $0xeefe40152162edfe; QUAD $0x6207c67248159162; QUAD $0x916212c672480d91; QUAD $0x0d536203d6724805; QUAD $0x4015416296ef2548; QUAD $0xc4fe48755162edfe; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x1591622f626f487e; QUAD $0x480d916211c47248; QUAD $0x724805916213c472; QUAD $0xef25480d53620ad4; QUAD $0x62f5fe400d416296; QUAD $0x159162f7fe400d21; QUAD $0x480d916207c77248; QUAD $0x724805916212c772; QUAD $0xef25480d536203d7; QUAD $0x62f5fe400d416296; QUAD $0x7e7162c4fe487d51; QUAD $0x72482df162cd6f48; QUAD $0xc5724825f16206c5; QUAD $0x19c572481df1620b; QUAD $0x62cacf25484d7362; QUAD $0x255362c7fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162e0fe485dd162; QUAD $0x25f16202c172487d; QUAD $0x481df1620dc17248; QUAD $0x6f487e716216c172; QUAD $0xe8cb25486d7362c9; QUAD $0x6296c4254825d362; QUAD $0x7dd162c1fe487dd1; QUAD $0x6f487e7162c0fe48; QUAD $0xc572481591623062; QUAD $0x13c572480d916211; QUAD $0x620ad57248059162; QUAD $0x416296ef25480d53; QUAD $0x40050162fdfe4005; QUAD $0xc0724815b162f8fe; QUAD $0x12c072480db16207; QUAD $0x6203d0724805b162; QUAD $0x416296ef25480d53; QUAD $0x01ee8348fdfe4005 | |
99 | JE lastLoop | |
100 | ADDQ $8, R13 | |
101 | MOVQ (R13), R14 | |
102 | QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; WORD $0x626f; BYTE $0x31 | |
103 | TESTQ $(1<<0), R14 | |
104 | JE skipNext0 | |
105 | MOVQ 0*24(AX), R9 | |
106 | LONG $0x487cc162; WORD $0x0410; BYTE $0x09 | |
107 | skipNext0: | |
108 | QUAD $0x7162c4fe484d5162; QUAD $0x482df162cb6f487e; QUAD $0x724825f16206c372; QUAD $0xc372481df1620bc3; QUAD $0xcacd25485d736219; QUAD $0x5362c1fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d0fe486dd162c2; QUAD $0xf16202c772484df1; QUAD $0x1df1620dc7724825; QUAD $0x487e716216c77248; QUAD $0xc925487d7362cf6f; QUAD $0x96f4254825d362e8; QUAD $0xd162f1fe484dd162; QUAD $0x487e7162f0fe484d; WORD $0x626f; BYTE $0x32 | |
109 | TESTQ $(1<<1), R14 | |
110 | JE skipNext1 | |
111 | MOVQ 1*24(AX), R9 | |
112 | LONG $0x487cc162; WORD $0x0c10; BYTE $0x09 | |
113 | skipNext1: | |
114 | QUAD $0x7162c4fe48555162; QUAD $0x482df162ca6f487e; QUAD $0x724825f16206c272; QUAD $0xc272481df1620bc2; QUAD $0xcacc254865736219; QUAD $0x5362c2fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c8fe4875d162c2; QUAD $0xf16202c6724855f1; QUAD $0x1df1620dc6724825; QUAD $0x487e716216c67248; QUAD $0xc82548457362ce6f; QUAD $0x96ec254825d362e8; QUAD $0xd162e9fe4855d162; QUAD $0x487e7162e8fe4855; WORD $0x626f; BYTE $0x33 | |
115 | TESTQ $(1<<2), R14 | |
116 | JE skipNext2 | |
117 | MOVQ 2*24(AX), R9 | |
118 | LONG $0x487cc162; WORD $0x1410; BYTE $0x09 | |
119 | skipNext2: | |
120 | QUAD $0x7162c4fe485d5162; QUAD $0x482df162c96f487e; QUAD $0x724825f16206c172; QUAD $0xc172481df1620bc1; QUAD $0xcacb25486d736219; QUAD $0x5362c3fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c0fe487dd162c2; QUAD $0xf16202c572485df1; QUAD $0x1df1620dc5724825; QUAD $0x487e716216c57248; QUAD $0xcf25484d7362cd6f; QUAD $0x96e4254825d362e8; QUAD $0xd162e1fe485dd162; QUAD $0x487e7162e0fe485d; WORD $0x626f; BYTE $0x34 | |
121 | TESTQ $(1<<3), R14 | |
122 | JE skipNext3 | |
123 | MOVQ 3*24(AX), R9 | |
124 | LONG $0x487cc162; WORD $0x1c10; BYTE $0x09 | |
125 | skipNext3: | |
126 | QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; WORD $0x626f; BYTE $0x35 | |
127 | TESTQ $(1<<4), R14 | |
128 | JE skipNext4 | |
129 | MOVQ 4*24(AX), R9 | |
130 | LONG $0x487cc162; WORD $0x2410; BYTE $0x09 | |
131 | skipNext4: | |
132 | QUAD $0x7162c4fe486d5162; QUAD $0x482df162cf6f487e; QUAD $0x724825f16206c772; QUAD $0xc772481df1620bc7; QUAD $0xcac925487d736219; QUAD $0x5362c5fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f0fe484dd162c2; QUAD $0xf16202c372486df1; QUAD $0x1df1620dc3724825; QUAD $0x487e716216c37248; QUAD $0xcd25485d7362cb6f; QUAD $0x96d4254825d362e8; QUAD $0xd162d1fe486dd162; QUAD $0x487e7162d0fe486d; WORD $0x626f; BYTE $0x36 | |
133 | TESTQ $(1<<5), R14 | |
134 | JE skipNext5 | |
135 | MOVQ 5*24(AX), R9 | |
136 | LONG $0x487cc162; WORD $0x2c10; BYTE $0x09 | |
137 | skipNext5: | |
138 | QUAD $0x7162c4fe48755162; QUAD $0x482df162ce6f487e; QUAD $0x724825f16206c672; QUAD $0xc672481df1620bc6; QUAD $0xcac8254845736219; QUAD $0x5362c6fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e8fe4855d162c2; QUAD $0xf16202c2724875f1; QUAD $0x1df1620dc2724825; QUAD $0x487e716216c27248; QUAD $0xcc2548657362ca6f; QUAD $0x96cc254825d362e8; QUAD $0xd162c9fe4875d162; QUAD $0x487e7162c8fe4875; WORD $0x626f; BYTE $0x37 | |
139 | TESTQ $(1<<6), R14 | |
140 | JE skipNext6 | |
141 | MOVQ 6*24(AX), R9 | |
142 | LONG $0x487cc162; WORD $0x3410; BYTE $0x09 | |
143 | skipNext6: | |
144 | QUAD $0x7162c4fe487d5162; QUAD $0x482df162cd6f487e; QUAD $0x724825f16206c572; QUAD $0xc572481df1620bc5; QUAD $0xcacf25484d736219; QUAD $0x5362c7fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e0fe485dd162c2; QUAD $0xf16202c172487df1; QUAD $0x1df1620dc1724825; QUAD $0x487e716216c17248; QUAD $0xcb25486d7362c96f; QUAD $0x96c4254825d362e8; QUAD $0xd162c1fe487dd162; QUAD $0x487e7162c0fe487d; WORD $0x626f; BYTE $0x38 | |
145 | TESTQ $(1<<7), R14 | |
146 | JE skipNext7 | |
147 | MOVQ 7*24(AX), R9 | |
148 | LONG $0x487cc162; WORD $0x3c10; BYTE $0x09 | |
149 | skipNext7: | |
150 | QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; WORD $0x626f; BYTE $0x39 | |
151 | TESTQ $(1<<8), R14 | |
152 | JE skipNext8 | |
153 | MOVQ 8*24(AX), R9 | |
154 | LONG $0x487c4162; WORD $0x0410; BYTE $0x09 | |
155 | skipNext8: | |
156 | QUAD $0x7162c4fe484d5162; QUAD $0x482df162cb6f487e; QUAD $0x724825f16206c372; QUAD $0xc372481df1620bc3; QUAD $0xcacd25485d736219; QUAD $0x5362c1fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d0fe486dd162c2; QUAD $0xf16202c772484df1; QUAD $0x1df1620dc7724825; QUAD $0x487e716216c77248; QUAD $0xc925487d7362cf6f; QUAD $0x96f4254825d362e8; QUAD $0xd162f1fe484dd162; QUAD $0x487e7162f0fe484d; WORD $0x626f; BYTE $0x3a | |
157 | TESTQ $(1<<9), R14 | |
158 | JE skipNext9 | |
159 | MOVQ 9*24(AX), R9 | |
160 | LONG $0x487c4162; WORD $0x0c10; BYTE $0x09 | |
161 | skipNext9: | |
162 | QUAD $0x7162c4fe48555162; QUAD $0x482df162ca6f487e; QUAD $0x724825f16206c272; QUAD $0xc272481df1620bc2; QUAD $0xcacc254865736219; QUAD $0x5362c2fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c8fe4875d162c2; QUAD $0xf16202c6724855f1; QUAD $0x1df1620dc6724825; QUAD $0x487e716216c67248; QUAD $0xc82548457362ce6f; QUAD $0x96ec254825d362e8; QUAD $0xd162e9fe4855d162; QUAD $0x487e7162e8fe4855; WORD $0x626f; BYTE $0x3b | |
163 | TESTQ $(1<<10), R14 | |
164 | JE skipNext10 | |
165 | MOVQ 10*24(AX), R9 | |
166 | LONG $0x487c4162; WORD $0x1410; BYTE $0x09 | |
167 | skipNext10: | |
168 | QUAD $0x7162c4fe485d5162; QUAD $0x482df162c96f487e; QUAD $0x724825f16206c172; QUAD $0xc172481df1620bc1; QUAD $0xcacb25486d736219; QUAD $0x5362c3fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62c0fe487dd162c2; QUAD $0xf16202c572485df1; QUAD $0x1df1620dc5724825; QUAD $0x487e716216c57248; QUAD $0xcf25484d7362cd6f; QUAD $0x96e4254825d362e8; QUAD $0xd162e1fe485dd162; QUAD $0x487e7162e0fe485d; WORD $0x626f; BYTE $0x3c | |
169 | TESTQ $(1<<11), R14 | |
170 | JE skipNext11 | |
171 | MOVQ 11*24(AX), R9 | |
172 | LONG $0x487c4162; WORD $0x1c10; BYTE $0x09 | |
173 | skipNext11: | |
174 | QUAD $0x7162c4fe48655162; QUAD $0x482df162c86f487e; QUAD $0x724825f16206c072; QUAD $0xc072481df1620bc0; QUAD $0xcaca254875736219; QUAD $0x5362c4fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f8fe4845d162c2; QUAD $0xf16202c4724865f1; QUAD $0x1df1620dc4724825; QUAD $0x487e716216c47248; QUAD $0xce2548557362cc6f; QUAD $0x96dc254825d362e8; QUAD $0xd162d9fe4865d162; QUAD $0x487e7162d8fe4865; WORD $0x626f; BYTE $0x3d | |
175 | TESTQ $(1<<12), R14 | |
176 | JE skipNext12 | |
177 | MOVQ 12*24(AX), R9 | |
178 | LONG $0x487c4162; WORD $0x2410; BYTE $0x09 | |
179 | skipNext12: | |
180 | QUAD $0x7162c4fe486d5162; QUAD $0x482df162cf6f487e; QUAD $0x724825f16206c772; QUAD $0xc772481df1620bc7; QUAD $0xcac925487d736219; QUAD $0x5362c5fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62f0fe484dd162c2; QUAD $0xf16202c372486df1; QUAD $0x1df1620dc3724825; QUAD $0x487e716216c37248; QUAD $0xcd25485d7362cb6f; QUAD $0x96d4254825d362e8; QUAD $0xd162d1fe486dd162; QUAD $0x487e7162d0fe486d; WORD $0x626f; BYTE $0x3e | |
181 | TESTQ $(1<<13), R14 | |
182 | JE skipNext13 | |
183 | MOVQ 13*24(AX), R9 | |
184 | LONG $0x487c4162; WORD $0x2c10; BYTE $0x09 | |
185 | skipNext13: | |
186 | QUAD $0x7162c4fe48755162; QUAD $0x482df162ce6f487e; QUAD $0x724825f16206c672; QUAD $0xc672481df1620bc6; QUAD $0xcac8254845736219; QUAD $0x5362c6fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e8fe4855d162c2; QUAD $0xf16202c2724875f1; QUAD $0x1df1620dc2724825; QUAD $0x487e716216c27248; QUAD $0xcc2548657362ca6f; QUAD $0x96cc254825d362e8; QUAD $0xd162c9fe4875d162; QUAD $0x487e7162c8fe4875; WORD $0x626f; BYTE $0x3f | |
187 | TESTQ $(1<<14), R14 | |
188 | JE skipNext14 | |
189 | MOVQ 14*24(AX), R9 | |
190 | LONG $0x487c4162; WORD $0x3410; BYTE $0x09 | |
191 | skipNext14: | |
192 | QUAD $0x7162c4fe487d5162; QUAD $0x482df162cd6f487e; QUAD $0x724825f16206c572; QUAD $0xc572481df1620bc5; QUAD $0xcacf25484d736219; QUAD $0x5362c7fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62e0fe485dd162c2; QUAD $0xf16202c172487df1; QUAD $0x1df1620dc1724825; QUAD $0x487e716216c17248; QUAD $0xcb25486d7362c96f; QUAD $0x96c4254825d362e8; QUAD $0xd162c1fe487dd162; QUAD $0x487e7162c0fe487d; WORD $0x626f; BYTE $0x40 | |
193 | TESTQ $(1<<15), R14 | |
194 | JE skipNext15 | |
195 | MOVQ 15*24(AX), R9 | |
196 | LONG $0x487c4162; WORD $0x3c10; BYTE $0x09 | |
197 | skipNext15: | |
198 | QUAD $0xd162d86f487e7162; QUAD $0x7dd16224046f487e; QUAD $0x6f487e7162c3fe49; QUAD $0x244c6f487ed162d9; QUAD $0x62cbfe4975d16201; QUAD $0x7ed162da6f487e71; QUAD $0x6dd1620224546f48; QUAD $0x6f487e7162d3fe49; QUAD $0x245c6f487ed162db; QUAD $0x62dbfe4965d16203; QUAD $0x7ed162dc6f487e71; QUAD $0x5dd1620424646f48; QUAD $0x6f487e7162e3fe49; QUAD $0x246c6f487ed162dd; QUAD $0x62ebfe4955d16205; QUAD $0x7ed162de6f487e71; QUAD $0x4dd1620624746f48; QUAD $0x6f487e7162f3fe49; QUAD $0x247c6f487ed162df; QUAD $0xc4fbfe4945d16207; LONG $0xce92fbc1 | |
199 | JMP lloop | |
200 | lastLoop: | |
201 | QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d3162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; QUAD $0xfe484d516231626f; QUAD $0x62cb6f487e7162c4; QUAD $0xf16206c372482df1; QUAD $0x1df1620bc3724825; QUAD $0x485d736219c37248; QUAD $0xfe483d3162cacd25; QUAD $0x96d42548255362c1; QUAD $0x5162c1fe483d5162; QUAD $0x486dd162c2fe483d; QUAD $0xc772484df162d0fe; QUAD $0x0dc7724825f16202; QUAD $0x6216c772481df162; QUAD $0x7d7362cf6f487e71; QUAD $0x4825d362e8c92548; QUAD $0xfe484dd16296f425; QUAD $0x62f0fe484dd162f1; QUAD $0x516232626f487e71; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x3162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x62c4fe485d516233; QUAD $0x2df162c96f487e71; QUAD $0x4825f16206c17248; QUAD $0x72481df1620bc172; QUAD $0xcb25486d736219c1; QUAD $0x62c3fe483d3162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xc0fe487dd162c2fe; QUAD $0x6202c572485df162; QUAD $0xf1620dc5724825f1; QUAD $0x7e716216c572481d; QUAD $0x25484d7362cd6f48; QUAD $0xe4254825d362e8cf; QUAD $0x62e1fe485dd16296; QUAD $0x7e7162e0fe485dd1; QUAD $0x4865516234626f48; QUAD $0xc86f487e7162c4fe; QUAD $0x6206c072482df162; QUAD $0xf1620bc0724825f1; QUAD $0x75736219c072481d; QUAD $0x483d3162caca2548; QUAD $0xd42548255362c4fe; QUAD $0x62c1fe483d516296; QUAD $0x45d162c2fe483d51; QUAD $0x724865f162f8fe48; QUAD $0xc4724825f16202c4; QUAD 
$0x16c472481df1620d; QUAD $0x7362cc6f487e7162; QUAD $0x25d362e8ce254855; QUAD $0x4865d16296dc2548; QUAD $0xd8fe4865d162d9fe; QUAD $0x6235626f487e7162; QUAD $0x7e7162c4fe486d51; QUAD $0x72482df162cf6f48; QUAD $0xc7724825f16206c7; QUAD $0x19c772481df1620b; QUAD $0x62cac925487d7362; QUAD $0x255362c5fe483d31; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162f0fe484dd162; QUAD $0x25f16202c372486d; QUAD $0x481df1620dc37248; QUAD $0x6f487e716216c372; QUAD $0xe8cd25485d7362cb; QUAD $0x6296d4254825d362; QUAD $0x6dd162d1fe486dd1; QUAD $0x6f487e7162d0fe48; QUAD $0xc4fe487551623662; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d3162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x7d516237626f487e; QUAD $0x6f487e7162c4fe48; QUAD $0x06c572482df162cd; QUAD $0x620bc5724825f162; QUAD $0x736219c572481df1; QUAD $0x3d3162cacf25484d; QUAD $0x2548255362c7fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x487df162e0fe485d; QUAD $0x724825f16202c172; QUAD $0xc172481df1620dc1; QUAD $0x62c96f487e716216; QUAD $0xd362e8cb25486d73; QUAD $0x7dd16296c4254825; QUAD $0xfe487dd162c1fe48; QUAD $0x38626f487e7162c0; QUAD $0x7162c4fe48455162; QUAD $0x482df162cc6f487e; QUAD $0x724825f16206c472; QUAD $0xc472481df1620bc4; QUAD $0xcace254855736219; QUAD $0x5362c0fe483d1162; QUAD $0x3d516296d4254825; QUAD $0xfe483d5162c1fe48; QUAD $0x62d8fe4865d162c2; QUAD $0xf16202c0724845f1; QUAD $0x1df1620dc0724825; QUAD $0x487e716216c07248; QUAD $0xca2548757362c86f; QUAD $0x96fc254825d362e8; QUAD $0xd162f9fe4845d162; QUAD $0x487e7162f8fe4845; QUAD $0xfe484d516239626f; QUAD $0x62cb6f487e7162c4; QUAD $0xf16206c372482df1; QUAD $0x1df1620bc3724825; QUAD $0x485d736219c37248; QUAD $0xfe483d1162cacd25; 
QUAD $0x96d42548255362c1; QUAD $0x5162c1fe483d5162; QUAD $0x486dd162c2fe483d; QUAD $0xc772484df162d0fe; QUAD $0x0dc7724825f16202; QUAD $0x6216c772481df162; QUAD $0x7d7362cf6f487e71; QUAD $0x4825d362e8c92548; QUAD $0xfe484dd16296f425; QUAD $0x62f0fe484dd162f1; QUAD $0x51623a626f487e71; QUAD $0x487e7162c4fe4855; QUAD $0xc272482df162ca6f; QUAD $0x0bc2724825f16206; QUAD $0x6219c272481df162; QUAD $0x1162cacc25486573; QUAD $0x48255362c2fe483d; QUAD $0xfe483d516296d425; QUAD $0x62c2fe483d5162c1; QUAD $0x55f162c8fe4875d1; QUAD $0x4825f16202c67248; QUAD $0x72481df1620dc672; QUAD $0xce6f487e716216c6; QUAD $0x62e8c82548457362; QUAD $0xd16296ec254825d3; QUAD $0x4855d162e9fe4855; QUAD $0x626f487e7162e8fe; QUAD $0x62c4fe485d51623b; QUAD $0x2df162c96f487e71; QUAD $0x4825f16206c17248; QUAD $0x72481df1620bc172; QUAD $0xcb25486d736219c1; QUAD $0x62c3fe483d1162ca; QUAD $0x516296d425482553; QUAD $0x483d5162c1fe483d; QUAD $0xc0fe487dd162c2fe; QUAD $0x6202c572485df162; QUAD $0xf1620dc5724825f1; QUAD $0x7e716216c572481d; QUAD $0x25484d7362cd6f48; QUAD $0xe4254825d362e8cf; QUAD $0x62e1fe485dd16296; QUAD $0x7e7162e0fe485dd1; QUAD $0x486551623c626f48; QUAD $0xc86f487e7162c4fe; QUAD $0x6206c072482df162; QUAD $0xf1620bc0724825f1; QUAD $0x75736219c072481d; QUAD $0x483d1162caca2548; QUAD $0xd42548255362c4fe; QUAD $0x62c1fe483d516296; QUAD $0x45d162c2fe483d51; QUAD $0x724865f162f8fe48; QUAD $0xc4724825f16202c4; QUAD $0x16c472481df1620d; QUAD $0x7362cc6f487e7162; QUAD $0x25d362e8ce254855; QUAD $0x4865d16296dc2548; QUAD $0xd8fe4865d162d9fe; QUAD $0x623d626f487e7162; QUAD $0x7e7162c4fe486d51; QUAD $0x72482df162cf6f48; QUAD $0xc7724825f16206c7; QUAD $0x19c772481df1620b; QUAD $0x62cac925487d7362; QUAD $0x255362c5fe483d11; QUAD $0x483d516296d42548; QUAD $0xc2fe483d5162c1fe; QUAD $0xf162f0fe484dd162; QUAD $0x25f16202c372486d; QUAD $0x481df1620dc37248; QUAD $0x6f487e716216c372; QUAD $0xe8cd25485d7362cb; QUAD $0x6296d4254825d362; QUAD $0x6dd162d1fe486dd1; QUAD $0x6f487e7162d0fe48; QUAD 
$0xc4fe487551623e62; QUAD $0xf162ce6f487e7162; QUAD $0x25f16206c672482d; QUAD $0x481df1620bc67248; QUAD $0x254845736219c672; QUAD $0xc6fe483d1162cac8; QUAD $0x6296d42548255362; QUAD $0x3d5162c1fe483d51; QUAD $0xfe4855d162c2fe48; QUAD $0x02c2724875f162e8; QUAD $0x620dc2724825f162; QUAD $0x716216c272481df1; QUAD $0x48657362ca6f487e; QUAD $0x254825d362e8cc25; QUAD $0xc9fe4875d16296cc; QUAD $0x7162c8fe4875d162; QUAD $0x7d51623f626f487e; QUAD $0x6f487e7162c4fe48; QUAD $0x06c572482df162cd; QUAD $0x620bc5724825f162; QUAD $0x736219c572481df1; QUAD $0x3d1162cacf25484d; QUAD $0x2548255362c7fe48; QUAD $0xc1fe483d516296d4; QUAD $0xd162c2fe483d5162; QUAD $0x487df162e0fe485d; QUAD $0x724825f16202c172; QUAD $0xc172481df1620dc1; QUAD $0x62c96f487e716216; QUAD $0xd362e8cb25486d73; QUAD $0x7dd16296c4254825; QUAD $0xfe487dd162c1fe48; QUAD $0x40626f487e7162c0; QUAD $0xd162d86f487e7162; QUAD $0x7dd16224046f487e; QUAD $0x6f487e7162c3fe49; QUAD $0x244c6f487ed162d9; QUAD $0x62cbfe4975d16201; QUAD $0x7ed162da6f487e71; QUAD $0x6dd1620224546f48; QUAD $0x6f487e7162d3fe49; QUAD $0x245c6f487ed162db; QUAD $0x62dbfe4965d16203; QUAD $0x7ed162dc6f487e71; QUAD $0x5dd1620424646f48; QUAD $0x6f487e7162e3fe49; QUAD $0x246c6f487ed162dd; QUAD $0x62ebfe4955d16205; QUAD $0x7ed162de6f487e71; QUAD $0x4dd1620624746f48; QUAD $0x6f487e7162f3fe49; QUAD $0x247c6f487ed162df; QUAD $0x62fbfe4945d16207; QUAD $0x7ef162077f487ef1; QUAD $0x487ef162014f7f48; QUAD $0x7f487ef16202577f; QUAD $0x677f487ef162035f; QUAD $0x056f7f487ef16204; QUAD $0x6206777f487ef162; LONG $0x7f487ef1; WORD $0x077f | |
202 | VZEROUPPER | |
203 | RET | |
// PSHUFFLE_BYTE_FLIP_MASK is a 64-byte table made of the same 16-byte
// pattern repeated four times. In memory order (little-endian amd64) each
// 16-byte group is: 03 02 01 00 07 06 05 04 0b 0a 09 08 0f 0e 0d 0c.
// Read as byte-shuffle indices, that pattern reverses the byte order inside
// every 32-bit word.
// NOTE(review): presumably the byte-shuffle control that converts message
// words to big-endian order; the consuming instructions are hex-encoded
// above, so confirm against the original listing.
// The GLOBL flag value 8 is RODATA in Go's textflag.h; the symbol is 64 bytes.
204 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x000(SB)/8, $0x0405060700010203 | |
205 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x008(SB)/8, $0x0c0d0e0f08090a0b | |
206 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x010(SB)/8, $0x0405060700010203 | |
207 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x018(SB)/8, $0x0c0d0e0f08090a0b | |
208 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x020(SB)/8, $0x0405060700010203 | |
209 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x028(SB)/8, $0x0c0d0e0f08090a0b | |
210 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x030(SB)/8, $0x0405060700010203 | |
211 | DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x038(SB)/8, $0x0c0d0e0f08090a0b | |
212 | GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), 8, $64 | |
// PSHUFFLE_TRANSPOSE16_MASK1 is a 64-byte table of eight 64-bit values,
// in order: 0, 1, 8, 9, 4, 5, 12, 13.
// NOTE(review): these look like quadword selectors for a two-source 512-bit
// permute (values 8..15 would pick from the second source), used while
// transposing the 16 input lanes; the consuming instructions are hex-encoded
// above, so confirm against the original listing.
// The GLOBL flag value 8 is RODATA in Go's textflag.h; the symbol is 64 bytes.
213 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x000(SB)/8, $0x0000000000000000 | |
214 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x008(SB)/8, $0x0000000000000001 | |
215 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x010(SB)/8, $0x0000000000000008 | |
216 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x018(SB)/8, $0x0000000000000009 | |
217 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x020(SB)/8, $0x0000000000000004 | |
218 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x028(SB)/8, $0x0000000000000005 | |
219 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x030(SB)/8, $0x000000000000000C | |
220 | DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x038(SB)/8, $0x000000000000000D | |
221 | GLOBL PSHUFFLE_TRANSPOSE16_MASK1<>(SB), 8, $64 | |
// PSHUFFLE_TRANSPOSE16_MASK2 is a 64-byte table of eight 64-bit values,
// in order: 2, 3, 10, 11, 6, 7, 14, 15. These are exactly the values in
// 0..15 that do not appear in PSHUFFLE_TRANSPOSE16_MASK1.
// NOTE(review): these look like quadword selectors for a two-source 512-bit
// permute (values 8..15 would pick from the second source), producing the
// other half of the lane transpose; the consuming instructions are
// hex-encoded above, so confirm against the original listing.
// The GLOBL flag value 8 is RODATA in Go's textflag.h; the symbol is 64 bytes.
222 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x000(SB)/8, $0x0000000000000002 | |
223 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x008(SB)/8, $0x0000000000000003 | |
224 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x010(SB)/8, $0x000000000000000A | |
225 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x018(SB)/8, $0x000000000000000B | |
226 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x020(SB)/8, $0x0000000000000006 | |
227 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x028(SB)/8, $0x0000000000000007 | |
228 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x030(SB)/8, $0x000000000000000E | |
229 | DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x038(SB)/8, $0x000000000000000F | |
230 | GLOBL PSHUFFLE_TRANSPOSE16_MASK2<>(SB), 8, $64 | |