Codebase list openssl / 7d34a9e
Add AES constant-time code for no-asm configurations. This adds optional constant-time support for AES when building OpenSSL with no-asm. Enable with: ./config no-asm -DOPENSSL_AES_CONST_TIME [extended tests] Reviewed-by: Nicola Tuveri <nic.tuv@gmail.com> (Merged from https://github.com/openssl/openssl/pull/11411) Bernd Edlinger 4 years ago
3 changed file(s) with 988 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
4242 #include <openssl/aes.h>
4343 #include "aes_local.h"
4444
45 #ifndef AES_ASM
45 #if defined(OPENSSL_AES_CONST_TIME) && !defined(AES_ASM)
46 typedef union {
47 unsigned char b[8];
48 u32 w[2];
49 u64 d;
50 } uni;
51
52 /*
53 * Compute w := (w * x) mod (x^8 + x^4 + x^3 + x^1 + 1)
54 * Therefore the name "xtime".
55 */
56 static void XtimeWord(u32 *w)
57 {
58 u32 a, b;
59
60 a = *w;
61 b = a & 0x80808080u;
62 a ^= b;
63 b -= b >> 7;
64 b &= 0x1B1B1B1Bu;
65 b ^= a << 1;
66 *w = b;
67 }
68
69 static void XtimeLong(u64 *w)
70 {
71 u64 a, b;
72
73 a = *w;
74 b = a & 0x8080808080808080uLL;
75 a ^= b;
76 b -= b >> 7;
77 b &= 0x1B1B1B1B1B1B1B1BuLL;
78 b ^= a << 1;
79 *w = b;
80 }
81
82 /*
83 * This computes w := S * w ^ -1 + c, where c = {01100011}.
84 * Instead of using GF(2^8) mod (x^8+x^4+x^3+x+1} we do the inversion
85 * in GF(GF(GF(2^2)^2)^2) mod (X^2+X+8)
86 * and GF(GF(2^2)^2) mod (X^2+X+2)
87 * and GF(2^2) mod (X^2+X+1)
88 * The first part of the algorithm below transfers the coordinates
89 * {0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80} =>
90 * {1,Y,Y^2,Y^3,Y^4,Y^5,Y^6,Y^7} with Y=0x41:
91 * {0x01,0x41,0x66,0x6c,0x56,0x9a,0x58,0xc4}
92 * The last part undoes the coordinate transfer and the final affine
93 * transformation S:
94 * b[i] = b[i] + b[(i+4)%8] + b[(i+5)%8] + b[(i+6)%8] + b[(i+7)%8] + c[i]
95 * in one step.
96 * The multiplication in GF(2^2^2^2) is done in ordinary coords:
97 * A = (a0*1 + a1*x^4)
98 * B = (b0*1 + b1*x^4)
99 * AB = ((a0*b0 + 8*a1*b1)*1 + (a1*b0 + (a0+a1)*b1)*x^4)
100 * When A = (a0,a1) is given we want to solve AB = 1:
101 * (a) 1 = a0*b0 + 8*a1*b1
102 * (b) 0 = a1*b0 + (a0+a1)*b1
103 * => multiply (a) by a1 and (b) by a0
104 * (c) a1 = a1*a0*b0 + (8*a1*a1)*b1
105 * (d) 0 = a1*a0*b0 + (a0*a0+a1*a0)*b1
106 * => add (c) + (d)
107 * (e) a1 = (a0*a0 + a1*a0 + 8*a1*a1)*b1
108 * => therefore
109 * b1 = (a0*a0 + a1*a0 + 8*a1*a1)^-1 * a1
110 * => and adding (a1*b0) to (b) we get
111 * (f) a1*b0 = (a0+a1)*b1
112 * => therefore
113 * b0 = (a0*a0 + a1*a0 + 8*a1*a1)^-1 * (a0+a1)
114 * Note this formula also works for the case
115 * (a0+a1)*a0 + 8*a1*a1 = 0
116 * if the inverse element for 0^-1 is mapped to 0.
117 * Repeat the same for GF(2^2^2) and GF(2^2).
118 * We get the following algorithm:
119 * inv8(a0,a1):
120 * x0 = a0^a1
121 * [y0,y1] = mul4([x0,a1],[a0,a1]); (*)
122 * y1 = mul4(8,y1);
123 * t = inv4(y0^y1);
124 * [b0,b1] = mul4([x0,a1],[t,t]); (*)
125 * return [b0,b1];
126 * The non-linear multiplies (*) can be done in parallel at no extra cost.
127 */
128 static void SubWord(u32 *w)
129 {
130 u32 x, y, a1, a2, a3, a4, a5, a6;
131
132 x = *w;
133 y = ((x & 0xFEFEFEFEu) >> 1) | ((x & 0x01010101u) << 7);
134 x &= 0xDDDDDDDDu;
135 x ^= y & 0x57575757u;
136 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
137 x ^= y & 0x1C1C1C1Cu;
138 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
139 x ^= y & 0x4A4A4A4Au;
140 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
141 x ^= y & 0x42424242u;
142 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
143 x ^= y & 0x64646464u;
144 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
145 x ^= y & 0xE0E0E0E0u;
146 a1 = x;
147 a1 ^= (x & 0xF0F0F0F0u) >> 4;
148 a2 = ((x & 0xCCCCCCCCu) >> 2) | ((x & 0x33333333u) << 2);
149 a3 = x & a1;
150 a3 ^= (a3 & 0xAAAAAAAAu) >> 1;
151 a3 ^= (((x << 1) & a1) ^ ((a1 << 1) & x)) & 0xAAAAAAAAu;
152 a4 = a2 & a1;
153 a4 ^= (a4 & 0xAAAAAAAAu) >> 1;
154 a4 ^= (((a2 << 1) & a1) ^ ((a1 << 1) & a2)) & 0xAAAAAAAAu;
155 a5 = (a3 & 0xCCCCCCCCu) >> 2;
156 a3 ^= ((a4 << 2) ^ a4) & 0xCCCCCCCCu;
157 a4 = a5 & 0x22222222u;
158 a4 |= a4 >> 1;
159 a4 ^= (a5 << 1) & 0x22222222u;
160 a3 ^= a4;
161 a5 = a3 & 0xA0A0A0A0u;
162 a5 |= a5 >> 1;
163 a5 ^= (a3 << 1) & 0xA0A0A0A0u;
164 a4 = a5 & 0xC0C0C0C0u;
165 a6 = a4 >> 2;
166 a4 ^= (a5 << 2) & 0xC0C0C0C0u;
167 a5 = a6 & 0x20202020u;
168 a5 |= a5 >> 1;
169 a5 ^= (a6 << 1) & 0x20202020u;
170 a4 |= a5;
171 a3 ^= a4 >> 4;
172 a3 &= 0x0F0F0F0Fu;
173 a2 = a3;
174 a2 ^= (a3 & 0x0C0C0C0Cu) >> 2;
175 a4 = a3 & a2;
176 a4 ^= (a4 & 0x0A0A0A0A0Au) >> 1;
177 a4 ^= (((a3 << 1) & a2) ^ ((a2 << 1) & a3)) & 0x0A0A0A0Au;
178 a5 = a4 & 0x08080808u;
179 a5 |= a5 >> 1;
180 a5 ^= (a4 << 1) & 0x08080808u;
181 a4 ^= a5 >> 2;
182 a4 &= 0x03030303u;
183 a4 ^= (a4 & 0x02020202u) >> 1;
184 a4 |= a4 << 2;
185 a3 = a2 & a4;
186 a3 ^= (a3 & 0x0A0A0A0Au) >> 1;
187 a3 ^= (((a2 << 1) & a4) ^ ((a4 << 1) & a2)) & 0x0A0A0A0Au;
188 a3 |= a3 << 4;
189 a2 = ((a1 & 0xCCCCCCCCu) >> 2) | ((a1 & 0x33333333u) << 2);
190 x = a1 & a3;
191 x ^= (x & 0xAAAAAAAAu) >> 1;
192 x ^= (((a1 << 1) & a3) ^ ((a3 << 1) & a1)) & 0xAAAAAAAAu;
193 a4 = a2 & a3;
194 a4 ^= (a4 & 0xAAAAAAAAu) >> 1;
195 a4 ^= (((a2 << 1) & a3) ^ ((a3 << 1) & a2)) & 0xAAAAAAAAu;
196 a5 = (x & 0xCCCCCCCCu) >> 2;
197 x ^= ((a4 << 2) ^ a4) & 0xCCCCCCCCu;
198 a4 = a5 & 0x22222222u;
199 a4 |= a4 >> 1;
200 a4 ^= (a5 << 1) & 0x22222222u;
201 x ^= a4;
202 y = ((x & 0xFEFEFEFEu) >> 1) | ((x & 0x01010101u) << 7);
203 x &= 0x39393939u;
204 x ^= y & 0x3F3F3F3Fu;
205 y = ((y & 0xFCFCFCFCu) >> 2) | ((y & 0x03030303u) << 6);
206 x ^= y & 0x97979797u;
207 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
208 x ^= y & 0x9B9B9B9Bu;
209 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
210 x ^= y & 0x3C3C3C3Cu;
211 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
212 x ^= y & 0xDDDDDDDDu;
213 y = ((y & 0xFEFEFEFEu) >> 1) | ((y & 0x01010101u) << 7);
214 x ^= y & 0x72727272u;
215 x ^= 0x63636363u;
216 *w = x;
217 }
218
219 static void SubLong(u64 *w)
220 {
221 u64 x, y, a1, a2, a3, a4, a5, a6;
222
223 x = *w;
224 y = ((x & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((x & 0x0101010101010101uLL) << 7);
225 x &= 0xDDDDDDDDDDDDDDDDuLL;
226 x ^= y & 0x5757575757575757uLL;
227 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
228 x ^= y & 0x1C1C1C1C1C1C1C1CuLL;
229 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
230 x ^= y & 0x4A4A4A4A4A4A4A4AuLL;
231 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
232 x ^= y & 0x4242424242424242uLL;
233 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
234 x ^= y & 0x6464646464646464uLL;
235 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
236 x ^= y & 0xE0E0E0E0E0E0E0E0uLL;
237 a1 = x;
238 a1 ^= (x & 0xF0F0F0F0F0F0F0F0uLL) >> 4;
239 a2 = ((x & 0xCCCCCCCCCCCCCCCCuLL) >> 2) | ((x & 0x3333333333333333uLL) << 2);
240 a3 = x & a1;
241 a3 ^= (a3 & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
242 a3 ^= (((x << 1) & a1) ^ ((a1 << 1) & x)) & 0xAAAAAAAAAAAAAAAAuLL;
243 a4 = a2 & a1;
244 a4 ^= (a4 & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
245 a4 ^= (((a2 << 1) & a1) ^ ((a1 << 1) & a2)) & 0xAAAAAAAAAAAAAAAAuLL;
246 a5 = (a3 & 0xCCCCCCCCCCCCCCCCuLL) >> 2;
247 a3 ^= ((a4 << 2) ^ a4) & 0xCCCCCCCCCCCCCCCCuLL;
248 a4 = a5 & 0x2222222222222222uLL;
249 a4 |= a4 >> 1;
250 a4 ^= (a5 << 1) & 0x2222222222222222uLL;
251 a3 ^= a4;
252 a5 = a3 & 0xA0A0A0A0A0A0A0A0uLL;
253 a5 |= a5 >> 1;
254 a5 ^= (a3 << 1) & 0xA0A0A0A0A0A0A0A0uLL;
255 a4 = a5 & 0xC0C0C0C0C0C0C0C0uLL;
256 a6 = a4 >> 2;
257 a4 ^= (a5 << 2) & 0xC0C0C0C0C0C0C0C0uLL;
258 a5 = a6 & 0x2020202020202020uLL;
259 a5 |= a5 >> 1;
260 a5 ^= (a6 << 1) & 0x2020202020202020uLL;
261 a4 |= a5;
262 a3 ^= a4 >> 4;
263 a3 &= 0x0F0F0F0F0F0F0F0FuLL;
264 a2 = a3;
265 a2 ^= (a3 & 0x0C0C0C0C0C0C0C0CuLL) >> 2;
266 a4 = a3 & a2;
267 a4 ^= (a4 & 0x0A0A0A0A0A0A0A0AuLL) >> 1;
268 a4 ^= (((a3 << 1) & a2) ^ ((a2 << 1) & a3)) & 0x0A0A0A0A0A0A0A0AuLL;
269 a5 = a4 & 0x0808080808080808uLL;
270 a5 |= a5 >> 1;
271 a5 ^= (a4 << 1) & 0x0808080808080808uLL;
272 a4 ^= a5 >> 2;
273 a4 &= 0x0303030303030303uLL;
274 a4 ^= (a4 & 0x0202020202020202uLL) >> 1;
275 a4 |= a4 << 2;
276 a3 = a2 & a4;
277 a3 ^= (a3 & 0x0A0A0A0A0A0A0A0AuLL) >> 1;
278 a3 ^= (((a2 << 1) & a4) ^ ((a4 << 1) & a2)) & 0x0A0A0A0A0A0A0A0AuLL;
279 a3 |= a3 << 4;
280 a2 = ((a1 & 0xCCCCCCCCCCCCCCCCuLL) >> 2) | ((a1 & 0x3333333333333333uLL) << 2);
281 x = a1 & a3;
282 x ^= (x & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
283 x ^= (((a1 << 1) & a3) ^ ((a3 << 1) & a1)) & 0xAAAAAAAAAAAAAAAAuLL;
284 a4 = a2 & a3;
285 a4 ^= (a4 & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
286 a4 ^= (((a2 << 1) & a3) ^ ((a3 << 1) & a2)) & 0xAAAAAAAAAAAAAAAAuLL;
287 a5 = (x & 0xCCCCCCCCCCCCCCCCuLL) >> 2;
288 x ^= ((a4 << 2) ^ a4) & 0xCCCCCCCCCCCCCCCCuLL;
289 a4 = a5 & 0x2222222222222222uLL;
290 a4 |= a4 >> 1;
291 a4 ^= (a5 << 1) & 0x2222222222222222uLL;
292 x ^= a4;
293 y = ((x & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((x & 0x0101010101010101uLL) << 7);
294 x &= 0x3939393939393939uLL;
295 x ^= y & 0x3F3F3F3F3F3F3F3FuLL;
296 y = ((y & 0xFCFCFCFCFCFCFCFCuLL) >> 2) | ((y & 0x0303030303030303uLL) << 6);
297 x ^= y & 0x9797979797979797uLL;
298 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
299 x ^= y & 0x9B9B9B9B9B9B9B9BuLL;
300 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
301 x ^= y & 0x3C3C3C3C3C3C3C3CuLL;
302 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
303 x ^= y & 0xDDDDDDDDDDDDDDDDuLL;
304 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
305 x ^= y & 0x7272727272727272uLL;
306 x ^= 0x6363636363636363uLL;
307 *w = x;
308 }
309
310 /*
311 * This computes w := (S^-1 * (w + c))^-1
312 */
313 static void InvSubLong(u64 *w)
314 {
315 u64 x, y, a1, a2, a3, a4, a5, a6;
316
317 x = *w;
318 x ^= 0x6363636363636363uLL;
319 y = ((x & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((x & 0x0101010101010101uLL) << 7);
320 x &= 0xFDFDFDFDFDFDFDFDuLL;
321 x ^= y & 0x5E5E5E5E5E5E5E5EuLL;
322 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
323 x ^= y & 0xF3F3F3F3F3F3F3F3uLL;
324 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
325 x ^= y & 0xF5F5F5F5F5F5F5F5uLL;
326 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
327 x ^= y & 0x7878787878787878uLL;
328 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
329 x ^= y & 0x7777777777777777uLL;
330 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
331 x ^= y & 0x1515151515151515uLL;
332 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
333 x ^= y & 0xA5A5A5A5A5A5A5A5uLL;
334 a1 = x;
335 a1 ^= (x & 0xF0F0F0F0F0F0F0F0uLL) >> 4;
336 a2 = ((x & 0xCCCCCCCCCCCCCCCCuLL) >> 2) | ((x & 0x3333333333333333uLL) << 2);
337 a3 = x & a1;
338 a3 ^= (a3 & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
339 a3 ^= (((x << 1) & a1) ^ ((a1 << 1) & x)) & 0xAAAAAAAAAAAAAAAAuLL;
340 a4 = a2 & a1;
341 a4 ^= (a4 & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
342 a4 ^= (((a2 << 1) & a1) ^ ((a1 << 1) & a2)) & 0xAAAAAAAAAAAAAAAAuLL;
343 a5 = (a3 & 0xCCCCCCCCCCCCCCCCuLL) >> 2;
344 a3 ^= ((a4 << 2) ^ a4) & 0xCCCCCCCCCCCCCCCCuLL;
345 a4 = a5 & 0x2222222222222222uLL;
346 a4 |= a4 >> 1;
347 a4 ^= (a5 << 1) & 0x2222222222222222uLL;
348 a3 ^= a4;
349 a5 = a3 & 0xA0A0A0A0A0A0A0A0uLL;
350 a5 |= a5 >> 1;
351 a5 ^= (a3 << 1) & 0xA0A0A0A0A0A0A0A0uLL;
352 a4 = a5 & 0xC0C0C0C0C0C0C0C0uLL;
353 a6 = a4 >> 2;
354 a4 ^= (a5 << 2) & 0xC0C0C0C0C0C0C0C0uLL;
355 a5 = a6 & 0x2020202020202020uLL;
356 a5 |= a5 >> 1;
357 a5 ^= (a6 << 1) & 0x2020202020202020uLL;
358 a4 |= a5;
359 a3 ^= a4 >> 4;
360 a3 &= 0x0F0F0F0F0F0F0F0FuLL;
361 a2 = a3;
362 a2 ^= (a3 & 0x0C0C0C0C0C0C0C0CuLL) >> 2;
363 a4 = a3 & a2;
364 a4 ^= (a4 & 0x0A0A0A0A0A0A0A0AuLL) >> 1;
365 a4 ^= (((a3 << 1) & a2) ^ ((a2 << 1) & a3)) & 0x0A0A0A0A0A0A0A0AuLL;
366 a5 = a4 & 0x0808080808080808uLL;
367 a5 |= a5 >> 1;
368 a5 ^= (a4 << 1) & 0x0808080808080808uLL;
369 a4 ^= a5 >> 2;
370 a4 &= 0x0303030303030303uLL;
371 a4 ^= (a4 & 0x0202020202020202uLL) >> 1;
372 a4 |= a4 << 2;
373 a3 = a2 & a4;
374 a3 ^= (a3 & 0x0A0A0A0A0A0A0A0AuLL) >> 1;
375 a3 ^= (((a2 << 1) & a4) ^ ((a4 << 1) & a2)) & 0x0A0A0A0A0A0A0A0AuLL;
376 a3 |= a3 << 4;
377 a2 = ((a1 & 0xCCCCCCCCCCCCCCCCuLL) >> 2) | ((a1 & 0x3333333333333333uLL) << 2);
378 x = a1 & a3;
379 x ^= (x & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
380 x ^= (((a1 << 1) & a3) ^ ((a3 << 1) & a1)) & 0xAAAAAAAAAAAAAAAAuLL;
381 a4 = a2 & a3;
382 a4 ^= (a4 & 0xAAAAAAAAAAAAAAAAuLL) >> 1;
383 a4 ^= (((a2 << 1) & a3) ^ ((a3 << 1) & a2)) & 0xAAAAAAAAAAAAAAAAuLL;
384 a5 = (x & 0xCCCCCCCCCCCCCCCCuLL) >> 2;
385 x ^= ((a4 << 2) ^ a4) & 0xCCCCCCCCCCCCCCCCuLL;
386 a4 = a5 & 0x2222222222222222uLL;
387 a4 |= a4 >> 1;
388 a4 ^= (a5 << 1) & 0x2222222222222222uLL;
389 x ^= a4;
390 y = ((x & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((x & 0x0101010101010101uLL) << 7);
391 x &= 0xB5B5B5B5B5B5B5B5uLL;
392 x ^= y & 0x4040404040404040uLL;
393 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
394 x ^= y & 0x8080808080808080uLL;
395 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
396 x ^= y & 0x1616161616161616uLL;
397 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
398 x ^= y & 0xEBEBEBEBEBEBEBEBuLL;
399 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
400 x ^= y & 0x9797979797979797uLL;
401 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
402 x ^= y & 0xFBFBFBFBFBFBFBFBuLL;
403 y = ((y & 0xFEFEFEFEFEFEFEFEuLL) >> 1) | ((y & 0x0101010101010101uLL) << 7);
404 x ^= y & 0x7D7D7D7D7D7D7D7DuLL;
405 *w = x;
406 }
407
408 static void ShiftRows(u64 *state)
409 {
410 unsigned char s[4];
411 unsigned char *s0;
412 int r;
413
414 s0 = (unsigned char *)state;
415 for (r = 0; r < 4; r++) {
416 s[0] = s0[0*4 + r];
417 s[1] = s0[1*4 + r];
418 s[2] = s0[2*4 + r];
419 s[3] = s0[3*4 + r];
420 s0[0*4 + r] = s[(r+0) % 4];
421 s0[1*4 + r] = s[(r+1) % 4];
422 s0[2*4 + r] = s[(r+2) % 4];
423 s0[3*4 + r] = s[(r+3) % 4];
424 }
425 }
426
427 static void InvShiftRows(u64 *state)
428 {
429 unsigned char s[4];
430 unsigned char *s0;
431 int r;
432
433 s0 = (unsigned char *)state;
434 for (r = 0; r < 4; r++) {
435 s[0] = s0[0*4 + r];
436 s[1] = s0[1*4 + r];
437 s[2] = s0[2*4 + r];
438 s[3] = s0[3*4 + r];
439 s0[0*4 + r] = s[(4-r) % 4];
440 s0[1*4 + r] = s[(5-r) % 4];
441 s0[2*4 + r] = s[(6-r) % 4];
442 s0[3*4 + r] = s[(7-r) % 4];
443 }
444 }
445
446 static void MixColumns(u64 *state)
447 {
448 uni s1;
449 uni s;
450 int c;
451
452 for (c = 0; c < 2; c++) {
453 s1.d = state[c];
454 s.d = s1.d;
455 s.d ^= ((s.d & 0xFFFF0000FFFF0000uLL) >> 16)
456 | ((s.d & 0x0000FFFF0000FFFFuLL) << 16);
457 s.d ^= ((s.d & 0xFF00FF00FF00FF00uLL) >> 8)
458 | ((s.d & 0x00FF00FF00FF00FFuLL) << 8);
459 s.d ^= s1.d;
460 XtimeLong(&s1.d);
461 s.d ^= s1.d;
462 s.b[0] ^= s1.b[1];
463 s.b[1] ^= s1.b[2];
464 s.b[2] ^= s1.b[3];
465 s.b[3] ^= s1.b[0];
466 s.b[4] ^= s1.b[5];
467 s.b[5] ^= s1.b[6];
468 s.b[6] ^= s1.b[7];
469 s.b[7] ^= s1.b[4];
470 state[c] = s.d;
471 }
472 }
473
474 static void InvMixColumns(u64 *state)
475 {
476 uni s1;
477 uni s;
478 int c;
479
480 for (c = 0; c < 2; c++) {
481 s1.d = state[c];
482 s.d = s1.d;
483 s.d ^= ((s.d & 0xFFFF0000FFFF0000uLL) >> 16)
484 | ((s.d & 0x0000FFFF0000FFFFuLL) << 16);
485 s.d ^= ((s.d & 0xFF00FF00FF00FF00uLL) >> 8)
486 | ((s.d & 0x00FF00FF00FF00FFuLL) << 8);
487 s.d ^= s1.d;
488 XtimeLong(&s1.d);
489 s.d ^= s1.d;
490 s.b[0] ^= s1.b[1];
491 s.b[1] ^= s1.b[2];
492 s.b[2] ^= s1.b[3];
493 s.b[3] ^= s1.b[0];
494 s.b[4] ^= s1.b[5];
495 s.b[5] ^= s1.b[6];
496 s.b[6] ^= s1.b[7];
497 s.b[7] ^= s1.b[4];
498 XtimeLong(&s1.d);
499 s1.d ^= ((s1.d & 0xFFFF0000FFFF0000uLL) >> 16)
500 | ((s1.d & 0x0000FFFF0000FFFFuLL) << 16);
501 s.d ^= s1.d;
502 XtimeLong(&s1.d);
503 s1.d ^= ((s1.d & 0xFF00FF00FF00FF00uLL) >> 8)
504 | ((s1.d & 0x00FF00FF00FF00FFuLL) << 8);
505 s.d ^= s1.d;
506 state[c] = s.d;
507 }
508 }
509
510 static void AddRoundKey(u64 *state, const u64 *w)
511 {
512 state[0] ^= w[0];
513 state[1] ^= w[1];
514 }
515
516 static void Cipher(const unsigned char *in, unsigned char *out,
517 const u64 *w, int nr)
518 {
519 u64 state[2];
520 int i;
521
522 memcpy(state, in, 16);
523
524 AddRoundKey(state, w);
525
526 for (i = 1; i < nr; i++) {
527 SubLong(&state[0]);
528 SubLong(&state[1]);
529 ShiftRows(state);
530 MixColumns(state);
531 AddRoundKey(state, w + i*2);
532 }
533
534 SubLong(&state[0]);
535 SubLong(&state[1]);
536 ShiftRows(state);
537 AddRoundKey(state, w + nr*2);
538
539 memcpy(out, state, 16);
540 }
541
542 static void InvCipher(const unsigned char *in, unsigned char *out,
543 const u64 *w, int nr)
544
545 {
546 u64 state[2];
547 int i;
548
549 memcpy(state, in, 16);
550
551 AddRoundKey(state, w + nr*2);
552
553 for (i = nr - 1; i > 0; i--) {
554 InvShiftRows(state);
555 InvSubLong(&state[0]);
556 InvSubLong(&state[1]);
557 AddRoundKey(state, w + i*2);
558 InvMixColumns(state);
559 }
560
561 InvShiftRows(state);
562 InvSubLong(&state[0]);
563 InvSubLong(&state[1]);
564 AddRoundKey(state, w);
565
566 memcpy(out, state, 16);
567 }
568
569 static void RotWord(u32 *x)
570 {
571 unsigned char *w0;
572 unsigned char tmp;
573
574 w0 = (unsigned char *)x;
575 tmp = w0[0];
576 w0[0] = w0[1];
577 w0[1] = w0[2];
578 w0[2] = w0[3];
579 w0[3] = tmp;
580 }
581
582 static void KeyExpansion(const unsigned char *key, u64 *w,
583 int nr, int nk)
584 {
585 u32 rcon;
586 uni prev;
587 u32 temp;
588 int i, n;
589
590 memcpy(w, key, nk*4);
591 memcpy(&rcon, "\1\0\0\0", 4);
592 n = nk/2;
593 prev.d = w[n-1];
594 for (i = n; i < (nr+1)*2; i++) {
595 temp = prev.w[1];
596 if (i % n == 0) {
597 RotWord(&temp);
598 SubWord(&temp);
599 temp ^= rcon;
600 XtimeWord(&rcon);
601 } else if (nk > 6 && i % n == 2) {
602 SubWord(&temp);
603 }
604 prev.d = w[i-n];
605 prev.w[0] ^= temp;
606 prev.w[1] ^= prev.w[0];
607 w[i] = prev.d;
608 }
609 }
610
611 /**
612 * Expand the cipher key into the encryption key schedule.
613 */
614 int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
615 AES_KEY *key)
616 {
617 u64 *rk;
618
619 if (!userKey || !key)
620 return -1;
621 if (bits != 128 && bits != 192 && bits != 256)
622 return -2;
623
624 rk = (u64*)key->rd_key;
625
626 if (bits == 128)
627 key->rounds = 10;
628 else if (bits == 192)
629 key->rounds = 12;
630 else
631 key->rounds = 14;
632
633 KeyExpansion(userKey, rk, key->rounds, bits/32);
634 return 0;
635 }
636
637 /**
638 * Expand the cipher key into the decryption key schedule.
639 */
640 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
641 AES_KEY *key)
642 {
643 return AES_set_encrypt_key(userKey, bits, key);
644 }
645
646 /*
647 * Encrypt a single block
648 * in and out can overlap
649 */
650 void AES_encrypt(const unsigned char *in, unsigned char *out,
651 const AES_KEY *key)
652 {
653 const u64 *rk;
654
655 assert(in && out && key);
656 rk = (u64*)key->rd_key;
657
658 Cipher(in, out, rk, key->rounds);
659 }
660
661 /*
662 * Decrypt a single block
663 * in and out can overlap
664 */
665 void AES_decrypt(const unsigned char *in, unsigned char *out,
666 const AES_KEY *key)
667 {
668 const u64 *rk;
669
670 assert(in && out && key);
671 rk = (u64*)key->rd_key;
672
673 InvCipher(in, out, rk, key->rounds);
674 }
675
676 # ifndef OPENSSL_SMALL_FOOTPRINT
677 void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
678 size_t blocks, const AES_KEY *key,
679 const unsigned char *ivec);
680
681 static void RawToBits(const u8 raw[64], u64 bits[8])
682 {
683 int i, j;
684 u64 in, out;
685
686 memset(bits, 0, 64);
687 for (i = 0; i < 8; i++) {
688 in = 0;
689 for (j = 0; j < 8; j++)
690 in |= ((u64)raw[i * 8 + j]) << (8 * j);
691 out = in & 0xF0F0F0F00F0F0F0FuLL;
692 out |= (in & 0x0F0F0F0F00000000uLL) >> 28;
693 out |= (in & 0x00000000F0F0F0F0uLL) << 28;
694 in = out & 0xCCCC3333CCCC3333uLL;
695 in |= (out & 0x3333000033330000uLL) >> 14;
696 in |= (out & 0x0000CCCC0000CCCCuLL) << 14;
697 out = in & 0xAA55AA55AA55AA55uLL;
698 out |= (in & 0x5500550055005500uLL) >> 7;
699 out |= (in & 0x00AA00AA00AA00AAuLL) << 7;
700 for (j = 0; j < 8; j++) {
701 bits[j] |= (out & 0xFFuLL) << (8 * i);
702 out = out >> 8;
703 }
704 }
705 }
706
707 static void BitsToRaw(const u64 bits[8], u8 raw[64])
708 {
709 int i, j;
710 u64 in, out;
711
712 for (i = 0; i < 8; i++) {
713 in = 0;
714 for (j = 0; j < 8; j++)
715 in |= ((bits[j] >> (8 * i)) & 0xFFuLL) << (8 * j);
716 out = in & 0xF0F0F0F00F0F0F0FuLL;
717 out |= (in & 0x0F0F0F0F00000000uLL) >> 28;
718 out |= (in & 0x00000000F0F0F0F0uLL) << 28;
719 in = out & 0xCCCC3333CCCC3333uLL;
720 in |= (out & 0x3333000033330000uLL) >> 14;
721 in |= (out & 0x0000CCCC0000CCCCuLL) << 14;
722 out = in & 0xAA55AA55AA55AA55uLL;
723 out |= (in & 0x5500550055005500uLL) >> 7;
724 out |= (in & 0x00AA00AA00AA00AAuLL) << 7;
725 for (j = 0; j < 8; j++) {
726 raw[i * 8 + j] = (u8)out;
727 out = out >> 8;
728 }
729 }
730 }
731
732 static void BitsXtime(u64 state[8])
733 {
734 u64 b;
735
736 b = state[7];
737 state[7] = state[6];
738 state[6] = state[5];
739 state[5] = state[4];
740 state[4] = state[3] ^ b;
741 state[3] = state[2] ^ b;
742 state[2] = state[1];
743 state[1] = state[0] ^ b;
744 state[0] = b;
745 }
746
747 /*
748 * This S-box implementation follows a circuit described in
749 * Boyar and Peralta: "A new combinational logic minimization
750 * technique with applications to cryptology."
751 * https://eprint.iacr.org/2009/191.pdf
752 *
753 * The math is similar to above, in that it uses
754 * a tower field of GF(2^2^2^2) but with a different
755 * basis representation, that is better suited to
756 * logic designs.
757 */
758 static void BitsSub(u64 state[8])
759 {
760 u64 x0, x1, x2, x3, x4, x5, x6, x7;
761 u64 y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11;
762 u64 y12, y13, y14, y15, y16, y17, y18, y19, y20, y21;
763 u64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
764 u64 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21;
765 u64 t22, t23, t24, t25, t26, t27, t28, t29, t30, t31;
766 u64 t32, t33, t34, t35, t36, t37, t38, t39, t40, t41;
767 u64 t42, t43, t44, t45, t46, t47, t48, t49, t50, t51;
768 u64 t52, t53, t54, t55, t56, t57, t58, t59, t60, t61;
769 u64 t62, t63, t64, t65, t66, t67;
770 u64 z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11;
771 u64 z12, z13, z14, z15, z16, z17;
772 u64 s0, s1, s2, s3, s4, s5, s6, s7;
773
774 x7 = state[0];
775 x6 = state[1];
776 x5 = state[2];
777 x4 = state[3];
778 x3 = state[4];
779 x2 = state[5];
780 x1 = state[6];
781 x0 = state[7];
782 y14 = x3 ^ x5;
783 y13 = x0 ^ x6;
784 y9 = x0 ^ x3;
785 y8 = x0 ^ x5;
786 t0 = x1 ^ x2;
787 y1 = t0 ^ x7;
788 y4 = y1 ^ x3;
789 y12 = y13 ^ y14;
790 y2 = y1 ^ x0;
791 y5 = y1 ^ x6;
792 y3 = y5 ^ y8;
793 t1 = x4 ^ y12;
794 y15 = t1 ^ x5;
795 y20 = t1 ^ x1;
796 y6 = y15 ^ x7;
797 y10 = y15 ^ t0;
798 y11 = y20 ^ y9;
799 y7 = x7 ^ y11;
800 y17 = y10 ^ y11;
801 y19 = y10 ^ y8;
802 y16 = t0 ^ y11;
803 y21 = y13 ^ y16;
804 y18 = x0 ^ y16;
805 t2 = y12 & y15;
806 t3 = y3 & y6;
807 t4 = t3 ^ t2;
808 t5 = y4 & x7;
809 t6 = t5 ^ t2;
810 t7 = y13 & y16;
811 t8 = y5 & y1;
812 t9 = t8 ^ t7;
813 t10 = y2 & y7;
814 t11 = t10 ^ t7;
815 t12 = y9 & y11;
816 t13 = y14 & y17;
817 t14 = t13 ^ t12;
818 t15 = y8 & y10;
819 t16 = t15 ^ t12;
820 t17 = t4 ^ t14;
821 t18 = t6 ^ t16;
822 t19 = t9 ^ t14;
823 t20 = t11 ^ t16;
824 t21 = t17 ^ y20;
825 t22 = t18 ^ y19;
826 t23 = t19 ^ y21;
827 t24 = t20 ^ y18;
828 t25 = t21 ^ t22;
829 t26 = t21 & t23;
830 t27 = t24 ^ t26;
831 t28 = t25 & t27;
832 t29 = t28 ^ t22;
833 t30 = t23 ^ t24;
834 t31 = t22 ^ t26;
835 t32 = t31 & t30;
836 t33 = t32 ^ t24;
837 t34 = t23 ^ t33;
838 t35 = t27 ^ t33;
839 t36 = t24 & t35;
840 t37 = t36 ^ t34;
841 t38 = t27 ^ t36;
842 t39 = t29 & t38;
843 t40 = t25 ^ t39;
844 t41 = t40 ^ t37;
845 t42 = t29 ^ t33;
846 t43 = t29 ^ t40;
847 t44 = t33 ^ t37;
848 t45 = t42 ^ t41;
849 z0 = t44 & y15;
850 z1 = t37 & y6;
851 z2 = t33 & x7;
852 z3 = t43 & y16;
853 z4 = t40 & y1;
854 z5 = t29 & y7;
855 z6 = t42 & y11;
856 z7 = t45 & y17;
857 z8 = t41 & y10;
858 z9 = t44 & y12;
859 z10 = t37 & y3;
860 z11 = t33 & y4;
861 z12 = t43 & y13;
862 z13 = t40 & y5;
863 z14 = t29 & y2;
864 z15 = t42 & y9;
865 z16 = t45 & y14;
866 z17 = t41 & y8;
867 t46 = z15 ^ z16;
868 t47 = z10 ^ z11;
869 t48 = z5 ^ z13;
870 t49 = z9 ^ z10;
871 t50 = z2 ^ z12;
872 t51 = z2 ^ z5;
873 t52 = z7 ^ z8;
874 t53 = z0 ^ z3;
875 t54 = z6 ^ z7;
876 t55 = z16 ^ z17;
877 t56 = z12 ^ t48;
878 t57 = t50 ^ t53;
879 t58 = z4 ^ t46;
880 t59 = z3 ^ t54;
881 t60 = t46 ^ t57;
882 t61 = z14 ^ t57;
883 t62 = t52 ^ t58;
884 t63 = t49 ^ t58;
885 t64 = z4 ^ t59;
886 t65 = t61 ^ t62;
887 t66 = z1 ^ t63;
888 s0 = t59 ^ t63;
889 s6 = ~(t56 ^ t62);
890 s7 = ~(t48 ^ t60);
891 t67 = t64 ^ t65;
892 s3 = t53 ^ t66;
893 s4 = t51 ^ t66;
894 s5 = t47 ^ t65;
895 s1 = ~(t64 ^ s3);
896 s2 = ~(t55 ^ t67);
897 state[0] = s7;
898 state[1] = s6;
899 state[2] = s5;
900 state[3] = s4;
901 state[4] = s3;
902 state[5] = s2;
903 state[6] = s1;
904 state[7] = s0;
905 }
906
907 static void BitsShiftRows(u64 state[8])
908 {
909 u64 s, s0;
910 int i;
911
912 for (i = 0; i < 8; i++) {
913 s = state[i];
914 s0 = s & 0x1111111111111111uLL;
915 s0 |= ((s & 0x2220222022202220uLL) >> 4) | ((s & 0x0002000200020002uLL) << 12);
916 s0 |= ((s & 0x4400440044004400uLL) >> 8) | ((s & 0x0044004400440044uLL) << 8);
917 s0 |= ((s & 0x8000800080008000uLL) >> 12) | ((s & 0x0888088808880888uLL) << 4);
918 state[i] = s0;
919 }
920 }
921
922 static void BitsMixColumns(u64 state[8])
923 {
924 u64 s1, s;
925 u64 s0[8];
926 int i;
927
928 for (i = 0; i < 8; i++) {
929 s1 = state[i];
930 s = s1;
931 s ^= ((s & 0xCCCCCCCCCCCCCCCCuLL) >> 2) | ((s & 0x3333333333333333uLL) << 2);
932 s ^= ((s & 0xAAAAAAAAAAAAAAAAuLL) >> 1) | ((s & 0x5555555555555555uLL) << 1);
933 s ^= s1;
934 s0[i] = s;
935 }
936 BitsXtime(state);
937 for (i = 0; i < 8; i++) {
938 s1 = state[i];
939 s = s0[i];
940 s ^= s1;
941 s ^= ((s1 & 0xEEEEEEEEEEEEEEEEuLL) >> 1) | ((s1 & 0x1111111111111111uLL) << 3);
942 state[i] = s;
943 }
944 }
945
946 static void BitsAddRoundKey(u64 state[8], const u64 key[8])
947 {
948 int i;
949
950 for (i = 0; i < 8; i++)
951 state[i] ^= key[i];
952 }
953
954 void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
955 size_t blocks, const AES_KEY *key,
956 const unsigned char *ivec)
957 {
958 struct {
959 u8 cipher[64];
960 u64 state[8];
961 u64 rd_key[AES_MAXNR + 1][8];
962 } *bs;
963 u32 ctr32;
964 int i;
965
966 ctr32 = GETU32(ivec + 12);
967 if (blocks >= 4
968 && (bs = OPENSSL_malloc(sizeof(*bs)))) {
969 for (i = 0; i < key->rounds + 1; i++) {
970 memcpy(bs->cipher + 0, &key->rd_key[4 * i], 16);
971 memcpy(bs->cipher + 16, bs->cipher, 16);
972 memcpy(bs->cipher + 32, bs->cipher, 32);
973 RawToBits(bs->cipher, bs->rd_key[i]);
974 }
975 while (blocks) {
976 memcpy(bs->cipher, ivec, 12);
977 PUTU32(bs->cipher + 12, ctr32);
978 ctr32++;
979 memcpy(bs->cipher + 16, ivec, 12);
980 PUTU32(bs->cipher + 28, ctr32);
981 ctr32++;
982 memcpy(bs->cipher + 32, ivec, 12);
983 PUTU32(bs->cipher + 44, ctr32);
984 ctr32++;
985 memcpy(bs->cipher + 48, ivec, 12);
986 PUTU32(bs->cipher + 60, ctr32);
987 ctr32++;
988 RawToBits(bs->cipher, bs->state);
989 BitsAddRoundKey(bs->state, bs->rd_key[0]);
990 for (i = 1; i < key->rounds; i++) {
991 BitsSub(bs->state);
992 BitsShiftRows(bs->state);
993 BitsMixColumns(bs->state);
994 BitsAddRoundKey(bs->state, bs->rd_key[i]);
995 }
996 BitsSub(bs->state);
997 BitsShiftRows(bs->state);
998 BitsAddRoundKey(bs->state, bs->rd_key[key->rounds]);
999 BitsToRaw(bs->state, bs->cipher);
1000 for (i = 0; i < 64 && blocks; i++) {
1001 out[i] = in[i] ^ bs->cipher[i];
1002 if ((i & 15) == 15)
1003 blocks--;
1004 }
1005 in += i;
1006 out += i;
1007 }
1008 OPENSSL_clear_free(bs, sizeof(*bs));
1009 } else {
1010 unsigned char cipher[16];
1011
1012 while (blocks) {
1013 memcpy(cipher, ivec, 12);
1014 PUTU32(cipher + 12, ctr32);
1015 AES_encrypt(cipher, cipher, key);
1016 for (i = 0; i < 16; i++)
1017 out[i] = in[i] ^ cipher[i];
1018 in += 16;
1019 out += 16;
1020 ctr32++;
1021 blocks--;
1022 }
1023 }
1024 }
1025 # endif
1026 #elif !defined(AES_ASM)
461027 /*-
471028 Te0[x] = S [x].[02, 01, 01, 03];
481029 Te1[x] = S [x].[03, 02, 01, 01];
2323 # define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
2424 # endif
2525
26 typedef unsigned long long u64;
2627 # ifdef AES_LONG
2728 typedef unsigned long u32;
2829 # else
128128 void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
129129 size_t len, const AES_KEY *key1,
130130 const AES_KEY *key2, const unsigned char iv[16]);
131 #endif
132 #if !defined(AES_ASM) && !defined(AES_CTR_ASM) \
133 && defined(OPENSSL_AES_CONST_TIME) \
134 && !defined(OPENSSL_SMALL_FOOTPRINT)
135 # define AES_CTR_ASM
131136 #endif
132137 #ifdef AES_CTR_ASM
133138 void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,