Crypto++  7.0
Free C++ class library of cryptographic schemes
sha-simd.cpp
1 // sha-simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to SHA-NI and
5 // ARMv8a SHA instructions. A separate source file is needed
6 // because additional CXXFLAGS are required to enable the
7 // appropriate instruction sets in some build configurations.
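// For example, GCC and Clang builds typically pass -msse4.2 -msha for the
// x86 paths and an ARMv8 crypto target (such as -march=armv8-a+crypto) for
// the ARM paths; the exact flags depend on the build system in use.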
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "sha.h"
12 #include "misc.h"
13 
14 #if (CRYPTOPP_SHANI_AVAILABLE)
15 # include <nmmintrin.h>
16 # include <immintrin.h>
17 #endif
18 
19 // Use ARMv8 rather than NEON due to compiler inconsistencies
20 #if (CRYPTOPP_ARM_SHA_AVAILABLE)
21 # include <arm_neon.h>
22 #endif
23 
24 // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
25 // compilers don't follow ACLE conventions for the include.
26 #if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
27 # include <stdint.h>
28 # include <arm_acle.h>
29 #endif
30 
31 #if CRYPTOPP_POWER8_SHA_AVAILABLE
32 # include "ppc-simd.h"
33 #endif
34 
35 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
36 # include <signal.h>
37 # include <setjmp.h>
38 #endif
39 
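// EXCEPTION_EXECUTE_HANDLER is used by the __except clauses in the probes
// below. It is normally supplied by the Windows SDK; the value 1 matches the
// SDK definition for toolchains that do not provide it.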
40 #ifndef EXCEPTION_EXECUTE_HANDLER
41 # define EXCEPTION_EXECUTE_HANDLER 1
42 #endif
43 
44 // Clang __m128i casts
45 #define M128_CAST(x) ((__m128i *)(void *)(x))
46 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
47 
48 NAMESPACE_BEGIN(CryptoPP)
49 
50 // ***************** SIGILL probes ********************
51 
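// The probes execute a handful of SHA instructions and report whether they
// ran. Compile-time availability (the *_AVAILABLE macros) only means the
// compiler can emit the instructions; the CPU may still lack them. The
// instructions are therefore run under a SIGILL handler (or __try/__except
// with MSVC) and a fault is reported as "not available". The result lanes
// are OR'd together so the compiler cannot discard the intrinsics as dead
// code.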
52 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
53 extern "C" {
54  typedef void (*SigHandler)(int);
55 
56  static jmp_buf s_jmpSIGILL;
57  static void SigIllHandler(int)
58  {
59  longjmp(s_jmpSIGILL, 1);
60  }
61 }
62 #endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
63 
64 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
65 bool CPU_ProbeSHA1()
66 {
67 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
68  return false;
69 #elif (CRYPTOPP_ARM_SHA_AVAILABLE)
70 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
71  volatile bool result = true;
72  __try
73  {
74  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
75 
76  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
77  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
78  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
79  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
80  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
81 
82  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
83  }
84  __except (EXCEPTION_EXECUTE_HANDLER)
85  {
86  return false;
87  }
88  return result;
89 # else
90 
91  // longjmp and clobber warnings. Volatile is required.
92  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
93  volatile bool result = true;
94 
95  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
96  if (oldHandler == SIG_ERR)
97  return false;
98 
99  volatile sigset_t oldMask;
100  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
101  return false;
102 
103  if (setjmp(s_jmpSIGILL))
104  result = false;
105  else
106  {
107  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
108 
109  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
110  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
111  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
112  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
113  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
114 
115  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
116  }
117 
118  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
119  signal(SIGILL, oldHandler);
120  return result;
121 # endif
122 #else
123  return false;
124 #endif // CRYPTOPP_ARM_SHA_AVAILABLE
125 }
126 
127 bool CPU_ProbeSHA2()
128 {
129 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
130  return false;
131 #elif (CRYPTOPP_ARM_SHA_AVAILABLE)
132 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
133  volatile bool result = true;
134  __try
135  {
136  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
137 
138  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
139  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
140  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
141  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
142 
143  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
144  }
145  __except (EXCEPTION_EXECUTE_HANDLER)
146  {
147  return false;
148  }
149  return result;
150 # else
151 
152  // longjmp and clobber warnings. Volatile is required.
153  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
154  volatile bool result = true;
155 
156  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
157  if (oldHandler == SIG_ERR)
158  return false;
159 
160  volatile sigset_t oldMask;
161  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
162  return false;
163 
164  if (setjmp(s_jmpSIGILL))
165  result = false;
166  else
167  {
168  uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12};
169 
170  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
171  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
172  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
173  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
174 
175  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
176  }
177 
178  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
179  signal(SIGILL, oldHandler);
180  return result;
181 # endif
182 #else
183  return false;
184 #endif // CRYPTOPP_ARM_SHA_AVAILABLE
185 }
186 #endif // ARM32 or ARM64
187 
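// The probes above are not called directly by the hashing code. Elsewhere in
// the library (see cpu.h and cpu.cpp) the results are cached behind query
// functions such as HasSHA1() and HasSHA2(), which sha.cpp consults when
// selecting the ARMv8 routines later in this file.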
188 // ***************** Intel x86 SHA ********************
189 
190 // provided by sha.cpp
191 extern const word32 SHA256_K[64];
192 extern const word64 SHA512_K[80];
193 
194 /////////////////////////////////////
195 // start of Walton and Gulley code //
196 /////////////////////////////////////
197 
198 #if CRYPTOPP_SHANI_AVAILABLE
199 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
200 void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
201 {
202  CRYPTOPP_ASSERT(state);
203  CRYPTOPP_ASSERT(data);
204  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
205 
206  __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
207  __m128i MASK, MSG0, MSG1, MSG2, MSG3;
208 
209  // Load initial values
210  ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
211  E0 = _mm_set_epi32(state[4], 0, 0, 0);
212  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
213 
214  // IA-32 SHA is little endian, SHA::Transform is big endian,
215  // and SHA::HashMultipleBlocks can be either. ByteOrder
216  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
217  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
218  _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
219  _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
220 
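 // Notes on the SHA-NI intrinsics used below. The immediate passed to
 // _mm_sha1rnds4_epu32 selects the round function and constant for a group
 // of four rounds (0: Ch/0x5A827999, 1: Parity/0x6ED9EBA1, 2: Maj/0x8F1BBCDC,
 // 3: Parity/0xCA62C1D6), so no K table is needed. _mm_sha1nexte_epu32 adds
 // the rotated E from the previous rounds into the next scheduled words, and
 // the sha1msg1/sha1msg2 pair performs the message schedule expansion.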
221  while (length >= SHA1::BLOCKSIZE)
222  {
223  // Save current hash
224  ABCD_SAVE = ABCD;
225  E0_SAVE = E0;
226 
227  // Rounds 0-3
228  MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
229  MSG0 = _mm_shuffle_epi8(MSG0, MASK);
230  E0 = _mm_add_epi32(E0, MSG0);
231  E1 = ABCD;
232  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
233 
234  // Rounds 4-7
235  MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
236  MSG1 = _mm_shuffle_epi8(MSG1, MASK);
237  E1 = _mm_sha1nexte_epu32(E1, MSG1);
238  E0 = ABCD;
239  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
240  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
241 
242  // Rounds 8-11
243  MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
244  MSG2 = _mm_shuffle_epi8(MSG2, MASK);
245  E0 = _mm_sha1nexte_epu32(E0, MSG2);
246  E1 = ABCD;
247  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
248  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
249  MSG0 = _mm_xor_si128(MSG0, MSG2);
250 
251  // Rounds 12-15
252  MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
253  MSG3 = _mm_shuffle_epi8(MSG3, MASK);
254  E1 = _mm_sha1nexte_epu32(E1, MSG3);
255  E0 = ABCD;
256  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
257  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
258  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
259  MSG1 = _mm_xor_si128(MSG1, MSG3);
260 
261  // Rounds 16-19
262  E0 = _mm_sha1nexte_epu32(E0, MSG0);
263  E1 = ABCD;
264  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
265  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
266  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
267  MSG2 = _mm_xor_si128(MSG2, MSG0);
268 
269  // Rounds 20-23
270  E1 = _mm_sha1nexte_epu32(E1, MSG1);
271  E0 = ABCD;
272  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
273  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
274  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
275  MSG3 = _mm_xor_si128(MSG3, MSG1);
276 
277  // Rounds 24-27
278  E0 = _mm_sha1nexte_epu32(E0, MSG2);
279  E1 = ABCD;
280  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
281  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
282  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
283  MSG0 = _mm_xor_si128(MSG0, MSG2);
284 
285  // Rounds 28-31
286  E1 = _mm_sha1nexte_epu32(E1, MSG3);
287  E0 = ABCD;
288  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
289  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
290  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
291  MSG1 = _mm_xor_si128(MSG1, MSG3);
292 
293  // Rounds 32-35
294  E0 = _mm_sha1nexte_epu32(E0, MSG0);
295  E1 = ABCD;
296  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
297  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
298  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
299  MSG2 = _mm_xor_si128(MSG2, MSG0);
300 
301  // Rounds 36-39
302  E1 = _mm_sha1nexte_epu32(E1, MSG1);
303  E0 = ABCD;
304  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
305  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
306  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
307  MSG3 = _mm_xor_si128(MSG3, MSG1);
308 
309  // Rounds 40-43
310  E0 = _mm_sha1nexte_epu32(E0, MSG2);
311  E1 = ABCD;
312  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
313  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
314  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
315  MSG0 = _mm_xor_si128(MSG0, MSG2);
316 
317  // Rounds 44-47
318  E1 = _mm_sha1nexte_epu32(E1, MSG3);
319  E0 = ABCD;
320  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
321  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
322  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
323  MSG1 = _mm_xor_si128(MSG1, MSG3);
324 
325  // Rounds 48-51
326  E0 = _mm_sha1nexte_epu32(E0, MSG0);
327  E1 = ABCD;
328  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
329  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
330  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
331  MSG2 = _mm_xor_si128(MSG2, MSG0);
332 
333  // Rounds 52-55
334  E1 = _mm_sha1nexte_epu32(E1, MSG1);
335  E0 = ABCD;
336  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
337  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
338  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
339  MSG3 = _mm_xor_si128(MSG3, MSG1);
340 
341  // Rounds 56-59
342  E0 = _mm_sha1nexte_epu32(E0, MSG2);
343  E1 = ABCD;
344  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
345  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
346  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
347  MSG0 = _mm_xor_si128(MSG0, MSG2);
348 
349  // Rounds 60-63
350  E1 = _mm_sha1nexte_epu32(E1, MSG3);
351  E0 = ABCD;
352  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
353  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
354  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
355  MSG1 = _mm_xor_si128(MSG1, MSG3);
356 
357  // Rounds 64-67
358  E0 = _mm_sha1nexte_epu32(E0, MSG0);
359  E1 = ABCD;
360  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
361  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
362  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
363  MSG2 = _mm_xor_si128(MSG2, MSG0);
364 
365  // Rounds 68-71
366  E1 = _mm_sha1nexte_epu32(E1, MSG1);
367  E0 = ABCD;
368  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
369  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
370  MSG3 = _mm_xor_si128(MSG3, MSG1);
371 
372  // Rounds 72-75
373  E0 = _mm_sha1nexte_epu32(E0, MSG2);
374  E1 = ABCD;
375  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
376  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
377 
378  // Rounds 76-79
379  E1 = _mm_sha1nexte_epu32(E1, MSG3);
380  E0 = ABCD;
381  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
382 
383  // Add values back to state
384  E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
385  ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
386 
387  data += SHA1::BLOCKSIZE/sizeof(word32);
388  length -= SHA1::BLOCKSIZE;
389  }
390 
391  // Save state
392  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
393  _mm_storeu_si128(M128_CAST(state), ABCD);
394  state[4] = _mm_extract_epi32(E0, 3);
395 }
396 
397 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
398 void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
399 {
400  CRYPTOPP_ASSERT(state);
401  CRYPTOPP_ASSERT(data);
402  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
403 
404  __m128i STATE0, STATE1;
405  __m128i MSG, TMP, MASK;
406  __m128i TMSG0, TMSG1, TMSG2, TMSG3;
407  __m128i ABEF_SAVE, CDGH_SAVE;
408 
409  // Load initial values
410  TMP = _mm_loadu_si128(M128_CAST(&state[0]));
411  STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
412 
413  // IA-32 SHA is little endian, SHA::Transform is big endian,
414  // and SHA::HashMultipleBlocks can be either. ByteOrder
415  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
416  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
417  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
418  _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
419 
420  TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
421  STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
422  STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
423  STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
424 
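 // Notes on the SHA-NI intrinsics used below. The x86 SHA-256 instructions
 // keep the state packed as (A,B,E,F) and (C,D,G,H), hence the shuffles
 // above. Each _mm_sha256rnds2_epu32 performs two rounds and takes its two
 // message-plus-constant words from the low 64 bits of the third operand,
 // which is why MSG is shuffled by 0x0E between the two calls. The
 // _mm_set_epi64x() values are pairs of SHA256_K round constants.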
425  while (length >= SHA256::BLOCKSIZE)
426  {
427  // Save current hash
428  ABEF_SAVE = STATE0;
429  CDGH_SAVE = STATE1;
430 
431  // Rounds 0-3
432  MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
433  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
434  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
435  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
436  MSG = _mm_shuffle_epi32(MSG, 0x0E);
437  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
438 
439  // Rounds 4-7
440  TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
441  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
442  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
443  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
444  MSG = _mm_shuffle_epi32(MSG, 0x0E);
445  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
446  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
447 
448  // Rounds 8-11
449  TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
450  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
451  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
452  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
453  MSG = _mm_shuffle_epi32(MSG, 0x0E);
454  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
455  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
456 
457  // Rounds 12-15
458  TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
459  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
460  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
461  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
462  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
463  TMSG0 = _mm_add_epi32(TMSG0, TMP);
464  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
465  MSG = _mm_shuffle_epi32(MSG, 0x0E);
466  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
467  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
468 
469  // Rounds 16-19
470  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
471  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
472  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
473  TMSG1 = _mm_add_epi32(TMSG1, TMP);
474  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
475  MSG = _mm_shuffle_epi32(MSG, 0x0E);
476  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
477  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
478 
479  // Rounds 20-23
480  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
481  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
482  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
483  TMSG2 = _mm_add_epi32(TMSG2, TMP);
484  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
485  MSG = _mm_shuffle_epi32(MSG, 0x0E);
486  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
487  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
488 
489  // Rounds 24-27
490  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
491  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
492  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
493  TMSG3 = _mm_add_epi32(TMSG3, TMP);
494  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
495  MSG = _mm_shuffle_epi32(MSG, 0x0E);
496  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
497  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
498 
499  // Rounds 28-31
500  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
501  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
502  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
503  TMSG0 = _mm_add_epi32(TMSG0, TMP);
504  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
505  MSG = _mm_shuffle_epi32(MSG, 0x0E);
506  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
507  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
508 
509  // Rounds 32-35
510  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
511  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
512  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
513  TMSG1 = _mm_add_epi32(TMSG1, TMP);
514  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
515  MSG = _mm_shuffle_epi32(MSG, 0x0E);
516  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
517  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
518 
519  // Rounds 36-39
520  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
521  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
522  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
523  TMSG2 = _mm_add_epi32(TMSG2, TMP);
524  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
525  MSG = _mm_shuffle_epi32(MSG, 0x0E);
526  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
527  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
528 
529  // Rounds 40-43
530  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
531  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
532  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
533  TMSG3 = _mm_add_epi32(TMSG3, TMP);
534  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
535  MSG = _mm_shuffle_epi32(MSG, 0x0E);
536  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
537  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
538 
539  // Rounds 44-47
540  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
541  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
542  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
543  TMSG0 = _mm_add_epi32(TMSG0, TMP);
544  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
545  MSG = _mm_shuffle_epi32(MSG, 0x0E);
546  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
547  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
548 
549  // Rounds 48-51
550  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
551  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
552  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
553  TMSG1 = _mm_add_epi32(TMSG1, TMP);
554  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
555  MSG = _mm_shuffle_epi32(MSG, 0x0E);
556  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
557  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
558 
559  // Rounds 52-55
560  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
561  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
562  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
563  TMSG2 = _mm_add_epi32(TMSG2, TMP);
564  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
565  MSG = _mm_shuffle_epi32(MSG, 0x0E);
566  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
567 
568  // Rounds 56-59
569  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
570  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
571  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
572  TMSG3 = _mm_add_epi32(TMSG3, TMP);
573  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
574  MSG = _mm_shuffle_epi32(MSG, 0x0E);
575  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
576 
577  // Rounds 60-63
578  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
579  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
580  MSG = _mm_shuffle_epi32(MSG, 0x0E);
581  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
582 
583  // Add values back to state
584  STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
585  STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
586 
587  data += SHA256::BLOCKSIZE/sizeof(word32);
588  length -= SHA256::BLOCKSIZE;
589  }
590 
591  TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
592  STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
593  STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
594 STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
595 
596  // Save state
597  _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
598  _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
599 }
600 #endif // CRYPTOPP_SHANI_AVAILABLE
601 
602 ///////////////////////////////////
603 // end of Walton and Gulley code //
604 ///////////////////////////////////
605 
606 // ***************** ARMV8 SHA ********************
607 
608 /////////////////////////////////////////////////////////////
609 // start of Walton, Schneiders, O'Rourke and Hovsmith code //
610 /////////////////////////////////////////////////////////////
611 
612 #if CRYPTOPP_ARM_SHA_AVAILABLE
613 void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
614 {
615  CRYPTOPP_ASSERT(state);
616  CRYPTOPP_ASSERT(data);
617  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
618 
619  uint32x4_t C0, C1, C2, C3;
620  uint32x4_t ABCD, ABCD_SAVED;
621  uint32x4_t MSG0, MSG1, MSG2, MSG3;
622  uint32x4_t TMP0, TMP1;
623  uint32_t E0, E0_SAVED, E1;
624 
625  // Load initial values
626  C0 = vdupq_n_u32(0x5A827999);
627  C1 = vdupq_n_u32(0x6ED9EBA1);
628  C2 = vdupq_n_u32(0x8F1BBCDC);
629  C3 = vdupq_n_u32(0xCA62C1D6);
630 
631  ABCD = vld1q_u32(&state[0]);
632  E0 = state[4];
633 
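 // Notes on the ARMv8 intrinsics used below. vsha1h_u32 computes the 30-bit
 // rotation used for the next E value; vsha1cq_u32, vsha1pq_u32 and
 // vsha1mq_u32 each perform four rounds with the Choose, Parity and Majority
 // functions; vsha1su0q_u32/vsha1su1q_u32 perform the message schedule
 // expansion. C0-C3 hold the four round constants broadcast across lanes.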
634  while (length >= SHA1::BLOCKSIZE)
635  {
636  // Save current hash
637  ABCD_SAVED = ABCD;
638  E0_SAVED = E0;
639 
640  MSG0 = vld1q_u32(data + 0);
641  MSG1 = vld1q_u32(data + 4);
642  MSG2 = vld1q_u32(data + 8);
643  MSG3 = vld1q_u32(data + 12);
644 
645  if (order == BIG_ENDIAN_ORDER) // Data arrangement
646  {
647  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
648  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
649  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
650  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
651  }
652 
653  TMP0 = vaddq_u32(MSG0, C0);
654  TMP1 = vaddq_u32(MSG1, C0);
655 
656  // Rounds 0-3
657  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
658  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
659  TMP0 = vaddq_u32(MSG2, C0);
660  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
661 
662  // Rounds 4-7
663  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
664  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
665  TMP1 = vaddq_u32(MSG3, C0);
666  MSG0 = vsha1su1q_u32(MSG0, MSG3);
667  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
668 
669  // Rounds 8-11
670  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
671  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
672  TMP0 = vaddq_u32(MSG0, C0);
673  MSG1 = vsha1su1q_u32(MSG1, MSG0);
674  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
675 
676  // Rounds 12-15
677  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
678  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
679  TMP1 = vaddq_u32(MSG1, C1);
680  MSG2 = vsha1su1q_u32(MSG2, MSG1);
681  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
682 
683  // Rounds 16-19
684  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
685  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
686  TMP0 = vaddq_u32(MSG2, C1);
687  MSG3 = vsha1su1q_u32(MSG3, MSG2);
688  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
689 
690  // Rounds 20-23
691  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
692  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
693  TMP1 = vaddq_u32(MSG3, C1);
694  MSG0 = vsha1su1q_u32(MSG0, MSG3);
695  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
696 
697  // Rounds 24-27
698  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
699  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
700  TMP0 = vaddq_u32(MSG0, C1);
701  MSG1 = vsha1su1q_u32(MSG1, MSG0);
702  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
703 
704  // Rounds 28-31
705  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
706  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
707  TMP1 = vaddq_u32(MSG1, C1);
708  MSG2 = vsha1su1q_u32(MSG2, MSG1);
709  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
710 
711  // Rounds 32-35
712  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
713  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
714  TMP0 = vaddq_u32(MSG2, C2);
715  MSG3 = vsha1su1q_u32(MSG3, MSG2);
716  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
717 
718  // Rounds 36-39
719  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
720  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
721  TMP1 = vaddq_u32(MSG3, C2);
722  MSG0 = vsha1su1q_u32(MSG0, MSG3);
723  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
724 
725  // Rounds 40-43
726  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
727  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
728  TMP0 = vaddq_u32(MSG0, C2);
729  MSG1 = vsha1su1q_u32(MSG1, MSG0);
730  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
731 
732  // Rounds 44-47
733  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
734  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
735  TMP1 = vaddq_u32(MSG1, C2);
736  MSG2 = vsha1su1q_u32(MSG2, MSG1);
737  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
738 
739  // Rounds 48-51
740  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
741  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
742  TMP0 = vaddq_u32(MSG2, C2);
743  MSG3 = vsha1su1q_u32(MSG3, MSG2);
744  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
745 
746  // Rounds 52-55
747  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
748  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
749  TMP1 = vaddq_u32(MSG3, C3);
750  MSG0 = vsha1su1q_u32(MSG0, MSG3);
751  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
752 
753  // Rounds 56-59
754  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
755  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
756  TMP0 = vaddq_u32(MSG0, C3);
757  MSG1 = vsha1su1q_u32(MSG1, MSG0);
758  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
759 
760  // Rounds 60-63
761  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
762  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
763  TMP1 = vaddq_u32(MSG1, C3);
764  MSG2 = vsha1su1q_u32(MSG2, MSG1);
765  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
766 
767  // Rounds 64-67
768  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
769  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
770  TMP0 = vaddq_u32(MSG2, C3);
771  MSG3 = vsha1su1q_u32(MSG3, MSG2);
772  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
773 
774  // Rounds 68-71
775  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
776  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
777  TMP1 = vaddq_u32(MSG3, C3);
778  MSG0 = vsha1su1q_u32(MSG0, MSG3);
779 
780  // Rounds 72-75
781  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
782  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
783 
784  // Rounds 76-79
785  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
786  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
787 
788  E0 += E0_SAVED;
789  ABCD = vaddq_u32(ABCD_SAVED, ABCD);
790 
791  data += SHA1::BLOCKSIZE/sizeof(word32);
792  length -= SHA1::BLOCKSIZE;
793  }
794 
795  // Save state
796  vst1q_u32(&state[0], ABCD);
797  state[4] = E0;
798 }
799 
800 void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
801 {
802  CRYPTOPP_ASSERT(state);
803  CRYPTOPP_ASSERT(data);
804  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
805 
806  uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
807  uint32x4_t MSG0, MSG1, MSG2, MSG3;
808  uint32x4_t TMP0, TMP1, TMP2;
809 
810  // Load initial values
811  STATE0 = vld1q_u32(&state[0]);
812  STATE1 = vld1q_u32(&state[4]);
813 
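 // Notes on the ARMv8 intrinsics used below. STATE0 and STATE1 hold
 // {A,B,C,D} and {E,F,G,H} (the ABEF/CDGH names mirror the x86 code above).
 // vsha256hq_u32 and vsha256h2q_u32 each perform four rounds, producing the
 // updated {A,B,C,D} and {E,F,G,H} halves respectively, while
 // vsha256su0q_u32/vsha256su1q_u32 perform the message schedule expansion.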
814  while (length >= SHA256::BLOCKSIZE)
815  {
816  // Save current hash
817  ABEF_SAVE = STATE0;
818  CDGH_SAVE = STATE1;
819 
820  // Load message
821  MSG0 = vld1q_u32(data + 0);
822  MSG1 = vld1q_u32(data + 4);
823  MSG2 = vld1q_u32(data + 8);
824  MSG3 = vld1q_u32(data + 12);
825 
826  if (order == BIG_ENDIAN_ORDER) // Data arrangement
827  {
828  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
829  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
830  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
831  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
832  }
833 
834  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
835 
836  // Rounds 0-3
837  MSG0 = vsha256su0q_u32(MSG0, MSG1);
838  TMP2 = STATE0;
839  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
840  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
841  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
842  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);;
843 
844  // Rounds 4-7
845  MSG1 = vsha256su0q_u32(MSG1, MSG2);
846  TMP2 = STATE0;
847  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
848  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
849  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
850  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);;
851 
852  // Rounds 8-11
853  MSG2 = vsha256su0q_u32(MSG2, MSG3);
854  TMP2 = STATE0;
855  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
856  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
857  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
858  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);;
859 
860  // Rounds 12-15
861  MSG3 = vsha256su0q_u32(MSG3, MSG0);
862  TMP2 = STATE0;
863  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
864  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
865  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
866  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);;
867 
868  // Rounds 16-19
869  MSG0 = vsha256su0q_u32(MSG0, MSG1);
870  TMP2 = STATE0;
871  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
872  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
873  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
874  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);;
875 
876  // Rounds 20-23
877  MSG1 = vsha256su0q_u32(MSG1, MSG2);
878  TMP2 = STATE0;
879  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
880  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
881  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
882  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);;
883 
884  // Rounds 24-27
885  MSG2 = vsha256su0q_u32(MSG2, MSG3);
886  TMP2 = STATE0;
887  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
888  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
889  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
890  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);;
891 
892  // Rounds 28-31
893  MSG3 = vsha256su0q_u32(MSG3, MSG0);
894  TMP2 = STATE0;
895  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
896  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
897  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
898  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);;
899 
900  // Rounds 32-35
901  MSG0 = vsha256su0q_u32(MSG0, MSG1);
902  TMP2 = STATE0;
903  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
904  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
905  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
906  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);;
907 
908  // Rounds 36-39
909  MSG1 = vsha256su0q_u32(MSG1, MSG2);
910  TMP2 = STATE0;
911  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
912  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
913  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
914  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);;
915 
916  // Rounds 40-43
917  MSG2 = vsha256su0q_u32(MSG2, MSG3);
918  TMP2 = STATE0;
919  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
920  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
921  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
922  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);;
923 
924  // Rounds 44-47
925  MSG3 = vsha256su0q_u32(MSG3, MSG0);
926  TMP2 = STATE0;
927  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
928  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
929  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
930  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);;
931 
932  // Rounds 48-51
933  TMP2 = STATE0;
934  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
935  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
936  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);;
937 
938  // Rounds 52-55
939  TMP2 = STATE0;
940  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
941  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
942  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);;
943 
944  // Rounds 56-59
945  TMP2 = STATE0;
946  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
947  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
948  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);;
949 
950  // Rounds 60-63
951  TMP2 = STATE0;
952  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
953  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);;
954 
955  // Add back to state
956  STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
957  STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
958 
959  data += SHA256::BLOCKSIZE/sizeof(word32);
960  length -= SHA256::BLOCKSIZE;
961  }
962 
963  // Save state
964  vst1q_u32(&state[0], STATE0);
965  vst1q_u32(&state[4], STATE1);
966 }
967 #endif // CRYPTOPP_ARM_SHA_AVAILABLE
968 
969 ///////////////////////////////////////////////////////////
970 // end of Walton, Schneiders, O'Rourke and Hovsmith code //
971 ///////////////////////////////////////////////////////////
972 
973 // ***************** Power8 SHA ********************
974 
975 //////////////////////////////////////////////////
976 // start Gustavo, Serra, Scalet and Walton code //
977 //////////////////////////////////////////////////
978 
979 #if CRYPTOPP_POWER8_SHA_AVAILABLE
980 
981 // Indexes into the S[] array
982 enum {A=0, B=1, C, D, E, F, G, H};
983 
984 typedef __vector unsigned char uint8x16_p8;
985 typedef __vector unsigned int uint32x4_p8;
986 typedef __vector unsigned long long uint64x2_p8;
987 
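// VEC_XL_BE is an unaligned vector load in big-endian element order. XL C/C++
// provides vec_xl_be directly; the GCC path issues lxvd2x itself. The SHA
// routines below use the VectorLoad*/VectorLoadMsg* helpers that follow.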
988 uint32x4_p8 VEC_XL_BE(int offset, const uint8_t* data)
989 {
990 #if defined(CRYPTOPP_XLC_VERSION)
991  return vec_xl_be(offset, data);
992 #else
993  uint32x4_p8 res;
994  __asm(" lxvd2x %x0, %1, %2 \n\t"
995  : "=wa" (res)
996  : "b" (data), "r" (offset));
997  return res;
998 #endif
999 }
1000 
1001 #endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1002 
1003 #if CRYPTOPP_POWER8_SHA_AVAILABLE
1004 
1005 // Aligned load
1006 template <class T> static inline
1007 uint32x4_p8 VectorLoad32x4(const T* data, int offset)
1008 {
1009  return (uint32x4_p8)vec_ld(offset, data);
1010 }
1011 
1012 // Unaligned load
1013 template <class T> static inline
1014 uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
1015 {
1016 #if defined(CRYPTOPP_XLC_VERSION)
1017  return (uint32x4_p8)vec_xl(offset, data);
1018 #else
1019  return (uint32x4_p8)vec_vsx_ld(offset, data);
1020 #endif
1021 }
1022 
1023 // Aligned store
1024 template <class T> static inline
1025 void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
1026 {
1027  vec_st((uint8x16_p8)val, offset, data);
1028 }
1029 
1030 // Unaligned store
1031 template <class T> static inline
1032 void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
1033 {
1034 #if defined(CRYPTOPP_XLC_VERSION)
1035  vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
1036 #else
1037  vec_vsx_st((uint8x16_p8)val, offset, (uint8_t*)data);
1038 #endif
1039 }
1040 
1041 // Unaligned load of a user message. The load is big-endian,
1042 // and then the message is permuted for 32-bit words.
1043 template <class T> static inline
1044 uint32x4_p8 VectorLoadMsg32x4(const T* data, int offset)
1045 {
1046 #if defined(CRYPTOPP_LITTLE_ENDIAN)
1047  const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1048  const uint32x4_p8 r = VectorLoad32x4u(data, offset);
1049  return (uint32x4_p8)vec_perm(r, r, mask);
1050 #else
1051  return VectorLoad32x4u(data, offset);
1052 #endif
1053 }
1054 
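// VectorCh and VectorMaj evaluate the SHA Ch and Maj functions with a single
// vec_sel. Ch(x,y,z) selects y where x is set and z where it is clear, which
// is vec_sel(z,y,x). For Maj(x,y,z), wherever x and y agree the majority is
// that common value, and wherever they differ z breaks the tie, which is
// vec_sel(y, z, x^y).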
1055 static inline
1056 uint32x4_p8 VectorCh(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
1057 {
1058  // The trick below is due to Andy Polyakov and Jack Lloyd
1059  return vec_sel(z,y,x);
1060 }
1061 
1062 static inline
1063 uint32x4_p8 VectorMaj(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
1064 {
1065  // The trick below is due to Andy Polyakov and Jack Lloyd
1066  return vec_sel(y, z, vec_xor(x, y));
1067 }
1068 
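// The vshasigmaw built-in computes the SHA-256 sigma functions per 32-bit
// word: the second argument selects the lower-case sigma (0) or upper-case
// Sigma (1) family, and the third is a per-element mask choosing the variant
// (0 yields sigma0/Sigma0 in every lane, 0xf yields sigma1/Sigma1).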
1069 static inline
1070 uint32x4_p8 Vector_sigma0(const uint32x4_p8 val)
1071 {
1072 #if defined(CRYPTOPP_XLC_VERSION)
1073  return __vshasigmaw(val, 0, 0);
1074 #else
1075  return __builtin_crypto_vshasigmaw(val, 0, 0);
1076 #endif
1077 }
1078 
1079 static inline
1080 uint32x4_p8 Vector_sigma1(const uint32x4_p8 val)
1081 {
1082 #if defined(CRYPTOPP_XLC_VERSION)
1083  return __vshasigmaw(val, 0, 0xf);
1084 #else
1085  return __builtin_crypto_vshasigmaw(val, 0, 0xf);
1086 #endif
1087 }
1088 
1089 static inline
1090 uint32x4_p8 VectorSigma0(const uint32x4_p8 val)
1091 {
1092 #if defined(CRYPTOPP_XLC_VERSION)
1093  return __vshasigmaw(val, 1, 0);
1094 #else
1095  return __builtin_crypto_vshasigmaw(val, 1, 0);
1096 #endif
1097 }
1098 
1099 static inline
1100 uint32x4_p8 VectorSigma1(const uint32x4_p8 val)
1101 {
1102 #if defined(CRYPTOPP_XLC_VERSION)
1103  return __vshasigmaw(val, 1, 0xf);
1104 #else
1105  return __builtin_crypto_vshasigmaw(val, 1, 0xf);
1106 #endif
1107 }
1108 
1109 static inline
1110 uint32x4_p8 VectorPack(const uint32x4_p8 a, const uint32x4_p8 b,
1111  const uint32x4_p8 c, const uint32x4_p8 d)
1112 {
1113  const uint8x16_p8 m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1114  const uint8x16_p8 m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1115  return vec_perm(vec_perm(a,b,m1), vec_perm(c,d,m1), m2);
1116 }
1117 
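// VectorShiftLeft<L> rotates the vector left by L bytes using vec_sld.
// Because vec_sld is specified in big-endian byte order, the amount is
// mirrored to (16-L) on little-endian targets so the template argument can be
// given in element-order terms. The <0> and <16> specializations are no-ops.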
1118 template <unsigned int L> static inline
1119 uint32x4_p8 VectorShiftLeft(const uint32x4_p8 val)
1120 {
1121 #if (defined(CRYPTOPP_LITTLE_ENDIAN))
1122  return (uint32x4_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, (16-L)&0xf);
1123 #else
1124  return (uint32x4_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, L&0xf);
1125 #endif
1126 }
1127 
1128 template <>
1129 uint32x4_p8 VectorShiftLeft<0>(const uint32x4_p8 val) { return val; }
1130 
1131 template <>
1132 uint32x4_p8 VectorShiftLeft<16>(const uint32x4_p8 val) { return val; }
1133 
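// SHA256_ROUND1 handles rounds 0-15, where W[i] comes straight from the
// message. SHA256_ROUND2 handles the remaining rounds, extending the schedule
// with sigma0/sigma1 and treating W[] as a 16-entry circular buffer (the IDX*
// values). Each working variable is carried in the leading word of its own
// S[] vector: S[B]..S[D] and S[F]..S[H] start as byte rotations of S[A] and
// S[E], and VectorPack gathers the leading words back into abcd/efgh after
// each block.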
1134 template <unsigned int R> static inline
1135 void SHA256_ROUND1(uint32x4_p8 W[16], uint32x4_p8 S[8], const uint32x4_p8 K, const uint32x4_p8 M)
1136 {
1137  uint32x4_p8 T1, T2;
1138 
1139  W[R] = M;
1140  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1141  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1142 
1143  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1144  S[E] = S[D] + T1;
1145  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1146  S[A] = T1 + T2;
1147 }
1148 
1149 template <unsigned int R> static inline
1150 void SHA256_ROUND2(uint32x4_p8 W[16], uint32x4_p8 S[8], const uint32x4_p8 K)
1151 {
1152  // Indexes into the W[] array
1153  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1154 
1155  const uint32x4_p8 s0 = Vector_sigma0(W[IDX1]);
1156  const uint32x4_p8 s1 = Vector_sigma1(W[IDX14]);
1157 
1158  uint32x4_p8 T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1159  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1160  uint32x4_p8 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1161 
1162  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1163  S[E] = S[D] + T1;
1164  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1165  S[A] = T1 + T2;
1166 }
1167 
1168 void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1169 {
1170  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1171  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1172  CRYPTOPP_UNUSED(order);
1173 
1174  const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1175  const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1176 
1177  uint32x4_p8 abcd = VectorLoad32x4u(state+0, 0);
1178  uint32x4_p8 efgh = VectorLoad32x4u(state+4, 0);
1179  uint32x4_p8 W[16], S[8], vm, vk;
1180 
1181  size_t blocks = length / SHA256::BLOCKSIZE;
1182  while (blocks--)
1183  {
1184  unsigned int i, offset=0;
1185 
1186  S[A] = abcd; S[E] = efgh;
1187  S[B] = VectorShiftLeft<4>(S[A]);
1188  S[F] = VectorShiftLeft<4>(S[E]);
1189  S[C] = VectorShiftLeft<4>(S[B]);
1190  S[G] = VectorShiftLeft<4>(S[F]);
1191  S[D] = VectorShiftLeft<4>(S[C]);
1192  S[H] = VectorShiftLeft<4>(S[G]);
1193 
1194  // Unroll the loop to provide the round number as a constexpr
1195  // for (unsigned int i=0; i<16; ++i)
1196  {
1197  vk = VectorLoad32x4(k, offset);
1198  vm = VectorLoadMsg32x4(m, offset);
1199  SHA256_ROUND1<0>(W,S, vk,vm);
1200  offset+=16;
1201 
1202  vk = VectorShiftLeft<4>(vk);
1203  vm = VectorShiftLeft<4>(vm);
1204  SHA256_ROUND1<1>(W,S, vk,vm);
1205 
1206  vk = VectorShiftLeft<4>(vk);
1207  vm = VectorShiftLeft<4>(vm);
1208  SHA256_ROUND1<2>(W,S, vk,vm);
1209 
1210  vk = VectorShiftLeft<4>(vk);
1211  vm = VectorShiftLeft<4>(vm);
1212  SHA256_ROUND1<3>(W,S, vk,vm);
1213 
1214  vk = VectorLoad32x4(k, offset);
1215  vm = VectorLoadMsg32x4(m, offset);
1216  SHA256_ROUND1<4>(W,S, vk,vm);
1217  offset+=16;
1218 
1219  vk = VectorShiftLeft<4>(vk);
1220  vm = VectorShiftLeft<4>(vm);
1221  SHA256_ROUND1<5>(W,S, vk,vm);
1222 
1223  vk = VectorShiftLeft<4>(vk);
1224  vm = VectorShiftLeft<4>(vm);
1225  SHA256_ROUND1<6>(W,S, vk,vm);
1226 
1227  vk = VectorShiftLeft<4>(vk);
1228  vm = VectorShiftLeft<4>(vm);
1229  SHA256_ROUND1<7>(W,S, vk,vm);
1230 
1231  vk = VectorLoad32x4(k, offset);
1232  vm = VectorLoadMsg32x4(m, offset);
1233  SHA256_ROUND1<8>(W,S, vk,vm);
1234  offset+=16;
1235 
1236  vk = VectorShiftLeft<4>(vk);
1237  vm = VectorShiftLeft<4>(vm);
1238  SHA256_ROUND1<9>(W,S, vk,vm);
1239 
1240  vk = VectorShiftLeft<4>(vk);
1241  vm = VectorShiftLeft<4>(vm);
1242  SHA256_ROUND1<10>(W,S, vk,vm);
1243 
1244  vk = VectorShiftLeft<4>(vk);
1245  vm = VectorShiftLeft<4>(vm);
1246  SHA256_ROUND1<11>(W,S, vk,vm);
1247 
1248  vk = VectorLoad32x4(k, offset);
1249  vm = VectorLoadMsg32x4(m, offset);
1250  SHA256_ROUND1<12>(W,S, vk,vm);
1251  offset+=16;
1252 
1253  vk = VectorShiftLeft<4>(vk);
1254  vm = VectorShiftLeft<4>(vm);
1255  SHA256_ROUND1<13>(W,S, vk,vm);
1256 
1257  vk = VectorShiftLeft<4>(vk);
1258  vm = VectorShiftLeft<4>(vm);
1259  SHA256_ROUND1<14>(W,S, vk,vm);
1260 
1261  vk = VectorShiftLeft<4>(vk);
1262  vm = VectorShiftLeft<4>(vm);
1263  SHA256_ROUND1<15>(W,S, vk,vm);
1264  }
1265 
1266  m += 16; // 32-bit words, not bytes
1267 
1268  for (i=16; i<64; i+=16)
1269  {
1270  vk = VectorLoad32x4(k, offset);
1271  SHA256_ROUND2<0>(W,S, vk);
1272  SHA256_ROUND2<1>(W,S, VectorShiftLeft<4>(vk));
1273  SHA256_ROUND2<2>(W,S, VectorShiftLeft<8>(vk));
1274  SHA256_ROUND2<3>(W,S, VectorShiftLeft<12>(vk));
1275  offset+=16;
1276 
1277  vk = VectorLoad32x4(k, offset);
1278  SHA256_ROUND2<4>(W,S, vk);
1279  SHA256_ROUND2<5>(W,S, VectorShiftLeft<4>(vk));
1280  SHA256_ROUND2<6>(W,S, VectorShiftLeft<8>(vk));
1281  SHA256_ROUND2<7>(W,S, VectorShiftLeft<12>(vk));
1282  offset+=16;
1283 
1284  vk = VectorLoad32x4(k, offset);
1285  SHA256_ROUND2<8>(W,S, vk);
1286  SHA256_ROUND2<9>(W,S, VectorShiftLeft<4>(vk));
1287  SHA256_ROUND2<10>(W,S, VectorShiftLeft<8>(vk));
1288  SHA256_ROUND2<11>(W,S, VectorShiftLeft<12>(vk));
1289  offset+=16;
1290 
1291  vk = VectorLoad32x4(k, offset);
1292  SHA256_ROUND2<12>(W,S, vk);
1293  SHA256_ROUND2<13>(W,S, VectorShiftLeft<4>(vk));
1294  SHA256_ROUND2<14>(W,S, VectorShiftLeft<8>(vk));
1295  SHA256_ROUND2<15>(W,S, VectorShiftLeft<12>(vk));
1296  offset+=16;
1297  }
1298 
1299  abcd += VectorPack(S[A],S[B],S[C],S[D]);
1300  efgh += VectorPack(S[E],S[F],S[G],S[H]);
1301  }
1302 
1303  VectorStore32x4u(abcd, state+0, 0);
1304  VectorStore32x4u(efgh, state+4, 0);
1305 }
1306 
1307 static inline
1308 uint64x2_p8 VectorPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
1309 {
1310  return (uint64x2_p8)vec_perm(val, val, mask);
1311 }
1312 
1313 // Aligned load
1314 template <class T> static inline
1315 uint64x2_p8 VectorLoad64x2(const T* data, int offset)
1316 {
1317  return (uint64x2_p8)vec_ld(offset, (const uint8_t*)data);
1318 }
1319 
1320 // Unaligned load
1321 template <class T> static inline
1322 uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
1323 {
1324 #if defined(CRYPTOPP_XLC_VERSION)
1325  return (uint64x2_p8)vec_xl(offset, (const uint8_t*)data);
1326 #else
1327  return (uint64x2_p8)vec_vsx_ld(offset, (const uint8_t*)data);
1328 #endif
1329 }
1330 
1331 // Aligned store
1332 template <class T> static inline
1333 void VectorStore64x2(const uint64x2_p8 val, T* data, int offset)
1334 {
1335  vec_st((uint8x16_p8)val, offset, (uint8_t*)data);
1336 }
1337 
1338 // Unaligned store
1339 template <class T> static inline
1340 void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
1341 {
1342 #if defined(CRYPTOPP_XLC_VERSION)
1343  vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
1344 #else
1345  vec_vsx_st((uint8x16_p8)val, offset, (uint8_t*)data);
1346 #endif
1347 }
1348 
1349 // Unaligned load of a user message. The load is big-endian,
1350 // and then the message is permuted for 64-bit words.
1351 template <class T> static inline
1352 uint64x2_p8 VectorLoadMsg64x2(const T* data, int offset)
1353 {
1354 #if defined(CRYPTOPP_LITTLE_ENDIAN)
1355  const uint8x16_p8 mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1356  return VectorPermute64x2(VectorLoad64x2u(data, offset), mask);
1357 #else
1358  return VectorLoad64x2u(data, offset);
1359 #endif
1360 }
1361 
1362 static inline
1363 uint64x2_p8 VectorCh(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
1364 {
1365  // The trick below is due to Andy Polyakov and Jack Lloyd
1366  return vec_sel(z,y,x);
1367 }
1368 
1369 static inline
1370 uint64x2_p8 VectorMaj(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
1371 {
1372  // The trick below is due to Andy Polyakov and Jack Lloyd
1373  return vec_sel(y, z, vec_xor(x, y));
1374 }
1375 
1376 static inline
1377 uint64x2_p8 Vector_sigma0(const uint64x2_p8 val)
1378 {
1379 #if defined(CRYPTOPP_XLC_VERSION)
1380  return __vshasigmad(val, 0, 0);
1381 #else
1382  return __builtin_crypto_vshasigmad(val, 0, 0);
1383 #endif
1384 }
1385 
1386 static inline
1387 uint64x2_p8 Vector_sigma1(const uint64x2_p8 val)
1388 {
1389 #if defined(CRYPTOPP_XLC_VERSION)
1390  return __vshasigmad(val, 0, 0xf);
1391 #else
1392  return __builtin_crypto_vshasigmad(val, 0, 0xf);
1393 #endif
1394 }
1395 
1396 static inline
1397 uint64x2_p8 VectorSigma0(const uint64x2_p8 val)
1398 {
1399 #if defined(CRYPTOPP_XLC_VERSION)
1400  return __vshasigmad(val, 1, 0);
1401 #else
1402  return __builtin_crypto_vshasigmad(val, 1, 0);
1403 #endif
1404 }
1405 
1406 static inline
1407 uint64x2_p8 VectorSigma1(const uint64x2_p8 val)
1408 {
1409 #if defined(CRYPTOPP_XLC_VERSION)
1410  return __vshasigmad(val, 1, 0xf);
1411 #else
1412  return __builtin_crypto_vshasigmad(val, 1, 0xf);
1413 #endif
1414 }
1415 
1416 static inline
1417 uint64x2_p8 VectorPack(const uint64x2_p8 x, const uint64x2_p8 y)
1418 {
1419  const uint8x16_p8 m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1420  return vec_perm(x,y,m);
1421 }
1422 
1423 template <unsigned int L> static inline
1424 uint64x2_p8 VectorShiftLeft(const uint64x2_p8 val)
1425 {
1426 #if (defined(CRYPTOPP_LITTLE_ENDIAN))
1427  return (uint64x2_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, (16-L)&0xf);
1428 #else
1429  return (uint64x2_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, L&0xf);
1430 #endif
1431 }
1432 
1433 template <>
1434 uint64x2_p8 VectorShiftLeft<0>(const uint64x2_p8 val) { return val; }
1435 
1436 template <>
1437 uint64x2_p8 VectorShiftLeft<16>(const uint64x2_p8 val) { return val; }
1438 
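// SHA512_ROUND1 and SHA512_ROUND2 mirror the 32-bit helpers above, but
// operate on 64-bit doublewords (vshasigmad rather than vshasigmaw). W[] is
// again a 16-entry circular buffer, and the state is repacked two words per
// vector (ab, cd, ef, gh) after each block.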
1439 template <unsigned int R> static inline
1440 void SHA512_ROUND1(uint64x2_p8 W[16], uint64x2_p8 S[8], const uint64x2_p8 K, const uint64x2_p8 M)
1441 {
1442  uint64x2_p8 T1, T2;
1443 
1444  W[R] = M;
1445  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1446  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1447 
1448  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1449  S[E] = S[D] + T1;
1450  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1451  S[A] = T1 + T2;
1452 }
1453 
1454 template <unsigned int R> static inline
1455 void SHA512_ROUND2(uint64x2_p8 W[16], uint64x2_p8 S[8], const uint64x2_p8 K)
1456 {
1457  // Indexes into the W[] array
1458  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1459 
1460  const uint64x2_p8 s0 = Vector_sigma0(W[IDX1]);
1461  const uint64x2_p8 s1 = Vector_sigma1(W[IDX14]);
1462 
1463  uint64x2_p8 T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1464  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1465  uint64x2_p8 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1466 
1467  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1468  S[E] = S[D] + T1;
1469  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1470  S[A] = T1 + T2;
1471 }
1472 
1473 void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1474 {
1475  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1476  CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1477  CRYPTOPP_UNUSED(order);
1478 
1479  const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1480  const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1481 
1482  uint64x2_p8 ab = VectorLoad64x2u(state+0, 0);
1483  uint64x2_p8 cd = VectorLoad64x2u(state+2, 0);
1484  uint64x2_p8 ef = VectorLoad64x2u(state+4, 0);
1485  uint64x2_p8 gh = VectorLoad64x2u(state+6, 0);
1486  uint64x2_p8 W[16], S[8], vm, vk;
1487 
1488  size_t blocks = length / SHA512::BLOCKSIZE;
1489  while (blocks--)
1490  {
1491  unsigned int i, offset=0;
1492 
1493  S[A] = ab; S[C] = cd;
1494  S[E] = ef; S[G] = gh;
1495  S[B] = VectorShiftLeft<8>(S[A]);
1496  S[D] = VectorShiftLeft<8>(S[C]);
1497  S[F] = VectorShiftLeft<8>(S[E]);
1498  S[H] = VectorShiftLeft<8>(S[G]);
1499 
1500  // Unroll the loop to provide the round number as a constexpr
1501  // for (unsigned int i=0; i<16; ++i)
1502  {
1503  vk = VectorLoad64x2(k, offset);
1504  vm = VectorLoadMsg64x2(m, offset);
1505  SHA512_ROUND1<0>(W,S, vk,vm);
1506  offset+=16;
1507 
1508  vk = VectorShiftLeft<8>(vk);
1509  vm = VectorShiftLeft<8>(vm);
1510  SHA512_ROUND1<1>(W,S, vk,vm);
1511 
1512  vk = VectorLoad64x2(k, offset);
1513  vm = VectorLoadMsg64x2(m, offset);
1514  SHA512_ROUND1<2>(W,S, vk,vm);
1515  offset+=16;
1516 
1517  vk = VectorShiftLeft<8>(vk);
1518  vm = VectorShiftLeft<8>(vm);
1519  SHA512_ROUND1<3>(W,S, vk,vm);
1520 
1521  vk = VectorLoad64x2(k, offset);
1522  vm = VectorLoadMsg64x2(m, offset);
1523  SHA512_ROUND1<4>(W,S, vk,vm);
1524  offset+=16;
1525 
1526  vk = VectorShiftLeft<8>(vk);
1527  vm = VectorShiftLeft<8>(vm);
1528  SHA512_ROUND1<5>(W,S, vk,vm);
1529 
1530  vk = VectorLoad64x2(k, offset);
1531  vm = VectorLoadMsg64x2(m, offset);
1532  SHA512_ROUND1<6>(W,S, vk,vm);
1533  offset+=16;
1534 
1535  vk = VectorShiftLeft<8>(vk);
1536  vm = VectorShiftLeft<8>(vm);
1537  SHA512_ROUND1<7>(W,S, vk,vm);
1538 
1539  vk = VectorLoad64x2(k, offset);
1540  vm = VectorLoadMsg64x2(m, offset);
1541  SHA512_ROUND1<8>(W,S, vk,vm);
1542  offset+=16;
1543 
1544  vk = VectorShiftLeft<8>(vk);
1545  vm = VectorShiftLeft<8>(vm);
1546  SHA512_ROUND1<9>(W,S, vk,vm);
1547 
1548  vk = VectorLoad64x2(k, offset);
1549  vm = VectorLoadMsg64x2(m, offset);
1550  SHA512_ROUND1<10>(W,S, vk,vm);
1551  offset+=16;
1552 
1553  vk = VectorShiftLeft<8>(vk);
1554  vm = VectorShiftLeft<8>(vm);
1555  SHA512_ROUND1<11>(W,S, vk,vm);
1556 
1557  vk = VectorLoad64x2(k, offset);
1558  vm = VectorLoadMsg64x2(m, offset);
1559  SHA512_ROUND1<12>(W,S, vk,vm);
1560  offset+=16;
1561 
1562  vk = VectorShiftLeft<8>(vk);
1563  vm = VectorShiftLeft<8>(vm);
1564  SHA512_ROUND1<13>(W,S, vk,vm);
1565 
1566  vk = VectorLoad64x2(k, offset);
1567  vm = VectorLoadMsg64x2(m, offset);
1568  SHA512_ROUND1<14>(W,S, vk,vm);
1569  offset+=16;
1570 
1571  vk = VectorShiftLeft<8>(vk);
1572  vm = VectorShiftLeft<8>(vm);
1573  SHA512_ROUND1<15>(W,S, vk,vm);
1574  }
1575 
1576  m += 16; // 64-bit words, not bytes
1577 
1578  for (i=16 ; i<80; i+=16)
1579  {
1580  vk = VectorLoad64x2(k, offset);
1581  SHA512_ROUND2<0>(W,S, vk);
1582  SHA512_ROUND2<1>(W,S, VectorShiftLeft<8>(vk));
1583  offset+=16;
1584 
1585  vk = VectorLoad64x2(k, offset);
1586  SHA512_ROUND2<2>(W,S, vk);
1587  SHA512_ROUND2<3>(W,S, VectorShiftLeft<8>(vk));
1588  offset+=16;
1589 
1590  vk = VectorLoad64x2(k, offset);
1591  SHA512_ROUND2<4>(W,S, vk);
1592  SHA512_ROUND2<5>(W,S, VectorShiftLeft<8>(vk));
1593  offset+=16;
1594 
1595  vk = VectorLoad64x2(k, offset);
1596  SHA512_ROUND2<6>(W,S, vk);
1597  SHA512_ROUND2<7>(W,S, VectorShiftLeft<8>(vk));
1598  offset+=16;
1599 
1600  vk = VectorLoad64x2(k, offset);
1601  SHA512_ROUND2<8>(W,S, vk);
1602  SHA512_ROUND2<9>(W,S, VectorShiftLeft<8>(vk));
1603  offset+=16;
1604 
1605  vk = VectorLoad64x2(k, offset);
1606  SHA512_ROUND2<10>(W,S, vk);
1607  SHA512_ROUND2<11>(W,S, VectorShiftLeft<8>(vk));
1608  offset+=16;
1609 
1610  vk = VectorLoad64x2(k, offset);
1611  SHA512_ROUND2<12>(W,S, vk);
1612  SHA512_ROUND2<13>(W,S, VectorShiftLeft<8>(vk));
1613  offset+=16;
1614 
1615  vk = VectorLoad64x2(k, offset);
1616  SHA512_ROUND2<14>(W,S, vk);
1617  SHA512_ROUND2<15>(W,S, VectorShiftLeft<8>(vk));
1618  offset+=16;
1619  }
1620 
1621  ab += VectorPack(S[A],S[B]);
1622  cd += VectorPack(S[C],S[D]);
1623  ef += VectorPack(S[E],S[F]);
1624  gh += VectorPack(S[G],S[H]);
1625  }
1626 
1627  VectorStore64x2u(ab, state+0, 0);
1628  VectorStore64x2u(cd, state+2, 0);
1629  VectorStore64x2u(ef, state+4, 0);
1630  VectorStore64x2u(gh, state+6, 0);
1631 }
1632 
1633 #endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1634 
1635 ////////////////////////////////////////////////
1636 // end Gustavo, Serra, Scalet and Walton code //
1637 ////////////////////////////////////////////////
1638 
1639 NAMESPACE_END