#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
# undef CRYPTOPP_ARM_NEON_AVAILABLE
#endif

#if (CRYPTOPP_SSE41_AVAILABLE)
# include <emmintrin.h>
# include <tmmintrin.h>
# include <smmintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
# include <arm_acle.h>
#endif

#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))

// Workaround for toolchains without a usable _mm_set_epi64x (older Sun Studio,
// older MSVC, and 32-bit MSVC builds): build the vector through memcpy instead.
#if CRYPTOPP_SSE2_INTRIN_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || \
    (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600))
inline __m128i MM_SET_EPI64X(const word64 a, const word64 b)
{
    const word64 t[2] = {b,a}; __m128i r;
    std::memcpy(&r, t, sizeof(t));
    return r;
}
#else
# define MM_SET_EPI64X(a, b) _mm_set_epi64x(a, b)
#endif

ANONYMOUS_NAMESPACE_BEGIN
// Initialization vectors for BLAKE2s and BLAKE2b
CRYPTOPP_ALIGN_DATA(16)
const word32 BLAKE2S_IV[8] = {
    0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
    0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};

CRYPTOPP_ALIGN_DATA(16)
const word64 BLAKE2B_IV[8] = {
    W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
    W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
    W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
    W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)
};

ANONYMOUS_NAMESPACE_END
#if CRYPTOPP_SSE41_AVAILABLE

// Signature reconstructed (the exact state type is an assumption); the body
// compresses one 64-byte block using input, state.h, state.t and state.f.
void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2s_State& state)
{
    __m128i row1, row2, row3, row4;
    __m128i buf1, buf2, buf3, buf4;
    __m128i t0, t1, t2;
    __m128i ff0, ff1;
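    // r8 and r16 are pshufb masks: within each 32-bit lane they reorder the
    // bytes so that _mm_shuffle_epi8 performs a rotate-right by 8 and by 16
    // bits, the two byte-aligned rotations used by the BLAKE2s G function.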
    const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
    const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);

    const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
    const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
    const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
    const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));

    row1 = ff0 = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
    row2 = ff1 = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
    row3 = _mm_setr_epi32(BLAKE2S_IV[0], BLAKE2S_IV[1], BLAKE2S_IV[2], BLAKE2S_IV[3]);
    row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
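    // State setup: row1/row2 hold the chaining value h[0..7] (also kept in
    // ff0/ff1 for the final feed-forward), row3 holds IV[0..3], and row4 holds
    // IV[4..7] XORed with the counter and finalization words, which the code
    // loads as one 16-byte block starting at state.t[0] (t and f are assumed
    // to be adjacent in the state).
    //
    // Round 1 of 10 (sigma row 0, the identity permutation). Each unrolled
    // round gathers its message words into buf1..buf4, applies G to the four
    // columns, rotates the rows into diagonal form, applies G to the diagonals,
    // and rotates back. Rotations by 16 and 8 use the pshufb masks above;
    // rotations by 12 and 7 use paired srli/slli shifts combined with XOR.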
101 buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
103 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
104 row4 = _mm_xor_si128(row4, row1);
105 row4 = _mm_shuffle_epi8(row4,r16);
106 row3 = _mm_add_epi32(row3, row4);
107 row2 = _mm_xor_si128(row2, row3);
108 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
110 buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
112 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
113 row4 = _mm_xor_si128(row4, row1);
114 row4 = _mm_shuffle_epi8(row4,r8);
115 row3 = _mm_add_epi32(row3, row4);
116 row2 = _mm_xor_si128(row2, row3);
117 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
119 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
120 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
121 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
123 buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
125 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
126 row4 = _mm_xor_si128(row4, row1);
127 row4 = _mm_shuffle_epi8(row4,r16);
128 row3 = _mm_add_epi32(row3, row4);
129 row2 = _mm_xor_si128(row2, row3);
130 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
132 buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
134 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
135 row4 = _mm_xor_si128(row4, row1);
136 row4 = _mm_shuffle_epi8(row4,r8);
137 row3 = _mm_add_epi32(row3, row4);
138 row2 = _mm_xor_si128(row2, row3);
139 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
141 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
142 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
143 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
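    // Round 2 (sigma row 1)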
145 t0 = _mm_blend_epi16(m1, m2, 0x0C);
146 t1 = _mm_slli_si128(m3, 4);
147 t2 = _mm_blend_epi16(t0, t1, 0xF0);
148 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
150 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
151 row4 = _mm_xor_si128(row4, row1);
152 row4 = _mm_shuffle_epi8(row4,r16);
153 row3 = _mm_add_epi32(row3, row4);
154 row2 = _mm_xor_si128(row2, row3);
155 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
157 t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
158 t1 = _mm_blend_epi16(m1,m3,0xC0);
159 t2 = _mm_blend_epi16(t0, t1, 0xF0);
160 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
162 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
163 row4 = _mm_xor_si128(row4, row1);
164 row4 = _mm_shuffle_epi8(row4,r8);
165 row3 = _mm_add_epi32(row3, row4);
166 row2 = _mm_xor_si128(row2, row3);
167 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
169 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
170 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
171 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
173 t0 = _mm_slli_si128(m1, 4);
174 t1 = _mm_blend_epi16(m2, t0, 0x30);
175 t2 = _mm_blend_epi16(m0, t1, 0xF0);
176 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
178 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
179 row4 = _mm_xor_si128(row4, row1);
180 row4 = _mm_shuffle_epi8(row4,r16);
181 row3 = _mm_add_epi32(row3, row4);
182 row2 = _mm_xor_si128(row2, row3);
183 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
185 t0 = _mm_unpackhi_epi32(m0,m1);
186 t1 = _mm_slli_si128(m3, 4);
187 t2 = _mm_blend_epi16(t0, t1, 0x0C);
188 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
190 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
191 row4 = _mm_xor_si128(row4, row1);
192 row4 = _mm_shuffle_epi8(row4,r8);
193 row3 = _mm_add_epi32(row3, row4);
194 row2 = _mm_xor_si128(row2, row3);
195 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
197 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
198 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
199 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
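    // Round 3 (sigma row 2)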
201 t0 = _mm_unpackhi_epi32(m2,m3);
202 t1 = _mm_blend_epi16(m3,m1,0x0C);
203 t2 = _mm_blend_epi16(t0, t1, 0x0F);
204 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
206 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
207 row4 = _mm_xor_si128(row4, row1);
208 row4 = _mm_shuffle_epi8(row4,r16);
209 row3 = _mm_add_epi32(row3, row4);
210 row2 = _mm_xor_si128(row2, row3);
211 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
213 t0 = _mm_unpacklo_epi32(m2,m0);
214 t1 = _mm_blend_epi16(t0, m0, 0xF0);
215 t2 = _mm_slli_si128(m3, 8);
216 buf2 = _mm_blend_epi16(t1, t2, 0xC0);
218 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
219 row4 = _mm_xor_si128(row4, row1);
220 row4 = _mm_shuffle_epi8(row4,r8);
221 row3 = _mm_add_epi32(row3, row4);
222 row2 = _mm_xor_si128(row2, row3);
223 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
225 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
226 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
227 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
229 t0 = _mm_blend_epi16(m0, m2, 0x3C);
230 t1 = _mm_srli_si128(m1, 12);
231 t2 = _mm_blend_epi16(t0,t1,0x03);
232 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
234 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
235 row4 = _mm_xor_si128(row4, row1);
236 row4 = _mm_shuffle_epi8(row4,r16);
237 row3 = _mm_add_epi32(row3, row4);
238 row2 = _mm_xor_si128(row2, row3);
239 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
241 t0 = _mm_slli_si128(m3, 4);
242 t1 = _mm_blend_epi16(m0, m1, 0x33);
243 t2 = _mm_blend_epi16(t1, t0, 0xC0);
244 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
246 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
247 row4 = _mm_xor_si128(row4, row1);
248 row4 = _mm_shuffle_epi8(row4,r8);
249 row3 = _mm_add_epi32(row3, row4);
250 row2 = _mm_xor_si128(row2, row3);
251 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
253 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
254 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
255 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
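    // Round 4 (sigma row 3)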
257 t0 = _mm_unpackhi_epi32(m0,m1);
258 t1 = _mm_unpackhi_epi32(t0, m2);
259 t2 = _mm_blend_epi16(t1, m3, 0x0C);
260 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
262 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
263 row4 = _mm_xor_si128(row4, row1);
264 row4 = _mm_shuffle_epi8(row4,r16);
265 row3 = _mm_add_epi32(row3, row4);
266 row2 = _mm_xor_si128(row2, row3);
267 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
269 t0 = _mm_slli_si128(m2, 8);
270 t1 = _mm_blend_epi16(m3,m0,0x0C);
271 t2 = _mm_blend_epi16(t1, t0, 0xC0);
272 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
274 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
275 row4 = _mm_xor_si128(row4, row1);
276 row4 = _mm_shuffle_epi8(row4,r8);
277 row3 = _mm_add_epi32(row3, row4);
278 row2 = _mm_xor_si128(row2, row3);
279 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
281 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
282 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
283 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
285 t0 = _mm_blend_epi16(m0,m1,0x0F);
286 t1 = _mm_blend_epi16(t0, m3, 0xC0);
287 buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
289 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
290 row4 = _mm_xor_si128(row4, row1);
291 row4 = _mm_shuffle_epi8(row4,r16);
292 row3 = _mm_add_epi32(row3, row4);
293 row2 = _mm_xor_si128(row2, row3);
294 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
296 t0 = _mm_unpacklo_epi32(m0,m2);
297 t1 = _mm_unpackhi_epi32(m1,m2);
298 buf4 = _mm_unpacklo_epi64(t1,t0);
300 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
301 row4 = _mm_xor_si128(row4, row1);
302 row4 = _mm_shuffle_epi8(row4,r8);
303 row3 = _mm_add_epi32(row3, row4);
304 row2 = _mm_xor_si128(row2, row3);
305 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
307 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
308 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
309 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
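    // Round 5 (sigma row 4)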
311 t0 = _mm_unpacklo_epi64(m1,m2);
312 t1 = _mm_unpackhi_epi64(m0,m2);
313 t2 = _mm_blend_epi16(t0,t1,0x33);
314 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
316 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
317 row4 = _mm_xor_si128(row4, row1);
318 row4 = _mm_shuffle_epi8(row4,r16);
319 row3 = _mm_add_epi32(row3, row4);
320 row2 = _mm_xor_si128(row2, row3);
321 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
323 t0 = _mm_unpackhi_epi64(m1,m3);
324 t1 = _mm_unpacklo_epi64(m0,m1);
325 buf2 = _mm_blend_epi16(t0,t1,0x33);
327 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
328 row4 = _mm_xor_si128(row4, row1);
329 row4 = _mm_shuffle_epi8(row4,r8);
330 row3 = _mm_add_epi32(row3, row4);
331 row2 = _mm_xor_si128(row2, row3);
332 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
334 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
335 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
336 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
338 t0 = _mm_unpackhi_epi64(m3,m1);
339 t1 = _mm_unpackhi_epi64(m2,m0);
340 buf3 = _mm_blend_epi16(t1,t0,0x33);
342 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
343 row4 = _mm_xor_si128(row4, row1);
344 row4 = _mm_shuffle_epi8(row4,r16);
345 row3 = _mm_add_epi32(row3, row4);
346 row2 = _mm_xor_si128(row2, row3);
347 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
349 t0 = _mm_blend_epi16(m0,m2,0x03);
350 t1 = _mm_slli_si128(t0, 8);
351 t2 = _mm_blend_epi16(t1,m3,0x0F);
352 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
354 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
355 row4 = _mm_xor_si128(row4, row1);
356 row4 = _mm_shuffle_epi8(row4,r8);
357 row3 = _mm_add_epi32(row3, row4);
358 row2 = _mm_xor_si128(row2, row3);
359 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
361 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
362 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
363 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
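    // Round 6 (sigma row 5)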
365 t0 = _mm_unpackhi_epi32(m0,m1);
366 t1 = _mm_unpacklo_epi32(m0,m2);
367 buf1 = _mm_unpacklo_epi64(t0,t1);
369 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
370 row4 = _mm_xor_si128(row4, row1);
371 row4 = _mm_shuffle_epi8(row4,r16);
372 row3 = _mm_add_epi32(row3, row4);
373 row2 = _mm_xor_si128(row2, row3);
374 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
376 t0 = _mm_srli_si128(m2, 4);
377 t1 = _mm_blend_epi16(m0,m3,0x03);
378 buf2 = _mm_blend_epi16(t1,t0,0x3C);
380 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
381 row4 = _mm_xor_si128(row4, row1);
382 row4 = _mm_shuffle_epi8(row4,r8);
383 row3 = _mm_add_epi32(row3, row4);
384 row2 = _mm_xor_si128(row2, row3);
385 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
387 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
388 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
389 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
391 t0 = _mm_blend_epi16(m1,m0,0x0C);
392 t1 = _mm_srli_si128(m3, 4);
393 t2 = _mm_blend_epi16(t0,t1,0x30);
394 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
396 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
397 row4 = _mm_xor_si128(row4, row1);
398 row4 = _mm_shuffle_epi8(row4,r16);
399 row3 = _mm_add_epi32(row3, row4);
400 row2 = _mm_xor_si128(row2, row3);
401 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
403 t0 = _mm_unpacklo_epi64(m1,m2);
404 t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
405 buf4 = _mm_blend_epi16(t0,t1,0x33);
407 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
408 row4 = _mm_xor_si128(row4, row1);
409 row4 = _mm_shuffle_epi8(row4,r8);
410 row3 = _mm_add_epi32(row3, row4);
411 row2 = _mm_xor_si128(row2, row3);
412 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
414 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
415 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
416 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
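    // Round 7 (sigma row 6)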
418 t0 = _mm_slli_si128(m1, 12);
419 t1 = _mm_blend_epi16(m0,m3,0x33);
420 buf1 = _mm_blend_epi16(t1,t0,0xC0);
422 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
423 row4 = _mm_xor_si128(row4, row1);
424 row4 = _mm_shuffle_epi8(row4,r16);
425 row3 = _mm_add_epi32(row3, row4);
426 row2 = _mm_xor_si128(row2, row3);
427 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
429 t0 = _mm_blend_epi16(m3,m2,0x30);
430 t1 = _mm_srli_si128(m1, 4);
431 t2 = _mm_blend_epi16(t0,t1,0x03);
432 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
434 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
435 row4 = _mm_xor_si128(row4, row1);
436 row4 = _mm_shuffle_epi8(row4,r8);
437 row3 = _mm_add_epi32(row3, row4);
438 row2 = _mm_xor_si128(row2, row3);
439 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
441 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
442 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
443 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
445 t0 = _mm_unpacklo_epi64(m0,m2);
446 t1 = _mm_srli_si128(m1, 4);
447 buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
449 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
450 row4 = _mm_xor_si128(row4, row1);
451 row4 = _mm_shuffle_epi8(row4,r16);
452 row3 = _mm_add_epi32(row3, row4);
453 row2 = _mm_xor_si128(row2, row3);
454 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
456 t0 = _mm_unpackhi_epi32(m1,m2);
457 t1 = _mm_unpackhi_epi64(m0,t0);
458 buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
460 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
461 row4 = _mm_xor_si128(row4, row1);
462 row4 = _mm_shuffle_epi8(row4,r8);
463 row3 = _mm_add_epi32(row3, row4);
464 row2 = _mm_xor_si128(row2, row3);
465 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
467 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
468 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
469 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
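    // Round 8 (sigma row 7)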
471 t0 = _mm_unpackhi_epi32(m0,m1);
472 t1 = _mm_blend_epi16(t0,m3,0x0F);
473 buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
475 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
476 row4 = _mm_xor_si128(row4, row1);
477 row4 = _mm_shuffle_epi8(row4,r16);
478 row3 = _mm_add_epi32(row3, row4);
479 row2 = _mm_xor_si128(row2, row3);
480 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
482 t0 = _mm_blend_epi16(m2,m3,0x30);
483 t1 = _mm_srli_si128(m0,4);
484 t2 = _mm_blend_epi16(t0,t1,0x03);
485 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
487 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
488 row4 = _mm_xor_si128(row4, row1);
489 row4 = _mm_shuffle_epi8(row4,r8);
490 row3 = _mm_add_epi32(row3, row4);
491 row2 = _mm_xor_si128(row2, row3);
492 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
494 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
495 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
496 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
498 t0 = _mm_unpackhi_epi64(m0,m3);
499 t1 = _mm_unpacklo_epi64(m1,m2);
500 t2 = _mm_blend_epi16(t0,t1,0x3C);
501 buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
503 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
504 row4 = _mm_xor_si128(row4, row1);
505 row4 = _mm_shuffle_epi8(row4,r16);
506 row3 = _mm_add_epi32(row3, row4);
507 row2 = _mm_xor_si128(row2, row3);
508 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
510 t0 = _mm_unpacklo_epi32(m0,m1);
511 t1 = _mm_unpackhi_epi32(m1,m2);
512 buf4 = _mm_unpacklo_epi64(t0,t1);
514 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
515 row4 = _mm_xor_si128(row4, row1);
516 row4 = _mm_shuffle_epi8(row4,r8);
517 row3 = _mm_add_epi32(row3, row4);
518 row2 = _mm_xor_si128(row2, row3);
519 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
521 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
522 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
523 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
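    // Round 9 (sigma row 8)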
525 t0 = _mm_unpackhi_epi32(m1,m3);
526 t1 = _mm_unpacklo_epi64(t0,m0);
527 t2 = _mm_blend_epi16(t1,m2,0xC0);
528 buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
530 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
531 row4 = _mm_xor_si128(row4, row1);
532 row4 = _mm_shuffle_epi8(row4,r16);
533 row3 = _mm_add_epi32(row3, row4);
534 row2 = _mm_xor_si128(row2, row3);
535 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
537 t0 = _mm_unpackhi_epi32(m0,m3);
538 t1 = _mm_blend_epi16(m2,t0,0xF0);
539 buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
541 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
542 row4 = _mm_xor_si128(row4, row1);
543 row4 = _mm_shuffle_epi8(row4,r8);
544 row3 = _mm_add_epi32(row3, row4);
545 row2 = _mm_xor_si128(row2, row3);
546 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
548 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
549 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
550 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
552 t0 = _mm_blend_epi16(m2,m0,0x0C);
553 t1 = _mm_slli_si128(t0,4);
554 buf3 = _mm_blend_epi16(t1,m3,0x0F);
556 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
557 row4 = _mm_xor_si128(row4, row1);
558 row4 = _mm_shuffle_epi8(row4,r16);
559 row3 = _mm_add_epi32(row3, row4);
560 row2 = _mm_xor_si128(row2, row3);
561 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
563 t0 = _mm_blend_epi16(m1,m0,0x30);
564 buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
566 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
567 row4 = _mm_xor_si128(row4, row1);
568 row4 = _mm_shuffle_epi8(row4,r8);
569 row3 = _mm_add_epi32(row3, row4);
570 row2 = _mm_xor_si128(row2, row3);
571 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
573 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
574 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
575 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
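    // Round 10 (sigma row 9), the last BLAKE2s round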
577 t0 = _mm_blend_epi16(m0,m2,0x03);
578 t1 = _mm_blend_epi16(m1,m2,0x30);
579 t2 = _mm_blend_epi16(t1,t0,0x0F);
580 buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
582 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
583 row4 = _mm_xor_si128(row4, row1);
584 row4 = _mm_shuffle_epi8(row4,r16);
585 row3 = _mm_add_epi32(row3, row4);
586 row2 = _mm_xor_si128(row2, row3);
587 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
589 t0 = _mm_slli_si128(m0,4);
590 t1 = _mm_blend_epi16(m1,t0,0xC0);
591 buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
593 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
594 row4 = _mm_xor_si128(row4, row1);
595 row4 = _mm_shuffle_epi8(row4,r8);
596 row3 = _mm_add_epi32(row3, row4);
597 row2 = _mm_xor_si128(row2, row3);
598 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
600 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
601 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
602 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
604 t0 = _mm_unpackhi_epi32(m0,m3);
605 t1 = _mm_unpacklo_epi32(m2,m3);
606 t2 = _mm_unpackhi_epi64(t0,t1);
607 buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
609 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
610 row4 = _mm_xor_si128(row4, row1);
611 row4 = _mm_shuffle_epi8(row4,r16);
612 row3 = _mm_add_epi32(row3, row4);
613 row2 = _mm_xor_si128(row2, row3);
614 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
616 t0 = _mm_blend_epi16(m3,m2,0xC0);
617 t1 = _mm_unpacklo_epi32(m0,m3);
618 t2 = _mm_blend_epi16(t0,t1,0x0F);
619 buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
621 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
622 row4 = _mm_xor_si128(row4, row1);
623 row4 = _mm_shuffle_epi8(row4,r8);
624 row3 = _mm_add_epi32(row3, row4);
625 row2 = _mm_xor_si128(row2, row3);
626 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
628 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
629 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
630 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
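    // Feed-forward into the chaining value: h[0..3] = ff0 ^ row1 ^ row3 and
    // h[4..7] = ff1 ^ row2 ^ row4.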
    _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
    _mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
}

// Signature reconstructed (the exact state type is an assumption); the body
// compresses one 128-byte block using input, state.h, state.t and state.f.
void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2b_State& state)
{
    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1, t0, t1;
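    // r16 and r24 are pshufb masks that rotate each 64-bit lane right by 16
    // and 24 bits. The other two BLAKE2b rotations avoid pshufb: rotate by 32
    // is a _mm_shuffle_epi32 that swaps the 32-bit halves of each lane, and
    // rotate by 63 is a right shift by 63 XORed with the value added to itself
    // (a left shift by 1).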
    const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);

    const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00));
    const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16));
    const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32));
    const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48));
    const __m128i m4 = _mm_loadu_si128(CONST_M128_CAST(input + 64));
    const __m128i m5 = _mm_loadu_si128(CONST_M128_CAST(input + 80));
    const __m128i m6 = _mm_loadu_si128(CONST_M128_CAST(input + 96));
    const __m128i m7 = _mm_loadu_si128(CONST_M128_CAST(input + 112));

    row1l = _mm_loadu_si128(CONST_M128_CAST(&state.h[0]));
    row1h = _mm_loadu_si128(CONST_M128_CAST(&state.h[2]));
    row2l = _mm_loadu_si128(CONST_M128_CAST(&state.h[4]));
    row2h = _mm_loadu_si128(CONST_M128_CAST(&state.h[6]));
    row3l = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[0]));
    row3h = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[2]));
    row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[4])), _mm_loadu_si128(CONST_M128_CAST(&state.t[0])));
    row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[6])), _mm_loadu_si128(CONST_M128_CAST(&state.f[0])));
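    // State setup: row1l/h and row2l/h hold the chaining value h[0..7],
    // row3l/h holds IV[0..3], row4l = IV[4..5] ^ t[0..1] (the block counter)
    // and row4h = IV[6..7] ^ f[0..1] (the finalization flags).
    //
    // Round 1 of 12 (sigma row 0). Each unrolled round loads two message
    // pairs b0/b1, applies G to the columns, swaps the halves of rows 2-4 to
    // form the diagonals (the _mm_alignr_epi8 block), loads two more pairs,
    // applies G to the diagonals, and swaps back.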
665 b0 = _mm_unpacklo_epi64(m0, m1);
666 b1 = _mm_unpacklo_epi64(m2, m3);
667 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
668 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
669 row4l = _mm_xor_si128(row4l, row1l);
670 row4h = _mm_xor_si128(row4h, row1h);
671 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
672 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
673 row3l = _mm_add_epi64(row3l, row4l);
674 row3h = _mm_add_epi64(row3h, row4h);
675 row2l = _mm_xor_si128(row2l, row3l);
676 row2h = _mm_xor_si128(row2h, row3h);
677 row2l = _mm_shuffle_epi8(row2l, r24);
678 row2h = _mm_shuffle_epi8(row2h, r24);
680 b0 = _mm_unpackhi_epi64(m0, m1);
681 b1 = _mm_unpackhi_epi64(m2, m3);
683 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
684 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
685 row4l = _mm_xor_si128(row4l, row1l);
686 row4h = _mm_xor_si128(row4h, row1h);
687 row4l = _mm_shuffle_epi8(row4l, r16);
688 row4h = _mm_shuffle_epi8(row4h, r16);
689 row3l = _mm_add_epi64(row3l, row4l);
690 row3h = _mm_add_epi64(row3h, row4h);
691 row2l = _mm_xor_si128(row2l, row3l);
692 row2h = _mm_xor_si128(row2h, row3h);
693 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
694 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
696 t0 = _mm_alignr_epi8(row2h, row2l, 8);
697 t1 = _mm_alignr_epi8(row2l, row2h, 8);
698 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
699 t0 = _mm_alignr_epi8(row4h, row4l, 8);
700 t1 = _mm_alignr_epi8(row4l, row4h, 8);
701 row4l = t1, row4h = t0;
703 b0 = _mm_unpacklo_epi64(m4, m5);
704 b1 = _mm_unpacklo_epi64(m6, m7);
706 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
707 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
708 row4l = _mm_xor_si128(row4l, row1l);
709 row4h = _mm_xor_si128(row4h, row1h);
710 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
711 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
712 row3l = _mm_add_epi64(row3l, row4l);
713 row3h = _mm_add_epi64(row3h, row4h);
714 row2l = _mm_xor_si128(row2l, row3l);
715 row2h = _mm_xor_si128(row2h, row3h);
716 row2l = _mm_shuffle_epi8(row2l, r24);
717 row2h = _mm_shuffle_epi8(row2h, r24);
719 b0 = _mm_unpackhi_epi64(m4, m5);
720 b1 = _mm_unpackhi_epi64(m6, m7);
722 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
723 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
724 row4l = _mm_xor_si128(row4l, row1l);
725 row4h = _mm_xor_si128(row4h, row1h);
726 row4l = _mm_shuffle_epi8(row4l, r16);
727 row4h = _mm_shuffle_epi8(row4h, r16);
728 row3l = _mm_add_epi64(row3l, row4l);
729 row3h = _mm_add_epi64(row3h, row4h);
730 row2l = _mm_xor_si128(row2l, row3l);
731 row2h = _mm_xor_si128(row2h, row3h);
732 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
733 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
735 t0 = _mm_alignr_epi8(row2l, row2h, 8);
736 t1 = _mm_alignr_epi8(row2h, row2l, 8);
737 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
738 t0 = _mm_alignr_epi8(row4l, row4h, 8);
739 t1 = _mm_alignr_epi8(row4h, row4l, 8);
740 row4l = t1, row4h = t0;
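    // Round 2 (sigma row 1)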
742 b0 = _mm_unpacklo_epi64(m7, m2);
743 b1 = _mm_unpackhi_epi64(m4, m6);
745 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
746 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
747 row4l = _mm_xor_si128(row4l, row1l);
748 row4h = _mm_xor_si128(row4h, row1h);
749 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
750 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
751 row3l = _mm_add_epi64(row3l, row4l);
752 row3h = _mm_add_epi64(row3h, row4h);
753 row2l = _mm_xor_si128(row2l, row3l);
754 row2h = _mm_xor_si128(row2h, row3h);
755 row2l = _mm_shuffle_epi8(row2l, r24);
756 row2h = _mm_shuffle_epi8(row2h, r24);
758 b0 = _mm_unpacklo_epi64(m5, m4);
759 b1 = _mm_alignr_epi8(m3, m7, 8);
761 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
762 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
763 row4l = _mm_xor_si128(row4l, row1l);
764 row4h = _mm_xor_si128(row4h, row1h);
765 row4l = _mm_shuffle_epi8(row4l, r16);
766 row4h = _mm_shuffle_epi8(row4h, r16);
767 row3l = _mm_add_epi64(row3l, row4l);
768 row3h = _mm_add_epi64(row3h, row4h);
769 row2l = _mm_xor_si128(row2l, row3l);
770 row2h = _mm_xor_si128(row2h, row3h);
771 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
772 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
774 t0 = _mm_alignr_epi8(row2h, row2l, 8);
775 t1 = _mm_alignr_epi8(row2l, row2h, 8);
776 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
777 t0 = _mm_alignr_epi8(row4h, row4l, 8);
778 t1 = _mm_alignr_epi8(row4l, row4h, 8);
779 row4l = t1, row4h = t0;
781 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
782 b1 = _mm_unpackhi_epi64(m5, m2);
784 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
785 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
786 row4l = _mm_xor_si128(row4l, row1l);
787 row4h = _mm_xor_si128(row4h, row1h);
788 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
789 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
790 row3l = _mm_add_epi64(row3l, row4l);
791 row3h = _mm_add_epi64(row3h, row4h);
792 row2l = _mm_xor_si128(row2l, row3l);
793 row2h = _mm_xor_si128(row2h, row3h);
794 row2l = _mm_shuffle_epi8(row2l, r24);
795 row2h = _mm_shuffle_epi8(row2h, r24);
797 b0 = _mm_unpacklo_epi64(m6, m1);
798 b1 = _mm_unpackhi_epi64(m3, m1);
800 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
801 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
802 row4l = _mm_xor_si128(row4l, row1l);
803 row4h = _mm_xor_si128(row4h, row1h);
804 row4l = _mm_shuffle_epi8(row4l, r16);
805 row4h = _mm_shuffle_epi8(row4h, r16);
806 row3l = _mm_add_epi64(row3l, row4l);
807 row3h = _mm_add_epi64(row3h, row4h);
808 row2l = _mm_xor_si128(row2l, row3l);
809 row2h = _mm_xor_si128(row2h, row3h);
810 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
811 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
813 t0 = _mm_alignr_epi8(row2l, row2h, 8);
814 t1 = _mm_alignr_epi8(row2h, row2l, 8);
815 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
816 t0 = _mm_alignr_epi8(row4l, row4h, 8);
817 t1 = _mm_alignr_epi8(row4h, row4l, 8);
818 row4l = t1, row4h = t0;
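    // Round 3 (sigma row 2)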
820 b0 = _mm_alignr_epi8(m6, m5, 8);
821 b1 = _mm_unpackhi_epi64(m2, m7);
823 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
824 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
825 row4l = _mm_xor_si128(row4l, row1l);
826 row4h = _mm_xor_si128(row4h, row1h);
827 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
828 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
829 row3l = _mm_add_epi64(row3l, row4l);
830 row3h = _mm_add_epi64(row3h, row4h);
831 row2l = _mm_xor_si128(row2l, row3l);
832 row2h = _mm_xor_si128(row2h, row3h);
833 row2l = _mm_shuffle_epi8(row2l, r24);
834 row2h = _mm_shuffle_epi8(row2h, r24);
836 b0 = _mm_unpacklo_epi64(m4, m0);
837 b1 = _mm_blend_epi16(m1, m6, 0xF0);
839 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
840 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
841 row4l = _mm_xor_si128(row4l, row1l);
842 row4h = _mm_xor_si128(row4h, row1h);
843 row4l = _mm_shuffle_epi8(row4l, r16);
844 row4h = _mm_shuffle_epi8(row4h, r16);
845 row3l = _mm_add_epi64(row3l, row4l);
846 row3h = _mm_add_epi64(row3h, row4h);
847 row2l = _mm_xor_si128(row2l, row3l);
848 row2h = _mm_xor_si128(row2h, row3h);
849 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
850 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
852 t0 = _mm_alignr_epi8(row2h, row2l, 8);
853 t1 = _mm_alignr_epi8(row2l, row2h, 8);
854 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
855 t0 = _mm_alignr_epi8(row4h, row4l, 8);
856 t1 = _mm_alignr_epi8(row4l, row4h, 8);
857 row4l = t1, row4h = t0;
859 b0 = _mm_blend_epi16(m5, m1, 0xF0);
860 b1 = _mm_unpackhi_epi64(m3, m4);
862 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
863 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
864 row4l = _mm_xor_si128(row4l, row1l);
865 row4h = _mm_xor_si128(row4h, row1h);
866 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
867 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
868 row3l = _mm_add_epi64(row3l, row4l);
869 row3h = _mm_add_epi64(row3h, row4h);
870 row2l = _mm_xor_si128(row2l, row3l);
871 row2h = _mm_xor_si128(row2h, row3h);
872 row2l = _mm_shuffle_epi8(row2l, r24);
873 row2h = _mm_shuffle_epi8(row2h, r24);
875 b0 = _mm_unpacklo_epi64(m7, m3);
876 b1 = _mm_alignr_epi8(m2, m0, 8);
878 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
879 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
880 row4l = _mm_xor_si128(row4l, row1l);
881 row4h = _mm_xor_si128(row4h, row1h);
882 row4l = _mm_shuffle_epi8(row4l, r16);
883 row4h = _mm_shuffle_epi8(row4h, r16);
884 row3l = _mm_add_epi64(row3l, row4l);
885 row3h = _mm_add_epi64(row3h, row4h);
886 row2l = _mm_xor_si128(row2l, row3l);
887 row2h = _mm_xor_si128(row2h, row3h);
888 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
889 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
891 t0 = _mm_alignr_epi8(row2l, row2h, 8);
892 t1 = _mm_alignr_epi8(row2h, row2l, 8);
893 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
894 t0 = _mm_alignr_epi8(row4l, row4h, 8);
895 t1 = _mm_alignr_epi8(row4h, row4l, 8);
896 row4l = t1, row4h = t0;
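    // Round 4 (sigma row 3)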
898 b0 = _mm_unpackhi_epi64(m3, m1);
899 b1 = _mm_unpackhi_epi64(m6, m5);
901 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
902 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
903 row4l = _mm_xor_si128(row4l, row1l);
904 row4h = _mm_xor_si128(row4h, row1h);
905 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
906 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
907 row3l = _mm_add_epi64(row3l, row4l);
908 row3h = _mm_add_epi64(row3h, row4h);
909 row2l = _mm_xor_si128(row2l, row3l);
910 row2h = _mm_xor_si128(row2h, row3h);
911 row2l = _mm_shuffle_epi8(row2l, r24);
912 row2h = _mm_shuffle_epi8(row2h, r24);
914 b0 = _mm_unpackhi_epi64(m4, m0);
915 b1 = _mm_unpacklo_epi64(m6, m7);
917 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
918 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
919 row4l = _mm_xor_si128(row4l, row1l);
920 row4h = _mm_xor_si128(row4h, row1h);
921 row4l = _mm_shuffle_epi8(row4l, r16);
922 row4h = _mm_shuffle_epi8(row4h, r16);
923 row3l = _mm_add_epi64(row3l, row4l);
924 row3h = _mm_add_epi64(row3h, row4h);
925 row2l = _mm_xor_si128(row2l, row3l);
926 row2h = _mm_xor_si128(row2h, row3h);
927 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
928 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
930 t0 = _mm_alignr_epi8(row2h, row2l, 8);
931 t1 = _mm_alignr_epi8(row2l, row2h, 8);
932 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
933 t0 = _mm_alignr_epi8(row4h, row4l, 8);
934 t1 = _mm_alignr_epi8(row4l, row4h, 8);
935 row4l = t1, row4h = t0;
937 b0 = _mm_blend_epi16(m1, m2, 0xF0);
938 b1 = _mm_blend_epi16(m2, m7, 0xF0);
940 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
941 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
942 row4l = _mm_xor_si128(row4l, row1l);
943 row4h = _mm_xor_si128(row4h, row1h);
944 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
945 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
946 row3l = _mm_add_epi64(row3l, row4l);
947 row3h = _mm_add_epi64(row3h, row4h);
948 row2l = _mm_xor_si128(row2l, row3l);
949 row2h = _mm_xor_si128(row2h, row3h);
950 row2l = _mm_shuffle_epi8(row2l, r24);
951 row2h = _mm_shuffle_epi8(row2h, r24);
953 b0 = _mm_unpacklo_epi64(m3, m5);
954 b1 = _mm_unpacklo_epi64(m0, m4);
956 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
957 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
958 row4l = _mm_xor_si128(row4l, row1l);
959 row4h = _mm_xor_si128(row4h, row1h);
960 row4l = _mm_shuffle_epi8(row4l, r16);
961 row4h = _mm_shuffle_epi8(row4h, r16);
962 row3l = _mm_add_epi64(row3l, row4l);
963 row3h = _mm_add_epi64(row3h, row4h);
964 row2l = _mm_xor_si128(row2l, row3l);
965 row2h = _mm_xor_si128(row2h, row3h);
966 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
967 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
969 t0 = _mm_alignr_epi8(row2l, row2h, 8);
970 t1 = _mm_alignr_epi8(row2h, row2l, 8);
971 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
972 t0 = _mm_alignr_epi8(row4l, row4h, 8);
973 t1 = _mm_alignr_epi8(row4h, row4l, 8);
974 row4l = t1, row4h = t0;
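    // Round 5 (sigma row 4)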
976 b0 = _mm_unpackhi_epi64(m4, m2);
977 b1 = _mm_unpacklo_epi64(m1, m5);
979 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
980 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
981 row4l = _mm_xor_si128(row4l, row1l);
982 row4h = _mm_xor_si128(row4h, row1h);
983 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
984 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
985 row3l = _mm_add_epi64(row3l, row4l);
986 row3h = _mm_add_epi64(row3h, row4h);
987 row2l = _mm_xor_si128(row2l, row3l);
988 row2h = _mm_xor_si128(row2h, row3h);
989 row2l = _mm_shuffle_epi8(row2l, r24);
990 row2h = _mm_shuffle_epi8(row2h, r24);
992 b0 = _mm_blend_epi16(m0, m3, 0xF0);
993 b1 = _mm_blend_epi16(m2, m7, 0xF0);
995 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
996 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
997 row4l = _mm_xor_si128(row4l, row1l);
998 row4h = _mm_xor_si128(row4h, row1h);
999 row4l = _mm_shuffle_epi8(row4l, r16);
1000 row4h = _mm_shuffle_epi8(row4h, r16);
1001 row3l = _mm_add_epi64(row3l, row4l);
1002 row3h = _mm_add_epi64(row3h, row4h);
1003 row2l = _mm_xor_si128(row2l, row3l);
1004 row2h = _mm_xor_si128(row2h, row3h);
1005 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1006 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1008 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1009 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1010 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1011 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1012 t1 = _mm_alignr_epi8(row4l, row4h, 8);
1013 row4l = t1, row4h = t0;
1015 b0 = _mm_blend_epi16(m7, m5, 0xF0);
1016 b1 = _mm_blend_epi16(m3, m1, 0xF0);
1018 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1019 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1020 row4l = _mm_xor_si128(row4l, row1l);
1021 row4h = _mm_xor_si128(row4h, row1h);
1022 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1023 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1024 row3l = _mm_add_epi64(row3l, row4l);
1025 row3h = _mm_add_epi64(row3h, row4h);
1026 row2l = _mm_xor_si128(row2l, row3l);
1027 row2h = _mm_xor_si128(row2h, row3h);
1028 row2l = _mm_shuffle_epi8(row2l, r24);
1029 row2h = _mm_shuffle_epi8(row2h, r24);
1031 b0 = _mm_alignr_epi8(m6, m0, 8);
1032 b1 = _mm_blend_epi16(m4, m6, 0xF0);
1034 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1035 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1036 row4l = _mm_xor_si128(row4l, row1l);
1037 row4h = _mm_xor_si128(row4h, row1h);
1038 row4l = _mm_shuffle_epi8(row4l, r16);
1039 row4h = _mm_shuffle_epi8(row4h, r16);
1040 row3l = _mm_add_epi64(row3l, row4l);
1041 row3h = _mm_add_epi64(row3h, row4h);
1042 row2l = _mm_xor_si128(row2l, row3l);
1043 row2h = _mm_xor_si128(row2h, row3h);
1044 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1045 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1047 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1048 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1049 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1050 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1051 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1052 row4l = t1, row4h = t0;
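    // Round 6 (sigma row 5)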
1054 b0 = _mm_unpacklo_epi64(m1, m3);
1055 b1 = _mm_unpacklo_epi64(m0, m4);
1057 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1058 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1059 row4l = _mm_xor_si128(row4l, row1l);
1060 row4h = _mm_xor_si128(row4h, row1h);
1061 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1062 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1063 row3l = _mm_add_epi64(row3l, row4l);
1064 row3h = _mm_add_epi64(row3h, row4h);
1065 row2l = _mm_xor_si128(row2l, row3l);
1066 row2h = _mm_xor_si128(row2h, row3h);
1067 row2l = _mm_shuffle_epi8(row2l, r24);
1068 row2h = _mm_shuffle_epi8(row2h, r24);
1070 b0 = _mm_unpacklo_epi64(m6, m5);
1071 b1 = _mm_unpackhi_epi64(m5, m1);
1073 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1074 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1075 row4l = _mm_xor_si128(row4l, row1l);
1076 row4h = _mm_xor_si128(row4h, row1h);
1077 row4l = _mm_shuffle_epi8(row4l, r16);
1078 row4h = _mm_shuffle_epi8(row4h, r16);
1079 row3l = _mm_add_epi64(row3l, row4l);
1080 row3h = _mm_add_epi64(row3h, row4h);
1081 row2l = _mm_xor_si128(row2l, row3l);
1082 row2h = _mm_xor_si128(row2h, row3h);
1083 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1084 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1086 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1087 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1088 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1089 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1090 t1 = _mm_alignr_epi8(row4l, row4h, 8);
1091 row4l = t1, row4h = t0;
1093 b0 = _mm_blend_epi16(m2, m3, 0xF0);
1094 b1 = _mm_unpackhi_epi64(m7, m0);
1096 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1097 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1098 row4l = _mm_xor_si128(row4l, row1l);
1099 row4h = _mm_xor_si128(row4h, row1h);
1100 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1101 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1102 row3l = _mm_add_epi64(row3l, row4l);
1103 row3h = _mm_add_epi64(row3h, row4h);
1104 row2l = _mm_xor_si128(row2l, row3l);
1105 row2h = _mm_xor_si128(row2h, row3h);
1106 row2l = _mm_shuffle_epi8(row2l, r24);
1107 row2h = _mm_shuffle_epi8(row2h, r24);
1109 b0 = _mm_unpackhi_epi64(m6, m2);
1110 b1 = _mm_blend_epi16(m7, m4, 0xF0);
1112 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1113 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1114 row4l = _mm_xor_si128(row4l, row1l);
1115 row4h = _mm_xor_si128(row4h, row1h);
1116 row4l = _mm_shuffle_epi8(row4l, r16);
1117 row4h = _mm_shuffle_epi8(row4h, r16);
1118 row3l = _mm_add_epi64(row3l, row4l);
1119 row3h = _mm_add_epi64(row3h, row4h);
1120 row2l = _mm_xor_si128(row2l, row3l);
1121 row2h = _mm_xor_si128(row2h, row3h);
1122 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1123 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1125 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1126 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1127 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1128 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1129 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1130 row4l = t1, row4h = t0;
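    // Round 7 (sigma row 6)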
1132 b0 = _mm_blend_epi16(m6, m0, 0xF0);
1133 b1 = _mm_unpacklo_epi64(m7, m2);
1135 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1136 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1137 row4l = _mm_xor_si128(row4l, row1l);
1138 row4h = _mm_xor_si128(row4h, row1h);
1139 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1140 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1141 row3l = _mm_add_epi64(row3l, row4l);
1142 row3h = _mm_add_epi64(row3h, row4h);
1143 row2l = _mm_xor_si128(row2l, row3l);
1144 row2h = _mm_xor_si128(row2h, row3h);
1145 row2l = _mm_shuffle_epi8(row2l, r24);
1146 row2h = _mm_shuffle_epi8(row2h, r24);
1148 b0 = _mm_unpackhi_epi64(m2, m7);
1149 b1 = _mm_alignr_epi8(m5, m6, 8);
1151 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1152 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1153 row4l = _mm_xor_si128(row4l, row1l);
1154 row4h = _mm_xor_si128(row4h, row1h);
1155 row4l = _mm_shuffle_epi8(row4l, r16);
1156 row4h = _mm_shuffle_epi8(row4h, r16);
1157 row3l = _mm_add_epi64(row3l, row4l);
1158 row3h = _mm_add_epi64(row3h, row4h);
1159 row2l = _mm_xor_si128(row2l, row3l);
1160 row2h = _mm_xor_si128(row2h, row3h);
1161 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1162 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1164 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1165 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1166 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1167 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1168 t1 = _mm_alignr_epi8(row4l, row4h, 8);
1169 row4l = t1, row4h = t0;
1171 b0 = _mm_unpacklo_epi64(m0, m3);
1172 b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));
1174 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1175 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1176 row4l = _mm_xor_si128(row4l, row1l);
1177 row4h = _mm_xor_si128(row4h, row1h);
1178 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1179 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1180 row3l = _mm_add_epi64(row3l, row4l);
1181 row3h = _mm_add_epi64(row3h, row4h);
1182 row2l = _mm_xor_si128(row2l, row3l);
1183 row2h = _mm_xor_si128(row2h, row3h);
1184 row2l = _mm_shuffle_epi8(row2l, r24);
1185 row2h = _mm_shuffle_epi8(row2h, r24);
1187 b0 = _mm_unpackhi_epi64(m3, m1);
1188 b1 = _mm_blend_epi16(m1, m5, 0xF0);
1190 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1191 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1192 row4l = _mm_xor_si128(row4l, row1l);
1193 row4h = _mm_xor_si128(row4h, row1h);
1194 row4l = _mm_shuffle_epi8(row4l, r16);
1195 row4h = _mm_shuffle_epi8(row4h, r16);
1196 row3l = _mm_add_epi64(row3l, row4l);
1197 row3h = _mm_add_epi64(row3h, row4h);
1198 row2l = _mm_xor_si128(row2l, row3l);
1199 row2h = _mm_xor_si128(row2h, row3h);
1200 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1201 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1203 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1204 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1205 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1206 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1207 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1208 row4l = t1, row4h = t0;
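    // Round 8 (sigma row 7)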
1210 b0 = _mm_unpackhi_epi64(m6, m3);
1211 b1 = _mm_blend_epi16(m6, m1, 0xF0);
1213 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1214 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1215 row4l = _mm_xor_si128(row4l, row1l);
1216 row4h = _mm_xor_si128(row4h, row1h);
1217 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1218 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1219 row3l = _mm_add_epi64(row3l, row4l);
1220 row3h = _mm_add_epi64(row3h, row4h);
1221 row2l = _mm_xor_si128(row2l, row3l);
1222 row2h = _mm_xor_si128(row2h, row3h);
1223 row2l = _mm_shuffle_epi8(row2l, r24);
1224 row2h = _mm_shuffle_epi8(row2h, r24);
1226 b0 = _mm_alignr_epi8(m7, m5, 8);
1227 b1 = _mm_unpackhi_epi64(m0, m4);
1229 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1230 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1231 row4l = _mm_xor_si128(row4l, row1l);
1232 row4h = _mm_xor_si128(row4h, row1h);
1233 row4l = _mm_shuffle_epi8(row4l, r16);
1234 row4h = _mm_shuffle_epi8(row4h, r16);
1235 row3l = _mm_add_epi64(row3l, row4l);
1236 row3h = _mm_add_epi64(row3h, row4h);
1237 row2l = _mm_xor_si128(row2l, row3l);
1238 row2h = _mm_xor_si128(row2h, row3h);
1239 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1240 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1242 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1243 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1244 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1245 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1246 t1 = _mm_alignr_epi8(row4l, row4h, 8);
1247 row4l = t1, row4h = t0;
1249 b0 = _mm_unpackhi_epi64(m2, m7);
1250 b1 = _mm_unpacklo_epi64(m4, m1);
1252 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1253 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1254 row4l = _mm_xor_si128(row4l, row1l);
1255 row4h = _mm_xor_si128(row4h, row1h);
1256 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1257 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1258 row3l = _mm_add_epi64(row3l, row4l);
1259 row3h = _mm_add_epi64(row3h, row4h);
1260 row2l = _mm_xor_si128(row2l, row3l);
1261 row2h = _mm_xor_si128(row2h, row3h);
1262 row2l = _mm_shuffle_epi8(row2l, r24);
1263 row2h = _mm_shuffle_epi8(row2h, r24);
1265 b0 = _mm_unpacklo_epi64(m0, m2);
1266 b1 = _mm_unpacklo_epi64(m3, m5);
1268 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1269 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1270 row4l = _mm_xor_si128(row4l, row1l);
1271 row4h = _mm_xor_si128(row4h, row1h);
1272 row4l = _mm_shuffle_epi8(row4l, r16);
1273 row4h = _mm_shuffle_epi8(row4h, r16);
1274 row3l = _mm_add_epi64(row3l, row4l);
1275 row3h = _mm_add_epi64(row3h, row4h);
1276 row2l = _mm_xor_si128(row2l, row3l);
1277 row2h = _mm_xor_si128(row2h, row3h);
1278 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1279 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1281 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1282 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1283 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1284 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1285 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1286 row4l = t1, row4h = t0;
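    // Round 9 (sigma row 8)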
1288 b0 = _mm_unpacklo_epi64(m3, m7);
1289 b1 = _mm_alignr_epi8(m0, m5, 8);
1291 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1292 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1293 row4l = _mm_xor_si128(row4l, row1l);
1294 row4h = _mm_xor_si128(row4h, row1h);
1295 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1296 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1297 row3l = _mm_add_epi64(row3l, row4l);
1298 row3h = _mm_add_epi64(row3h, row4h);
1299 row2l = _mm_xor_si128(row2l, row3l);
1300 row2h = _mm_xor_si128(row2h, row3h);
1301 row2l = _mm_shuffle_epi8(row2l, r24);
1302 row2h = _mm_shuffle_epi8(row2h, r24);
1304 b0 = _mm_unpackhi_epi64(m7, m4);
1305 b1 = _mm_alignr_epi8(m4, m1, 8);
1307 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1308 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1309 row4l = _mm_xor_si128(row4l, row1l);
1310 row4h = _mm_xor_si128(row4h, row1h);
1311 row4l = _mm_shuffle_epi8(row4l, r16);
1312 row4h = _mm_shuffle_epi8(row4h, r16);
1313 row3l = _mm_add_epi64(row3l, row4l);
1314 row3h = _mm_add_epi64(row3h, row4h);
1315 row2l = _mm_xor_si128(row2l, row3l);
1316 row2h = _mm_xor_si128(row2h, row3h);
1317 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1318 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1320 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1321 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1322 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1323 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1324 t1 = _mm_alignr_epi8(row4l, row4h, 8);
    row4l = t1, row4h = t0;

    // b0 for this load pair is message words 12 and 13, i.e. m6 unchanged
    // (restored from the BLAKE2b message schedule, sigma row 8).
    b0 = m6;
    b1 = _mm_alignr_epi8(m5, m0, 8);
1330 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1331 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1332 row4l = _mm_xor_si128(row4l, row1l);
1333 row4h = _mm_xor_si128(row4h, row1h);
1334 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1335 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1336 row3l = _mm_add_epi64(row3l, row4l);
1337 row3h = _mm_add_epi64(row3h, row4h);
1338 row2l = _mm_xor_si128(row2l, row3l);
1339 row2h = _mm_xor_si128(row2h, row3h);
1340 row2l = _mm_shuffle_epi8(row2l, r24);
1341 row2h = _mm_shuffle_epi8(row2h, r24);
    b0 = _mm_blend_epi16(m1, m3, 0xF0);
    // b1 is message words 4 and 5, i.e. m2 unchanged (restored from the
    // BLAKE2b message schedule, sigma row 8).
    b1 = m2;
1346 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1347 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1348 row4l = _mm_xor_si128(row4l, row1l);
1349 row4h = _mm_xor_si128(row4h, row1h);
1350 row4l = _mm_shuffle_epi8(row4l, r16);
1351 row4h = _mm_shuffle_epi8(row4h, r16);
1352 row3l = _mm_add_epi64(row3l, row4l);
1353 row3h = _mm_add_epi64(row3h, row4h);
1354 row2l = _mm_xor_si128(row2l, row3l);
1355 row2h = _mm_xor_si128(row2h, row3h);
1356 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1357 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1359 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1360 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1361 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1362 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1363 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1364 row4l = t1, row4h = t0;
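    // Round 10 (sigma row 9)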
1366 b0 = _mm_unpacklo_epi64(m5, m4);
1367 b1 = _mm_unpackhi_epi64(m3, m0);
1369 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1370 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1371 row4l = _mm_xor_si128(row4l, row1l);
1372 row4h = _mm_xor_si128(row4h, row1h);
1373 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1374 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1375 row3l = _mm_add_epi64(row3l, row4l);
1376 row3h = _mm_add_epi64(row3h, row4h);
1377 row2l = _mm_xor_si128(row2l, row3l);
1378 row2h = _mm_xor_si128(row2h, row3h);
1379 row2l = _mm_shuffle_epi8(row2l, r24);
1380 row2h = _mm_shuffle_epi8(row2h, r24);
1382 b0 = _mm_unpacklo_epi64(m1, m2);
1383 b1 = _mm_blend_epi16(m3, m2, 0xF0);
1385 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1386 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1387 row4l = _mm_xor_si128(row4l, row1l);
1388 row4h = _mm_xor_si128(row4h, row1h);
1389 row4l = _mm_shuffle_epi8(row4l, r16);
1390 row4h = _mm_shuffle_epi8(row4h, r16);
1391 row3l = _mm_add_epi64(row3l, row4l);
1392 row3h = _mm_add_epi64(row3h, row4h);
1393 row2l = _mm_xor_si128(row2l, row3l);
1394 row2h = _mm_xor_si128(row2h, row3h);
1395 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1396 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1398 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1399 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1400 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1401 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1402 t1 = _mm_alignr_epi8(row4l, row4h, 8);
1403 row4l = t1, row4h = t0;
1405 b0 = _mm_unpackhi_epi64(m7, m4);
1406 b1 = _mm_unpackhi_epi64(m1, m6);
1408 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1409 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1410 row4l = _mm_xor_si128(row4l, row1l);
1411 row4h = _mm_xor_si128(row4h, row1h);
1412 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1413 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1414 row3l = _mm_add_epi64(row3l, row4l);
1415 row3h = _mm_add_epi64(row3h, row4h);
1416 row2l = _mm_xor_si128(row2l, row3l);
1417 row2h = _mm_xor_si128(row2h, row3h);
1418 row2l = _mm_shuffle_epi8(row2l, r24);
1419 row2h = _mm_shuffle_epi8(row2h, r24);
1421 b0 = _mm_alignr_epi8(m7, m5, 8);
1422 b1 = _mm_unpacklo_epi64(m6, m0);
1424 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1425 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1426 row4l = _mm_xor_si128(row4l, row1l);
1427 row4h = _mm_xor_si128(row4h, row1h);
1428 row4l = _mm_shuffle_epi8(row4l, r16);
1429 row4h = _mm_shuffle_epi8(row4h, r16);
1430 row3l = _mm_add_epi64(row3l, row4l);
1431 row3h = _mm_add_epi64(row3h, row4h);
1432 row2l = _mm_xor_si128(row2l, row3l);
1433 row2h = _mm_xor_si128(row2h, row3h);
1434 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1435 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1437 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1438 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1439 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1440 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1441 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1442 row4l = t1, row4h = t0;
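// Round 10 reuses SIGMA row 0 (the identity permutation), so the sixteen message words are
// consumed in their natural order.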
1444 b0 = _mm_unpacklo_epi64(m0, m1);
1445 b1 = _mm_unpacklo_epi64(m2, m3);
1447 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1448 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1449 row4l = _mm_xor_si128(row4l, row1l);
1450 row4h = _mm_xor_si128(row4h, row1h);
1451 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1452 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1453 row3l = _mm_add_epi64(row3l, row4l);
1454 row3h = _mm_add_epi64(row3h, row4h);
1455 row2l = _mm_xor_si128(row2l, row3l);
1456 row2h = _mm_xor_si128(row2h, row3h);
1457 row2l = _mm_shuffle_epi8(row2l, r24);
1458 row2h = _mm_shuffle_epi8(row2h, r24);
1460 b0 = _mm_unpackhi_epi64(m0, m1);
1461 b1 = _mm_unpackhi_epi64(m2, m3);
1463 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1464 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1465 row4l = _mm_xor_si128(row4l, row1l);
1466 row4h = _mm_xor_si128(row4h, row1h);
1467 row4l = _mm_shuffle_epi8(row4l, r16);
1468 row4h = _mm_shuffle_epi8(row4h, r16);
1469 row3l = _mm_add_epi64(row3l, row4l);
1470 row3h = _mm_add_epi64(row3h, row4h);
1471 row2l = _mm_xor_si128(row2l, row3l);
1472 row2h = _mm_xor_si128(row2h, row3h);
1473 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1474 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1476 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1477 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1478 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1479 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1480 t1 = _mm_alignr_epi8(row4l, row4h, 8);
1481 row4l = t1, row4h = t0;
1483 b0 = _mm_unpacklo_epi64(m4, m5);
1484 b1 = _mm_unpacklo_epi64(m6, m7);
1486 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1487 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1488 row4l = _mm_xor_si128(row4l, row1l);
1489 row4h = _mm_xor_si128(row4h, row1h);
1490 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1491 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1492 row3l = _mm_add_epi64(row3l, row4l);
1493 row3h = _mm_add_epi64(row3h, row4h);
1494 row2l = _mm_xor_si128(row2l, row3l);
1495 row2h = _mm_xor_si128(row2h, row3h);
1496 row2l = _mm_shuffle_epi8(row2l, r24);
1497 row2h = _mm_shuffle_epi8(row2h, r24);
1499 b0 = _mm_unpackhi_epi64(m4, m5);
1500 b1 = _mm_unpackhi_epi64(m6, m7);
1502 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1503 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1504 row4l = _mm_xor_si128(row4l, row1l);
1505 row4h = _mm_xor_si128(row4h, row1h);
1506 row4l = _mm_shuffle_epi8(row4l, r16);
1507 row4h = _mm_shuffle_epi8(row4h, r16);
1508 row3l = _mm_add_epi64(row3l, row4l);
1509 row3h = _mm_add_epi64(row3h, row4h);
1510 row2l = _mm_xor_si128(row2l, row3l);
1511 row2h = _mm_xor_si128(row2h, row3h);
1512 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1513 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1515 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1516 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1517 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1518 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1519 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1520 row4l = t1, row4h = t0;
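// Round 11, the final round, reuses SIGMA row 1.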
1522 b0 = _mm_unpacklo_epi64(m7, m2);
1523 b1 = _mm_unpackhi_epi64(m4, m6);
1525 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1526 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1527 row4l = _mm_xor_si128(row4l, row1l);
1528 row4h = _mm_xor_si128(row4h, row1h);
1529 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1530 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1531 row3l = _mm_add_epi64(row3l, row4l);
1532 row3h = _mm_add_epi64(row3h, row4h);
1533 row2l = _mm_xor_si128(row2l, row3l);
1534 row2h = _mm_xor_si128(row2h, row3h);
1535 row2l = _mm_shuffle_epi8(row2l, r24);
1536 row2h = _mm_shuffle_epi8(row2h, r24);
1538 b0 = _mm_unpacklo_epi64(m5, m4);
1539 b1 = _mm_alignr_epi8(m3, m7, 8);
1541 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1542 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1543 row4l = _mm_xor_si128(row4l, row1l);
1544 row4h = _mm_xor_si128(row4h, row1h);
1545 row4l = _mm_shuffle_epi8(row4l, r16);
1546 row4h = _mm_shuffle_epi8(row4h, r16);
1547 row3l = _mm_add_epi64(row3l, row4l);
1548 row3h = _mm_add_epi64(row3h, row4h);
1549 row2l = _mm_xor_si128(row2l, row3l);
1550 row2h = _mm_xor_si128(row2h, row3h);
1551 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1552 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1554 t0 = _mm_alignr_epi8(row2h, row2l, 8);
1555 t1 = _mm_alignr_epi8(row2l, row2h, 8);
1556 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1557 t0 = _mm_alignr_epi8(row4h, row4l, 8);
1558 t1 = _mm_alignr_epi8(row4l, row4h, 8);
1559 row4l = t1, row4h = t0;
1561 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
1562 b1 = _mm_unpackhi_epi64(m5, m2);
1564 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1565 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1566 row4l = _mm_xor_si128(row4l, row1l);
1567 row4h = _mm_xor_si128(row4h, row1h);
1568 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
1569 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
1570 row3l = _mm_add_epi64(row3l, row4l);
1571 row3h = _mm_add_epi64(row3h, row4h);
1572 row2l = _mm_xor_si128(row2l, row3l);
1573 row2h = _mm_xor_si128(row2h, row3h);
1574 row2l = _mm_shuffle_epi8(row2l, r24);
1575 row2h = _mm_shuffle_epi8(row2h, r24);
1577 b0 = _mm_unpacklo_epi64(m6, m1);
1578 b1 = _mm_unpackhi_epi64(m3, m1);
1580 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1581 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1582 row4l = _mm_xor_si128(row4l, row1l);
1583 row4h = _mm_xor_si128(row4h, row1h);
1584 row4l = _mm_shuffle_epi8(row4l, r16);
1585 row4h = _mm_shuffle_epi8(row4h, r16);
1586 row3l = _mm_add_epi64(row3l, row4l);
1587 row3h = _mm_add_epi64(row3h, row4h);
1588 row2l = _mm_xor_si128(row2l, row3l);
1589 row2h = _mm_xor_si128(row2h, row3h);
1590 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
1591 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
1593 t0 = _mm_alignr_epi8(row2l, row2h, 8);
1594 t1 = _mm_alignr_epi8(row2h, row2l, 8);
1595 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
1596 t0 = _mm_alignr_epi8(row4l, row4h, 8);
1597 t1 = _mm_alignr_epi8(row4h, row4l, 8);
1598 row4l = t1, row4h = t0;
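// Feed-forward into the chaining value: h[i] ^= v[i] ^ v[i+8]. row1l/row1h update state.h[0..3]
// and row2l/row2h update state.h[4..7].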
1600 row1l = _mm_xor_si128(row3l, row1l);
1601 row1h = _mm_xor_si128(row3h, row1h);
1602 _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[0])), row1l));
1603 _mm_storeu_si128(M128_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[2])), row1h));
1605 row2l = _mm_xor_si128(row4l, row2l);
1606 row2h = _mm_xor_si128(row4h, row2h);
1607 _mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[4])), row2l));
1608 _mm_storeu_si128(M128_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[6])), row2h));
1610 #endif // CRYPTOPP_SSE41_AVAILABLE
1612 #if CRYPTOPP_ARM_NEON_AVAILABLE
// BLAKE2s with NEON: BLAKE2S_LOAD_MSG_r_i assembles the message operand for the i-th G half-step
// of round r from the block words m0..m3; the vrorq_n_u32* macros below implement the 16/12/8/7-bit
// right rotations used by BLAKE2s.
1615 #define BLAKE2S_LOAD_MSG_0_1(buf) \
1616 do { uint32x2_t t0, t1; \
1617 t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \
1618 t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \
1619 buf = vcombine_u32(t0, t1); } while(0)
1621 #define BLAKE2S_LOAD_MSG_0_2(buf) \
1622 do { uint32x2_t t0, t1; \
1623 t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \
1624 t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \
1625 buf = vcombine_u32(t0, t1); } while(0)
1627 #define BLAKE2S_LOAD_MSG_0_3(buf) \
1628 do { uint32x2_t t0, t1; \
1629 t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \
1630 t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
1631 buf = vcombine_u32(t0, t1); } while(0)
1633 #define BLAKE2S_LOAD_MSG_0_4(buf) \
1634 do { uint32x2_t t0, t1; \
1635 t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \
1636 t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \
1637 buf = vcombine_u32(t0, t1); } while(0)
1639 #define BLAKE2S_LOAD_MSG_1_1(buf) \
1640 do { uint32x2_t t0, t1; \
1641 t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
1642 t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \
1643 buf = vcombine_u32(t0, t1); } while(0)
1645 #define BLAKE2S_LOAD_MSG_1_2(buf) \
1646 do { uint32x2_t t0, t1; \
1647 t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
1648 t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \
1649 buf = vcombine_u32(t0, t1); } while(0)
1651 #define BLAKE2S_LOAD_MSG_1_3(buf) \
1652 do { uint32x2_t t0, t1; \
1653 t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \
1654 t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \
1655 buf = vcombine_u32(t0, t1); } while(0)
1657 #define BLAKE2S_LOAD_MSG_1_4(buf) \
1658 do { uint32x2_t t0, t1; \
1659 t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \
1660 t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
1661 buf = vcombine_u32(t0, t1); } while(0)
1663 #define BLAKE2S_LOAD_MSG_2_1(buf) \
1664 do { uint32x2_t t0, t1; \
1665 t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \
1666 t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
1667 buf = vcombine_u32(t0, t1); } while(0)
1669 #define BLAKE2S_LOAD_MSG_2_2(buf) \
1670 do { uint32x2_t t0, t1; \
1671 t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \
1672 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \
1673 buf = vcombine_u32(t0, t1); } while(0)
1675 #define BLAKE2S_LOAD_MSG_2_3(buf) \
1676 do { uint32x2_t t0, t1; \
1677 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \
1678 t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \
1679 buf = vcombine_u32(t0, t1); } while(0)
1681 #define BLAKE2S_LOAD_MSG_2_4(buf) \
1682 do { uint32x2_t t0, t1; \
1683 t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \
1684 t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \
1685 buf = vcombine_u32(t0, t1); } while(0)
1687 #define BLAKE2S_LOAD_MSG_3_1(buf) \
1688 do { uint32x2_t t0, t1; \
1689 t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
1690 t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \
1691 buf = vcombine_u32(t0, t1); } while(0)
1693 #define BLAKE2S_LOAD_MSG_3_2(buf) \
1694 do { uint32x2_t t0, t1; \
1695 t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \
1696 t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
1697 buf = vcombine_u32(t0, t1); } while(0)
1699 #define BLAKE2S_LOAD_MSG_3_3(buf) \
1700 do { uint32x2_t t0, t1; \
1701 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \
1702 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
1703 buf = vcombine_u32(t0, t1); } while(0)
1705 #define BLAKE2S_LOAD_MSG_3_4(buf) \
1706 do { uint32x2_t t0, t1; \
1707 t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
1708 t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
1709 buf = vcombine_u32(t0, t1); } while(0)
1711 #define BLAKE2S_LOAD_MSG_4_1(buf) \
1712 do { uint32x2_t t0, t1; \
1713 t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \
1714 t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \
1715 buf = vcombine_u32(t0, t1); } while(0)
1717 #define BLAKE2S_LOAD_MSG_4_2(buf) \
1718 do { uint32x2_t t0, t1; \
1719 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \
1720 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
1721 buf = vcombine_u32(t0, t1); } while(0)
1723 #define BLAKE2S_LOAD_MSG_4_3(buf) \
1724 do { uint32x2_t t0, t1; \
1725 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \
1726 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \
1727 buf = vcombine_u32(t0, t1); } while(0)
1729 #define BLAKE2S_LOAD_MSG_4_4(buf) \
1730 do { uint32x2_t t0, t1; \
1731 t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \
1732 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \
1733 buf = vcombine_u32(t0, t1); } while(0)
1735 #define BLAKE2S_LOAD_MSG_5_1(buf) \
1736 do { uint32x2_t t0, t1; \
1737 t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \
1738 t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
1739 buf = vcombine_u32(t0, t1); } while(0)
1741 #define BLAKE2S_LOAD_MSG_5_2(buf) \
1742 do { uint32x2_t t0, t1; \
1743 t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \
1744 t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \
1745 buf = vcombine_u32(t0, t1); } while(0)
1747 #define BLAKE2S_LOAD_MSG_5_3(buf) \
1748 do { uint32x2_t t0, t1; \
1749 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \
1750 t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \
1751 buf = vcombine_u32(t0, t1); } while(0)
1753 #define BLAKE2S_LOAD_MSG_5_4(buf) \
1754 do { uint32x2_t t0, t1; \
1755 t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \
1756 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \
1757 buf = vcombine_u32(t0, t1); } while(0)
1759 #define BLAKE2S_LOAD_MSG_6_1(buf) \
1760 do { uint32x2_t t0, t1; \
1761 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \
1762 t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
1763 buf = vcombine_u32(t0, t1); } while(0)
1765 #define BLAKE2S_LOAD_MSG_6_2(buf) \
1766 do { uint32x2_t t0, t1; \
1767 t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
1768 t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \
1769 buf = vcombine_u32(t0, t1); } while(0)
1771 #define BLAKE2S_LOAD_MSG_6_3(buf) \
1772 do { uint32x2_t t0, t1; \
1773 t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \
1774 t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \
1775 buf = vcombine_u32(t0, t1); } while(0)
1777 #define BLAKE2S_LOAD_MSG_6_4(buf) \
1778 do { uint32x2_t t0, t1; \
1779 t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
1780 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \
1781 buf = vcombine_u32(t0, t1); } while(0)
1783 #define BLAKE2S_LOAD_MSG_7_1(buf) \
1784 do { uint32x2_t t0, t1; \
1785 t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m1)).val[1]; \
1786 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \
1787 buf = vcombine_u32(t0, t1); } while(0)
1789 #define BLAKE2S_LOAD_MSG_7_2(buf) \
1790 do { uint32x2_t t0, t1; \
1791 t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
1792 t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \
1793 buf = vcombine_u32(t0, t1); } while(0)
1795 #define BLAKE2S_LOAD_MSG_7_3(buf) \
1796 do { uint32x2_t t0, t1; \
1797 t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
1798 t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \
1799 buf = vcombine_u32(t0, t1); } while(0)
1801 #define BLAKE2S_LOAD_MSG_7_4(buf) \
1802 do { uint32x2_t t0, t1; \
1803 t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \
1804 t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
1805 buf = vcombine_u32(t0, t1); } while(0)
1807 #define BLAKE2S_LOAD_MSG_8_1(buf) \
1808 do { uint32x2_t t0, t1; \
1809 t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \
1810 t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \
1811 buf = vcombine_u32(t0, t1); } while(0)
1813 #define BLAKE2S_LOAD_MSG_8_2(buf) \
1814 do { uint32x2_t t0, t1; \
1815 t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
1816 t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \
1817 buf = vcombine_u32(t0, t1); } while(0)
1819 #define BLAKE2S_LOAD_MSG_8_3(buf) \
1820 do { uint32x2_t t0, t1; \
1821 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \
1822 t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \
1823 buf = vcombine_u32(t0, t1); } while(0)
1825 #define BLAKE2S_LOAD_MSG_8_4(buf) \
1826 do { uint32x2_t t0, t1; \
1827 t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \
1828 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \
1829 buf = vcombine_u32(t0, t1); } while(0)
1831 #define BLAKE2S_LOAD_MSG_9_1(buf) \
1832 do { uint32x2_t t0, t1; \
1833 t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
1834 t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \
1835 buf = vcombine_u32(t0, t1); } while(0)
1837 #define BLAKE2S_LOAD_MSG_9_2(buf) \
1838 do { uint32x2_t t0, t1; \
1839 t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \
1840 t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \
1841 buf = vcombine_u32(t0, t1); } while(0)
1843 #define BLAKE2S_LOAD_MSG_9_3(buf) \
1844 do { uint32x2_t t0, t1; \
1845 t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
1846 t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \
1847 buf = vcombine_u32(t0, t1); } while(0)
1849 #define BLAKE2S_LOAD_MSG_9_4(buf) \
1850 do { uint32x2_t t0, t1; \
1851 t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
1852 t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \
1853 buf = vcombine_u32(t0, t1); } while(0)
1855 #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
1857 #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8)
1859 #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c))
1861 #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
1862 do { \
1863 row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
1864 row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \
1865 row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \
1866 } while(0)
1868 #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
1869 do { \
1870 row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
1871 row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \
1872 row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \
1873 } while(0)
1875 #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
1876 do { \
1877 row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \
1878 } while(0)
1880 #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
1881 do { \
1882 row4 = vextq_u32(row4, row4, 1); \
1883 row3 = vextq_u32(row3, row3, 2); \
1884 row2 = vextq_u32(row2, row2, 3); \
1885 } while(0)
1887 #define BLAKE2S_ROUND(r) \
1888 do { \
1889 uint32x4_t buf1, buf2, buf3, buf4; \
1890 BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
1891 BLAKE2S_G1(row1,row2,row3,row4,buf1); \
1892 BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \
1893 BLAKE2S_G2(row1,row2,row3,row4,buf2); \
1894 BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \
1895 BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \
1896 BLAKE2S_G1(row1,row2,row3,row4,buf3); \
1897 BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \
1898 BLAKE2S_G2(row1,row2,row3,row4,buf4); \
1899 BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \
1900 } while(0)
1906 const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00)));
1907 const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16)));
1908 const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32)));
1909 const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48)));
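// m0..m3 are the 64-byte input block, loaded as bytes and reinterpreted as the sixteen 32-bit message words.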
1911 uint32x4_t row1, row2, row3, row4;
1913 const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]);
1914 const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]);
1915 row3 = vld1q_u32(&BLAKE2S_IV[0]);
1916 row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.t[0]));
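// row1/row2 start as the chaining value state.h[0..7]; row3 is the BLAKE2s IV and row4 is the IV
// XORed with the four counter/finalization words starting at state.t[0]. The local copies f0/f1
// are kept for the feed-forward. Ten rounds follow, then h[i] ^= v[i] ^ v[i+8] is stored back.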
1918 BLAKE2S_ROUND(0);
1919 BLAKE2S_ROUND(1);
1920 BLAKE2S_ROUND(2);
1921 BLAKE2S_ROUND(3);
1922 BLAKE2S_ROUND(4);
1923 BLAKE2S_ROUND(5);
1924 BLAKE2S_ROUND(6);
1925 BLAKE2S_ROUND(7);
1926 BLAKE2S_ROUND(8);
1927 BLAKE2S_ROUND(9);
1929 vst1q_u32(&state.h[0], veorq_u32(f0, veorq_u32(row1, row3)));
1930 vst1q_u32(&state.h[4], veorq_u32(f1, veorq_u32(row2, row4)));
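// BLAKE2b compression with NEON follows. The sixteen 64-bit working words are kept in eight
// uint64x2_t registers (row1l..row4h, two words each); the BLAKE2B_LOAD_MSG_r_i macros below build
// the two message operands for the i-th G half-step of round r from the preloaded block words
// m0..m7, and the vrorq_n_u64_* macros implement BLAKE2b's 32/24/16/63-bit right rotations.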
1935 #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \
1936 do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
1938 #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \
1939 do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
1941 #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \
1942 do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
1944 #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \
1945 do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
1947 #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \
1948 do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
1950 #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \
1951 do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
1953 #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \
1954 do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
1956 #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \
1957 do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
1959 #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \
1960 do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0)
1962 #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
1963 do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0)
1965 #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
1966 do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0)
1968 #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \
1969 do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0)
1971 #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \
1972 do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0)
1974 #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \
1975 do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
1977 #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
1978 do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
1980 #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
1981 do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
1983 #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \
1984 do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0)
1986 #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
1987 do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0)
1989 #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
1990 do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0)
1992 #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
1993 do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0)
1995 #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
1996 do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0)
1998 #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \
1999 do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0)
2001 #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
2002 do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0)
2004 #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
2005 do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0)
2007 #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
2008 do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0)
2010 #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \
2011 do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0)
2013 #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \
2014 do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0)
2016 #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
2017 do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0)
2019 #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
2020 do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0)
2022 #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
2023 do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0)
2025 #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \
2026 do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0)
2028 #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \
2029 do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0)
2031 #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \
2032 do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0)
2034 #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \
2035 do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0)
2037 #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \
2038 do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0)
2040 #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
2041 do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0)
2043 #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \
2044 do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0)
2046 #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
2047 do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0)
2049 #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
2050 do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0)
2052 #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \
2053 do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0)
2055 #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \
2056 do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0)
2058 #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \
2059 do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0)
2061 #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \
2062 do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0)
2064 #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \
2065 do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0)
2067 #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \
2068 do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0)
2070 #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \
2071 do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0)
2073 #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \
2074 do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0)
2076 #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \
2077 do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0)
2079 #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x))))
2081 #define vrorq_n_u64_24(x) vcombine_u64(\
2082 vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \
2083 vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3)))
2085 #define vrorq_n_u64_16(x) vcombine_u64(\
2086 vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \
2087 vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2)))
2089 #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63))
2091 #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
2092 do { \
2093 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
2094 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
2095 row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
2096 row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \
2097 row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
2098 row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
2099 row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \
2100 } while(0)
2102 #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
2103 do { \
2104 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \
2105 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \
2106 row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \
2107 row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \
2108 row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \
2109 row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \
2110 row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
2111 } while(0)
2113 #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
2114 do { \
2115 uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
2116 uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \
2117 row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
2118 t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \
2119 row4l = t0; row4h = t1; \
2120 } while(0)
2122 #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
2123 do { \
2124 uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
2125 uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \
2126 row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \
2127 t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \
2128 row4l = t0; row4h = t1; \
2129 } while(0)
2131 #define BLAKE2B_ROUND(r) \
2132 do { \
2133 uint64x2_t b0, b1; \
2134 BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \
2135 BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2136 BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \
2137 BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2138 BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
2139 BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \
2140 BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2141 BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \
2142 BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
2143 BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
2144 } while(0)
2150 const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00));
2151 const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16));
2152 const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32));
2153 const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48));
2154 const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64));
2155 const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80));
2156 const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96));
2157 const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112));
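// m0..m7 are the 128-byte input block, loaded as bytes and reinterpreted as the sixteen 64-bit message words.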
2159 uint64x2_t row1l, row1h, row2l, row2h;
2160 uint64x2_t row3l, row3h, row4l, row4h;
2162 const uint64x2_t h0 = row1l = vld1q_u64(&state.h[0]);
2163 const uint64x2_t h1 = row1h = vld1q_u64(&state.h[2]);
2164 const uint64x2_t h2 = row2l = vld1q_u64(&state.h[4]);
2165 const uint64x2_t h3 = row2h = vld1q_u64(&state.h[6]);
2167 row3l = vld1q_u64(&BLAKE2B_IV[0]);
2168 row3h = vld1q_u64(&BLAKE2B_IV[2]);
2169 row4l = veorq_u64(vld1q_u64(&BLAKE2B_IV[4]), vld1q_u64(&state.t[0]));
2170 row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV[6]), vld1q_u64(&state.f[0]));
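// row3 holds the BLAKE2b IV; row4 is the IV XORed with the counter t[0..1] and the finalization
// flags f[0..1]. h0..h3 keep a copy of the incoming state for the feed-forward. Twelve rounds
// follow, then h[i] ^= v[i] ^ v[i+8] is stored back into state.h.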
2172 BLAKE2B_ROUND(0);
2173 BLAKE2B_ROUND(1);
2174 BLAKE2B_ROUND(2);
2175 BLAKE2B_ROUND(3);
2176 BLAKE2B_ROUND(4);
2177 BLAKE2B_ROUND(5);
2178 BLAKE2B_ROUND(6);
2179 BLAKE2B_ROUND(7);
2180 BLAKE2B_ROUND(8);
2181 BLAKE2B_ROUND(9);
2182 BLAKE2B_ROUND(10);
2183 BLAKE2B_ROUND(11);
2185 vst1q_u64(&state.h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
2186 vst1q_u64(&state.h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));
2187 vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
2188 vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
2190 #endif // CRYPTOPP_ARM_NEON_AVAILABLE