Crypto++  8.6
Free C++ class library of cryptographic schemes
arm_simd.h
Go to the documentation of this file.
1 // arm_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file arm_simd.h
4 /// \brief Support functions for ARM and vector operations
5 
6 #ifndef CRYPTOPP_ARM_SIMD_H
7 #define CRYPTOPP_ARM_SIMD_H
8 
9 #include "config.h"
10 
11 #if (CRYPTOPP_ARM_NEON_HEADER)
12 # include <stdint.h>
13 # include <arm_neon.h>
14 #endif
15 
16 #if (CRYPTOPP_ARM_ACLE_HEADER)
17 # include <stdint.h>
18 # include <arm_acle.h>
19 #endif
20 
21 #if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
22 /// \name CRC32 checksum
23 //@{
24 
25 /// \brief CRC32 checksum
26 /// \param crc the starting crc value
27 /// \param val the value to checksum
28 /// \return CRC32 value
29 /// \since Crypto++ 8.6
30 inline uint32_t CRC32B (uint32_t crc, uint8_t val)
31 {
32 #if defined(_MSC_VER)
33  return __crc32b(crc, val);
34 #else
35  __asm__ ("crc32b %w0, %w0, %w1 \n\t"
36  :"+r" (crc) : "r" (val) );
37  return crc;
38 #endif
39 }
40 
41 /// \brief CRC32 checksum
42 /// \param crc the starting crc value
43 /// \param val the value to checksum
44 /// \return CRC32 value
45 /// \since Crypto++ 8.6
46 inline uint32_t CRC32W (uint32_t crc, uint32_t val)
47 {
48 #if defined(_MSC_VER)
49  return __crc32w(crc, val);
50 #else
51  __asm__ ("crc32w %w0, %w0, %w1 \n\t"
52  :"+r" (crc) : "r" (val) );
53  return crc;
54 #endif
55 }
56 
57 /// \brief CRC32 checksum
58 /// \param crc the starting crc value
59 /// \param vals the values to checksum
60 /// \return CRC32 value
61 /// \since Crypto++ 8.6
62 inline uint32_t CRC32Wx4 (uint32_t crc, const uint32_t vals[4])
63 {
64 #if defined(_MSC_VER)
65  return __crc32w(__crc32w(__crc32w(__crc32w(
66  crc, vals[0]), vals[1]), vals[2]), vals[3]);
67 #else
68  __asm__ ("crc32w %w0, %w0, %w1 \n\t"
69  "crc32w %w0, %w0, %w2 \n\t"
70  "crc32w %w0, %w0, %w3 \n\t"
71  "crc32w %w0, %w0, %w4 \n\t"
72  :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
73  "r" (vals[2]), "r" (vals[3]));
74  return crc;
75 #endif
76 }
77 
78 //@}
79 /// \name CRC32-C checksum
80 //@{
81 /// \brief CRC32-C checksum
82 /// \param crc the starting crc value
83 /// \param val the value to checksum
84 /// \return CRC32-C value
85 /// \since Crypto++ 8.6
86 inline uint32_t CRC32CB (uint32_t crc, uint8_t val)
87 {
88 #if defined(_MSC_VER)
89  return __crc32cb(crc, val);
90 #else
91  __asm__ ("crc32cb %w0, %w0, %w1 \n\t"
92  :"+r" (crc) : "r" (val) );
93  return crc;
94 #endif
95 }
96 
97 /// \brief CRC32-C checksum
98 /// \param crc the starting crc value
99 /// \param val the value to checksum
100 /// \return CRC32-C value
101 /// \since Crypto++ 8.6
102 inline uint32_t CRC32CW (uint32_t crc, uint32_t val)
103 {
104 #if defined(_MSC_VER)
105  return __crc32cw(crc, val);
106 #else
107  __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
108  :"+r" (crc) : "r" (val) );
109  return crc;
110 #endif
111 }
112 
113 /// \brief CRC32-C checksum
114 /// \param crc the starting crc value
115 /// \param vals the values to checksum
116 /// \return CRC32-C value
117 /// \since Crypto++ 8.6
118 inline uint32_t CRC32CWx4 (uint32_t crc, const uint32_t vals[4])
119 {
120 #if defined(_MSC_VER)
121  return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
122  crc, vals[0]), vals[1]), vals[2]), vals[3]);
123 #else
124  __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
125  "crc32cw %w0, %w0, %w2 \n\t"
126  "crc32cw %w0, %w0, %w3 \n\t"
127  "crc32cw %w0, %w0, %w4 \n\t"
128  :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
129  "r" (vals[2]), "r" (vals[3]));
130  return crc;
131 #endif
132 }
133 //@}
134 #endif // CRYPTOPP_ARM_CRC32_AVAILABLE
135 
136 #if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
137 /// \name Polynomial multiplication
138 //@{
139 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_00() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
///  The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
///  are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    // The .1d arrangement reads the low 64-bit lane of each source register
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // a and b are already uint64x2_t, so the lanes are read directly.
    // (A u8->u64 reinterpret does not accept uint64x2_t arguments.)
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0),
        vgetq_lane_u64(b, 0)));
#endif
}
169 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_01() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
///  The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
///  64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    // vget_high_u64() supplies the high half of b as the second operand
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    // a and b are already uint64x2_t, so the lanes are read directly.
    // (A u8->u64 reinterpret does not accept uint64x2_t arguments.)
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0),
        vgetq_lane_u64(b, 1)));
#endif
}
199 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_10() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
///  The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
///  64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    // vget_high_u64() supplies the high half of a as the first operand
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    // a and b are already uint64x2_t, so the lanes are read directly.
    // (A u8->u64 reinterpret does not accept uint64x2_t arguments.)
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1),
        vgetq_lane_u64(b, 0)));
#endif
}
229 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_11() performs polynomial multiplication and presents
///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
///  The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
///  are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
///  is MSB and numbered 127, while the rightmost bit is LSB and
///  numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    // pmull2 reads the upper 64-bit lane of each .2d source register
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // a and b are already uint64x2_t, so the lanes are read directly.
    // (A u8->u64 reinterpret does not accept uint64x2_t arguments.)
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1),
        vgetq_lane_u64(b, 1)));
#endif
}
259 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL() performs vmull_p64() on the low 64-bit lanes of
///  <tt>a</tt> and <tt>b</tt>. PMULL is provided as GCC inline assembly
///  due to Clang and lack of support for the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    // The .1d arrangement reads the low 64-bit lane of each source register
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // a and b are already uint64x2_t, so the lanes are read directly.
    // (A u8->u64 reinterpret does not accept uint64x2_t arguments.)
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0),
        vgetq_lane_u64(b, 0)));
#endif
}
284 
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_HIGH() performs vmull_high_p64(), multiplying the high
///  64-bit lanes of <tt>a</tt> and <tt>b</tt>. PMULL_HIGH is provided as
///  GCC inline assembly due to Clang and lack of support for the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    // pmull2 reads the upper 64-bit lane of each .2d source register
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // a and b are already uint64x2_t, so the lanes are read directly.
    // (A u8->u64 reinterpret does not accept uint64x2_t arguments.)
    // Parentheses are balanced here; a stray closing parenthesis used
    // to make this branch a syntax error.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1),
        vgetq_lane_u64(b, 1)));
#endif
}
309 
/// \brief Vector extraction
/// \param a the first value
/// \param b the second value
/// \param c the byte count
/// \return vector
/// \details VEXT_U8() performs vextq_u8() on <tt>a</tt> and <tt>b</tt>
///  viewed as 16-byte vectors. Per Arm's description of EXT, the result
///  is formed from the bytes of <tt>a</tt> starting at index <tt>c</tt>,
///  followed by the lowest bytes of <tt>b</tt>. VEXT_U8 is provided
///  as GCC inline assembly due to Clang and lack of support for the intrinsic.
/// \note The inline assembly passes <tt>c</tt> with an immediate constraint,
///  so the call must be resolved with a compile-time constant byte count.
/// \since Crypto++ 8.0
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
#if !defined(_MSC_VER)
    uint64x2_t result;
    __asm__ ("ext   %[out].16b, %[first].16b, %[second].16b, %[count]   \n\t"
        : [out] "=w" (result)
        : [first] "w" (a), [second] "w" (b), [count] "I" (c));
    return result;
#else
    const uint8x16_t first = vreinterpretq_u8_u64(a);
    const uint8x16_t second = vreinterpretq_u8_u64(b);
    return vreinterpretq_u64_u8(vextq_u8(first, second, c));
#endif
}
331 
/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first value
/// \param b the second value
/// \return vector
/// \details VEXT_U8() performs vextq_u8() on <tt>a</tt> and <tt>b</tt>
///  viewed as 16-byte vectors. Per Arm's description of EXT, the result
///  is formed from the bytes of <tt>a</tt> starting at index <tt>C</tt>,
///  followed by the lowest bytes of <tt>b</tt>. VEXT_U8 is provided
///  as GCC inline assembly due to Clang and lack of support for the intrinsic.
/// \details The byte count is a template parameter so it is always
///  available as the immediate operand the instruction requires.
/// \since Crypto++ 8.0
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
    // https://github.com/weidai11/cryptopp/issues/366
#if defined(_MSC_VER)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
#else
    uint64x2_t r;
    __asm__ ("ext   %0.16b, %1.16b, %2.16b, %3   \n\t"
        :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}
//@}
356 
357 #endif // CRYPTOPP_ARM_PMULL_AVAILABLE
358 
359 #if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
360 /// \name ARMv8.2 operations
361 //@{
362 
/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return exclusive OR of all three values
/// \details VEOR3() performs veor3q_u64(). MSVC builds call the intrinsic
///  directly. Other compilers use inline assembly for the <tt>eor3</tt>
///  instruction due to Clang and lack of support for the intrinsic.
/// \details VEOR3 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if !defined(_MSC_VER)
    uint64x2_t result;
    __asm__ ("eor3   %[out].16b, %[x].16b, %[y].16b, %[z].16b   \n\t"
        : [out] "=w" (result)
        : [x] "w" (a), [y] "w" (b), [z] "w" (c));
    return result;
#else
    return veor3q_u64(a, b, c);
#endif
}
383 
/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \param imm6 the rotate amount
/// \return two-way exclusive OR of the values, then rotated by imm6
/// \details VXAR() performs vxarq_u64(). Per Arm's description of XAR,
///  each 64-bit lane of the XOR result is rotated right by <tt>imm6</tt>.
///  VXAR is provided as GCC inline assembly due to Clang and lack of
///  support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \note The inline assembly passes <tt>imm6</tt> with an immediate
///  constraint, so the call must be resolved with a compile-time constant.
/// \since Crypto++ 8.6
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int imm6)
{
#if !defined(_MSC_VER)
    uint64x2_t result;
    __asm__ ("xar   %[out].2d, %[x].2d, %[y].2d, %[rot]   \n\t"
        : [out] "=w" (result)
        : [x] "w" (a), [y] "w" (b), [rot] "I" (imm6));
    return result;
#else
    return vxarq_u64(a, b, imm6);
#endif
}
404 
/// \brief XOR and rotate
/// \tparam C the rotate amount
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated by C
/// \details VXAR() performs vxarq_u64(). Per Arm's description of XAR,
///  each 64-bit lane of the XOR result is rotated right by <tt>C</tt>.
///  VXAR is provided as GCC inline assembly due to Clang and lack of
///  support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
template <unsigned int C>
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
{
#if !defined(_MSC_VER)
    uint64x2_t result;
    // C is a template argument, so it is always usable as an immediate
    __asm__ ("xar   %[out].2d, %[x].2d, %[y].2d, %[rot]   \n\t"
        : [out] "=w" (result)
        : [x] "w" (a), [y] "w" (b), [rot] "I" (C));
    return result;
#else
    return vxarq_u64(a, b, C);
#endif
}
426 
/// \brief Rotate and XOR
/// \param a the first value
/// \param b the second value
/// \return result of the <tt>rax1</tt> operation on <tt>a</tt> and <tt>b</tt>
/// \details VRAX1() performs vrax1q_u64(). Per Arm's description of RAX1,
///  each 64-bit lane of <tt>b</tt> is rotated left by 1-bit and the result
///  is exclusive OR'd with <tt>a</tt>. VRAX1 is provided as GCC inline
///  assembly due to Clang and lack of support for the intrinsic.
/// \details VRAX1 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
{
#if !defined(_MSC_VER)
    uint64x2_t result;
    __asm__ ("rax1   %[out].2d, %[x].2d, %[y].2d   \n\t"
        : [out] "=w" (result)
        : [x] "w" (a), [y] "w" (b));
    return result;
#else
    return vrax1q_u64(a, b);
#endif
}
446 //@}
447 #endif // CRYPTOPP_ARM_SHA3_AVAILABLE
448 
449 #endif // CRYPTOPP_ARM_SIMD_H
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:152
uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
XOR and rotate.
Definition: arm_simd.h:435
uint32_t CRC32CWx4(uint32_t crc, const uint32_t vals[4])
CRC32-C checksum.
Definition: arm_simd.h:118
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:242
uint32_t CRC32CB(uint32_t crc, uint8_t val)
CRC32-C checksum.
Definition: arm_simd.h:86
uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:292
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:182
uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
Three-way XOR.
Definition: arm_simd.h:372
uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int imm6)
XOR and rotate.
Definition: arm_simd.h:393
uint32_t CRC32W(uint32_t crc, uint32_t val)
CRC32 checksum.
Definition: arm_simd.h:46
uint32_t CRC32B(uint32_t crc, uint8_t val)
CRC32 checksum.
Definition: arm_simd.h:30
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:212
uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:267
uint32_t CRC32CW(uint32_t crc, uint32_t val)
CRC32-C checksum.
Definition: arm_simd.h:102
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
Vector extraction.
Definition: arm_simd.h:319
uint32_t CRC32Wx4(uint32_t crc, const uint32_t vals[4])
CRC32 checksum.
Definition: arm_simd.h:62
Library configuration file.