rsaz_exp.c 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. /*
  2. * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. * Copyright (c) 2012, Intel Corporation. All Rights Reserved.
  4. *
  5. * Licensed under the OpenSSL license (the "License"). You may not use
  6. * this file except in compliance with the License. You can obtain a copy
  7. * in the file LICENSE in the source distribution or at
  8. * https://www.openssl.org/source/license.html
  9. *
  10. * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
  11. * (1) Intel Corporation, Israel Development Center, Haifa, Israel
  12. * (2) University of Haifa, Israel
  13. */
  14. #include <openssl/base.h>
  15. #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
  16. #include "rsaz_exp.h"
  17. #include <openssl/mem.h>
  18. #include "../../internal.h"
  19. // See crypto/bn/asm/rsaz-avx2.pl for further details.
  20. void rsaz_1024_norm2red_avx2(void *red, const void *norm);
  21. void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, const void *n,
  22. BN_ULONG k);
  23. void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
  24. int cnt);
  25. void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
  26. void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
  27. void rsaz_1024_red2norm_avx2(void *norm, const void *red);
  28. // one is 1 in RSAZ's representation.
  29. alignas(64) static const BN_ULONG one[40] = {
  30. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  31. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  32. // two80 is 2^80 in RSAZ's representation. Note RSAZ uses base 2^29, so this is
  33. // 2^(29*2 + 22) = 2^80, not 2^(64*2 + 22).
  34. alignas(64) static const BN_ULONG two80[40] = {
  35. 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  36. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  37. void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
  38. const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
  39. const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0) {
  40. alignas(64) uint8_t storage[(320 * 3) + (32 * 9 * 16)]; // 5.5KB
  41. unsigned char *a_inv, *m, *result, *table_s = storage + (320 * 3),
  42. *R2 = table_s; // borrow
  43. if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) {
  44. result = storage;
  45. a_inv = storage + 320;
  46. m = storage + (320 * 2); // should not cross page
  47. } else {
  48. m = storage; // should not cross page
  49. result = storage + 320;
  50. a_inv = storage + (320 * 2);
  51. }
  52. rsaz_1024_norm2red_avx2(m, m_norm);
  53. rsaz_1024_norm2red_avx2(a_inv, base_norm);
  54. rsaz_1024_norm2red_avx2(R2, RR);
  55. // Convert |R2| from the usual radix, giving R = 2^1024, to RSAZ's radix,
  56. // giving R = 2^(36*29) = 2^1044.
  57. rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
  58. // R2 = 2^2048 * 2^2048 / 2^1044 = 2^3052
  59. rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
  60. // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2
  61. // table[0] = 1
  62. rsaz_1024_mul_avx2(result, R2, one, m, k0);
  63. // table[1] = a_inv^1
  64. rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
  65. rsaz_1024_scatter5_avx2(table_s, result, 0);
  66. rsaz_1024_scatter5_avx2(table_s, a_inv, 1);
  67. // table[2] = a_inv^2
  68. rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
  69. rsaz_1024_scatter5_avx2(table_s, result, 2);
  70. #if 0
  71. // This is almost 2x smaller and less than 1% slower.
  72. for (int index = 3; index < 32; index++) {
  73. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  74. rsaz_1024_scatter5_avx2(table_s, result, index);
  75. }
  76. #else
  77. // table[4] = a_inv^4
  78. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  79. rsaz_1024_scatter5_avx2(table_s, result, 4);
  80. // table[8] = a_inv^8
  81. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  82. rsaz_1024_scatter5_avx2(table_s, result, 8);
  83. // table[16] = a_inv^16
  84. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  85. rsaz_1024_scatter5_avx2(table_s, result, 16);
  86. // table[17] = a_inv^17
  87. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  88. rsaz_1024_scatter5_avx2(table_s, result, 17);
  89. // table[3]
  90. rsaz_1024_gather5_avx2(result, table_s, 2);
  91. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  92. rsaz_1024_scatter5_avx2(table_s, result, 3);
  93. // table[6]
  94. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  95. rsaz_1024_scatter5_avx2(table_s, result, 6);
  96. // table[12]
  97. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  98. rsaz_1024_scatter5_avx2(table_s, result, 12);
  99. // table[24]
  100. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  101. rsaz_1024_scatter5_avx2(table_s, result, 24);
  102. // table[25]
  103. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  104. rsaz_1024_scatter5_avx2(table_s, result, 25);
  105. // table[5]
  106. rsaz_1024_gather5_avx2(result, table_s, 4);
  107. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  108. rsaz_1024_scatter5_avx2(table_s, result, 5);
  109. // table[10]
  110. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  111. rsaz_1024_scatter5_avx2(table_s, result, 10);
  112. // table[20]
  113. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  114. rsaz_1024_scatter5_avx2(table_s, result, 20);
  115. // table[21]
  116. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  117. rsaz_1024_scatter5_avx2(table_s, result, 21);
  118. // table[7]
  119. rsaz_1024_gather5_avx2(result, table_s, 6);
  120. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  121. rsaz_1024_scatter5_avx2(table_s, result, 7);
  122. // table[14]
  123. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  124. rsaz_1024_scatter5_avx2(table_s, result, 14);
  125. // table[28]
  126. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  127. rsaz_1024_scatter5_avx2(table_s, result, 28);
  128. // table[29]
  129. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  130. rsaz_1024_scatter5_avx2(table_s, result, 29);
  131. // table[9]
  132. rsaz_1024_gather5_avx2(result, table_s, 8);
  133. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  134. rsaz_1024_scatter5_avx2(table_s, result, 9);
  135. // table[18]
  136. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  137. rsaz_1024_scatter5_avx2(table_s, result, 18);
  138. // table[19]
  139. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  140. rsaz_1024_scatter5_avx2(table_s, result, 19);
  141. // table[11]
  142. rsaz_1024_gather5_avx2(result, table_s, 10);
  143. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  144. rsaz_1024_scatter5_avx2(table_s, result, 11);
  145. // table[22]
  146. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  147. rsaz_1024_scatter5_avx2(table_s, result, 22);
  148. // table[23]
  149. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  150. rsaz_1024_scatter5_avx2(table_s, result, 23);
  151. // table[13]
  152. rsaz_1024_gather5_avx2(result, table_s, 12);
  153. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  154. rsaz_1024_scatter5_avx2(table_s, result, 13);
  155. // table[26]
  156. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  157. rsaz_1024_scatter5_avx2(table_s, result, 26);
  158. // table[27]
  159. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  160. rsaz_1024_scatter5_avx2(table_s, result, 27);
  161. // table[15]
  162. rsaz_1024_gather5_avx2(result, table_s, 14);
  163. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  164. rsaz_1024_scatter5_avx2(table_s, result, 15);
  165. // table[30]
  166. rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  167. rsaz_1024_scatter5_avx2(table_s, result, 30);
  168. // table[31]
  169. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  170. rsaz_1024_scatter5_avx2(table_s, result, 31);
  171. #endif
  172. const uint8_t *p_str = (const uint8_t *)exponent;
  173. // load first window
  174. int wvalue = p_str[127] >> 3;
  175. rsaz_1024_gather5_avx2(result, table_s, wvalue);
  176. int index = 1014;
  177. while (index > -1) { // Loop for the remaining 127 windows.
  178. rsaz_1024_sqr_avx2(result, result, m, k0, 5);
  179. uint16_t wvalue_16;
  180. memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16));
  181. wvalue = wvalue_16;
  182. wvalue = (wvalue >> (index % 8)) & 31;
  183. index -= 5;
  184. rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); // Borrow |a_inv|.
  185. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  186. }
  187. // Square four times.
  188. rsaz_1024_sqr_avx2(result, result, m, k0, 4);
  189. wvalue = p_str[0] & 15;
  190. rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); // Borrow |a_inv|.
  191. rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  192. // Convert from Montgomery.
  193. rsaz_1024_mul_avx2(result, result, one, m, k0);
  194. rsaz_1024_red2norm_avx2(result_norm, result);
  195. OPENSSL_cleanse(storage, sizeof(storage));
  196. }
  197. #endif // OPENSSL_X86_64