1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/security/crypto/weakcryptospi/source/bigint/algorithms.cpp Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,1160 @@
1.4 +/*
1.5 +* Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
1.6 +* All rights reserved.
1.7 +* This component and the accompanying materials are made available
1.8 +* under the terms of the License "Eclipse Public License v1.0"
1.9 +* which accompanies this distribution, and is available
1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
1.11 +*
1.12 +* Initial Contributors:
1.13 +* Nokia Corporation - initial contribution.
1.14 +*
1.15 +* Contributors:
1.16 +*
1.17 +* Description:
1.18 +*
1.19 +*/
1.20 +
1.21 +
1.22 +#include "words.h"
1.23 +#include "algorithms.h"
1.24 +
1.25 +word Add(word *C, const word *A, const word *B, unsigned int N)
1.26 +{
1.27 + assert (N%2 == 0);
1.28 + word carry = 0;
1.29 + for (unsigned int i = 0; i < N; i+=2)
1.30 + {
1.31 + dword u = (dword) carry + A[i] + B[i];
1.32 + C[i] = LOW_WORD(u);
1.33 + u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1];
1.34 + C[i+1] = LOW_WORD(u);
1.35 + carry = HIGH_WORD(u);
1.36 + }
1.37 + return carry;
1.38 +}
1.39 +
1.40 +word Subtract(word *C, const word *A, const word *B, unsigned int N)
1.41 +{
1.42 + assert (N%2 == 0);
1.43 + word borrow=0;
1.44 + for (unsigned i = 0; i < N; i+=2)
1.45 + {
1.46 + dword u = (dword) A[i] - B[i] - borrow;
1.47 + C[i] = LOW_WORD(u);
1.48 + u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u));
1.49 + C[i+1] = LOW_WORD(u);
1.50 + borrow = 0-HIGH_WORD(u);
1.51 + }
1.52 + return borrow;
1.53 +}
1.54 +
1.55 +int Compare(const word *A, const word *B, unsigned int N)
1.56 +{
1.57 + while (N--)
1.58 + if (A[N] > B[N])
1.59 + return 1;
1.60 + else if (A[N] < B[N])
1.61 + return -1;
1.62 +
1.63 + return 0;
1.64 +}
1.65 +
1.66 +// It is the job of the calling code to ensure that this won't carry.
1.67 +// If you aren't sure, use the next version that will tell you if you need to
1.68 +// grow your integer.
1.69 +// Having two of these creates ever so slightly more code but avoids having
1.70 +// ifdefs all over the rest of the code checking the following type stuff which
1.71 +// causes warnings in certain compilers about unused parameters in release
1.72 +// builds. We can't have that can we!
1.73 +/*
1.74 +Allows avoid this all over bigint.cpp and primes.cpp
1.75 +ifdef _DEBUG
1.76 + TUint carry = Increment(Ptr(), Size());
1.77 + assert(!carry);
1.78 +else
1.79 + Increment(Ptr(), Size())
1.80 +endif
1.81 +*/
1.82 +void IncrementNoCarry(word *A, unsigned int N, word B)
1.83 +{
1.84 + assert(N);
1.85 + word t = A[0];
1.86 + A[0] = t+B;
1.87 + if (A[0] >= t)
1.88 + return;
1.89 + for (unsigned i=1; i<N; i++)
1.90 + if (++A[i])
1.91 + return;
1.92 + assert(0);
1.93 +}
1.94 +
1.95 +word Increment(word *A, unsigned int N, word B)
1.96 +{
1.97 + assert(N);
1.98 + word t = A[0];
1.99 + A[0] = t+B;
1.100 + if (A[0] >= t)
1.101 + return 0;
1.102 + for (unsigned i=1; i<N; i++)
1.103 + if (++A[i])
1.104 + return 0;
1.105 + return 1;
1.106 +}
1.107 +
1.108 +//See commments above about IncrementNoCarry
1.109 +void DecrementNoCarry(word *A, unsigned int N, word B)
1.110 +{
1.111 + assert(N);
1.112 + word t = A[0];
1.113 + A[0] = t-B;
1.114 + if (A[0] <= t)
1.115 + return;
1.116 + for (unsigned i=1; i<N; i++)
1.117 + if (A[i]--)
1.118 + return;
1.119 + assert(0);
1.120 +}
1.121 +
1.122 +word Decrement(word *A, unsigned int N, word B)
1.123 +{
1.124 + assert(N);
1.125 + word t = A[0];
1.126 + A[0] = t-B;
1.127 + if (A[0] <= t)
1.128 + return 0;
1.129 + for (unsigned i=1; i<N; i++)
1.130 + if (A[i]--)
1.131 + return 0;
1.132 + return 1;
1.133 +}
1.134 +
1.135 +void TwosComplement(word *A, unsigned int N)
1.136 +{
1.137 + Decrement(A, N);
1.138 + for (unsigned i=0; i<N; i++)
1.139 + A[i] = ~A[i];
1.140 +}
1.141 +
1.142 +static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
1.143 +{
1.144 + word carry=0;
1.145 + for(unsigned i=0; i<N; i++)
1.146 + {
1.147 + dword p = (dword)A[i] * B + carry;
1.148 + C[i] = LOW_WORD(p);
1.149 + carry = HIGH_WORD(p);
1.150 + }
1.151 + return carry;
1.152 +}
1.153 +
1.154 +static void AtomicMultiply(word *C, const word *A, const word *B)
1.155 +{
1.156 +/*
1.157 + word s;
1.158 + dword d;
1.159 +
1.160 + if (A1 >= A0)
1.161 + if (B0 >= B1)
1.162 + {
1.163 + s = 0;
1.164 + d = (dword)(A1-A0)*(B0-B1);
1.165 + }
1.166 + else
1.167 + {
1.168 + s = (A1-A0);
1.169 + d = (dword)s*(word)(B0-B1);
1.170 + }
1.171 + else
1.172 + if (B0 > B1)
1.173 + {
1.174 + s = (B0-B1);
1.175 + d = (word)(A1-A0)*(dword)s;
1.176 + }
1.177 + else
1.178 + {
1.179 + s = 0;
1.180 + d = (dword)(A0-A1)*(B1-B0);
1.181 + }
1.182 +*/
1.183 + // this segment is the branchless equivalent of above
1.184 + word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
1.185 + unsigned int ai = A[1] < A[0];
1.186 + unsigned int bi = B[0] < B[1];
1.187 + unsigned int di = ai & bi;
1.188 + dword d = (dword)D[di]*D[di+2];
1.189 + D[1] = D[3] = 0;
1.190 + unsigned int si = ai + !bi;
1.191 + word s = D[si];
1.192 +
1.193 + dword A0B0 = (dword)A[0]*B[0];
1.194 + C[0] = LOW_WORD(A0B0);
1.195 +
1.196 + dword A1B1 = (dword)A[1]*B[1];
1.197 + dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1);
1.198 + C[1] = LOW_WORD(t);
1.199 +
1.200 + t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s;
1.201 + C[2] = LOW_WORD(t);
1.202 + C[3] = HIGH_WORD(t);
1.203 +}
1.204 +
1.205 +static word AtomicMultiplyAdd(word *C, const word *A, const word *B)
1.206 +{
1.207 + word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
1.208 + unsigned int ai = A[1] < A[0];
1.209 + unsigned int bi = B[0] < B[1];
1.210 + unsigned int di = ai & bi;
1.211 + dword d = (dword)D[di]*D[di+2];
1.212 + D[1] = D[3] = 0;
1.213 + unsigned int si = ai + !bi;
1.214 + word s = D[si];
1.215 +
1.216 + dword A0B0 = (dword)A[0]*B[0];
1.217 + dword t = A0B0 + C[0];
1.218 + C[0] = LOW_WORD(t);
1.219 +
1.220 + dword A1B1 = (dword)A[1]*B[1];
1.221 + t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1];
1.222 + C[1] = LOW_WORD(t);
1.223 +
1.224 + t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2];
1.225 + C[2] = LOW_WORD(t);
1.226 +
1.227 + t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3];
1.228 + C[3] = LOW_WORD(t);
1.229 + return HIGH_WORD(t);
1.230 +}
1.231 +
1.232 +static inline void AtomicMultiplyBottom(word *C, const word *A, const word *B)
1.233 +{
1.234 + dword t = (dword)A[0]*B[0];
1.235 + C[0] = LOW_WORD(t);
1.236 + C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0];
1.237 +}
1.238 +
1.239 +#define MulAcc(x, y) \
1.240 + p = (dword)A[x] * B[y] + c; \
1.241 + c = LOW_WORD(p); \
1.242 + p = (dword)d + HIGH_WORD(p); \
1.243 + d = LOW_WORD(p); \
1.244 + e += HIGH_WORD(p);
1.245 +
1.246 +#define SaveMulAcc(s, x, y) \
1.247 + R[s] = c; \
1.248 + p = (dword)A[x] * B[y] + d; \
1.249 + c = LOW_WORD(p); \
1.250 + p = (dword)e + HIGH_WORD(p); \
1.251 + d = LOW_WORD(p); \
1.252 + e = HIGH_WORD(p);
1.253 +
1.254 +#define MulAcc1(x, y) \
1.255 + p = (dword)A[x] * A[y] + c; \
1.256 + c = LOW_WORD(p); \
1.257 + p = (dword)d + HIGH_WORD(p); \
1.258 + d = LOW_WORD(p); \
1.259 + e += HIGH_WORD(p);
1.260 +
1.261 +#define SaveMulAcc1(s, x, y) \
1.262 + R[s] = c; \
1.263 + p = (dword)A[x] * A[y] + d; \
1.264 + c = LOW_WORD(p); \
1.265 + p = (dword)e + HIGH_WORD(p); \
1.266 + d = LOW_WORD(p); \
1.267 + e = HIGH_WORD(p);
1.268 +
1.269 +#define SquAcc(x, y) \
1.270 + p = (dword)A[x] * A[y]; \
1.271 + p = p + p + c; \
1.272 + c = LOW_WORD(p); \
1.273 + p = (dword)d + HIGH_WORD(p); \
1.274 + d = LOW_WORD(p); \
1.275 + e += HIGH_WORD(p);
1.276 +
1.277 +#define SaveSquAcc(s, x, y) \
1.278 + R[s] = c; \
1.279 + p = (dword)A[x] * A[y]; \
1.280 + p = p + p + d; \
1.281 + c = LOW_WORD(p); \
1.282 + p = (dword)e + HIGH_WORD(p); \
1.283 + d = LOW_WORD(p); \
1.284 + e = HIGH_WORD(p);
1.285 +
1.286 +// VC60 workaround: MSVC 6.0 has an optimization problem that makes
1.287 +// (dword)A*B where either A or B has been cast to a dword before
1.288 +// very expensive. Revisit a CombaSquare4() function when this
1.289 +// problem is fixed.
1.290 +
1.291 +// WARNING: KeithR. 05/08/03 This routine doesn't work with gcc on hardware
1.292 +// either. I've completely removed it. It may be worth looking into sometime
1.293 +// in the future.
1.294 +/*#ifndef __WINS__
1.295 +static void CombaSquare4(word *R, const word *A)
1.296 +{
1.297 + dword p;
1.298 + word c, d, e;
1.299 +
1.300 + p = (dword)A[0] * A[0];
1.301 + R[0] = LOW_WORD(p);
1.302 + c = HIGH_WORD(p);
1.303 + d = e = 0;
1.304 +
1.305 + SquAcc(0, 1);
1.306 +
1.307 + SaveSquAcc(1, 2, 0);
1.308 + MulAcc1(1, 1);
1.309 +
1.310 + SaveSquAcc(2, 0, 3);
1.311 + SquAcc(1, 2);
1.312 +
1.313 + SaveSquAcc(3, 3, 1);
1.314 + MulAcc1(2, 2);
1.315 +
1.316 + SaveSquAcc(4, 2, 3);
1.317 +
1.318 + R[5] = c;
1.319 + p = (dword)A[3] * A[3] + d;
1.320 + R[6] = LOW_WORD(p);
1.321 + R[7] = e + HIGH_WORD(p);
1.322 +}
1.323 +#endif */
1.324 +
1.325 +static void CombaMultiply4(word *R, const word *A, const word *B)
1.326 +{
1.327 + dword p;
1.328 + word c, d, e;
1.329 +
1.330 + p = (dword)A[0] * B[0];
1.331 + R[0] = LOW_WORD(p);
1.332 + c = HIGH_WORD(p);
1.333 + d = e = 0;
1.334 +
1.335 + MulAcc(0, 1);
1.336 + MulAcc(1, 0);
1.337 +
1.338 + SaveMulAcc(1, 2, 0);
1.339 + MulAcc(1, 1);
1.340 + MulAcc(0, 2);
1.341 +
1.342 + SaveMulAcc(2, 0, 3);
1.343 + MulAcc(1, 2);
1.344 + MulAcc(2, 1);
1.345 + MulAcc(3, 0);
1.346 +
1.347 + SaveMulAcc(3, 3, 1);
1.348 + MulAcc(2, 2);
1.349 + MulAcc(1, 3);
1.350 +
1.351 + SaveMulAcc(4, 2, 3);
1.352 + MulAcc(3, 2);
1.353 +
1.354 + R[5] = c;
1.355 + p = (dword)A[3] * B[3] + d;
1.356 + R[6] = LOW_WORD(p);
1.357 + R[7] = e + HIGH_WORD(p);
1.358 +}
1.359 +
1.360 +static void CombaMultiply8(word *R, const word *A, const word *B)
1.361 +{
1.362 + dword p;
1.363 + word c, d, e;
1.364 +
1.365 + p = (dword)A[0] * B[0];
1.366 + R[0] = LOW_WORD(p);
1.367 + c = HIGH_WORD(p);
1.368 + d = e = 0;
1.369 +
1.370 + MulAcc(0, 1);
1.371 + MulAcc(1, 0);
1.372 +
1.373 + SaveMulAcc(1, 2, 0);
1.374 + MulAcc(1, 1);
1.375 + MulAcc(0, 2);
1.376 +
1.377 + SaveMulAcc(2, 0, 3);
1.378 + MulAcc(1, 2);
1.379 + MulAcc(2, 1);
1.380 + MulAcc(3, 0);
1.381 +
1.382 + SaveMulAcc(3, 0, 4);
1.383 + MulAcc(1, 3);
1.384 + MulAcc(2, 2);
1.385 + MulAcc(3, 1);
1.386 + MulAcc(4, 0);
1.387 +
1.388 + SaveMulAcc(4, 0, 5);
1.389 + MulAcc(1, 4);
1.390 + MulAcc(2, 3);
1.391 + MulAcc(3, 2);
1.392 + MulAcc(4, 1);
1.393 + MulAcc(5, 0);
1.394 +
1.395 + SaveMulAcc(5, 0, 6);
1.396 + MulAcc(1, 5);
1.397 + MulAcc(2, 4);
1.398 + MulAcc(3, 3);
1.399 + MulAcc(4, 2);
1.400 + MulAcc(5, 1);
1.401 + MulAcc(6, 0);
1.402 +
1.403 + SaveMulAcc(6, 0, 7);
1.404 + MulAcc(1, 6);
1.405 + MulAcc(2, 5);
1.406 + MulAcc(3, 4);
1.407 + MulAcc(4, 3);
1.408 + MulAcc(5, 2);
1.409 + MulAcc(6, 1);
1.410 + MulAcc(7, 0);
1.411 +
1.412 + SaveMulAcc(7, 1, 7);
1.413 + MulAcc(2, 6);
1.414 + MulAcc(3, 5);
1.415 + MulAcc(4, 4);
1.416 + MulAcc(5, 3);
1.417 + MulAcc(6, 2);
1.418 + MulAcc(7, 1);
1.419 +
1.420 + SaveMulAcc(8, 2, 7);
1.421 + MulAcc(3, 6);
1.422 + MulAcc(4, 5);
1.423 + MulAcc(5, 4);
1.424 + MulAcc(6, 3);
1.425 + MulAcc(7, 2);
1.426 +
1.427 + SaveMulAcc(9, 3, 7);
1.428 + MulAcc(4, 6);
1.429 + MulAcc(5, 5);
1.430 + MulAcc(6, 4);
1.431 + MulAcc(7, 3);
1.432 +
1.433 + SaveMulAcc(10, 4, 7);
1.434 + MulAcc(5, 6);
1.435 + MulAcc(6, 5);
1.436 + MulAcc(7, 4);
1.437 +
1.438 + SaveMulAcc(11, 5, 7);
1.439 + MulAcc(6, 6);
1.440 + MulAcc(7, 5);
1.441 +
1.442 + SaveMulAcc(12, 6, 7);
1.443 + MulAcc(7, 6);
1.444 +
1.445 + R[13] = c;
1.446 + p = (dword)A[7] * B[7] + d;
1.447 + R[14] = LOW_WORD(p);
1.448 + R[15] = e + HIGH_WORD(p);
1.449 +}
1.450 +
1.451 +static void CombaMultiplyBottom4(word *R, const word *A, const word *B)
1.452 +{
1.453 + dword p;
1.454 + word c, d, e;
1.455 +
1.456 + p = (dword)A[0] * B[0];
1.457 + R[0] = LOW_WORD(p);
1.458 + c = HIGH_WORD(p);
1.459 + d = e = 0;
1.460 +
1.461 + MulAcc(0, 1);
1.462 + MulAcc(1, 0);
1.463 +
1.464 + SaveMulAcc(1, 2, 0);
1.465 + MulAcc(1, 1);
1.466 + MulAcc(0, 2);
1.467 +
1.468 + R[2] = c;
1.469 + R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
1.470 +}
1.471 +
1.472 +static void CombaMultiplyBottom8(word *R, const word *A, const word *B)
1.473 +{
1.474 + dword p;
1.475 + word c, d, e;
1.476 +
1.477 + p = (dword)A[0] * B[0];
1.478 + R[0] = LOW_WORD(p);
1.479 + c = HIGH_WORD(p);
1.480 + d = e = 0;
1.481 +
1.482 + MulAcc(0, 1);
1.483 + MulAcc(1, 0);
1.484 +
1.485 + SaveMulAcc(1, 2, 0);
1.486 + MulAcc(1, 1);
1.487 + MulAcc(0, 2);
1.488 +
1.489 + SaveMulAcc(2, 0, 3);
1.490 + MulAcc(1, 2);
1.491 + MulAcc(2, 1);
1.492 + MulAcc(3, 0);
1.493 +
1.494 + SaveMulAcc(3, 0, 4);
1.495 + MulAcc(1, 3);
1.496 + MulAcc(2, 2);
1.497 + MulAcc(3, 1);
1.498 + MulAcc(4, 0);
1.499 +
1.500 + SaveMulAcc(4, 0, 5);
1.501 + MulAcc(1, 4);
1.502 + MulAcc(2, 3);
1.503 + MulAcc(3, 2);
1.504 + MulAcc(4, 1);
1.505 + MulAcc(5, 0);
1.506 +
1.507 + SaveMulAcc(5, 0, 6);
1.508 + MulAcc(1, 5);
1.509 + MulAcc(2, 4);
1.510 + MulAcc(3, 3);
1.511 + MulAcc(4, 2);
1.512 + MulAcc(5, 1);
1.513 + MulAcc(6, 0);
1.514 +
1.515 + R[6] = c;
1.516 + R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
1.517 + A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
1.518 +}
1.519 +
1.520 +#undef MulAcc
1.521 +#undef SaveMulAcc
1.522 +static void AtomicInverseModPower2(word *C, word A0, word A1)
1.523 +{
1.524 + assert(A0%2==1);
1.525 +
1.526 + dword A=MAKE_DWORD(A0, A1), R=A0%8;
1.527 +
1.528 + for (unsigned i=3; i<2*WORD_BITS; i*=2)
1.529 + R = R*(2-R*A);
1.530 +
1.531 + assert(R*A==1);
1.532 +
1.533 + C[0] = LOW_WORD(R);
1.534 + C[1] = HIGH_WORD(R);
1.535 +}
1.536 +// ********************************************************
1.537 +
1.538 +#define A0 A
1.539 +#define A1 (A+N2)
1.540 +#define B0 B
1.541 +#define B1 (B+N2)
1.542 +
1.543 +#define T0 T
1.544 +#define T1 (T+N2)
1.545 +#define T2 (T+N)
1.546 +#define T3 (T+N+N2)
1.547 +
1.548 +#define R0 R
1.549 +#define R1 (R+N2)
1.550 +#define R2 (R+N)
1.551 +#define R3 (R+N+N2)
1.552 +
1.553 +// R[2*N] - result = A*B
1.554 +// T[2*N] - temporary work space
1.555 +// A[N] --- multiplier
1.556 +// B[N] --- multiplicant
1.557 +
1.558 +void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N)
1.559 +{
1.560 + assert(N>=2 && N%2==0);
1.561 +
1.562 + if (N==2)
1.563 + AtomicMultiply(R, A, B);
1.564 + else if (N==4)
1.565 + CombaMultiply4(R, A, B);
1.566 + else if (N==8)
1.567 + CombaMultiply8(R, A, B);
1.568 + else
1.569 + {
1.570 + const unsigned int N2 = N/2;
1.571 + int carry;
1.572 +
1.573 + int aComp = Compare(A0, A1, N2);
1.574 + int bComp = Compare(B0, B1, N2);
1.575 +
1.576 + switch (2*aComp + aComp + bComp)
1.577 + {
1.578 + case -4:
1.579 + Subtract(R0, A1, A0, N2);
1.580 + Subtract(R1, B0, B1, N2);
1.581 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.582 + Subtract(T1, T1, R0, N2);
1.583 + carry = -1;
1.584 + break;
1.585 + case -2:
1.586 + Subtract(R0, A1, A0, N2);
1.587 + Subtract(R1, B0, B1, N2);
1.588 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.589 + carry = 0;
1.590 + break;
1.591 + case 2:
1.592 + Subtract(R0, A0, A1, N2);
1.593 + Subtract(R1, B1, B0, N2);
1.594 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.595 + carry = 0;
1.596 + break;
1.597 + case 4:
1.598 + Subtract(R0, A1, A0, N2);
1.599 + Subtract(R1, B0, B1, N2);
1.600 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.601 + Subtract(T1, T1, R1, N2);
1.602 + carry = -1;
1.603 + break;
1.604 + default:
1.605 + SetWords(T0, 0, N);
1.606 + carry = 0;
1.607 + }
1.608 +
1.609 + RecursiveMultiply(R0, T2, A0, B0, N2);
1.610 + RecursiveMultiply(R2, T2, A1, B1, N2);
1.611 +
1.612 + // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
1.613 +
1.614 + carry += Add(T0, T0, R0, N);
1.615 + carry += Add(T0, T0, R2, N);
1.616 + carry += Add(R1, R1, T0, N);
1.617 +
1.618 + assert (carry >= 0 && carry <= 2);
1.619 + Increment(R3, N2, carry);
1.620 + }
1.621 +}
1.622 +
1.623 +// R[2*N] - result = A*A
1.624 +// T[2*N] - temporary work space
1.625 +// A[N] --- number to be squared
1.626 +
1.627 +void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
1.628 +{
1.629 + assert(N && N%2==0);
1.630 +
1.631 + if (N==2)
1.632 + AtomicMultiply(R, A, A);
1.633 + else if (N==4)
1.634 + {
1.635 + // VC60 workaround: MSVC 6.0 has an optimization problem that makes
1.636 + // (dword)A*B where either A or B has been cast to a dword before
1.637 + // very expensive. Revisit a CombaSquare4() function when this
1.638 + // problem is fixed.
1.639 +
1.640 +// WARNING: KeithR. 05/08/03 This routine doesn't work with gcc on hardware
1.641 +// either. I've completely removed it. It may be worth looking into sometime
1.642 +// in the future. Therefore, we use the CombaMultiply4 on all targets.
1.643 +//#ifdef __WINS__
1.644 + CombaMultiply4(R, A, A);
1.645 +/*#else
1.646 + CombaSquare4(R, A);
1.647 +#endif*/
1.648 + }
1.649 + else
1.650 + {
1.651 + const unsigned int N2 = N/2;
1.652 +
1.653 + RecursiveSquare(R0, T2, A0, N2);
1.654 + RecursiveSquare(R2, T2, A1, N2);
1.655 + RecursiveMultiply(T0, T2, A0, A1, N2);
1.656 +
1.657 + word carry = Add(R1, R1, T0, N);
1.658 + carry += Add(R1, R1, T0, N);
1.659 + Increment(R3, N2, carry);
1.660 + }
1.661 +}
1.662 +// R[N] - bottom half of A*B
1.663 +// T[N] - temporary work space
1.664 +// A[N] - multiplier
1.665 +// B[N] - multiplicant
1.666 +
1.667 +void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
1.668 +{
1.669 + assert(N>=2 && N%2==0);
1.670 +
1.671 + if (N==2)
1.672 + AtomicMultiplyBottom(R, A, B);
1.673 + else if (N==4)
1.674 + CombaMultiplyBottom4(R, A, B);
1.675 + else if (N==8)
1.676 + CombaMultiplyBottom8(R, A, B);
1.677 + else
1.678 + {
1.679 + const unsigned int N2 = N/2;
1.680 +
1.681 + RecursiveMultiply(R, T, A0, B0, N2);
1.682 + RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
1.683 + Add(R1, R1, T0, N2);
1.684 + RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
1.685 + Add(R1, R1, T0, N2);
1.686 + }
1.687 +}
1.688 +
1.689 +// R[N] --- upper half of A*B
1.690 +// T[2*N] - temporary work space
1.691 +// L[N] --- lower half of A*B
1.692 +// A[N] --- multiplier
1.693 +// B[N] --- multiplicant
1.694 +
1.695 +void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
1.696 +{
1.697 + assert(N>=2 && N%2==0);
1.698 +
1.699 + if (N==2)
1.700 + {
1.701 + AtomicMultiply(T, A, B);
1.702 + ((dword *)R)[0] = ((dword *)T)[1];
1.703 + }
1.704 + else if (N==4)
1.705 + {
1.706 + CombaMultiply4(T, A, B);
1.707 + ((dword *)R)[0] = ((dword *)T)[2];
1.708 + ((dword *)R)[1] = ((dword *)T)[3];
1.709 + }
1.710 + else
1.711 + {
1.712 + const unsigned int N2 = N/2;
1.713 + int carry;
1.714 +
1.715 + int aComp = Compare(A0, A1, N2);
1.716 + int bComp = Compare(B0, B1, N2);
1.717 +
1.718 + switch (2*aComp + aComp + bComp)
1.719 + {
1.720 + case -4:
1.721 + Subtract(R0, A1, A0, N2);
1.722 + Subtract(R1, B0, B1, N2);
1.723 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.724 + Subtract(T1, T1, R0, N2);
1.725 + carry = -1;
1.726 + break;
1.727 + case -2:
1.728 + Subtract(R0, A1, A0, N2);
1.729 + Subtract(R1, B0, B1, N2);
1.730 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.731 + carry = 0;
1.732 + break;
1.733 + case 2:
1.734 + Subtract(R0, A0, A1, N2);
1.735 + Subtract(R1, B1, B0, N2);
1.736 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.737 + carry = 0;
1.738 + break;
1.739 + case 4:
1.740 + Subtract(R0, A1, A0, N2);
1.741 + Subtract(R1, B0, B1, N2);
1.742 + RecursiveMultiply(T0, T2, R0, R1, N2);
1.743 + Subtract(T1, T1, R1, N2);
1.744 + carry = -1;
1.745 + break;
1.746 + default:
1.747 + SetWords(T0, 0, N);
1.748 + carry = 0;
1.749 + }
1.750 +
1.751 + RecursiveMultiply(T2, R0, A1, B1, N2);
1.752 +
1.753 + // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
1.754 +
1.755 + CopyWords(R0, L+N2, N2);
1.756 + word c2 = Subtract(R0, R0, L, N2);
1.757 + c2 += Subtract(R0, R0, T0, N2);
1.758 + word t = (Compare(R0, T2, N2) == -1);
1.759 +
1.760 + carry += t;
1.761 + carry += Increment(R0, N2, c2+t);
1.762 + carry += Add(R0, R0, T1, N2);
1.763 + carry += Add(R0, R0, T3, N2);
1.764 +
1.765 + CopyWords(R1, T3, N2);
1.766 + assert (carry >= 0 && carry <= 2);
1.767 + Increment(R1, N2, carry);
1.768 + }
1.769 +}
1.770 +
1.771 +// R[NA+NB] - result = A*B
1.772 +// T[NA+NB] - temporary work space
1.773 +// A[NA] ---- multiplier
1.774 +// B[NB] ---- multiplicant
1.775 +
1.776 +void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
1.777 +{
1.778 + if (NA == NB)
1.779 + {
1.780 + if (A == B)
1.781 + RecursiveSquare(R, T, A, NA);
1.782 + else
1.783 + RecursiveMultiply(R, T, A, B, NA);
1.784 +
1.785 + return;
1.786 + }
1.787 +
1.788 + if (NA > NB)
1.789 + {
1.790 + TClassSwap(A, B);
1.791 + TClassSwap(NA, NB);
1.792 + //std::swap(A, B);
1.793 + //std::swap(NA, NB);
1.794 + }
1.795 +
1.796 + assert(NB % NA == 0);
1.797 + assert((NB/NA)%2 == 0); // NB is an even multiple of NA
1.798 +
1.799 + if (NA==2 && !A[1])
1.800 + {
1.801 + switch (A[0])
1.802 + {
1.803 + case 0:
1.804 + SetWords(R, 0, NB+2);
1.805 + return;
1.806 + case 1:
1.807 + CopyWords(R, B, NB);
1.808 + R[NB] = R[NB+1] = 0;
1.809 + return;
1.810 + default:
1.811 + R[NB] = LinearMultiply(R, B, A[0], NB);
1.812 + R[NB+1] = 0;
1.813 + return;
1.814 + }
1.815 + }
1.816 +
1.817 + RecursiveMultiply(R, T, A, B, NA);
1.818 + CopyWords(T+2*NA, R+NA, NA);
1.819 +
1.820 + unsigned i;
1.821 +
1.822 + for (i=2*NA; i<NB; i+=2*NA)
1.823 + RecursiveMultiply(T+NA+i, T, A, B+i, NA);
1.824 + for (i=NA; i<NB; i+=2*NA)
1.825 + RecursiveMultiply(R+i, T, A, B+i, NA);
1.826 +
1.827 + if (Add(R+NA, R+NA, T+2*NA, NB-NA))
1.828 + Increment(R+NB, NA);
1.829 +}
1.830 +// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
1.831 +// T[3*N/2] - temporary work space
1.832 +// A[N] ----- an odd number as input
1.833 +
1.834 +void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
1.835 +{
1.836 + if (N==2)
1.837 + AtomicInverseModPower2(R, A[0], A[1]);
1.838 + else
1.839 + {
1.840 + const unsigned int N2 = N/2;
1.841 + RecursiveInverseModPower2(R0, T0, A0, N2);
1.842 + T0[0] = 1;
1.843 + SetWords(T0+1, 0, N2-1);
1.844 + RecursiveMultiplyTop(R1, T1, T0, R0, A0, N2);
1.845 + RecursiveMultiplyBottom(T0, T1, R0, A1, N2);
1.846 + Add(T0, R1, T0, N2);
1.847 + TwosComplement(T0, N2);
1.848 + RecursiveMultiplyBottom(R1, T1, R0, T0, N2);
1.849 + }
1.850 +}
1.851 +#undef A0
1.852 +#undef A1
1.853 +#undef B0
1.854 +#undef B1
1.855 +
1.856 +#undef T0
1.857 +#undef T1
1.858 +#undef T2
1.859 +#undef T3
1.860 +
1.861 +#undef R0
1.862 +#undef R1
1.863 +#undef R2
1.864 +#undef R3
1.865 +
1.866 +// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
1.867 +// T[3*N] - temporary work space
1.868 +// X[2*N] - number to be reduced
1.869 +// M[N] --- modulus
1.870 +// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
1.871 +
1.872 +void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N)
1.873 +{
1.874 + RecursiveMultiplyBottom(R, T, X, U, N);
1.875 + RecursiveMultiplyTop(T, T+N, X, R, M, N);
1.876 + if (Subtract(R, X+N, T, N))
1.877 + {
1.878 +#ifdef _DEBUG
1.879 + word carry = Add(R, R, M, N);
1.880 + assert(carry);
1.881 +#else
1.882 + Add(R, R, M, N);
1.883 +#endif
1.884 + }
1.885 +}
1.886 +
1.887 +// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
1.888 +static word SubatomicDivide(word *A, word B0, word B1)
1.889 +{
1.890 + // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
1.891 + assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
1.892 +
1.893 + dword p, u;
1.894 + word Q;
1.895 +
1.896 + // estimate the quotient: do a 2 word by 1 word divide
1.897 + if (B1+1 == 0)
1.898 + Q = A[2];
1.899 + else
1.900 + Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1));
1.901 +
1.902 + // now subtract Q*B from A
1.903 + p = (dword) B0*Q;
1.904 + u = (dword) A[0] - LOW_WORD(p);
1.905 + A[0] = LOW_WORD(u);
1.906 + u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q;
1.907 + A[1] = LOW_WORD(u);
1.908 + A[2] += HIGH_WORD(u);
1.909 +
1.910 + // Q <= actual quotient, so fix it
1.911 + while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
1.912 + {
1.913 + u = (dword) A[0] - B0;
1.914 + A[0] = LOW_WORD(u);
1.915 + u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u));
1.916 + A[1] = LOW_WORD(u);
1.917 + A[2] += HIGH_WORD(u);
1.918 + Q++;
1.919 + assert(Q); // shouldn't overflow
1.920 + }
1.921 +
1.922 + return Q;
1.923 +}
1.924 +
1.925 +// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
1.926 +static inline void AtomicDivide(word *Q, const word *A, const word *B)
1.927 +{
1.928 + if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
1.929 + {
1.930 + Q[0] = A[2];
1.931 + Q[1] = A[3];
1.932 + }
1.933 + else
1.934 + {
1.935 + word T[4];
1.936 + T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
1.937 + Q[1] = SubatomicDivide(T+1, B[0], B[1]);
1.938 + Q[0] = SubatomicDivide(T, B[0], B[1]);
1.939 +
1.940 +#ifdef _DEBUG
1.941 + // multiply quotient and divisor and add remainder, make sure it equals dividend
1.942 + assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
1.943 + word P[4];
1.944 + AtomicMultiply(P, Q, B);
1.945 + Add(P, P, T, 4);
1.946 + assert(Mem::Compare((TUint8*)P, 4*WORD_SIZE, (TUint8*)A, 4*WORD_SIZE)==0);
1.947 +#endif
1.948 + }
1.949 +}
1.950 +
1.951 +// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
1.952 +static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)
1.953 +{
1.954 + assert(N && N%2==0);
1.955 +
1.956 + if (Q[1])
1.957 + {
1.958 + T[N] = T[N+1] = 0;
1.959 + unsigned i;
1.960 + for (i=0; i<N; i+=4)
1.961 + AtomicMultiply(T+i, Q, B+i);
1.962 + for (i=2; i<N; i+=4)
1.963 + if (AtomicMultiplyAdd(T+i, Q, B+i))
1.964 + T[i+5] += (++T[i+4]==0);
1.965 + }
1.966 + else
1.967 + {
1.968 + T[N] = LinearMultiply(T, B, Q[0], N);
1.969 + T[N+1] = 0;
1.970 + }
1.971 +
1.972 +#ifdef _DEBUG
1.973 + word borrow = Subtract(R, R, T, N+2);
1.974 + assert(!borrow && !R[N+1]);
1.975 +#else
1.976 + Subtract(R, R, T, N+2);
1.977 +#endif
1.978 +
1.979 + while (R[N] || Compare(R, B, N) >= 0)
1.980 + {
1.981 + R[N] -= Subtract(R, R, B, N);
1.982 + Q[1] += (++Q[0]==0);
1.983 + assert(Q[0] || Q[1]); // no overflow
1.984 + }
1.985 +}
1.986 +
1.987 +// R[NB] -------- remainder = A%B
1.988 +// Q[NA-NB+2] --- quotient = A/B
1.989 +// T[NA+2*NB+4] - temp work space
1.990 +// A[NA] -------- dividend
1.991 +// B[NB] -------- divisor
1.992 +
1.993 +void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
1.994 +{
1.995 + assert(NA && NB && NA%2==0 && NB%2==0);
1.996 + assert(B[NB-1] || B[NB-2]);
1.997 + assert(NB <= NA);
1.998 +
1.999 + // set up temporary work space
1.1000 + word *const TA=T;
1.1001 + word *const TB=T+NA+2;
1.1002 + word *const TP=T+NA+2+NB;
1.1003 +
1.1004 + // copy B into TB and normalize it so that TB has highest bit set to 1
1.1005 + unsigned shiftWords = (B[NB-1]==0);
1.1006 + TB[0] = TB[NB-1] = 0;
1.1007 + CopyWords(TB+shiftWords, B, NB-shiftWords);
1.1008 + unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
1.1009 + assert(shiftBits < WORD_BITS);
1.1010 + ShiftWordsLeftByBits(TB, NB, shiftBits);
1.1011 +
1.1012 + // copy A into TA and normalize it
1.1013 + TA[0] = TA[NA] = TA[NA+1] = 0;
1.1014 + CopyWords(TA+shiftWords, A, NA);
1.1015 + ShiftWordsLeftByBits(TA, NA+2, shiftBits);
1.1016 +
1.1017 + if (TA[NA+1]==0 && TA[NA] <= 1)
1.1018 + {
1.1019 + Q[NA-NB+1] = Q[NA-NB] = 0;
1.1020 + while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
1.1021 + {
1.1022 + TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
1.1023 + ++Q[NA-NB];
1.1024 + }
1.1025 + }
1.1026 + else
1.1027 + {
1.1028 + NA+=2;
1.1029 + assert(Compare(TA+NA-NB, TB, NB) < 0);
1.1030 + }
1.1031 +
1.1032 + word BT[2];
1.1033 + BT[0] = TB[NB-2] + 1;
1.1034 + BT[1] = TB[NB-1] + (BT[0]==0);
1.1035 +
1.1036 + // start reducing TA mod TB, 2 words at a time
1.1037 + for (unsigned i=NA-2; i>=NB; i-=2)
1.1038 + {
1.1039 + AtomicDivide(Q+i-NB, TA+i-2, BT);
1.1040 + CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
1.1041 + }
1.1042 +
1.1043 + // copy TA into R, and denormalize it
1.1044 + CopyWords(R, TA+shiftWords, NB);
1.1045 + ShiftWordsRightByBits(R, NB, shiftBits);
1.1046 +}
1.1047 +
1.1048 +static inline unsigned int EvenWordCount(const word *X, unsigned int N)
1.1049 +{
1.1050 + while (N && X[N-2]==0 && X[N-1]==0)
1.1051 + N-=2;
1.1052 + return N;
1.1053 +}
1.1054 +
1.1055 +// return k
1.1056 +// R[N] --- result = A^(-1) * 2^k mod M
1.1057 +// T[4*N] - temporary work space
1.1058 +// A[NA] -- number to take inverse of
1.1059 +// M[N] --- modulus
1.1060 +
1.1061 +unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N)
1.1062 +{
1.1063 + assert(NA<=N && N && N%2==0);
1.1064 +
1.1065 + word *b = T;
1.1066 + word *c = T+N;
1.1067 + word *f = T+2*N;
1.1068 + word *g = T+3*N;
1.1069 + unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
1.1070 + unsigned int k=0, s=0;
1.1071 +
1.1072 + SetWords(T, 0, 3*N);
1.1073 + b[0]=1;
1.1074 + CopyWords(f, A, NA);
1.1075 + CopyWords(g, M, N);
1.1076 +
1.1077 + FOREVER
1.1078 + {
1.1079 + word t=f[0];
1.1080 + while (!t)
1.1081 + {
1.1082 + if (EvenWordCount(f, fgLen)==0)
1.1083 + {
1.1084 + SetWords(R, 0, N);
1.1085 + return 0;
1.1086 + }
1.1087 +
1.1088 + ShiftWordsRightByWords(f, fgLen, 1);
1.1089 + if (c[bcLen-1]) bcLen+=2;
1.1090 + assert(bcLen <= N);
1.1091 + ShiftWordsLeftByWords(c, bcLen, 1);
1.1092 + k+=WORD_BITS;
1.1093 + t=f[0];
1.1094 + }
1.1095 +
1.1096 + unsigned int i=0;
1.1097 + while (t%2 == 0)
1.1098 + {
1.1099 + t>>=1;
1.1100 + i++;
1.1101 + }
1.1102 + k+=i;
1.1103 +
1.1104 + if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
1.1105 + {
1.1106 + if (s%2==0)
1.1107 + CopyWords(R, b, N);
1.1108 + else
1.1109 + Subtract(R, M, b, N);
1.1110 + return k;
1.1111 + }
1.1112 +
1.1113 + ShiftWordsRightByBits(f, fgLen, i);
1.1114 + t=ShiftWordsLeftByBits(c, bcLen, i);
1.1115 + if (t)
1.1116 + {
1.1117 + c[bcLen] = t;
1.1118 + bcLen+=2;
1.1119 + assert(bcLen <= N);
1.1120 + }
1.1121 +
1.1122 + if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
1.1123 + fgLen-=2;
1.1124 +
1.1125 + if (Compare(f, g, fgLen)==-1)
1.1126 + {
1.1127 + TClassSwap<word*>(f,g);
1.1128 + TClassSwap<word*>(b,c);
1.1129 + s++;
1.1130 + }
1.1131 +
1.1132 + Subtract(f, f, g, fgLen);
1.1133 +
1.1134 + if (Add(b, b, c, bcLen))
1.1135 + {
1.1136 + b[bcLen] = 1;
1.1137 + bcLen+=2;
1.1138 + assert(bcLen <= N);
1.1139 + }
1.1140 + }
1.1141 +}
1.1142 +
1.1143 +// R[N] - result = A/(2^k) mod M
1.1144 +// A[N] - input
1.1145 +// M[N] - modulus
1.1146 +
1.1147 +void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
1.1148 +{
1.1149 + CopyWords(R, A, N);
1.1150 +
1.1151 + while (k--)
1.1152 + {
1.1153 + if (R[0]%2==0)
1.1154 + ShiftWordsRightByBits(R, N, 1);
1.1155 + else
1.1156 + {
1.1157 + word carry = Add(R, R, M, N);
1.1158 + ShiftWordsRightByBits(R, N, 1);
1.1159 + R[N-1] += carry<<(WORD_BITS-1);
1.1160 + }
1.1161 + }
1.1162 +}
1.1163 +