os/security/crypto/weakcryptospi/source/bigint/algorithms.cpp
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/security/crypto/weakcryptospi/source/bigint/algorithms.cpp	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,1160 @@
     1.4 +/*
     1.5 +* Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +* All rights reserved.
     1.7 +* This component and the accompanying materials are made available
     1.8 +* under the terms of the License "Eclipse Public License v1.0"
     1.9 +* which accompanies this distribution, and is available
    1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +*
    1.12 +* Initial Contributors:
    1.13 +* Nokia Corporation - initial contribution.
    1.14 +*
    1.15 +* Contributors:
    1.16 +*
    1.17 +* Description: 
    1.18 +*
    1.19 +*/
    1.20 +
    1.21 +
    1.22 +#include "words.h"
    1.23 +#include "algorithms.h"
    1.24 +
    1.25 +word Add(word *C, const word *A, const word *B, unsigned int N)
    1.26 +{
    1.27 +	assert (N%2 == 0);
    1.28 +	word carry = 0;
    1.29 +	for (unsigned int i = 0; i < N; i+=2)
    1.30 +	{
    1.31 +		dword u = (dword) carry + A[i] + B[i];
    1.32 +		C[i] = LOW_WORD(u);
    1.33 +		u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1];
    1.34 +		C[i+1] = LOW_WORD(u);
    1.35 +		carry = HIGH_WORD(u);
    1.36 +	}
    1.37 +	return carry;
    1.38 +}
    1.39 +
    1.40 +word Subtract(word *C, const word *A, const word *B, unsigned int N)
    1.41 +{
    1.42 +	assert (N%2 == 0);
    1.43 +	word borrow=0;
    1.44 +	for (unsigned i = 0; i < N; i+=2)
    1.45 +	{
    1.46 +		dword u = (dword) A[i] - B[i] - borrow;
    1.47 +		C[i] = LOW_WORD(u);
    1.48 +		u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u));
    1.49 +		C[i+1] = LOW_WORD(u);
    1.50 +		borrow = 0-HIGH_WORD(u);
    1.51 +	}
    1.52 +	return borrow;
    1.53 +}
    1.54 +
    1.55 +int Compare(const word *A, const word *B, unsigned int N)
    1.56 +{
    1.57 +	while (N--)
    1.58 +		if (A[N] > B[N])
    1.59 +			return 1;
    1.60 +		else if (A[N] < B[N])
    1.61 +			return -1;
    1.62 +
    1.63 +	return 0;
    1.64 +}
    1.65 +
    1.66 +// It is the job of the calling code to ensure that this won't carry.
    1.67 +// If you aren't sure, use the next version that will tell you if you need to
    1.68 +// grow your integer.
    1.69 +// Having two of these creates ever so slightly more code but avoids having
    1.70 +// ifdefs all over the rest of the code checking the following type stuff which
    1.71 +// causes warnings in certain compilers about unused parameters in release
    1.72 +// builds.  We can't have that can we!
    1.73 +/*
    1.74 +Allows avoid this all over bigint.cpp and primes.cpp
    1.75 +ifdef _DEBUG
    1.76 +	TUint carry = Increment(Ptr(), Size());
    1.77 +	assert(!carry);
    1.78 +else
    1.79 +	Increment(Ptr(), Size())
    1.80 +endif
    1.81 +*/
    1.82 +void IncrementNoCarry(word *A, unsigned int N, word B)
    1.83 +{
    1.84 +	assert(N);
    1.85 +	word t = A[0];
    1.86 +	A[0] = t+B;
    1.87 +	if (A[0] >= t)
    1.88 +		return;
    1.89 +	for (unsigned i=1; i<N; i++)
    1.90 +		if (++A[i])
    1.91 +			return;
    1.92 +	assert(0);
    1.93 +}
    1.94 +
    1.95 +word Increment(word *A, unsigned int N, word B)
    1.96 +{
    1.97 +	assert(N);
    1.98 +	word t = A[0];
    1.99 +	A[0] = t+B;
   1.100 +	if (A[0] >= t)
   1.101 +		return 0;
   1.102 +	for (unsigned i=1; i<N; i++)
   1.103 +		if (++A[i])
   1.104 +			return 0;
   1.105 +	return 1;
   1.106 +}
   1.107 +
   1.108 +//See commments above about IncrementNoCarry
   1.109 +void DecrementNoCarry(word *A, unsigned int N, word B)
   1.110 +{
   1.111 +	assert(N);
   1.112 +	word t = A[0];
   1.113 +	A[0] = t-B;
   1.114 +	if (A[0] <= t)
   1.115 +		return;
   1.116 +	for (unsigned i=1; i<N; i++)
   1.117 +		if (A[i]--)
   1.118 +			return;
   1.119 +	assert(0);
   1.120 +}
   1.121 +
   1.122 +word Decrement(word *A, unsigned int N, word B)
   1.123 +{
   1.124 +	assert(N);
   1.125 +	word t = A[0];
   1.126 +	A[0] = t-B;
   1.127 +	if (A[0] <= t)
   1.128 +		return 0;
   1.129 +	for (unsigned i=1; i<N; i++)
   1.130 +		if (A[i]--)
   1.131 +			return 0;
   1.132 +	return 1;
   1.133 +}
   1.134 +
   1.135 +void TwosComplement(word *A, unsigned int N)
   1.136 +{
   1.137 +	Decrement(A, N);
   1.138 +	for (unsigned i=0; i<N; i++)
   1.139 +		A[i] = ~A[i];
   1.140 +}
   1.141 +
   1.142 +static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
   1.143 +{
   1.144 +	word carry=0;
   1.145 +	for(unsigned i=0; i<N; i++)
   1.146 +	{
   1.147 +		dword p = (dword)A[i] * B + carry;
   1.148 +		C[i] = LOW_WORD(p);
   1.149 +		carry = HIGH_WORD(p);
   1.150 +	}
   1.151 +	return carry;
   1.152 +}
   1.153 +
   1.154 +static void AtomicMultiply(word *C, const word *A, const word *B)
   1.155 +{
   1.156 +/*
   1.157 +	word s;
   1.158 +	dword d;
   1.159 +
   1.160 +	if (A1 >= A0)
   1.161 +		if (B0 >= B1)
   1.162 +		{
   1.163 +			s = 0;
   1.164 +			d = (dword)(A1-A0)*(B0-B1);
   1.165 +		}
   1.166 +		else
   1.167 +		{
   1.168 +			s = (A1-A0);
   1.169 +			d = (dword)s*(word)(B0-B1);
   1.170 +		}
   1.171 +	else
   1.172 +		if (B0 > B1)
   1.173 +		{
   1.174 +			s = (B0-B1);
   1.175 +			d = (word)(A1-A0)*(dword)s;
   1.176 +		}
   1.177 +		else
   1.178 +		{
   1.179 +			s = 0;
   1.180 +			d = (dword)(A0-A1)*(B1-B0);
   1.181 +		}
   1.182 +*/
   1.183 +	// this segment is the branchless equivalent of above
   1.184 +	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
   1.185 +	unsigned int ai = A[1] < A[0];
   1.186 +	unsigned int bi = B[0] < B[1];
   1.187 +	unsigned int di = ai & bi;
   1.188 +	dword d = (dword)D[di]*D[di+2];
   1.189 +	D[1] = D[3] = 0;
   1.190 +	unsigned int si = ai + !bi;
   1.191 +	word s = D[si];
   1.192 +
   1.193 +	dword A0B0 = (dword)A[0]*B[0];
   1.194 +	C[0] = LOW_WORD(A0B0);
   1.195 +
   1.196 +	dword A1B1 = (dword)A[1]*B[1];
   1.197 +	dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1);
   1.198 +	C[1] = LOW_WORD(t);
   1.199 +
   1.200 +	t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s;
   1.201 +	C[2] = LOW_WORD(t);
   1.202 +	C[3] = HIGH_WORD(t);
   1.203 +}
   1.204 +
   1.205 +static word AtomicMultiplyAdd(word *C, const word *A, const word *B)
   1.206 +{
   1.207 +	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
   1.208 +	unsigned int ai = A[1] < A[0];
   1.209 +	unsigned int bi = B[0] < B[1];
   1.210 +	unsigned int di = ai & bi;
   1.211 +	dword d = (dword)D[di]*D[di+2];
   1.212 +	D[1] = D[3] = 0;
   1.213 +	unsigned int si = ai + !bi;
   1.214 +	word s = D[si];
   1.215 +
   1.216 +	dword A0B0 = (dword)A[0]*B[0];
   1.217 +	dword t = A0B0 + C[0];
   1.218 +	C[0] = LOW_WORD(t);
   1.219 +
   1.220 +	dword A1B1 = (dword)A[1]*B[1];
   1.221 +	t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1];
   1.222 +	C[1] = LOW_WORD(t);
   1.223 +
   1.224 +	t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2];
   1.225 +	C[2] = LOW_WORD(t);
   1.226 +
   1.227 +	t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3];
   1.228 +	C[3] = LOW_WORD(t);
   1.229 +	return HIGH_WORD(t);
   1.230 +}
   1.231 +
   1.232 +static inline void AtomicMultiplyBottom(word *C, const word *A, const word *B)
   1.233 +{
   1.234 +	dword t = (dword)A[0]*B[0];
   1.235 +	C[0] = LOW_WORD(t);
   1.236 +	C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0];
   1.237 +}
   1.238 +
   1.239 +#define MulAcc(x, y)								\
   1.240 +	p = (dword)A[x] * B[y] + c; 					\
   1.241 +	c = LOW_WORD(p);								\
   1.242 +	p = (dword)d + HIGH_WORD(p);					\
   1.243 +	d = LOW_WORD(p);								\
   1.244 +	e += HIGH_WORD(p);
   1.245 +
   1.246 +#define SaveMulAcc(s, x, y) 						\
   1.247 +	R[s] = c;										\
   1.248 +	p = (dword)A[x] * B[y] + d; 					\
   1.249 +	c = LOW_WORD(p);								\
   1.250 +	p = (dword)e + HIGH_WORD(p);					\
   1.251 +	d = LOW_WORD(p);								\
   1.252 +	e = HIGH_WORD(p);
   1.253 +
   1.254 +#define MulAcc1(x, y)								\
   1.255 +	p = (dword)A[x] * A[y] + c; 					\
   1.256 +	c = LOW_WORD(p);								\
   1.257 +	p = (dword)d + HIGH_WORD(p);					\
   1.258 +	d = LOW_WORD(p);								\
   1.259 +	e += HIGH_WORD(p);
   1.260 +
   1.261 +#define SaveMulAcc1(s, x, y) 						\
   1.262 +	R[s] = c;										\
   1.263 +	p = (dword)A[x] * A[y] + d; 					\
   1.264 +	c = LOW_WORD(p);								\
   1.265 +	p = (dword)e + HIGH_WORD(p);					\
   1.266 +	d = LOW_WORD(p);								\
   1.267 +	e = HIGH_WORD(p);
   1.268 +
   1.269 +#define SquAcc(x, y)								\
   1.270 +	p = (dword)A[x] * A[y];	\
   1.271 +	p = p + p + c; 					\
   1.272 +	c = LOW_WORD(p);								\
   1.273 +	p = (dword)d + HIGH_WORD(p);					\
   1.274 +	d = LOW_WORD(p);								\
   1.275 +	e += HIGH_WORD(p);
   1.276 +
   1.277 +#define SaveSquAcc(s, x, y) 						\
   1.278 +	R[s] = c;										\
   1.279 +	p = (dword)A[x] * A[y];	\
   1.280 +	p = p + p + d; 					\
   1.281 +	c = LOW_WORD(p);								\
   1.282 +	p = (dword)e + HIGH_WORD(p);					\
   1.283 +	d = LOW_WORD(p);								\
   1.284 +	e = HIGH_WORD(p);
   1.285 +
   1.286 +// VC60 workaround: MSVC 6.0 has an optimization problem that makes
   1.287 +// (dword)A*B where either A or B has been cast to a dword before
   1.288 +// very expensive. Revisit a CombaSquare4() function when this
   1.289 +// problem is fixed.               
   1.290 +
   1.291 +// WARNING: KeithR.  05/08/03 This routine doesn't work with gcc on hardware
   1.292 +// either.  I've completely removed it.  It may be worth looking into sometime
   1.293 +// in the future.
   1.294 +/*#ifndef __WINS__
   1.295 +static void CombaSquare4(word *R, const word *A)
   1.296 +{
   1.297 +	dword p;
   1.298 +	word c, d, e;
   1.299 +
   1.300 +	p = (dword)A[0] * A[0];
   1.301 +	R[0] = LOW_WORD(p);
   1.302 +	c = HIGH_WORD(p);
   1.303 +	d = e = 0;
   1.304 +
   1.305 +	SquAcc(0, 1);
   1.306 +
   1.307 +	SaveSquAcc(1, 2, 0);
   1.308 +	MulAcc1(1, 1);
   1.309 +
   1.310 +	SaveSquAcc(2, 0, 3);
   1.311 +	SquAcc(1, 2);
   1.312 +
   1.313 +	SaveSquAcc(3, 3, 1);
   1.314 +	MulAcc1(2, 2);
   1.315 +
   1.316 +	SaveSquAcc(4, 2, 3);
   1.317 +
   1.318 +	R[5] = c;
   1.319 +	p = (dword)A[3] * A[3] + d;
   1.320 +	R[6] = LOW_WORD(p);
   1.321 +	R[7] = e + HIGH_WORD(p);
   1.322 +}
   1.323 +#endif */
   1.324 +
   1.325 +static void CombaMultiply4(word *R, const word *A, const word *B)
   1.326 +{
   1.327 +	dword p;
   1.328 +	word c, d, e;
   1.329 +
   1.330 +	p = (dword)A[0] * B[0];
   1.331 +	R[0] = LOW_WORD(p);
   1.332 +	c = HIGH_WORD(p);
   1.333 +	d = e = 0;
   1.334 +
   1.335 +	MulAcc(0, 1);
   1.336 +	MulAcc(1, 0);
   1.337 +
   1.338 +	SaveMulAcc(1, 2, 0);
   1.339 +	MulAcc(1, 1);
   1.340 +	MulAcc(0, 2);
   1.341 +
   1.342 +	SaveMulAcc(2, 0, 3);
   1.343 +	MulAcc(1, 2);
   1.344 +	MulAcc(2, 1);
   1.345 +	MulAcc(3, 0);
   1.346 +
   1.347 +	SaveMulAcc(3, 3, 1);
   1.348 +	MulAcc(2, 2);
   1.349 +	MulAcc(1, 3);
   1.350 +
   1.351 +	SaveMulAcc(4, 2, 3);
   1.352 +	MulAcc(3, 2);
   1.353 +
   1.354 +	R[5] = c;
   1.355 +	p = (dword)A[3] * B[3] + d;
   1.356 +	R[6] = LOW_WORD(p);
   1.357 +	R[7] = e + HIGH_WORD(p);
   1.358 +}
   1.359 +
   1.360 +static void CombaMultiply8(word *R, const word *A, const word *B)
   1.361 +{
   1.362 +	dword p;
   1.363 +	word c, d, e;
   1.364 +
   1.365 +	p = (dword)A[0] * B[0];
   1.366 +	R[0] = LOW_WORD(p);
   1.367 +	c = HIGH_WORD(p);
   1.368 +	d = e = 0;
   1.369 +
   1.370 +	MulAcc(0, 1);
   1.371 +	MulAcc(1, 0);
   1.372 +
   1.373 +	SaveMulAcc(1, 2, 0);
   1.374 +	MulAcc(1, 1);
   1.375 +	MulAcc(0, 2);
   1.376 +
   1.377 +	SaveMulAcc(2, 0, 3);
   1.378 +	MulAcc(1, 2);
   1.379 +	MulAcc(2, 1);
   1.380 +	MulAcc(3, 0);
   1.381 +
   1.382 +	SaveMulAcc(3, 0, 4);
   1.383 +	MulAcc(1, 3);
   1.384 +	MulAcc(2, 2);
   1.385 +	MulAcc(3, 1);
   1.386 +	MulAcc(4, 0);
   1.387 +
   1.388 +	SaveMulAcc(4, 0, 5);
   1.389 +	MulAcc(1, 4);
   1.390 +	MulAcc(2, 3);
   1.391 +	MulAcc(3, 2);
   1.392 +	MulAcc(4, 1);
   1.393 +	MulAcc(5, 0);
   1.394 +
   1.395 +	SaveMulAcc(5, 0, 6);
   1.396 +	MulAcc(1, 5);
   1.397 +	MulAcc(2, 4);
   1.398 +	MulAcc(3, 3);
   1.399 +	MulAcc(4, 2);
   1.400 +	MulAcc(5, 1);
   1.401 +	MulAcc(6, 0);
   1.402 +
   1.403 +	SaveMulAcc(6, 0, 7);
   1.404 +	MulAcc(1, 6);
   1.405 +	MulAcc(2, 5);
   1.406 +	MulAcc(3, 4);
   1.407 +	MulAcc(4, 3);
   1.408 +	MulAcc(5, 2);
   1.409 +	MulAcc(6, 1);
   1.410 +	MulAcc(7, 0);
   1.411 +
   1.412 +	SaveMulAcc(7, 1, 7);
   1.413 +	MulAcc(2, 6);
   1.414 +	MulAcc(3, 5);
   1.415 +	MulAcc(4, 4);
   1.416 +	MulAcc(5, 3);
   1.417 +	MulAcc(6, 2);
   1.418 +	MulAcc(7, 1);
   1.419 +
   1.420 +	SaveMulAcc(8, 2, 7);
   1.421 +	MulAcc(3, 6);
   1.422 +	MulAcc(4, 5);
   1.423 +	MulAcc(5, 4);
   1.424 +	MulAcc(6, 3);
   1.425 +	MulAcc(7, 2);
   1.426 +
   1.427 +	SaveMulAcc(9, 3, 7);
   1.428 +	MulAcc(4, 6);
   1.429 +	MulAcc(5, 5);
   1.430 +	MulAcc(6, 4);
   1.431 +	MulAcc(7, 3);
   1.432 +
   1.433 +	SaveMulAcc(10, 4, 7);
   1.434 +	MulAcc(5, 6);
   1.435 +	MulAcc(6, 5);
   1.436 +	MulAcc(7, 4);
   1.437 +
   1.438 +	SaveMulAcc(11, 5, 7);
   1.439 +	MulAcc(6, 6);
   1.440 +	MulAcc(7, 5);
   1.441 +
   1.442 +	SaveMulAcc(12, 6, 7);
   1.443 +	MulAcc(7, 6);
   1.444 +
   1.445 +	R[13] = c;
   1.446 +	p = (dword)A[7] * B[7] + d;
   1.447 +	R[14] = LOW_WORD(p);
   1.448 +	R[15] = e + HIGH_WORD(p);
   1.449 +}
   1.450 +
   1.451 +static void CombaMultiplyBottom4(word *R, const word *A, const word *B)
   1.452 +{
   1.453 +	dword p;
   1.454 +	word c, d, e;
   1.455 +
   1.456 +	p = (dword)A[0] * B[0];
   1.457 +	R[0] = LOW_WORD(p);
   1.458 +	c = HIGH_WORD(p);
   1.459 +	d = e = 0;
   1.460 +
   1.461 +	MulAcc(0, 1);
   1.462 +	MulAcc(1, 0);
   1.463 +
   1.464 +	SaveMulAcc(1, 2, 0);
   1.465 +	MulAcc(1, 1);
   1.466 +	MulAcc(0, 2);
   1.467 +
   1.468 +	R[2] = c;
   1.469 +	R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
   1.470 +}
   1.471 +
   1.472 +static void CombaMultiplyBottom8(word *R, const word *A, const word *B)
   1.473 +{
   1.474 +	dword p;
   1.475 +	word c, d, e;
   1.476 +
   1.477 +	p = (dword)A[0] * B[0];
   1.478 +	R[0] = LOW_WORD(p);
   1.479 +	c = HIGH_WORD(p);
   1.480 +	d = e = 0;
   1.481 +
   1.482 +	MulAcc(0, 1);
   1.483 +	MulAcc(1, 0);
   1.484 +
   1.485 +	SaveMulAcc(1, 2, 0);
   1.486 +	MulAcc(1, 1);
   1.487 +	MulAcc(0, 2);
   1.488 +
   1.489 +	SaveMulAcc(2, 0, 3);
   1.490 +	MulAcc(1, 2);
   1.491 +	MulAcc(2, 1);
   1.492 +	MulAcc(3, 0);
   1.493 +
   1.494 +	SaveMulAcc(3, 0, 4);
   1.495 +	MulAcc(1, 3);
   1.496 +	MulAcc(2, 2);
   1.497 +	MulAcc(3, 1);
   1.498 +	MulAcc(4, 0);
   1.499 +
   1.500 +	SaveMulAcc(4, 0, 5);
   1.501 +	MulAcc(1, 4);
   1.502 +	MulAcc(2, 3);
   1.503 +	MulAcc(3, 2);
   1.504 +	MulAcc(4, 1);
   1.505 +	MulAcc(5, 0);
   1.506 +
   1.507 +	SaveMulAcc(5, 0, 6);
   1.508 +	MulAcc(1, 5);
   1.509 +	MulAcc(2, 4);
   1.510 +	MulAcc(3, 3);
   1.511 +	MulAcc(4, 2);
   1.512 +	MulAcc(5, 1);
   1.513 +	MulAcc(6, 0);
   1.514 +
   1.515 +	R[6] = c;
   1.516 +	R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
   1.517 +				A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
   1.518 +}
   1.519 +
   1.520 +#undef MulAcc
   1.521 +#undef SaveMulAcc
   1.522 +static void AtomicInverseModPower2(word *C, word A0, word A1)
   1.523 +{
   1.524 +	assert(A0%2==1);
   1.525 +
   1.526 +	dword A=MAKE_DWORD(A0, A1), R=A0%8;
   1.527 +
   1.528 +	for (unsigned i=3; i<2*WORD_BITS; i*=2)
   1.529 +		R = R*(2-R*A);
   1.530 +
   1.531 +	assert(R*A==1);
   1.532 +
   1.533 +	C[0] = LOW_WORD(R);
   1.534 +	C[1] = HIGH_WORD(R);
   1.535 +}
   1.536 +// ********************************************************
   1.537 +
   1.538 +#define A0		A
   1.539 +#define A1		(A+N2)
   1.540 +#define B0		B
   1.541 +#define B1		(B+N2)
   1.542 +
   1.543 +#define T0		T
   1.544 +#define T1		(T+N2)
   1.545 +#define T2		(T+N)
   1.546 +#define T3		(T+N+N2)
   1.547 +
   1.548 +#define R0		R
   1.549 +#define R1		(R+N2)
   1.550 +#define R2		(R+N)
   1.551 +#define R3		(R+N+N2)
   1.552 +
   1.553 +// R[2*N] - result = A*B
   1.554 +// T[2*N] - temporary work space
   1.555 +// A[N] --- multiplier
   1.556 +// B[N] --- multiplicant
   1.557 +
   1.558 +void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N)
   1.559 +{
   1.560 +	assert(N>=2 && N%2==0);
   1.561 +
   1.562 +	if (N==2)
   1.563 +		AtomicMultiply(R, A, B);
   1.564 +	else if (N==4)
   1.565 +		CombaMultiply4(R, A, B);
   1.566 +	else if (N==8)
   1.567 +		CombaMultiply8(R, A, B);
   1.568 +	else
   1.569 +	{
   1.570 +		const unsigned int N2 = N/2;
   1.571 +		int carry;
   1.572 +
   1.573 +		int aComp = Compare(A0, A1, N2);
   1.574 +		int bComp = Compare(B0, B1, N2);
   1.575 +
   1.576 +		switch (2*aComp + aComp + bComp)
   1.577 +		{
   1.578 +		case -4:
   1.579 +			Subtract(R0, A1, A0, N2);
   1.580 +			Subtract(R1, B0, B1, N2);
   1.581 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.582 +			Subtract(T1, T1, R0, N2);
   1.583 +			carry = -1;
   1.584 +			break;
   1.585 +		case -2:
   1.586 +			Subtract(R0, A1, A0, N2);
   1.587 +			Subtract(R1, B0, B1, N2);
   1.588 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.589 +			carry = 0;
   1.590 +			break;
   1.591 +		case 2:
   1.592 +			Subtract(R0, A0, A1, N2);
   1.593 +			Subtract(R1, B1, B0, N2);
   1.594 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.595 +			carry = 0;
   1.596 +			break;
   1.597 +		case 4:
   1.598 +			Subtract(R0, A1, A0, N2);
   1.599 +			Subtract(R1, B0, B1, N2);
   1.600 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.601 +			Subtract(T1, T1, R1, N2);
   1.602 +			carry = -1;
   1.603 +			break;
   1.604 +		default:
   1.605 +			SetWords(T0, 0, N);
   1.606 +			carry = 0;
   1.607 +		}
   1.608 +
   1.609 +		RecursiveMultiply(R0, T2, A0, B0, N2);
   1.610 +		RecursiveMultiply(R2, T2, A1, B1, N2);
   1.611 +
   1.612 +		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
   1.613 +
   1.614 +		carry += Add(T0, T0, R0, N);
   1.615 +		carry += Add(T0, T0, R2, N);
   1.616 +		carry += Add(R1, R1, T0, N);
   1.617 +
   1.618 +		assert (carry >= 0 && carry <= 2);
   1.619 +		Increment(R3, N2, carry);
   1.620 +	}
   1.621 +}
   1.622 +
   1.623 +// R[2*N] - result = A*A
   1.624 +// T[2*N] - temporary work space
   1.625 +// A[N] --- number to be squared
   1.626 +
   1.627 +void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
   1.628 +{
   1.629 +	assert(N && N%2==0);
   1.630 +
   1.631 +	if (N==2)
   1.632 +		AtomicMultiply(R, A, A);
   1.633 +	else if (N==4)
   1.634 +	{
   1.635 +		// VC60 workaround: MSVC 6.0 has an optimization problem that makes
   1.636 +		// (dword)A*B where either A or B has been cast to a dword before
   1.637 +		// very expensive. Revisit a CombaSquare4() function when this
   1.638 +		// problem is fixed.               
   1.639 +
   1.640 +// WARNING: KeithR. 05/08/03 This routine doesn't work with gcc on hardware
   1.641 +// either.  I've completely removed it.  It may be worth looking into sometime
   1.642 +// in the future.  Therefore, we use the CombaMultiply4 on all targets.
   1.643 +//#ifdef __WINS__
   1.644 +		CombaMultiply4(R, A, A);
   1.645 +/*#else
   1.646 +		CombaSquare4(R, A);
   1.647 +#endif*/
   1.648 +	}
   1.649 +	else
   1.650 +	{
   1.651 +		const unsigned int N2 = N/2;
   1.652 +
   1.653 +		RecursiveSquare(R0, T2, A0, N2);
   1.654 +		RecursiveSquare(R2, T2, A1, N2);
   1.655 +		RecursiveMultiply(T0, T2, A0, A1, N2);
   1.656 +
   1.657 +		word carry = Add(R1, R1, T0, N);
   1.658 +		carry += Add(R1, R1, T0, N);
   1.659 +		Increment(R3, N2, carry);
   1.660 +	}
   1.661 +}
   1.662 +// R[N] - bottom half of A*B
   1.663 +// T[N] - temporary work space
   1.664 +// A[N] - multiplier
   1.665 +// B[N] - multiplicant
   1.666 +
   1.667 +void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
   1.668 +{
   1.669 +	assert(N>=2 && N%2==0);
   1.670 +
   1.671 +	if (N==2)
   1.672 +		AtomicMultiplyBottom(R, A, B);
   1.673 +	else if (N==4)
   1.674 +		CombaMultiplyBottom4(R, A, B);
   1.675 +	else if (N==8)
   1.676 +		CombaMultiplyBottom8(R, A, B);
   1.677 +	else
   1.678 +	{
   1.679 +		const unsigned int N2 = N/2;
   1.680 +
   1.681 +		RecursiveMultiply(R, T, A0, B0, N2);
   1.682 +		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
   1.683 +		Add(R1, R1, T0, N2);
   1.684 +		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
   1.685 +		Add(R1, R1, T0, N2);
   1.686 +	}
   1.687 +}
   1.688 +
   1.689 +// R[N] --- upper half of A*B
   1.690 +// T[2*N] - temporary work space
   1.691 +// L[N] --- lower half of A*B
   1.692 +// A[N] --- multiplier
   1.693 +// B[N] --- multiplicant
   1.694 +
   1.695 +void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
   1.696 +{
   1.697 +	assert(N>=2 && N%2==0);
   1.698 +
   1.699 +	if (N==2)
   1.700 +	{
   1.701 +		AtomicMultiply(T, A, B);
   1.702 +		((dword *)R)[0] = ((dword *)T)[1];
   1.703 +	}
   1.704 +	else if (N==4)
   1.705 +	{
   1.706 +		CombaMultiply4(T, A, B);
   1.707 +		((dword *)R)[0] = ((dword *)T)[2];
   1.708 +		((dword *)R)[1] = ((dword *)T)[3];
   1.709 +	}
   1.710 +	else
   1.711 +	{
   1.712 +		const unsigned int N2 = N/2;
   1.713 +		int carry;
   1.714 +
   1.715 +		int aComp = Compare(A0, A1, N2);
   1.716 +		int bComp = Compare(B0, B1, N2);
   1.717 +
   1.718 +		switch (2*aComp + aComp + bComp)
   1.719 +		{
   1.720 +		case -4:
   1.721 +			Subtract(R0, A1, A0, N2);
   1.722 +			Subtract(R1, B0, B1, N2);
   1.723 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.724 +			Subtract(T1, T1, R0, N2);
   1.725 +			carry = -1;
   1.726 +			break;
   1.727 +		case -2:
   1.728 +			Subtract(R0, A1, A0, N2);
   1.729 +			Subtract(R1, B0, B1, N2);
   1.730 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.731 +			carry = 0;
   1.732 +			break;
   1.733 +		case 2:
   1.734 +			Subtract(R0, A0, A1, N2);
   1.735 +			Subtract(R1, B1, B0, N2);
   1.736 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.737 +			carry = 0;
   1.738 +			break;
   1.739 +		case 4:
   1.740 +			Subtract(R0, A1, A0, N2);
   1.741 +			Subtract(R1, B0, B1, N2);
   1.742 +			RecursiveMultiply(T0, T2, R0, R1, N2);
   1.743 +			Subtract(T1, T1, R1, N2);
   1.744 +			carry = -1;
   1.745 +			break;
   1.746 +		default:
   1.747 +			SetWords(T0, 0, N);
   1.748 +			carry = 0;
   1.749 +		}
   1.750 +
   1.751 +		RecursiveMultiply(T2, R0, A1, B1, N2);
   1.752 +
   1.753 +		// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
   1.754 +
   1.755 +		CopyWords(R0, L+N2, N2);
   1.756 +		word c2 = Subtract(R0, R0, L, N2);
   1.757 +		c2 += Subtract(R0, R0, T0, N2);
   1.758 +		word t = (Compare(R0, T2, N2) == -1);
   1.759 +
   1.760 +		carry += t;
   1.761 +		carry += Increment(R0, N2, c2+t);
   1.762 +		carry += Add(R0, R0, T1, N2);
   1.763 +		carry += Add(R0, R0, T3, N2);
   1.764 +
   1.765 +		CopyWords(R1, T3, N2);
   1.766 +		assert (carry >= 0 && carry <= 2);
   1.767 +		Increment(R1, N2, carry);
   1.768 +	}
   1.769 +}
   1.770 +
   1.771 +// R[NA+NB] - result = A*B
   1.772 +// T[NA+NB] - temporary work space
   1.773 +// A[NA] ---- multiplier
   1.774 +// B[NB] ---- multiplicant
   1.775 +
   1.776 +void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
   1.777 +{
   1.778 +	if (NA == NB)
   1.779 +	{
   1.780 +		if (A == B)
   1.781 +			RecursiveSquare(R, T, A, NA);
   1.782 +		else
   1.783 +			RecursiveMultiply(R, T, A, B, NA);
   1.784 +
   1.785 +		return;
   1.786 +	}
   1.787 +
   1.788 +	if (NA > NB)
   1.789 +	{
   1.790 +		TClassSwap(A, B);
   1.791 +		TClassSwap(NA, NB);
   1.792 +		//std::swap(A, B);
   1.793 +		//std::swap(NA, NB);
   1.794 +	}
   1.795 +
   1.796 +	assert(NB % NA == 0);
   1.797 +	assert((NB/NA)%2 == 0); 	// NB is an even multiple of NA
   1.798 +
   1.799 +	if (NA==2 && !A[1])
   1.800 +	{
   1.801 +		switch (A[0])
   1.802 +		{
   1.803 +		case 0:
   1.804 +			SetWords(R, 0, NB+2);
   1.805 +			return;
   1.806 +		case 1:
   1.807 +			CopyWords(R, B, NB);
   1.808 +			R[NB] = R[NB+1] = 0;
   1.809 +			return;
   1.810 +		default:
   1.811 +			R[NB] = LinearMultiply(R, B, A[0], NB);
   1.812 +			R[NB+1] = 0;
   1.813 +			return;
   1.814 +		}
   1.815 +	}
   1.816 +
   1.817 +	RecursiveMultiply(R, T, A, B, NA);
   1.818 +	CopyWords(T+2*NA, R+NA, NA);
   1.819 +
   1.820 +	unsigned i;
   1.821 +
   1.822 +	for (i=2*NA; i<NB; i+=2*NA)
   1.823 +		RecursiveMultiply(T+NA+i, T, A, B+i, NA);
   1.824 +	for (i=NA; i<NB; i+=2*NA)
   1.825 +		RecursiveMultiply(R+i, T, A, B+i, NA);
   1.826 +
   1.827 +	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
   1.828 +		Increment(R+NB, NA);
   1.829 +}
   1.830 +// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
   1.831 +// T[3*N/2] - temporary work space
   1.832 +// A[N] ----- an odd number as input
   1.833 +
   1.834 +void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
   1.835 +{
   1.836 +	if (N==2)
   1.837 +		AtomicInverseModPower2(R, A[0], A[1]);
   1.838 +	else
   1.839 +	{
   1.840 +		const unsigned int N2 = N/2;
   1.841 +		RecursiveInverseModPower2(R0, T0, A0, N2);
   1.842 +		T0[0] = 1;
   1.843 +		SetWords(T0+1, 0, N2-1);
   1.844 +		RecursiveMultiplyTop(R1, T1, T0, R0, A0, N2);
   1.845 +		RecursiveMultiplyBottom(T0, T1, R0, A1, N2);
   1.846 +		Add(T0, R1, T0, N2);
   1.847 +		TwosComplement(T0, N2);
   1.848 +		RecursiveMultiplyBottom(R1, T1, R0, T0, N2);
   1.849 +	}
   1.850 +}
   1.851 +#undef A0
   1.852 +#undef A1
   1.853 +#undef B0
   1.854 +#undef B1
   1.855 +
   1.856 +#undef T0
   1.857 +#undef T1
   1.858 +#undef T2
   1.859 +#undef T3
   1.860 +
   1.861 +#undef R0
   1.862 +#undef R1
   1.863 +#undef R2
   1.864 +#undef R3
   1.865 +
   1.866 +// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
   1.867 +// T[3*N] - temporary work space
   1.868 +// X[2*N] - number to be reduced
   1.869 +// M[N] --- modulus
   1.870 +// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
   1.871 +
   1.872 +void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N)
   1.873 +{
   1.874 +	RecursiveMultiplyBottom(R, T, X, U, N);
   1.875 +	RecursiveMultiplyTop(T, T+N, X, R, M, N);
   1.876 +	if (Subtract(R, X+N, T, N))
   1.877 +	{
   1.878 +#ifdef _DEBUG
   1.879 +		word carry = Add(R, R, M, N);
   1.880 +		assert(carry);
   1.881 +#else
   1.882 +		Add(R, R, M, N);
   1.883 +#endif
   1.884 +	}
   1.885 +}
   1.886 +
   1.887 +// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
   1.888 +static word SubatomicDivide(word *A, word B0, word B1)
   1.889 +{
   1.890 +	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
   1.891 +	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
   1.892 +
   1.893 +	dword p, u;
   1.894 +	word Q;
   1.895 +
   1.896 +	// estimate the quotient: do a 2 word by 1 word divide
   1.897 +	if (B1+1 == 0)
   1.898 +		Q = A[2];
   1.899 +	else
   1.900 +		Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1));
   1.901 +
   1.902 +	// now subtract Q*B from A
   1.903 +	p = (dword) B0*Q;
   1.904 +	u = (dword) A[0] - LOW_WORD(p);
   1.905 +	A[0] = LOW_WORD(u);
   1.906 +	u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q;
   1.907 +	A[1] = LOW_WORD(u);
   1.908 +	A[2] += HIGH_WORD(u);
   1.909 +
   1.910 +	// Q <= actual quotient, so fix it
   1.911 +	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
   1.912 +	{
   1.913 +		u = (dword) A[0] - B0;
   1.914 +		A[0] = LOW_WORD(u);
   1.915 +		u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u));
   1.916 +		A[1] = LOW_WORD(u);
   1.917 +		A[2] += HIGH_WORD(u);
   1.918 +		Q++;
   1.919 +		assert(Q);	// shouldn't overflow
   1.920 +	}
   1.921 +
   1.922 +	return Q;
   1.923 +}
   1.924 +
   1.925 +// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
   1.926 +static inline void AtomicDivide(word *Q, const word *A, const word *B)
   1.927 +{
   1.928 +	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
   1.929 +	{
   1.930 +		Q[0] = A[2];
   1.931 +		Q[1] = A[3];
   1.932 +	}
   1.933 +	else
   1.934 +	{
   1.935 +		word T[4];
   1.936 +		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
   1.937 +		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
   1.938 +		Q[0] = SubatomicDivide(T, B[0], B[1]);
   1.939 +
   1.940 +#ifdef _DEBUG
   1.941 +		// multiply quotient and divisor and add remainder, make sure it equals dividend
   1.942 +		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
   1.943 +		word P[4];
   1.944 +		AtomicMultiply(P, Q, B);
   1.945 +		Add(P, P, T, 4);
   1.946 +		assert(Mem::Compare((TUint8*)P, 4*WORD_SIZE, (TUint8*)A, 4*WORD_SIZE)==0);
   1.947 +#endif
   1.948 +	}
   1.949 +}
   1.950 +
   1.951 +// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
   1.952 +static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)
   1.953 +{
   1.954 +	assert(N && N%2==0);
   1.955 +
   1.956 +	if (Q[1])
   1.957 +	{
   1.958 +		T[N] = T[N+1] = 0;
   1.959 +		unsigned i;
   1.960 +		for (i=0; i<N; i+=4)
   1.961 +			AtomicMultiply(T+i, Q, B+i);
   1.962 +		for (i=2; i<N; i+=4)
   1.963 +			if (AtomicMultiplyAdd(T+i, Q, B+i))
   1.964 +				T[i+5] += (++T[i+4]==0);
   1.965 +	}
   1.966 +	else
   1.967 +	{
   1.968 +		T[N] = LinearMultiply(T, B, Q[0], N);
   1.969 +		T[N+1] = 0;
   1.970 +	}
   1.971 +
   1.972 +#ifdef _DEBUG
   1.973 +	word borrow = Subtract(R, R, T, N+2);
   1.974 +	assert(!borrow && !R[N+1]);
   1.975 +#else
   1.976 +	Subtract(R, R, T, N+2);
   1.977 +#endif
   1.978 +
   1.979 +	while (R[N] || Compare(R, B, N) >= 0)
   1.980 +	{
   1.981 +		R[N] -= Subtract(R, R, B, N);
   1.982 +		Q[1] += (++Q[0]==0);
   1.983 +		assert(Q[0] || Q[1]); // no overflow
   1.984 +	}
   1.985 +}
   1.986 +
   1.987 +// R[NB] -------- remainder = A%B
   1.988 +// Q[NA-NB+2] --- quotient	= A/B
   1.989 +// T[NA+2*NB+4] - temp work space
   1.990 +// A[NA] -------- dividend
   1.991 +// B[NB] -------- divisor
   1.992 +
   1.993 +void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
   1.994 +{
   1.995 +	assert(NA && NB && NA%2==0 && NB%2==0);
   1.996 +	assert(B[NB-1] || B[NB-2]);
   1.997 +	assert(NB <= NA);
   1.998 +
   1.999 +	// set up temporary work space
  1.1000 +	word *const TA=T;
  1.1001 +	word *const TB=T+NA+2;
  1.1002 +	word *const TP=T+NA+2+NB;
  1.1003 +
  1.1004 +	// copy B into TB and normalize it so that TB has highest bit set to 1
  1.1005 +	unsigned shiftWords = (B[NB-1]==0);
  1.1006 +	TB[0] = TB[NB-1] = 0;
  1.1007 +	CopyWords(TB+shiftWords, B, NB-shiftWords);
  1.1008 +	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
  1.1009 +	assert(shiftBits < WORD_BITS);
  1.1010 +	ShiftWordsLeftByBits(TB, NB, shiftBits);
  1.1011 +
  1.1012 +	// copy A into TA and normalize it
  1.1013 +	TA[0] = TA[NA] = TA[NA+1] = 0;
  1.1014 +	CopyWords(TA+shiftWords, A, NA);
  1.1015 +	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
  1.1016 +
  1.1017 +	if (TA[NA+1]==0 && TA[NA] <= 1)
  1.1018 +	{
  1.1019 +		Q[NA-NB+1] = Q[NA-NB] = 0;
  1.1020 +		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
  1.1021 +		{
  1.1022 +			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
  1.1023 +			++Q[NA-NB];
  1.1024 +		}
  1.1025 +	}
  1.1026 +	else
  1.1027 +	{
  1.1028 +		NA+=2;
  1.1029 +		assert(Compare(TA+NA-NB, TB, NB) < 0);
  1.1030 +	}
  1.1031 +
  1.1032 +	word BT[2];
  1.1033 +	BT[0] = TB[NB-2] + 1;
  1.1034 +	BT[1] = TB[NB-1] + (BT[0]==0);
  1.1035 +
  1.1036 +	// start reducing TA mod TB, 2 words at a time
  1.1037 +	for (unsigned i=NA-2; i>=NB; i-=2)
  1.1038 +	{
  1.1039 +		AtomicDivide(Q+i-NB, TA+i-2, BT);
  1.1040 +		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
  1.1041 +	}
  1.1042 +
  1.1043 +	// copy TA into R, and denormalize it
  1.1044 +	CopyWords(R, TA+shiftWords, NB);
  1.1045 +	ShiftWordsRightByBits(R, NB, shiftBits);
  1.1046 +}
  1.1047 +
  1.1048 +static inline unsigned int EvenWordCount(const word *X, unsigned int N)
  1.1049 +{
  1.1050 +	while (N && X[N-2]==0 && X[N-1]==0)
  1.1051 +		N-=2;
  1.1052 +	return N;
  1.1053 +}
  1.1054 +
  1.1055 +// return k
  1.1056 +// R[N] --- result = A^(-1) * 2^k mod M
  1.1057 +// T[4*N] - temporary work space
  1.1058 +// A[NA] -- number to take inverse of
  1.1059 +// M[N] --- modulus
  1.1060 +
  1.1061 +unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N)
  1.1062 +{
  1.1063 +	assert(NA<=N && N && N%2==0);
  1.1064 +
  1.1065 +	word *b = T;
  1.1066 +	word *c = T+N;
  1.1067 +	word *f = T+2*N;
  1.1068 +	word *g = T+3*N;
  1.1069 +	unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
  1.1070 +	unsigned int k=0, s=0;
  1.1071 +
  1.1072 +	SetWords(T, 0, 3*N);
  1.1073 +	b[0]=1;
  1.1074 +	CopyWords(f, A, NA);
  1.1075 +	CopyWords(g, M, N);
  1.1076 +
  1.1077 +	FOREVER
  1.1078 +	{
  1.1079 +		word t=f[0];
  1.1080 +		while (!t)
  1.1081 +		{
  1.1082 +			if (EvenWordCount(f, fgLen)==0)
  1.1083 +			{
  1.1084 +				SetWords(R, 0, N);
  1.1085 +				return 0;
  1.1086 +			}
  1.1087 +
  1.1088 +			ShiftWordsRightByWords(f, fgLen, 1);
  1.1089 +			if (c[bcLen-1]) bcLen+=2;
  1.1090 +			assert(bcLen <= N);
  1.1091 +			ShiftWordsLeftByWords(c, bcLen, 1);
  1.1092 +			k+=WORD_BITS;
  1.1093 +			t=f[0];
  1.1094 +		}
  1.1095 +
  1.1096 +		unsigned int i=0;
  1.1097 +		while (t%2 == 0)
  1.1098 +		{
  1.1099 +			t>>=1;
  1.1100 +			i++;
  1.1101 +		}
  1.1102 +		k+=i;
  1.1103 +
  1.1104 +		if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
  1.1105 +		{
  1.1106 +			if (s%2==0)
  1.1107 +				CopyWords(R, b, N);
  1.1108 +			else
  1.1109 +				Subtract(R, M, b, N);
  1.1110 +			return k;
  1.1111 +		}
  1.1112 +
  1.1113 +		ShiftWordsRightByBits(f, fgLen, i);
  1.1114 +		t=ShiftWordsLeftByBits(c, bcLen, i);
  1.1115 +		if (t)
  1.1116 +		{
  1.1117 +			c[bcLen] = t;
  1.1118 +			bcLen+=2;
  1.1119 +			assert(bcLen <= N);
  1.1120 +		}
  1.1121 +
  1.1122 +		if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
  1.1123 +			fgLen-=2;
  1.1124 +
  1.1125 +		if (Compare(f, g, fgLen)==-1)
  1.1126 +		{
  1.1127 +			TClassSwap<word*>(f,g);
  1.1128 +			TClassSwap<word*>(b,c);
  1.1129 +			s++;
  1.1130 +		}
  1.1131 +
  1.1132 +		Subtract(f, f, g, fgLen);
  1.1133 +
  1.1134 +		if (Add(b, b, c, bcLen))
  1.1135 +		{
  1.1136 +			b[bcLen] = 1;
  1.1137 +			bcLen+=2;
  1.1138 +			assert(bcLen <= N);
  1.1139 +		}
  1.1140 +	}
  1.1141 +}
  1.1142 +
  1.1143 +// R[N] - result = A/(2^k) mod M
  1.1144 +// A[N] - input
  1.1145 +// M[N] - modulus
  1.1146 +
  1.1147 +void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
  1.1148 +{
  1.1149 +	CopyWords(R, A, N);
  1.1150 +
  1.1151 +	while (k--)
  1.1152 +	{
  1.1153 +		if (R[0]%2==0)
  1.1154 +			ShiftWordsRightByBits(R, N, 1);
  1.1155 +		else
  1.1156 +		{
  1.1157 +			word carry = Add(R, R, M, N);
  1.1158 +			ShiftWordsRightByBits(R, N, 1);
  1.1159 +			R[N-1] += carry<<(WORD_BITS-1);
  1.1160 +		}
  1.1161 +	}
  1.1162 +}
  1.1163 +