os/security/crypto/weakcryptospi/source/bigint/algorithms.cpp
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of the License "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #include "words.h"
    20 #include "algorithms.h"
    21 
    22 word Add(word *C, const word *A, const word *B, unsigned int N)
    23 {
    24 	assert (N%2 == 0);
    25 	word carry = 0;
    26 	for (unsigned int i = 0; i < N; i+=2)
    27 	{
    28 		dword u = (dword) carry + A[i] + B[i];
    29 		C[i] = LOW_WORD(u);
    30 		u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1];
    31 		C[i+1] = LOW_WORD(u);
    32 		carry = HIGH_WORD(u);
    33 	}
    34 	return carry;
    35 }
    36 
    37 word Subtract(word *C, const word *A, const word *B, unsigned int N)
    38 {
    39 	assert (N%2 == 0);
    40 	word borrow=0;
    41 	for (unsigned i = 0; i < N; i+=2)
    42 	{
    43 		dword u = (dword) A[i] - B[i] - borrow;
    44 		C[i] = LOW_WORD(u);
    45 		u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u));
    46 		C[i+1] = LOW_WORD(u);
    47 		borrow = 0-HIGH_WORD(u);
    48 	}
    49 	return borrow;
    50 }
    51 
    52 int Compare(const word *A, const word *B, unsigned int N)
    53 {
    54 	while (N--)
    55 		if (A[N] > B[N])
    56 			return 1;
    57 		else if (A[N] < B[N])
    58 			return -1;
    59 
    60 	return 0;
    61 }
    62 
    63 // It is the job of the calling code to ensure that this won't carry.
    64 // If you aren't sure, use the next version that will tell you if you need to
    65 // grow your integer.
    66 // Having two of these creates ever so slightly more code but avoids having
    67 // ifdefs all over the rest of the code checking the following type stuff which
    68 // causes warnings in certain compilers about unused parameters in release
    69 // builds.  We can't have that can we!
    70 /*
    71 Allows avoid this all over bigint.cpp and primes.cpp
    72 ifdef _DEBUG
    73 	TUint carry = Increment(Ptr(), Size());
    74 	assert(!carry);
    75 else
    76 	Increment(Ptr(), Size())
    77 endif
    78 */
    79 void IncrementNoCarry(word *A, unsigned int N, word B)
    80 {
    81 	assert(N);
    82 	word t = A[0];
    83 	A[0] = t+B;
    84 	if (A[0] >= t)
    85 		return;
    86 	for (unsigned i=1; i<N; i++)
    87 		if (++A[i])
    88 			return;
    89 	assert(0);
    90 }
    91 
    92 word Increment(word *A, unsigned int N, word B)
    93 {
    94 	assert(N);
    95 	word t = A[0];
    96 	A[0] = t+B;
    97 	if (A[0] >= t)
    98 		return 0;
    99 	for (unsigned i=1; i<N; i++)
   100 		if (++A[i])
   101 			return 0;
   102 	return 1;
   103 }
   104 
   105 //See commments above about IncrementNoCarry
   106 void DecrementNoCarry(word *A, unsigned int N, word B)
   107 {
   108 	assert(N);
   109 	word t = A[0];
   110 	A[0] = t-B;
   111 	if (A[0] <= t)
   112 		return;
   113 	for (unsigned i=1; i<N; i++)
   114 		if (A[i]--)
   115 			return;
   116 	assert(0);
   117 }
   118 
   119 word Decrement(word *A, unsigned int N, word B)
   120 {
   121 	assert(N);
   122 	word t = A[0];
   123 	A[0] = t-B;
   124 	if (A[0] <= t)
   125 		return 0;
   126 	for (unsigned i=1; i<N; i++)
   127 		if (A[i]--)
   128 			return 0;
   129 	return 1;
   130 }
   131 
   132 void TwosComplement(word *A, unsigned int N)
   133 {
   134 	Decrement(A, N);
   135 	for (unsigned i=0; i<N; i++)
   136 		A[i] = ~A[i];
   137 }
   138 
   139 static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
   140 {
   141 	word carry=0;
   142 	for(unsigned i=0; i<N; i++)
   143 	{
   144 		dword p = (dword)A[i] * B + carry;
   145 		C[i] = LOW_WORD(p);
   146 		carry = HIGH_WORD(p);
   147 	}
   148 	return carry;
   149 }
   150 
   151 static void AtomicMultiply(word *C, const word *A, const word *B)
   152 {
   153 /*
   154 	word s;
   155 	dword d;
   156 
   157 	if (A1 >= A0)
   158 		if (B0 >= B1)
   159 		{
   160 			s = 0;
   161 			d = (dword)(A1-A0)*(B0-B1);
   162 		}
   163 		else
   164 		{
   165 			s = (A1-A0);
   166 			d = (dword)s*(word)(B0-B1);
   167 		}
   168 	else
   169 		if (B0 > B1)
   170 		{
   171 			s = (B0-B1);
   172 			d = (word)(A1-A0)*(dword)s;
   173 		}
   174 		else
   175 		{
   176 			s = 0;
   177 			d = (dword)(A0-A1)*(B1-B0);
   178 		}
   179 */
   180 	// this segment is the branchless equivalent of above
   181 	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
   182 	unsigned int ai = A[1] < A[0];
   183 	unsigned int bi = B[0] < B[1];
   184 	unsigned int di = ai & bi;
   185 	dword d = (dword)D[di]*D[di+2];
   186 	D[1] = D[3] = 0;
   187 	unsigned int si = ai + !bi;
   188 	word s = D[si];
   189 
   190 	dword A0B0 = (dword)A[0]*B[0];
   191 	C[0] = LOW_WORD(A0B0);
   192 
   193 	dword A1B1 = (dword)A[1]*B[1];
   194 	dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1);
   195 	C[1] = LOW_WORD(t);
   196 
   197 	t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s;
   198 	C[2] = LOW_WORD(t);
   199 	C[3] = HIGH_WORD(t);
   200 }
   201 
   202 static word AtomicMultiplyAdd(word *C, const word *A, const word *B)
   203 {
   204 	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
   205 	unsigned int ai = A[1] < A[0];
   206 	unsigned int bi = B[0] < B[1];
   207 	unsigned int di = ai & bi;
   208 	dword d = (dword)D[di]*D[di+2];
   209 	D[1] = D[3] = 0;
   210 	unsigned int si = ai + !bi;
   211 	word s = D[si];
   212 
   213 	dword A0B0 = (dword)A[0]*B[0];
   214 	dword t = A0B0 + C[0];
   215 	C[0] = LOW_WORD(t);
   216 
   217 	dword A1B1 = (dword)A[1]*B[1];
   218 	t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1];
   219 	C[1] = LOW_WORD(t);
   220 
   221 	t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2];
   222 	C[2] = LOW_WORD(t);
   223 
   224 	t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3];
   225 	C[3] = LOW_WORD(t);
   226 	return HIGH_WORD(t);
   227 }
   228 
   229 static inline void AtomicMultiplyBottom(word *C, const word *A, const word *B)
   230 {
   231 	dword t = (dword)A[0]*B[0];
   232 	C[0] = LOW_WORD(t);
   233 	C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0];
   234 }
   235 
   236 #define MulAcc(x, y)								\
   237 	p = (dword)A[x] * B[y] + c; 					\
   238 	c = LOW_WORD(p);								\
   239 	p = (dword)d + HIGH_WORD(p);					\
   240 	d = LOW_WORD(p);								\
   241 	e += HIGH_WORD(p);
   242 
   243 #define SaveMulAcc(s, x, y) 						\
   244 	R[s] = c;										\
   245 	p = (dword)A[x] * B[y] + d; 					\
   246 	c = LOW_WORD(p);								\
   247 	p = (dword)e + HIGH_WORD(p);					\
   248 	d = LOW_WORD(p);								\
   249 	e = HIGH_WORD(p);
   250 
   251 #define MulAcc1(x, y)								\
   252 	p = (dword)A[x] * A[y] + c; 					\
   253 	c = LOW_WORD(p);								\
   254 	p = (dword)d + HIGH_WORD(p);					\
   255 	d = LOW_WORD(p);								\
   256 	e += HIGH_WORD(p);
   257 
   258 #define SaveMulAcc1(s, x, y) 						\
   259 	R[s] = c;										\
   260 	p = (dword)A[x] * A[y] + d; 					\
   261 	c = LOW_WORD(p);								\
   262 	p = (dword)e + HIGH_WORD(p);					\
   263 	d = LOW_WORD(p);								\
   264 	e = HIGH_WORD(p);
   265 
   266 #define SquAcc(x, y)								\
   267 	p = (dword)A[x] * A[y];	\
   268 	p = p + p + c; 					\
   269 	c = LOW_WORD(p);								\
   270 	p = (dword)d + HIGH_WORD(p);					\
   271 	d = LOW_WORD(p);								\
   272 	e += HIGH_WORD(p);
   273 
   274 #define SaveSquAcc(s, x, y) 						\
   275 	R[s] = c;										\
   276 	p = (dword)A[x] * A[y];	\
   277 	p = p + p + d; 					\
   278 	c = LOW_WORD(p);								\
   279 	p = (dword)e + HIGH_WORD(p);					\
   280 	d = LOW_WORD(p);								\
   281 	e = HIGH_WORD(p);
   282 
   283 // VC60 workaround: MSVC 6.0 has an optimization problem that makes
   284 // (dword)A*B where either A or B has been cast to a dword before
   285 // very expensive. Revisit a CombaSquare4() function when this
   286 // problem is fixed.               
   287 
   288 // WARNING: KeithR.  05/08/03 This routine doesn't work with gcc on hardware
   289 // either.  I've completely removed it.  It may be worth looking into sometime
   290 // in the future.
   291 /*#ifndef __WINS__
   292 static void CombaSquare4(word *R, const word *A)
   293 {
   294 	dword p;
   295 	word c, d, e;
   296 
   297 	p = (dword)A[0] * A[0];
   298 	R[0] = LOW_WORD(p);
   299 	c = HIGH_WORD(p);
   300 	d = e = 0;
   301 
   302 	SquAcc(0, 1);
   303 
   304 	SaveSquAcc(1, 2, 0);
   305 	MulAcc1(1, 1);
   306 
   307 	SaveSquAcc(2, 0, 3);
   308 	SquAcc(1, 2);
   309 
   310 	SaveSquAcc(3, 3, 1);
   311 	MulAcc1(2, 2);
   312 
   313 	SaveSquAcc(4, 2, 3);
   314 
   315 	R[5] = c;
   316 	p = (dword)A[3] * A[3] + d;
   317 	R[6] = LOW_WORD(p);
   318 	R[7] = e + HIGH_WORD(p);
   319 }
   320 #endif */
   321 
   322 static void CombaMultiply4(word *R, const word *A, const word *B)
   323 {
   324 	dword p;
   325 	word c, d, e;
   326 
   327 	p = (dword)A[0] * B[0];
   328 	R[0] = LOW_WORD(p);
   329 	c = HIGH_WORD(p);
   330 	d = e = 0;
   331 
   332 	MulAcc(0, 1);
   333 	MulAcc(1, 0);
   334 
   335 	SaveMulAcc(1, 2, 0);
   336 	MulAcc(1, 1);
   337 	MulAcc(0, 2);
   338 
   339 	SaveMulAcc(2, 0, 3);
   340 	MulAcc(1, 2);
   341 	MulAcc(2, 1);
   342 	MulAcc(3, 0);
   343 
   344 	SaveMulAcc(3, 3, 1);
   345 	MulAcc(2, 2);
   346 	MulAcc(1, 3);
   347 
   348 	SaveMulAcc(4, 2, 3);
   349 	MulAcc(3, 2);
   350 
   351 	R[5] = c;
   352 	p = (dword)A[3] * B[3] + d;
   353 	R[6] = LOW_WORD(p);
   354 	R[7] = e + HIGH_WORD(p);
   355 }
   356 
   357 static void CombaMultiply8(word *R, const word *A, const word *B)
   358 {
   359 	dword p;
   360 	word c, d, e;
   361 
   362 	p = (dword)A[0] * B[0];
   363 	R[0] = LOW_WORD(p);
   364 	c = HIGH_WORD(p);
   365 	d = e = 0;
   366 
   367 	MulAcc(0, 1);
   368 	MulAcc(1, 0);
   369 
   370 	SaveMulAcc(1, 2, 0);
   371 	MulAcc(1, 1);
   372 	MulAcc(0, 2);
   373 
   374 	SaveMulAcc(2, 0, 3);
   375 	MulAcc(1, 2);
   376 	MulAcc(2, 1);
   377 	MulAcc(3, 0);
   378 
   379 	SaveMulAcc(3, 0, 4);
   380 	MulAcc(1, 3);
   381 	MulAcc(2, 2);
   382 	MulAcc(3, 1);
   383 	MulAcc(4, 0);
   384 
   385 	SaveMulAcc(4, 0, 5);
   386 	MulAcc(1, 4);
   387 	MulAcc(2, 3);
   388 	MulAcc(3, 2);
   389 	MulAcc(4, 1);
   390 	MulAcc(5, 0);
   391 
   392 	SaveMulAcc(5, 0, 6);
   393 	MulAcc(1, 5);
   394 	MulAcc(2, 4);
   395 	MulAcc(3, 3);
   396 	MulAcc(4, 2);
   397 	MulAcc(5, 1);
   398 	MulAcc(6, 0);
   399 
   400 	SaveMulAcc(6, 0, 7);
   401 	MulAcc(1, 6);
   402 	MulAcc(2, 5);
   403 	MulAcc(3, 4);
   404 	MulAcc(4, 3);
   405 	MulAcc(5, 2);
   406 	MulAcc(6, 1);
   407 	MulAcc(7, 0);
   408 
   409 	SaveMulAcc(7, 1, 7);
   410 	MulAcc(2, 6);
   411 	MulAcc(3, 5);
   412 	MulAcc(4, 4);
   413 	MulAcc(5, 3);
   414 	MulAcc(6, 2);
   415 	MulAcc(7, 1);
   416 
   417 	SaveMulAcc(8, 2, 7);
   418 	MulAcc(3, 6);
   419 	MulAcc(4, 5);
   420 	MulAcc(5, 4);
   421 	MulAcc(6, 3);
   422 	MulAcc(7, 2);
   423 
   424 	SaveMulAcc(9, 3, 7);
   425 	MulAcc(4, 6);
   426 	MulAcc(5, 5);
   427 	MulAcc(6, 4);
   428 	MulAcc(7, 3);
   429 
   430 	SaveMulAcc(10, 4, 7);
   431 	MulAcc(5, 6);
   432 	MulAcc(6, 5);
   433 	MulAcc(7, 4);
   434 
   435 	SaveMulAcc(11, 5, 7);
   436 	MulAcc(6, 6);
   437 	MulAcc(7, 5);
   438 
   439 	SaveMulAcc(12, 6, 7);
   440 	MulAcc(7, 6);
   441 
   442 	R[13] = c;
   443 	p = (dword)A[7] * B[7] + d;
   444 	R[14] = LOW_WORD(p);
   445 	R[15] = e + HIGH_WORD(p);
   446 }
   447 
   448 static void CombaMultiplyBottom4(word *R, const word *A, const word *B)
   449 {
   450 	dword p;
   451 	word c, d, e;
   452 
   453 	p = (dword)A[0] * B[0];
   454 	R[0] = LOW_WORD(p);
   455 	c = HIGH_WORD(p);
   456 	d = e = 0;
   457 
   458 	MulAcc(0, 1);
   459 	MulAcc(1, 0);
   460 
   461 	SaveMulAcc(1, 2, 0);
   462 	MulAcc(1, 1);
   463 	MulAcc(0, 2);
   464 
   465 	R[2] = c;
   466 	R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
   467 }
   468 
   469 static void CombaMultiplyBottom8(word *R, const word *A, const word *B)
   470 {
   471 	dword p;
   472 	word c, d, e;
   473 
   474 	p = (dword)A[0] * B[0];
   475 	R[0] = LOW_WORD(p);
   476 	c = HIGH_WORD(p);
   477 	d = e = 0;
   478 
   479 	MulAcc(0, 1);
   480 	MulAcc(1, 0);
   481 
   482 	SaveMulAcc(1, 2, 0);
   483 	MulAcc(1, 1);
   484 	MulAcc(0, 2);
   485 
   486 	SaveMulAcc(2, 0, 3);
   487 	MulAcc(1, 2);
   488 	MulAcc(2, 1);
   489 	MulAcc(3, 0);
   490 
   491 	SaveMulAcc(3, 0, 4);
   492 	MulAcc(1, 3);
   493 	MulAcc(2, 2);
   494 	MulAcc(3, 1);
   495 	MulAcc(4, 0);
   496 
   497 	SaveMulAcc(4, 0, 5);
   498 	MulAcc(1, 4);
   499 	MulAcc(2, 3);
   500 	MulAcc(3, 2);
   501 	MulAcc(4, 1);
   502 	MulAcc(5, 0);
   503 
   504 	SaveMulAcc(5, 0, 6);
   505 	MulAcc(1, 5);
   506 	MulAcc(2, 4);
   507 	MulAcc(3, 3);
   508 	MulAcc(4, 2);
   509 	MulAcc(5, 1);
   510 	MulAcc(6, 0);
   511 
   512 	R[6] = c;
   513 	R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
   514 				A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
   515 }
   516 
   517 #undef MulAcc
   518 #undef SaveMulAcc
   519 static void AtomicInverseModPower2(word *C, word A0, word A1)
   520 {
   521 	assert(A0%2==1);
   522 
   523 	dword A=MAKE_DWORD(A0, A1), R=A0%8;
   524 
   525 	for (unsigned i=3; i<2*WORD_BITS; i*=2)
   526 		R = R*(2-R*A);
   527 
   528 	assert(R*A==1);
   529 
   530 	C[0] = LOW_WORD(R);
   531 	C[1] = HIGH_WORD(R);
   532 }
   533 // ********************************************************
   534 
   535 #define A0		A
   536 #define A1		(A+N2)
   537 #define B0		B
   538 #define B1		(B+N2)
   539 
   540 #define T0		T
   541 #define T1		(T+N2)
   542 #define T2		(T+N)
   543 #define T3		(T+N+N2)
   544 
   545 #define R0		R
   546 #define R1		(R+N2)
   547 #define R2		(R+N)
   548 #define R3		(R+N+N2)
   549 
   550 // R[2*N] - result = A*B
   551 // T[2*N] - temporary work space
   552 // A[N] --- multiplier
   553 // B[N] --- multiplicant
   554 
   555 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N)
   556 {
   557 	assert(N>=2 && N%2==0);
   558 
   559 	if (N==2)
   560 		AtomicMultiply(R, A, B);
   561 	else if (N==4)
   562 		CombaMultiply4(R, A, B);
   563 	else if (N==8)
   564 		CombaMultiply8(R, A, B);
   565 	else
   566 	{
   567 		const unsigned int N2 = N/2;
   568 		int carry;
   569 
   570 		int aComp = Compare(A0, A1, N2);
   571 		int bComp = Compare(B0, B1, N2);
   572 
   573 		switch (2*aComp + aComp + bComp)
   574 		{
   575 		case -4:
   576 			Subtract(R0, A1, A0, N2);
   577 			Subtract(R1, B0, B1, N2);
   578 			RecursiveMultiply(T0, T2, R0, R1, N2);
   579 			Subtract(T1, T1, R0, N2);
   580 			carry = -1;
   581 			break;
   582 		case -2:
   583 			Subtract(R0, A1, A0, N2);
   584 			Subtract(R1, B0, B1, N2);
   585 			RecursiveMultiply(T0, T2, R0, R1, N2);
   586 			carry = 0;
   587 			break;
   588 		case 2:
   589 			Subtract(R0, A0, A1, N2);
   590 			Subtract(R1, B1, B0, N2);
   591 			RecursiveMultiply(T0, T2, R0, R1, N2);
   592 			carry = 0;
   593 			break;
   594 		case 4:
   595 			Subtract(R0, A1, A0, N2);
   596 			Subtract(R1, B0, B1, N2);
   597 			RecursiveMultiply(T0, T2, R0, R1, N2);
   598 			Subtract(T1, T1, R1, N2);
   599 			carry = -1;
   600 			break;
   601 		default:
   602 			SetWords(T0, 0, N);
   603 			carry = 0;
   604 		}
   605 
   606 		RecursiveMultiply(R0, T2, A0, B0, N2);
   607 		RecursiveMultiply(R2, T2, A1, B1, N2);
   608 
   609 		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
   610 
   611 		carry += Add(T0, T0, R0, N);
   612 		carry += Add(T0, T0, R2, N);
   613 		carry += Add(R1, R1, T0, N);
   614 
   615 		assert (carry >= 0 && carry <= 2);
   616 		Increment(R3, N2, carry);
   617 	}
   618 }
   619 
   620 // R[2*N] - result = A*A
   621 // T[2*N] - temporary work space
   622 // A[N] --- number to be squared
   623 
   624 void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
   625 {
   626 	assert(N && N%2==0);
   627 
   628 	if (N==2)
   629 		AtomicMultiply(R, A, A);
   630 	else if (N==4)
   631 	{
   632 		// VC60 workaround: MSVC 6.0 has an optimization problem that makes
   633 		// (dword)A*B where either A or B has been cast to a dword before
   634 		// very expensive. Revisit a CombaSquare4() function when this
   635 		// problem is fixed.               
   636 
   637 // WARNING: KeithR. 05/08/03 This routine doesn't work with gcc on hardware
   638 // either.  I've completely removed it.  It may be worth looking into sometime
   639 // in the future.  Therefore, we use the CombaMultiply4 on all targets.
   640 //#ifdef __WINS__
   641 		CombaMultiply4(R, A, A);
   642 /*#else
   643 		CombaSquare4(R, A);
   644 #endif*/
   645 	}
   646 	else
   647 	{
   648 		const unsigned int N2 = N/2;
   649 
   650 		RecursiveSquare(R0, T2, A0, N2);
   651 		RecursiveSquare(R2, T2, A1, N2);
   652 		RecursiveMultiply(T0, T2, A0, A1, N2);
   653 
   654 		word carry = Add(R1, R1, T0, N);
   655 		carry += Add(R1, R1, T0, N);
   656 		Increment(R3, N2, carry);
   657 	}
   658 }
   659 // R[N] - bottom half of A*B
   660 // T[N] - temporary work space
   661 // A[N] - multiplier
   662 // B[N] - multiplicant
   663 
   664 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
   665 {
   666 	assert(N>=2 && N%2==0);
   667 
   668 	if (N==2)
   669 		AtomicMultiplyBottom(R, A, B);
   670 	else if (N==4)
   671 		CombaMultiplyBottom4(R, A, B);
   672 	else if (N==8)
   673 		CombaMultiplyBottom8(R, A, B);
   674 	else
   675 	{
   676 		const unsigned int N2 = N/2;
   677 
   678 		RecursiveMultiply(R, T, A0, B0, N2);
   679 		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
   680 		Add(R1, R1, T0, N2);
   681 		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
   682 		Add(R1, R1, T0, N2);
   683 	}
   684 }
   685 
   686 // R[N] --- upper half of A*B
   687 // T[2*N] - temporary work space
   688 // L[N] --- lower half of A*B
   689 // A[N] --- multiplier
   690 // B[N] --- multiplicant
   691 
   692 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
   693 {
   694 	assert(N>=2 && N%2==0);
   695 
   696 	if (N==2)
   697 	{
   698 		AtomicMultiply(T, A, B);
   699 		((dword *)R)[0] = ((dword *)T)[1];
   700 	}
   701 	else if (N==4)
   702 	{
   703 		CombaMultiply4(T, A, B);
   704 		((dword *)R)[0] = ((dword *)T)[2];
   705 		((dword *)R)[1] = ((dword *)T)[3];
   706 	}
   707 	else
   708 	{
   709 		const unsigned int N2 = N/2;
   710 		int carry;
   711 
   712 		int aComp = Compare(A0, A1, N2);
   713 		int bComp = Compare(B0, B1, N2);
   714 
   715 		switch (2*aComp + aComp + bComp)
   716 		{
   717 		case -4:
   718 			Subtract(R0, A1, A0, N2);
   719 			Subtract(R1, B0, B1, N2);
   720 			RecursiveMultiply(T0, T2, R0, R1, N2);
   721 			Subtract(T1, T1, R0, N2);
   722 			carry = -1;
   723 			break;
   724 		case -2:
   725 			Subtract(R0, A1, A0, N2);
   726 			Subtract(R1, B0, B1, N2);
   727 			RecursiveMultiply(T0, T2, R0, R1, N2);
   728 			carry = 0;
   729 			break;
   730 		case 2:
   731 			Subtract(R0, A0, A1, N2);
   732 			Subtract(R1, B1, B0, N2);
   733 			RecursiveMultiply(T0, T2, R0, R1, N2);
   734 			carry = 0;
   735 			break;
   736 		case 4:
   737 			Subtract(R0, A1, A0, N2);
   738 			Subtract(R1, B0, B1, N2);
   739 			RecursiveMultiply(T0, T2, R0, R1, N2);
   740 			Subtract(T1, T1, R1, N2);
   741 			carry = -1;
   742 			break;
   743 		default:
   744 			SetWords(T0, 0, N);
   745 			carry = 0;
   746 		}
   747 
   748 		RecursiveMultiply(T2, R0, A1, B1, N2);
   749 
   750 		// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
   751 
   752 		CopyWords(R0, L+N2, N2);
   753 		word c2 = Subtract(R0, R0, L, N2);
   754 		c2 += Subtract(R0, R0, T0, N2);
   755 		word t = (Compare(R0, T2, N2) == -1);
   756 
   757 		carry += t;
   758 		carry += Increment(R0, N2, c2+t);
   759 		carry += Add(R0, R0, T1, N2);
   760 		carry += Add(R0, R0, T3, N2);
   761 
   762 		CopyWords(R1, T3, N2);
   763 		assert (carry >= 0 && carry <= 2);
   764 		Increment(R1, N2, carry);
   765 	}
   766 }
   767 
   768 // R[NA+NB] - result = A*B
   769 // T[NA+NB] - temporary work space
   770 // A[NA] ---- multiplier
   771 // B[NB] ---- multiplicant
   772 
   773 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
   774 {
   775 	if (NA == NB)
   776 	{
   777 		if (A == B)
   778 			RecursiveSquare(R, T, A, NA);
   779 		else
   780 			RecursiveMultiply(R, T, A, B, NA);
   781 
   782 		return;
   783 	}
   784 
   785 	if (NA > NB)
   786 	{
   787 		TClassSwap(A, B);
   788 		TClassSwap(NA, NB);
   789 		//std::swap(A, B);
   790 		//std::swap(NA, NB);
   791 	}
   792 
   793 	assert(NB % NA == 0);
   794 	assert((NB/NA)%2 == 0); 	// NB is an even multiple of NA
   795 
   796 	if (NA==2 && !A[1])
   797 	{
   798 		switch (A[0])
   799 		{
   800 		case 0:
   801 			SetWords(R, 0, NB+2);
   802 			return;
   803 		case 1:
   804 			CopyWords(R, B, NB);
   805 			R[NB] = R[NB+1] = 0;
   806 			return;
   807 		default:
   808 			R[NB] = LinearMultiply(R, B, A[0], NB);
   809 			R[NB+1] = 0;
   810 			return;
   811 		}
   812 	}
   813 
   814 	RecursiveMultiply(R, T, A, B, NA);
   815 	CopyWords(T+2*NA, R+NA, NA);
   816 
   817 	unsigned i;
   818 
   819 	for (i=2*NA; i<NB; i+=2*NA)
   820 		RecursiveMultiply(T+NA+i, T, A, B+i, NA);
   821 	for (i=NA; i<NB; i+=2*NA)
   822 		RecursiveMultiply(R+i, T, A, B+i, NA);
   823 
   824 	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
   825 		Increment(R+NB, NA);
   826 }
   827 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
   828 // T[3*N/2] - temporary work space
   829 // A[N] ----- an odd number as input
   830 
   831 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
   832 {
   833 	if (N==2)
   834 		AtomicInverseModPower2(R, A[0], A[1]);
   835 	else
   836 	{
   837 		const unsigned int N2 = N/2;
   838 		RecursiveInverseModPower2(R0, T0, A0, N2);
   839 		T0[0] = 1;
   840 		SetWords(T0+1, 0, N2-1);
   841 		RecursiveMultiplyTop(R1, T1, T0, R0, A0, N2);
   842 		RecursiveMultiplyBottom(T0, T1, R0, A1, N2);
   843 		Add(T0, R1, T0, N2);
   844 		TwosComplement(T0, N2);
   845 		RecursiveMultiplyBottom(R1, T1, R0, T0, N2);
   846 	}
   847 }
   848 #undef A0
   849 #undef A1
   850 #undef B0
   851 #undef B1
   852 
   853 #undef T0
   854 #undef T1
   855 #undef T2
   856 #undef T3
   857 
   858 #undef R0
   859 #undef R1
   860 #undef R2
   861 #undef R3
   862 
   863 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
   864 // T[3*N] - temporary work space
   865 // X[2*N] - number to be reduced
   866 // M[N] --- modulus
   867 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
   868 
   869 void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N)
   870 {
   871 	RecursiveMultiplyBottom(R, T, X, U, N);
   872 	RecursiveMultiplyTop(T, T+N, X, R, M, N);
   873 	if (Subtract(R, X+N, T, N))
   874 	{
   875 #ifdef _DEBUG
   876 		word carry = Add(R, R, M, N);
   877 		assert(carry);
   878 #else
   879 		Add(R, R, M, N);
   880 #endif
   881 	}
   882 }
   883 
   884 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
   885 static word SubatomicDivide(word *A, word B0, word B1)
   886 {
   887 	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
   888 	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
   889 
   890 	dword p, u;
   891 	word Q;
   892 
   893 	// estimate the quotient: do a 2 word by 1 word divide
   894 	if (B1+1 == 0)
   895 		Q = A[2];
   896 	else
   897 		Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1));
   898 
   899 	// now subtract Q*B from A
   900 	p = (dword) B0*Q;
   901 	u = (dword) A[0] - LOW_WORD(p);
   902 	A[0] = LOW_WORD(u);
   903 	u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q;
   904 	A[1] = LOW_WORD(u);
   905 	A[2] += HIGH_WORD(u);
   906 
   907 	// Q <= actual quotient, so fix it
   908 	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
   909 	{
   910 		u = (dword) A[0] - B0;
   911 		A[0] = LOW_WORD(u);
   912 		u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u));
   913 		A[1] = LOW_WORD(u);
   914 		A[2] += HIGH_WORD(u);
   915 		Q++;
   916 		assert(Q);	// shouldn't overflow
   917 	}
   918 
   919 	return Q;
   920 }
   921 
   922 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
   923 static inline void AtomicDivide(word *Q, const word *A, const word *B)
   924 {
   925 	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
   926 	{
   927 		Q[0] = A[2];
   928 		Q[1] = A[3];
   929 	}
   930 	else
   931 	{
   932 		word T[4];
   933 		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
   934 		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
   935 		Q[0] = SubatomicDivide(T, B[0], B[1]);
   936 
   937 #ifdef _DEBUG
   938 		// multiply quotient and divisor and add remainder, make sure it equals dividend
   939 		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
   940 		word P[4];
   941 		AtomicMultiply(P, Q, B);
   942 		Add(P, P, T, 4);
   943 		assert(Mem::Compare((TUint8*)P, 4*WORD_SIZE, (TUint8*)A, 4*WORD_SIZE)==0);
   944 #endif
   945 	}
   946 }
   947 
   948 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
   949 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)
   950 {
   951 	assert(N && N%2==0);
   952 
   953 	if (Q[1])
   954 	{
   955 		T[N] = T[N+1] = 0;
   956 		unsigned i;
   957 		for (i=0; i<N; i+=4)
   958 			AtomicMultiply(T+i, Q, B+i);
   959 		for (i=2; i<N; i+=4)
   960 			if (AtomicMultiplyAdd(T+i, Q, B+i))
   961 				T[i+5] += (++T[i+4]==0);
   962 	}
   963 	else
   964 	{
   965 		T[N] = LinearMultiply(T, B, Q[0], N);
   966 		T[N+1] = 0;
   967 	}
   968 
   969 #ifdef _DEBUG
   970 	word borrow = Subtract(R, R, T, N+2);
   971 	assert(!borrow && !R[N+1]);
   972 #else
   973 	Subtract(R, R, T, N+2);
   974 #endif
   975 
   976 	while (R[N] || Compare(R, B, N) >= 0)
   977 	{
   978 		R[N] -= Subtract(R, R, B, N);
   979 		Q[1] += (++Q[0]==0);
   980 		assert(Q[0] || Q[1]); // no overflow
   981 	}
   982 }
   983 
   984 // R[NB] -------- remainder = A%B
   985 // Q[NA-NB+2] --- quotient	= A/B
   986 // T[NA+2*NB+4] - temp work space
   987 // A[NA] -------- dividend
   988 // B[NB] -------- divisor
   989 
   990 void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
   991 {
   992 	assert(NA && NB && NA%2==0 && NB%2==0);
   993 	assert(B[NB-1] || B[NB-2]);
   994 	assert(NB <= NA);
   995 
   996 	// set up temporary work space
   997 	word *const TA=T;
   998 	word *const TB=T+NA+2;
   999 	word *const TP=T+NA+2+NB;
  1000 
  1001 	// copy B into TB and normalize it so that TB has highest bit set to 1
  1002 	unsigned shiftWords = (B[NB-1]==0);
  1003 	TB[0] = TB[NB-1] = 0;
  1004 	CopyWords(TB+shiftWords, B, NB-shiftWords);
  1005 	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
  1006 	assert(shiftBits < WORD_BITS);
  1007 	ShiftWordsLeftByBits(TB, NB, shiftBits);
  1008 
  1009 	// copy A into TA and normalize it
  1010 	TA[0] = TA[NA] = TA[NA+1] = 0;
  1011 	CopyWords(TA+shiftWords, A, NA);
  1012 	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
  1013 
  1014 	if (TA[NA+1]==0 && TA[NA] <= 1)
  1015 	{
  1016 		Q[NA-NB+1] = Q[NA-NB] = 0;
  1017 		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
  1018 		{
  1019 			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
  1020 			++Q[NA-NB];
  1021 		}
  1022 	}
  1023 	else
  1024 	{
  1025 		NA+=2;
  1026 		assert(Compare(TA+NA-NB, TB, NB) < 0);
  1027 	}
  1028 
  1029 	word BT[2];
  1030 	BT[0] = TB[NB-2] + 1;
  1031 	BT[1] = TB[NB-1] + (BT[0]==0);
  1032 
  1033 	// start reducing TA mod TB, 2 words at a time
  1034 	for (unsigned i=NA-2; i>=NB; i-=2)
  1035 	{
  1036 		AtomicDivide(Q+i-NB, TA+i-2, BT);
  1037 		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
  1038 	}
  1039 
  1040 	// copy TA into R, and denormalize it
  1041 	CopyWords(R, TA+shiftWords, NB);
  1042 	ShiftWordsRightByBits(R, NB, shiftBits);
  1043 }
  1044 
  1045 static inline unsigned int EvenWordCount(const word *X, unsigned int N)
  1046 {
  1047 	while (N && X[N-2]==0 && X[N-1]==0)
  1048 		N-=2;
  1049 	return N;
  1050 }
  1051 
  1052 // return k
  1053 // R[N] --- result = A^(-1) * 2^k mod M
  1054 // T[4*N] - temporary work space
  1055 // A[NA] -- number to take inverse of
  1056 // M[N] --- modulus
  1057 
  1058 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N)
  1059 {
  1060 	assert(NA<=N && N && N%2==0);
  1061 
  1062 	word *b = T;
  1063 	word *c = T+N;
  1064 	word *f = T+2*N;
  1065 	word *g = T+3*N;
  1066 	unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
  1067 	unsigned int k=0, s=0;
  1068 
  1069 	SetWords(T, 0, 3*N);
  1070 	b[0]=1;
  1071 	CopyWords(f, A, NA);
  1072 	CopyWords(g, M, N);
  1073 
  1074 	FOREVER
  1075 	{
  1076 		word t=f[0];
  1077 		while (!t)
  1078 		{
  1079 			if (EvenWordCount(f, fgLen)==0)
  1080 			{
  1081 				SetWords(R, 0, N);
  1082 				return 0;
  1083 			}
  1084 
  1085 			ShiftWordsRightByWords(f, fgLen, 1);
  1086 			if (c[bcLen-1]) bcLen+=2;
  1087 			assert(bcLen <= N);
  1088 			ShiftWordsLeftByWords(c, bcLen, 1);
  1089 			k+=WORD_BITS;
  1090 			t=f[0];
  1091 		}
  1092 
  1093 		unsigned int i=0;
  1094 		while (t%2 == 0)
  1095 		{
  1096 			t>>=1;
  1097 			i++;
  1098 		}
  1099 		k+=i;
  1100 
  1101 		if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
  1102 		{
  1103 			if (s%2==0)
  1104 				CopyWords(R, b, N);
  1105 			else
  1106 				Subtract(R, M, b, N);
  1107 			return k;
  1108 		}
  1109 
  1110 		ShiftWordsRightByBits(f, fgLen, i);
  1111 		t=ShiftWordsLeftByBits(c, bcLen, i);
  1112 		if (t)
  1113 		{
  1114 			c[bcLen] = t;
  1115 			bcLen+=2;
  1116 			assert(bcLen <= N);
  1117 		}
  1118 
  1119 		if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
  1120 			fgLen-=2;
  1121 
  1122 		if (Compare(f, g, fgLen)==-1)
  1123 		{
  1124 			TClassSwap<word*>(f,g);
  1125 			TClassSwap<word*>(b,c);
  1126 			s++;
  1127 		}
  1128 
  1129 		Subtract(f, f, g, fgLen);
  1130 
  1131 		if (Add(b, b, c, bcLen))
  1132 		{
  1133 			b[bcLen] = 1;
  1134 			bcLen+=2;
  1135 			assert(bcLen <= N);
  1136 		}
  1137 	}
  1138 }
  1139 
  1140 // R[N] - result = A/(2^k) mod M
  1141 // A[N] - input
  1142 // M[N] - modulus
  1143 
  1144 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
  1145 {
  1146 	CopyWords(R, A, N);
  1147 
  1148 	while (k--)
  1149 	{
  1150 		if (R[0]%2==0)
  1151 			ShiftWordsRightByBits(R, N, 1);
  1152 		else
  1153 		{
  1154 			word carry = Add(R, R, M, N);
  1155 			ShiftWordsRightByBits(R, N, 1);
  1156 			R[N-1] += carry<<(WORD_BITS-1);
  1157 		}
  1158 	}
  1159 }
  1160