os/security/crypto/weakcryptospi/source/bigint/algorithms.cpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
sl@0
     2
* Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
sl@0
     3
* All rights reserved.
sl@0
     4
* This component and the accompanying materials are made available
sl@0
     5
* under the terms of the License "Eclipse Public License v1.0"
sl@0
     6
* which accompanies this distribution, and is available
sl@0
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
sl@0
     8
*
sl@0
     9
* Initial Contributors:
sl@0
    10
* Nokia Corporation - initial contribution.
sl@0
    11
*
sl@0
    12
* Contributors:
sl@0
    13
*
sl@0
    14
* Description: 
sl@0
    15
*
sl@0
    16
*/
sl@0
    17
sl@0
    18
sl@0
    19
#include "words.h"
sl@0
    20
#include "algorithms.h"
sl@0
    21
sl@0
    22
word Add(word *C, const word *A, const word *B, unsigned int N)
sl@0
    23
{
sl@0
    24
	assert (N%2 == 0);
sl@0
    25
	word carry = 0;
sl@0
    26
	for (unsigned int i = 0; i < N; i+=2)
sl@0
    27
	{
sl@0
    28
		dword u = (dword) carry + A[i] + B[i];
sl@0
    29
		C[i] = LOW_WORD(u);
sl@0
    30
		u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1];
sl@0
    31
		C[i+1] = LOW_WORD(u);
sl@0
    32
		carry = HIGH_WORD(u);
sl@0
    33
	}
sl@0
    34
	return carry;
sl@0
    35
}
sl@0
    36
sl@0
    37
word Subtract(word *C, const word *A, const word *B, unsigned int N)
sl@0
    38
{
sl@0
    39
	assert (N%2 == 0);
sl@0
    40
	word borrow=0;
sl@0
    41
	for (unsigned i = 0; i < N; i+=2)
sl@0
    42
	{
sl@0
    43
		dword u = (dword) A[i] - B[i] - borrow;
sl@0
    44
		C[i] = LOW_WORD(u);
sl@0
    45
		u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u));
sl@0
    46
		C[i+1] = LOW_WORD(u);
sl@0
    47
		borrow = 0-HIGH_WORD(u);
sl@0
    48
	}
sl@0
    49
	return borrow;
sl@0
    50
}
sl@0
    51
sl@0
    52
int Compare(const word *A, const word *B, unsigned int N)
sl@0
    53
{
sl@0
    54
	while (N--)
sl@0
    55
		if (A[N] > B[N])
sl@0
    56
			return 1;
sl@0
    57
		else if (A[N] < B[N])
sl@0
    58
			return -1;
sl@0
    59
sl@0
    60
	return 0;
sl@0
    61
}
sl@0
    62
sl@0
    63
// It is the job of the calling code to ensure that this won't carry.
sl@0
    64
// If you aren't sure, use the next version that will tell you if you need to
sl@0
    65
// grow your integer.
sl@0
    66
// Having two of these creates ever so slightly more code but avoids having
sl@0
    67
// ifdefs all over the rest of the code checking the following type stuff which
sl@0
    68
// causes warnings in certain compilers about unused parameters in release
sl@0
    69
// builds.  We can't have that can we!
sl@0
    70
/*
sl@0
    71
Allows avoid this all over bigint.cpp and primes.cpp
sl@0
    72
ifdef _DEBUG
sl@0
    73
	TUint carry = Increment(Ptr(), Size());
sl@0
    74
	assert(!carry);
sl@0
    75
else
sl@0
    76
	Increment(Ptr(), Size())
sl@0
    77
endif
sl@0
    78
*/
sl@0
    79
void IncrementNoCarry(word *A, unsigned int N, word B)
sl@0
    80
{
sl@0
    81
	assert(N);
sl@0
    82
	word t = A[0];
sl@0
    83
	A[0] = t+B;
sl@0
    84
	if (A[0] >= t)
sl@0
    85
		return;
sl@0
    86
	for (unsigned i=1; i<N; i++)
sl@0
    87
		if (++A[i])
sl@0
    88
			return;
sl@0
    89
	assert(0);
sl@0
    90
}
sl@0
    91
sl@0
    92
word Increment(word *A, unsigned int N, word B)
sl@0
    93
{
sl@0
    94
	assert(N);
sl@0
    95
	word t = A[0];
sl@0
    96
	A[0] = t+B;
sl@0
    97
	if (A[0] >= t)
sl@0
    98
		return 0;
sl@0
    99
	for (unsigned i=1; i<N; i++)
sl@0
   100
		if (++A[i])
sl@0
   101
			return 0;
sl@0
   102
	return 1;
sl@0
   103
}
sl@0
   104
sl@0
   105
//See commments above about IncrementNoCarry
sl@0
   106
void DecrementNoCarry(word *A, unsigned int N, word B)
sl@0
   107
{
sl@0
   108
	assert(N);
sl@0
   109
	word t = A[0];
sl@0
   110
	A[0] = t-B;
sl@0
   111
	if (A[0] <= t)
sl@0
   112
		return;
sl@0
   113
	for (unsigned i=1; i<N; i++)
sl@0
   114
		if (A[i]--)
sl@0
   115
			return;
sl@0
   116
	assert(0);
sl@0
   117
}
sl@0
   118
sl@0
   119
word Decrement(word *A, unsigned int N, word B)
sl@0
   120
{
sl@0
   121
	assert(N);
sl@0
   122
	word t = A[0];
sl@0
   123
	A[0] = t-B;
sl@0
   124
	if (A[0] <= t)
sl@0
   125
		return 0;
sl@0
   126
	for (unsigned i=1; i<N; i++)
sl@0
   127
		if (A[i]--)
sl@0
   128
			return 0;
sl@0
   129
	return 1;
sl@0
   130
}
sl@0
   131
sl@0
   132
void TwosComplement(word *A, unsigned int N)
sl@0
   133
{
sl@0
   134
	Decrement(A, N);
sl@0
   135
	for (unsigned i=0; i<N; i++)
sl@0
   136
		A[i] = ~A[i];
sl@0
   137
}
sl@0
   138
sl@0
   139
static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
sl@0
   140
{
sl@0
   141
	word carry=0;
sl@0
   142
	for(unsigned i=0; i<N; i++)
sl@0
   143
	{
sl@0
   144
		dword p = (dword)A[i] * B + carry;
sl@0
   145
		C[i] = LOW_WORD(p);
sl@0
   146
		carry = HIGH_WORD(p);
sl@0
   147
	}
sl@0
   148
	return carry;
sl@0
   149
}
sl@0
   150
sl@0
   151
static void AtomicMultiply(word *C, const word *A, const word *B)
sl@0
   152
{
sl@0
   153
/*
sl@0
   154
	word s;
sl@0
   155
	dword d;
sl@0
   156
sl@0
   157
	if (A1 >= A0)
sl@0
   158
		if (B0 >= B1)
sl@0
   159
		{
sl@0
   160
			s = 0;
sl@0
   161
			d = (dword)(A1-A0)*(B0-B1);
sl@0
   162
		}
sl@0
   163
		else
sl@0
   164
		{
sl@0
   165
			s = (A1-A0);
sl@0
   166
			d = (dword)s*(word)(B0-B1);
sl@0
   167
		}
sl@0
   168
	else
sl@0
   169
		if (B0 > B1)
sl@0
   170
		{
sl@0
   171
			s = (B0-B1);
sl@0
   172
			d = (word)(A1-A0)*(dword)s;
sl@0
   173
		}
sl@0
   174
		else
sl@0
   175
		{
sl@0
   176
			s = 0;
sl@0
   177
			d = (dword)(A0-A1)*(B1-B0);
sl@0
   178
		}
sl@0
   179
*/
sl@0
   180
	// this segment is the branchless equivalent of above
sl@0
   181
	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
sl@0
   182
	unsigned int ai = A[1] < A[0];
sl@0
   183
	unsigned int bi = B[0] < B[1];
sl@0
   184
	unsigned int di = ai & bi;
sl@0
   185
	dword d = (dword)D[di]*D[di+2];
sl@0
   186
	D[1] = D[3] = 0;
sl@0
   187
	unsigned int si = ai + !bi;
sl@0
   188
	word s = D[si];
sl@0
   189
sl@0
   190
	dword A0B0 = (dword)A[0]*B[0];
sl@0
   191
	C[0] = LOW_WORD(A0B0);
sl@0
   192
sl@0
   193
	dword A1B1 = (dword)A[1]*B[1];
sl@0
   194
	dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1);
sl@0
   195
	C[1] = LOW_WORD(t);
sl@0
   196
sl@0
   197
	t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s;
sl@0
   198
	C[2] = LOW_WORD(t);
sl@0
   199
	C[3] = HIGH_WORD(t);
sl@0
   200
}
sl@0
   201
sl@0
   202
static word AtomicMultiplyAdd(word *C, const word *A, const word *B)
sl@0
   203
{
sl@0
   204
	word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
sl@0
   205
	unsigned int ai = A[1] < A[0];
sl@0
   206
	unsigned int bi = B[0] < B[1];
sl@0
   207
	unsigned int di = ai & bi;
sl@0
   208
	dword d = (dword)D[di]*D[di+2];
sl@0
   209
	D[1] = D[3] = 0;
sl@0
   210
	unsigned int si = ai + !bi;
sl@0
   211
	word s = D[si];
sl@0
   212
sl@0
   213
	dword A0B0 = (dword)A[0]*B[0];
sl@0
   214
	dword t = A0B0 + C[0];
sl@0
   215
	C[0] = LOW_WORD(t);
sl@0
   216
sl@0
   217
	dword A1B1 = (dword)A[1]*B[1];
sl@0
   218
	t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1];
sl@0
   219
	C[1] = LOW_WORD(t);
sl@0
   220
sl@0
   221
	t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2];
sl@0
   222
	C[2] = LOW_WORD(t);
sl@0
   223
sl@0
   224
	t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3];
sl@0
   225
	C[3] = LOW_WORD(t);
sl@0
   226
	return HIGH_WORD(t);
sl@0
   227
}
sl@0
   228
sl@0
   229
static inline void AtomicMultiplyBottom(word *C, const word *A, const word *B)
sl@0
   230
{
sl@0
   231
	dword t = (dword)A[0]*B[0];
sl@0
   232
	C[0] = LOW_WORD(t);
sl@0
   233
	C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0];
sl@0
   234
}
sl@0
   235
sl@0
   236
#define MulAcc(x, y)								\
sl@0
   237
	p = (dword)A[x] * B[y] + c; 					\
sl@0
   238
	c = LOW_WORD(p);								\
sl@0
   239
	p = (dword)d + HIGH_WORD(p);					\
sl@0
   240
	d = LOW_WORD(p);								\
sl@0
   241
	e += HIGH_WORD(p);
sl@0
   242
sl@0
   243
#define SaveMulAcc(s, x, y) 						\
sl@0
   244
	R[s] = c;										\
sl@0
   245
	p = (dword)A[x] * B[y] + d; 					\
sl@0
   246
	c = LOW_WORD(p);								\
sl@0
   247
	p = (dword)e + HIGH_WORD(p);					\
sl@0
   248
	d = LOW_WORD(p);								\
sl@0
   249
	e = HIGH_WORD(p);
sl@0
   250
sl@0
   251
#define MulAcc1(x, y)								\
sl@0
   252
	p = (dword)A[x] * A[y] + c; 					\
sl@0
   253
	c = LOW_WORD(p);								\
sl@0
   254
	p = (dword)d + HIGH_WORD(p);					\
sl@0
   255
	d = LOW_WORD(p);								\
sl@0
   256
	e += HIGH_WORD(p);
sl@0
   257
sl@0
   258
#define SaveMulAcc1(s, x, y) 						\
sl@0
   259
	R[s] = c;										\
sl@0
   260
	p = (dword)A[x] * A[y] + d; 					\
sl@0
   261
	c = LOW_WORD(p);								\
sl@0
   262
	p = (dword)e + HIGH_WORD(p);					\
sl@0
   263
	d = LOW_WORD(p);								\
sl@0
   264
	e = HIGH_WORD(p);
sl@0
   265
sl@0
   266
#define SquAcc(x, y)								\
sl@0
   267
	p = (dword)A[x] * A[y];	\
sl@0
   268
	p = p + p + c; 					\
sl@0
   269
	c = LOW_WORD(p);								\
sl@0
   270
	p = (dword)d + HIGH_WORD(p);					\
sl@0
   271
	d = LOW_WORD(p);								\
sl@0
   272
	e += HIGH_WORD(p);
sl@0
   273
sl@0
   274
#define SaveSquAcc(s, x, y) 						\
sl@0
   275
	R[s] = c;										\
sl@0
   276
	p = (dword)A[x] * A[y];	\
sl@0
   277
	p = p + p + d; 					\
sl@0
   278
	c = LOW_WORD(p);								\
sl@0
   279
	p = (dword)e + HIGH_WORD(p);					\
sl@0
   280
	d = LOW_WORD(p);								\
sl@0
   281
	e = HIGH_WORD(p);
sl@0
   282
sl@0
   283
// VC60 workaround: MSVC 6.0 has an optimization problem that makes
sl@0
   284
// (dword)A*B where either A or B has been cast to a dword before
sl@0
   285
// very expensive. Revisit a CombaSquare4() function when this
sl@0
   286
// problem is fixed.               
sl@0
   287
sl@0
   288
// WARNING: KeithR.  05/08/03 This routine doesn't work with gcc on hardware
sl@0
   289
// either.  I've completely removed it.  It may be worth looking into sometime
sl@0
   290
// in the future.
sl@0
   291
/*#ifndef __WINS__
sl@0
   292
static void CombaSquare4(word *R, const word *A)
sl@0
   293
{
sl@0
   294
	dword p;
sl@0
   295
	word c, d, e;
sl@0
   296
sl@0
   297
	p = (dword)A[0] * A[0];
sl@0
   298
	R[0] = LOW_WORD(p);
sl@0
   299
	c = HIGH_WORD(p);
sl@0
   300
	d = e = 0;
sl@0
   301
sl@0
   302
	SquAcc(0, 1);
sl@0
   303
sl@0
   304
	SaveSquAcc(1, 2, 0);
sl@0
   305
	MulAcc1(1, 1);
sl@0
   306
sl@0
   307
	SaveSquAcc(2, 0, 3);
sl@0
   308
	SquAcc(1, 2);
sl@0
   309
sl@0
   310
	SaveSquAcc(3, 3, 1);
sl@0
   311
	MulAcc1(2, 2);
sl@0
   312
sl@0
   313
	SaveSquAcc(4, 2, 3);
sl@0
   314
sl@0
   315
	R[5] = c;
sl@0
   316
	p = (dword)A[3] * A[3] + d;
sl@0
   317
	R[6] = LOW_WORD(p);
sl@0
   318
	R[7] = e + HIGH_WORD(p);
sl@0
   319
}
sl@0
   320
#endif */
sl@0
   321
sl@0
   322
static void CombaMultiply4(word *R, const word *A, const word *B)
sl@0
   323
{
sl@0
   324
	dword p;
sl@0
   325
	word c, d, e;
sl@0
   326
sl@0
   327
	p = (dword)A[0] * B[0];
sl@0
   328
	R[0] = LOW_WORD(p);
sl@0
   329
	c = HIGH_WORD(p);
sl@0
   330
	d = e = 0;
sl@0
   331
sl@0
   332
	MulAcc(0, 1);
sl@0
   333
	MulAcc(1, 0);
sl@0
   334
sl@0
   335
	SaveMulAcc(1, 2, 0);
sl@0
   336
	MulAcc(1, 1);
sl@0
   337
	MulAcc(0, 2);
sl@0
   338
sl@0
   339
	SaveMulAcc(2, 0, 3);
sl@0
   340
	MulAcc(1, 2);
sl@0
   341
	MulAcc(2, 1);
sl@0
   342
	MulAcc(3, 0);
sl@0
   343
sl@0
   344
	SaveMulAcc(3, 3, 1);
sl@0
   345
	MulAcc(2, 2);
sl@0
   346
	MulAcc(1, 3);
sl@0
   347
sl@0
   348
	SaveMulAcc(4, 2, 3);
sl@0
   349
	MulAcc(3, 2);
sl@0
   350
sl@0
   351
	R[5] = c;
sl@0
   352
	p = (dword)A[3] * B[3] + d;
sl@0
   353
	R[6] = LOW_WORD(p);
sl@0
   354
	R[7] = e + HIGH_WORD(p);
sl@0
   355
}
sl@0
   356
sl@0
   357
static void CombaMultiply8(word *R, const word *A, const word *B)
sl@0
   358
{
sl@0
   359
	dword p;
sl@0
   360
	word c, d, e;
sl@0
   361
sl@0
   362
	p = (dword)A[0] * B[0];
sl@0
   363
	R[0] = LOW_WORD(p);
sl@0
   364
	c = HIGH_WORD(p);
sl@0
   365
	d = e = 0;
sl@0
   366
sl@0
   367
	MulAcc(0, 1);
sl@0
   368
	MulAcc(1, 0);
sl@0
   369
sl@0
   370
	SaveMulAcc(1, 2, 0);
sl@0
   371
	MulAcc(1, 1);
sl@0
   372
	MulAcc(0, 2);
sl@0
   373
sl@0
   374
	SaveMulAcc(2, 0, 3);
sl@0
   375
	MulAcc(1, 2);
sl@0
   376
	MulAcc(2, 1);
sl@0
   377
	MulAcc(3, 0);
sl@0
   378
sl@0
   379
	SaveMulAcc(3, 0, 4);
sl@0
   380
	MulAcc(1, 3);
sl@0
   381
	MulAcc(2, 2);
sl@0
   382
	MulAcc(3, 1);
sl@0
   383
	MulAcc(4, 0);
sl@0
   384
sl@0
   385
	SaveMulAcc(4, 0, 5);
sl@0
   386
	MulAcc(1, 4);
sl@0
   387
	MulAcc(2, 3);
sl@0
   388
	MulAcc(3, 2);
sl@0
   389
	MulAcc(4, 1);
sl@0
   390
	MulAcc(5, 0);
sl@0
   391
sl@0
   392
	SaveMulAcc(5, 0, 6);
sl@0
   393
	MulAcc(1, 5);
sl@0
   394
	MulAcc(2, 4);
sl@0
   395
	MulAcc(3, 3);
sl@0
   396
	MulAcc(4, 2);
sl@0
   397
	MulAcc(5, 1);
sl@0
   398
	MulAcc(6, 0);
sl@0
   399
sl@0
   400
	SaveMulAcc(6, 0, 7);
sl@0
   401
	MulAcc(1, 6);
sl@0
   402
	MulAcc(2, 5);
sl@0
   403
	MulAcc(3, 4);
sl@0
   404
	MulAcc(4, 3);
sl@0
   405
	MulAcc(5, 2);
sl@0
   406
	MulAcc(6, 1);
sl@0
   407
	MulAcc(7, 0);
sl@0
   408
sl@0
   409
	SaveMulAcc(7, 1, 7);
sl@0
   410
	MulAcc(2, 6);
sl@0
   411
	MulAcc(3, 5);
sl@0
   412
	MulAcc(4, 4);
sl@0
   413
	MulAcc(5, 3);
sl@0
   414
	MulAcc(6, 2);
sl@0
   415
	MulAcc(7, 1);
sl@0
   416
sl@0
   417
	SaveMulAcc(8, 2, 7);
sl@0
   418
	MulAcc(3, 6);
sl@0
   419
	MulAcc(4, 5);
sl@0
   420
	MulAcc(5, 4);
sl@0
   421
	MulAcc(6, 3);
sl@0
   422
	MulAcc(7, 2);
sl@0
   423
sl@0
   424
	SaveMulAcc(9, 3, 7);
sl@0
   425
	MulAcc(4, 6);
sl@0
   426
	MulAcc(5, 5);
sl@0
   427
	MulAcc(6, 4);
sl@0
   428
	MulAcc(7, 3);
sl@0
   429
sl@0
   430
	SaveMulAcc(10, 4, 7);
sl@0
   431
	MulAcc(5, 6);
sl@0
   432
	MulAcc(6, 5);
sl@0
   433
	MulAcc(7, 4);
sl@0
   434
sl@0
   435
	SaveMulAcc(11, 5, 7);
sl@0
   436
	MulAcc(6, 6);
sl@0
   437
	MulAcc(7, 5);
sl@0
   438
sl@0
   439
	SaveMulAcc(12, 6, 7);
sl@0
   440
	MulAcc(7, 6);
sl@0
   441
sl@0
   442
	R[13] = c;
sl@0
   443
	p = (dword)A[7] * B[7] + d;
sl@0
   444
	R[14] = LOW_WORD(p);
sl@0
   445
	R[15] = e + HIGH_WORD(p);
sl@0
   446
}
sl@0
   447
sl@0
   448
static void CombaMultiplyBottom4(word *R, const word *A, const word *B)
sl@0
   449
{
sl@0
   450
	dword p;
sl@0
   451
	word c, d, e;
sl@0
   452
sl@0
   453
	p = (dword)A[0] * B[0];
sl@0
   454
	R[0] = LOW_WORD(p);
sl@0
   455
	c = HIGH_WORD(p);
sl@0
   456
	d = e = 0;
sl@0
   457
sl@0
   458
	MulAcc(0, 1);
sl@0
   459
	MulAcc(1, 0);
sl@0
   460
sl@0
   461
	SaveMulAcc(1, 2, 0);
sl@0
   462
	MulAcc(1, 1);
sl@0
   463
	MulAcc(0, 2);
sl@0
   464
sl@0
   465
	R[2] = c;
sl@0
   466
	R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
sl@0
   467
}
sl@0
   468
sl@0
   469
static void CombaMultiplyBottom8(word *R, const word *A, const word *B)
sl@0
   470
{
sl@0
   471
	dword p;
sl@0
   472
	word c, d, e;
sl@0
   473
sl@0
   474
	p = (dword)A[0] * B[0];
sl@0
   475
	R[0] = LOW_WORD(p);
sl@0
   476
	c = HIGH_WORD(p);
sl@0
   477
	d = e = 0;
sl@0
   478
sl@0
   479
	MulAcc(0, 1);
sl@0
   480
	MulAcc(1, 0);
sl@0
   481
sl@0
   482
	SaveMulAcc(1, 2, 0);
sl@0
   483
	MulAcc(1, 1);
sl@0
   484
	MulAcc(0, 2);
sl@0
   485
sl@0
   486
	SaveMulAcc(2, 0, 3);
sl@0
   487
	MulAcc(1, 2);
sl@0
   488
	MulAcc(2, 1);
sl@0
   489
	MulAcc(3, 0);
sl@0
   490
sl@0
   491
	SaveMulAcc(3, 0, 4);
sl@0
   492
	MulAcc(1, 3);
sl@0
   493
	MulAcc(2, 2);
sl@0
   494
	MulAcc(3, 1);
sl@0
   495
	MulAcc(4, 0);
sl@0
   496
sl@0
   497
	SaveMulAcc(4, 0, 5);
sl@0
   498
	MulAcc(1, 4);
sl@0
   499
	MulAcc(2, 3);
sl@0
   500
	MulAcc(3, 2);
sl@0
   501
	MulAcc(4, 1);
sl@0
   502
	MulAcc(5, 0);
sl@0
   503
sl@0
   504
	SaveMulAcc(5, 0, 6);
sl@0
   505
	MulAcc(1, 5);
sl@0
   506
	MulAcc(2, 4);
sl@0
   507
	MulAcc(3, 3);
sl@0
   508
	MulAcc(4, 2);
sl@0
   509
	MulAcc(5, 1);
sl@0
   510
	MulAcc(6, 0);
sl@0
   511
sl@0
   512
	R[6] = c;
sl@0
   513
	R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
sl@0
   514
				A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
sl@0
   515
}
sl@0
   516
sl@0
   517
#undef MulAcc
sl@0
   518
#undef SaveMulAcc
sl@0
   519
static void AtomicInverseModPower2(word *C, word A0, word A1)
sl@0
   520
{
sl@0
   521
	assert(A0%2==1);
sl@0
   522
sl@0
   523
	dword A=MAKE_DWORD(A0, A1), R=A0%8;
sl@0
   524
sl@0
   525
	for (unsigned i=3; i<2*WORD_BITS; i*=2)
sl@0
   526
		R = R*(2-R*A);
sl@0
   527
sl@0
   528
	assert(R*A==1);
sl@0
   529
sl@0
   530
	C[0] = LOW_WORD(R);
sl@0
   531
	C[1] = HIGH_WORD(R);
sl@0
   532
}
sl@0
   533
// ********************************************************
sl@0
   534
sl@0
   535
#define A0		A
sl@0
   536
#define A1		(A+N2)
sl@0
   537
#define B0		B
sl@0
   538
#define B1		(B+N2)
sl@0
   539
sl@0
   540
#define T0		T
sl@0
   541
#define T1		(T+N2)
sl@0
   542
#define T2		(T+N)
sl@0
   543
#define T3		(T+N+N2)
sl@0
   544
sl@0
   545
#define R0		R
sl@0
   546
#define R1		(R+N2)
sl@0
   547
#define R2		(R+N)
sl@0
   548
#define R3		(R+N+N2)
sl@0
   549
sl@0
   550
// R[2*N] - result = A*B
sl@0
   551
// T[2*N] - temporary work space
sl@0
   552
// A[N] --- multiplier
sl@0
   553
// B[N] --- multiplicant
sl@0
   554
sl@0
   555
void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N)
sl@0
   556
{
sl@0
   557
	assert(N>=2 && N%2==0);
sl@0
   558
sl@0
   559
	if (N==2)
sl@0
   560
		AtomicMultiply(R, A, B);
sl@0
   561
	else if (N==4)
sl@0
   562
		CombaMultiply4(R, A, B);
sl@0
   563
	else if (N==8)
sl@0
   564
		CombaMultiply8(R, A, B);
sl@0
   565
	else
sl@0
   566
	{
sl@0
   567
		const unsigned int N2 = N/2;
sl@0
   568
		int carry;
sl@0
   569
sl@0
   570
		int aComp = Compare(A0, A1, N2);
sl@0
   571
		int bComp = Compare(B0, B1, N2);
sl@0
   572
sl@0
   573
		switch (2*aComp + aComp + bComp)
sl@0
   574
		{
sl@0
   575
		case -4:
sl@0
   576
			Subtract(R0, A1, A0, N2);
sl@0
   577
			Subtract(R1, B0, B1, N2);
sl@0
   578
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   579
			Subtract(T1, T1, R0, N2);
sl@0
   580
			carry = -1;
sl@0
   581
			break;
sl@0
   582
		case -2:
sl@0
   583
			Subtract(R0, A1, A0, N2);
sl@0
   584
			Subtract(R1, B0, B1, N2);
sl@0
   585
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   586
			carry = 0;
sl@0
   587
			break;
sl@0
   588
		case 2:
sl@0
   589
			Subtract(R0, A0, A1, N2);
sl@0
   590
			Subtract(R1, B1, B0, N2);
sl@0
   591
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   592
			carry = 0;
sl@0
   593
			break;
sl@0
   594
		case 4:
sl@0
   595
			Subtract(R0, A1, A0, N2);
sl@0
   596
			Subtract(R1, B0, B1, N2);
sl@0
   597
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   598
			Subtract(T1, T1, R1, N2);
sl@0
   599
			carry = -1;
sl@0
   600
			break;
sl@0
   601
		default:
sl@0
   602
			SetWords(T0, 0, N);
sl@0
   603
			carry = 0;
sl@0
   604
		}
sl@0
   605
sl@0
   606
		RecursiveMultiply(R0, T2, A0, B0, N2);
sl@0
   607
		RecursiveMultiply(R2, T2, A1, B1, N2);
sl@0
   608
sl@0
   609
		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
sl@0
   610
sl@0
   611
		carry += Add(T0, T0, R0, N);
sl@0
   612
		carry += Add(T0, T0, R2, N);
sl@0
   613
		carry += Add(R1, R1, T0, N);
sl@0
   614
sl@0
   615
		assert (carry >= 0 && carry <= 2);
sl@0
   616
		Increment(R3, N2, carry);
sl@0
   617
	}
sl@0
   618
}
sl@0
   619
sl@0
   620
// R[2*N] - result = A*A
sl@0
   621
// T[2*N] - temporary work space
sl@0
   622
// A[N] --- number to be squared
sl@0
   623
sl@0
   624
void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)
sl@0
   625
{
sl@0
   626
	assert(N && N%2==0);
sl@0
   627
sl@0
   628
	if (N==2)
sl@0
   629
		AtomicMultiply(R, A, A);
sl@0
   630
	else if (N==4)
sl@0
   631
	{
sl@0
   632
		// VC60 workaround: MSVC 6.0 has an optimization problem that makes
sl@0
   633
		// (dword)A*B where either A or B has been cast to a dword before
sl@0
   634
		// very expensive. Revisit a CombaSquare4() function when this
sl@0
   635
		// problem is fixed.               
sl@0
   636
sl@0
   637
// WARNING: KeithR. 05/08/03 This routine doesn't work with gcc on hardware
sl@0
   638
// either.  I've completely removed it.  It may be worth looking into sometime
sl@0
   639
// in the future.  Therefore, we use the CombaMultiply4 on all targets.
sl@0
   640
//#ifdef __WINS__
sl@0
   641
		CombaMultiply4(R, A, A);
sl@0
   642
/*#else
sl@0
   643
		CombaSquare4(R, A);
sl@0
   644
#endif*/
sl@0
   645
	}
sl@0
   646
	else
sl@0
   647
	{
sl@0
   648
		const unsigned int N2 = N/2;
sl@0
   649
sl@0
   650
		RecursiveSquare(R0, T2, A0, N2);
sl@0
   651
		RecursiveSquare(R2, T2, A1, N2);
sl@0
   652
		RecursiveMultiply(T0, T2, A0, A1, N2);
sl@0
   653
sl@0
   654
		word carry = Add(R1, R1, T0, N);
sl@0
   655
		carry += Add(R1, R1, T0, N);
sl@0
   656
		Increment(R3, N2, carry);
sl@0
   657
	}
sl@0
   658
}
sl@0
   659
// R[N] - bottom half of A*B
sl@0
   660
// T[N] - temporary work space
sl@0
   661
// A[N] - multiplier
sl@0
   662
// B[N] - multiplicant
sl@0
   663
sl@0
   664
void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, unsigned int N)
sl@0
   665
{
sl@0
   666
	assert(N>=2 && N%2==0);
sl@0
   667
sl@0
   668
	if (N==2)
sl@0
   669
		AtomicMultiplyBottom(R, A, B);
sl@0
   670
	else if (N==4)
sl@0
   671
		CombaMultiplyBottom4(R, A, B);
sl@0
   672
	else if (N==8)
sl@0
   673
		CombaMultiplyBottom8(R, A, B);
sl@0
   674
	else
sl@0
   675
	{
sl@0
   676
		const unsigned int N2 = N/2;
sl@0
   677
sl@0
   678
		RecursiveMultiply(R, T, A0, B0, N2);
sl@0
   679
		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
sl@0
   680
		Add(R1, R1, T0, N2);
sl@0
   681
		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
sl@0
   682
		Add(R1, R1, T0, N2);
sl@0
   683
	}
sl@0
   684
}
sl@0
   685
sl@0
   686
// R[N] --- upper half of A*B
sl@0
   687
// T[2*N] - temporary work space
sl@0
   688
// L[N] --- lower half of A*B
sl@0
   689
// A[N] --- multiplier
sl@0
   690
// B[N] --- multiplicant
sl@0
   691
sl@0
   692
void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, unsigned int N)
sl@0
   693
{
sl@0
   694
	assert(N>=2 && N%2==0);
sl@0
   695
sl@0
   696
	if (N==2)
sl@0
   697
	{
sl@0
   698
		AtomicMultiply(T, A, B);
sl@0
   699
		((dword *)R)[0] = ((dword *)T)[1];
sl@0
   700
	}
sl@0
   701
	else if (N==4)
sl@0
   702
	{
sl@0
   703
		CombaMultiply4(T, A, B);
sl@0
   704
		((dword *)R)[0] = ((dword *)T)[2];
sl@0
   705
		((dword *)R)[1] = ((dword *)T)[3];
sl@0
   706
	}
sl@0
   707
	else
sl@0
   708
	{
sl@0
   709
		const unsigned int N2 = N/2;
sl@0
   710
		int carry;
sl@0
   711
sl@0
   712
		int aComp = Compare(A0, A1, N2);
sl@0
   713
		int bComp = Compare(B0, B1, N2);
sl@0
   714
sl@0
   715
		switch (2*aComp + aComp + bComp)
sl@0
   716
		{
sl@0
   717
		case -4:
sl@0
   718
			Subtract(R0, A1, A0, N2);
sl@0
   719
			Subtract(R1, B0, B1, N2);
sl@0
   720
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   721
			Subtract(T1, T1, R0, N2);
sl@0
   722
			carry = -1;
sl@0
   723
			break;
sl@0
   724
		case -2:
sl@0
   725
			Subtract(R0, A1, A0, N2);
sl@0
   726
			Subtract(R1, B0, B1, N2);
sl@0
   727
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   728
			carry = 0;
sl@0
   729
			break;
sl@0
   730
		case 2:
sl@0
   731
			Subtract(R0, A0, A1, N2);
sl@0
   732
			Subtract(R1, B1, B0, N2);
sl@0
   733
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   734
			carry = 0;
sl@0
   735
			break;
sl@0
   736
		case 4:
sl@0
   737
			Subtract(R0, A1, A0, N2);
sl@0
   738
			Subtract(R1, B0, B1, N2);
sl@0
   739
			RecursiveMultiply(T0, T2, R0, R1, N2);
sl@0
   740
			Subtract(T1, T1, R1, N2);
sl@0
   741
			carry = -1;
sl@0
   742
			break;
sl@0
   743
		default:
sl@0
   744
			SetWords(T0, 0, N);
sl@0
   745
			carry = 0;
sl@0
   746
		}
sl@0
   747
sl@0
   748
		RecursiveMultiply(T2, R0, A1, B1, N2);
sl@0
   749
sl@0
   750
		// now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
sl@0
   751
sl@0
   752
		CopyWords(R0, L+N2, N2);
sl@0
   753
		word c2 = Subtract(R0, R0, L, N2);
sl@0
   754
		c2 += Subtract(R0, R0, T0, N2);
sl@0
   755
		word t = (Compare(R0, T2, N2) == -1);
sl@0
   756
sl@0
   757
		carry += t;
sl@0
   758
		carry += Increment(R0, N2, c2+t);
sl@0
   759
		carry += Add(R0, R0, T1, N2);
sl@0
   760
		carry += Add(R0, R0, T3, N2);
sl@0
   761
sl@0
   762
		CopyWords(R1, T3, N2);
sl@0
   763
		assert (carry >= 0 && carry <= 2);
sl@0
   764
		Increment(R1, N2, carry);
sl@0
   765
	}
sl@0
   766
}
sl@0
   767
sl@0
   768
// R[NA+NB] - result = A*B
sl@0
   769
// T[NA+NB] - temporary work space
sl@0
   770
// A[NA] ---- multiplier
sl@0
   771
// B[NB] ---- multiplicant
sl@0
   772
sl@0
   773
void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
sl@0
   774
{
sl@0
   775
	if (NA == NB)
sl@0
   776
	{
sl@0
   777
		if (A == B)
sl@0
   778
			RecursiveSquare(R, T, A, NA);
sl@0
   779
		else
sl@0
   780
			RecursiveMultiply(R, T, A, B, NA);
sl@0
   781
sl@0
   782
		return;
sl@0
   783
	}
sl@0
   784
sl@0
   785
	if (NA > NB)
sl@0
   786
	{
sl@0
   787
		TClassSwap(A, B);
sl@0
   788
		TClassSwap(NA, NB);
sl@0
   789
		//std::swap(A, B);
sl@0
   790
		//std::swap(NA, NB);
sl@0
   791
	}
sl@0
   792
sl@0
   793
	assert(NB % NA == 0);
sl@0
   794
	assert((NB/NA)%2 == 0); 	// NB is an even multiple of NA
sl@0
   795
sl@0
   796
	if (NA==2 && !A[1])
sl@0
   797
	{
sl@0
   798
		switch (A[0])
sl@0
   799
		{
sl@0
   800
		case 0:
sl@0
   801
			SetWords(R, 0, NB+2);
sl@0
   802
			return;
sl@0
   803
		case 1:
sl@0
   804
			CopyWords(R, B, NB);
sl@0
   805
			R[NB] = R[NB+1] = 0;
sl@0
   806
			return;
sl@0
   807
		default:
sl@0
   808
			R[NB] = LinearMultiply(R, B, A[0], NB);
sl@0
   809
			R[NB+1] = 0;
sl@0
   810
			return;
sl@0
   811
		}
sl@0
   812
	}
sl@0
   813
sl@0
   814
	RecursiveMultiply(R, T, A, B, NA);
sl@0
   815
	CopyWords(T+2*NA, R+NA, NA);
sl@0
   816
sl@0
   817
	unsigned i;
sl@0
   818
sl@0
   819
	for (i=2*NA; i<NB; i+=2*NA)
sl@0
   820
		RecursiveMultiply(T+NA+i, T, A, B+i, NA);
sl@0
   821
	for (i=NA; i<NB; i+=2*NA)
sl@0
   822
		RecursiveMultiply(R+i, T, A, B+i, NA);
sl@0
   823
sl@0
   824
	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
sl@0
   825
		Increment(R+NB, NA);
sl@0
   826
}
sl@0
   827
// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
sl@0
   828
// T[3*N/2] - temporary work space
sl@0
   829
// A[N] ----- an odd number as input
sl@0
   830
sl@0
   831
void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
sl@0
   832
{
sl@0
   833
	if (N==2)
sl@0
   834
		AtomicInverseModPower2(R, A[0], A[1]);
sl@0
   835
	else
sl@0
   836
	{
sl@0
   837
		const unsigned int N2 = N/2;
sl@0
   838
		RecursiveInverseModPower2(R0, T0, A0, N2);
sl@0
   839
		T0[0] = 1;
sl@0
   840
		SetWords(T0+1, 0, N2-1);
sl@0
   841
		RecursiveMultiplyTop(R1, T1, T0, R0, A0, N2);
sl@0
   842
		RecursiveMultiplyBottom(T0, T1, R0, A1, N2);
sl@0
   843
		Add(T0, R1, T0, N2);
sl@0
   844
		TwosComplement(T0, N2);
sl@0
   845
		RecursiveMultiplyBottom(R1, T1, R0, T0, N2);
sl@0
   846
	}
sl@0
   847
}
sl@0
   848
#undef A0
sl@0
   849
#undef A1
sl@0
   850
#undef B0
sl@0
   851
#undef B1
sl@0
   852
sl@0
   853
#undef T0
sl@0
   854
#undef T1
sl@0
   855
#undef T2
sl@0
   856
#undef T3
sl@0
   857
sl@0
   858
#undef R0
sl@0
   859
#undef R1
sl@0
   860
#undef R2
sl@0
   861
#undef R3
sl@0
   862
sl@0
   863
// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
sl@0
   864
// T[3*N] - temporary work space
sl@0
   865
// X[2*N] - number to be reduced
sl@0
   866
// M[N] --- modulus
sl@0
   867
// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
sl@0
   868
sl@0
   869
void MontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, unsigned int N)
sl@0
   870
{
sl@0
   871
	RecursiveMultiplyBottom(R, T, X, U, N);
sl@0
   872
	RecursiveMultiplyTop(T, T+N, X, R, M, N);
sl@0
   873
	if (Subtract(R, X+N, T, N))
sl@0
   874
	{
sl@0
   875
#ifdef _DEBUG
sl@0
   876
		word carry = Add(R, R, M, N);
sl@0
   877
		assert(carry);
sl@0
   878
#else
sl@0
   879
		Add(R, R, M, N);
sl@0
   880
#endif
sl@0
   881
	}
sl@0
   882
}
sl@0
   883
sl@0
   884
// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
sl@0
   885
static word SubatomicDivide(word *A, word B0, word B1)
sl@0
   886
{
sl@0
   887
	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
sl@0
   888
	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
sl@0
   889
sl@0
   890
	dword p, u;
sl@0
   891
	word Q;
sl@0
   892
sl@0
   893
	// estimate the quotient: do a 2 word by 1 word divide
sl@0
   894
	if (B1+1 == 0)
sl@0
   895
		Q = A[2];
sl@0
   896
	else
sl@0
   897
		Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1));
sl@0
   898
sl@0
   899
	// now subtract Q*B from A
sl@0
   900
	p = (dword) B0*Q;
sl@0
   901
	u = (dword) A[0] - LOW_WORD(p);
sl@0
   902
	A[0] = LOW_WORD(u);
sl@0
   903
	u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q;
sl@0
   904
	A[1] = LOW_WORD(u);
sl@0
   905
	A[2] += HIGH_WORD(u);
sl@0
   906
sl@0
   907
	// Q <= actual quotient, so fix it
sl@0
   908
	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
sl@0
   909
	{
sl@0
   910
		u = (dword) A[0] - B0;
sl@0
   911
		A[0] = LOW_WORD(u);
sl@0
   912
		u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u));
sl@0
   913
		A[1] = LOW_WORD(u);
sl@0
   914
		A[2] += HIGH_WORD(u);
sl@0
   915
		Q++;
sl@0
   916
		assert(Q);	// shouldn't overflow
sl@0
   917
	}
sl@0
   918
sl@0
   919
	return Q;
sl@0
   920
}
sl@0
   921
sl@0
   922
// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
sl@0
   923
static inline void AtomicDivide(word *Q, const word *A, const word *B)
sl@0
   924
{
sl@0
   925
	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
sl@0
   926
	{
sl@0
   927
		Q[0] = A[2];
sl@0
   928
		Q[1] = A[3];
sl@0
   929
	}
sl@0
   930
	else
sl@0
   931
	{
sl@0
   932
		word T[4];
sl@0
   933
		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
sl@0
   934
		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
sl@0
   935
		Q[0] = SubatomicDivide(T, B[0], B[1]);
sl@0
   936
sl@0
   937
#ifdef _DEBUG
sl@0
   938
		// multiply quotient and divisor and add remainder, make sure it equals dividend
sl@0
   939
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
sl@0
   940
		word P[4];
sl@0
   941
		AtomicMultiply(P, Q, B);
sl@0
   942
		Add(P, P, T, 4);
sl@0
   943
		assert(Mem::Compare((TUint8*)P, 4*WORD_SIZE, (TUint8*)A, 4*WORD_SIZE)==0);
sl@0
   944
#endif
sl@0
   945
	}
sl@0
   946
}
sl@0
   947
sl@0
   948
// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
sl@0
   949
static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)
sl@0
   950
{
sl@0
   951
	assert(N && N%2==0);
sl@0
   952
sl@0
   953
	if (Q[1])
sl@0
   954
	{
sl@0
   955
		T[N] = T[N+1] = 0;
sl@0
   956
		unsigned i;
sl@0
   957
		for (i=0; i<N; i+=4)
sl@0
   958
			AtomicMultiply(T+i, Q, B+i);
sl@0
   959
		for (i=2; i<N; i+=4)
sl@0
   960
			if (AtomicMultiplyAdd(T+i, Q, B+i))
sl@0
   961
				T[i+5] += (++T[i+4]==0);
sl@0
   962
	}
sl@0
   963
	else
sl@0
   964
	{
sl@0
   965
		T[N] = LinearMultiply(T, B, Q[0], N);
sl@0
   966
		T[N+1] = 0;
sl@0
   967
	}
sl@0
   968
sl@0
   969
#ifdef _DEBUG
sl@0
   970
	word borrow = Subtract(R, R, T, N+2);
sl@0
   971
	assert(!borrow && !R[N+1]);
sl@0
   972
#else
sl@0
   973
	Subtract(R, R, T, N+2);
sl@0
   974
#endif
sl@0
   975
sl@0
   976
	while (R[N] || Compare(R, B, N) >= 0)
sl@0
   977
	{
sl@0
   978
		R[N] -= Subtract(R, R, B, N);
sl@0
   979
		Q[1] += (++Q[0]==0);
sl@0
   980
		assert(Q[0] || Q[1]); // no overflow
sl@0
   981
	}
sl@0
   982
}
sl@0
   983
sl@0
   984
// R[NB] -------- remainder = A%B
sl@0
   985
// Q[NA-NB+2] --- quotient	= A/B
sl@0
   986
// T[NA+2*NB+4] - temp work space
sl@0
   987
// A[NA] -------- dividend
sl@0
   988
// B[NB] -------- divisor
sl@0
   989
sl@0
   990
void Divide(word *R, word *Q, word *T, const word *A, unsigned int NA, const word *B, unsigned int NB)
sl@0
   991
{
sl@0
   992
	assert(NA && NB && NA%2==0 && NB%2==0);
sl@0
   993
	assert(B[NB-1] || B[NB-2]);
sl@0
   994
	assert(NB <= NA);
sl@0
   995
sl@0
   996
	// set up temporary work space
sl@0
   997
	word *const TA=T;
sl@0
   998
	word *const TB=T+NA+2;
sl@0
   999
	word *const TP=T+NA+2+NB;
sl@0
  1000
sl@0
  1001
	// copy B into TB and normalize it so that TB has highest bit set to 1
sl@0
  1002
	unsigned shiftWords = (B[NB-1]==0);
sl@0
  1003
	TB[0] = TB[NB-1] = 0;
sl@0
  1004
	CopyWords(TB+shiftWords, B, NB-shiftWords);
sl@0
  1005
	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
sl@0
  1006
	assert(shiftBits < WORD_BITS);
sl@0
  1007
	ShiftWordsLeftByBits(TB, NB, shiftBits);
sl@0
  1008
sl@0
  1009
	// copy A into TA and normalize it
sl@0
  1010
	TA[0] = TA[NA] = TA[NA+1] = 0;
sl@0
  1011
	CopyWords(TA+shiftWords, A, NA);
sl@0
  1012
	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
sl@0
  1013
sl@0
  1014
	if (TA[NA+1]==0 && TA[NA] <= 1)
sl@0
  1015
	{
sl@0
  1016
		Q[NA-NB+1] = Q[NA-NB] = 0;
sl@0
  1017
		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
sl@0
  1018
		{
sl@0
  1019
			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
sl@0
  1020
			++Q[NA-NB];
sl@0
  1021
		}
sl@0
  1022
	}
sl@0
  1023
	else
sl@0
  1024
	{
sl@0
  1025
		NA+=2;
sl@0
  1026
		assert(Compare(TA+NA-NB, TB, NB) < 0);
sl@0
  1027
	}
sl@0
  1028
sl@0
  1029
	word BT[2];
sl@0
  1030
	BT[0] = TB[NB-2] + 1;
sl@0
  1031
	BT[1] = TB[NB-1] + (BT[0]==0);
sl@0
  1032
sl@0
  1033
	// start reducing TA mod TB, 2 words at a time
sl@0
  1034
	for (unsigned i=NA-2; i>=NB; i-=2)
sl@0
  1035
	{
sl@0
  1036
		AtomicDivide(Q+i-NB, TA+i-2, BT);
sl@0
  1037
		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
sl@0
  1038
	}
sl@0
  1039
sl@0
  1040
	// copy TA into R, and denormalize it
sl@0
  1041
	CopyWords(R, TA+shiftWords, NB);
sl@0
  1042
	ShiftWordsRightByBits(R, NB, shiftBits);
sl@0
  1043
}
sl@0
  1044
sl@0
  1045
static inline unsigned int EvenWordCount(const word *X, unsigned int N)
sl@0
  1046
{
sl@0
  1047
	while (N && X[N-2]==0 && X[N-1]==0)
sl@0
  1048
		N-=2;
sl@0
  1049
	return N;
sl@0
  1050
}
sl@0
  1051
sl@0
  1052
// return k
sl@0
  1053
// R[N] --- result = A^(-1) * 2^k mod M
sl@0
  1054
// T[4*N] - temporary work space
sl@0
  1055
// A[NA] -- number to take inverse of
sl@0
  1056
// M[N] --- modulus
sl@0
  1057
sl@0
  1058
unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, const word *M, unsigned int N)
sl@0
  1059
{
sl@0
  1060
	assert(NA<=N && N && N%2==0);
sl@0
  1061
sl@0
  1062
	word *b = T;
sl@0
  1063
	word *c = T+N;
sl@0
  1064
	word *f = T+2*N;
sl@0
  1065
	word *g = T+3*N;
sl@0
  1066
	unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
sl@0
  1067
	unsigned int k=0, s=0;
sl@0
  1068
sl@0
  1069
	SetWords(T, 0, 3*N);
sl@0
  1070
	b[0]=1;
sl@0
  1071
	CopyWords(f, A, NA);
sl@0
  1072
	CopyWords(g, M, N);
sl@0
  1073
sl@0
  1074
	FOREVER
sl@0
  1075
	{
sl@0
  1076
		word t=f[0];
sl@0
  1077
		while (!t)
sl@0
  1078
		{
sl@0
  1079
			if (EvenWordCount(f, fgLen)==0)
sl@0
  1080
			{
sl@0
  1081
				SetWords(R, 0, N);
sl@0
  1082
				return 0;
sl@0
  1083
			}
sl@0
  1084
sl@0
  1085
			ShiftWordsRightByWords(f, fgLen, 1);
sl@0
  1086
			if (c[bcLen-1]) bcLen+=2;
sl@0
  1087
			assert(bcLen <= N);
sl@0
  1088
			ShiftWordsLeftByWords(c, bcLen, 1);
sl@0
  1089
			k+=WORD_BITS;
sl@0
  1090
			t=f[0];
sl@0
  1091
		}
sl@0
  1092
sl@0
  1093
		unsigned int i=0;
sl@0
  1094
		while (t%2 == 0)
sl@0
  1095
		{
sl@0
  1096
			t>>=1;
sl@0
  1097
			i++;
sl@0
  1098
		}
sl@0
  1099
		k+=i;
sl@0
  1100
sl@0
  1101
		if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
sl@0
  1102
		{
sl@0
  1103
			if (s%2==0)
sl@0
  1104
				CopyWords(R, b, N);
sl@0
  1105
			else
sl@0
  1106
				Subtract(R, M, b, N);
sl@0
  1107
			return k;
sl@0
  1108
		}
sl@0
  1109
sl@0
  1110
		ShiftWordsRightByBits(f, fgLen, i);
sl@0
  1111
		t=ShiftWordsLeftByBits(c, bcLen, i);
sl@0
  1112
		if (t)
sl@0
  1113
		{
sl@0
  1114
			c[bcLen] = t;
sl@0
  1115
			bcLen+=2;
sl@0
  1116
			assert(bcLen <= N);
sl@0
  1117
		}
sl@0
  1118
sl@0
  1119
		if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
sl@0
  1120
			fgLen-=2;
sl@0
  1121
sl@0
  1122
		if (Compare(f, g, fgLen)==-1)
sl@0
  1123
		{
sl@0
  1124
			TClassSwap<word*>(f,g);
sl@0
  1125
			TClassSwap<word*>(b,c);
sl@0
  1126
			s++;
sl@0
  1127
		}
sl@0
  1128
sl@0
  1129
		Subtract(f, f, g, fgLen);
sl@0
  1130
sl@0
  1131
		if (Add(b, b, c, bcLen))
sl@0
  1132
		{
sl@0
  1133
			b[bcLen] = 1;
sl@0
  1134
			bcLen+=2;
sl@0
  1135
			assert(bcLen <= N);
sl@0
  1136
		}
sl@0
  1137
	}
sl@0
  1138
}
sl@0
  1139
sl@0
  1140
// R[N] - result = A/(2^k) mod M
sl@0
  1141
// A[N] - input
sl@0
  1142
// M[N] - modulus
sl@0
  1143
sl@0
  1144
void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, unsigned int N)
sl@0
  1145
{
sl@0
  1146
	CopyWords(R, A, N);
sl@0
  1147
sl@0
  1148
	while (k--)
sl@0
  1149
	{
sl@0
  1150
		if (R[0]%2==0)
sl@0
  1151
			ShiftWordsRightByBits(R, N, 1);
sl@0
  1152
		else
sl@0
  1153
		{
sl@0
  1154
			word carry = Add(R, R, M, N);
sl@0
  1155
			ShiftWordsRightByBits(R, N, 1);
sl@0
  1156
			R[N-1] += carry<<(WORD_BITS-1);
sl@0
  1157
		}
sl@0
  1158
	}
sl@0
  1159
}
sl@0
  1160