os/ossrv/ssl/libcrypto/src/crypto/bn/bn_asm.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /* crypto/bn/bn_asm.c */
     2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
     3  * All rights reserved.
     4  *
     5  * This package is an SSL implementation written
     6  * by Eric Young (eay@cryptsoft.com).
     7  * The implementation was written so as to conform with Netscapes SSL.
     8  * 
     9  * This library is free for commercial and non-commercial use as long as
    10  * the following conditions are aheared to.  The following conditions
    11  * apply to all code found in this distribution, be it the RC4, RSA,
    12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
    13  * included with this distribution is covered by the same copyright terms
    14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
    15  * 
    16  * Copyright remains Eric Young's, and as such any Copyright notices in
    17  * the code are not to be removed.
    18  * If this package is used in a product, Eric Young should be given attribution
    19  * as the author of the parts of the library used.
    20  * This can be in the form of a textual message at program startup or
    21  * in documentation (online or textual) provided with the package.
    22  * 
    23  * Redistribution and use in source and binary forms, with or without
    24  * modification, are permitted provided that the following conditions
    25  * are met:
    26  * 1. Redistributions of source code must retain the copyright
    27  *    notice, this list of conditions and the following disclaimer.
    28  * 2. Redistributions in binary form must reproduce the above copyright
    29  *    notice, this list of conditions and the following disclaimer in the
    30  *    documentation and/or other materials provided with the distribution.
    31  * 3. All advertising materials mentioning features or use of this software
    32  *    must display the following acknowledgement:
    33  *    "This product includes cryptographic software written by
    34  *     Eric Young (eay@cryptsoft.com)"
    35  *    The word 'cryptographic' can be left out if the rouines from the library
    36  *    being used are not cryptographic related :-).
    37  * 4. If you include any Windows specific code (or a derivative thereof) from 
    38  *    the apps directory (application code) you must include an acknowledgement:
    39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
    40  * 
    41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
    42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
    45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    51  * SUCH DAMAGE.
    52  * 
    53  * The licence and distribution terms for any publically available version or
    54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
    55  * copied and put under another distribution licence
    56  * [including the GNU Public Licence.]
    57  */
    58 
    59 #ifndef BN_DEBUG
    60 # undef NDEBUG /* avoid conflicting definitions */
    61 # define NDEBUG
    62 #endif
    63 
    64 #include <stdio.h>
    65 #include <assert.h>
    66 #include "cryptlib.h"
    67 #include "bn_lcl.h"
    68 
    69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
    70 
    71 EXPORT_C BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
    72 	{
    73 	BN_ULONG c1=0;
    74 
    75 	assert(num >= 0);
    76 	if (num <= 0) return(c1);
    77 
    78 	while (num&~3)
    79 		{
    80 		mul_add(rp[0],ap[0],w,c1);
    81 		mul_add(rp[1],ap[1],w,c1);
    82 		mul_add(rp[2],ap[2],w,c1);
    83 		mul_add(rp[3],ap[3],w,c1);
    84 		ap+=4; rp+=4; num-=4;
    85 		}
    86 	if (num)
    87 		{
    88 		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
    89 		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
    90 		mul_add(rp[2],ap[2],w,c1); return c1;
    91 		}
    92 	
    93 	return(c1);
    94 	} 
    95 
    96 EXPORT_C BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
    97 	{
    98 	BN_ULONG c1=0;
    99 
   100 	assert(num >= 0);
   101 	if (num <= 0) return(c1);
   102 
   103 	while (num&~3)
   104 		{
   105 		mul(rp[0],ap[0],w,c1);
   106 		mul(rp[1],ap[1],w,c1);
   107 		mul(rp[2],ap[2],w,c1);
   108 		mul(rp[3],ap[3],w,c1);
   109 		ap+=4; rp+=4; num-=4;
   110 		}
   111 	if (num)
   112 		{
   113 		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
   114 		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
   115 		mul(rp[2],ap[2],w,c1);
   116 		}
   117 	return(c1);
   118 	} 
   119 
   120 EXPORT_C void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
   121         {
   122 	assert(n >= 0);
   123 	if (n <= 0) return;
   124 	while (n&~3)
   125 		{
   126 		sqr(r[0],r[1],a[0]);
   127 		sqr(r[2],r[3],a[1]);
   128 		sqr(r[4],r[5],a[2]);
   129 		sqr(r[6],r[7],a[3]);
   130 		a+=4; r+=8; n-=4;
   131 		}
   132 	if (n)
   133 		{
   134 		sqr(r[0],r[1],a[0]); if (--n == 0) return;
   135 		sqr(r[2],r[3],a[1]); if (--n == 0) return;
   136 		sqr(r[4],r[5],a[2]);
   137 		}
   138 	}
   139 
   140 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
   141 
   142 EXPORT_C BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
   143 	{
   144 	BN_ULONG c=0;
   145 	BN_ULONG bl,bh;
   146 
   147 	assert(num >= 0);
   148 	if (num <= 0) return((BN_ULONG)0);
   149 
   150 	bl=LBITS(w);
   151 	bh=HBITS(w);
   152 
   153 	for (;;)
   154 		{
   155 		mul_add(rp[0],ap[0],bl,bh,c);
   156 		if (--num == 0) break;
   157 		mul_add(rp[1],ap[1],bl,bh,c);
   158 		if (--num == 0) break;
   159 		mul_add(rp[2],ap[2],bl,bh,c);
   160 		if (--num == 0) break;
   161 		mul_add(rp[3],ap[3],bl,bh,c);
   162 		if (--num == 0) break;
   163 		ap+=4;
   164 		rp+=4;
   165 		}
   166 	return(c);
   167 	} 
   168 
   169 EXPORT_C BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
   170 	{
   171 	BN_ULONG carry=0;
   172 	BN_ULONG bl,bh;
   173 
   174 	assert(num >= 0);
   175 	if (num <= 0) return((BN_ULONG)0);
   176 
   177 	bl=LBITS(w);
   178 	bh=HBITS(w);
   179 
   180 	for (;;)
   181 		{
   182 		mul(rp[0],ap[0],bl,bh,carry);
   183 		if (--num == 0) break;
   184 		mul(rp[1],ap[1],bl,bh,carry);
   185 		if (--num == 0) break;
   186 		mul(rp[2],ap[2],bl,bh,carry);
   187 		if (--num == 0) break;
   188 		mul(rp[3],ap[3],bl,bh,carry);
   189 		if (--num == 0) break;
   190 		ap+=4;
   191 		rp+=4;
   192 		}
   193 	return(carry);
   194 	} 
   195 
   196 EXPORT_C void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
   197         {
   198 	assert(n >= 0);
   199 	if (n <= 0) return;
   200 	for (;;)
   201 		{
   202 		sqr64(r[0],r[1],a[0]);
   203 		if (--n == 0) break;
   204 
   205 		sqr64(r[2],r[3],a[1]);
   206 		if (--n == 0) break;
   207 
   208 		sqr64(r[4],r[5],a[2]);
   209 		if (--n == 0) break;
   210 
   211 		sqr64(r[6],r[7],a[3]);
   212 		if (--n == 0) break;
   213 
   214 		a+=4;
   215 		r+=8;
   216 		}
   217 	}
   218 
   219 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
   220 
   221 #if defined(BN_LLONG) && defined(BN_DIV2W)
   222 
   223 EXPORT_C BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
   224 	{
   225 	return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d));
   226 	}
   227 
   228 #else
   229 
   230 /* Divide h,l by d and return the result. */
   231 /* I need to test this some more :-( */
   232 EXPORT_C BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
   233 	{
   234 	BN_ULONG dh,dl,q,ret=0,th,tl,t;
   235 	int i,count=2;
   236 
   237 	if (d == 0) return(BN_MASK2);
   238 
   239 	i=BN_num_bits_word(d);
   240 	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));
   241 
   242 	i=BN_BITS2-i;
   243 	if (h >= d) h-=d;
   244 
   245 	if (i)
   246 		{
   247 		d<<=i;
   248 		h=(h<<i)|(l>>(BN_BITS2-i));
   249 		l<<=i;
   250 		}
   251 	dh=(d&BN_MASK2h)>>BN_BITS4;
   252 	dl=(d&BN_MASK2l);
   253 	for (;;)
   254 		{
   255 		if ((h>>BN_BITS4) == dh)
   256 			q=BN_MASK2l;
   257 		else
   258 			q=h/dh;
   259 
   260 		th=q*dh;
   261 		tl=dl*q;
   262 		for (;;)
   263 			{
   264 			t=h-th;
   265 			if ((t&BN_MASK2h) ||
   266 				((tl) <= (
   267 					(t<<BN_BITS4)|
   268 					((l&BN_MASK2h)>>BN_BITS4))))
   269 				break;
   270 			q--;
   271 			th-=dh;
   272 			tl-=dl;
   273 			}
   274 		t=(tl>>BN_BITS4);
   275 		tl=(tl<<BN_BITS4)&BN_MASK2h;
   276 		th+=t;
   277 
   278 		if (l < tl) th++;
   279 		l-=tl;
   280 		if (h < th)
   281 			{
   282 			h+=d;
   283 			q--;
   284 			}
   285 		h-=th;
   286 
   287 		if (--count == 0) break;
   288 
   289 		ret=q<<BN_BITS4;
   290 		h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
   291 		l=(l&BN_MASK2l)<<BN_BITS4;
   292 		}
   293 	ret|=q;
   294 	return(ret);
   295 	}
#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
   297 
   298 #ifdef BN_LLONG
   299 EXPORT_C BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
   300         {
   301 	BN_ULLONG ll=0;
   302 
   303 	assert(n >= 0);
   304 	if (n <= 0) return((BN_ULONG)0);
   305 
   306 	for (;;)
   307 		{
   308 		ll+=(BN_ULLONG)a[0]+b[0];
   309 		r[0]=(BN_ULONG)ll&BN_MASK2;
   310 		ll>>=BN_BITS2;
   311 		if (--n <= 0) break;
   312 
   313 		ll+=(BN_ULLONG)a[1]+b[1];
   314 		r[1]=(BN_ULONG)ll&BN_MASK2;
   315 		ll>>=BN_BITS2;
   316 		if (--n <= 0) break;
   317 
   318 		ll+=(BN_ULLONG)a[2]+b[2];
   319 		r[2]=(BN_ULONG)ll&BN_MASK2;
   320 		ll>>=BN_BITS2;
   321 		if (--n <= 0) break;
   322 
   323 		ll+=(BN_ULLONG)a[3]+b[3];
   324 		r[3]=(BN_ULONG)ll&BN_MASK2;
   325 		ll>>=BN_BITS2;
   326 		if (--n <= 0) break;
   327 
   328 		a+=4;
   329 		b+=4;
   330 		r+=4;
   331 		}
   332 	return((BN_ULONG)ll);
   333 	}
   334 #else /* !BN_LLONG */
   335 EXPORT_C BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
   336         {
   337 	BN_ULONG c,l,t;
   338 
   339 	assert(n >= 0);
   340 	if (n <= 0) return((BN_ULONG)0);
   341 
   342 	c=0;
   343 	for (;;)
   344 		{
   345 		t=a[0];
   346 		t=(t+c)&BN_MASK2;
   347 		c=(t < c);
   348 		l=(t+b[0])&BN_MASK2;
   349 		c+=(l < t);
   350 		r[0]=l;
   351 		if (--n <= 0) break;
   352 
   353 		t=a[1];
   354 		t=(t+c)&BN_MASK2;
   355 		c=(t < c);
   356 		l=(t+b[1])&BN_MASK2;
   357 		c+=(l < t);
   358 		r[1]=l;
   359 		if (--n <= 0) break;
   360 
   361 		t=a[2];
   362 		t=(t+c)&BN_MASK2;
   363 		c=(t < c);
   364 		l=(t+b[2])&BN_MASK2;
   365 		c+=(l < t);
   366 		r[2]=l;
   367 		if (--n <= 0) break;
   368 
   369 		t=a[3];
   370 		t=(t+c)&BN_MASK2;
   371 		c=(t < c);
   372 		l=(t+b[3])&BN_MASK2;
   373 		c+=(l < t);
   374 		r[3]=l;
   375 		if (--n <= 0) break;
   376 
   377 		a+=4;
   378 		b+=4;
   379 		r+=4;
   380 		}
   381 	return((BN_ULONG)c);
   382 	}
   383 #endif /* !BN_LLONG */
   384 
   385 EXPORT_C BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
   386         {
   387 	BN_ULONG t1,t2;
   388 	int c=0;
   389 
   390 	assert(n >= 0);
   391 	if (n <= 0) return((BN_ULONG)0);
   392 
   393 	for (;;)
   394 		{
   395 		t1=a[0]; t2=b[0];
   396 		r[0]=(t1-t2-c)&BN_MASK2;
   397 		if (t1 != t2) c=(t1 < t2);
   398 		if (--n <= 0) break;
   399 
   400 		t1=a[1]; t2=b[1];
   401 		r[1]=(t1-t2-c)&BN_MASK2;
   402 		if (t1 != t2) c=(t1 < t2);
   403 		if (--n <= 0) break;
   404 
   405 		t1=a[2]; t2=b[2];
   406 		r[2]=(t1-t2-c)&BN_MASK2;
   407 		if (t1 != t2) c=(t1 < t2);
   408 		if (--n <= 0) break;
   409 
   410 		t1=a[3]; t2=b[3];
   411 		r[3]=(t1-t2-c)&BN_MASK2;
   412 		if (t1 != t2) c=(t1 < t2);
   413 		if (--n <= 0) break;
   414 
   415 		a+=4;
   416 		b+=4;
   417 		r+=4;
   418 		}
   419 	return(c);
   420 	}
   421 
   422 #ifdef BN_MUL_COMBA
   423 
   424 #undef bn_mul_comba8
   425 #undef bn_mul_comba4
   426 #undef bn_sqr_comba8
   427 #undef bn_sqr_comba4
   428 
   429 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
   430 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
   431 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
   433 
   434 #ifdef BN_LLONG
   435 #define mul_add_c(a,b,c0,c1,c2) \
   436 	t=(BN_ULLONG)a*b; \
   437 	t1=(BN_ULONG)Lw(t); \
   438 	t2=(BN_ULONG)Hw(t); \
   439 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
   440 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
   441 
   442 #define mul_add_c2(a,b,c0,c1,c2) \
   443 	t=(BN_ULLONG)a*b; \
   444 	tt=(t+t)&BN_MASK; \
   445 	if (tt < t) c2++; \
   446 	t1=(BN_ULONG)Lw(tt); \
   447 	t2=(BN_ULONG)Hw(tt); \
   448 	c0=(c0+t1)&BN_MASK2;  \
   449 	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
   450 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
   451 
   452 #define sqr_add_c(a,i,c0,c1,c2) \
   453 	t=(BN_ULLONG)a[i]*a[i]; \
   454 	t1=(BN_ULONG)Lw(t); \
   455 	t2=(BN_ULONG)Hw(t); \
   456 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
   457 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
   458 
   459 #define sqr_add_c2(a,i,j,c0,c1,c2) \
   460 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
   461 
   462 #elif defined(BN_UMULT_LOHI)
   463 
   464 #define mul_add_c(a,b,c0,c1,c2)	{	\
   465 	BN_ULONG ta=(a),tb=(b);		\
   466 	BN_UMULT_LOHI(t1,t2,ta,tb);	\
   467 	c0 += t1; t2 += (c0<t1)?1:0;	\
   468 	c1 += t2; c2 += (c1<t2)?1:0;	\
   469 	}
   470 
   471 #define mul_add_c2(a,b,c0,c1,c2) {	\
   472 	BN_ULONG ta=(a),tb=(b),t0;	\
   473 	BN_UMULT_LOHI(t0,t1,ta,tb);	\
   474 	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
   475 	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
   476 	c0 += t1; t2 += (c0<t1)?1:0;	\
   477 	c1 += t2; c2 += (c1<t2)?1:0;	\
   478 	}
   479 
   480 #define sqr_add_c(a,i,c0,c1,c2)	{	\
   481 	BN_ULONG ta=(a)[i];		\
   482 	BN_UMULT_LOHI(t1,t2,ta,ta);	\
   483 	c0 += t1; t2 += (c0<t1)?1:0;	\
   484 	c1 += t2; c2 += (c1<t2)?1:0;	\
   485 	}
   486 
   487 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
   488 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
   489 
   490 #elif defined(BN_UMULT_HIGH)
   491 
   492 #define mul_add_c(a,b,c0,c1,c2)	{	\
   493 	BN_ULONG ta=(a),tb=(b);		\
   494 	t1 = ta * tb;			\
   495 	t2 = BN_UMULT_HIGH(ta,tb);	\
   496 	c0 += t1; t2 += (c0<t1)?1:0;	\
   497 	c1 += t2; c2 += (c1<t2)?1:0;	\
   498 	}
   499 
   500 #define mul_add_c2(a,b,c0,c1,c2) {	\
   501 	BN_ULONG ta=(a),tb=(b),t0;	\
   502 	t1 = BN_UMULT_HIGH(ta,tb);	\
   503 	t0 = ta * tb;			\
   504 	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
   505 	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
   506 	c0 += t1; t2 += (c0<t1)?1:0;	\
   507 	c1 += t2; c2 += (c1<t2)?1:0;	\
   508 	}
   509 
   510 #define sqr_add_c(a,i,c0,c1,c2)	{	\
   511 	BN_ULONG ta=(a)[i];		\
   512 	t1 = ta * ta;			\
   513 	t2 = BN_UMULT_HIGH(ta,ta);	\
   514 	c0 += t1; t2 += (c0<t1)?1:0;	\
   515 	c1 += t2; c2 += (c1<t2)?1:0;	\
   516 	}
   517 
   518 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
   519 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
   520 
   521 #else /* !BN_LLONG */
   522 #define mul_add_c(a,b,c0,c1,c2) \
   523 	t1=LBITS(a); t2=HBITS(a); \
   524 	bl=LBITS(b); bh=HBITS(b); \
   525 	mul64(t1,t2,bl,bh); \
   526 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
   527 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
   528 
   529 #define mul_add_c2(a,b,c0,c1,c2) \
   530 	t1=LBITS(a); t2=HBITS(a); \
   531 	bl=LBITS(b); bh=HBITS(b); \
   532 	mul64(t1,t2,bl,bh); \
   533 	if (t2 & BN_TBIT) c2++; \
   534 	t2=(t2+t2)&BN_MASK2; \
   535 	if (t1 & BN_TBIT) t2++; \
   536 	t1=(t1+t1)&BN_MASK2; \
   537 	c0=(c0+t1)&BN_MASK2;  \
   538 	if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
   539 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
   540 
   541 #define sqr_add_c(a,i,c0,c1,c2) \
   542 	sqr64(t1,t2,(a)[i]); \
   543 	c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
   544 	c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;
   545 
   546 #define sqr_add_c2(a,i,j,c0,c1,c2) \
   547 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
   548 #endif /* !BN_LLONG */
   549 
   550 EXPORT_C void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
   551 	{
   552 #ifdef BN_LLONG
   553 	BN_ULLONG t;
   554 #else
   555 	BN_ULONG bl,bh;
   556 #endif
   557 	BN_ULONG t1,t2;
   558 	BN_ULONG c1,c2,c3;
   559 
   560 	c1=0;
   561 	c2=0;
   562 	c3=0;
   563 	mul_add_c(a[0],b[0],c1,c2,c3);
   564 	r[0]=c1;
   565 	c1=0;
   566 	mul_add_c(a[0],b[1],c2,c3,c1);
   567 	mul_add_c(a[1],b[0],c2,c3,c1);
   568 	r[1]=c2;
   569 	c2=0;
   570 	mul_add_c(a[2],b[0],c3,c1,c2);
   571 	mul_add_c(a[1],b[1],c3,c1,c2);
   572 	mul_add_c(a[0],b[2],c3,c1,c2);
   573 	r[2]=c3;
   574 	c3=0;
   575 	mul_add_c(a[0],b[3],c1,c2,c3);
   576 	mul_add_c(a[1],b[2],c1,c2,c3);
   577 	mul_add_c(a[2],b[1],c1,c2,c3);
   578 	mul_add_c(a[3],b[0],c1,c2,c3);
   579 	r[3]=c1;
   580 	c1=0;
   581 	mul_add_c(a[4],b[0],c2,c3,c1);
   582 	mul_add_c(a[3],b[1],c2,c3,c1);
   583 	mul_add_c(a[2],b[2],c2,c3,c1);
   584 	mul_add_c(a[1],b[3],c2,c3,c1);
   585 	mul_add_c(a[0],b[4],c2,c3,c1);
   586 	r[4]=c2;
   587 	c2=0;
   588 	mul_add_c(a[0],b[5],c3,c1,c2);
   589 	mul_add_c(a[1],b[4],c3,c1,c2);
   590 	mul_add_c(a[2],b[3],c3,c1,c2);
   591 	mul_add_c(a[3],b[2],c3,c1,c2);
   592 	mul_add_c(a[4],b[1],c3,c1,c2);
   593 	mul_add_c(a[5],b[0],c3,c1,c2);
   594 	r[5]=c3;
   595 	c3=0;
   596 	mul_add_c(a[6],b[0],c1,c2,c3);
   597 	mul_add_c(a[5],b[1],c1,c2,c3);
   598 	mul_add_c(a[4],b[2],c1,c2,c3);
   599 	mul_add_c(a[3],b[3],c1,c2,c3);
   600 	mul_add_c(a[2],b[4],c1,c2,c3);
   601 	mul_add_c(a[1],b[5],c1,c2,c3);
   602 	mul_add_c(a[0],b[6],c1,c2,c3);
   603 	r[6]=c1;
   604 	c1=0;
   605 	mul_add_c(a[0],b[7],c2,c3,c1);
   606 	mul_add_c(a[1],b[6],c2,c3,c1);
   607 	mul_add_c(a[2],b[5],c2,c3,c1);
   608 	mul_add_c(a[3],b[4],c2,c3,c1);
   609 	mul_add_c(a[4],b[3],c2,c3,c1);
   610 	mul_add_c(a[5],b[2],c2,c3,c1);
   611 	mul_add_c(a[6],b[1],c2,c3,c1);
   612 	mul_add_c(a[7],b[0],c2,c3,c1);
   613 	r[7]=c2;
   614 	c2=0;
   615 	mul_add_c(a[7],b[1],c3,c1,c2);
   616 	mul_add_c(a[6],b[2],c3,c1,c2);
   617 	mul_add_c(a[5],b[3],c3,c1,c2);
   618 	mul_add_c(a[4],b[4],c3,c1,c2);
   619 	mul_add_c(a[3],b[5],c3,c1,c2);
   620 	mul_add_c(a[2],b[6],c3,c1,c2);
   621 	mul_add_c(a[1],b[7],c3,c1,c2);
   622 	r[8]=c3;
   623 	c3=0;
   624 	mul_add_c(a[2],b[7],c1,c2,c3);
   625 	mul_add_c(a[3],b[6],c1,c2,c3);
   626 	mul_add_c(a[4],b[5],c1,c2,c3);
   627 	mul_add_c(a[5],b[4],c1,c2,c3);
   628 	mul_add_c(a[6],b[3],c1,c2,c3);
   629 	mul_add_c(a[7],b[2],c1,c2,c3);
   630 	r[9]=c1;
   631 	c1=0;
   632 	mul_add_c(a[7],b[3],c2,c3,c1);
   633 	mul_add_c(a[6],b[4],c2,c3,c1);
   634 	mul_add_c(a[5],b[5],c2,c3,c1);
   635 	mul_add_c(a[4],b[6],c2,c3,c1);
   636 	mul_add_c(a[3],b[7],c2,c3,c1);
   637 	r[10]=c2;
   638 	c2=0;
   639 	mul_add_c(a[4],b[7],c3,c1,c2);
   640 	mul_add_c(a[5],b[6],c3,c1,c2);
   641 	mul_add_c(a[6],b[5],c3,c1,c2);
   642 	mul_add_c(a[7],b[4],c3,c1,c2);
   643 	r[11]=c3;
   644 	c3=0;
   645 	mul_add_c(a[7],b[5],c1,c2,c3);
   646 	mul_add_c(a[6],b[6],c1,c2,c3);
   647 	mul_add_c(a[5],b[7],c1,c2,c3);
   648 	r[12]=c1;
   649 	c1=0;
   650 	mul_add_c(a[6],b[7],c2,c3,c1);
   651 	mul_add_c(a[7],b[6],c2,c3,c1);
   652 	r[13]=c2;
   653 	c2=0;
   654 	mul_add_c(a[7],b[7],c3,c1,c2);
   655 	r[14]=c3;
   656 	r[15]=c1;
   657 	}
   658 
   659 EXPORT_C void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
   660 	{
   661 #ifdef BN_LLONG
   662 	BN_ULLONG t;
   663 #else
   664 	BN_ULONG bl,bh;
   665 #endif
   666 	BN_ULONG t1,t2;
   667 	BN_ULONG c1,c2,c3;
   668 
   669 	c1=0;
   670 	c2=0;
   671 	c3=0;
   672 	mul_add_c(a[0],b[0],c1,c2,c3);
   673 	r[0]=c1;
   674 	c1=0;
   675 	mul_add_c(a[0],b[1],c2,c3,c1);
   676 	mul_add_c(a[1],b[0],c2,c3,c1);
   677 	r[1]=c2;
   678 	c2=0;
   679 	mul_add_c(a[2],b[0],c3,c1,c2);
   680 	mul_add_c(a[1],b[1],c3,c1,c2);
   681 	mul_add_c(a[0],b[2],c3,c1,c2);
   682 	r[2]=c3;
   683 	c3=0;
   684 	mul_add_c(a[0],b[3],c1,c2,c3);
   685 	mul_add_c(a[1],b[2],c1,c2,c3);
   686 	mul_add_c(a[2],b[1],c1,c2,c3);
   687 	mul_add_c(a[3],b[0],c1,c2,c3);
   688 	r[3]=c1;
   689 	c1=0;
   690 	mul_add_c(a[3],b[1],c2,c3,c1);
   691 	mul_add_c(a[2],b[2],c2,c3,c1);
   692 	mul_add_c(a[1],b[3],c2,c3,c1);
   693 	r[4]=c2;
   694 	c2=0;
   695 	mul_add_c(a[2],b[3],c3,c1,c2);
   696 	mul_add_c(a[3],b[2],c3,c1,c2);
   697 	r[5]=c3;
   698 	c3=0;
   699 	mul_add_c(a[3],b[3],c1,c2,c3);
   700 	r[6]=c1;
   701 	r[7]=c2;
   702 	}
   703 
   704 EXPORT_C void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
   705 	{
   706 #ifdef BN_LLONG
   707 	BN_ULLONG t,tt;
   708 #else
   709 	BN_ULONG bl,bh;
   710 #endif
   711 	BN_ULONG t1,t2;
   712 	BN_ULONG c1,c2,c3;
   713 
   714 	c1=0;
   715 	c2=0;
   716 	c3=0;
   717 	sqr_add_c(a,0,c1,c2,c3);
   718 	r[0]=c1;
   719 	c1=0;
   720 	sqr_add_c2(a,1,0,c2,c3,c1);
   721 	r[1]=c2;
   722 	c2=0;
   723 	sqr_add_c(a,1,c3,c1,c2);
   724 	sqr_add_c2(a,2,0,c3,c1,c2);
   725 	r[2]=c3;
   726 	c3=0;
   727 	sqr_add_c2(a,3,0,c1,c2,c3);
   728 	sqr_add_c2(a,2,1,c1,c2,c3);
   729 	r[3]=c1;
   730 	c1=0;
   731 	sqr_add_c(a,2,c2,c3,c1);
   732 	sqr_add_c2(a,3,1,c2,c3,c1);
   733 	sqr_add_c2(a,4,0,c2,c3,c1);
   734 	r[4]=c2;
   735 	c2=0;
   736 	sqr_add_c2(a,5,0,c3,c1,c2);
   737 	sqr_add_c2(a,4,1,c3,c1,c2);
   738 	sqr_add_c2(a,3,2,c3,c1,c2);
   739 	r[5]=c3;
   740 	c3=0;
   741 	sqr_add_c(a,3,c1,c2,c3);
   742 	sqr_add_c2(a,4,2,c1,c2,c3);
   743 	sqr_add_c2(a,5,1,c1,c2,c3);
   744 	sqr_add_c2(a,6,0,c1,c2,c3);
   745 	r[6]=c1;
   746 	c1=0;
   747 	sqr_add_c2(a,7,0,c2,c3,c1);
   748 	sqr_add_c2(a,6,1,c2,c3,c1);
   749 	sqr_add_c2(a,5,2,c2,c3,c1);
   750 	sqr_add_c2(a,4,3,c2,c3,c1);
   751 	r[7]=c2;
   752 	c2=0;
   753 	sqr_add_c(a,4,c3,c1,c2);
   754 	sqr_add_c2(a,5,3,c3,c1,c2);
   755 	sqr_add_c2(a,6,2,c3,c1,c2);
   756 	sqr_add_c2(a,7,1,c3,c1,c2);
   757 	r[8]=c3;
   758 	c3=0;
   759 	sqr_add_c2(a,7,2,c1,c2,c3);
   760 	sqr_add_c2(a,6,3,c1,c2,c3);
   761 	sqr_add_c2(a,5,4,c1,c2,c3);
   762 	r[9]=c1;
   763 	c1=0;
   764 	sqr_add_c(a,5,c2,c3,c1);
   765 	sqr_add_c2(a,6,4,c2,c3,c1);
   766 	sqr_add_c2(a,7,3,c2,c3,c1);
   767 	r[10]=c2;
   768 	c2=0;
   769 	sqr_add_c2(a,7,4,c3,c1,c2);
   770 	sqr_add_c2(a,6,5,c3,c1,c2);
   771 	r[11]=c3;
   772 	c3=0;
   773 	sqr_add_c(a,6,c1,c2,c3);
   774 	sqr_add_c2(a,7,5,c1,c2,c3);
   775 	r[12]=c1;
   776 	c1=0;
   777 	sqr_add_c2(a,7,6,c2,c3,c1);
   778 	r[13]=c2;
   779 	c2=0;
   780 	sqr_add_c(a,7,c3,c1,c2);
   781 	r[14]=c3;
   782 	r[15]=c1;
   783 	}
   784 
   785 EXPORT_C void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
   786 	{
   787 #ifdef BN_LLONG
   788 	BN_ULLONG t,tt;
   789 #else
   790 	BN_ULONG bl,bh;
   791 #endif
   792 	BN_ULONG t1,t2;
   793 	BN_ULONG c1,c2,c3;
   794 
   795 	c1=0;
   796 	c2=0;
   797 	c3=0;
   798 	sqr_add_c(a,0,c1,c2,c3);
   799 	r[0]=c1;
   800 	c1=0;
   801 	sqr_add_c2(a,1,0,c2,c3,c1);
   802 	r[1]=c2;
   803 	c2=0;
   804 	sqr_add_c(a,1,c3,c1,c2);
   805 	sqr_add_c2(a,2,0,c3,c1,c2);
   806 	r[2]=c3;
   807 	c3=0;
   808 	sqr_add_c2(a,3,0,c1,c2,c3);
   809 	sqr_add_c2(a,2,1,c1,c2,c3);
   810 	r[3]=c1;
   811 	c1=0;
   812 	sqr_add_c(a,2,c2,c3,c1);
   813 	sqr_add_c2(a,3,1,c2,c3,c1);
   814 	r[4]=c2;
   815 	c2=0;
   816 	sqr_add_c2(a,3,2,c3,c1,c2);
   817 	r[5]=c3;
   818 	c3=0;
   819 	sqr_add_c(a,3,c1,c2,c3);
   820 	r[6]=c1;
   821 	r[7]=c2;
   822 	}
   823 #else /* !BN_MUL_COMBA */
   824 
   825 /* hmm... is it faster just to do a multiply? */
   826 #undef bn_sqr_comba4
   827 EXPORT_C void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
   828 	{
   829 	BN_ULONG t[8];
   830 	bn_sqr_normal(r,a,4,t);
   831 	}
   832 
   833 #undef bn_sqr_comba8
   834 EXPORT_C void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
   835 	{
   836 	BN_ULONG t[16];
   837 	bn_sqr_normal(r,a,8,t);
   838 	}
   839 
   840 EXPORT_C void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
   841 	{
   842 	r[4]=bn_mul_words(    &(r[0]),a,4,b[0]);
   843 	r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]);
   844 	r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]);
   845 	r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]);
   846 	}
   847 
   848 EXPORT_C void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
   849 	{
   850 	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
   851 	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
   852 	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
   853 	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
   854 	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
   855 	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
   856 	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
   857 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
   858 	}
   859 
   860 #endif /* !BN_MUL_COMBA */