os/ossrv/genericopenlibs/liboil/src/motovec/vec_memset.s
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/motovec/vec_memset.s	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,553 @@
     1.4 +//------------------------------------------------------------------
     1.5 +// file:  vec_memset.S
     1.6 +//    AltiVec enabled version of memset and bzero and cacheable_memzero
     1.7 +//------------------------------------------------------------------
     1.8 +
     1.9 +//------------------------------------------------------------------
    1.10 +//	Copyright Motorola, Inc. 2002
    1.11 +//	ALL RIGHTS RESERVED
    1.12 +//
    1.13 +//	You are hereby granted a copyright license to use, modify, and 
    1.14 +//	distribute the SOFTWARE so long as this entire notice is retained 
    1.15 +//	without alteration in any modified and/or redistributed versions, 
    1.16 +//	and that such modified versions are clearly identified as such.  
    1.17 +//	No licenses are granted by implication, estoppel or otherwise under 
    1.18 +//	any patents or trademarks of Motorola, Inc.
    1.19 +//
    1.20 +//	The SOFTWARE is provided on an "AS IS" basis and without warranty.  
    1.21 +//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS 
    1.22 +//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED 
    1.23 +//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR 
    1.24 +//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH 
    1.25 +//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS 
    1.26 +//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. 
    1.27 +//
    1.28 +//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL 
    1.29 +//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER 
    1.30 +//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF 
    1.31 +//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS 
    1.32 +//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR 
    1.33 +//	INABILITY TO USE THE SOFTWARE.   Motorola assumes no responsibility 
    1.34 +//	for the maintenance and support of the SOFTWARE.
    1.35 +//------------------------------------------------------------------
    1.36 +
    1.37 +//------------------------------------------------------------------
    1.38 +// extern void *memset( void *ptr, int val, size_t len );
    1.39 +//   Copies val into each of len characters beginning at ptr.
    1.40 +//                                       - Harbison&Steele 4th ed
    1.41 +//    (despite val being an int, this memset assumes it is never
    1.42 +//     more than a byte.  That seems to be correct from all the
    1.43 +//     memset functions I've seen but I don't know if ANSI allows
    1.44 +//     anything longer.     Chuck Corley  12/21/02) 
    1.45 +// Returns:
    1.46 +//  void * ptr
    1.47 +//------------------------------------------------------------------
    1.48 +
    1.49 +//------------------------------------------------------------------
    1.50 +// extern void * bzero( char *ptr, int len);   
    1.51 +//   Copies 0 into each of len characters at ptr.
    1.52 +//                                       - Harbison&Steele 4th ed
    1.53 +// Returns:
    1.54 +//  void * ptr
    1.55 +//------------------------------------------------------------------
    1.56 +
    1.57 +// Revision History:
    1.58 +//    Rev 0.0	Original                        Chuck Corley	02/09/03
    1.59 +//              Could benefit from changes added to memcpy
    1.60 +//    Rev 0.1	Revised per memcpy Rev 0.30     Chuck Corley	05/01/03
    1.61 +//
    1.62 +//  This is beta quality code; users are encouraged to make it faster.
    1.63 +//  ASSUMPTIONS:
    1.64 +//     Code is highly likely to be in the cache; data is not (streaming data)
    1.65 +//     Zero fill could be quite likely.
    1.66 +//     Moving fill byte from GPR to VR as below faster than stw->lvebx via stack
    1.67 +
    1.68 +#define VRSV 256	//	VRSAVE spr
    1.69 +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
    1.70 +#define MIN_VEC 16
    1.71 +
    1.72 +// Register usage
    1.73 +#define Rt r0	// 	r0 when used as a temporary register	
    1.74 +
    1.75 +#define DST r3	// 	entering: dest pointer; exiting: same dest pointer
    1.76 +
    1.77 +#define FILL r4	// 	entering: fill char then fill word
    1.78 +
    1.79 +#define BC r5	//	entering: Byte_Count then remaining Byte_Count
    1.80 +
    1.81 +#define DBC r6//	dst + byte count
    1.82 +
    1.83 +#define BK r7	//  	BC - 1 +/- (n*16)
    1.84 +
    1.85 +#define Fsh r8	//	fill byte shifted right one nibble
    1.86 +
    1.87 +#define DM1 r9//	dst -1 for byte-by-byte backwards initially
    1.88 +#define D r9	//	(dst+16)[0:27] - dst[28:31]
    1.89 +#define DNX r9	//	(dst+n*16)[28:31]
    1.90 +#define BL r9	//	second byte_kount index pointer
    1.91 +
    1.92 +#define DR r10	//	(dst+16)[0:27]
    1.93 +#define QW r10	//  	number of cache lines
    1.94 +
    1.95 +#define DBK r11	//	(dst+byte_count-1) then (dst+byte_count-1)[28:31]
    1.96 +
    1.97 +#define RSV r12	//  	storage for VRSAVE register if used
    1.98 +
    1.99 +//  Condition register use (not including temporary cr0)
   1.100 +//      cr0[2]   = (FILL==0)?
   1.101 +//      cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
   1.102 +// then cr1[2]   = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
   1.103 +// then cr1[2]   = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
   1.104 +//      cr6[2]   = (QW == 0)? 1 : 0;
   1.105 +// then cr6[1]   = (QW > 4)? 1 : 0; (>4 vectors to move?)
   1.106 +// then cr6[3]   = (third store[27] == 1)? 1: 0; (cache line alignment)
   1.107 +// then cr6[3]   = (last store[27] == 1)? 1: 0; (last store odd?)
   1.108 +//      cr7[2]   = (BC>MIN_VEC)?1:0;  (BC big enough to warrant vectors)
   1.109 +// then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
   1.110 +// then cr7[0:3] = (DST+BC)[0:27]  (How many bytes (iff <16) in last vector?)
   1.111 +
   1.112 +// Conditionalize the use of dcba.  It will help if the data is
   1.113 +// not in cache and hurt if it is.  Generally, except for small
   1.114 +// benchmarks repeated many times, we assume data is not in cache
   1.115 +// (data streaming) and using dcba is a performance boost.
   1.116 +// We use dcba which will noop to non-cacheable memory rather than
   1.117 +// dcbz which will cause an alignment exception.
   1.118 +#ifndef NO_DCBA
   1.119 +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
   1.120 + // gcc and codewarrior and diab don't assemble dcba
   1.121 +#define DCBK .long 0x7c033dec
   1.122 +// dcba r3,r7    or    dcba DST,BK
   1.123 +#else
   1.124 +#ifdef __ghs__
   1.125 +.macro DCBK
   1.126 +.long 0x7c033dec
   1.127 +.endm
   1.128 +#else
   1.129 +#define DCBK dcba DST,BK
   1.130 +#endif  // __ghs__
   1.131 +#endif  // __GNUC__ or __MWERKS__
   1.132 +#else
   1.133 +#define DCBK nop
   1.134 +#endif  // NO_DCBA
   1.135 +
   1.136 +	.text
   1.137 +#ifdef __MWERKS__
   1.138 +	.align	32
   1.139 +#else
   1.140 +	.align	5
   1.141 +#endif
   1.142 +
   1.143 +#ifdef LIBMOTOVEC
   1.144 +	.globl	memset     
   1.145 +memset:
   1.146 +#else
   1.147 +	.globl	_vec_memset     
   1.148 +_vec_memset:
   1.149 +#endif
   1.150 +
   1.151 +	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
   1.152 +	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count
   1.153 +	rlwinm.	Fsh,FILL,28,28,3 // IU1 cr0[2]=(FILL==0)?; Fsh = fill byte rotated right one nibble
   1.154 +
   1.155 +	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
   1.156 +	addi	DR,DST,16	// IU1 Address of second dst vector
   1.157 +	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
   1.158 +	bgt	cr7,v_memset	// b if BC>MIN_VEC
   1.159 +
   1.160 +	mtctr	BC		// for (i=1;i<=BC;i++)
   1.161 +	beqlr	cr1		// return if BC = 0
   1.162 +Byte_set:
   1.163 +	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
   1.164 +	bdnz	Byte_set
   1.165 +
   1.166 +	blr
   1.167 +
   1.168 +v_memset:
   1.169 +// Byte count < MIN_VEC bytes will have been set by scalar code above,
   1.170 +// so this will not deal with small block sets < MIN_VEC.
   1.171 +
   1.172 +// For systems using VRSAVE, define VRSAV=1 when compiling.  For systems
   1.173 +// that don't, make sure VRSAVE is undefined.
   1.174 +#ifdef VRSAVE
   1.175 +	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
   1.176 +#endif
   1.177 +	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
   1.178 +	addi	DBK,DBC,-1	// IU1 Address of last dst byte
   1.179 +
   1.180 +#ifdef VRSAVE
   1.181 +	oris	Rt,RSV,0xe000	// IU1 Or in registers used by this routine
   1.182 +#endif
   1.183 +	subf	D,DST,DR	// IU1 How many bytes in first destination?
   1.184 +	li	BK,0		// IU1 Initialize byte kount index
   1.185 +
   1.186 +#ifdef VRSAVE
   1.187 +	mtspr	VRSV,Rt	// IU2 Save in VRSAVE before first vec op
   1.188 +#endif
   1.189 +	vxor	v0,v0,v0	// VIU Clear v0
   1.190 +	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
   1.191 +	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
   1.192 +	beq+	enter_bzero	// b if FILL==0 (v0 already cleared for bzero path)
   1.193 +
   1.194 +	lvsl	v0,0,Fsh	// LSU Move upper nibble to byte 0 of VR
   1.195 +	vspltisb	v1,4	// VPU Splat 0x4 to every byte
   1.196 +
   1.197 +	lvsl	v2,0,FILL	// LSU Move lower nibble to byte 0 of VR
   1.198 +
   1.199 +	vslb	v0,v0,v1	// VIU Move upper nibble to VR[0:3]
   1.200 +
   1.201 +	vor	v0,v0,v2	// VIU Form FILL byte in VR[0:7]
   1.202 +
   1.203 +	vspltb	v0,v0,0		// VPU Splat the fill byte to all bytes
   1.204 +enter_bzero:
   1.205 +	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
   1.206 +	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
   1.207 +	beq	cr1,Left_just	// b if D0 is left justified
   1.208 +
   1.209 +	bns	cr7,No_B_fwd	// b if only even number of bytes to store
   1.210 +
   1.211 +	stvebx	v0,DST,BK	// LSU store first byte at DST+0
   1.212 +	addi	BK,BK,1		// IU1 increment index
   1.213 +No_B_fwd:
   1.214 +	bne	cr7,No_H_fwd	// b if only words to store
   1.215 +
   1.216 +	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
   1.217 +	addi	BK,BK,2		// IU1 increment index
   1.218 +No_H_fwd:
   1.219 +	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store
   1.220 +
   1.221 +	stvewx	v0,DST,BK	// LSU store word 1 of one or three
   1.222 +	addi	BK,BK,4		// IU1 increment index
   1.223 +
   1.224 +No_W1_fwd:
   1.225 +	bnl	cr7,No_W2_fwd	// b if there was only one word to store
   1.226 +	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
   1.227 +	addi	BK,BK,4		// IU1 increment index
   1.228 +
   1.229 +	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
   1.230 +	b	No_W2_fwd
   1.231 +
   1.232 +Left_just:	
   1.233 +	stvx	v0,0,DST	// LSU Store 16 bytes at D0
   1.234 +No_W2_fwd:
   1.235 +	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
   1.236 +	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
   1.237 +
   1.238 +	li	BK,16		// IU1 Re-initialize byte kount index
   1.239 +	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
   1.240 +	ble	cr6,Last_QW	// b if no Quad words to do
   1.241 +
   1.242 +	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
   1.243 +	cmpi	cr6,0,QW,4	// IU1 Check QW>4
   1.244 +
   1.245 +QW_loop:
   1.246 +	stvx	v0,DST,BK	// LSU Store 16 fill bytes
   1.247 +	addi	BK,BK,16	// IU1 Increment byte kount index
   1.248 +	bdnzf	25,QW_loop	// CTR--; loop while cr6[1]=0 (4 or fewer QWs to do)
   1.249 +
   1.250 +	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
   1.251 +	addi	QW,QW,-1	// IU1 One more QW stored by now
   1.252 +	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left
   1.253 +
   1.254 +Last_QW:	// Next vector is the last; we're done.
   1.255 +	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7
   1.256 +
   1.257 +	beq	cr1,Rt_just_fwd	// b if last destination is right justified
   1.258 +
   1.259 +	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
   1.260 +	li	BL,0		// IU1 Initialize index pointer
   1.261 +	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store
   1.262 +
   1.263 +	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
   1.264 +	addi	BL,BL,4		// IU1 increment index
   1.265 +
   1.266 +	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
   1.267 +	addi	BL,BL,4		// IU1 increment index
   1.268 +Only_1W_fwd:
   1.269 +	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store
   1.270 +
   1.271 +	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
   1.272 +	addi	BL,BL,4		// IU1 increment index
   1.273 +Only_2W_fwd:
   1.274 +	bne	cr7,Only_B_fwd	// b if there are no half words to store
   1.275 +
   1.276 +	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
   1.277 +	addi	BL,BL,2		// IU1 increment index
   1.278 +Only_B_fwd:
   1.279 +	bns	cr7,All_done_fwd	// b if there are no bytes to store
   1.280 +
   1.281 +	stvebx	v0,DBK,BL	// LSU store one byte if necessary
   1.282 +	b	All_done_fwd
   1.283 +
   1.284 +Rt_just_fwd:
   1.285 +
   1.286 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
   1.287 +All_done_fwd:
   1.288 +#ifdef VRSAVE
   1.289 +	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	
   1.290 +#endif
   1.291 +	blr			// Return destination address from entry
   1.292 +
   1.293 +#ifdef __MWERKS__
   1.294 +	.align	16
   1.295 +#else
   1.296 +	.align	4
   1.297 +#endif
   1.298 +GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice
   1.299 +
   1.300 +	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
   1.301 +	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
   1.302 +	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
   1.303 +
   1.304 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
   1.305 +	addi	BK,BK,16	// IU1 Increment byte count by 16
   1.306 +	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL
   1.307 +
   1.308 +	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
   1.309 +
   1.310 +	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even
   1.311 +
   1.312 +// We need the ctr register to reflect an even byte count before entering
   1.313 +// the next block - faster to decrement than to reload.
   1.314 +	bdnz	B32_fwd		// decrement counter for last QW store odd
   1.315 +
   1.316 +B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
   1.317 +	DCBK			// LSU dcba: claim line w/o RWITM (nop if NO_DCBA)
   1.318 +
   1.319 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D11
   1.320 +	addi	BK,BK,16	// IU1 Increment byte count
   1.321 +	bdz	Nxt_loc_fwd	// always decrement and branch to next instr		
   1.322 +
   1.323 +Nxt_loc_fwd:
   1.324 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D12
   1.325 +	addi	BK,BK,16	// IU1 Increment byte count
   1.326 +	bdnz	B32_fwd		// b if there are at least two more QWs to do
   1.327 +
   1.328 +	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
   1.329 +	b	Last_QW		// b if last store is to even address
   1.330 +
   1.331 +// Come here with two more loads and two stores to do
   1.332 +One_even_QW:
   1.333 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
   1.334 +	addi	BK,BK,16	// IU1 Increment byte count
   1.335 +
   1.336 +	b	Last_QW
   1.337 +
   1.338 +// End of memset in AltiVec
   1.339 +
   1.340 +#define BCz r4		// in bzero r4 enters with byte count
   1.341 +
   1.342 +#ifdef __MWERKS__
   1.343 +	.align	32
   1.344 +#else
   1.345 +	.align	5
   1.346 +#endif
   1.347 +
   1.348 +#ifdef LIBMOTOVEC
   1.349 +	.globl	bzero     
   1.350 +bzero:
   1.351 +#else
   1.352 +	.globl	vec_bzero     
   1.353 +vec_bzero:
   1.354 +#endif
   1.355 +
   1.356 +	mr	BC,BCz		// IU1 bzero's arg2 (r4) is the byte count; move to BC (r5)
   1.357 +	li	FILL,0		// IU1 bzero always fills with zero
   1.358 +#ifdef LIBMOTOVEC
   1.359 +	b	memset     
   1.360 +#else
   1.361 +	b	_vec_memset     
   1.362 +#endif
   1.363 +
   1.364 +// cacheable_memzero will employ dcbz to clear 32 bytes at a time
   1.365 +// of cacheable memory. Like bzero, second entering argument will be BC.
   1.366 +// Using this for non-cacheable memory will generate an alignment exception.
   1.367 +
   1.368 +	.text
   1.369 +#ifdef __MWERKS__
   1.370 +	.align	32
   1.371 +#else
   1.372 +	.align	5
   1.373 +#endif
   1.374 +
   1.375 +#ifdef LIBMOTOVEC
   1.376 +	.globl	cacheable_memzero     
   1.377 +cacheable_memzero:
   1.378 +#else
   1.379 +	.globl	vec_cacheable_memzero     
   1.380 +vec_cacheable_memzero:
   1.381 +#endif
   1.382 +
   1.383 +	mr	BC,BCz		// IU1 arg[2] is BC here, not FILL
   1.384 +	li	FILL,0		// IU1 for bzero FILL=0
   1.385 +	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
   1.386 +
   1.387 +	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count
   1.388 +
   1.389 +	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
   1.390 +	addi	DR,DST,16	// IU1 Address of second dst vector
   1.391 +	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
   1.392 +	bgt	cr7,c_v_memset	// b if BC>MIN_VEC
   1.393 +
   1.394 +	mtctr	BC		// for (i=1;i<=BC;i++)
   1.395 +	beqlr	cr1		// return if BC = 0
   1.396 +c_Byte_set:
   1.397 +	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
   1.398 +	bdnz	c_Byte_set
   1.399 +
   1.400 +	blr
   1.401 +
   1.402 +c_v_memset:
   1.403 +// Byte count < MIN_VEC bytes will have been set by scalar code above,
   1.404 +// so this will not deal with small block sets < MIN_VEC.
   1.405 +
   1.406 +// For systems using VRSAVE, define VRSAV=1 when compiling.  For systems
   1.407 +// that don't, make sure VRSAVE is undefined.
   1.408 +#ifdef VRSAVE
   1.409 +	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
   1.410 +#endif
   1.411 +	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
   1.412 +	addi	DBK,DBC,-1	// IU1 Address of last dst byte
   1.413 +
   1.414 +#ifdef VRSAVE
   1.415 +	oris	Rt,RSV,0x8000	// IU1 Or in v0, the only vector register used here
   1.416 +#endif
   1.417 +	subf	D,DST,DR	// IU1 How many bytes in first destination?
   1.418 +	li	BK,0		// IU1 Initialize byte kount index
   1.419 +
   1.420 +#ifdef VRSAVE
   1.421 +	mtspr	VRSV,Rt	// IU2 Save in VRSAVE before first vec op
   1.422 +#endif
   1.423 +	vxor	v0,v0,v0	// VIU Clear v0 (fill value is always zero)
   1.424 +	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
   1.425 +	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
   1.426 +
   1.427 +	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
   1.428 +	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
   1.429 +	beq	cr1,c_Left_just	// b if D0 is left justified
   1.430 +
   1.431 +	bns	cr7,c_No_B_fwd	// b if only even number of bytes to store
   1.432 +
   1.433 +	stvebx	v0,DST,BK	// LSU store first byte at DST+0
   1.434 +	addi	BK,BK,1		// IU1 increment index
   1.435 +c_No_B_fwd:
   1.436 +	bne	cr7,c_No_H_fwd	// b if only words to store
   1.437 +
   1.438 +	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
   1.439 +	addi	BK,BK,2		// IU1 increment index
   1.440 +c_No_H_fwd:
   1.441 +	bng	cr7,c_No_W1_fwd	// b if exactly zero or two words to store
   1.442 +
   1.443 +	stvewx	v0,DST,BK	// LSU store word 1 of one or three
   1.444 +	addi	BK,BK,4		// IU1 increment index
   1.445 +
   1.446 +c_No_W1_fwd:
   1.447 +	bnl	cr7,c_No_W2_fwd	// b if there was only one word to store
   1.448 +	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
   1.449 +	addi	BK,BK,4		// IU1 increment index
   1.450 +
   1.451 +	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
   1.452 +	b	c_No_W2_fwd
   1.453 +
   1.454 +c_Left_just:	
   1.455 +	stvx	v0,0,DST	// LSU Store 16 bytes at D0
   1.456 +c_No_W2_fwd:
   1.457 +	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
   1.458 +	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
   1.459 +
   1.460 +	li	BK,16		// IU1 Re-initialize byte kount index
   1.461 +	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
   1.462 +	ble	cr6,c_Last_QW	// b if no Quad words to do
   1.463 +
   1.464 +	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
   1.465 +	cmpi	cr6,0,QW,4	// IU1 Check QW>4
   1.466 +
   1.467 +c_QW_loop:
   1.468 +	stvx	v0,DST,BK	// LSU Store 16 fill bytes
   1.469 +	addi	BK,BK,16	// IU1 Increment byte kount index
   1.470 +	bdnzf	25,c_QW_loop	// CTR--; loop while cr6[1]=0 (4 or fewer QWs to do)
   1.471 +
   1.472 +	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
   1.473 +	addi	QW,QW,-1	// IU1 One more QW stored by now
   1.474 +	bgt	cr6,c_GT_4QW_fwd	// b if >4 quad words left
   1.475 +
   1.476 +c_Last_QW:	// Next vector is the last; we're done.
   1.477 +	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7
   1.478 +
   1.479 +	beq	cr1,c_Rt_just_fwd	// b if last destination is right justified
   1.480 +
   1.481 +	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
   1.482 +	li	BL,0		// IU1 Initialize index pointer
   1.483 +	bnl	cr7,c_Only_1W_fwd	// b if there was only one or zero words to store
   1.484 +
   1.485 +	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
   1.486 +	addi	BL,BL,4		// IU1 increment index
   1.487 +
   1.488 +	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
   1.489 +	addi	BL,BL,4		// IU1 increment index
   1.490 +c_Only_1W_fwd:
   1.491 +	bng	cr7,c_Only_2W_fwd	// b if only two or zero words left (bugfix: was memset's Only_2W_fwd)
   1.492 +
   1.493 +	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
   1.494 +	addi	BL,BL,4		// IU1 increment index
   1.495 +c_Only_2W_fwd:
   1.496 +	bne	cr7,c_Only_B_fwd	// b if there are no half words to store
   1.497 +
   1.498 +	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
   1.499 +	addi	BL,BL,2		// IU1 increment index
   1.500 +c_Only_B_fwd:
   1.501 +	bns	cr7,c_All_done_fwd	// b if there are no bytes to store
   1.502 +
   1.503 +	stvebx	v0,DBK,BL	// LSU store one byte if necessary
   1.504 +	b	c_All_done_fwd
   1.505 +
   1.506 +c_Rt_just_fwd:
   1.507 +
   1.508 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
   1.509 +c_All_done_fwd:
   1.510 +#ifdef VRSAVE
   1.511 +	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	
   1.512 +#endif
   1.513 +	blr			// Return destination address from entry
   1.514 +
   1.515 +#ifdef __MWERKS__
   1.516 +	.align	16
   1.517 +#else
   1.518 +	.align	4
   1.519 +#endif
   1.520 +c_GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice
   1.521 +
   1.522 +	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
   1.523 +	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
   1.524 +	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
   1.525 +
   1.526 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
   1.527 +	addi	BK,BK,16	// IU1 Increment byte count by 16
   1.528 +	bdnzf	27,c_GT_4QW_fwd	// b if next store is to lower (even) half of CL
   1.529 +
   1.530 +	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
   1.531 +
   1.532 +	bns	cr6,c_B32_fwd	// b if DST[27] == 0; i.e., final store is even
   1.533 +
   1.534 +// We need the ctr register to reflect an even byte count before entering
   1.535 +// the next block - faster to decrement than to reload.
   1.536 +	bdnz	c_B32_fwd	// CTR--; continue below (bugfix: was memset's B32_fwd)
   1.537 +
   1.538 +c_B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
   1.539 +	dcbz	DST,BK		// LSU zero whole cache line
   1.540 +	bdz	c_Nxt_loc_fwd	// always decrement and branch to next instr		
   1.541 +
   1.542 +c_Nxt_loc_fwd:
   1.543 +	addi	BK,BK,32	// IU1 Increment byte count
   1.544 +	bdnz	c_B32_fwd	// b if at least two more QWs to do (bugfix: was memset's B32_fwd)
   1.545 +
   1.546 +	bso	cr6,c_One_even_QW	// b if there is one even and one odd QW to store
   1.547 +	b	c_Last_QW		// b if last store is to even address
   1.548 +
   1.549 +// Come here with two more loads and two stores to do
   1.550 +c_One_even_QW:
   1.551 +	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
   1.552 +	addi	BK,BK,16	// IU1 Increment byte count
   1.553 +
   1.554 +	b	c_Last_QW
   1.555 +
   1.556 +// End of cacheable_memzero in AltiVec