os/ossrv/genericopenlibs/liboil/src/motovec/vec_memcpy.s
changeset 0 bde4ae8d615e
//------------------------------------------------------------------
// file:  vec_memcpy.S
//    AltiVec enabled version of memcpy and bcopy
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2003
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.   Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern  void * memcpy(void *dst, const void *src, size_t len);
// Returns:
//  void *dst
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * memmove( void *dst, const void *src, size_t len );
//   Copies len characters from src to dst and returns the value of
//   dst.  Works correctly for overlapping memory regions.
//               - Harbison&Steele 4th ed (corrected as to return)
// Returns:
//  void *dst
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern  void * bcopy(const void *src, void *dst,  size_t len);
// Returns:
//  void *dst
//------------------------------------------------------------------

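// Illustrative usage (a hypothetical C caller, not part of this file):
//
//	char buf[64];
//	memcpy(buf, buf + 32, 16);	/* disjoint regions            */
//	memmove(buf + 1, buf, 32);	/* overlapping regions         */
//	bcopy(buf + 32, buf, 16);	/* note reversed src,dst order */
//
// Here all three calls reach the common vector code below, so even the
// memcpy entry tolerates overlap (see the dispatch table that follows).
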
// memcpy and memmove are combined into one entry point here because of
// the similarity of operation and need to create fool-proof code.
// The following conditions determine what is "fool proof":
//
// if:                                          then single entry:
// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC    will b to v_memcpy
// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC    must b to v_memcpy
// (DST-SRC)<0                  && BC<MIN_VEC    copy fwd byte-by-byte
// (DST-SRC)==0                 || BC==0         will just return
// (DST-SRC)>0                  && BC<MIN_VEC    copy bkwd byte-by-byte
// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC    must b to v_memmove
// (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC    will b to v_memmove

// If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
// this code will branch to v_memcpy anyway for maximum performance.
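
// The same dispatch as a C sketch (illustrative only; v_memcpy, v_memmove,
// Byte_cpy_fwd and Byte_cpy_bwd name the labels further down in this file):
//
//	if (DST == SRC || BC == 0) return DST;
//	if (DST < SRC) {		/* forward copy is safe      */
//		if (BC > MIN_VEC) goto v_memcpy;  else goto Byte_cpy_fwd;
//	} else {			/* DST > SRC: copy backward  */
//		if (BC > MIN_VEC) goto v_memmove; else goto Byte_cpy_bwd;
//	}
//
// and v_memmove itself branches to the forward v_memcpy path when
// DST-SRC >= BC, since the regions then cannot collide.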

// Revision History:
//    Rev 0.0	Original                          Chuck Corley	02/03/03
//              Can still add dst, 128B loop, and aligned option
//    Rev 0.01  Fixed JY's seg-fault violation              CJC 02/17/03
//    Rev 0.1   Added 128B loop and dst; cndtnlzd dcbz      CJC 02/18/03
//              (Creating separate path for QW aligned didn't help much)
//    Rev 0.11  Small code schdling; chngd dst for memmove  CJC 02/23/03
//    Rev 0.20  Eliminated alternate entry and cleanup      CJC 02/27/03
//    Rev 0.21  Improved loop branch targets for v_memcpy   CJC 03/01/03
//    Rev 0.22  Experimented with dst (sent to H.)          CJC 03/02/03
//    Rev 0.23  Substituted dcba for dcbz (sent to JY)      CJC 03/08/03
//    Rev 0.24  Use two dst streams                         CJC 03/12/03
//    Rev 0.25  Fix for all compilers, cleanup, and release with
//              libmotovec.a rev 0.10                       CJC 03/14/03
//    Rev 0.30  Fix for pre-empted destination (SNDF-DS)    CJC 04/02/03
//
//  Between Rev 0.25 and 0.30 the code was revised to store elements of
//  source at destination when first and/or last vector are less than 16
//  bytes. A reviewer at SNDF observed that loading the destination vector
//  for merging exposed the "uninvolved" destination bytes to incoherency
//  if an interrupt pre-empted this routine and modified the "uninvolved"
//  destination vector(s) while held in register for merging.  It seems
//  like a low possibility but this revision is no longer subject to it.
//  (It is also slightly faster than Rev 0.25.)  See the sketch below.
//  This is beta quality code; users are encouraged to make it faster.
//  ASSUMPTIONS:
//     Code is highly likely to be in the cache; data is not (streaming data)
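//
//  That fix, as a C sketch (illustrative only): Rev 0.25 loaded the
//  destination vector and merged, i.e. roughly
//	vec_st(vec_perm(dst_vec, src_vec, mask), 0, dst);
//  which rewrites the uninvolved dst bytes from a stale register copy.
//  Rev 0.30 instead stores only the bytes that belong to the move,
//  element by element (stvewx/stvehx/stvebx), in the spirit of:
//	while (n >= 4) { *(int *)d = *(int *)s;     d += 4; s += 4; n -= 4; }
//	if (n >= 2)    { *(short *)d = *(short *)s; d += 2; s += 2; n -= 2; }
//	if (n)         { *(char *)d = *(char *)s; }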

#define VRSV 256	//	VRSAVE spr
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16
// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap.
#define MIN_OVL 128

// Register usage
#define Rt r0	// 	r0 when used as a temporary register

#define DST r3	// 	entering: dst pointer; exiting: same dst pointer

#define SRC r4	// 	entering: src ptr; then end of src range index (SRC+BC) in memmove

#define BC r5	//	entering: Byte_Count

#define PCS r6	//  	save for partial checksum entering

#define DMS r7	//      dst - src initially
#define BK r7	//  	BC - 1 +/- (n*16)

// Codewarrior will put an unwelcome space as "lbzu	r0,1(r7 )"
// if you don't put the comment right after the r7.  CJC 030314
#define SM1 r8//	src -1 for byte-by-byte forwards initially
#define S r8	//	src[28:31]
#define SMD r8	//      src[0:27]-dst[0:27]
#define STR r8	//	data stream touch block & stride info for Big_loop

#define DM1 r9//	dst -1 for byte-by-byte forwards initially
#define D r9	//	dst[28:31]
#define DNX r9	//	(dst+n*16)[28:31]
#define BL r9	//	second byte_kount index pointer

#define SBC r10//	src + byte count initially then src[28:31]
#define BLK r10	//      temporary data stream touch block & stride info
#define DR r10	//	(dst+16)[0:27]
#define QW r10	//  	number of quad words (vectors)

#define DBC r11//	dst + byte count initially
#define BLL r11	//      temporary data stream touch block & stride info
#define SBK r11	//	(src+byte_count-1)
#define SBR r11	//	(src+byte_count-1)[0:27]
#define DBK r11	//	(dst+byte_count-1) then (dst+byte_count-1)[28:31]
#define BIG r11	//	QW/8 or 128 byte loop count
#define SP8 r11	//      SRC + n*128 (8 QWs) for data streaming after first call

#define RSV r12	//  	storage for VRSAVE register if used

#define VS0   v0	//  	src vector for permuting

#define VS1   v1	//  	src vector for permuting

#define VP3   v2	// 	d - s permute register

#define VPS0  v3	// 	permuted source vector to store

#define VPS1  v4	//  	2nd permuted source vector to store

#define VPS2  v5	//      additional permuted src in Big loop

#define VS2   v6	//  	src vector for permuting
#define VPS3  v6	//      additional permuted src in Big loop

#define VS3   v7	//      additional src load in Big loop
#define VPS4  v7	//      additional permuted src in Big loop

#define VS4   v8	//      additional src load in Big loop
#define VPS5  v8	//      additional permuted src in Big loop

#define VS5   v9	//      additional src load in Big loop
#define VPS6  v9	//      additional permuted src in Big loop

#define VS6   v10	//      additional src load in Big loop
#define VPS7  v10	//      additional permuted src in Big loop

#define VS7   v11	//      additional src load in Big loop

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
 // gcc and codewarrior and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba r3,r7    or    dcba DST,BK
#define DCBL .long 0x7c034dec
// dcba r3,r9     or    dcba DST,BL
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
.macro DCBL
.long 0x7c034dec
.endm
#else
#define DCBK dcba DST,BK
#define DCBL dcba DST,BL
#endif  // __ghs__
#endif  // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#define DCBL nop
#endif  // NO_DCBA
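
// How the .long encodings above are derived (X-form; primary opcode 31,
// extended opcode 758 for dcba, rA in bits 11:15, rB in bits 16:20):
//
//	(31 << 26) | (rA << 16) | (rB << 11) | (758 << 1)
//	rA=3 (DST), rB=7 (BK): 0x7c000000|0x30000|0x3800|0x5ec = 0x7c033dec
//	rA=3 (DST), rB=9 (BL): 0x7c000000|0x30000|0x4800|0x5ec = 0x7c034dec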

// Conditionalize the use of dst (data stream touch).  It will help
// if the data is not in cache and hurt if it is (though not as badly
// as dcbz).  Generally, except for small benchmarks repeated many times,
// we assume data is not in cache (data streaming) and using dst is a
// performance boost.
#ifndef NO_DST
#define STRM_B dst	SBC,BLL,0
#define STRM_F dst	SRC,BLK,0
#define STRM_1 dst	SP8,STR,1

#else
#define STRM_B	nop
#define STRM_F	nop
#define STRM_1	nop
#endif
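
// Layout of the dst control word (the rB operand), per the AltiVec PEM:
// block size in bits 3:7 (units of 16 bytes), block count in bits 8:15,
// signed byte stride in bits 16:31.  The constants built below decode as:
//
//	BLL = 0x010cffe0  12 blocks of 16B, stride -32  (STRM_B, memmove)
//	BLK = 0x010c0020  12 blocks of 16B, stride +32  (STRM_F, memcpy)
//	STR varies (0x0102ffe0, 0x01040020, ...) for the inner loops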

//  Condition register use
//      cr0[0:2] = (dst-src==0)? return : ((dst-src>0)? copy_bkwd : copy_fwd);
// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left" : "shifting right";
//      cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
// then cr1[2]   = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
// then cr1[2]   = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0; (DN right justified)
//      cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0;  (Overlap too small for Big loop?)
//      cr6[1,2] = (DST-SRC>=BC)?1:0;  (Okay for v_memmove to copy forward?)
// then cr6[2]   = (QW == 0)? 1 : 0; (Any full vectors to move?)
// then cr6[1]   = (QW > 4)? 1 : 0; (>4 vectors to move?)
// then cr6[3]   = (third store[27] == 1)? 1 : 0; (cache line alignment)
// then cr6[3]   = (last store[27] == 1)? 1 : 0; (last store odd?)
//      cr7[2]   = (BC>MIN_VEC)?1:0;  (BC big enough to warrant vectors)
// then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
// then cr7[1]   = (QW > 14)? 1 : 0; (>14 vectors to move?)
// then cr7[0:3] = (DST+BC)[28:31]  (How many bytes (iff <16) in last vector?)

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	memmove
memmove:
	nop			// IU1 Compilers forget first label
	.globl	memcpy
memcpy:
#else
	.globl	vec_memmove
vec_memmove:
	nop			// IU1 Only way I know to preserve both labels
	.globl	_vec_memcpy
_vec_memcpy:
#endif
	subf.	DMS,SRC,DST	// IU1 Compute dst-src difference
	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count moves
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count

	addi	SM1,SRC,-1	// IU1 Pre-bias and duplicate src for fwd
	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	add	SBC,SRC,BC	// IU1 Pre-bias and duplicate src for bkwd
	beqlr			// return if DST = SRC

	add	DBC,DST,BC	// IU1 Pre-bias and duplicate destination
	subf	Rt,DST,SRC	// IU1 Form |DST-SRC| if DST-SRC<0
	beqlr	cr1		// return if BC = 0

	bgt	Cpy_bkwd	// b if DST-SRC>0 (have to copy backward)
	cmpi	cr5,0,Rt,MIN_OVL	// IU1 (|DST-SRC|>128)?1:0; for v_memcpy
	bgt	cr7,v_memcpy	// b if BC>MIN_VEC (okay to copy vectors fwd)

// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC
	mtctr	BC		// i=BC; do ...;i--; while (i>0)
Byte_cpy_fwd:
	lbzu	Rt,1(SM1)	// LSU * ++(DST-1) = * ++(SRC-1)
	stbu	Rt,1(DM1)	// LSU
	bdnz	Byte_cpy_fwd

	blr
	nop			// IU1 Improve next label as branch target
Cpy_bkwd:
	cmpi	cr5,0,DMS,MIN_OVL	// IU1 ((DST-SRC)>128)?1:0; for v_memcpy
	cmp	cr6,0,DMS,BC	// IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
	bgt	cr7,v_memmove	// b if BC>MIN_VEC (copy vectors bkwd)
// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
	mtctr	BC		// i=BC; do ...;i--; while (i>0)
Byte_cpy_bwd:
	lbzu	Rt,-1(SBC)	// LSU * --(DST+BC) = * --(SRC+BC)
	stbu	Rt,-1(DBC)	// LSU Store it
	bdnz	Byte_cpy_bwd
	blr

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif

v_memmove:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
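// e.g. (illustrative; the exact driver invocation depends on your
// toolchain):  gcc -c -DVRSAVE=1 vec_memcpy.S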
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]
	bge	cr6,MC_entry	// b to v_memcpy if DST-SRC>=BC (fwd copy OK)

#ifdef VRSAVE
	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine
#endif
	lis	BLL,0x010c	// IU1 Stream 12 blocks of 16 bytes
	subf.	SMD,D,S		// IU1 if S-D<0 essentially shifting right

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right
	ori	BLL,BLL,0xffe0	// IU1 Stream stride -32B

	STRM_B			// LSU Start data stream at SRC+BC
	addi	SBK,SBC,-1	// IU1 Address of last src byte
	bgt	Rt_shft		// Bytes from upper vector = (s-d>0)?s-d:16+s-d;
	addi	SMD,SMD,16	// IU1 Save 16-(d-s)
Rt_shft:

	rlwinm	SBR,SBK,0,0,27	// IU1 (SRC+BC-1)[0:27]
	addi	BK,BC,-1	// IU1 Initialize byte index

	subf	Rt,SBR,SBC	// IU1 How many bytes in first source?
	add	DBK,DST,BK	// IU1 Address of last dst byte
	addi	DR,DST,16	// IU1 Address of second dst vector

	subf.	SMD,Rt,SMD	// IU1 if bytes in 1st src>Bytes in 1st permute
	rlwinm	Rt,DBK,0,28,31	// IU1 (DST+BC-1)[28:31]
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]

// If there are more useful bytes in the upper vector of a permute pair than we
// will get in the first permute, the first loaded vector needs to be in the
// lower half of the permute pair.  The upper half is a don't care then.
	blt	Get_bytes_rt	// b if shifting left (D-S>=0)

	lvx	VS1,SRC,BK	// LSU Get SN load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memmove, an example would
// be the call memmove(BASE+0x0F, BASE+0x2F, 82). N = 6 in that case.
	addi	SRC,SRC,-16	// IU1 Decrement src base (to keep BK useful)

Get_bytes_rt:	// Come here to get VS0 & Don't care what VS1 is
	lvx	VS0,SRC,BK	// LSU Get SN-1 (SN if D-S<0) in lower vector
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	cmpi	cr7,0,Rt,0xF	// IU1 Is Dn right justified?

	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	add	Rt,DST,BC	// IU1 Refresh the value of DST+BC

	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-1 and SN to DN
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper
	beq	cr7,Rt_just	// b if DN is right justified

	mtcrf	0x01,Rt		// IU2 Put final vector byte count in cr7
	rlwinm	DBK,DBK,0,0,27	// IU1 Address of first byte of final vector
	li	D,0		// IU1 Initialize an index pointer
	bnl	cr7,Only_1W_bkwd	// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three
	addi	D,D,4		// IU1 increment index
Only_1W_bkwd:
	bng	cr7,Only_2W_bkwd	// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary
	addi	D,D,4		// IU1 increment index
Only_2W_bkwd:
	bne	cr7,Only_B_bkwd	// b if there are no half words to store

	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary
	addi	D,D,2		// IU1 increment index
Only_B_bkwd:
	bns	cr7,All_done_bkwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D	// LSU store one byte if necessary
	b	All_done_bkwd

Rt_just:
	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN
All_done_bkwd:
	addi	BK,BK,-16	// IU1 Decrement destination byte count

	ble	cr6,Last_load	// b if no Quad words to do
	mtctr	QW		// IU2 for (i=0;i<=QW;i++) - execution serializing
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
QW_loop:
	lvx	VS0,SRC,BK	// LSU Get SN-2 (or SN-1 if ADJ==0)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-2 and SN-1 to DN-1
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-1
	addi	BK,BK,-16	// IU1 Decrement byte kount
	bdnzf	25,QW_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+BC-1-16)
	bgt	cr6,GT_4QW	// b if >4 quad words left

Last_load:	// if D-S>=0, next load will be from same address as last
	blt	No_ld_bkwd	// b if shifting right (S-D>=0)
	addi	SRC,SRC,16	// IU1 recorrect source if it was decremented
No_ld_bkwd:
	lvx	VS0,0,SRC	// LSU Get last source SN-6 (guaranteed S0)
// Current 16 bytes is the last; we're done.
	dss	0		// Data stream stop
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-6 and SN-5 to DN-6
	subfic	D,DST,16	// IU1 How many bytes in first destination?
	beq	cr1,Lt_just	// b if last destination is left justified

	mtcrf	0x01,D		// IU2 Put byte count remaining in cr7
	li	D,0		// IU1 Initialize index pointer
	bns	cr7,No_B_bkwd	// b if only even number of bytes to store

	stvebx	VPS0,DST,D	// LSU store first byte at DST+0
	addi	D,D,1		// IU1 increment index
No_B_bkwd:
	bne	cr7,No_H_bkwd	// b if only words to store
	stvehx	VPS0,DST,D	// LSU store halfword at DST+0/1
	addi	D,D,2		// IU1 increment index

No_H_bkwd:
	bng	cr7,No_W1_bkwd	// b if exactly zero or two words to store
	stvewx	VPS0,DST,D	// LSU store word 1 of one or three
	addi	D,D,4		// IU1 increment index

No_W1_bkwd:
	bnl	cr7,No_W2_bkwd	// b if there was only one word to store
	stvewx	VPS0,DST,D	// LSU store word 1 of two or 2 of three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DST,D	// LSU store word 2 of two or 3 of three
	b	No_W2_bkwd

Lt_just:
	stvx	VPS0,0,DST	// LSU Store 16 bytes at final dst addr D0
No_W2_bkwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

GT_4QW:	// Do once if next store is to even half of cache line, else twice

	lvx	VS0,SRC,BK	// LSU Get SN-3 (or SN-2)
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;

	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-3 and SN-2 to Dn-2
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper
	addi	DNX,DNX,-16	// IU1 Prepare to update cr6 next loop

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-2
	vor	VS3,VS0,VS0	// VIU Make a copy of lower vector
	addi	BK,BK,-16	// IU1 Decrement byte count by 16
	bdnzt	27,GT_4QW	// b if next store is to upper (odd) half of CL
// At this point next store will be to even address.

	lis	STR,0x102	// IU1 Stream 2 blocks of 16 bytes
	mtcrf	0x02,DST	// IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)
	addi	BL,BK,-16	// IU1 Create an alternate byte count - 16

	ori	STR,STR,0xffe0	// IU1 Stream stride -32B
	addi	SP8,SRC,-64	// IU1 Starting address for data stream touch
	bso	cr6,B32_bkwd	// b if DST[27] == 1; i.e., final store is odd

	bdnz	B32_bkwd	// decrement counter for last odd QW store
B32_bkwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	lvx	VS2,SRC,BK	// LSU Get SN-4 (or SN-3)
	addi	SP8,SP8,-32	// IU1 Next starting address for data stream touch

	lvx	VS1,SRC,BL	// LSU Get SN-5 (or SN-4)
	vperm	VPS0,VS2,VS3,VP3	// VPU Align SN-4 and SN-3 to DN-3

	STRM_1			// LSU Stream 64 byte blocks ahead of loads

	DCBL			// LSU allocate next cache line

	vperm	VPS1,VS1,VS2,VP3	// VPU Align SN-5 and SN-4 to DN-4
	vor	VS3,VS1,VS1	// VIU1 Move SN-5 to SN-3

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-3
	addi	BK,BL,-16	// IU1 Decrement byte count
	bdz	Nxt_loc_bkwd	// always decrement and branch to next instr

Nxt_loc_bkwd:
	stvx	VPS1,DST,BL	// LSU Store 16 bytes at DN-4
	addi	BL,BK,-16	// IU1 Decrement alternate byte count
	bdnz	B32_bkwd	// b if there are at least two more QWs to do

	bns	cr6,One_odd_QW	// b if there was one more odd QW to store
	b	Last_load

// Come here with two more loads and two stores to do
One_odd_QW:
	lvx	VS1,SRC,BK	// LSU Get SN-6 (or SN-5)

	vperm	VPS1,VS1,VS3,VP3	// VPU Align SN-6 and SN-5 to DN-5

	stvx	VPS1,DST,BK	// LSU Store 16 bytes at DN-5

	b	Last_load

// End of memmove in AltiVec

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
v_memcpy:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]

MC_entry:	// enter here from memmove if DST-SRC>=BC; this should be faster
#ifdef VRSAVE
	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine
#endif
	lis	BLK,0x010c	// IU1 Stream 12 blocks of 16 bytes

	subf.	S,S,D		// IU1 if D-S<0 essentially shifting left

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right
	ori	BLK,BLK,32	// IU1 Stream stride 32B

	STRM_F			// LSU Start data stream 0 at SRC
	addi	DR,DST,16	// IU1 Address of second dst vector
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

// If D-S<0 we are "kinda" shifting left with the right shift permute vector
// loaded to VP3 and we need both S0 and S1 to permute.  If D-S>=0 then the
// first loaded vector needs to be in the upper half of the permute pair and
// the lower half is a don't care then.
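// Worked example (illustrative) of the lvsr/vperm trick for the memcpy
// call cited below, memcpy(BASE+0x1E, BASE+0x1F, 259): DMS = DST-SRC = -1,
// so lvsr builds VP3 = (0x01,0x02,...,0x10) and each vperm below selects
// bytes 1..16 of VSn||VSn+1, i.e. the source stream shifted left one byte
// into destination alignment.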
	bge	Ld_bytes_rt	// b if shifting right (D-S>=0)

	lvx	VS0,0,SRC	// LSU Get S0 load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memcpy, an example would
// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case.
	addi	SRC,SRC,16	// IU1 Increment src base (to keep BK useful)

Ld_bytes_rt:	// Come here to get VS1 & Don't care what VS0 is
	lvx	VS1,0,SRC	// LSU Get S1 (or S0 if D-S>=0) in upper vector
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?

	subf	Rt,DST,DR	// IU1 How many bytes in first destination?
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	li	BK,0		// IU1 Initialize byte kount index

	mtcrf	0x01,Rt		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S0 and S1 to D0

	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
	beq	cr1,Left_just	// b if D0 is left justified

	bns	cr7,No_B_fwd	// b if only even number of bytes to store

	stvebx	VPS0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd	// b if only words to store

	stvehx	VPS0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store

	stvewx	VPS0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd	// b if there was only one word to store
	stvewx	VPS0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	VPS0,DST,BK	// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:
	stvx	VPS0,0,DST	// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	cmpi	cr7,0,QW,14	// IU1 Check QW>14
	ble	cr6,Last_ld_fwd	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
QW_fwd_loop:
	lvx	VS1,SRC,BK	// LSU Get S2 (or S1)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S1 and S2 to D1
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D1(+n*16 where n<4)
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,QW_fwd_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left

Last_ld_fwd:	// Next 16 bytes is the last; we're done.
	add	DBC,DST,BC	// IU1 Recompute address of last dst byte + 1
	add	SBC,SRC,BC	// IU1 Recompute address of last src byte + 1
	bge	No_ld_fwd	// b if shifting right (D-S>=0)

	addi	SBC,SBC,-16	// IU1 if D-S>=0 we didn't add 16 to src
No_ld_fwd:
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7
	addi	DBK,DBC,-1	// IU1 Recompute address of last dst byte
	addi	Rt,SBC,-1	// IU1 Recompute address of last src byte

// If D-S<0 we have already loaded all the source vectors.
// If D-S>=0 then the first loaded vector went to the upper half of the permute
// pair and we need one more vector.  (This may be a duplicate.)

	lvx	VS1,0,Rt	// LSU Get last source S14 (guaranteed SN)

#ifndef NO_DST
	dss	0		// Data stream 0 stop

	dss	1		// Data stream 1 stop
#endif
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D14
	beq	cr1,Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	D,0		// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three
	addi	D,D,4		// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary
	addi	D,D,4		// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd	// b if there are no half words to store

	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary
	addi	D,D,2		// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D	// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry
#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice

	lvx	VS1,SRC,BK	// LSU Get S3 (or S2)
	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;

	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
	addi	Rt,QW,-2	// IU1 Ensure at least 2 QW left after big loop

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S2 and S3 to D2
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL
// At this point next store will be to even address.

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
	lis	STR,0x104	// IU1 Stream 4 blocks of 16 bytes
	addi	BL,BK,16	// IU1 Create an alternate byte kount + 32

	ori	STR,STR,32	// IU1 Stream stride 32B
#ifndef NO_BIG_LOOP
	rlwinm	BIG,Rt,29,3,31	// IU1 QW/8 big loops to do

	rlwinm	Rt,Rt,0,0,28	// IU1 How many QWs will be done in big loop
	bgt	cr7,Big_loop	// b if QW > 14
#endif
No_big_loop:
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.

	addi	SP8,SRC,256	// IU1 Starting address for data stream touch
	xoris	STR,STR,0x6	// IU1 Reset stream to 2 blocks of 16 bytes
	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even

	bdnz	B32_fwd		// decrement counter for last QW store odd

B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	lvx	VS1,SRC,BK	// LSU Get S12
	addi	SP8,SP8,32	// IU1 Next starting address for data stream touch

	lvx	VS2,SRC,BL	// LSU Get S13
	vperm	VPS1,VS0,VS1,VP3	// VPU Align S11 and S12 to D11

	STRM_1			// LSU Stream 64 byte blocks ahead of loads

	DCBK			// LSU then Kill instead of RWITM

	vperm	VPS0,VS1,VS2,VP3	// VPU Align S12 and S13 to D12
	vor	VS0,VS2,VS2	// VIU1 Move S13 to S11

	stvx	VPS1,DST,BK	// LSU Store 16 bytes at D11
	addi	BK,BL,16	// IU1 Increment byte count
	bdz	Nxt_loc_fwd	// always decrement and branch to next instr

Nxt_loc_fwd:
	stvx	VPS0,DST,BL	// LSU Store 16 bytes at D12
	addi	BL,BK,16	// IU1 Increment alternate byte count
	bdnz	B32_fwd		// b if there are at least two more QWs to do

	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
	b	Last_ld_fwd	// b if last store is to even address

// Come here with two more loads and two stores to do
One_even_QW:
	lvx	VS1,SRC,BK	// LSU Get S14 (or S13 if D-S>=0)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D13
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	Last_ld_fwd

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
Big_loop:
	subf	QW,Rt,QW	// IU1 Should be 2-7 QWs left after big loop
	blt	cr5,No_big_loop	// b back if |DST-SRC|<128; Big_loop won't work.
	mtctr	BIG		// IU2 loop for as many 128B loops as possible
	addi	SP8,SRC,256	// IU1 Starting address for data stream touch

Loop_of_128B:	// Come here with QW>=10 and next store even; VS0 last load
	lvx	VS1,SRC,BK	// LSU Get S4 (or S3 if D-S>=0)
	addi	BL,BK,32	// IU1 Increment Byte_Kount+16 by 32
	addi	SP8,SP8,128	// IU1 increment address for data stream touch

	lvx	VS3,SRC,BL	// LSU Get S6 (or S5)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+48 by 32

	lvx	VS5,SRC,BL	// LSU Get S8 (or S7)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+80 by 32

	lvx	VS7,SRC,BL	// LSU Get S10 (or S9)
	addi	BL,BK,16	// IU1 Increment Byte_Kount+16 by 16

	lvx	VS2,SRC,BL	// LSU Get S5 (or S4)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+32 by 32

	lvx	VS4,SRC,BL	// LSU Get S7 (or S6)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+64 by 32

	lvx	VS6,SRC,BL	// LSU Get S9 (or S8)
	addi	BL,BL,32	// IU1 Increment Byte_Kount+96 by 32
	vperm	VPS0,VS0,VS1,VP3	// VPU

	lvx	VS0,SRC,BL	// LSU Get S11 (or S10)
	vperm	VPS1,VS1,VS2,VP3	// VPU

	STRM_1			// LSU Stream 4 32B blocks, stride 32B

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS0,DST,BK	// LSU Store D3
	addi	BK,BK,16	// IU1 Increment Byte_Kount+16 by 16
	vperm	VPS2,VS2,VS3,VP3	// VPU

	stvx	VPS1,DST,BK	// LSU Store D4
	addi	BK,BK,16	// IU1 Increment Byte_Kount+32 by 16
	vperm	VPS3,VS3,VS4,VP3	// VPU

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS2,DST,BK	// LSU Store D5
	addi	BK,BK,16	// IU1 Increment Byte_Kount+48 by 16
	vperm	VPS4,VS4,VS5,VP3	// VPU

	stvx	VPS3,DST,BK	// LSU Store D6
	addi	BK,BK,16	// IU1 Increment Byte_Kount+64 by 16
	vperm	VPS5,VS5,VS6,VP3	// VPU

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS4,DST,BK	// LSU Store D7
	addi	BK,BK,16	// IU1 Increment Byte_Kount+80 by 16
	vperm	VPS6,VS6,VS7,VP3	// VPU

	stvx	VPS5,DST,BK	// LSU Store D8
	addi	BK,BK,16	// IU1 Increment Byte_Kount+96 by 16
	vperm	VPS7,VS7,VS0,VP3	// VPU

	DCBK			// LSU then Kill instead of RWITM

	stvx	VPS6,DST,BK	// LSU Store D9
	addi	BK,BK,16	// IU1 Increment Byte_Kount+112 by 16

	stvx	VPS7,DST,BK	// LSU Store D10
	addi	BK,BK,16	// IU1 Increment Byte_Kount+128 by 16
	bdnz	Loop_of_128B	// b if ctr > 0 (QW/8 still > 0)

	mtctr	QW		// IU1 Restore QW remaining to counter
	addi	BL,BK,16	// IU1 Create an alternate byte kount + 16
	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even

	bdnz	B32_fwd		// b and decrement counter for last QW store odd
				// One of the above branches will have been taken

// End of memcpy in AltiVec

// bcopy works like memcpy, but the source and destination operands are reversed.
// The following just swaps the operands and branches to memcpy.

#ifdef LIBMOTOVEC
	.globl	bcopy
bcopy:
#else
	.globl	vec_bcopy
vec_bcopy:
#endif
	mr	Rt,DST		// temp storage for what is really source address (r3)
	mr	DST,SRC		// swap destination address to r3 to match memcpy dst
	mr	SRC,Rt		// Complete swap of destination and source for memcpy
#ifdef LIBMOTOVEC
	b	memcpy		// b to memcpy with correct args in r3 and r4
#else
	b	_vec_memcpy	// b to vec_memcpy with correct args in r3 and r4
#endif
// End of bcopy in AltiVec