// os/ossrv/genericopenlibs/liboil/src/motovec/vec_memcpy.s
// author sl@SLION-WIN7.fritz.box
// Fri, 15 Jun 2012 03:10:57 +0200
// changeset 0 bde4ae8d615e
// permissions -rw-r--r--
// First public contribution.

//------------------------------------------------------------------
// file:  vec_memcpy.S
//    AltiVec enabled version of memcpy and bcopy
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2003
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.   Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern  void * memcpy(void *dst, const void *src, size_t len);
// Returns:
//  void *dst
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * memmove( void *dst, const void *src, size_t len );
//   Copies len characters from src to dst and returns the value of
//   dst.  Works correctly for overlapping memory regions.
//               - Harbison&Steele 4th ed (corrected as to return)
// Returns:
//  void *dst
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern  void * bcopy(const void *src, void *dst,  size_t len);
// Returns:
//  void *dst
//------------------------------------------------------------------

// memcpy and memmove are combined into one entry point here because of
// the similarity of operation and need to create fool-proof code.
// The following conditions determine what is "fool proof":
//
// if:                                          then single entry:
// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC    will b to v_memcpy
// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC    must b to v_memcpy
// (DST-SRC)<0                  && BC<MIN_VEC    copy fwd byte-by-byte
// (DST-SRC)==0                 || BC==0         will just return
// (DST-SRC)>0                  && BC<MIN_VEC    copy bkwd byte-by-byte
// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC    must b to v_memmove
// (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC    will b to v_memmove

// If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
// this code will branch to v_memcpy anyway for maximum performance.

// Revision History:
//    Rev 0.0	Original                          Chuck Corley	02/03/03
//              Can still add dst, 128B loop, and aligned option
//    Rev 0.01  Fixed JY's seg-fault violation              CJC 02/17/03
//    Rev 0.1   Added 128B loop and dst; cndtnlzd dcbz      CJC 02/18/03
//              (Creating separate path for QW aligned didn't help much)
//    Rev 0.11  Small code schdling; chngd dst for memmove  CJC 02/23/03
//    Rev 0.20  Eliminated alternate entry and cleanup      CJC 02/27/03
//    Rev 0.21  Improved loop branch targets for v_memcpy   CJC 03/01/03
//    Rev 0.22  Experimented with dst (sent to H.)          CJC 03/02/03
//    Rev 0.23  Substituted dcba for dcbz (sent to JY)      CJC 03/08/03
//    Rev 0.24  Use two dst streams                         CJC 03/12/03
//    Rev 0.25  Fix for all compilers, cleanup, and release with
//              libmotovec.a rev 0.10                       CJC 03/14/03
//    Rev 0.30  Fix for pre-empted destination (SNDF-DS)    CJC 04/02/03
//
//  Between Rev 0.25 and 0.30 the code was revised to store elements of
//  source at destination when first and/or last vector are less than 16
//  bytes. A reviewer at SNDF observed that loading the destination vector
//  for merging exposed the "uninvolved" destination bytes to incoherency
//  if an interrupt pre-empted this routine and modified the "uninvolved"
//  destination vector(s) while held in register for merging.  It seems
//  like a low possibility but this revision is no longer subject to that
//  possibility.  (It is also slightly faster than Rev 0.25.)
//  This is beta quality code; users are encouraged to make it faster.
//  ASSUMPTIONS:
//     Code is highly likely to be in the cache; data is not (streaming data)

#define VRSV 256	//	VRSAVE spr
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16
// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap.
#define MIN_OVL 128

// Register usage
#define Rt r0	// 	r0 when used as a temporary register

#define DST r3	// 	entering: dst pointer; exiting: same dst pointer

#define SRC r4	// 	entering: src ptr; then end of src range index (SRC+BC) in memmove

#define BC r5	//	entering: Byte_Count

#define PCS r6	//  	save for partial checksum entering

#define DMS r7	//      dst - src initially
#define BK r7	//  	BC - 1 +/- (n*16)

// Codewarrior will put an unwelcome space as "lbzu	r0,1(r7 )"
// if you don't put the comment right after the r7.  CJC 030314
#define SM1 r8//	src -1 for byte-by-byte forwards initially
#define S r8	//	src[28:31]
#define SMD r8	//      src[0:27]-dst[0:27]
#define STR r8	//	data stream touch block & stride info for Big_loop

#define DM1 r9//	dst -1 for byte-by-byte forwards initially
#define D r9	//	dst[28:31]
#define DNX r9	//	(dst+n*16)[28:31]
#define BL r9	//	second byte_kount index pointer

#define SBC r10//	src + byte count initially then src[28:31]
#define BLK r10	//      temporary data stream touch block & stride info
#define DR r10	//	(dst+16)[0:27]
#define QW r10	//  	number of quad words (vectors)

#define DBC r11//	dst + byte count initially
#define BLL r11	//      temporary data stream touch block & stride info
#define SBK r11	//	(src+byte_count-1)
#define SBR r11	//	(src+byte_count-1)[0:27]
#define DBK r11	//	(dst+byte_count-1) then (dst+byte_count-1)[28:31]
#define BIG r11	//	QW/8 or 128 byte loop count
#define SP8 r11	//      SRC + n*128 (8 QWs) for data streaming after first call

#define RSV r12	//  	storage for VRSAVE register if used

#define VS0   v0	//  	src vector for permuting

#define VS1   v1	//  	src vector for permuting

#define VP3   v2	// 	d - s permute register

#define VPS0  v3	// 	permuted source vector to store

#define VPS1  v4	//  	2nd permuted source vector to store

#define VPS2  v5	//      additional permuted src in Big loop

#define VS2   v6	//  	src vector for permuting
#define VPS3  v6	//      additional permuted src in Big loop

#define VS3   v7	//      additional src load in Big loop
#define VPS4  v7	//      additional permuted src in Big loop

#define VS4   v8	//      additional src load in Big loop
#define VPS5  v8	//      additional permuted src in Big loop

#define VS5   v9	//      additional src load in Big loop
#define VPS6  v9	//      additional permuted src in Big loop

#define VS6   v10	//      additional src load in Big loop
#define VPS7  v10	//      additional permuted src in Big loop

#define VS7   v11	//      additional src load in Big loop

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcbz is a performance boost.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
 // gcc and codewarrior and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba r3,r7    or    dcba DST,BK
#define DCBL .long 0x7c034dec
// dcba r3,r9     or    dcba DST,BL
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
.macro DCBL
.long 0x7c034dec
.endm
#else
#define DCBK dcba DST,BK
#define DCBL dcba DST,BL
#endif  // __ghs__
#endif  // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#define DCBL nop
#endif  // NO_DCBA

// Conditionalize the use of dst (data stream touch).  It will help
// if the data is not in cache and hurt if it is (though not as badly
// as dcbz).  Generally, except for small benchmarks repeated many times,
// we assume data is not in cache (data streaming) and using dst is a
// performance boost.
#ifndef NO_DST
#define STRM_B dst	SBC,BLL,0
#define STRM_F dst	SRC,BLK,0
#define STRM_1 dst	SP8,STR,1

#else
#define STRM_B	nop
#define STRM_F	nop
#define STRM_1	nop
#endif

//  Condition register use
//      cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;);
// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right";
//      cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
// then cr1[2]   = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
// then cr1[2]   = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
//      cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0;  (Overlap too small for Big loop?)
//      cr6[1,2] = (DST-SRC>=BC)?1:0;  (Okay for v_memmove to copy forward?)
// then cr6[2]   = (QW == 0)? 1 : 0; (Any full vectors to move?)
// then cr6[1]   = (QW > 4)? 1 : 0; (>4 vectors to move?)
// then cr6[3]   = (third store[27] == 1)? 1: 0; (cache line alignment)
// then cr6[3]   = (last store[27] == 1)? 1: 0; (last store odd?)
//      cr7[2]   = (BC>MIN_VEC)?1:0;  (BC big enough to warrant vectors)
// then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
// then cr7[1]   = (QW > 14)? 1 : 0; (>14 vectors to move?)
// then cr7[0:3] = (DST+BC)[0:27]  (How many bytes (iff <16) in last vector?)
	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	memmove
memmove:
	nop			// IU1 Compilers forget first label
	.globl	memcpy
memcpy:
#else
	.globl	vec_memmove
vec_memmove:
	nop			// IU1 Only way I know to preserve both labels
	.globl	_vec_memcpy
_vec_memcpy:
#endif
	subf.	DMS,SRC,DST	// IU1 Compute dst-src difference
	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count moves
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count

	addi	SM1,SRC,-1	// IU1 Pre-bias and duplicate src for fwd
	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	add	SBC,SRC,BC	// IU1 Pre-bias and duplicate src for bkwd
	beqlr			// return if DST = SRC

	add	DBC,DST,BC	// IU1 Pre-bias and duplicate destination
	subf	Rt,DST,SRC	// IU1 Form |DST-SRC| if DST-SRC<0
	beqlr	cr1		// return if BC = 0

	bgt	Cpy_bkwd	// b if DST-SRC>0 (have to copy backward)
	cmpi	cr5,0,Rt,MIN_OVL	// IU1 (|DST-SRC|>128)?1:0; for v_memcpy
	bgt	cr7,v_memcpy	// b if BC>MIN_VEC (okay to copy vectors fwd)

// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC
	mtctr	BC		// i=BC; do ...;i--; while (i>0)
Byte_cpy_fwd:
	lbzu	Rt,1(SM1)	// LSU * ++(DST-1) = * ++(SRC-1)
	stbu	Rt,1(DM1)	// LSU
	bdnz	Byte_cpy_fwd

	blr
	nop			// IU1 Improve next label as branch target
Cpy_bkwd:
	cmpi	cr5,0,DMS,MIN_OVL	// IU1 ((DST-SRC)>128)?1:0; for v_memcpy
	cmp	cr6,0,DMS,BC	// IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
	bgt	cr7,v_memmove	// b if BC>MIN_VEC (copy vectors bkwd)
// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
	mtctr	BC		// i=BC; do ...;i--; while (i>0)
Byte_cpy_bwd:
	lbzu	Rt,-1(SBC)	// LSU * --(DST+BC) = * --(SRC+BC)
	stbu	Rt,-1(DBC)	// LSU Store it
	bdnz	Byte_cpy_bwd
	blr

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif

v_memmove:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]
	bge	cr6,MC_entry	// b to v_memcpy if DST-SRC>=BC (fwd copy OK)

#ifdef VRSAVE
	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine
#endif
	lis	BLL,0x010c	// IU1 Stream 12 blocks of 16 bytes
	subf.	SMD,D,S		// IU1 if S-D<0 essentially shifting right

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right
	ori	BLL,BLL,0xffe0	// IU1 Stream stride -32B

	STRM_B			// LSU Start data stream at SRC+BC
	addi	SBK,SBC,-1	// IU1 Address of last src byte
	bgt	Rt_shft		// Bytes from upper vector = (s-d>0)?s-d:16+s-d;
	addi	SMD,SMD,16	// IU1 Save 16-(d-s)
Rt_shft:

	rlwinm	SBR,SBK,0,0,27	// IU1 (SRC+BC-1)[0:27]
	addi	BK,BC,-1	// IU1 Initialize byte index

	subf	Rt,SBR,SBC	// IU1 How many bytes in first source?
	add	DBK,DST,BK	// IU1 Address of last dst byte
	addi	DR,DST,16	// IU1 Address of second dst vector

	subf.	SMD,Rt,SMD	// IU1 if bytes in 1st src>Bytes in 1st permute
	rlwinm	Rt,DBK,0,28,31	// IU1 (DST+BC-1)[28:31]
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]

// If there are more useful bytes in the upper vector of a permute pair than we
// will get in the first permute, the first loaded vector needs to be in the
// lower half of the permute pair.  The upper half is a don't care then.
	blt	Get_bytes_rt	// b if shifting left (D-S>=0)

	lvx	VS1,SRC,BK	// LSU Get SN load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memmove, an example would
// be the call memmove(BASE+0x0F, BASE+0x2F, 82). N = 6 in that case.
	addi	SRC,SRC,-16	// IU1 Decrement src base (to keep BK useful)

Get_bytes_rt:	// Come here to get VS0 & Don't care what VS1 is
	lvx	VS0,SRC,BK	// LSU Get SN-1 (SN if D-S<0) in lower vector
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	cmpi	cr7,0,Rt,0xF	// IU1 Is Dn right justified?

	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	add	Rt,DST,BC	// IU1 Refresh the value of DST+BC

	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-1 and SN to DN
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper
	beq	cr7,Rt_just	// b if DN is right justified

	mtcrf	0x01,Rt		// IU2 Put final vector byte count in cr7
	rlwinm	DBK,DBK,0,0,27	// IU1 Address of first byte of final vector
	li	D,0		// IU1 Initialize an index pointer
	bnl	cr7,Only_1W_bkwd	// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three
	addi	D,D,4		// IU1 increment index
Only_1W_bkwd:
	bng	cr7,Only_2W_bkwd	// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary
	addi	D,D,4		// IU1 increment index
Only_2W_bkwd:
	bne	cr7,Only_B_bkwd	// b if there are no half words to store

	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary
	addi	D,D,2		// IU1 increment index
Only_B_bkwd:
	bns	cr7,All_done_bkwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D	// LSU store one byte if necessary
	b	All_done_bkwd

Rt_just:
	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN
All_done_bkwd:
	addi	BK,BK,-16	// IU1 Decrement destination byte count

	ble	cr6,Last_load	// b if no Quad words to do
	mtctr	QW		// IU2 for (i=0;i<=QW;i++)-execution serializng
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
QW_loop:
	lvx	VS0,SRC,BK	// LSU Get SN-2 (or SN-1 if ADJ==0)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-2 and SN-1 to DN-1
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-1
	addi	BK,BK,-16	// IU1 Decrement byte kount
	bdnzf	25,QW_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+BC-1-16)
	bgt	cr6,GT_4QW	// b if >4 quad words left

Last_load:	// if D-S>=0, next load will be from same address as last
	blt	No_ld_bkwd	// b if shifting right (S-D>=0)
	addi	SRC,SRC,16	// IU1 recorrect source if it was decremented
No_ld_bkwd:
	lvx	VS0,0,SRC	// LSU Get last source SN-6 (guaranteed S0)
// Current 16 bytes is the last; we're done.
	dss	0		// Data stream stop
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-6 and SN-5 to DN-6
	subfic	D,DST,16	// IU1 How many bytes in first destination?
	beq	cr1,Lt_just	// b if last destination is left justified

	mtcrf	0x01,D		// IU2 Put byte count remaining in cr7
	li	D,0		// IU1 Initialize index pointer
	bns	cr7,No_B_bkwd	// b if only even number of bytes to store

	stvebx	VPS0,DST,D	// LSU store first byte at DST+0
	addi	D,D,1		// IU1 increment index
No_B_bkwd:
	bne	cr7,No_H_bkwd	// b if only words to store
	stvehx	VPS0,DST,D	// LSU store halfword at DST+0/1
	addi	D,D,2		// IU1 increment index

No_H_bkwd:
	bng	cr7,No_W1_bkwd	// b if exactly zero or two words to store
	stvewx	VPS0,DST,D	// LSU store word 1 of one or three
	addi	D,D,4		// IU1 increment index

No_W1_bkwd:
	bnl	cr7,No_W2_bkwd	// b if there was only one word to store
	stvewx	VPS0,DST,D	// LSU store word 1 of two or 2 of three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DST,D	// LSU store word 2 of two or 3 of three
	b	No_W2_bkwd

Lt_just:
	stvx	VPS0,0,DST	// LSU Store 16 bytes at final dst addr D0
No_W2_bkwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

GT_4QW:	// Do once if next store is to even half of cache line, else twice

	lvx	VS0,SRC,BK	// LSU Get SN-3 (or SN-2)
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;

	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-3 and SN-2 to Dn-2
	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper
	addi	DNX,DNX,-16	// IU1 Prepare to update cr6 next loop

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-2
	vor	VS3,VS0,VS0	// VIU Make a copy of lower vector
	addi	BK,BK,-16	// IU1 Decrement byte count by 16
	bdnzt	27,GT_4QW	// b if next store is to upper (odd) half of CL
// At this point next store will be to even address.

	lis	STR,0x102	// IU1 Stream 2 blocks of 16 bytes
	mtcrf	0x02,DST	// IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)
	addi	BL,BK,-16	// IU1 Create an alternate byte count - 16

	ori	STR,STR,0xffe0	// IU1 Stream stride -32B
	addi	SP8,SRC,-64	// IU1 Starting address for data stream touch
	bso	cr6,B32_bkwd	// b if DST[27] == 1; i.e, final store is odd

	bdnz	B32_bkwd	// decrement counter for last odd QW store
B32_bkwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	lvx	VS2,SRC,BK	// LSU Get SN-4 (or SN-3)
	addi	SP8,SP8,-32	// IU1 Next starting address for data stream touch

	lvx	VS1,SRC,BL	// LSU Get SN-5 (or SN-4)
	vperm	VPS0,VS2,VS3,VP3	// VPU Align SN-4 and SN-3 to DN-3

	STRM_1			// LSU Stream 64 byte blocks ahead of loads

	DCBL			// LSU allocate next cache line

	vperm	VPS1,VS1,VS2,VP3	// VPU Align SN-5 and SN-4 to DN-4
	vor	VS3,VS1,VS1	// VIU1 Move SN-5 to SN-3

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-3
	addi	BK,BL,-16	// IU1 Decrement byte count
	bdz	Nxt_loc_bkwd	// always decrement and branch to next instr

Nxt_loc_bkwd:
	stvx	VPS1,DST,BL	// LSU Store 16 bytes at DN-4
	addi	BL,BK,-16	// IU1 Decrement alternate byte count
	bdnz	B32_bkwd	// b if there are at least two more QWs to do

	bns	cr6,One_odd_QW	// b if there was one more odd QW to store
	b	Last_load

// Come here with two more loads and two stores to do
One_odd_QW:
	lvx	VS1,SRC,BK	// LSU Get SN-6 (or SN-5)

	vperm	VPS1,VS1,VS3,VP3	// VPU Align SN-6 and SN-5 to DN-5

	stvx	VPS1,DST,BK	// LSU Store 16 bytes at DN-5

	b	Last_load

// End of memmove in AltiVec

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif

v_memcpy:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]

MC_entry:	// enter here from memmove if DST-SRC>=BC; this should be faster
#ifdef VRSAVE
	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine
#endif
	lis	BLK,0x010c	// IU1 Stream 12 blocks of 16 bytes

	subf.	S,S,D		// IU1 if D-S<0 essentially shifting left

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right
	ori	BLK,BLK,32	// IU1 Stream stride 32B

	STRM_F			// LSU Start data stream 0 at SRC
	addi	DR,DST,16	// IU1 Address of second dst vector
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

// If D-S<0 we are "kinda" shifting left with the right shift permute vector
// loaded to VP3 and we need both S0 and S1 to permute.  If D-S>=0 then the
// first loaded vector needs to be in the upper half of the permute pair and
// the lower half is a don't care then.
	bge	Ld_bytes_rt	// b if shifting right (D-S>=0)

	lvx	VS0,0,SRC	// LSU Get S0 load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memcpy, an example would
// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case.
	addi	SRC,SRC,16	// IU1 Increment src base (to keep BK useful)

Ld_bytes_rt:	// Come here to get VS1 & Don't care what VS0 is
	lvx	VS1,0,SRC	// LSU Get S1 (or S0 if D-S>=0) in upper vector
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?

	subf	Rt,DST,DR	// IU1 How many bytes in first destination?
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	li	BK,0		// IU1 Initialize byte kount index

	mtcrf	0x01,Rt		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S0 and S1 to D0

	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
	beq	cr1,Left_just	// b if D0 is left justified

	bns	cr7,No_B_fwd	// b if only even number of bytes to store

	stvebx	VPS0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd	// b if only words to store

	stvehx	VPS0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store

	stvewx	VPS0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd	// b if there was only one word to store
	stvewx	VPS0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	VPS0,DST,BK	// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:
	stvx	VPS0,0,DST	// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	cmpi	cr7,0,QW,14	// IU1 Check QW>14
	ble	cr6,Last_ld_fwd	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
QW_fwd_loop:
	lvx	VS1,SRC,BK	// LSU Get S2 (or S1)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S1 and S2 to D1
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D1(+n*16 where n<4)
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,QW_fwd_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left

Last_ld_fwd:	// Next 16 bytes is the last; we're done.
	add	DBC,DST,BC	// IU1 Recompute address of last dst byte + 1
	add	SBC,SRC,BC	// IU1 Recompute address of last src byte + 1
	bge	No_ld_fwd	// b if shifting right (D-S>=0)

	addi	SBC,SBC,-16	// IU1 if D-S>=0 we didn't add 16 to src
No_ld_fwd:
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7
	addi	DBK,DBC,-1	// IU1 Recompute address of last dst byte
	addi	Rt,SBC,-1	// IU1 Recompute address of last src byte

// If D-S<0 we have already loaded all the source vectors.
// If D-S>=0 then the first loaded vector went to the upper half of the permute
// pair and we need one more vector.  (This may be a duplicate.)

	lvx	VS1,0,Rt	// LSU Get last source S14 (guaranteed SN)

#ifndef NO_DST
	dss	0		// Data stream 0 stop

	dss	1		// Data stream 1 stop
#endif
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D14
	beq	cr1,Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	D,0		// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three
	addi	D,D,4		// IU1 increment index

	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three
	addi	D,D,4		// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary
	addi	D,D,4		// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd	// b if there are no half words to store

	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary
	addi	D,D,2		// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D	// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:

	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry
sl@0
   689
#ifdef __MWERKS__
sl@0
   690
	.align	16
sl@0
   691
#else
sl@0
   692
	.align	4
sl@0
   693
#endif
sl@0
   694
GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice
sl@0
   695
sl@0
   696
	lvx	VS1,SRC,BK	// LSU Get S3 (or S2)
sl@0
   697
	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
sl@0
   698
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
sl@0
   699
	
sl@0
   700
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
sl@0
   701
	addi	Rt,QW,-2	// IU1 Insure at least 2 QW left after big loop
sl@0
   702
sl@0
   703
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S2 and S3 to D2
sl@0
   704
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
sl@0
   705
sl@0
   706
	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D2
sl@0
   707
	addi	BK,BK,16	// IU1 Increment byte count by 16
sl@0
   708
	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL
sl@0
   709
// At this point next store will be to even address.
sl@0
   710
sl@0
   711
	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
sl@0
   712
	lis	STR,0x104	// IU1 Stream 4 blocks of 16 bytes
sl@0
   713
	addi	BL,BK,16	// IU1 Create an alternate byte kount + 32
sl@0
   714
sl@0
   715
	ori	STR,STR,32	// IU1 Stream stride 32B
sl@0
   716
#ifndef NO_BIG_LOOP
sl@0
   717
	rlwinm	BIG,Rt,29,3,31	// IU1 QW/8 big loops to do
sl@0
   718
sl@0
   719
	rlwinm	Rt,Rt,0,0,28	// IU1 How many QWs will be done in big loop
sl@0
   720
	bgt	cr7,Big_loop	// b if QW > 14
sl@0
   721
#endif
sl@0
   722
No_big_loop:
sl@0
   723
// We need the ctr register to reflect an even byte count before entering
sl@0
   724
// the next block - faster to decrement than to reload.
sl@0
   725
sl@0
   726
	addi	SP8,SRC,256	// IU1 Starting address for data stream touch
sl@0
   727
	xoris	STR,STR,0x6	// IU1 Reset stream to 2 blocks of 16 bytes
sl@0
   728
	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e, final store is even
sl@0
   729
sl@0
   730
	bdnz	B32_fwd		// decrement counter for last QW store odd
sl@0
   731
sl@0
   732
B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
sl@0
   733
	lvx	VS1,SRC,BK	// LSU Get S12
sl@0
   734
	addi	SP8,SP8,32	// IU1 Next starting address for data stream touch
sl@0
   735
sl@0
   736
	lvx	VS2,SRC,BL	// LSU Get S13
sl@0
   737
	vperm	VPS1,VS0,VS1,VP3	// VPU Align S11 and S12 to D11
sl@0
   738
sl@0
   739
	STRM_1			// LSU Stream 64 byte blocks ahead of loads
sl@0
   740
sl@0
   741
	DCBK			// LSU then Kill instead of RWITM
sl@0
   742
sl@0
   743
	vperm	VPS0,VS1,VS2,VP3	// VPU Align S12 and S13 to D12
sl@0
   744
	vor	VS0,VS2,VS2	// VIU1 Move S13 to S11
sl@0
   745
sl@0
   746
	stvx	VPS1,DST,BK	// LSU Store 16 bytes at D11
sl@0
   747
	addi	BK,BL,16	// IU1 Increment byte count
sl@0
   748
	bdz	Nxt_loc_fwd	// always decrement and branch to next instr		
sl@0
   749
sl@0
   750
Nxt_loc_fwd:
sl@0
   751
	stvx	VPS0,DST,BL	// LSU Store 16 bytes at D12
sl@0
   752
	addi	BL,BK,16	// IU1 Increment alternate byte count
sl@0
   753
	bdnz	B32_fwd		// b if there are at least two more QWs to do
sl@0
   754
sl@0
   755
	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
sl@0
   756
	b	Last_ld_fwd	// b if last store is to even address
sl@0
   757
sl@0
   758
// Come here with two more loads and two stores to do
sl@0
   759
One_even_QW:
sl@0
   760
	lvx	VS1,SRC,BK	// LSU Get S14 (or S13 if if D-S>=0)
sl@0
   761
sl@0
   762
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D13
sl@0
   763
	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
sl@0
   764
sl@0
   765
	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D13
sl@0
   766
	addi	BK,BK,16	// IU1 Increment byte count
sl@0
   767
sl@0
   768
	b	Last_ld_fwd
sl@0
   769
sl@0
   770
#ifdef __MWERKS__
sl@0
   771
	.align	16
sl@0
   772
#else
sl@0
   773
	.align	4
sl@0
   774
#endif
sl@0
   775
Big_loop:
sl@0
   776
	subf	QW,Rt,QW	// IU1 Should be 2-7 QWs left after big loop
sl@0
   777
	blt	cr5,No_big_loop	// b back if |DST-SRC|<128; Big_loop won't work.
sl@0
   778
	mtctr	BIG		// IU2 loop for as many 128B loops as possible
sl@0
   779
	addi	SP8,SRC,256	// IU1 Starting address for data stream touch
sl@0
   780
sl@0
   781
Loop_of_128B:	// Come here with QW>=10 and next store even; VS0 last load
sl@0
   782
	lvx	VS1,SRC,BK	// LSU Get S4 (or S3 if D-S>=0)
sl@0
   783
	addi	BL,BK,32	// IU1 Increment Byte_Kount+16 by 32	
sl@0
   784
	addi	SP8,SP8,128	// IU1 increment address for data stream touch
sl@0
   785
sl@0
   786
	lvx	VS3,SRC,BL	// LSU Get S6 (or S5)
sl@0
   787
	addi	BL,BL,32	// IU1 Increment Byte_Kount+48 by 32	
sl@0
   788
sl@0
   789
	lvx	VS5,SRC,BL	// LSU Get S8 (or S7)
sl@0
   790
	addi	BL,BL,32	// IU1 Increment Byte_Kount+80 by 32	
sl@0
   791
sl@0
   792
	lvx	VS7,SRC,BL	// LSU Get S10 (or S9)
sl@0
   793
	addi	BL,BK,16	// IU1 Increment Byte_Kount+16 by 16	
sl@0
   794
sl@0
   795
	lvx	VS2,SRC,BL	// LSU Get S5 (or S4)
sl@0
   796
	addi	BL,BL,32	// IU1 Increment Byte_Kount+32 by 32	
sl@0
   797
sl@0
   798
	lvx	VS4,SRC,BL	// LSU Get S7 (or S6)
sl@0
   799
	addi	BL,BL,32	// IU1 Increment Byte_Kount+64 by 32	
sl@0
   800
	
sl@0
   801
	lvx	VS6,SRC,BL	// LSU Get S9 (or S8)
sl@0
   802
	addi	BL,BL,32	// IU1 Increment Byte_Kount+96 by 32	
sl@0
   803
	vperm	VPS0,VS0,VS1,VP3	// VPU
sl@0
   804
sl@0
   805
	lvx	VS0,SRC,BL	// LSU Get S11 (or S10)
sl@0
   806
	vperm	VPS1,VS1,VS2,VP3	// VPU
sl@0
   807
sl@0
   808
	STRM_1			// LSU Stream 4 32B blocks, stride 32B
sl@0
   809
sl@0
   810
	DCBK			// LSU then Kill instead of RWITM
sl@0
   811
sl@0
   812
	stvx	VPS0,DST,BK	// LSU Store D3
sl@0
   813
	addi	BK,BK,16	// IU1 Increment Byte_Kount+16 by 16	
sl@0
   814
	vperm	VPS2,VS2,VS3,VP3	// VPU
sl@0
   815
sl@0
   816
	stvx	VPS1,DST,BK	// LSU Store D4
sl@0
   817
	addi	BK,BK,16	// IU1 Increment Byte_Kount+32 by 16	
sl@0
   818
	vperm	VPS3,VS3,VS4,VP3	// VPU
sl@0
   819
sl@0
   820
	DCBK			// LSU then Kill instead of RWITM
sl@0
   821
sl@0
   822
	stvx	VPS2,DST,BK	// LSU Store D5
sl@0
   823
	addi	BK,BK,16	// IU1 Increment Byte_Kount+48 by 16	
sl@0
   824
	vperm	VPS4,VS4,VS5,VP3	// VPU
sl@0
   825
sl@0
   826
	stvx	VPS3,DST,BK	// LSU Store D6
sl@0
   827
	addi	BK,BK,16	// IU1 Increment Byte_Kount+64 by 16	
sl@0
   828
	vperm	VPS5,VS5,VS6,VP3	// VPU
sl@0
   829
sl@0
   830
	DCBK			// LSU then Kill instead of RWITM
sl@0
   831
sl@0
   832
	stvx	VPS4,DST,BK	// LSU Store D7
sl@0
   833
	addi	BK,BK,16	// IU1 Increment Byte_Kount+80 by 16	
sl@0
   834
	vperm	VPS6,VS6,VS7,VP3	// VPU
sl@0
   835
sl@0
   836
	stvx	VPS5,DST,BK	// LSU Store D8
sl@0
   837
	addi	BK,BK,16	// IU1 Increment Byte_Kount+96 by 16	
sl@0
   838
	vperm	VPS7,VS7,VS0,VP3	// VPU
sl@0
   839
sl@0
   840
	DCBK			// LSU then Kill instead of RWITM
sl@0
   841
sl@0
   842
	stvx	VPS6,DST,BK	// LSU Store D9
sl@0
   843
	addi	BK,BK,16	// IU1 Increment Byte_Kount+112 by 16	
sl@0
   844
sl@0
   845
	stvx	VPS7,DST,BK	// LSU Store D10
sl@0
   846
	addi	BK,BK,16	// IU1 Increment Byte_Kount+128 by 16	
sl@0
   847
	bdnz	Loop_of_128B	// b if ctr > 0 (QW/8 still > 0)
sl@0
   848
sl@0
   849
	mtctr	QW		// IU1 Restore QW remaining to counter
sl@0
   850
	addi	BL,BK,16	// IU1 Create an alternate byte kount + 16
sl@0
   851
	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e, final store is even
sl@0
   852
sl@0
   853
	bdnz	B32_fwd		// b and decrement counter for last QW store odd
sl@0
   854
				// One of the above branches should have taken
sl@0
   855
sl@0
   856
// End of memcpy in AltiVec
sl@0
   857
sl@0
   858
// bcopy works like memcpy, but the source and destination operands are reversed.
sl@0
   859
// Following will just reverse the operands and branch to memcpy.
sl@0
   860
sl@0
   861
//------------------------------------------------------------------
// bcopy(src, dst, len) -- BSD-style copy entry point.
// bcopy takes (source, destination, length) while memcpy takes
// (destination, source, length); per the comments above, the first two
// arguments arrive in r3 and r4.  Swap them through scratch reg Rt,
// then tail-branch into the memcpy implementation earlier in this file.
// NOTE(review): memcpy returns its r3 argument, so after the swap the
// value returned to bcopy's caller is bcopy's original src -- confirm
// callers do not rely on bcopy's (historically void) return value.
//------------------------------------------------------------------
#ifdef LIBMOTOVEC
sl@0
   862
	.globl	bcopy     
sl@0
   863
bcopy:
sl@0
   864
#else
sl@0
   865
	.globl	vec_bcopy     
sl@0
   866
vec_bcopy:
sl@0
   867
#endif
sl@0
   868
	mr	Rt,DST		// save bcopy's 1st arg (really the source, in r3)
sl@0
   869
	mr	DST,SRC		// bcopy's 2nd arg (destination) -> r3, memcpy's dst
sl@0
   870
	mr	SRC,Rt		// saved source -> r4; swap of dst/src now complete
sl@0
   871
#ifdef LIBMOTOVEC
sl@0
   872
	b	memcpy		// tail-branch to memcpy with (dst,src) in r3,r4	
sl@0
   873
#else
sl@0
   874
	b	_vec_memcpy	// tail-branch to vec_memcpy with (dst,src) in r3,r4	
sl@0
   875
#endif
sl@0
   876
// End of bcopy in AltiVec