os/ossrv/genericopenlibs/liboil/src/motovec/vec_memset.s
//------------------------------------------------------------------
// file:  vec_memset.S
//    AltiVec enabled version of memset and bzero and cacheable_memzero
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2002
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.   Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void *memset( void *ptr, int val, size_t len );
//   Copies val into each of len characters beginning at ptr.
//                                       - Harbison&Steele 4th ed
//    (despite val being an int, this memset assumes it is never
//     more than a byte.  That seems to be correct from all the
//     memset functions I've seen but I don't know if ANSI allows
//     anything longer.     Chuck Corley  12/21/02)
// Returns:
//  void * ptr
//------------------------------------------------------------------
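//
// Editor's note: as a reference for the interface described above (not part of the
// original Motorola source), a minimal C sketch of the behavior this routine
// implements, assuming val is truncated to a single byte as noted:
//
//   #include <stddef.h>                           /* size_t */
//
//   void *memset_ref(void *ptr, int val, size_t len)
//   {
//       unsigned char *p = (unsigned char *)ptr;
//       unsigned char  b = (unsigned char)val;    /* only the low 8 bits are used */
//       for (size_t i = 0; i < len; i++)
//           p[i] = b;
//       return ptr;                               /* memset returns its first argument */
//   }
//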

//------------------------------------------------------------------
// extern void * bzero( char *ptr, int len);
//   Copies 0 into each of len characters at ptr.
//                                       - Harbison&Steele 4th ed
// Returns:
//  void * ptr
//------------------------------------------------------------------
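//
// Editor's note (sketch, not Motorola source): the bzero entry point later in this
// file simply loads a zero fill byte and branches into memset, so its effect is:
//
//   #include <string.h>
//
//   void *bzero_ref(char *ptr, int len)
//   {
//       return memset(ptr, 0, (size_t)len);       /* FILL=0, then fall into memset */
//   }
//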

// Revision History:
//    Rev 0.0	Original                        Chuck Corley	02/09/03
//              Could benefit from changes added to memcpy
//    Rev 0.1	Revised per memcpy Rev 0.30     Chuck Corley	05/01/03
//
//  This is beta quality code; users are encouraged to make it faster.
//  ASSUMPTIONS:
//     Code is highly likely to be in the cache; data is not (streaming data)
//     Zero fill could be quite likely.
//     Moving the fill byte from a GPR to a VR as below is faster than stw->lvebx via the stack
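//
// Editor's note (illustration, not Motorola source): the lvsl/vslb/vor/vspltb
// sequence in v_memset below rebuilds the fill byte inside a vector register and
// replicates it to all 16 lanes; the resulting v0 is equivalent to this C:
//
//   unsigned char b = (unsigned char)fill;        /* fill arrives in GPR r4 (FILL) */
//   unsigned char v0[16];
//   for (int i = 0; i < 16; i++)
//       v0[i] = b;                                /* 16 copies of the fill byte */
//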

#define VRSV 256	//	VRSAVE spr
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16

// Register usage
#define Rt r0	// 	r0 when used as a temporary register

#define DST r3	// 	entering: dest pointer; exiting: same dest pointer

#define FILL r4	// 	entering: fill char then fill word

#define BC r5	//	entering: Byte_Count then remaining Byte_Count

#define DBC r6	//	dst + byte count

#define BK r7	//  	BC - 1 +/- (n*16)

#define Fsh r8	//	fill byte shifted right one nibble

#define DM1 r9	//	dst -1 for byte-by-byte backwards initially
#define D r9	//	(dst+16)[0:27] - dst[28:31]
#define DNX r9	//	(dst+n*16)[28:31]
#define BL r9	//	second byte_kount index pointer

#define DR r10	//	(dst+16)[0:27]
#define QW r10	//  	number of cache lines

#define DBK r11	//	(dst+byte_count-1) then (dst+byte_count-1)[28:31]

#define RSV r12	//  	storage for VRSAVE register if used

//  Condition register use (not including temporary cr0)
//      cr0[2]   = (FILL==0)?
//      cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
// then cr1[2]   = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
// then cr1[2]   = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
//      cr6[2]   = (QW == 0)? 1 : 0;
// then cr6[1]   = (QW > 4)? 1 : 0; (>4 vectors to move?)
// then cr6[3]   = (third store[27] == 1)? 1: 0; (cache line alignment)
// then cr6[3]   = (last store[27] == 1)? 1: 0; (last store odd?)
//      cr7[2]   = (BC>MIN_VEC)?1:0;  (BC big enough to warrant vectors)
// then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
// then cr7[0:3] = (DST+BC)[28:31]  (How many bytes (iff <16) in last vector?)
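//
// Editor's sketch (not Motorola source) of the byte accounting behind the fields
// above, in C terms; dst is the entry pointer (DST) and len the byte count (BC):
//
//   #include <stdint.h>
//
//   uintptr_t d    = (uintptr_t)dst;
//   unsigned  D    = (unsigned)(((d + 16) & ~(uintptr_t)15) - d);   /* 1..16 bytes in the first vector store */
//   unsigned  QW   = (unsigned)((d + len - 1 - ((d + 16) & ~(uintptr_t)15)) >> 4);
//                    /* quadword stores issued by the main loop; the final,
//                       possibly partial vector is handled separately */
//   unsigned  last = (unsigned)((d + len) & 15);  /* 0 => the last byte is right justified */
//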

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// We use dcba which will noop to non-cacheable memory rather than
// dcbz which will cause an alignment exception.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
 // gcc and codewarrior and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba r3,r7    or    dcba DST,BK
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
#else
#define DCBK dcba DST,BK
#endif  // __ghs__
#endif  // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif  // NO_DCBA
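//
// Editor's note (derivation, not Motorola source): the hard-coded 0x7c033dec above is
// the machine encoding of "dcba DST,BK" (i.e. dcba r3,r7) for assemblers that do not
// accept the mnemonic:
//
//   /* primary opcode 31, RA=3 (DST=r3), RB=7 (BK=r7), extended opcode 758 (dcba) */
//   unsigned insn = (31u << 26) | (3u << 16) | (7u << 11) | (758u << 1);  /* == 0x7C033DEC */
//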

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	memset
memset:
#else
	.globl	_vec_memset
_vec_memset:
#endif

	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count
	rlwinm.	Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift

	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16	// IU1 Address of second dst vector
	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
	bgt	cr7,v_memset	// b if BC>MIN_VEC

	mtctr	BC		// for (i=1;i<=BC;i++)
	beqlr	cr1		// return if BC = 0
Byte_set:
	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
	bdnz	Byte_set

	blr

v_memset:
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0xe000	// IU1 Or in registers used by this routine
#endif
	subf	D,DST,DR	// IU1 How many bytes in first destination?
	li	BK,0		// IU1 Initialize byte kount index

#ifdef VRSAVE
	mtspr	VRSV,Rt	// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0	// VIU Clear v0
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
	beq+	enter_bzero	// b if FILL==0

	lvsl	v0,0,Fsh	// LSU Move upper nibble to byte 0 of VR
	vspltisb	v1,4	// VPU Splat 0x4 to every byte

	lvsl	v2,0,FILL	// LSU Move lower nibble to byte 0 of VR

	vslb	v0,v0,v1	// VIU Move upper nibble to VR[0:3]

	vor	v0,v0,v2	// VIU Form FILL byte in VR[0:7]

	vspltb	v0,v0,0		// VPU Splat the fill byte to all bytes
enter_bzero:
	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	beq	cr1,Left_just	// b if D0 is left justified

	bns	cr7,No_B_fwd	// b if only even number of bytes to store

	stvebx	v0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd	// b if only words to store

	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store

	stvewx	v0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd	// b if there was only one word to store
	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:
	stvx	v0,0,DST	// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	ble	cr6,Last_QW	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4

QW_loop:
	stvx	v0,DST,BK	// LSU Store 16 fill bytes
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,QW_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left

Last_QW:	// Next vector is the last; we're done.
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7

	beq	cr1,Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	BL,0		// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
	addi	BL,BL,4		// IU1 increment index

	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
	addi	BL,BL,4		// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
	addi	BL,BL,4		// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
	addi	BL,BL,2		// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL	// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:

	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
GT_4QW_fwd:	// Do once if next store is to odd half of cache line, else twice

	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop

	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
	bdnz	B32_fwd		// decrement counter for last QW store odd

B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	DCBK			// LSU then Kill instead of RWITM

	stvx	v0,DST,BK	// LSU Store 16 bytes at D11
	addi	BK,BK,16	// IU1 Increment byte count
	bdz	Nxt_loc_fwd	// always decrement and branch to next instr

Nxt_loc_fwd:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D12
	addi	BK,BK,16	// IU1 Increment byte count
	bdnz	B32_fwd		// b if there are at least two more QWs to do

	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
	b	Last_QW		// b if last store is to even address

// Come here with two more stores to do
One_even_QW:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	Last_QW

// End of memset in AltiVec

#define BCz r4		// in bzero r4 enters with byte count

#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	bzero
bzero:
#else
	.globl	vec_bzero
vec_bzero:
#endif

	mr	BC,BCz		// IU1 arg[2] is BC here, not FILL
	li	FILL,0		// IU1 for bzero FILL=0
#ifdef LIBMOTOVEC
	b	memset
#else
	b	_vec_memset
#endif

// cacheable_memzero will employ dcbz to clear 32 bytes at a time
// of cacheable memory. Like bzero, second entering argument will be BC.
// Using this for non-cacheable memory will generate an alignment exception.
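//
// Editor's sketch (not Motorola source): the net effect is an ordinary zero fill,
//
//   #include <string.h>
//
//   void cacheable_memzero_ref(char *ptr, int len)
//   {
//       memset(ptr, 0, (size_t)len);              /* same result as bzero */
//   }
//
// the difference is in how the bulk of a large buffer is cleared: the routine below
// issues one dcbz per 32-byte cache line, which establishes a zeroed line without
// first reading it from memory, and falls back to vector stores for the unaligned
// head and tail and for short runs.
//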

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	cacheable_memzero
cacheable_memzero:
#else
	.globl	vec_cacheable_memzero
vec_cacheable_memzero:
#endif

	mr	BC,BCz		// IU1 arg[2] is BC here, not FILL
	li	FILL,0		// IU1 for bzero FILL=0
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count

	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count

	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16	// IU1 Address of second dst vector
	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
	bgt	cr7,c_v_memset	// b if BC>MIN_VEC

	mtctr	BC		// for (i=1;i<=BC;i++)
	beqlr	cr1		// return if BC = 0
c_Byte_set:
	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
	bdnz	c_Byte_set

	blr

c_v_memset:
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0x8000	// IU1 Or in registers used by this routine
#endif
	subf	D,DST,DR	// IU1 How many bytes in first destination?
	li	BK,0		// IU1 Initialize byte kount index

#ifdef VRSAVE
	mtspr	VRSV,Rt	// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0	// VIU Clear v0
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?

	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	beq	cr1,c_Left_just	// b if D0 is left justified

	bns	cr7,c_No_B_fwd	// b if only even number of bytes to store

	stvebx	v0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
c_No_B_fwd:
	bne	cr7,c_No_H_fwd	// b if only words to store

	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
c_No_H_fwd:
	bng	cr7,c_No_W1_fwd	// b if exactly zero or two words to store

	stvewx	v0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

c_No_W1_fwd:
	bnl	cr7,c_No_W2_fwd	// b if there was only one word to store
	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
	b	c_No_W2_fwd

c_Left_just:
	stvx	v0,0,DST	// LSU Store 16 bytes at D0
c_No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	ble	cr6,c_Last_QW	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4

c_QW_loop:
	stvx	v0,DST,BK	// LSU Store 16 fill bytes
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,c_QW_loop	// b if 4 or less quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,c_GT_4QW_fwd	// b if >4 quad words left

c_Last_QW:	// Next vector is the last; we're done.
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7

	beq	cr1,c_Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	BL,0		// IU1 Initialize index pointer
	bnl	cr7,c_Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
	addi	BL,BL,4		// IU1 increment index

	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
	addi	BL,BL,4		// IU1 increment index
c_Only_1W_fwd:
	bng	cr7,c_Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
	addi	BL,BL,4		// IU1 increment index
c_Only_2W_fwd:
	bne	cr7,c_Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
	addi	BL,BL,2		// IU1 increment index
c_Only_B_fwd:
	bns	cr7,c_All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL	// LSU store one byte if necessary
	b	c_All_done_fwd

c_Rt_just_fwd:

	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
c_All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
c_GT_4QW_fwd:	// Do once if next store is to odd half of cache line, else twice

	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop

	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,c_GT_4QW_fwd	// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,c_B32_fwd	// b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
	bdnz	c_B32_fwd	// decrement counter for last QW store odd

c_B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	dcbz	DST,BK		// LSU zero whole cache line
	bdz	c_Nxt_loc_fwd	// always decrement and branch to next instr

c_Nxt_loc_fwd:
	addi	BK,BK,32	// IU1 Increment byte count
	bdnz	c_B32_fwd	// b if there are at least two more QWs to do

	bso	cr6,c_One_even_QW	// b if there is one even and one odd QW to store
	b	c_Last_QW		// b if last store is to even address

// Come here with two more stores to do
c_One_even_QW:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	c_Last_QW

// End of cacheable_memzero in AltiVec