//------------------------------------------------------------------
// file: vec_memset.S
// AltiVec enabled version of memset and bzero and cacheable_memzero
//------------------------------------------------------------------

//------------------------------------------------------------------
// Copyright Motorola, Inc. 2002
// ALL RIGHTS RESERVED
//
// You are hereby granted a copyright license to use, modify, and
// distribute the SOFTWARE so long as this entire notice is retained
// without alteration in any modified and/or redistributed versions,
// and that such modified versions are clearly identified as such.
// No licenses are granted by implication, estoppel or otherwise under
// any patents or trademarks of Motorola, Inc.
//
// The SOFTWARE is provided on an "AS IS" basis and without warranty.
// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
// for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void *memset( void *ptr, int val, size_t len );
// Copies val into each of len characters beginning at ptr.
// - Harbison&Steele 4th ed
// (despite val being an int, this memset uses only its low-order
// byte. That matches ANSI C, which specifies that memset converts
// val to unsigned char before filling. Chuck Corley 12/21/02)
// Returns:
// void * ptr
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * bzero( char *ptr, int len);
// Copies 0 into each of len characters at ptr.
// - Harbison&Steele 4th ed
// Returns:
// void * ptr
//------------------------------------------------------------------
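
// For reference, a minimal C sketch of the semantics implemented
// here (the scalar meaning only, not the vector algorithm):
//
//   void *memset(void *ptr, int val, size_t len)
//   {
//       unsigned char *p = (unsigned char *)ptr;
//       unsigned char  b = (unsigned char)val;   // low byte only
//       while (len--)
//           *p++ = b;
//       return ptr;                              // original pointer
//   }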

// Revision History:
// Rev 0.0 Original Chuck Corley 02/09/03
// Could benefit from changes added to memcpy
// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
//
// This is beta quality code; users are encouraged to make it faster.
// ASSUMPTIONS:
// Code is highly likely to be in the cache; data is not (streaming data)
// Zero fill could be quite likely.
// Moving the fill byte from a GPR to a VR as below is faster than
// stw -> lvebx via the stack.
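
// Overall strategy, as C-like pseudocode (an outline of the code
// below, with the alignment arithmetic simplified):
//
//   if (BC <= MIN_VEC) {              // small sets: scalar stbu loop
//       while (BC--) *dst++ = FILL;
//   } else {
//       splat FILL across v0;         // skipped when FILL == 0
//       store first, possibly partial, 16-byte vector;
//       store full vectors, claiming cache lines with dcba;
//       store last, possibly partial, 16-byte vector;
//   }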

#define VRSV 256 // VRSAVE spr
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16

// Register usage
#define Rt r0 // r0 when used as a temporary register

#define DST r3 // entering: dest pointer; exiting: same dest pointer

#define FILL r4 // entering: fill char then fill word

#define BC r5 // entering: Byte_Count then remaining Byte_Count

#define DBC r6 // dst + byte count

#define BK r7 // BC - 1 +/- (n*16)

#define Fsh r8 // fill byte shifted right one nibble

#define DM1 r9 // dst - 1 for byte-by-byte backwards initially
#define D r9 // (dst+16)[0:27] - dst[28:31]
#define DNX r9 // (dst+n*16)[28:31]
#define BL r9 // second byte_kount index pointer

#define DR r10 // (dst+16)[0:27]
#define QW r10 // number of quad words (vectors) to store

#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]

#define RSV r12 // storage for VRSAVE register if used

// Condition register use (not including temporary cr0)
// cr0[2] = (FILL==0)?
// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
// then cr1[2] = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0; (DN right justified)
// cr6[2] = (QW == 0)? 1 : 0;
// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
// then cr6[3] = (third store[27] == 1)? 1 : 0; (cache line alignment)
// then cr6[3] = (last store[27] == 1)? 1 : 0; (last store odd?)
// cr7[2] = (BC>MIN_VEC)? 1 : 0; (BC big enough to warrant vectors)
// then cr7[0:3] = ((DST+16)[0:27]-DST)[28:31] (How many bytes (iff <16) in first vector?)
// then cr7[0:3] = (DST+BC)[28:31] (How many bytes (iff <16) in last vector?)

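// How the partial-vector stores are steered (explanatory note):
// mtcrf 0x01,D (and later mtcrf 0x01,DBC) copies the low nibble of
// the residual byte count into cr7, so cr7's LT,GT,EQ,SO bits hold
// its 8s, 4s, 2s, and 1s. The code below then selects stores with
// the "branch if bit clear" mnemonics:
//   bnl cr7 (LT clear) -> 8s bit clear: no pair of words to store
//   bng cr7 (GT clear) -> 4s bit clear: word count is even (0 or 2)
//   bne cr7 (EQ clear) -> 2s bit clear: no halfword to store
//   bns cr7 (SO clear) -> 1s bit clear: no lone byte to store
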
// Conditionalize the use of dcba. It will help if the data is
// not in cache and hurt if it is. Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// We use dcba, which will no-op to non-cacheable memory, rather than
// dcbz, which would cause an alignment exception.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc and codewarrior and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba r3,r7 or dcba DST,BK
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
#else
#define DCBK dcba DST,BK
#endif // __ghs__
#endif // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif // NO_DCBA

        .text
#ifdef __MWERKS__
        .align 32
#else
        .align 5
#endif

#ifdef LIBMOTOVEC
        .globl memset
memset:
#else
        .globl _vec_memset
_vec_memset:
#endif

        cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
        cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
        rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift

        addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
        addi DR,DST,16 // IU1 Address of second dst vector
        add DBC,DST,BC // IU1 Address of last dst byte + 1
        bgt cr7,v_memset // b if BC>MIN_VEC

        mtctr BC // for (i=1;i<=BC;i++)
        beqlr cr1 // return if BC = 0
Byte_set:
        stbu FILL,1(DM1) // LSU *(++(DST-1)) = FILL
        bdnz Byte_set

        blr

v_memset:
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
        mfspr RSV,VRSV // IU2 Get current VRSAVE contents
#endif
        rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
        addi DBK,DBC,-1 // IU1 Address of last dst byte

#ifdef VRSAVE
        oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine
#endif
        subf D,DST,DR // IU1 How many bytes in first destination?
        li BK,0 // IU1 Initialize byte kount index

#ifdef VRSAVE
        mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
#endif
        vxor v0,v0,v0 // VIU Clear v0
        subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
        cmpi cr1,0,D,16 // IU1 Is D0 left justified?
        beq+ enter_bzero // b if FILL==0

        lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR
        vspltisb v1,4 // VPU Splat 0x4 to every byte

        lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR

        vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3]

        vor v0,v0,v2 // VIU Form FILL byte in VR[0:7]

        vspltb v0,v0,0 // VPU Splat the fill byte to all bytes
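
// Worked example of the splat above (explanatory note): for
// FILL = 0x5A, Fsh holds the upper nibble in its low four bits, so
// lvsl with an EA ending in 0x5 puts 0x05 in v0[0], and lvsl with
// FILL's EA ending in 0xA puts 0x0A in v2[0]. vslb makes v0[0] =
// 0x50, vor makes it 0x5A, and vspltb copies that byte to all 16
// lanes. lvsl serves here as a cheap "load constant 0..15 into a VR".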
enter_bzero:
        mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
        rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
        beq cr1,Left_just // b if D0 is left justified

        bns cr7,No_B_fwd // b if only even number of bytes to store

        stvebx v0,DST,BK // LSU store first byte at DST+0
        addi BK,BK,1 // IU1 increment index
No_B_fwd:
        bne cr7,No_H_fwd // b if only words to store

        stvehx v0,DST,BK // LSU store halfword at DST+0/1
        addi BK,BK,2 // IU1 increment index
No_H_fwd:
        bng cr7,No_W1_fwd // b if exactly zero or two words to store

        stvewx v0,DST,BK // LSU store word 1 of one or three
        addi BK,BK,4 // IU1 increment index

No_W1_fwd:
        bnl cr7,No_W2_fwd // b if there was only one word to store
        stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
        addi BK,BK,4 // IU1 increment index

        stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
        b No_W2_fwd

Left_just:
        stvx v0,0,DST // LSU Store 16 bytes at D0
No_W2_fwd:
        rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
        cmpi cr6,0,QW,0 // IU1 Any full vectors to move?

        li BK,16 // IU1 Re-initialize byte kount index
        cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
        ble cr6,Last_QW // b if no Quad words to do

        mtctr QW // IU2 for (i=0;i<=QW;i++)
        cmpi cr6,0,QW,4 // IU1 Check QW>4

QW_loop:
        stvx v0,DST,BK // LSU Store 16 fill bytes
        addi BK,BK,16 // IU1 Increment byte kount index
        bdnzf 25,QW_loop // b if 4 or fewer quad words to do
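
// Note on bdnzf 25 (explanatory): it decrements CTR and branches
// while CTR != 0 and CR bit 25 (cr6[1], set by "cmpi cr6,0,QW,4"
// when QW > 4) is clear. Short fills therefore finish in this
// simple loop, while fills of more than four quad words fall
// through after one store to the cache-line code at GT_4QW_fwd.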

        add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
        addi QW,QW,-1 // IU1 One more QW stored by now
        bgt cr6,GT_4QW_fwd // b if >4 quad words left

Last_QW: // Next vector is the last; we're done.
        mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7

        beq cr1,Rt_just_fwd // b if last destination is right justified

        rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
        li BL,0 // IU1 Initialize index pointer
        bnl cr7,Only_1W_fwd // b if there was only one or zero words to store

        stvewx v0,DBK,BL // LSU store word 1 of two or three
        addi BL,BL,4 // IU1 increment index

        stvewx v0,DBK,BL // LSU store word 2 of two or three
        addi BL,BL,4 // IU1 increment index
Only_1W_fwd:
        bng cr7,Only_2W_fwd // b if there were only two or zero words to store

        stvewx v0,DBK,BL // LSU store word 3 of three if necessary
        addi BL,BL,4 // IU1 increment index
Only_2W_fwd:
        bne cr7,Only_B_fwd // b if there are no half words to store

        stvehx v0,DBK,BL // LSU store one halfword if necessary
        addi BL,BL,2 // IU1 increment index
Only_B_fwd:
        bns cr7,All_done_fwd // b if there are no bytes to store

        stvebx v0,DBK,BL // LSU store one byte if necessary
        b All_done_fwd

Rt_just_fwd:

        stvx v0,DST,BK // LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
        mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
        blr // Return destination address from entry

#ifdef __MWERKS__
        .align 16
#else
        .align 4
#endif
GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice

        addi QW,QW,-1 // IU1 Keeping track of QWs stored
        mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
        addi DNX,DNX,16 // IU1 Update cr6 for next loop

        stvx v0,DST,BK // LSU Store 16 bytes at D2
        addi BK,BK,16 // IU1 Increment byte count by 16
        bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL

        mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

        bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
        bdnz B32_fwd // decrement counter when the last QW store is odd

B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
        DCBK // LSU Allocate ("kill") the cache line instead of RWITM

        stvx v0,DST,BK // LSU Store 16 bytes at D11
        addi BK,BK,16 // IU1 Increment byte count
        bdz Nxt_loc_fwd // always decrement and branch to next instr

Nxt_loc_fwd:
        stvx v0,DST,BK // LSU Store 16 bytes at D12
        addi BK,BK,16 // IU1 Increment byte count
        bdnz B32_fwd // b if there are at least two more QWs to do

        bso cr6,One_even_QW // b if there is one even and one odd QW to store
        b Last_QW // b if last store is to even address

// Come here with two more stores to do
One_even_QW:
        stvx v0,DST,BK // LSU Store 16 bytes at D13
        addi BK,BK,16 // IU1 Increment byte count

        b Last_QW

// End of memset in AltiVec

#define BCz r4 // in bzero r4 enters with byte count

#ifdef __MWERKS__
        .align 32
#else
        .align 5
#endif

#ifdef LIBMOTOVEC
        .globl bzero
bzero:
#else
        .globl vec_bzero
vec_bzero:
#endif

        mr BC,BCz // IU1 arg[2] is BC here, not FILL
        li FILL,0 // IU1 for bzero FILL=0
#ifdef LIBMOTOVEC
        b memset
#else
        b _vec_memset
#endif
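
// In C terms, bzero is a tail call into memset (sketch of intent):
//
//   void *bzero(char *ptr, int len)
//   {
//       return memset(ptr, 0, len);   // r4 (len) moves to r5; r4 = 0
//   }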

// cacheable_memzero will employ dcbz to clear 32 bytes at a time
// of cacheable memory. Like bzero, its second entering argument is BC.
// Using this on non-cacheable memory will generate an alignment exception.

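// (Explanatory note, assuming the 32-byte cache blocks this file
// already relies on: each dcbz below zeroes a whole cache block at
// once, so the inner loop advances BK by 32 and needs no stvx for
// the aligned middle of the buffer. dcbz traps on caching-inhibited
// or write-through pages, hence the warning above.)
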
        .text
#ifdef __MWERKS__
        .align 32
#else
        .align 5
#endif

#ifdef LIBMOTOVEC
        .globl cacheable_memzero
cacheable_memzero:
#else
        .globl vec_cacheable_memzero
vec_cacheable_memzero:
#endif

        mr BC,BCz // IU1 arg[2] is BC here, not FILL
        li FILL,0 // IU1 for bzero FILL=0
        cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count

        cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count

        addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
        addi DR,DST,16 // IU1 Address of second dst vector
        add DBC,DST,BC // IU1 Address of last dst byte + 1
        bgt cr7,c_v_memset // b if BC>MIN_VEC

        mtctr BC // for (i=1;i<=BC;i++)
        beqlr cr1 // return if BC = 0
c_Byte_set:
        stbu FILL,1(DM1) // LSU *(++(DST-1)) = FILL
        bdnz c_Byte_set

        blr

c_v_memset:
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
        mfspr RSV,VRSV // IU2 Get current VRSAVE contents
#endif
        rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
        addi DBK,DBC,-1 // IU1 Address of last dst byte

#ifdef VRSAVE
        oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
#endif
        subf D,DST,DR // IU1 How many bytes in first destination?
        li BK,0 // IU1 Initialize byte kount index

#ifdef VRSAVE
        mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
#endif
        vxor v0,v0,v0 // VIU Clear v0
        subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
        cmpi cr1,0,D,16 // IU1 Is D0 left justified?

        mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
        rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
        beq cr1,c_Left_just // b if D0 is left justified

        bns cr7,c_No_B_fwd // b if only even number of bytes to store

        stvebx v0,DST,BK // LSU store first byte at DST+0
        addi BK,BK,1 // IU1 increment index
c_No_B_fwd:
        bne cr7,c_No_H_fwd // b if only words to store

        stvehx v0,DST,BK // LSU store halfword at DST+0/1
        addi BK,BK,2 // IU1 increment index
c_No_H_fwd:
        bng cr7,c_No_W1_fwd // b if exactly zero or two words to store

        stvewx v0,DST,BK // LSU store word 1 of one or three
        addi BK,BK,4 // IU1 increment index

c_No_W1_fwd:
        bnl cr7,c_No_W2_fwd // b if there was only one word to store
        stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
        addi BK,BK,4 // IU1 increment index

        stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
        b c_No_W2_fwd

c_Left_just:
        stvx v0,0,DST // LSU Store 16 bytes at D0
c_No_W2_fwd:
        rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
        cmpi cr6,0,QW,0 // IU1 Any full vectors to move?

        li BK,16 // IU1 Re-initialize byte kount index
        cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
        ble cr6,c_Last_QW // b if no Quad words to do

        mtctr QW // IU2 for (i=0;i<=QW;i++)
        cmpi cr6,0,QW,4 // IU1 Check QW>4

c_QW_loop:
        stvx v0,DST,BK // LSU Store 16 fill bytes
        addi BK,BK,16 // IU1 Increment byte kount index
        bdnzf 25,c_QW_loop // b if 4 or fewer quad words to do

        add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
        addi QW,QW,-1 // IU1 One more QW stored by now
        bgt cr6,c_GT_4QW_fwd // b if >4 quad words left

c_Last_QW: // Next vector is the last; we're done.
        mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7

        beq cr1,c_Rt_just_fwd // b if last destination is right justified

        rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
        li BL,0 // IU1 Initialize index pointer
        bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store

        stvewx v0,DBK,BL // LSU store word 1 of two or three
        addi BL,BL,4 // IU1 increment index

        stvewx v0,DBK,BL // LSU store word 2 of two or three
        addi BL,BL,4 // IU1 increment index
c_Only_1W_fwd:
        bng cr7,c_Only_2W_fwd // b if there were only two or zero words to store

        stvewx v0,DBK,BL // LSU store word 3 of three if necessary
        addi BL,BL,4 // IU1 increment index
c_Only_2W_fwd:
        bne cr7,c_Only_B_fwd // b if there are no half words to store

        stvehx v0,DBK,BL // LSU store one halfword if necessary
        addi BL,BL,2 // IU1 increment index
c_Only_B_fwd:
        bns cr7,c_All_done_fwd // b if there are no bytes to store

        stvebx v0,DBK,BL // LSU store one byte if necessary
        b c_All_done_fwd

c_Rt_just_fwd:

        stvx v0,DST,BK // LSU Store 16 bytes at D14
c_All_done_fwd:
#ifdef VRSAVE
        mtspr VRSV,RSV // IU1 Restore VRSAVE
#endif
        blr // Return destination address from entry

#ifdef __MWERKS__
        .align 16
#else
        .align 4
#endif
c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice

        addi QW,QW,-1 // IU1 Keeping track of QWs stored
        mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
        addi DNX,DNX,16 // IU1 Update cr6 for next loop

        stvx v0,DST,BK // LSU Store 16 bytes at D2
        addi BK,BK,16 // IU1 Increment byte count by 16
        bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL

        mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

        bns cr6,c_B32_fwd // b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
        bdnz c_B32_fwd // decrement counter when the last QW store is odd

c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
        dcbz DST,BK // LSU zero whole cache line
        bdz c_Nxt_loc_fwd // always decrement and branch to next instr

c_Nxt_loc_fwd:
        addi BK,BK,32 // IU1 Increment byte count
        bdnz c_B32_fwd // b if there are at least two more QWs to do

        bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
        b c_Last_QW // b if last store is to even address

// Come here with two more stores to do
c_One_even_QW:
        stvx v0,DST,BK // LSU Store 16 bytes at D13
        addi BK,BK,16 // IU1 Increment byte count

        b c_Last_QW

// End of cacheable_memzero in AltiVec