Update contrib.
//------------------------------------------------------------------
// AltiVec enabled version of memset and bzero and cacheable_memzero
//------------------------------------------------------------------
//------------------------------------------------------------------
// Copyright Motorola, Inc. 2002
// You are hereby granted a copyright license to use, modify, and
// distribute the SOFTWARE so long as this entire notice is retained
// without alteration in any modified and/or redistributed versions,
// and that such modified versions are clearly identified as such.
// No licenses are granted by implication, estoppel or otherwise under
// any patents or trademarks of Motorola, Inc.
// The SOFTWARE is provided on an "AS IS" basis and without warranty.
// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
// for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------
//------------------------------------------------------------------
// extern void *memset( void *ptr, int val, size_t len );
// Copies val into each of len characters beginning at ptr.
//   - Harbison & Steele, 4th ed.
// (Despite val being an int, this memset assumes it is never
// more than a byte.  That seems to be correct for all the
// memset functions I've seen, but I don't know if ANSI allows
// anything longer.  Chuck Corley 12/21/02)
//------------------------------------------------------------------
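// For reference, a minimal C sketch of the semantics described above (not part
// of this file or its build; the name memset_ref is illustrative only).  ISO C
// converts val to unsigned char before storing, which matches the assumption
// in the note above.
//
//   #include <stddef.h>
//   void *memset_ref(void *ptr, int val, size_t len)
//   {
//       unsigned char *p = (unsigned char *)ptr;
//       unsigned char  c = (unsigned char)val;   /* val is reduced to one byte  */
//       while (len--)
//           *p++ = c;
//       return ptr;                               /* memset returns its first arg */
//   }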
//------------------------------------------------------------------
// extern void * bzero( char *ptr, int len);
// Copies 0 into each of len characters at ptr.
//   - Harbison & Steele, 4th ed.
//------------------------------------------------------------------
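// In effect, bzero(ptr, len) here behaves like memset(ptr, 0, len): the bzero
// entry below simply forces FILL = 0 and reuses the memset code path.  As a
// sketch (using the illustrative memset_ref above):
//   void bzero_ref(void *p, int n) { memset_ref(p, 0, (size_t)n); }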
// Rev 0.0  Original                            Chuck Corley  02/09/03
//          Could benefit from changes added to memcpy
// Rev 0.1  Revised per memcpy Rev 0.30         Chuck Corley  05/01/03
//
// This is beta-quality code; users are encouraged to make it faster.
//
// Code is highly likely to be in the cache; data is not (streaming data).
// Zero fill could be quite likely.
// Moving the fill byte from a GPR to a VR as below is faster than going
// stw -> lvebx through the stack.
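//
// Sketch of that GPR->VR move (illustrative C; the variable names below do not
// appear in the code): the fill byte is rebuilt inside a vector register from
// its two nibbles via lvsl, then splatted.
//   uint8_t fill = (uint8_t)val;
//   uint8_t hi   = fill >> 4;                  /* lvsl v0,0,Fsh  : byte 0 = upper nibble */
//   uint8_t lo   = fill & 0xF;                 /* lvsl v2,0,FILL : byte 0 = lower nibble */
//   uint8_t b    = (uint8_t)((hi << 4) | lo);  /* vslb by 4, then vor                    */
//   /* vspltb v0,v0,0 then replicates b into all 16 bytes of v0 */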
#define VRSV 256  // VRSAVE spr
// Don't use vectors for BC <= MIN_VEC.  Works only if MIN_VEC >= 16 bytes.
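// A sketch of that dispatch (illustrative only):
//   if (len <= MIN_VEC)   /* scalar path: ctr-driven stbu loop, one byte at a time */
//   else                  /* vector path below: align, stvx/dcba, then the tail    */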
#define Rt   r0   // r0 when used as a temporary register

#define DST  r3   // entering: dest pointer; exiting: same dest pointer
#define FILL r4   // entering: fill char then fill word
#define BC   r5   // entering: Byte_Count then remaining Byte_Count
#define DBC  r6   // dst + byte count
#define BK   r7   // BC - 1 +/- (n*16)
#define Fsh  r8   // fill byte shifted right one nibble
#define DM1  r9   // dst - 1 for byte-by-byte backwards initially
#define D    r9   // (dst+16)[0:27] - dst[28:31]
#define DNX  r9   // (dst+n*16)[28:31]
#define BL   r9   // second byte_kount index pointer
#define DR   r10  // (dst+16)[0:27]
#define QW   r10  // number of cache lines
#define DBK  r11  // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
#define RSV  r12  // storage for VRSAVE register if used
// Condition register use (not including temporary cr0):
// cr0[2] = (FILL==0)?
// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
//   then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
//   then cr1[2] = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0; (DN right justified)
// cr6[2] = (QW == 0)? 1 : 0;
//   then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
//   then cr6[3] = (third store[27] == 1)? 1 : 0; (cache line alignment)
//   then cr6[3] = (last store[27] == 1)? 1 : 0; (last store odd?)
// cr7[2] = (BC>MIN_VEC)? 1 : 0; (BC big enough to warrant vectors)
//   then cr7[0:3] = ((DST+16)[0:27] - DST) (how many bytes (if <16) in first vector?)
//   then cr7[0:3] = (DST+BC)[28:31] (how many bytes (if <16) in last vector?)
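//
// Overall flow, as C-like pseudocode (a sketch of the structure below, not a
// literal translation; names are illustrative):
//   head = bytes from DST up to the next 16-byte boundary;
//   store the head with stvebx/stvehx/stvewx as the low bits of head dictate;
//   store full quadwords with stvx, switching to 32-byte cache-line chunks
//     (dcba here, dcbz in cacheable_memzero) once the store address is line aligned;
//   tail = (DST+BC)[28:31]; store the final partial quadword the same way.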
// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// We use dcba, which no-ops to non-cacheable memory, rather than
// dcbz, which would cause an alignment exception.
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc, CodeWarrior, and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba r3,r7, i.e. dcba DST,BK
#else
#define DCBK dcba DST,BK
#endif // __GNUC__ or __MWERKS__ or _DIAB_TOOL
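// (For reference: 0x7c033dec hand-assembles the X-form "dcba rA,rB" with
//  primary opcode 31, RA=3 (DST), RB=7 (BK), and extended opcode 758 --
//  exactly the "dcba DST,BK" the other branch emits.)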
    cmpi    cr7,0,BC,MIN_VEC    // IU1 Check for minimum byte count
    cmpi    cr1,0,BC,0          // IU1 Eliminate zero byte count
    rlwinm. Fsh,FILL,28,28,3    // IU1 Is fill byte zero? and shift

    addi    DM1,DST,-1          // IU1 Pre-bias and duplicate destination
    addi    DR,DST,16           // IU1 Address of second dst vector
    add     DBC,DST,BC          // IU1 Address of last dst byte + 1
    bgt     cr7,v_memset        // b if BC>MIN_VEC

    mtctr   BC                  // for (i=1;i<=BC;i++)
    beqlr   cr1                 // return if BC = 0
    stbu    FILL,1(DM1)         // LSU * ++(DST-1) = FILL
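// (The mtctr/stbu pair above drives a ctr-counted byte loop; in C terms,
//  roughly:  while (len--) *p++ = (unsigned char)val;
//  Only counts of MIN_VEC or fewer bytes take this path.)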
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAV=1 when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
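//
// (VRSAVE convention, for reference: bit i of VRSAVE, counted from the MSB,
//  marks vector register vi as live.  The "oris Rt,RSV,0xe000" below therefore
//  flags v0-v2 as in use by memset; cacheable_memzero needs only v0 and uses
//  0x8000.)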
    mfspr   RSV,VRSV            // IU2 Get current VRSAVE contents

    rlwinm  DR,DR,0,0,27        // IU1 (DST+16)[0:27]
    addi    DBK,DBC,-1          // IU1 Address of last dst byte

    oris    Rt,RSV,0xe000       // IU1 Or in registers used by this routine

    subf    D,DST,DR            // IU1 How many bytes in first destination?
    li      BK,0                // IU1 Initialize byte kount index

    mtspr   VRSV,Rt             // IU2 Save in VRSAVE before first vec op

    vxor    v0,v0,v0            // VIU Clear v0
    subf    QW,DR,DBK           // IU1 Bytes of full vectors to move (-16)
    cmpi    cr1,0,D,16          // IU1 Is D0 left justified?
    beq+    enter_bzero         // b if FILL==0

    lvsl    v0,0,Fsh            // LSU Move upper nibble to byte 0 of VR
    vspltisb v1,4               // VPU Splat 0x4 to every byte

    lvsl    v2,0,FILL           // LSU Move lower nibble to byte 0 of VR

    vslb    v0,v0,v1            // VIU Move upper nibble to VR[0:3]

    vor     v0,v0,v2            // VIU Form FILL byte in VR[0:7]

    vspltb  v0,v0,0             // VPU Splat the fill byte to all bytes
    mtcrf   0x01,D              // IU2 Put bytes in 1st dst in cr7
    rlwinm  QW,QW,28,4,31       // IU1 Quad words remaining
    beq     cr1,Left_just       // b if D0 is left justified

    bns     cr7,No_B_fwd        // b if only even number of bytes to store

    stvebx  v0,DST,BK           // LSU store first byte at DST+0
    addi    BK,BK,1             // IU1 increment index

    bne     cr7,No_H_fwd        // b if only words to store

    stvehx  v0,DST,BK           // LSU store halfword at DST+0/1
    addi    BK,BK,2             // IU1 increment index

    bng     cr7,No_W1_fwd       // b if exactly zero or two words to store

    stvewx  v0,DST,BK           // LSU store word 1 of one or three
    addi    BK,BK,4             // IU1 increment index

    bnl     cr7,No_W2_fwd       // b if there was only one word to store
    stvewx  v0,DST,BK           // LSU store word 1 of two or 2 of three
    addi    BK,BK,4             // IU1 increment index

    stvewx  v0,DST,BK           // LSU store word 2 of two or 3 of three
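// (Head-alignment sketch: when DST is not already 16-byte aligned, D[28:31] is
//  the number of leading bytes needed to reach the next 16-byte boundary, and
//  its individual bits drive the stores above -- roughly, in C:
//    if (head & 1) store 1 byte;  if (head & 2) store 2 bytes;
//    if (head & 4) store 4 bytes; if (head & 8) store 8 bytes (two words). )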
    stvx    v0,0,DST            // LSU Store 16 bytes at D0

    rlwinm  Rt,DBK,0,28,31      // IU1 (DBK = DST+BC-1)[28:31]
    cmpi    cr6,0,QW,0          // IU1 Any full vectors to move?

    li      BK,16               // IU1 Re-initialize byte kount index
    cmpi    cr1,0,Rt,0xF        // IU1 Is DN right justified?
    ble     cr6,Last_QW         // b if no Quad words to do

    mtctr   QW                  // IU2 for (i=0;i<=QW;i++)
    cmpi    cr6,0,QW,4          // IU1 Check QW>4

    stvx    v0,DST,BK           // LSU Store 16 fill bytes
    addi    BK,BK,16            // IU1 Increment byte kount index
    bdnzf   25,QW_loop          // b if 4 or less quad words to do

    add     DNX,DST,BK          // IU1 address of next store (DST+32 if QW>4)
    addi    QW,QW,-1            // IU1 One more QW stored by now
    bgt     cr6,GT_4QW_fwd      // b if >4 quad words left
Last_QW:    // Next vector is the last; we're done.
    mtcrf   0x01,DBC            // IU2 Put final vector byte count in cr7

    beq     cr1,Rt_just_fwd     // b if last destination is right justified
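// (Tail sketch: cr7 now holds (DST+BC)[28:31], the byte count of the final
//  partial quadword, and DBK is then rounded down to that quadword's 16-byte
//  base.  Roughly, in C:
//    if (tail & 8) store 8 bytes (two words); if (tail & 4) store 4 bytes;
//    if (tail & 2) store 2 bytes;             if (tail & 1) store 1 byte;
//  all at increasing offsets BL from the aligned base DBK.)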
    rlwinm  DBK,DBK,0,0,27      // IU1 Round to QW addr of last byte
    li      BL,0                // IU1 Initialize index pointer
    bnl     cr7,Only_1W_fwd     // b if there were only one or zero words to store

    stvewx  v0,DBK,BL           // LSU store word 1 of two or three
    addi    BL,BL,4             // IU1 increment index

    stvewx  v0,DBK,BL           // LSU store word 2 of two or three
    addi    BL,BL,4             // IU1 increment index

    bng     cr7,Only_2W_fwd     // b if there were only two or zero words to store

    stvewx  v0,DBK,BL           // LSU store word 3 of three if necessary
    addi    BL,BL,4             // IU1 increment index

    bne     cr7,Only_B_fwd      // b if there are no halfwords to store

    stvehx  v0,DBK,BL           // LSU store one halfword if necessary
    addi    BL,BL,2             // IU1 increment index

    bns     cr7,All_done_fwd    // b if there are no bytes to store

    stvebx  v0,DBK,BL           // LSU store one byte if necessary

    stvx    v0,DST,BK           // LSU Store 16 bytes at D14

    mtspr   VRSV,RSV            // IU1 Restore VRSAVE

    blr                         // Return destination address from entry
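// The loop below first issues single 16-byte stores until the store address
// reaches a 32-byte cache-line boundary, then switches to whole lines.  As a
// C-like sketch of the idea (illustrative only; assumes 32-byte lines):
//   store single quadwords (stvx) until the next store lands on a line boundary;
//   then, per line:  dcba (allocate the line without reading it) + two stvx;
//   at most one extra 16-byte store remains before the tail code above.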
GT_4QW_fwd: // Do this once if the next store is to the odd half of the cache line, else twice
    addi    QW,QW,-1            // IU1 Keeping track of QWs stored
    mtcrf   0x02,DNX            // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
    addi    DNX,DNX,16          // IU1 Update cr6 for next loop

    stvx    v0,DST,BK           // LSU Store 16 bytes at D2
    addi    BK,BK,16            // IU1 Increment byte count by 16
    bdnzf   27,GT_4QW_fwd       // b if next store is to lower (even) half of CL

    mtcrf   0x02,DBK            // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

    bns     cr6,B32_fwd         // b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.

    bdnz    B32_fwd             // decrement counter because the last QW store is odd

B32_fwd:    // Should be at least 2 stores remaining and next 2 are cache aligned
    DCBK                        // LSU Establish (kill) the cache line instead of RWITM

    stvx    v0,DST,BK           // LSU Store 16 bytes at D11
    addi    BK,BK,16            // IU1 Increment byte count
    bdz     Nxt_loc_fwd         // always decrement and branch to next instr

    stvx    v0,DST,BK           // LSU Store 16 bytes at D12
    addi    BK,BK,16            // IU1 Increment byte count
    bdnz    B32_fwd             // b if there are at least two more QWs to do

    bso     cr6,One_even_QW     // b if there is one even and one odd QW to store
    b       Last_QW             // b if last store is to even address

// Come here with two more stores to do

    stvx    v0,DST,BK           // LSU Store 16 bytes at D13
    addi    BK,BK,16            // IU1 Increment byte count
// End of memset in AltiVec

#define BCz r4  // in bzero r4 enters with byte count

    mr      BC,BCz              // IU1 arg[2] is BC here, not FILL
    li      FILL,0              // IU1 for bzero FILL=0
// cacheable_memzero will employ dcbz to clear 32 bytes at a time
// of cacheable memory.  Like bzero, the second argument on entry is BC.
// Using this for non-cacheable memory will generate an alignment exception.
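//
// A sketch of the dcbz-based inner loop (assumes 32-byte cache lines and
// cacheable, write-back memory):
//   for each 32-byte-aligned line in the middle of the buffer:
//       dcbz establishes the line in the cache and zeroes all 32 bytes,
//       so no stores are needed for that line;
//   leading and trailing partial lines are still cleared with vector stores.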
    .globl  cacheable_memzero

    .globl  vec_cacheable_memzero
vec_cacheable_memzero:
    mr      BC,BCz              // IU1 arg[2] is BC here, not FILL
    li      FILL,0              // IU1 for bzero FILL=0
    cmpi    cr7,0,BC,MIN_VEC    // IU1 Check for minimum byte count

    cmpi    cr1,0,BC,0          // IU1 Eliminate zero byte count

    addi    DM1,DST,-1          // IU1 Pre-bias and duplicate destination
    addi    DR,DST,16           // IU1 Address of second dst vector
    add     DBC,DST,BC          // IU1 Address of last dst byte + 1
    bgt     cr7,c_v_memset      // b if BC>MIN_VEC

    mtctr   BC                  // for (i=1;i<=BC;i++)
    beqlr   cr1                 // return if BC = 0
    stbu    FILL,1(DM1)         // LSU * ++(DST-1) = FILL
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAV=1 when compiling.  For systems
// that don't, make sure VRSAVE is undefined.

    mfspr   RSV,VRSV            // IU2 Get current VRSAVE contents

    rlwinm  DR,DR,0,0,27        // IU1 (DST+16)[0:27]
    addi    DBK,DBC,-1          // IU1 Address of last dst byte

    oris    Rt,RSV,0x8000       // IU1 Or in registers used by this routine

    subf    D,DST,DR            // IU1 How many bytes in first destination?
    li      BK,0                // IU1 Initialize byte kount index

    mtspr   VRSV,Rt             // IU2 Save in VRSAVE before first vec op

    vxor    v0,v0,v0            // VIU Clear v0
    subf    QW,DR,DBK           // IU1 Bytes of full vectors to move (-16)
    cmpi    cr1,0,D,16          // IU1 Is D0 left justified?
    mtcrf   0x01,D              // IU2 Put bytes in 1st dst in cr7
    rlwinm  QW,QW,28,4,31       // IU1 Quad words remaining
    beq     cr1,c_Left_just     // b if D0 is left justified

    bns     cr7,c_No_B_fwd      // b if only even number of bytes to store

    stvebx  v0,DST,BK           // LSU store first byte at DST+0
    addi    BK,BK,1             // IU1 increment index

    bne     cr7,c_No_H_fwd      // b if only words to store

    stvehx  v0,DST,BK           // LSU store halfword at DST+0/1
    addi    BK,BK,2             // IU1 increment index

    bng     cr7,c_No_W1_fwd     // b if exactly zero or two words to store

    stvewx  v0,DST,BK           // LSU store word 1 of one or three
    addi    BK,BK,4             // IU1 increment index

    bnl     cr7,c_No_W2_fwd     // b if there was only one word to store
    stvewx  v0,DST,BK           // LSU store word 1 of two or 2 of three
    addi    BK,BK,4             // IU1 increment index

    stvewx  v0,DST,BK           // LSU store word 2 of two or 3 of three
    stvx    v0,0,DST            // LSU Store 16 bytes at D0

    rlwinm  Rt,DBK,0,28,31      // IU1 (DBK = DST+BC-1)[28:31]
    cmpi    cr6,0,QW,0          // IU1 Any full vectors to move?

    li      BK,16               // IU1 Re-initialize byte kount index
    cmpi    cr1,0,Rt,0xF        // IU1 Is DN right justified?
    ble     cr6,c_Last_QW       // b if no Quad words to do

    mtctr   QW                  // IU2 for (i=0;i<=QW;i++)
    cmpi    cr6,0,QW,4          // IU1 Check QW>4

    stvx    v0,DST,BK           // LSU Store 16 fill bytes
    addi    BK,BK,16            // IU1 Increment byte kount index
    bdnzf   25,c_QW_loop        // b if 4 or less quad words to do

    add     DNX,DST,BK          // IU1 address of next store (DST+32 if QW>4)
    addi    QW,QW,-1            // IU1 One more QW stored by now
    bgt     cr6,c_GT_4QW_fwd    // b if >4 quad words left
c_Last_QW:  // Next vector is the last; we're done.
    mtcrf   0x01,DBC            // IU2 Put final vector byte count in cr7

    beq     cr1,c_Rt_just_fwd   // b if last destination is right justified

    rlwinm  DBK,DBK,0,0,27      // IU1 Round to QW addr of last byte
    li      BL,0                // IU1 Initialize index pointer
    bnl     cr7,c_Only_1W_fwd   // b if there were only one or zero words to store

    stvewx  v0,DBK,BL           // LSU store word 1 of two or three
    addi    BL,BL,4             // IU1 increment index

    stvewx  v0,DBK,BL           // LSU store word 2 of two or three
    addi    BL,BL,4             // IU1 increment index

    bng     cr7,c_Only_2W_fwd   // b if there were only two or zero words to store

    stvewx  v0,DBK,BL           // LSU store word 3 of three if necessary
    addi    BL,BL,4             // IU1 increment index

    bne     cr7,c_Only_B_fwd    // b if there are no halfwords to store

    stvehx  v0,DBK,BL           // LSU store one halfword if necessary
    addi    BL,BL,2             // IU1 increment index

    bns     cr7,c_All_done_fwd  // b if there are no bytes to store

    stvebx  v0,DBK,BL           // LSU store one byte if necessary

    stvx    v0,DST,BK           // LSU Store 16 bytes at D14

    mtspr   VRSV,RSV            // IU1 Restore VRSAVE

    blr                         // Return destination address from entry
c_GT_4QW_fwd:   // Do this once if the next store is to the odd half of the cache line, else twice
    addi    QW,QW,-1            // IU1 Keeping track of QWs stored
    mtcrf   0x02,DNX            // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
    addi    DNX,DNX,16          // IU1 Update cr6 for next loop

    stvx    v0,DST,BK           // LSU Store 16 bytes at D2
    addi    BK,BK,16            // IU1 Increment byte count by 16
    bdnzf   27,c_GT_4QW_fwd     // b if next store is to lower (even) half of CL

    mtcrf   0x02,DBK            // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

    bns     cr6,c_B32_fwd       // b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.

    bdnz    c_B32_fwd           // decrement counter because the last QW store is odd

c_B32_fwd:  // Should be at least 2 stores remaining and next 2 are cache aligned
    dcbz    DST,BK              // LSU zero whole cache line
    bdz     c_Nxt_loc_fwd       // always decrement and branch to next instr

    addi    BK,BK,32            // IU1 Increment byte count
    bdnz    c_B32_fwd           // b if there are at least two more QWs to do

    bso     cr6,c_One_even_QW   // b if there is one even and one odd QW to store
    b       c_Last_QW           // b if last store is to even address
// Come here with two more stores to do

    stvx    v0,DST,BK           // LSU Store 16 bytes at D13
    addi    BK,BK,16            // IU1 Increment byte count

// End of cacheable_memzero in AltiVec