1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/motovec/vec_memset.s Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,553 @@
1.4 +//------------------------------------------------------------------
1.5 +// file: vec_memset.S
1.6 +// AltiVec enabled version of memset and bzero and cacheable_memzero
1.7 +//------------------------------------------------------------------
1.8 +
1.9 +//------------------------------------------------------------------
1.10 +// Copyright Motorola, Inc. 2002
1.11 +// ALL RIGHTS RESERVED
1.12 +//
1.13 +// You are hereby granted a copyright license to use, modify, and
1.14 +// distribute the SOFTWARE so long as this entire notice is retained
1.15 +// without alteration in any modified and/or redistributed versions,
1.16 +// and that such modified versions are clearly identified as such.
1.17 +// No licenses are granted by implication, estoppel or otherwise under
1.18 +// any patents or trademarks of Motorola, Inc.
1.19 +//
1.20 +// The SOFTWARE is provided on an "AS IS" basis and without warranty.
1.21 +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
1.22 +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
1.23 +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
1.24 +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
1.25 +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
1.26 +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
1.27 +//
1.28 +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
1.29 +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
1.30 +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
1.31 +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
1.32 +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
1.33 +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
1.34 +// for the maintenance and support of the SOFTWARE.
1.35 +//------------------------------------------------------------------
1.36 +
1.37 +//------------------------------------------------------------------
1.38 +// extern void *memset( void *ptr, int val, size_t len );
1.39 +// Copies val into each of len characters beginning at ptr.
1.40 +// - Harbison&Steele 4th ed
1.41 +// (despite val being an int, this memset assumes it is never
1.42 +// more than a byte. That seems to be correct from all the
1.43 +// memset functions I've seen but I don't know if ANSI allows
1.44 +// anything longer. Chuck Corley 12/21/02)
1.45 +// Returns:
1.46 +// void * ptr
1.47 +//------------------------------------------------------------------
1.48 +
1.49 +//------------------------------------------------------------------
1.50 +// extern void * bzero( char *ptr, int len);
1.51 +// Copies 0 into each of len characters at ptr.
1.52 +// - Harbison&Steele 4th ed
1.53 +// Returns:
1.54 +// void * ptr
1.55 +//------------------------------------------------------------------
1.56 +
1.57 +// Revision History:
1.58 +// Rev 0.0 Original Chuck Corley 02/09/03
1.59 +// Could benefit from changes added to memcpy
1.60 +// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
1.61 +//
1.62 +// This is beta quality code; users are encouraged to make it faster.
1.63 +// ASSUMPTIONS:
1.64 +// Code is highly likely to be in the cache; data is not (streaming data)
1.65 +// Zero fill could be quite likely.
1.66 +// Moving fill byte from GPR to VR as below faster than stw->lvebx via stack
1.67 +
1.68 +#define VRSV 256 // VRSAVE spr
1.69 +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
1.70 +#define MIN_VEC 16
1.71 +
1.72 +// Register usage
1.73 +#define Rt r0 // r0 when used as a temporary register
1.74 +
1.75 +#define DST r3 // entering: dest pointer; exiting: same dest pointer
1.76 +
1.77 +#define FILL r4 // entering: fill char then fill word
1.78 +
1.79 +#define BC r5 // entering: Byte_Count then remaining Byte_Count
1.80 +
1.81 +#define DBC r6// dst + byte count
1.82 +
1.83 +#define BK r7 // BC - 1 +/- (n*16)
1.84 +
1.85 +#define Fsh r8 // fill byte shifted right one nibble
1.86 +
1.87 +#define DM1 r9// dst -1 for byte-by-byte backwards initially
1.88 +#define D r9 // (dst+16)[0:27] - dst[28:31]
1.89 +#define DNX r9 // (dst+n*16)[28:31]
1.90 +#define BL r9 // second byte_kount index pointer
1.91 +
1.92 +#define DR r10 // (dst+16)[0:27]
1.93 +#define QW r10 // number of cache lines
1.94 +
1.95 +#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
1.96 +
1.97 +#define RSV r12 // storage for VRSAVE register if used
1.98 +
1.99 +// Condition register use (not including temporary cr0)
1.100 +// cr0[2] = (FILL==0)?
1.101 +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
1.102 +// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
1.103 +// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
1.104 +// cr6[2] = (QW == 0)? 1 : 0;
1.105 +// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
1.106 +// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
1.107 +// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
1.108 +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
1.109 +// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
1.110 +// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
1.111 +
1.112 +// Conditionalize the use of dcba. It will help if the data is
1.113 +// not in cache and hurt if it is. Generally, except for small
1.114 +// benchmarks repeated many times, we assume data is not in cache
1.115 +// (data streaming) and using dcba is a performance boost.
1.116 +// We use dcba which will noop to non-cacheable memory rather than
1.117 +// dcbz which will cause an alignment exception.
1.118 +#ifndef NO_DCBA
1.119 +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
1.120 + // gcc and codewarrior and diab don't assemble dcba
1.121 +#define DCBK .long 0x7c033dec
1.122 +// dcba r3,r7 or dcba DST,BK
1.123 +#else
1.124 +#ifdef __ghs__
1.125 +.macro DCBK
1.126 +.long 0x7c033dec
1.127 +.endm
1.128 +#else
1.129 +#define DCBK dcba DST,BK
1.130 +#endif // __ghs__
1.131 +#endif // __GNUC__ or __MWERKS__
1.132 +#else
1.133 +#define DCBK nop
1.134 +#endif // NO_DCBA
1.135 +
1.136 + .text
1.137 +#ifdef __MWERKS__
1.138 + .align 32
1.139 +#else
1.140 + .align 5
1.141 +#endif
1.142 +
1.143 +#ifdef LIBMOTOVEC
1.144 + .globl memset
1.145 +memset:
1.146 +#else
1.147 + .globl _vec_memset
1.148 +_vec_memset:
1.149 +#endif
1.150 +
1.151 + cmpi cr7,0,BC,MIN_VEC // IU1 cr7: is BC big enough to warrant vectors?
1.152 + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
1.153 + rlwinm. Fsh,FILL,28,28,3 // IU1 Sets cr0: is fill byte zero? Fsh = fill rotated right 4
1.154 +
1.155 + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
1.156 + addi DR,DST,16 // IU1 Address of second dst vector
1.157 + add DBC,DST,BC // IU1 Address of last dst byte + 1
1.158 + bgt cr7,v_memset // b if BC>MIN_VEC
1.159 +
1.160 + mtctr BC // CTR = BC; scalar loop for (i=1;i<=BC;i++)
1.161 + beqlr cr1 // return DST unchanged if BC = 0
1.162 +Byte_set:
1.163 + stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
1.164 + bdnz Byte_set
1.165 +
1.166 + blr
1.167 +
1.168 +v_memset:
1.169 +// Byte count < MIN_VEC bytes will have been set by scalar code above,
1.170 +// so this will not deal with small block sets < MIN_VEC.
1.171 +
1.172 +// For systems using VRSAVE, define VRSAV=1 when compiling. For systems
1.173 +// that don't, make sure VRSAVE is undefined.
1.174 +#ifdef VRSAVE
1.175 + mfspr RSV,VRSV // IU2 Get current VRSAVE contents
1.176 +#endif
1.177 + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
1.178 + addi DBK,DBC,-1 // IU1 Address of last dst byte
1.179 +
1.180 +#ifdef VRSAVE
1.181 + oris Rt,RSV,0xe000 // IU1 Or in v0-v2, the registers used by this routine
1.182 +#endif
1.183 + subf D,DST,DR // IU1 How many bytes in first destination?
1.184 + li BK,0 // IU1 Initialize byte kount index
1.185 +
1.186 +#ifdef VRSAVE
1.187 + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
1.188 +#endif
1.189 + vxor v0,v0,v0 // VIU Clear v0 (fill vector; stays zero when FILL==0)
1.190 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
1.191 + cmpi cr1,0,D,16 // IU1 Is D0 left justified?
1.192 + beq+ enter_bzero // b if FILL==0 (cr0 set by rlwinm. at entry)
1.193 +
1.194 + lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR
1.195 + vspltisb v1,4 // VPU Splat 0x4 to every byte
1.196 +
1.197 + lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR
1.198 +
1.199 + vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3]
1.200 +
1.201 + vor v0,v0,v2 // VIU Form FILL byte in VR[0:7]
1.202 +
1.203 + vspltb v0,v0,0 // VPU Splat the fill byte to all bytes
1.204 +enter_bzero:
1.205 + mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
1.206 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
1.207 + beq cr1,Left_just // b if D0 is left justified
1.208 +
1.209 + bns cr7,No_B_fwd // b if only even number of bytes to store
1.210 +
1.211 + stvebx v0,DST,BK // LSU store first byte at DST+0
1.212 + addi BK,BK,1 // IU1 increment index
1.213 +No_B_fwd:
1.214 + bne cr7,No_H_fwd // b if only words to store
1.215 +
1.216 + stvehx v0,DST,BK // LSU store halfword at DST+0/1
1.217 + addi BK,BK,2 // IU1 increment index
1.218 +No_H_fwd:
1.219 + bng cr7,No_W1_fwd // b if exactly zero or two words to store
1.220 +
1.221 + stvewx v0,DST,BK // LSU store word 1 of one or three
1.222 + addi BK,BK,4 // IU1 increment index
1.223 +
1.224 +No_W1_fwd:
1.225 + bnl cr7,No_W2_fwd // b if there was only one word to store
1.226 + stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
1.227 + addi BK,BK,4 // IU1 increment index
1.228 +
1.229 + stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
1.230 + b No_W2_fwd
1.231 +
1.232 +Left_just:
1.233 + stvx v0,0,DST // LSU Store 16 bytes at D0
1.234 +No_W2_fwd:
1.235 + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
1.236 + cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
1.237 +
1.238 + li BK,16 // IU1 Re-initialize byte kount index
1.239 + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
1.240 + ble cr6,Last_QW // b if no Quad words to do
1.241 +
1.242 + mtctr QW // IU2 for (i=0;i<=QW;i++)
1.243 + cmpi cr6,0,QW,4 // IU1 Check QW>4 (sets cr6[1], tested by bdnzf below)
1.244 +
1.245 +QW_loop:
1.246 + stvx v0,DST,BK // LSU Store 16 fill bytes
1.247 + addi BK,BK,16 // IU1 Increment byte kount index
1.248 + bdnzf 25,QW_loop // b while cr6[1]==0, i.e. if 4 or less quad words to do
1.249 +
1.250 + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
1.251 + addi QW,QW,-1 // IU1 One more QW stored by now
1.252 + bgt cr6,GT_4QW_fwd // b if >4 quad words left
1.253 +
1.254 +Last_QW: // Next vector is the last; we're done.
1.255 + mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
1.256 +
1.257 + beq cr1,Rt_just_fwd // b if last destination is right justified
1.258 +
1.259 + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
1.260 + li BL,0 // IU1 Initialize index pointer
1.261 + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
1.262 +
1.263 + stvewx v0,DBK,BL // LSU store word 1 of two or three
1.264 + addi BL,BL,4 // IU1 increment index
1.265 +
1.266 + stvewx v0,DBK,BL // LSU store word 2 of two or three
1.267 + addi BL,BL,4 // IU1 increment index
1.268 +Only_1W_fwd:
1.269 + bng cr7,Only_2W_fwd // b if there were only two or zero words to store
1.270 +
1.271 + stvewx v0,DBK,BL // LSU store word 3 of three if necessary
1.272 + addi BL,BL,4 // IU1 increment index
1.273 +Only_2W_fwd:
1.274 + bne cr7,Only_B_fwd // b if there are no half words to store
1.275 +
1.276 + stvehx v0,DBK,BL // LSU store one halfword if necessary
1.277 + addi BL,BL,2 // IU1 increment index
1.278 +Only_B_fwd:
1.279 + bns cr7,All_done_fwd // b if there are no bytes to store
1.280 +
1.281 + stvebx v0,DBK,BL // LSU store one byte if necessary
1.282 + b All_done_fwd
1.283 +
1.284 +Rt_just_fwd:
1.285 +
1.286 + stvx v0,DST,BK // LSU Store 16 bytes at D14
1.287 +All_done_fwd:
1.288 +#ifdef VRSAVE
1.289 + mtspr VRSV,RSV // IU1 Restore VRSAVE
1.290 +#endif
1.291 + blr // Return destination address from entry
1.292 +
1.293 +#ifdef __MWERKS__
1.294 + .align 16
1.295 +#else
1.296 + .align 4
1.297 +#endif
1.298 +GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
1.299 +
1.300 + addi QW,QW,-1 // IU1 Keeping track of QWs stored
1.301 + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
1.302 + addi DNX,DNX,16 // IU1 Update cr6 for next loop
1.303 +
1.304 + stvx v0,DST,BK // LSU Store 16 bytes at D2
1.305 + addi BK,BK,16 // IU1 Increment byte count by 16
1.306 + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
1.307 +
1.308 + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
1.309 +
1.310 + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even
1.311 +
1.312 +// We need the ctr register to reflect an even byte count before entering
1.313 +// the next block - faster to decrement than to reload.
1.314 + bdnz B32_fwd // decrement CTR for last QW store odd (target is the next label either way)
1.315 +
1.316 +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
1.317 + DCBK // LSU then Kill instead of RWITM
1.318 +
1.319 + stvx v0,DST,BK // LSU Store 16 bytes at D11
1.320 + addi BK,BK,16 // IU1 Increment byte count
1.321 + bdz Nxt_loc_fwd // always decrement CTR and fall to next instr (taken or not)
1.322 +
1.323 +Nxt_loc_fwd:
1.324 + stvx v0,DST,BK // LSU Store 16 bytes at D12
1.325 + addi BK,BK,16 // IU1 Increment byte count
1.326 + bdnz B32_fwd // b if there are at least two more QWs to do
1.327 +
1.328 + bso cr6,One_even_QW // b if there is one even and one odd QW to store
1.329 + b Last_QW // b if last store is to even address
1.330 +
1.331 +// Come here with two more loads and two stores to do
1.332 +One_even_QW:
1.333 + stvx v0,DST,BK // LSU Store 16 bytes at D13
1.334 + addi BK,BK,16 // IU1 Increment byte count
1.335 +
1.336 + b Last_QW
1.337 +
1.338 +// End of memset in AltiVec
1.339 +
1.340 +#define BCz r4 // in bzero r4 enters with byte count
1.341 +
1.342 +#ifdef __MWERKS__
1.343 + .align 32
1.344 +#else
1.345 + .align 5
1.346 +#endif
1.347 +
1.348 +#ifdef LIBMOTOVEC
1.349 + .globl bzero
1.350 +bzero:
1.351 +#else
1.352 + .globl vec_bzero
1.353 +vec_bzero:
1.354 +#endif
1.355 +
1.356 + mr BC,BCz // IU1 arg[2] is BC here, not FILL (bzero has no fill arg)
1.357 + li FILL,0 // IU1 for bzero FILL=0
1.358 +#ifdef LIBMOTOVEC
1.359 + b memset // tail-branch into memset with FILL=0
1.360 +#else
1.361 + b _vec_memset // tail-branch into _vec_memset with FILL=0
1.362 +#endif
1.363 +
1.364 +// cacheable_memzero will employ dcbz to clear 32 bytes at a time
1.365 +// of cacheable memory. Like bzero, second entering argument will be BC.
1.366 +// Using this for non-cacheable memory will generate an alignment exception.
1.367 +
1.368 + .text
1.369 +#ifdef __MWERKS__
1.370 + .align 32
1.371 +#else
1.372 + .align 5
1.373 +#endif
1.374 +
1.375 +#ifdef LIBMOTOVEC
1.376 + .globl cacheable_memzero
1.377 +cacheable_memzero:
1.378 +#else
1.379 + .globl vec_cacheable_memzero
1.380 +vec_cacheable_memzero:
1.381 +#endif
1.382 +
1.383 + mr BC,BCz // IU1 arg[2] is BC here, not FILL (no fill arg; fill is 0)
1.384 + li FILL,0 // IU1 for bzero FILL=0
1.385 + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
1.386 +
1.387 + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
1.388 +
1.389 + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
1.390 + addi DR,DST,16 // IU1 Address of second dst vector
1.391 + add DBC,DST,BC // IU1 Address of last dst byte + 1
1.392 + bgt cr7,c_v_memset // b if BC>MIN_VEC
1.393 +
1.394 + mtctr BC // CTR = BC; scalar loop for (i=1;i<=BC;i++)
1.395 + beqlr cr1 // return DST unchanged if BC = 0
1.396 +c_Byte_set:
1.397 + stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
1.398 + bdnz c_Byte_set
1.399 +
1.400 + blr
1.401 +
1.402 +c_v_memset:
1.403 +// Byte count < MIN_VEC bytes will have been set by scalar code above,
1.404 +// so this will not deal with small block sets < MIN_VEC.
1.405 +
1.406 +// For systems using VRSAVE, define VRSAV=1 when compiling. For systems
1.407 +// that don't, make sure VRSAVE is undefined.
1.408 +#ifdef VRSAVE
1.409 + mfspr RSV,VRSV // IU2 Get current VRSAVE contents
1.410 +#endif
1.411 + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
1.412 + addi DBK,DBC,-1 // IU1 Address of last dst byte
1.413 +
1.414 +#ifdef VRSAVE
1.415 + oris Rt,RSV,0x8000 // IU1 Or in v0, the only vector register used here
1.416 +#endif
1.417 + subf D,DST,DR // IU1 How many bytes in first destination?
1.418 + li BK,0 // IU1 Initialize byte kount index
1.419 +
1.420 +#ifdef VRSAVE
1.421 + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
1.422 +#endif
1.423 + vxor v0,v0,v0 // VIU Clear v0 (fill vector is always zero here)
1.424 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
1.425 + cmpi cr1,0,D,16 // IU1 Is D0 left justified?
1.426 +
1.427 + mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
1.428 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
1.429 + beq cr1,c_Left_just // b if D0 is left justified
1.430 +
1.431 + bns cr7,c_No_B_fwd // b if only even number of bytes to store
1.432 +
1.433 + stvebx v0,DST,BK // LSU store first byte at DST+0
1.434 + addi BK,BK,1 // IU1 increment index
1.435 +c_No_B_fwd:
1.436 + bne cr7,c_No_H_fwd // b if only words to store
1.437 +
1.438 + stvehx v0,DST,BK // LSU store halfword at DST+0/1
1.439 + addi BK,BK,2 // IU1 increment index
1.440 +c_No_H_fwd:
1.441 + bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
1.442 +
1.443 + stvewx v0,DST,BK // LSU store word 1 of one or three
1.444 + addi BK,BK,4 // IU1 increment index
1.445 +
1.446 +c_No_W1_fwd:
1.447 + bnl cr7,c_No_W2_fwd // b if there was only one word to store
1.448 + stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
1.449 + addi BK,BK,4 // IU1 increment index
1.450 +
1.451 + stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
1.452 + b c_No_W2_fwd
1.453 +
1.454 +c_Left_just:
1.455 + stvx v0,0,DST // LSU Store 16 bytes at D0
1.456 +c_No_W2_fwd:
1.457 + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
1.458 + cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
1.459 +
1.460 + li BK,16 // IU1 Re-initialize byte kount index
1.461 + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
1.462 + ble cr6,c_Last_QW // b if no Quad words to do
1.463 +
1.464 + mtctr QW // IU2 for (i=0;i<=QW;i++)
1.465 + cmpi cr6,0,QW,4 // IU1 Check QW>4
1.466 +
1.467 +c_QW_loop:
1.468 + stvx v0,DST,BK // LSU Store 16 fill bytes
1.469 + addi BK,BK,16 // IU1 Increment byte kount index
1.470 + bdnzf 25,c_QW_loop // b if 4 or less quad words to do
1.471 +
1.472 + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
1.473 + addi QW,QW,-1 // IU1 One more QW stored by now
1.474 + bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
1.475 +
1.476 +c_Last_QW: // Next vector is the last; we're done.
1.477 + mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
1.478 +
1.479 + beq cr1,c_Rt_just_fwd // b if last destination is right justified
1.480 +
1.481 + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
1.482 + li BL,0 // IU1 Initialize index pointer
1.483 + bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
1.484 +
1.485 + stvewx v0,DBK,BL // LSU store word 1 of two or three
1.486 + addi BL,BL,4 // IU1 increment index
1.487 +
1.488 + stvewx v0,DBK,BL // LSU store word 2 of two or three
1.489 + addi BL,BL,4 // IU1 increment index
1.490 +c_Only_1W_fwd:
1.491 + bng cr7,c_Only_2W_fwd // b if there were only two or zero words to store
1.492 +
1.493 + stvewx v0,DBK,BL // LSU store word 3 of three if necessary
1.494 + addi BL,BL,4 // IU1 increment index
1.495 +c_Only_2W_fwd:
1.496 + bne cr7,c_Only_B_fwd // b if there are no half words to store
1.497 +
1.498 + stvehx v0,DBK,BL // LSU store one halfword if necessary
1.499 + addi BL,BL,2 // IU1 increment index
1.500 +c_Only_B_fwd:
1.501 + bns cr7,c_All_done_fwd // b if there are no bytes to store
1.502 +
1.503 + stvebx v0,DBK,BL // LSU store one byte if necessary
1.504 + b c_All_done_fwd
1.505 +
1.506 +c_Rt_just_fwd:
1.507 +
1.508 + stvx v0,DST,BK // LSU Store 16 bytes at D14
1.509 +c_All_done_fwd:
1.510 +#ifdef VRSAVE
1.511 + mtspr VRSV,RSV // IU1 Restore VRSAVE
1.512 +#endif
1.513 + blr // Return destination address from entry
1.514 +
1.515 +#ifdef __MWERKS__
1.516 + .align 16
1.517 +#else
1.518 + .align 4
1.519 +#endif
1.520 +c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
1.521 +
1.522 + addi QW,QW,-1 // IU1 Keeping track of QWs stored
1.523 + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
1.524 + addi DNX,DNX,16 // IU1 Update cr6 for next loop
1.525 +
1.526 + stvx v0,DST,BK // LSU Store 16 bytes at D2
1.527 + addi BK,BK,16 // IU1 Increment byte count by 16
1.528 + bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
1.529 +
1.530 + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
1.531 +
1.532 + bns cr6,c_B32_fwd // b if DST[27] == 0; i.e, final store is even
1.533 +
1.534 +// We need the ctr register to reflect an even byte count before entering
1.535 +// the next block - faster to decrement than to reload.
1.536 + bdnz c_B32_fwd // decrement CTR for last QW store odd (target is the next label either way)
1.537 +
1.538 +c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
1.539 + dcbz DST,BK // LSU zero whole cache line
1.540 + bdz c_Nxt_loc_fwd // always decrement CTR and fall to next instr (taken or not)
1.541 +
1.542 +c_Nxt_loc_fwd:
1.543 + addi BK,BK,32 // IU1 Increment byte count (dcbz cleared a full 32-byte line)
1.544 + bdnz c_B32_fwd // b if there are at least two more QWs to do
1.545 +
1.546 + bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
1.547 + b c_Last_QW // b if last store is to even address
1.548 +
1.549 +// Come here with two more loads and two stores to do
1.550 +c_One_even_QW:
1.551 + stvx v0,DST,BK // LSU Store 16 bytes at D13
1.552 + addi BK,BK,16 // IU1 Increment byte count
1.553 +
1.554 + b c_Last_QW
1.555 +
1.556 +// End of cacheable_memzero in AltiVec