1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/motovec/vec_memcpy.s Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,876 @@
1.4 +//------------------------------------------------------------------
1.5 +// file: vec_memcpy.S
1.6 +// AltiVec enabled version of memcpy and bcopy
1.7 +//------------------------------------------------------------------
1.8 +
1.9 +//------------------------------------------------------------------
1.10 +// Copyright Motorola, Inc. 2003
1.11 +// ALL RIGHTS RESERVED
1.12 +//
1.13 +// You are hereby granted a copyright license to use, modify, and
1.14 +// distribute the SOFTWARE so long as this entire notice is retained
1.15 +// without alteration in any modified and/or redistributed versions,
1.16 +// and that such modified versions are clearly identified as such.
1.17 +// No licenses are granted by implication, estoppel or otherwise under
1.18 +// any patents or trademarks of Motorola, Inc.
1.19 +//
1.20 +// The SOFTWARE is provided on an "AS IS" basis and without warranty.
1.21 +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
1.22 +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
1.23 +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
1.24 +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
1.25 +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
1.26 +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
1.27 +//
1.28 +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
1.29 +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
1.30 +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
1.31 +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
1.32 +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OR
1.33 +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
1.34 +// for the maintenance and support of the SOFTWARE.
1.35 +//------------------------------------------------------------------
1.36 +
1.37 +//------------------------------------------------------------------
1.38 +// extern void * memcpy(void *dst, const void *src, size_t len);
1.39 +// Returns:
1.40 +// void *dst
1.41 +//------------------------------------------------------------------
1.42 +
1.43 +//------------------------------------------------------------------
1.44 +// extern void * memmove( void *dst, const void *src, size_t len );
1.45 +// Copies len characters from src to dst and returns the value of
1.46 +// dst. Works correctly for overlapping memory regions.
1.47 +// - Harbison&Steele 4th ed (corrected as to return)
1.48 +// Returns:
1.49 +// void *dst
1.50 +//------------------------------------------------------------------
1.51 +
1.52 +//------------------------------------------------------------------
1.53 +// extern void * bcopy(const void *src, void *dst, size_t len);
1.54 +// Returns:
1.55 +// void *dst
1.56 +//------------------------------------------------------------------
1.57 +
1.58 +// memcpy and memmove are combined into one entry point here because of
1.59 +// the similarity of operation and the need to create fool-proof code.
1.60 +// The following conditions determine what is "fool proof":
1.61 +//
1.62 +// if: then single entry:
1.63 +// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memcpy
1.64 +// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC must b to v_memcpy
1.65 +// (DST-SRC)<0 && BC<MIN_VEC copy fwd byte-by-byte
1.66 +// (DST-SRC)==0 || BC==0 will just return
1.67 +// (DST-SRC)>0 && BC<MIN_VEC copy bkwd byte-by-byte
1.68 +// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC must b to v_memmove
1.69 +// (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memmove
1.70 +
1.71 +// If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
1.72 +// this code will branch to v_memcpy anyway for maximum performance.
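+//
+// A hedged C sketch of that dispatch (names here are illustrative, not
+// the real entry points; v_memmove itself branches to the forward-copy
+// code when DST-SRC>=BC):
+//
+//   void *entry(void *dst, const void *src, size_t bc)
+//   {
+//       if (dst == src || bc == 0) return dst;
+//       if ((char *)dst < (const char *)src)      /* copy forward */
+//           return (bc > MIN_VEC) ? v_memcpy(dst, src, bc)
+//                                 : byte_copy_fwd(dst, src, bc);
+//       else                                      /* copy backward */
+//           return (bc > MIN_VEC) ? v_memmove(dst, src, bc)
+//                                 : byte_copy_bwd(dst, src, bc);
+//   }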
1.73 +
1.74 +// Revision History:
1.75 +// Rev 0.0 Original Chuck Corley 02/03/03
1.76 +// Can still add dst, 128B loop, and aligned option
1.77 +// Rev 0.01 Fixed JY's seg-fault violation CJC 02/17/03
1.78 +// Rev 0.1 Added 128B loop and dst; cndtnlzd dcbz CJC 02/18/03
1.79 +// (Creating separate path for QW aligned didn't help much)
1.80 +// Rev 0.11 Small code schdling; chngd dst for memmove CJC 02/23/03
1.81 +// Rev 0.20 Eliminated alternate entry and cleanup CJC 02/27/03
1.82 +// Rev 0.21 Improved loop branch targets for v_memcpy CJC 03/01/03
1.83 +// Rev 0.22 Experimented with dst (sent to H.) CJC 03/02/03
1.84 +// Rev 0.23 Substituted dcba for dcbz (sent to JY) CJC 03/08/03
1.85 +// Rev 0.24 Use two dst streams CJC 03/12/03
1.86 +// Rev 0.25 Fix for all compilers, cleanup, and release with
1.87 +// libmotovec.a rev 0.10 CJC 03/14/03
1.88 +// Rev 0.30 Fix for pre-empted destination (SNDF-DS) CJC 04/02/03
1.89 +//
1.90 +// Between Rev 0.25 and 0.30 the code was revised to store elements of
1.91 +// source at destination when first and/or last vector are less than 16
1.92 +// bytes. A reviewer at SNDF observed that loading the destination vector
1.93 +// for merging exposed the "uninvolved" destination bytes to incoherency
1.94 +// if an interrupt pre-empted this routine and modified the "uninvolved"
1.95 +// destination vector(s) while held in register for merging. It seems
1.96 +// like a low possibility, but this revision is no longer subject to that
1.97 +// risk. (It is also slightly faster than Rev 0.25.)
1.98 +// This is beta quality code; users are encouraged to make it faster.
1.99 +// ASSUMPTIONS:
1.100 +// Code is highly likely to be in the cache; data is not (streaming data)
1.101 +
1.102 +#define VRSV 256 // VRSAVE spr
1.103 +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
1.104 +#define MIN_VEC 16
1.105 +// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap.
1.106 +#define MIN_OVL 128
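+// Example: a 16-byte copy stays in the scalar byte loops below (BC is not
+// > MIN_VEC), while a 512-byte copy with |dst-src| > 128 also qualifies
+// for the 128-byte Big_loop in v_memcpy (given enough full quadwords).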
1.107 +
1.108 +// Register usage
1.109 +#define Rt r0 // r0 when used as a temporary register
1.110 +
1.111 +#define DST r3 // entering: dst pointer; exiting: same dst pointer
1.112 +
1.113 +#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove
1.114 +
1.115 +#define BC r5 // entering: Byte_Count
1.116 +
1.117 +#define PCS r6 // save for partial checksum entering
1.118 +
1.119 +#define DMS r7 // dst - src initially
1.120 +#define BK r7 // BC - 1 +/- (n*16)
1.121 +
1.122 +// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )"
1.123 +// if you don't put the comment right after the r7. CJC 030314
1.124 +#define SM1 r8// src -1 for byte-by-byte forwards initially
1.125 +#define S r8 // src[28:31]
1.126 +#define SMD r8 // src[0:27]-dst[0:27]
1.127 +#define STR r8 // data stream touch block & stride info for Big_loop
1.128 +
1.129 +#define DM1 r9// dst -1 for byte-by-byte forwards initially
1.130 +#define D r9 // dst[28:31]
1.131 +#define DNX r9 // (dst+n*16)[28:31]
1.132 +#define BL r9 // second byte_kount index pointer
1.133 +
1.134 +#define SBC r10// src + byte count initially then src[28:31]
1.135 +#define BLK r10 // temporary data stream touch block & stride info
1.136 +#define DR r10 // (dst+16)[0:27]
1.137 +#define QW r10 // number of quad words (vectors)
1.138 +
1.139 +#define DBC r11// dst + byte count initially
1.140 +#define BLL r11 // temporary data stream touch block & stride info
1.141 +#define SBK r11 // (src+byte_count-1)
1.142 +#define SBR r11 // (src+byte_count-1)[0:27]
1.143 +#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
1.144 +#define BIG r11 // QW/8 or 128 byte loop count
1.145 +#define SP8 r11 // SRC + n*128 (8 QWs) for data streaming after first call
1.146 +
1.147 +#define RSV r12 // storage for VRSAVE register if used
1.148 +
1.149 +#define VS0 v0 // src vector for permuting
1.150 +
1.151 +#define VS1 v1 // src vector for permuting
1.152 +
1.153 +#define VP3 v2 // d - s permute register
1.154 +
1.155 +#define VPS0 v3 // permuted source vector to store
1.156 +
1.157 +#define VPS1 v4 // 2nd permuted source vector to store
1.158 +
1.159 +#define VPS2 v5 // additional permuted src in Big loop
1.160 +
1.161 +#define VS2 v6 // src vector for permuting
1.162 +#define VPS3 v6 // additional permuted src in Big loop
1.163 +
1.164 +#define VS3 v7 // additional src load in Big loop
1.165 +#define VPS4 v7 // additional permuted src in Big loop
1.166 +
1.167 +#define VS4 v8 // additional src load in Big loop
1.168 +#define VPS5 v8 // additional permuted src in Big loop
1.169 +
1.170 +#define VS5 v9 // additional src load in Big loop
1.171 +#define VPS6 v9 // additional permuted src in Big loop
1.172 +
1.173 +#define VS6 v10 // additional src load in Big loop
1.174 +#define VPS7 v10 // additional permuted src in Big loop
1.175 +
1.176 +#define VS7 v11 // additional src load in Big loop
1.177 +
1.178 +// Conditionalize the use of dcba. It will help if the data is
1.179 +// not in cache and hurt if it is. Generally, except for small
1.180 +// benchmarks repeated many times, we assume data is not in cache
1.181 +// (data streaming) and using dcba is a performance boost.
1.182 +#ifndef NO_DCBA
1.183 +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
1.184 + // gcc and codewarrior and diab don't assemble dcba
1.185 +#define DCBK .long 0x7c033dec
1.186 +// dcba r3,r7 or dcba DST,BK
1.187 +#define DCBL .long 0x7c034dec
1.188 +// dcba r3,r9 or dcba DST,BL
1.189 +#else
1.190 +#ifdef __ghs__
1.191 +.macro DCBK
1.192 +.long 0x7c033dec
1.193 +.endm
1.194 +.macro DCBL
1.195 +.long 0x7c034dec
1.196 +.endm
1.197 +#else
1.198 +#define DCBK dcba DST,BK
1.199 +#define DCBL dcba DST,BL
1.200 +#endif // __ghs__
1.201 +#endif // __GNUC__ or __MWERKS__
1.202 +#else
1.203 +#define DCBK nop
1.204 +#define DCBL nop
1.205 +#endif // NO_DCBA
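+
+// For reference, the hard-coded opcodes follow from the PowerPC X-form
+// encoding of dcba (primary opcode 31, extended opcode 758):
+//   (31<<26) | (rA<<16) | (rB<<11) | (758<<1)
+// dcba r3,r7: 0x7C000000|0x00030000|0x00003800|0x000005EC = 0x7C033DEC
+// dcba r3,r9: 0x7C000000|0x00030000|0x00004800|0x000005EC = 0x7C034DEC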
1.206 +
1.207 +// Conditionalize the use of dst (data stream touch). It will help
1.208 +// if the data is not in cache and hurt if it is (though not as badly
1.209 +// as dcbz). Generally, except for small benchmarks repeated many times,
1.210 +// we assume data is not in cache (data streaming) and using dst is a
1.211 +// performance boost.
1.212 +#ifndef NO_DST
1.213 +#define STRM_B dst SBC,BLL,0
1.214 +#define STRM_F dst SRC,BLK,0
1.215 +#define STRM_1 dst SP8,STR,1
1.216 +
1.217 +#else
1.218 +#define STRM_B nop
1.219 +#define STRM_F nop
1.220 +#define STRM_1 nop
1.221 +#endif
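+// The control word passed to dst encodes, per the AltiVec programming
+// model: block size in 16-byte units in bits 3:7, block count in bits
+// 8:15, and a signed byte stride in bits 16:31. So the 0x010cffe0 built
+// below requests 12 blocks of 16 bytes at stride -32, and 0x01040020
+// (STR ahead of Big_loop) requests 4 blocks of 16 bytes at stride +32.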
1.222 +
1.223 +// Condition register use
1.224 +// cr0[0:2] = (dst-src==0)? return : ((dst-src>0)? copy_bkwd : copy_fwd);
1.225 +// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right";
1.226 +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
1.227 +// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
1.228 +// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
1.229 +// cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0; (Overlap too small for Big loop?)
1.230 +// cr6[1,2] = (DST-SRC>=BC)?1:0; (Okay for v_memmove to copy forward?)
1.231 +// then cr6[2] = (QW == 0)? 1 : 0; (Any full vectors to move?)
1.232 +// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
1.233 +// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
1.234 +// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
1.235 +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
1.236 +// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
1.237 +// then cr7[1] = (QW > 14)? 1 : 0; (>14 vectors to move?)
1.238 +// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
1.239 +
1.240 + .text
1.241 +#ifdef __MWERKS__
1.242 + .align 32
1.243 +#else
1.244 + .align 5
1.245 +#endif
1.246 +
1.247 +#ifdef LIBMOTOVEC
1.248 + .globl memmove
1.249 +memmove:
1.250 + nop // IU1 Compilers forget first label
1.251 + .globl memcpy
1.252 +memcpy:
1.253 +#else
1.254 + .globl vec_memmove
1.255 +vec_memmove:
1.256 + nop // IU1 Only way I know to preserve both labels
1.257 + .globl _vec_memcpy
1.258 +_vec_memcpy:
1.259 +#endif
1.260 + subf. DMS,SRC,DST // IU1 Compute dst-src difference
1.261 + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves
1.262 + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
1.263 +
1.264 + addi SM1,SRC,-1 // IU1 Pre-bias and duplicate src for fwd
1.265 + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
1.266 + add SBC,SRC,BC // IU1 Pre-bias and duplicate src for bkwd
1.267 + beqlr // return if DST = SRC
1.268 +
1.269 + add DBC,DST,BC // IU1 Pre-bias and duplicate destination
1.270 + subf Rt,DST,SRC // IU1 Form |DST-SRC| if DST-SRC<0
1.271 + beqlr cr1 // return if BC = 0
1.272 +
1.273 + bgt Cpy_bkwd // b if DST-SRC>0 (have to copy backward)
1.274 + cmpi cr5,0,Rt,MIN_OVL // IU1 (|DST-SRC|>128)?1:0; for v_memcpy
1.275 + bgt cr7,v_memcpy // b if BC>MIN_VEC (okay to copy vectors fwd)
1.276 +
1.277 +// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC
1.278 + mtctr BC // i=BC; do ...;i--; while (i>0)
1.279 +Byte_cpy_fwd:
1.280 + lbzu Rt,1(SM1) // LSU * ++(DST-1) = * ++(SRC-1)
1.281 + stbu Rt,1(DM1) // LSU
1.282 + bdnz Byte_cpy_fwd
1.283 +
1.284 + blr
1.285 + nop // IU1 Pad so the next label is a better branch target
1.286 +Cpy_bkwd:
1.287 + cmpi cr5,0,DMS,MIN_OVL // IU1 ((DST-SRC)>128)?1:0; for v_memcpy
1.288 + cmp cr6,0,DMS,BC // IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
1.289 + bgt cr7,v_memmove // b if BC>MIN_VEC (copy vectors bkwd)
1.290 +// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
1.291 + mtctr BC // i=BC; do ...;i--; while (i>0)
1.292 +Byte_cpy_bwd:
1.293 + lbzu Rt,-1(SBC) // LSU * --(DST+BC) = * --(SRC+BC)
1.294 + stbu Rt,-1(DBC) // LSU Store it
1.295 + bdnz Byte_cpy_bwd
1.296 + blr
1.297 +
1.298 +#ifdef __MWERKS__
1.299 + .align 16
1.300 +#else
1.301 + .align 4
1.302 +#endif
1.303 +
1.304 +v_memmove:
1.305 +// Byte counts <= MIN_VEC will have been copied by the scalar code above,
1.306 +// so this will not deal with small block moves <= MIN_VEC.
1.307 +
1.308 +// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
1.309 +// that don't, make sure VRSAVE is undefined.
1.310 +#ifdef VRSAVE
1.311 + mfspr RSV,VRSV // IU2 Get current VRSAVE contents
1.312 +#endif
1.313 + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31]
1.314 + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31]
1.315 + bge cr6,MC_entry // b to v_memcpy if DST-SRC>=BC (fwd copy OK)
1.316 +
1.317 +#ifdef VRSAVE
1.318 + oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine
1.319 +#endif
1.320 + lis BLL,0x010c // IU1 Stream 12 blocks of 16 bytes
1.321 + subf. SMD,D,S // IU1 if S-D<0 essentially shifting right
1.322 +
1.323 +#ifdef VRSAVE
1.324 + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
1.325 +#endif
1.326 + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right
1.327 + ori BLL,BLL,0xffe0 // IU1 Stream stride -32B
1.328 +
1.329 + STRM_B // LSU Start data stream at SRC+BC
1.330 + addi SBK,SBC,-1 // IU1 Address of last src byte
1.331 + bgt Rt_shft // Bytes from upper vector = (s-d>0)?s-d:16+s-d;
1.332 + addi SMD,SMD,16 // IU1 Save 16-(d-s)
1.333 +Rt_shft:
1.334 +
1.335 + rlwinm SBR,SBK,0,0,27 // IU1 (SRC+BC-1)[0:27]
1.336 + addi BK,BC,-1 // IU1 Initialize byte index
1.337 +
1.338 + subf Rt,SBR,SBC // IU1 How many bytes in first source?
1.339 + add DBK,DST,BK // IU1 Address of last dst byte
1.340 + addi DR,DST,16 // IU1 Address of second dst vector
1.341 +
1.342 + subf. SMD,Rt,SMD // IU1 if bytes in 1st src>Bytes in 1st permute
1.343 + rlwinm Rt,DBK,0,28,31 // IU1 (DST+BC-1)[28:31]
1.344 + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
1.345 +
1.346 +// If there are more useful bytes in the upper vector of a permute pair than we
1.347 +// will get in the first permute, the first loaded vector needs to be in the
1.348 +// lower half of the permute pair. The upper half is a don't care then.
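+// Hedged example: if (DST-SRC)&0xF == 3, lvsr yields the permute control
+// {13,14,...,28}, so vperm VPS0,VS0,VS1,VP3 returns bytes 13:15 of VS0
+// followed by bytes 0:12 of VS1, i.e. the 16 source bytes that fill one
+// aligned destination quadword.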
1.349 + blt Get_bytes_rt // b if shifting left (D-S>=0)
1.350 +
1.351 + lvx VS1,SRC,BK // LSU Get SN load started
1.352 +// Comments numbering source and destination assume single path through the
1.353 +// code executing each instruction once. For vec_memmove, an example would
1.354 +// be the call memmove(BASE+0x0F, BASE+0x2F, 82). N = 6 in that case.
1.355 + addi SRC,SRC,-16 // IU1 Decrement src base (to keep BK useful)
1.356 +
1.357 +Get_bytes_rt: // Come here to get VS0 & Don't care what VS1 is
1.358 + lvx VS0,SRC,BK // LSU Get SN-1 (SN if D-S<0) in lower vector
1.359 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
1.360 + cmpi cr7,0,Rt,0xF // IU1 Is Dn right justified?
1.361 +
1.362 + cmpi cr1,0,D,0 // IU1 Is D0 left justified?
1.363 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
1.364 + add Rt,DST,BC // IU1 Refresh the value of DST+BC
1.365 +
1.366 + cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
1.367 + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-1 and SN to DN
1.368 + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
1.369 + beq cr7,Rt_just // b if DN is right justified
1.370 +
1.371 + mtcrf 0x01,Rt // IU2 Put final vector byte count in cr7
1.372 + rlwinm DBK,DBK,0,0,27 // IU1 Address of first byte of final vector
1.373 + li D,0 // IU1 Initialize an index pointer
1.374 + bnl cr7,Only_1W_bkwd // b if there was only one or zero words to store
1.375 +
1.376 + stvewx VPS0,DBK,D // LSU store word 1 of two or three
1.377 + addi D,D,4 // IU1 increment index
1.378 +
1.379 + stvewx VPS0,DBK,D // LSU store word 2 of two or three
1.380 + addi D,D,4 // IU1 increment index
1.381 +Only_1W_bkwd:
1.382 + bng cr7,Only_2W_bkwd // b if there were only two or zero words to store
1.383 +
1.384 + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary
1.385 + addi D,D,4 // IU1 increment index
1.386 +Only_2W_bkwd:
1.387 + bne cr7,Only_B_bkwd // b if there are no half words to store
1.388 +
1.389 + stvehx VPS0,DBK,D // LSU store one halfword if necessary
1.390 + addi D,D,2 // IU1 increment index
1.391 +Only_B_bkwd:
1.392 + bns cr7,All_done_bkwd // b if there are no bytes to store
1.393 +
1.394 + stvebx VPS0,DBK,D // LSU store one byte if necessary
1.395 + b All_done_bkwd
1.396 +
1.397 +Rt_just:
1.398 + stvx VPS0,DST,BK // LSU Store 16 bytes at DN
1.399 +All_done_bkwd:
1.400 + addi BK,BK,-16 // IU1 Decrement destination byte count
1.401 +
1.402 + ble cr6,Last_load // b if no Quad words to do
1.403 + mtctr QW // IU2 for (i=0;i<=QW;i++) - execution serializing
1.404 + cmpi cr6,0,QW,4 // IU1 Check QW>4
1.405 +QW_loop:
1.406 + lvx VS0,SRC,BK // LSU Get SN-2 (or SN-1 if ADJ==0)
1.407 +
1.408 + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-2 and SN-1 to DN-1
1.409 + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
1.410 +
1.411 + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-1
1.412 + addi BK,BK,-16 // IU1 Decrement byte kount
1.413 + bdnzf 25,QW_loop // b if 4 or less quad words to do
1.414 +
1.415 + add DNX,DST,BK // IU1 address of next store (DST+BC-1-16)
1.416 + bgt cr6,GT_4QW // b if >4 quad words left
1.417 +
1.418 +Last_load: // if D-S>=0, next load will be from same address as last
1.419 + blt No_ld_bkwd // b if shifting right (S-D>=0)
1.420 + addi SRC,SRC,16 // IU1 Restore src that was decremented earlier
1.421 +No_ld_bkwd:
1.422 + lvx VS0,0,SRC // LSU Get last source SN-6 (guaranteed S0)
1.423 +// Current 16 bytes is the last; we're done.
1.424 + dss 0 // Data stream stop
1.425 + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-6 and SN-5 to DN-6
1.426 + subfic D,DST,16 // IU1 How many bytes in first destination?
1.427 + beq cr1,Lt_just // b if last destination is left justified
1.428 +
1.429 + mtcrf 0x01,D // IU2 Put byte count remaining in cr7
1.430 + li D,0 // IU1 Initialize index pointer
1.431 + bns cr7,No_B_bkwd // b if only even number of bytes to store
1.432 +
1.433 + stvebx VPS0,DST,D // LSU store first byte at DST+0
1.434 + addi D,D,1 // IU1 increment index
1.435 +No_B_bkwd:
1.436 + bne cr7,No_H_bkwd // b if only words to store
1.437 + stvehx VPS0,DST,D // LSU store halfword at DST+0/1
1.438 + addi D,D,2 // IU1 increment index
1.439 +
1.440 +No_H_bkwd:
1.441 + bng cr7,No_W1_bkwd // b if exactly zero or two words to store
1.442 + stvewx VPS0,DST,D // LSU store word 1 of one or three
1.443 + addi D,D,4 // IU1 increment index
1.444 +
1.445 +No_W1_bkwd:
1.446 + bnl cr7,No_W2_bkwd // b if there was only one word to store
1.447 + stvewx VPS0,DST,D // LSU store word 1 of two or 2 of three
1.448 + addi D,D,4 // IU1 increment index
1.449 +
1.450 + stvewx VPS0,DST,D // LSU store word 2 of two or 3 of three
1.451 + b No_W2_bkwd
1.452 +
1.453 +Lt_just:
1.454 + stvx VPS0,0,DST // LSU Store 16 bytes at final dst addr D0
1.455 +No_W2_bkwd:
1.456 +#ifdef VRSAVE
1.457 + mtspr VRSV,RSV // IU1 Restore VRSAVE
1.458 +#endif
1.459 + blr // Return destination address from entry
1.460 +
1.461 +GT_4QW: // Do once if next store is to even half of cache line, else twice
1.462 +
1.463 + lvx VS0,SRC,BK // LSU Get SN-3 (or SN-2)
1.464 + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;
1.465 +
1.466 + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-3 and SN-2 to Dn-2
1.467 + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
1.468 + addi DNX,DNX,-16 // IU1 Prepare to update cr6 next loop
1.469 +
1.470 + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-2
1.471 + vor VS3,VS0,VS0 // VIU Make a copy of lower vector
1.472 + addi BK,BK,-16 // IU1 Decrement byte count by 16
1.473 + bdnzt 27,GT_4QW // b if next store is to upper (odd) half of CL
1.474 +// At this point next store will be to even address.
1.475 +
1.476 + lis STR,0x102 // IU1 Stream 2 blocks of 16 bytes
1.477 + mtcrf 0x02,DST // IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)
1.478 + addi BL,BK,-16 // IU1 Create an alternate byte count - 16
1.479 +
1.480 + ori STR,STR,0xffe0 // IU1 Stream stride -32B
1.481 + addi SP8,SRC,-64 // IU1 Starting address for data stream touch
1.482 + bso cr6,B32_bkwd // b if DST[27] == 1; i.e., final store is odd
1.483 +
1.484 + bdnz B32_bkwd // decrement counter for last odd QW store
1.485 +B32_bkwd: // Should be at least 2 stores remaining and next 2 are cache aligned
1.486 + lvx VS2,SRC,BK // LSU Get SN-4 (or SN-3)
1.487 + addi SP8,SP8,-32 // IU1 Next starting address for data stream touch
1.488 +
1.489 + lvx VS1,SRC,BL // LSU Get SN-5 (or SN-4)
1.490 + vperm VPS0,VS2,VS3,VP3 // VPU Align SN-4 and SN-3 to DN-3
1.491 +
1.492 + STRM_1 // LSU Stream 64 byte blocks ahead of loads
1.493 +
1.494 + DCBL // LSU allocate next cache line
1.495 +
1.496 + vperm VPS1,VS1,VS2,VP3 // VPU Align SN-5 and SN-4 to DN-4
1.497 + vor VS3,VS1,VS1 // VIU1 Move SN-5 to SN-3
1.498 +
1.499 + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-3
1.500 + addi BK,BL,-16 // IU1 Decrement byte count
1.501 + bdz Nxt_loc_bkwd // always decrement and branch to next instr
1.502 +
1.503 +Nxt_loc_bkwd:
1.504 + stvx VPS1,DST,BL // LSU Store 16 bytes at DN-4
1.505 + addi BL,BK,-16 // IU1 Decrement alternate byte count
1.506 + bdnz B32_bkwd // b if there are at least two more QWs to do
1.507 +
1.508 + bns cr6,One_odd_QW // b if there was one more odd QW to store
1.509 + b Last_load
1.510 +
1.511 +// Come here with two more loads and two stores to do
1.512 +One_odd_QW:
1.513 + lvx VS1,SRC,BK // LSU Get SN-6 (or SN-5)
1.514 +
1.515 + vperm VPS1,VS1,VS3,VP3 // VPU Align SN-6 and SN-5 to DN-5
1.516 +
1.517 + stvx VPS1,DST,BK // LSU Store 16 bytes at DN-5
1.518 +
1.519 + b Last_load
1.520 +
1.521 +// End of memmove in AltiVec
1.522 +
1.523 +#ifdef __MWERKS__
1.524 + .align 16
1.525 +#else
1.526 + .align 4
1.527 +#endif
1.528 +v_memcpy:
1.529 +// Byte counts <= MIN_VEC will have been copied by the scalar code above,
1.530 +// so this will not deal with small block moves <= MIN_VEC.
1.531 +
1.532 +#ifdef VRSAVE
1.533 + mfspr RSV,VRSV // IU2 Get current VRSAVE contents
1.534 +#endif
1.535 + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31]
1.536 + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31]
1.537 +
1.538 +MC_entry: // enter here from memmove if DST-SRC>=BC; this should be faster
1.539 +#ifdef VRSAVE
1.540 + oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine
1.541 +#endif
1.542 + lis BLK,0x010c // IU1 Stream 12 blocks of 16 bytes
1.543 +
1.544 + subf. S,S,D // IU1 if D-S<0 essentially shifting left
1.545 +
1.546 +#ifdef VRSAVE
1.547 + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
1.548 +#endif
1.549 + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right
1.550 + ori BLK,BLK,32 // IU1 Stream stride 32B
1.551 +
1.552 + STRM_F // LSU Start data stream 0 at SRC
1.553 + addi DR,DST,16 // IU1 Address of second dst vector
1.554 + addi DBK,DBC,-1 // IU1 Address of last dst byte
1.555 +
1.556 +// If D-S<0 we are "kinda" shifting left with the right shift permute vector
1.557 +// loaded to VP3 and we need both S0 and S1 to permute. If D-S>=0 then the
1.558 +// first loaded vector needs to be in the upper half of the permute pair and
1.559 +// the lower half is a don't care then.
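+// Hedged example: for the memcpy(BASE+0x1E, BASE+0x1F, 259) call used in
+// the comments below, (DST-SRC)&0xF == 15, so lvsr yields the control
+// vector {1,2,...,16} and each vperm selects bytes 1:15 of its lower
+// source vector followed by byte 0 of its upper one.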
1.560 + bge Ld_bytes_rt // b if shifting right (D-S>=0)
1.561 +
1.562 + lvx VS0,0,SRC // LSU Get S0 load started
1.563 +// Comments numbering source and destination assume single path through the
1.564 +// code executing each instruction once. For vec_memcpy, an example would
1.565 +// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case.
1.566 + addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful)
1.567 +
1.568 +Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is
1.569 + lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector
1.570 + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
1.571 + cmpi cr1,0,D,0 // IU1 Is D0 left justified?
1.572 +
1.573 + subf Rt,DST,DR // IU1 How many bytes in first destination?
1.574 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
1.575 + li BK,0 // IU1 Initialize byte kount index
1.576 +
1.577 + mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7
1.578 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
1.579 + vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0
1.580 +
1.581 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
1.582 + beq cr1,Left_just // b if D0 is left justified
1.583 +
1.584 + bns cr7,No_B_fwd // b if only even number of bytes to store
1.585 +
1.586 + stvebx VPS0,DST,BK // LSU store first byte at DST+0
1.587 + addi BK,BK,1 // IU1 increment index
1.588 +No_B_fwd:
1.589 + bne cr7,No_H_fwd // b if only words to store
1.590 +
1.591 + stvehx VPS0,DST,BK // LSU store halfword at DST+0/1
1.592 + addi BK,BK,2 // IU1 increment index
1.593 +No_H_fwd:
1.594 + bng cr7,No_W1_fwd // b if exactly zero or two words to store
1.595 +
1.596 + stvewx VPS0,DST,BK // LSU store word 1 of one or three
1.597 + addi BK,BK,4 // IU1 increment index
1.598 +
1.599 +No_W1_fwd:
1.600 + bnl cr7,No_W2_fwd // b if there was only one word to store
1.601 + stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three
1.602 + addi BK,BK,4 // IU1 increment index
1.603 +
1.604 + stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three
1.605 + b No_W2_fwd
1.606 +
1.607 +Left_just:
1.608 + stvx VPS0,0,DST // LSU Store 16 bytes at D0
1.609 +No_W2_fwd:
1.610 + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
1.611 + cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
1.612 +
1.613 + li BK,16 // IU1 Re-initialize byte kount index
1.614 + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
1.615 + cmpi cr7,0,QW,14 // IU1 Check QW>14
1.616 + ble cr6,Last_ld_fwd // b if no Quad words to do
1.617 +
1.618 + mtctr QW // IU2 for (i=0;i<=QW;i++)
1.619 + cmpi cr6,0,QW,4 // IU1 Check QW>4
1.620 +QW_fwd_loop:
1.621 + lvx VS1,SRC,BK // LSU Get S2 (or S1)
1.622 +
1.623 + vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1
1.624 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
1.625 +
1.626 + stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4)
1.627 + addi BK,BK,16 // IU1 Increment byte kount index
1.628 + bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do
1.629 +
1.630 + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
1.631 + addi QW,QW,-1 // IU1 One more QW stored by now
1.632 + bgt cr6,GT_4QW_fwd // b if >4 quad words left
1.633 +
1.634 +Last_ld_fwd: // Next 16 bytes is the last; we're done.
1.635 + add DBC,DST,BC // IU1 Recompute address of last dst byte + 1
1.636 + add SBC,SRC,BC // IU1 Recompute address of last src byte + 1
1.637 + bge No_ld_fwd // b if shifting right (D-S>=0)
1.638 +
1.639 + addi SBC,SBC,-16 // IU1 D-S<0 here: undo the +16 added to src earlier
1.640 +No_ld_fwd:
1.641 + mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
1.642 + addi DBK,DBC,-1 // IU1 Recompute address of last dst byte
1.643 + addi Rt,SBC,-1 // IU1 Recompute address of last src byte
1.644 +
1.645 +// If D-S<0 we have already loaded all the source vectors.
1.646 +// If D-S>=0 then the first loaded vector went to the upper half of the permute
1.647 +// pair and we need one more vector. (This may be a duplicate.)
1.648 +
1.649 + lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN)
1.650 +
1.651 +#ifndef NO_DST
1.652 + dss 0 // Data stream 0 stop
1.653 +
1.654 + dss 1 // Data stream 1 stop
1.655 +#endif
1.656 + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14
1.657 + beq cr1,Rt_just_fwd // b if last destination is right justified
1.658 +
1.659 + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
1.660 + li D,0 // IU1 Initialize index pointer
1.661 + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
1.662 +
1.663 + stvewx VPS0,DBK,D // LSU store word 1 of two or three
1.664 + addi D,D,4 // IU1 increment index
1.665 +
1.666 + stvewx VPS0,DBK,D // LSU store word 2 of two or three
1.667 + addi D,D,4 // IU1 increment index
1.668 +Only_1W_fwd:
1.669 + bng cr7,Only_2W_fwd // b if there were only two or zero words to store
1.670 +
1.671 + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary
1.672 + addi D,D,4 // IU1 increment index
1.673 +Only_2W_fwd:
1.674 + bne cr7,Only_B_fwd // b if there are no half words to store
1.675 +
1.676 + stvehx VPS0,DBK,D // LSU store one halfword if necessary
1.677 + addi D,D,2 // IU1 increment index
1.678 +Only_B_fwd:
1.679 + bns cr7,All_done_fwd // b if there are no bytes to store
1.680 +
1.681 + stvebx VPS0,DBK,D // LSU store one byte if necessary
1.682 + b All_done_fwd
1.683 +
1.684 +Rt_just_fwd:
1.685 +
1.686 + stvx VPS0,DST,BK // LSU Store 16 bytes at D14
1.687 +All_done_fwd:
1.688 +#ifdef VRSAVE
1.689 + mtspr VRSV,RSV // IU1 Restore VRSAVE
1.690 +#endif
1.691 + blr // Return destination address from entry
1.692 +#ifdef __MWERKS__
1.693 + .align 16
1.694 +#else
1.695 + .align 4
1.696 +#endif
1.697 +GT_4QW_fwd: // Do once if next store is to odd half of cache line, else twice
1.698 +
1.699 + lvx VS1,SRC,BK // LSU Get S3 (or S2)
1.700 + addi QW,QW,-1 // IU1 Keeping track of QWs stored
1.701 + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
1.702 +
1.703 + addi DNX,DNX,16 // IU1 Update cr6 for next loop
1.704 + addi Rt,QW,-2 // IU1 Ensure at least 2 QW left after big loop
1.705 +
1.706 + vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2
1.707 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
1.708 +
1.709 + stvx VPS0,DST,BK // LSU Store 16 bytes at D2
1.710 + addi BK,BK,16 // IU1 Increment byte count by 16
1.711 + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
1.712 +// At this point next store will be to even address.
1.713 +
1.714 + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
1.715 + lis STR,0x104 // IU1 Stream 4 blocks of 16 bytes
1.716 + addi BL,BK,16 // IU1 Create an alternate byte kount + 32
1.717 +
1.718 + ori STR,STR,32 // IU1 Stream stride 32B
1.719 +#ifndef NO_BIG_LOOP
1.720 + rlwinm BIG,Rt,29,3,31 // IU1 QW/8 big loops to do
1.721 +
1.722 + rlwinm Rt,Rt,0,0,28 // IU1 How many QWs will be done in big loop
1.723 + bgt cr7,Big_loop // b if QW > 14
1.724 +#endif
1.725 +No_big_loop:
1.726 +// We need the ctr register to reflect an even byte count before entering
1.727 +// the next block - faster to decrement than to reload.
1.728 +
1.729 + addi SP8,SRC,256 // IU1 Starting address for data stream touch
1.730 + xoris STR,STR,0x6 // IU1 Reset stream to 2 blocks of 16 bytes
1.731 + bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
1.732 +
1.733 + bdnz B32_fwd // decrement counter for last QW store odd
1.734 +
1.735 +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
1.736 + lvx VS1,SRC,BK // LSU Get S12
1.737 + addi SP8,SP8,32 // IU1 Next starting address for data stream touch
1.738 +
1.739 + lvx VS2,SRC,BL // LSU Get S13
1.740 + vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11
1.741 +
1.742 + STRM_1 // LSU Stream 64 byte blocks ahead of loads
1.743 +
1.744 + DCBK // LSU then Kill instead of RWITM
1.745 +
1.746 + vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12
1.747 + vor VS0,VS2,VS2 // VIU1 Move S13 to S11
1.748 +
1.749 + stvx VPS1,DST,BK // LSU Store 16 bytes at D11
1.750 + addi BK,BL,16 // IU1 Increment byte count
1.751 + bdz Nxt_loc_fwd // always decrement and branch to next instr
1.752 +
1.753 +Nxt_loc_fwd:
1.754 + stvx VPS0,DST,BL // LSU Store 16 bytes at D12
1.755 + addi BL,BK,16 // IU1 Increment alternate byte count
1.756 + bdnz B32_fwd // b if there are at least two more QWs to do
1.757 +
1.758 + bso cr6,One_even_QW // b if there is one even and one odd QW to store
1.759 + b Last_ld_fwd // b if last store is to even address
1.760 +
1.761 +// Come here with two more loads and two stores to do
1.762 +One_even_QW:
1.763 + lvx VS1,SRC,BK // LSU Get S14 (or S13 if D-S>=0)
1.764 +
1.765 + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13
1.766 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
1.767 +
1.768 + stvx VPS0,DST,BK // LSU Store 16 bytes at D13
1.769 + addi BK,BK,16 // IU1 Increment byte count
1.770 +
1.771 + b Last_ld_fwd
1.772 +
1.773 +#ifdef __MWERKS__
1.774 + .align 16
1.775 +#else
1.776 + .align 4
1.777 +#endif
1.778 +Big_loop:
1.779 + subf QW,Rt,QW // IU1 Should be 2-7 QWs left after big loop
1.780 + blt cr5,No_big_loop // b back if |DST-SRC|<128; Big_loop won't work.
1.781 + mtctr BIG // IU2 loop for as many 128B loops as possible
1.782 + addi SP8,SRC,256 // IU1 Starting address for data stream touch
1.783 +
1.784 +Loop_of_128B: // Come here with QW>=10 and next store even; VS0 last load
1.785 + lvx VS1,SRC,BK // LSU Get S4 (or S3 if D-S>=0)
1.786 + addi BL,BK,32 // IU1 Increment Byte_Kount+16 by 32
1.787 + addi SP8,SP8,128 // IU1 increment address for data stream touch
1.788 +
1.789 + lvx VS3,SRC,BL // LSU Get S6 (or S5)
1.790 + addi BL,BL,32 // IU1 Increment Byte_Kount+48 by 32
1.791 +
1.792 + lvx VS5,SRC,BL // LSU Get S8 (or S7)
1.793 + addi BL,BL,32 // IU1 Increment Byte_Kount+80 by 32
1.794 +
1.795 + lvx VS7,SRC,BL // LSU Get S10 (or S9)
1.796 + addi BL,BK,16 // IU1 Increment Byte_Kount+16 by 16
1.797 +
1.798 + lvx VS2,SRC,BL // LSU Get S5 (or S4)
1.799 + addi BL,BL,32 // IU1 Increment Byte_Kount+32 by 32
1.800 +
1.801 + lvx VS4,SRC,BL // LSU Get S7 (or S6)
1.802 + addi BL,BL,32 // IU1 Increment Byte_Kount+64 by 32
1.803 +
1.804 + lvx VS6,SRC,BL // LSU Get S9 (or S8)
1.805 + addi BL,BL,32 // IU1 Increment Byte_Kount+96 by 32
1.806 + vperm VPS0,VS0,VS1,VP3 // VPU
1.807 +
1.808 + lvx VS0,SRC,BL // LSU Get S11 (or S10)
1.809 + vperm VPS1,VS1,VS2,VP3 // VPU
1.810 +
1.811 + STRM_1 // LSU Stream 4 32B blocks, stride 32B
1.812 +
1.813 + DCBK // LSU then Kill instead of RWITM
1.814 +
1.815 + stvx VPS0,DST,BK // LSU Store D3
1.816 + addi BK,BK,16 // IU1 Increment Byte_Kount+16 by 16
1.817 + vperm VPS2,VS2,VS3,VP3 // VPU
1.818 +
1.819 + stvx VPS1,DST,BK // LSU Store D4
1.820 + addi BK,BK,16 // IU1 Increment Byte_Kount+32 by 16
1.821 + vperm VPS3,VS3,VS4,VP3 // VPU
1.822 +
1.823 + DCBK // LSU then Kill instead of RWITM
1.824 +
1.825 + stvx VPS2,DST,BK // LSU Store D5
1.826 + addi BK,BK,16 // IU1 Increment Byte_Kount+48 by 16
1.827 + vperm VPS4,VS4,VS5,VP3 // VPU
1.828 +
1.829 + stvx VPS3,DST,BK // LSU Store D6
1.830 + addi BK,BK,16 // IU1 Increment Byte_Kount+64 by 16
1.831 + vperm VPS5,VS5,VS6,VP3 // VPU
1.832 +
1.833 + DCBK // LSU then Kill instead of RWITM
1.834 +
1.835 + stvx VPS4,DST,BK // LSU Store D7
1.836 + addi BK,BK,16 // IU1 Increment Byte_Kount+80 by 16
1.837 + vperm VPS6,VS6,VS7,VP3 // VPU
1.838 +
1.839 + stvx VPS5,DST,BK // LSU Store D8
1.840 + addi BK,BK,16 // IU1 Increment Byte_Kount+96 by 16
1.841 + vperm VPS7,VS7,VS0,VP3 // VPU
1.842 +
1.843 + DCBK // LSU then Kill instead of RWITM
1.844 +
1.845 + stvx VPS6,DST,BK // LSU Store D9
1.846 + addi BK,BK,16 // IU1 Increment Byte_Kount+112 by 16
1.847 +
1.848 + stvx VPS7,DST,BK // LSU Store D10
1.849 + addi BK,BK,16 // IU1 Increment Byte_Kount+128 by 16
1.850 + bdnz Loop_of_128B // b if ctr > 0 (QW/8 still > 0)
1.851 +
1.852 + mtctr QW // IU1 Restore QW remaining to counter
1.853 + addi BL,BK,16 // IU1 Create an alternate byte kount + 16
1.854 + bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
1.855 +
1.856 + bdnz B32_fwd // b and decrement counter for last QW store odd
1.857 + // One of the above branches should have been taken
1.858 +
1.859 +// End of memcpy in AltiVec
1.860 +
1.861 +// bcopy works like memcpy, but the source and destination operands are reversed.
1.862 +// The following just reverses the operands and branches to memcpy.
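+// In C terms (a hedged sketch; note this library's bcopy returns dst,
+// unlike the traditional void-returning bcopy):
+//   void *bcopy(const void *src, void *dst, size_t len)
+//   { return memcpy(dst, src, len); }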
1.863 +
1.864 +#ifdef LIBMOTOVEC
1.865 + .globl bcopy
1.866 +bcopy:
1.867 +#else
1.868 + .globl vec_bcopy
1.869 +vec_bcopy:
1.870 +#endif
1.871 + mr Rt,DST // temp storage for what is really source address (r3)
1.872 + mr DST,SRC // swap destination address to r3 to match memcpy dst
1.873 + mr SRC,Rt // Complete swap of destination and source for memcpy
1.874 +#ifdef LIBMOTOVEC
1.875 + b memcpy // b to memcpy with correct args in r3 and r4
1.876 +#else
1.877 + b _vec_memcpy // b to vec_memcpy with correct args in r3 and r4
1.878 +#endif
1.879 +// End of bcopy in AltiVec