//------------------------------------------------------------------
// file: vec_memcpy.S
//    AltiVec enabled version of memcpy, memmove, and bcopy
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2003
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.  Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * memcpy(void *dst, const void *src, size_t len);
// Returns:
//	void *dst
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * memmove(void *dst, const void *src, size_t len);
//	Copies len characters from src to dst and returns the value of
//	dst.  Works correctly for overlapping memory regions.
//	- Harbison & Steele, 4th ed. (corrected as to return)
// Returns:
//	void *dst
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * bcopy(const void *src, void *dst, size_t len);
// Returns:
//	void *dst
//------------------------------------------------------------------

// memcpy and memmove are combined into one entry point here because of
// the similarity of operation and the need to create fool-proof code.
// The following conditions determine what is "fool proof":
//
//	if:						then single entry:
//	(DST-SRC)<0  && (SRC-DST)>=BC && BC>MIN_VEC	will b to v_memcpy
//	(DST-SRC)<0  && (SRC-DST)< BC && BC>MIN_VEC	must b to v_memcpy
//	(DST-SRC)<0  && BC<=MIN_VEC			copy forward byte-by-byte
//	(DST-SRC)==0 || BC==0				will just return
//	(DST-SRC)>0  && BC<=MIN_VEC			copy backward byte-by-byte
//	(DST-SRC)>0  && (DST-SRC)< BC && BC>MIN_VEC	must b to v_memmove
//	(DST-SRC)>0  && (DST-SRC)>=BC && BC>MIN_VEC	will b to v_memmove

// If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
// this code will branch to v_memcpy anyway for maximum performance.
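
// (Illustrative note, not part of the original source.)  In rough C-like
// pseudocode the entry dispatch described above behaves as follows; v_memcpy
// and v_memmove are the vector paths defined later in this file, and the
// byte-by-byte loops are the scalar fallbacks:
//
//	void *entry(void *dst, const void *src, size_t bc)
//	{
//	    ptrdiff_t d = (char *)dst - (char *)src;
//	    if (d == 0 || bc == 0) return dst;		// nothing to move
//	    if (d < 0)					// dst below src
//	        (bc > MIN_VEC) ? v_memcpy : copy forward byte-by-byte;
//	    else					// dst above src
//	        (bc > MIN_VEC) ? v_memmove : copy backward byte-by-byte;
//	    return dst;
//	}
//
// v_memmove itself branches into the v_memcpy (forward) path when d >= bc,
// since the regions then do not overlap.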

// Revision History:
//	Rev 0.0	 Original					Chuck Corley	02/03/03
//		 Can still add dst, 128B loop, and aligned option
//	Rev 0.01 Fixed JY's seg-fault violation			CJC	02/17/03
//	Rev 0.1	 Added 128B loop and dst; cndtnlzd dcbz		CJC	02/18/03
//		 (Creating separate path for QW aligned didn't help much)
//	Rev 0.11 Small code schdling; chngd dst for memmove	CJC	02/23/03
//	Rev 0.20 Eliminated alternate entry and cleanup		CJC	02/27/03
//	Rev 0.21 Improved loop branch targets for v_memcpy	CJC	03/01/03
//	Rev 0.22 Experimented with dst (sent to H.)		CJC	03/02/03
//	Rev 0.23 Substituted dcba for dcbz (sent to JY)		CJC	03/08/03
//	Rev 0.24 Use two dst streams				CJC	03/12/03
//	Rev 0.25 Fix for all compilers, cleanup, and release with
//		 libmotovec.a rev 0.10				CJC	03/14/03
//	Rev 0.30 Fix for pre-empted destination (SNDF-DS)	CJC	04/02/03
//
// Between Rev 0.25 and 0.30 the code was revised to store elements of
// source at destination when first and/or last vector are less than 16
// bytes.  A reviewer at SNDF observed that loading the destination vector
// for merging exposed the "uninvolved" destination bytes to incoherency
// if an interrupt pre-empted this routine and modified the "uninvolved"
// destination vector(s) while held in register for merging.  It seems
// like a low possibility but this revision is no longer subject to that
// possibility.  (It is also slightly faster than Rev 0.25.)
// This is beta quality code; users are encouraged to make it faster.
// ASSUMPTIONS:
//	Code is highly likely to be in the cache; data is not (streaming data)

#define VRSV 256	// VRSAVE spr
// Don't use vectors for BC <= MIN_VEC.  Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16
// Don't use Big_loop in v_memcpy for |dst-src| <= minimum overlap.
#define MIN_OVL 128

// Register usage
#define Rt r0	// r0 when used as a temporary register

#define DST r3	// entering: dst pointer; exiting: same dst pointer

#define SRC r4	// entering: src ptr; then end of src range index (SRC+BC) in memmove

#define BC r5	// entering: Byte_Count

#define PCS r6	// save for partial checksum entering

#define DMS r7	// dst - src initially
#define BK r7	// BC - 1 +/- (n*16)

// Codewarrior will put an unwelcome space as "lbzu	r0,1(r7 )"
// if you don't put the comment right after the r7.	CJC 030314
#define SM1 r8// src -1 for byte-by-byte forwards initially
#define S r8	// src[28:31]
#define SMD r8	// src[0:27]-dst[0:27]
#define STR r8	// data stream touch block & stride info for Big_loop

#define DM1 r9// dst -1 for byte-by-byte forwards initially
#define D r9	// dst[28:31]
#define DNX r9	// (dst+n*16)[28:31]
#define BL r9	// second byte_kount index pointer

#define SBC r10// src + byte count initially then src[28:31]
#define BLK r10 // temporary data stream touch block & stride info
#define DR r10	// (dst+16)[0:27]
#define QW r10	// number of quad words (vectors)

#define DBC r11// dst + byte count initially
#define BLL r11 // temporary data stream touch block & stride info
#define SBK r11	// (src+byte_count-1)
#define SBR r11	// (src+byte_count-1)[0:27]
#define DBK r11	// (dst+byte_count-1) then (dst+byte_count-1)[28:31]
#define BIG r11	// QW/8 or 128 byte loop count
#define SP8 r11	// SRC + n*128 (8 QWs) for data streaming after first call

#define RSV r12	// storage for VRSAVE register if used

#define VS0 v0	// src vector for permuting

#define VS1 v1	// src vector for permuting

#define VP3 v2	// d - s permute register

#define VPS0 v3	// permuted source vector to store

#define VPS1 v4	// 2nd permuted source vector to store

#define VPS2 v5	// additional permuted src in Big loop

#define VS2 v6	// src vector for permuting
#define VPS3 v6	// additional permuted src in Big loop

#define VS3 v7	// additional src load in Big loop
#define VPS4 v7	// additional permuted src in Big loop

#define VS4 v8	// additional src load in Big loop
#define VPS5 v8	// additional permuted src in Big loop

#define VS5 v9	// additional src load in Big loop
#define VPS6 v9	// additional permuted src in Big loop

#define VS6 v10	// additional src load in Big loop
#define VPS7 v10	// additional permuted src in Big loop

#define VS7 v11	// additional src load in Big loop

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcbz is a performance boost.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc and codewarrior and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba	r3,r7		or	dcba	DST,BK
#define DCBL .long 0x7c034dec
// dcba	r3,r9		or	dcba	DST,BL
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
.macro DCBL
.long 0x7c034dec
.endm
#else
#define DCBK dcba DST,BK
#define DCBL dcba DST,BL
#endif // __ghs__
#endif // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#define DCBL nop
#endif // NO_DCBA
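
// (Illustrative note, not part of the original source.)  The hard-coded words
// above are the standard PowerPC encodings of dcba (primary opcode 31,
// extended opcode 758), spelled out because those assemblers reject the
// mnemonic:
//	0x7c033dec = 011111 00000 00011 00111 1011110110 0  =>  dcba r3,r7  (dcba DST,BK)
//	0x7c034dec = 011111 00000 00011 01001 1011110110 0  =>  dcba r3,r9  (dcba DST,BL)
// Only the RB field (r7 vs. r9) differs between DCBK and DCBL.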

// Conditionalize the use of dst (data stream touch).  It will help
// if the data is not in cache and hurt if it is (though not as badly
// as dcbz).  Generally, except for small benchmarks repeated many times,
// we assume data is not in cache (data streaming) and using dst is a
// performance boost.
#ifndef NO_DST
#define STRM_B	dst	SBC,BLL,0
#define STRM_F	dst	SRC,BLK,0
#define STRM_1	dst	SP8,STR,1

#else
#define STRM_B	nop
#define STRM_F	nop
#define STRM_1	nop
#endif

// Condition register use
//	cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;);
//	then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right";
//	cr1[0,2] = (BC == 0)? 1 : 0;  (nothing to move)
//	then cr1[2] = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
//	then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0;  (DN right justified)
//	cr5[0,2] = (|DST-SRC|<=MIN_OVL)? 1 : 0;  (Overlap too small for Big loop?)
//	cr6[1,2] = (DST-SRC>=BC)? 1 : 0;  (Okay for v_memmove to copy forward?)
//	then cr6[2] = (QW == 0)? 1 : 0;  (Any full vectors to move?)
//	then cr6[1] = (QW > 4)? 1 : 0;  (>4 vectors to move?)
//	then cr6[3] = (third store[27] == 1)? 1 : 0;  (cache line alignment)
//	then cr6[3] = (last store[27] == 1)? 1 : 0;  (last store odd?)
//	cr7[2] = (BC>MIN_VEC)? 1 : 0;  (BC big enough to warrant vectors)
//	then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
//	then cr7[1] = (QW > 14)? 1 : 0;  (>14 vectors to move?)
//	then cr7[0:3] = (DST+BC)[0:27]  (How many bytes (iff <16) in last vector?)
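
// (Illustrative note, not part of the original source.)  The cr7[0:3] usage
// above works because mtcrf 0x01,Rx copies bits 28:31 of Rx - the residual
// byte count 0-15 - straight into cr7, so the four CR bits carry the 8/4/2/1
// weights of that count.  The partial-vector store sequences below test them
// with bnl/bng/bne/bns cr7:
//	cr7[0] (LT, weight 8) set -> store two words    (stvewx, stvewx)
//	cr7[1] (GT, weight 4) set -> store one word     (stvewx)
//	cr7[2] (EQ, weight 2) set -> store a halfword   (stvehx)
//	cr7[3] (SO, weight 1) set -> store a byte       (stvebx)
// For example, a residue of 13 = 8+4+1 stores three words and one byte.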

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	memmove
memmove:
	nop				// IU1 Compilers forget first label
	.globl	memcpy
memcpy:
#else
	.globl	vec_memmove
vec_memmove:
	nop				// IU1 Only way I know to preserve both labels
	.globl	_vec_memcpy
_vec_memcpy:
#endif
	subf.	DMS,SRC,DST		// IU1 Compute dst-src difference
	cmpi	cr1,0,BC,0		// IU1 Eliminate zero byte count moves
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count

	addi	SM1,SRC,-1		// IU1 Pre-bias and duplicate src for fwd
	addi	DM1,DST,-1		// IU1 Pre-bias and duplicate destination
	add	SBC,SRC,BC		// IU1 Pre-bias and duplicate src for bkwd
	beqlr				// return if DST = SRC

	add	DBC,DST,BC		// IU1 Pre-bias and duplicate destination
	subf	Rt,DST,SRC		// IU1 Form |DST-SRC| if DST-SRC<0
	beqlr	cr1			// return if BC = 0

	bgt	Cpy_bkwd		// b if DST-SRC>0 (have to copy backward)
	cmpi	cr5,0,Rt,MIN_OVL	// IU1 (|DST-SRC|>128)?1:0; for v_memcpy
	bgt	cr7,v_memcpy		// b if BC>MIN_VEC (okay to copy vectors fwd)

// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC
	mtctr	BC			// i=BC; do ...; i--; while (i>0)
Byte_cpy_fwd:
	lbzu	Rt,1(SM1)		// LSU * ++(DST-1) = * ++(SRC-1)
	stbu	Rt,1(DM1)		// LSU
	bdnz	Byte_cpy_fwd

	blr
	nop				// IU1 Improve next label as branch target
Cpy_bkwd:
	cmpi	cr5,0,DMS,MIN_OVL	// IU1 ((DST-SRC)>128)?1:0; for v_memcpy
	cmp	cr6,0,DMS,BC		// IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
	bgt	cr7,v_memmove		// b if BC>MIN_VEC (copy vectors bkwd)
// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
	mtctr	BC			// i=BC; do ...; i--; while (i>0)
Byte_cpy_bwd:
	lbzu	Rt,-1(SBC)		// LSU * --(DST+BC) = * --(SRC+BC)
	stbu	Rt,-1(DBC)		// LSU Store it
	bdnz	Byte_cpy_bwd
	blr

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif

v_memmove:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV		// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31		// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31		// IU1 D = dst[28:31]
	bge	cr6,MC_entry		// b to v_memcpy if DST-SRC>=BC (fwd copy OK)

#ifdef VRSAVE
	oris	Rt,RSV,0xfff0		// IU1 Or in registers used by this routine
#endif
	lis	BLL,0x010c		// IU1 Stream 12 blocks of 16 bytes
	subf.	SMD,D,S			// IU1 if S-D<0 essentially shifting right

#ifdef VRSAVE
	mtspr	VRSV,Rt			// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS		// LSU Permute vector for dst - src shft right
	ori	BLL,BLL,0xffe0		// IU1 Stream stride -32B

	STRM_B				// LSU Start data stream at SRC+BC
	addi	SBK,SBC,-1		// IU1 Address of last src byte
	bgt	Rt_shft			// Bytes from upper vector = (s-d>0)?s-d:16+s-d;
	addi	SMD,SMD,16		// IU1 Save 16-(d-s)
Rt_shft:

	rlwinm	SBR,SBK,0,0,27		// IU1 (SRC+BC-1)[0:27]
	addi	BK,BC,-1		// IU1 Initialize byte index

	subf	Rt,SBR,SBC		// IU1 How many bytes in first source?
	add	DBK,DST,BK		// IU1 Address of last dst byte
	addi	DR,DST,16		// IU1 Address of second dst vector

	subf.	SMD,Rt,SMD		// IU1 if bytes in 1st src > bytes in 1st permute
	rlwinm	Rt,DBK,0,28,31		// IU1 (DST+BC-1)[28:31]
	rlwinm	DR,DR,0,0,27		// IU1 (DST+16)[0:27]

// If there are more useful bytes in the upper vector of a permute pair than we
// will get in the first permute, the first loaded vector needs to be in the
// lower half of the permute pair.  The upper half is a don't care then.
	blt	Get_bytes_rt		// b if shifting left (D-S>=0)

	lvx	VS1,SRC,BK		// LSU Get SN load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memmove, an example would
// be the call memmove(BASE+0x0F, BASE+0x2F, 82).  N = 6 in that case.
	addi	SRC,SRC,-16		// IU1 Decrement src base (to keep BK useful)

Get_bytes_rt:	// Come here to get VS0 & Don't care what VS1 is
	lvx	VS0,SRC,BK		// LSU Get SN-1 (SN if D-S<0) in lower vector
	subf	QW,DR,DBK		// IU1 Bytes of full vectors to move (-16)
	cmpi	cr7,0,Rt,0xF		// IU1 Is DN right justified?

	cmpi	cr1,0,D,0		// IU1 Is D0 left justified?
	rlwinm	QW,QW,28,4,31		// IU1 Quad words remaining
	add	Rt,DST,BC		// IU1 Refresh the value of DST+BC

	cmpi	cr6,0,QW,0		// IU1 Any full vectors to move?
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-1 and SN to DN
	vor	VS1,VS0,VS0		// VIU1 Move lower vector to upper
	beq	cr7,Rt_just		// b if DN is right justified

	mtcrf	0x01,Rt			// IU2 Put final vector byte count in cr7
	rlwinm	DBK,DBK,0,0,27		// IU1 Address of first byte of final vector
	li	D,0			// IU1 Initialize an index pointer
	bnl	cr7,Only_1W_bkwd	// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D		// LSU store word 1 of two or three
	addi	D,D,4			// IU1 increment index

	stvewx	VPS0,DBK,D		// LSU store word 2 of two or three
	addi	D,D,4			// IU1 increment index
Only_1W_bkwd:
	bng	cr7,Only_2W_bkwd	// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D		// LSU store word 3 of three if necessary
	addi	D,D,4			// IU1 increment index
Only_2W_bkwd:
	bne	cr7,Only_B_bkwd		// b if there are no half words to store

	stvehx	VPS0,DBK,D		// LSU store one halfword if necessary
	addi	D,D,2			// IU1 increment index
Only_B_bkwd:
	bns	cr7,All_done_bkwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D		// LSU store one byte if necessary
	b	All_done_bkwd

Rt_just:
	stvx	VPS0,DST,BK		// LSU Store 16 bytes at DN
All_done_bkwd:
	addi	BK,BK,-16		// IU1 Decrement destination byte count

	ble	cr6,Last_load		// b if no Quad words to do
	mtctr	QW			// IU2 for (i=0;i<=QW;i++) - execution serializing
	cmpi	cr6,0,QW,4		// IU1 Check QW>4
QW_loop:
	lvx	VS0,SRC,BK		// LSU Get SN-2 (or SN-1 if ADJ==0)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-2 and SN-1 to DN-1
	vor	VS1,VS0,VS0		// VIU1 Move lower vector to upper

	stvx	VPS0,DST,BK		// LSU Store 16 bytes at DN-1
	addi	BK,BK,-16		// IU1 Decrement byte kount
	bdnzf	25,QW_loop		// b if 4 or less quad words to do

	add	DNX,DST,BK		// IU1 address of next store (DST+BC-1-16)
	bgt	cr6,GT_4QW		// b if >4 quad words left

Last_load:	// if D-S>=0, next load will be from same address as last
	blt	No_ld_bkwd		// b if shifting right (S-D>=0)
	addi	SRC,SRC,16		// IU1 recorrect source if it was decremented
No_ld_bkwd:
	lvx	VS0,0,SRC		// LSU Get last source SN-6 (guaranteed S0)
// Current 16 bytes is the last; we're done.
	dss	0			// Data stream stop
	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-6 and SN-5 to DN-6
	subfic	D,DST,16		// IU1 How many bytes in first destination?
	beq	cr1,Lt_just		// b if last destination is left justified

	mtcrf	0x01,D			// IU2 Put byte count remaining in cr7
	li	D,0			// IU1 Initialize index pointer
	bns	cr7,No_B_bkwd		// b if only even number of bytes to store

	stvebx	VPS0,DST,D		// LSU store first byte at DST+0
	addi	D,D,1			// IU1 increment index
No_B_bkwd:
	bne	cr7,No_H_bkwd		// b if only words to store
	stvehx	VPS0,DST,D		// LSU store halfword at DST+0/1
	addi	D,D,2			// IU1 increment index

No_H_bkwd:
	bng	cr7,No_W1_bkwd		// b if exactly zero or two words to store
	stvewx	VPS0,DST,D		// LSU store word 1 of one or three
	addi	D,D,4			// IU1 increment index

No_W1_bkwd:
	bnl	cr7,No_W2_bkwd		// b if there was only one word to store
	stvewx	VPS0,DST,D		// LSU store word 1 of two or 2 of three
	addi	D,D,4			// IU1 increment index

	stvewx	VPS0,DST,D		// LSU store word 2 of two or 3 of three
	b	No_W2_bkwd

Lt_just:
	stvx	VPS0,0,DST		// LSU Store 16 bytes at final dst addr D0
No_W2_bkwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV		// IU1 Restore VRSAVE
#endif
	blr				// Return destination address from entry

GT_4QW:	// Do once if next store is to even half of cache line, else twice

	lvx	VS0,SRC,BK		// LSU Get SN-3 (or SN-2)
	mtcrf	0x02,DNX		// IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;

	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-3 and SN-2 to DN-2
	vor	VS1,VS0,VS0		// VIU1 Move lower vector to upper
	addi	DNX,DNX,-16		// IU1 Prepare to update cr6 next loop

	stvx	VPS0,DST,BK		// LSU Store 16 bytes at DN-2
	vor	VS3,VS0,VS0		// VIU Make a copy of lower vector
	addi	BK,BK,-16		// IU1 Decrement byte count by 16
	bdnzt	27,GT_4QW		// b if next store is to upper (odd) half of CL
// At this point next store will be to even address.

	lis	STR,0x102		// IU1 Stream 2 blocks of 16 bytes
	mtcrf	0x02,DST		// IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)

	addi	BL,BK,-16		// IU1 Create an alternate byte count - 16

	ori	STR,STR,0xffe0		// IU1 Stream stride -32B
	addi	SP8,SRC,-64		// IU1 Starting address for data stream touch
	bso	cr6,B32_bkwd		// b if DST[27] == 1; i.e., final store is odd

	bdnz	B32_bkwd		// decrement counter for last odd QW store
B32_bkwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	lvx	VS2,SRC,BK		// LSU Get SN-4 (or SN-3)
	addi	SP8,SP8,-32		// IU1 Next starting address for data stream touch

	lvx	VS1,SRC,BL		// LSU Get SN-5 (or SN-4)
	vperm	VPS0,VS2,VS3,VP3	// VPU Align SN-4 and SN-3 to DN-3

	STRM_1				// LSU Stream 64 byte blocks ahead of loads

	DCBL				// LSU allocate next cache line

	vperm	VPS1,VS1,VS2,VP3	// VPU Align SN-5 and SN-4 to DN-4
	vor	VS3,VS1,VS1		// VIU1 Move SN-5 to SN-3

	stvx	VPS0,DST,BK		// LSU Store 16 bytes at DN-3
	addi	BK,BL,-16		// IU1 Decrement byte count
	bdz	Nxt_loc_bkwd		// always decrement and branch to next instr

Nxt_loc_bkwd:
	stvx	VPS1,DST,BL		// LSU Store 16 bytes at DN-4
	addi	BL,BK,-16		// IU1 Decrement alternate byte count
	bdnz	B32_bkwd		// b if there are at least two more QWs to do

	bns	cr6,One_odd_QW		// b if there was one more odd QW to store
	b	Last_load

// Come here with two more loads and two stores to do
One_odd_QW:
	lvx	VS1,SRC,BK		// LSU Get SN-6 (or SN-5)

	vperm	VPS1,VS1,VS3,VP3	// VPU Align SN-6 and SN-5 to DN-5

	stvx	VPS1,DST,BK		// LSU Store 16 bytes at DN-5

	b	Last_load

// End of memmove in AltiVec

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
v_memcpy:
// Byte count < MIN_VEC bytes will have been copied by scalar code above,
// so this will not deal with small block moves < MIN_VEC.

#ifdef VRSAVE
	mfspr	RSV,VRSV		// IU2 Get current VRSAVE contents
#endif
	rlwinm	S,SRC,0,28,31		// IU1 Save src address bits s[28:31]
	rlwinm	D,DST,0,28,31		// IU1 D = dst[28:31]

MC_entry:	// enter here from memmove if DST-SRC>=BC; this should be faster
#ifdef VRSAVE
	oris	Rt,RSV,0xfff0		// IU1 Or in registers used by this routine
#endif
	lis	BLK,0x010c		// IU1 Stream 12 blocks of 16 bytes

	subf.	S,S,D			// IU1 if D-S<0 essentially shifting left

#ifdef VRSAVE
	mtspr	VRSV,Rt			// IU2 Save in VRSAVE before first vec op
#endif
	lvsr	VP3,0,DMS		// LSU Permute vector for dst - src shft right
	ori	BLK,BLK,32		// IU1 Stream stride 32B

	STRM_F				// LSU Start data stream 0 at SRC
	addi	DR,DST,16		// IU1 Address of second dst vector
	addi	DBK,DBC,-1		// IU1 Address of last dst byte

// If D-S<0 we are "kinda" shifting left with the right shift permute vector
// loaded to VP3 and we need both S0 and S1 to permute.  If D-S>=0 then the
// first loaded vector needs to be in the upper half of the permute pair and
// the lower half is a don't care then.
	bge	Ld_bytes_rt		// b if shifting right (D-S>=0)

	lvx	VS0,0,SRC		// LSU Get S0 load started
// Comments numbering source and destination assume single path through the
// code executing each instruction once.  For vec_memcpy, an example would
// be the call memcpy(BASE+0x1E, BASE+0x1F, 259).  N = 16 in that case.
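
// (Illustrative note, not part of the original source.)  Both vector paths
// rely on the same trick: two consecutive source quadwords are kept in a
// register pair and vperm, driven by the control vector VP3 that lvsr built
// from (dst - src), gathers one destination-aligned 16-byte result out of
// that 32-byte window.  In C-like pseudocode, vperm VPS0,VS0,VS1,VP3 does
// roughly:
//	for (i = 0; i < 16; i++)
//	    VPS0[i] = (VS0 || VS1)[ VP3[i] & 0x1F ];	// VS0||VS1 = 32-byte window
// so every stvx lands on a 16-byte aligned destination address regardless of
// the relative misalignment of SRC and DST.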

	addi	SRC,SRC,16		// IU1 Increment src base (to keep BK useful)

Ld_bytes_rt:	// Come here to get VS1 & Don't care what VS0 is
	lvx	VS1,0,SRC		// LSU Get S1 (or S0 if D-S>=0) in upper vector
	rlwinm	DR,DR,0,0,27		// IU1 (DST+16)[0:27]
	cmpi	cr1,0,D,0		// IU1 Is D0 left justified?

	subf	Rt,DST,DR		// IU1 How many bytes in first destination?
	subf	QW,DR,DBK		// IU1 Bytes of full vectors to move (-16)
	li	BK,0			// IU1 Initialize byte kount index

	mtcrf	0x01,Rt			// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31		// IU1 Quad words remaining
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S0 and S1 to D0

	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower
	beq	cr1,Left_just		// b if D0 is left justified

	bns	cr7,No_B_fwd		// b if only even number of bytes to store

	stvebx	VPS0,DST,BK		// LSU store first byte at DST+0
	addi	BK,BK,1			// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd		// b if only words to store

	stvehx	VPS0,DST,BK		// LSU store halfword at DST+0/1
	addi	BK,BK,2			// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd		// b if exactly zero or two words to store

	stvewx	VPS0,DST,BK		// LSU store word 1 of one or three
	addi	BK,BK,4			// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd		// b if there was only one word to store
	stvewx	VPS0,DST,BK		// LSU store word 1 of two or 2 of three
	addi	BK,BK,4			// IU1 increment index

	stvewx	VPS0,DST,BK		// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:
	stvx	VPS0,0,DST		// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31		// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0		// IU1 Any full vectors to move?

	li	BK,16			// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF		// IU1 Is DN right justified?
	cmpi	cr7,0,QW,14		// IU1 Check QW>14
	ble	cr6,Last_ld_fwd		// b if no Quad words to do

	mtctr	QW			// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4		// IU1 Check QW>4
QW_fwd_loop:
	lvx	VS1,SRC,BK		// LSU Get S2 (or S1)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S1 and S2 to D1
	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK		// LSU Store 16 bytes at D1(+n*16 where n<4)
	addi	BK,BK,16		// IU1 Increment byte kount index
	bdnzf	25,QW_fwd_loop		// b if 4 or less quad words to do

	add	DNX,DST,BK		// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1		// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd		// b if >4 quad words left

Last_ld_fwd:	// Next 16 bytes is the last; we're done.
	add	DBC,DST,BC		// IU1 Recompute address of last dst byte + 1
	add	SBC,SRC,BC		// IU1 Recompute address of last src byte + 1
	bge	No_ld_fwd		// b if shifting right (D-S>=0)

	addi	SBC,SBC,-16		// IU1 if D-S>=0 we didn't add 16 to src
No_ld_fwd:
	mtcrf	0x01,DBC		// IU2 Put final vector byte count in cr7
	addi	DBK,DBC,-1		// IU1 Recompute address of last dst byte
	addi	Rt,SBC,-1		// IU1 Recompute address of last src byte

// If D-S<0 we have already loaded all the source vectors.
// If D-S>=0 then the first loaded vector went to the upper half of the permute
// pair and we need one more vector.  (This may be a duplicate.)
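
// (Illustrative note, not part of the original source.)  The load below uses
// the address of the last source byte rather than an SRC+index that might
// reach past it; because lvx ignores the low four address bits, it fetches
// exactly the quadword containing that final byte, so the routine never
// touches a quadword beyond the end of the source buffer (presumably the
// point of the Rev 0.01 seg-fault fix noted in the history above).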

	lvx	VS1,0,Rt		// LSU Get last source S14 (guaranteed SN)

#ifndef NO_DST
	dss	0			// Data stream 0 stop

	dss	1			// Data stream 1 stop
#endif
	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D14
	beq	cr1,Rt_just_fwd		// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27		// IU1 Round to QW addr of last byte
	li	D,0			// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd		// b if there was only one or zero words to store

	stvewx	VPS0,DBK,D		// LSU store word 1 of two or three
	addi	D,D,4			// IU1 increment index

	stvewx	VPS0,DBK,D		// LSU store word 2 of two or three
	addi	D,D,4			// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd		// b if there were only two or zero words to store

	stvewx	VPS0,DBK,D		// LSU store word 3 of three if necessary
	addi	D,D,4			// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd		// b if there are no half words to store

	stvehx	VPS0,DBK,D		// LSU store one halfword if necessary
	addi	D,D,2			// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	VPS0,DBK,D		// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:

	stvx	VPS0,DST,BK		// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV		// IU1 Restore VRSAVE
#endif
	blr				// Return destination address from entry
#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice

	lvx	VS1,SRC,BK		// LSU Get S3 (or S2)
	addi	QW,QW,-1		// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX		// IU2 cr6[3]=((DST+32)[27]==1)?1:0;

	addi	DNX,DNX,16		// IU1 Update cr6 for next loop
	addi	Rt,QW,-2		// IU1 Ensure at least 2 QW left after big loop

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S2 and S3 to D2
	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK		// LSU Store 16 bytes at D2
	addi	BK,BK,16		// IU1 Increment byte count by 16
	bdnzf	27,GT_4QW_fwd		// b if next store is to lower (even) half of CL
// At this point next store will be to even address.

	mtcrf	0x02,DBK		// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
	lis	STR,0x104		// IU1 Stream 4 blocks of 16 bytes
	addi	BL,BK,16		// IU1 Create an alternate byte kount + 32

	ori	STR,STR,32		// IU1 Stream stride 32B
#ifndef NO_BIG_LOOP
	rlwinm	BIG,Rt,29,3,31		// IU1 QW/8 big loops to do

	rlwinm	Rt,Rt,0,0,28		// IU1 How many QWs will be done in big loop
	bgt	cr7,Big_loop		// b if QW > 14
#endif
No_big_loop:
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.

	addi	SP8,SRC,256		// IU1 Starting address for data stream touch
	xoris	STR,STR,0x6		// IU1 Reset stream to 2 blocks of 16 bytes
	bns	cr6,B32_fwd		// b if DST[27] == 0; i.e., final store is even

	bdnz	B32_fwd			// decrement counter for last QW store odd

B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	lvx	VS1,SRC,BK		// LSU Get S12
	addi	SP8,SP8,32		// IU1 Next starting address for data stream touch

	lvx	VS2,SRC,BL		// LSU Get S13
	vperm	VPS1,VS0,VS1,VP3	// VPU Align S11 and S12 to D11

	STRM_1				// LSU Stream 64 byte blocks ahead of loads

	DCBK				// LSU then Kill instead of RWITM

	vperm	VPS0,VS1,VS2,VP3	// VPU Align S12 and S13 to D12
	vor	VS0,VS2,VS2		// VIU1 Move S13 to S11

	stvx	VPS1,DST,BK		// LSU Store 16 bytes at D11
	addi	BK,BL,16		// IU1 Increment byte count
	bdz	Nxt_loc_fwd		// always decrement and branch to next instr

Nxt_loc_fwd:
	stvx	VPS0,DST,BL		// LSU Store 16 bytes at D12
	addi	BL,BK,16		// IU1 Increment alternate byte count
	bdnz	B32_fwd			// b if there are at least two more QWs to do

	bso	cr6,One_even_QW		// b if there is one even and one odd QW to store
	b	Last_ld_fwd		// b if last store is to even address

// Come here with two more loads and two stores to do
One_even_QW:
	lvx	VS1,SRC,BK		// LSU Get S14 (or S13 if D-S>=0)

	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D13
	vor	VS0,VS1,VS1		// VIU1 Move upper vector to lower

	stvx	VPS0,DST,BK		// LSU Store 16 bytes at D13
	addi	BK,BK,16		// IU1 Increment byte count

	b	Last_ld_fwd

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
Big_loop:
	subf	QW,Rt,QW		// IU1 Should be 2-7 QWs left after big loop
	blt	cr5,No_big_loop		// b back if |DST-SRC|<128; Big_loop won't work

	mtctr	BIG			// IU2 loop for as many 128B loops as possible
	addi	SP8,SRC,256		// IU1 Starting address for data stream touch

Loop_of_128B:	// Come here with QW>=10 and next store even; VS0 last load
	lvx	VS1,SRC,BK		// LSU Get S4 (or S3 if D-S>=0)
	addi	BL,BK,32		// IU1 Increment Byte_Kount+16 by 32
	addi	SP8,SP8,128		// IU1 increment address for data stream touch

	lvx	VS3,SRC,BL		// LSU Get S6 (or S5)
	addi	BL,BL,32		// IU1 Increment Byte_Kount+48 by 32

	lvx	VS5,SRC,BL		// LSU Get S8 (or S7)
	addi	BL,BL,32		// IU1 Increment Byte_Kount+80 by 32

	lvx	VS7,SRC,BL		// LSU Get S10 (or S9)
	addi	BL,BK,16		// IU1 Increment Byte_Kount+16 by 16

	lvx	VS2,SRC,BL		// LSU Get S5 (or S4)
	addi	BL,BL,32		// IU1 Increment Byte_Kount+32 by 32

	lvx	VS4,SRC,BL		// LSU Get S7 (or S6)
	addi	BL,BL,32		// IU1 Increment Byte_Kount+64 by 32

	lvx	VS6,SRC,BL		// LSU Get S9 (or S8)
	addi	BL,BL,32		// IU1 Increment Byte_Kount+96 by 32
	vperm	VPS0,VS0,VS1,VP3	// VPU

	lvx	VS0,SRC,BL		// LSU Get S11 (or S10)
	vperm	VPS1,VS1,VS2,VP3	// VPU

	STRM_1				// LSU Stream 4 32B blocks, stride 32B

	DCBK				// LSU then Kill instead of RWITM

	stvx	VPS0,DST,BK		// LSU Store D3
	addi	BK,BK,16		// IU1 Increment Byte_Kount+16 by 16
	vperm	VPS2,VS2,VS3,VP3	// VPU

	stvx	VPS1,DST,BK		// LSU Store D4
	addi	BK,BK,16		// IU1 Increment Byte_Kount+32 by 16
	vperm	VPS3,VS3,VS4,VP3	// VPU

	DCBK				// LSU then Kill instead of RWITM

	stvx	VPS2,DST,BK		// LSU Store D5
	addi	BK,BK,16		// IU1 Increment Byte_Kount+48 by 16
	vperm	VPS4,VS4,VS5,VP3	// VPU

	stvx	VPS3,DST,BK		// LSU Store D6
	addi	BK,BK,16		// IU1 Increment Byte_Kount+64 by 16
	vperm	VPS5,VS5,VS6,VP3	// VPU

	DCBK				// LSU then Kill instead of RWITM

	stvx	VPS4,DST,BK		// LSU Store D7
	addi	BK,BK,16		// IU1 Increment Byte_Kount+80 by 16
	vperm	VPS6,VS6,VS7,VP3	// VPU

	stvx	VPS5,DST,BK		// LSU Store D8
	addi	BK,BK,16		// IU1 Increment Byte_Kount+96 by 16
	vperm	VPS7,VS7,VS0,VP3	// VPU

	DCBK				// LSU then Kill instead of RWITM

	stvx	VPS6,DST,BK		// LSU Store D9
	addi	BK,BK,16		// IU1 Increment Byte_Kount+112 by 16

	stvx	VPS7,DST,BK		// LSU Store D10
	addi	BK,BK,16		// IU1 Increment Byte_Kount+128 by 16
	bdnz	Loop_of_128B		// b if ctr > 0 (QW/8 still > 0)

	mtctr	QW			// IU1 Restore QW remaining to counter
	addi	BL,BK,16		// IU1 Create an alternate byte kount + 16
	bns	cr6,B32_fwd		// b if DST[27] == 0; i.e., final store is even

	bdnz	B32_fwd			// b and decrement counter for last QW store odd
// One of the above branches should have taken

// End of memcpy in AltiVec

// bcopy works like memcpy, but the source and destination operands are reversed.
// Following will just reverse the operands and branch to memcpy.
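
// (Illustrative note, not part of the original source.)  In C terms the stub
// below only swaps the first two arguments, roughly:
//	void bcopy(const void *src, void *dst, size_t len)
//	{
//	    memcpy(dst, src, len);	// same byte count, operands exchanged
//	}
// and since the memcpy/_vec_memcpy entry above is the combined "fool-proof"
// entry point, overlapping regions are still dispatched to the backward-copy
// path.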

#ifdef LIBMOTOVEC
	.globl	bcopy
bcopy:
#else
	.globl	vec_bcopy
vec_bcopy:
#endif
	mr	Rt,DST			// temp storage for what is really source address (r3)
	mr	DST,SRC			// swap destination address to r3 to match memcpy dst
	mr	SRC,Rt			// Complete swap of destination and source for memcpy
#ifdef LIBMOTOVEC
	b	memcpy			// b to memcpy with correct args in r3 and r4
#else
	b	_vec_memcpy		// b to vec_memcpy with correct args in r3 and r4
#endif
// End of bcopy in AltiVec