//------------------------------------------------------------------
// file: vec_memset.S
//    AltiVec enabled version of memset and bzero and cacheable_memzero
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2002
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.  Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void *memset( void *ptr, int val, size_t len );
// Copies val into each of len characters beginning at ptr.
//	- Harbison&Steele 4th ed
// (Despite val being declared as an int, this memset uses only its
// low byte; ISO C defines memset to convert val to unsigned char,
// so that is the correct behavior.  Chuck Corley 12/21/02)
// Returns:
//  void * ptr
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * bzero( char *ptr, int len);
// Copies 0 into each of len characters at ptr.
//	- Harbison&Steele 4th ed
// Returns:
//  void * ptr
//------------------------------------------------------------------

// Revision History:
//	Rev 0.0	Original                        Chuck Corley	02/09/03
//		Could benefit from changes added to memcpy
//	Rev 0.1	Revised per memcpy Rev 0.30     Chuck Corley	05/01/03
//
// This is beta quality code; users are encouraged to make it faster.
// ASSUMPTIONS:
//   Code is highly likely to be in the cache; data is not (streaming data).
//   Zero fill is quite likely.
//   Moving the fill byte from a GPR to a VR as done below is faster
//   than stw->lvebx via the stack.

#define VRSV 256	// VRSAVE spr
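
// Overall strategy of the vector path below, sketched in C for
// reference only (illustrative, not assembled; memset16 stands in for
// a 16-byte stvx of the splatted fill vector, and the real leading and
// trailing edges use element stores rather than byte loops):
//
//   void *memset_sketch(void *ptr, int val, size_t len)
//   {
//       unsigned char *p = (unsigned char *)ptr;
//       size_t i = 0;
//       if (len <= 16) {                             // MIN_VEC: scalar loop
//           while (i < len) p[i++] = (unsigned char)val;
//           return ptr;
//       }
//       size_t head = (16 - ((size_t)p & 15)) & 15;  // bytes to 16B boundary
//       while (i < head) p[i++] = (unsigned char)val;          // leading edge
//       while (i + 16 <= len) { memset16(p + i, val); i += 16; } // full vectors
//       while (i < len) p[i++] = (unsigned char)val;           // trailing edge
//       return ptr;
//   }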
// Don't use vectors for BC <= MIN_VEC.  Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16

// Register usage
#define Rt r0	// r0 when used as a temporary register

#define DST r3	// entering: dest pointer; exiting: same dest pointer

#define FILL r4	// entering: fill char then fill word

#define BC r5	// entering: Byte_Count then remaining Byte_Count

#define DBC r6	// dst + byte count

#define BK r7	// BC - 1 +/- (n*16)

#define Fsh r8	// fill byte shifted right one nibble

#define DM1 r9	// dst - 1 for byte-by-byte fill initially
#define D r9	// (dst+16)[0:27] - dst[28:31]
#define DNX r9	// (dst+n*16)[28:31]
#define BL r9	// second byte_kount index pointer

#define DR r10	// (dst+16)[0:27]
#define QW r10	// number of cache lines

#define DBK r11	// (dst+byte_count-1) then (dst+byte_count-1)[28:31]

#define RSV r12	// storage for VRSAVE register if used

// Condition register use (not including temporary cr0)
//  cr0[2]   = (FILL==0)?
//  cr1[0,2] = (BC == 0)? 1 : 0;  (nothing to move)
//  then cr1[2] = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
//  then cr1[2] = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0;  (DN right justified)
//  cr6[2] = (QW == 0)? 1 : 0;
//  then cr6[1] = (QW > 4)? 1 : 0;  (>4 vectors to move?)
//  then cr6[3] = (third store[27] == 1)? 1 : 0;  (cache line alignment)
//  then cr6[3] = (last store[27] == 1)? 1 : 0;   (last store odd?)
//  cr7[2] = (BC>MIN_VEC)? 1 : 0;  (BC big enough to warrant vectors)
//  then cr7[0:3] = (DST+16)[0:27]-DST  (how many bytes (iff <16) in first vector?)
//  then cr7[0:3] = (DST+BC)[0:27]      (how many bytes (iff <16) in last vector?)

// Conditionalize the use of dcba.  It will help if the data is
// not in cache and hurt if it is.  Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// We use dcba, which will no-op to non-cacheable memory, rather than
// dcbz, which would cause an alignment exception.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc and codewarrior and diab don't assemble dcba
#define DCBK .long 0x7c033dec
// dcba r3,r7  or  dcba DST,BK
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
#else
#define DCBK dcba DST,BK
#endif // __ghs__
#endif // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif // NO_DCBA

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	memset
memset:
#else
	.globl	_vec_memset
_vec_memset:
#endif

	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	cmpi	cr1,0,BC,0		// IU1 Eliminate zero byte count
	rlwinm.	Fsh,FILL,28,28,3	// IU1 Is fill byte zero? and shift
	// (wrap-around mask 28:3 keeps both nibbles, so cr0 tests the
	// whole fill byte while Fsh[28:31] becomes the upper nibble
	// needed by lvsl below)
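
// The v_memset path below builds the fill vector entirely in registers:
// lvsl is used as a trick to move a GPR nibble into a VR, since the
// permute control vector it returns begins with the low four bits of
// the effective address.  Addressing "Fsh" and "FILL" therefore lands
// the fill byte's two nibbles in byte 0 of two VRs.  Equivalent C
// (sketch only, not the generated code):
//
//   unsigned char hi = (fill >> 4) & 0xF;    // byte 0 of lvsl v0,0,Fsh
//   unsigned char lo = fill & 0xF;           // byte 0 of lvsl v2,0,FILL
//   unsigned char b  = (unsigned char)((hi << 4) | lo);  // vslb + vor
//   unsigned char v0[16];
//   for (int i = 0; i < 16; i++) v0[i] = b;  // vspltb v0,v0,0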
	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16	// IU1 Address of second dst vector
	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
	bgt	cr7,v_memset	// b if BC>MIN_VEC

	mtctr	BC		// for (i=1;i<=BC;i++)
	beqlr	cr1		// return if BC = 0
Byte_set:
	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
	bdnz	Byte_set

	blr

v_memset:
// Byte counts < MIN_VEC bytes will have been set by the scalar code
// above, so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0xe000	// IU1 Or in registers used by this routine
#endif
	subf	D,DST,DR	// IU1 How many bytes in first destination?
	li	BK,0		// IU1 Initialize byte kount index

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0	// VIU Clear v0
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
	beq+	enter_bzero	// b if FILL==0

	lvsl	v0,0,Fsh	// LSU Move upper nibble to byte 0 of VR
	vspltisb	v1,4	// VPU Splat 0x4 to every byte

	lvsl	v2,0,FILL	// LSU Move lower nibble to byte 0 of VR

	vslb	v0,v0,v1	// VIU Move upper nibble to VR[0:3]

	vor	v0,v0,v2	// VIU Form FILL byte in VR[0:7]

	vspltb	v0,v0,0		// VPU Splat the fill byte to all bytes
enter_bzero:
	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	beq	cr1,Left_just	// b if D0 is left justified

	bns	cr7,No_B_fwd	// b if only even number of bytes to store

	stvebx	v0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd	// b if only words to store

	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store

	stvewx	v0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd	// b if there was only one word to store
	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:
	stvx	v0,0,DST	// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
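
// The stores above put the first D & 15 bytes down with element stores
// (stvebx/stvehx/stvewx), steered by the low four bits of D copied into
// cr7; the trailing edge at Last_QW reuses the same pattern.  As a C
// sketch of the cr7 dispatch (store_* names are illustrative):
//
//   if (D & 1) { store_byte(p, k); k += 1; }   // stvebx
//   if (D & 2) { store_half(p, k); k += 2; }   // stvehx
//   if (D & 4) { store_word(p, k); k += 4; }   // stvewx, one word
//   if (D & 8) { store_word(p, k); k += 4;     // stvewx, two more words
//                store_word(p, k); k += 4; }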
	ble	cr6,Last_QW	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4

QW_loop:
	stvx	v0,DST,BK	// LSU Store 16 fill bytes
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,QW_loop	// b if 4 or fewer quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left

Last_QW:	// Next vector is the last; we're done.
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7

	beq	cr1,Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	BL,0		// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
	addi	BL,BL,4		// IU1 increment index

	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
	addi	BL,BL,4		// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
	addi	BL,BL,4		// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
	addi	BL,BL,2		// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL	// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:

	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
GT_4QW_fwd:	// Do once if next store is to odd half of cache line, else twice

	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop

	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even
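
// The B32_fwd loop below settles a full 32-byte cache line per pass:
// DCBK (dcba) establishes the line in the cache without reading it from
// memory (a kill, instead of read-with-intent-to-modify), and two stvx
// stores then fill both halves.  One iteration, roughly, in C (names
// illustrative):
//
//   establish_line(dst + bk);        // dcba; no-op on non-cacheable pages
//   store16(dst + bk);               // stvx, even half of the line
//   store16(dst + bk + 16);          // stvx, odd half of the line
//   bk += 32;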
// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
	bdnz	B32_fwd		// decrement counter if last QW store is odd

B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	DCBK			// LSU then Kill instead of RWITM

	stvx	v0,DST,BK	// LSU Store 16 bytes at D11
	addi	BK,BK,16	// IU1 Increment byte count
	bdz	Nxt_loc_fwd	// always decrement and branch to next instr

Nxt_loc_fwd:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D12
	addi	BK,BK,16	// IU1 Increment byte count
	bdnz	B32_fwd		// b if there are at least two more QWs to do

	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
	b	Last_QW		// b if last store is to even address

// Come here with two more stores to do
One_even_QW:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	Last_QW

// End of memset in AltiVec

#define BCz r4	// in bzero r4 enters with byte count

#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	bzero
bzero:
#else
	.globl	vec_bzero
vec_bzero:
#endif

	mr	BC,BCz		// IU1 arg[2] is BC here, not FILL
	li	FILL,0		// IU1 for bzero FILL=0
#ifdef LIBMOTOVEC
	b	memset
#else
	b	_vec_memset
#endif

// cacheable_memzero will employ dcbz to clear 32 bytes at a time
// of cacheable memory.  Like bzero, the second entering argument is BC.
// Using this for non-cacheable memory will generate an alignment exception.

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	cacheable_memzero
cacheable_memzero:
#else
	.globl	vec_cacheable_memzero
vec_cacheable_memzero:
#endif

	mr	BC,BCz		// IU1 arg[2] is BC here, not FILL
	li	FILL,0		// IU1 for bzero FILL=0
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count

	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count

	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16	// IU1 Address of second dst vector
	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
	bgt	cr7,c_v_memset	// b if BC>MIN_VEC

	mtctr	BC		// for (i=1;i<=BC;i++)
	beqlr	cr1		// return if BC = 0
c_Byte_set:
	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
	bdnz	c_Byte_set

	blr

c_v_memset:
// Byte counts < MIN_VEC bytes will have been set by the scalar code
// above, so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE when compiling.  For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0x8000	// IU1 Or in registers used by this routine
#endif
	subf	D,DST,DR	// IU1 How many bytes in first destination?
	li	BK,0		// IU1 Initialize byte kount index

#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0	// VIU Clear v0
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
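
// FILL is known to be zero on this path, so the fill-byte splat used by
// v_memset is skipped: the vxor above already leaves the zero pattern
// in v0.  (This is also why VRSAVE is or'd with 0x8000 here, marking
// only v0, where memset marks v0-v2 with 0xe000.)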
	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
	beq	cr1,c_Left_just	// b if D0 is left justified

	bns	cr7,c_No_B_fwd	// b if only even number of bytes to store

	stvebx	v0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
c_No_B_fwd:
	bne	cr7,c_No_H_fwd	// b if only words to store

	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
c_No_H_fwd:
	bng	cr7,c_No_W1_fwd	// b if exactly zero or two words to store

	stvewx	v0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index

c_No_W1_fwd:
	bnl	cr7,c_No_W2_fwd	// b if there was only one word to store
	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
	b	c_No_W2_fwd

c_Left_just:
	stvx	v0,0,DST	// LSU Store 16 bytes at D0
c_No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	ble	cr6,c_Last_QW	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4

c_QW_loop:
	stvx	v0,DST,BK	// LSU Store 16 fill bytes
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,c_QW_loop	// b if 4 or fewer quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,c_GT_4QW_fwd	// b if >4 quad words left
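
// From here the trailing edge mirrors memset's Last_QW path: cr7 holds
// the low bits of DST+BC and selects word/halfword/byte stores for a
// final partial vector.  The cache-line loop below (c_B32_fwd) differs
// from memset's B32_fwd: one dcbz zeroes the whole 32-byte line, so no
// explicit stores are needed and BK advances by 32 per iteration.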
c_Last_QW:	// Next vector is the last; we're done.
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7

	beq	cr1,c_Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	BL,0		// IU1 Initialize index pointer
	bnl	cr7,c_Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
	addi	BL,BL,4		// IU1 increment index

	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
	addi	BL,BL,4		// IU1 increment index
c_Only_1W_fwd:
	bng	cr7,c_Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
	addi	BL,BL,4		// IU1 increment index
c_Only_2W_fwd:
	bne	cr7,c_Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
	addi	BL,BL,2		// IU1 increment index
c_Only_B_fwd:
	bns	cr7,c_All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL	// LSU store one byte if necessary
	b	c_All_done_fwd

c_Rt_just_fwd:

	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
c_All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
c_GT_4QW_fwd:	// Do once if next store is to odd half of cache line, else twice

	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop

	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,c_GT_4QW_fwd	// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,c_B32_fwd	// b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
	bdnz	c_B32_fwd	// decrement counter if last QW store is odd

c_B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	dcbz	DST,BK		// LSU zero whole cache line
	bdz	c_Nxt_loc_fwd	// always decrement and branch to next instr

c_Nxt_loc_fwd:
	addi	BK,BK,32	// IU1 Increment byte count
	bdnz	c_B32_fwd	// b if there are at least two more QWs to do

	bso	cr6,c_One_even_QW	// b if there is one even and one odd QW to store
	b	c_Last_QW	// b if last store is to even address

// Come here with two more stores to do
c_One_even_QW:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	c_Last_QW

// End of cacheable_memzero in AltiVec
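
// Illustrative C-level prototypes for the exported entry points (the
// LIBMOTOVEC names; a sketch of intent, not a shipped header):
//
//   extern void *memset(void *ptr, int val, size_t len);
//   extern void *bzero(char *ptr, int len);
//   extern void *cacheable_memzero(char *ptr, int len);  // cacheable memory only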