/*
 * Copyright (c) 2007
 * Josep Torra <josep@fluendo.com>. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if __VFP_FP__
/*
** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
**
** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
**
** d: $r0 | s1: $r1 | s2: $r2 | n: $r3 |
**
*/
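
/*
** Example (an added sketch, not part of the original source; buffer names
** are hypothetical): calling one of the element-wise functions from C,
** with the prototypes above declared in scope.
**
**   float a[16], b[16], sum[16];
**   // ... fill a and b ...
**   vfp_add_f32 (sum, a, b, 16); // sum[i] = a[i] + b[i]
*/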

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r3, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    fldmias r2!, {s1}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r3, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto finish */ \
    fmrx lr, fpscr; /* read fpscr register into arm */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}; \
    ## finst ##s s24, s8, s16; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */
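
/*
** Note (added explanation, not in the original comments): bits [18:16] of
** FPSCR hold the LEN field, and the short-vector length is LEN + 1, so
** writing 7 there selects a vector length of 8. With the destination and
** sources placed in the vector banks (s8-s31), the single arithmetic
** instruction in loop2 then processes eight elements per iteration;
** registers s0-s7 always behave as scalars, which is why loop1 uses them
** for the remainder elements.
*/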

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r3, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    fldmiad r2!, {d1}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r3, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto finish */ \
    fmrx lr, fpscr; /* read fpscr register into arm */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    fldmiad r2!, {d8, d9, d10, d11}; \
    ## finst ##d d12, d4, d8; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

.align 2
UNROLL_F32_TEMPLATE(add_f32,fadd);
UNROLL_F64_TEMPLATE(add_f64,fadd);

UNROLL_F32_TEMPLATE(divide_f32,fdiv);
UNROLL_F64_TEMPLATE(divide_f64,fdiv);

UNROLL_F32_TEMPLATE(multiply_f32,fmul);
UNROLL_F64_TEMPLATE(multiply_f64,fmul);

UNROLL_F32_TEMPLATE(subtract_f32,fsub);
UNROLL_F64_TEMPLATE(subtract_f64,fsub);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE

sl@0
|
122 |
/*
|
sl@0
|
123 |
**
|
sl@0
|
124 |
** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
|
sl@0
|
125 |
** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
|
sl@0
|
126 |
** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
|
sl@0
|
127 |
** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
|
sl@0
|
128 |
**
|
sl@0
|
129 |
** d: $r0 | s1: $r1 | s2_1: $r2 | n: $r3 |
|
sl@0
|
130 |
**
|
sl@0
|
131 |
*/
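
/*
** Example (an added sketch, not part of the original source; names are
** hypothetical): the scalar variants take a single operand through the
** s2_1 pointer and apply it to every element of s1.
**
**   float in[100], out[100];
**   float gain = 0.5f;
**   vfp_scalarmultiply_f32_ns (out, in, &gain, 100); // out[i] = in[i] * gain
*/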
#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    fldmias r2, {s1}; /* load scalar value */ \
    ands ip, r3, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r3, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto finish */ \
    fmrx lr, fpscr; /* read fpscr register into arm */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8, s1; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    fldmiad r2, {d1}; /* load scalar value */ \
    ands ip, r3, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r3, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto finish */ \
    fmrx lr, fpscr; /* read fpscr register into arm */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4, d1; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd);
UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd);

UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns,fmul);
UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE

sl@0
|
201 |
/*
|
sl@0
|
202 |
**
|
sl@0
|
203 |
** void vfp_abs_f32_f32_ns(float *d, const float *s, int n);
|
sl@0
|
204 |
** void vfp_abs_f64_f64_ns(double *d, const double *s, int n);
|
sl@0
|
205 |
** void vfp_negative_f32(float *d, const float *s, int n);
|
sl@0
|
206 |
** void vfp_negative_f64(double *d, const double *s, int n);
|
sl@0
|
207 |
**
|
sl@0
|
208 |
** d: $r0 | s: $r1 | n: $r2 |
|
sl@0
|
209 |
**
|
sl@0
|
210 |
*/
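
/*
** Example (an added sketch, not part of the original source; names are
** hypothetical): the unary variants take one source array and n in r2.
**
**   double x[32], y[32];
**   vfp_abs_f64_f64_ns (y, x, 32); // y[i] = fabs(x[i])
**   vfp_negative_f64 (y, x, 32);   // y[i] = -x[i]
*/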
#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r2, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r2, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto finish */ \
    fmrx lr, fpscr; /* read fpscr register into arm */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r2, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r2, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto finish */ \
    fmrx lr, fpscr; /* read fpscr register into arm */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs);
UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs);

UNROLL_F32_TEMPLATE(negative_f32,fneg);
UNROLL_F64_TEMPLATE(negative_f64,fneg);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE
#endif