/*
 * Copyright © 2004 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */
//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
sl@0: sl@0: #ifdef HAVE_CONFIG_H sl@0: #include "config.h" sl@0: #endif sl@0: sl@0: #include sl@0: #include sl@0: sl@0: #include /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ sl@0: sl@0: typedef uint32_t CARD32; sl@0: typedef uint16_t CARD16; sl@0: typedef int16_t INT16; sl@0: typedef uint8_t CARD8; sl@0: typedef uint64_t ullong; sl@0: typedef CARD32* PicturePtr; sl@0: typedef CARD32* FbBits; sl@0: typedef int FbStride; sl@0: sl@0: sl@0: #include "fbmmx.h" sl@0: #include "fbpict.h" sl@0: sl@0: #define CHECKPOINT() sl@0: sl@0: OIL_DECLARE_CLASS (composite_in_argb); sl@0: OIL_DECLARE_CLASS (composite_in_argb_const_src); sl@0: OIL_DECLARE_CLASS (composite_in_argb_const_mask); sl@0: OIL_DECLARE_CLASS (composite_over_argb); sl@0: OIL_DECLARE_CLASS (composite_over_argb_const_src); sl@0: OIL_DECLARE_CLASS (composite_add_argb); sl@0: OIL_DECLARE_CLASS (composite_add_argb_const_src); sl@0: OIL_DECLARE_CLASS (composite_in_over_argb); sl@0: OIL_DECLARE_CLASS (composite_in_over_argb_const_src); sl@0: OIL_DECLARE_CLASS (composite_in_over_argb_const_mask); sl@0: OIL_DECLARE_CLASS (composite_over_u8); sl@0: OIL_DECLARE_CLASS (composite_add_u8); sl@0: sl@0: sl@0: /* --------------- MMX code patch for fbcompose.c --------------------- */ sl@0: sl@0: #if 0 sl@0: static void sl@0: mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: sl@0: const uint32_t *end = mask + width; sl@0: while (mask < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: a = MmxAlpha(a); sl@0: MmxMul(s, a); sl@0: *dest = MmxFrom(s); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: #ifdef ENABLE_BROKEN_IMPLS sl@0: static void sl@0: mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 
0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const uint32_t *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 x, y, a; sl@0: x = MmxTo(*src); sl@0: y = MmxTo(*dest); sl@0: a = MmxAlpha(x); sl@0: a = MmxNegate(a); sl@0: MmxMulAdd(y, a, x); sl@0: *dest = MmxFrom(y); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX); sl@0: #endif sl@0: sl@0: #if 0 sl@0: static FASTCALL void sl@0: mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 x, y, a; sl@0: x = MmxTo(*dest); sl@0: y = MmxTo(*src); sl@0: a = MmxAlpha(x); sl@0: a = MmxNegate(a); sl@0: MmxMulAdd(y, a, x); sl@0: *dest = MmxFrom(y); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: #if 0 sl@0: static void sl@0: mmxCombineInU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 x, a; sl@0: x = MmxTo(*src); sl@0: a = MmxTo(*dest); sl@0: a = MmxAlpha(a); sl@0: MmxMul(x, a); sl@0: *dest = MmxFrom(x); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: #if 0 sl@0: static FASTCALL void sl@0: mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 x, a; sl@0: x = MmxTo(*dest); sl@0: a = MmxTo(*src); sl@0: 
a = MmxAlpha(a); sl@0: MmxMul(x, a); sl@0: *dest = MmxFrom(x); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: #if 0 sl@0: static FASTCALL void sl@0: mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 x, a; sl@0: x = MmxTo(*src); sl@0: a = MmxTo(*dest); sl@0: a = MmxAlpha(a); sl@0: a = MmxNegate(a); sl@0: MmxMul(x, a); sl@0: *dest = MmxFrom(x); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: #if 0 sl@0: static FASTCALL void sl@0: mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 x, a; sl@0: x = MmxTo(*dest); sl@0: a = MmxTo(*src); sl@0: a = MmxAlpha(a); sl@0: a = MmxNegate(a); sl@0: MmxMul(x, a); sl@0: *dest = MmxFrom(x); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 s, da, d, sia; sl@0: s = MmxTo(*src); sl@0: d = MmxTo(*dest); sl@0: sia = MmxAlpha(s); sl@0: sia = MmxNegate(sia); sl@0: da = MmxAlpha(d); sl@0: MmxAddMul(s, da, d, sia); sl@0: *dest = MmxFrom(s); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void 
sl@0: mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end; sl@0: sl@0: end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 s, dia, d, sa; sl@0: s = MmxTo(*src); sl@0: d = MmxTo(*dest); sl@0: sa = MmxAlpha(s); sl@0: dia = MmxAlpha(d); sl@0: dia = MmxNegate(dia); sl@0: MmxAddMul(s, dia, d, sa); sl@0: *dest = MmxFrom(s); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: sl@0: while (dest < end) { sl@0: __m64 s, dia, d, sia; sl@0: s = MmxTo(*src); sl@0: d = MmxTo(*dest); sl@0: sia = MmxAlpha(s); sl@0: dia = MmxAlpha(d); sl@0: sia = MmxNegate(sia); sl@0: dia = MmxNegate(dia); sl@0: MmxAddMul(s, dia, d, sia); sl@0: *dest = MmxFrom(s); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: static void sl@0: mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: sl@0: const uint32_t *end = dest + width; sl@0: while (dest < end) { sl@0: __m64 s, d; sl@0: s = MmxTo(*src); sl@0: d = MmxTo(*dest); sl@0: s = MmxAdd(s, d); sl@0: *dest = MmxFrom(s); sl@0: ++dest; sl@0: ++src; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE); sl@0: sl@0: #if 0 sl@0: static FASTCALL void sl@0: mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = 
(__m64) 0x0080008000800080ULL; sl@0: sl@0: const CARD32 *end = dest + width; sl@0: while (dest < end) { sl@0: CARD32 s = *src; sl@0: CARD32 d = *dest; sl@0: __m64 ms = MmxTo(s); sl@0: __m64 md = MmxTo(d); sl@0: CARD32 sa = s >> 24; sl@0: CARD32 da = ~d >> 24; sl@0: sl@0: if (sa > da) { sl@0: __m64 msa = MmxTo(FbIntDiv(da, sa)); sl@0: msa = MmxAlpha(msa); sl@0: MmxMul(ms, msa); sl@0: } sl@0: MmxAdd(md, ms); sl@0: *dest = MmxFrom(md); sl@0: ++src; sl@0: ++dest; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: sl@0: static FASTCALL void sl@0: mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: MmxMul(s, a); sl@0: *dest = MmxFrom(s); sl@0: ++src; sl@0: ++mask; sl@0: ++dest; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 sa = MmxAlpha(s); sl@0: MmxMul(s, a); sl@0: MmxMul(a, sa); sl@0: a = MmxNegate(a); sl@0: MmxMulAdd(d, a, s); sl@0: *dest = MmxFrom(d); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while 
(src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 da = MmxAlpha(d); sl@0: da = MmxNegate(da); sl@0: MmxMul(s, a); sl@0: MmxMulAdd(s, da, d); sl@0: *dest = MmxFrom(s); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: sl@0: static FASTCALL void sl@0: mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 da = MmxAlpha(d); sl@0: MmxMul(s, a); sl@0: MmxMul(s, da); sl@0: *dest = MmxFrom(s); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 sa = MmxAlpha(s); sl@0: MmxMul(a, sa); sl@0: MmxMul(d, a); sl@0: *dest = MmxFrom(d); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 da = MmxAlpha(d); sl@0: da = MmxNegate(da); sl@0: MmxMul(s, a); sl@0: MmxMul(s, da); sl@0: *dest = MmxFrom(s); 
sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 sa = MmxAlpha(s); sl@0: MmxMul(a, sa); sl@0: a = MmxNegate(a); sl@0: MmxMul(d, a); sl@0: *dest = MmxFrom(d); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 da = MmxAlpha(d); sl@0: __m64 sa = MmxAlpha(s); sl@0: MmxMul(s, a); sl@0: MmxMul(a, sa); sl@0: a = MmxNegate(a); sl@0: MmxAddMul(d, a, s, da); sl@0: *dest = MmxFrom(d); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 da = MmxAlpha(d); sl@0: __m64 sa = MmxAlpha(s) sl@0: MmxMul(s, a); sl@0: MmxMul(a, sa); sl@0: da = 
MmxNegate(da); sl@0: MmxAddMul(d, a, s, da); sl@0: *dest = MmxFrom(d); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: __m64 da = MmxAlpha(d); sl@0: __m64 sa = MmxAlpha(s); sl@0: MmxMul(s, a); sl@0: MmxMul(a, sa); sl@0: da = MmxNegate(da); sl@0: a = MmxNegate(a); sl@0: MmxAddMul(d, a, s, da); sl@0: *dest = MmxFrom(d); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: static FASTCALL void sl@0: mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) sl@0: { sl@0: const __m64 mmx_0 = _mm_setzero_si64(); sl@0: const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; sl@0: sl@0: const CARD32 *end = src + width; sl@0: while (src < end) { sl@0: __m64 a = MmxTo(*mask); sl@0: __m64 s = MmxTo(*src); sl@0: __m64 d = MmxTo(*dest); sl@0: MmxMul(s, a); sl@0: d = MmxAdd(s, d); sl@0: *dest = MmxFrom(d); sl@0: ++src; sl@0: ++dest; sl@0: ++mask; sl@0: } sl@0: _mm_empty(); sl@0: } sl@0: sl@0: extern FbComposeFunctions composeFunctions; sl@0: sl@0: void fbComposeSetupMMX(void) sl@0: { sl@0: /* check if we have MMX support and initialize accordingly */ sl@0: if (fbHaveMMX()) { sl@0: composeFunctions.combineU[PictOpOver] = mmxCombineOverU; sl@0: composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU; sl@0: composeFunctions.combineU[PictOpIn] = mmxCombineInU; sl@0: composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU; sl@0: composeFunctions.combineU[PictOpOut] = mmxCombineOutU; sl@0: composeFunctions.combineU[PictOpOutReverse] = 
mmxCombineOutReverseU; sl@0: composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU; sl@0: composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU; sl@0: composeFunctions.combineU[PictOpXor] = mmxCombineXorU; sl@0: composeFunctions.combineU[PictOpAdd] = mmxCombineAddU; sl@0: composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU; sl@0: sl@0: composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC; sl@0: composeFunctions.combineC[PictOpOver] = mmxCombineOverC; sl@0: composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC; sl@0: composeFunctions.combineC[PictOpIn] = mmxCombineInC; sl@0: composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC; sl@0: composeFunctions.combineC[PictOpOut] = mmxCombineOutC; sl@0: composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC; sl@0: composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC; sl@0: composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC; sl@0: composeFunctions.combineC[PictOpXor] = mmxCombineXorC; sl@0: composeFunctions.combineC[PictOpAdd] = mmxCombineAddC; sl@0: sl@0: composeFunctions.combineMaskU = mmxCombineMaskU; sl@0: } sl@0: } sl@0: #endif sl@0: sl@0: sl@0: /* ------------------ MMX code paths called from fbpict.c ----------------------- */ sl@0: sl@0: typedef union { sl@0: __m64 m64; sl@0: uint64_t ull; sl@0: } m64_ull; sl@0: sl@0: typedef struct sl@0: { sl@0: m64_ull mmx_4x00ff; sl@0: m64_ull mmx_4x0080; sl@0: m64_ull mmx_565_rgb; sl@0: m64_ull mmx_565_unpack_multiplier; sl@0: m64_ull mmx_565_r; sl@0: m64_ull mmx_565_g; sl@0: m64_ull mmx_565_b; sl@0: m64_ull mmx_mask_0; sl@0: m64_ull mmx_mask_1; sl@0: m64_ull mmx_mask_2; sl@0: m64_ull mmx_mask_3; sl@0: m64_ull mmx_full_alpha; sl@0: m64_ull mmx_ffff0000ffff0000; sl@0: m64_ull mmx_0000ffff00000000; sl@0: m64_ull mmx_000000000000ffff; sl@0: } MMXData; sl@0: sl@0: static const MMXData c = sl@0: { sl@0: .mmx_4x00ff.ull = 0x00ff00ff00ff00ffULL, sl@0: .mmx_4x0080.ull = 0x0080008000800080ULL, 
sl@0: .mmx_565_rgb.ull = 0x000001f0003f001fULL, sl@0: .mmx_565_r.ull = 0x000000f800000000ULL, sl@0: .mmx_565_g.ull = 0x0000000000fc0000ULL, sl@0: .mmx_565_b.ull = 0x00000000000000f8ULL, sl@0: .mmx_mask_0.ull = 0xffffffffffff0000ULL, sl@0: .mmx_mask_1.ull = 0xffffffff0000ffffULL, sl@0: .mmx_mask_2.ull = 0xffff0000ffffffffULL, sl@0: .mmx_mask_3.ull = 0x0000ffffffffffffULL, sl@0: .mmx_full_alpha.ull = 0x00ff000000000000ULL, sl@0: .mmx_565_unpack_multiplier.ull = 0x0000008404100840ULL, sl@0: .mmx_ffff0000ffff0000.ull = 0xffff0000ffff0000ULL, sl@0: .mmx_0000ffff00000000.ull = 0x0000ffff00000000ULL, sl@0: .mmx_000000000000ffff.ull = 0x000000000000ffffULL, sl@0: }; sl@0: sl@0: #define MC(x) ((__m64) c.mmx_##x.m64) sl@0: sl@0: static __inline__ __m64 sl@0: shift (__m64 v, int s) sl@0: { sl@0: if (s > 0) sl@0: return _mm_slli_si64 (v, s); sl@0: else if (s < 0) sl@0: return _mm_srli_si64 (v, -s); sl@0: else sl@0: return v; sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: negate (__m64 mask) sl@0: { sl@0: return _mm_xor_si64 (mask, MC(4x00ff)); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: pix_multiply (__m64 a, __m64 b) sl@0: { sl@0: __m64 res; sl@0: sl@0: res = _mm_mullo_pi16 (a, b); sl@0: res = _mm_adds_pu16 (res, MC(4x0080)); sl@0: res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8)); sl@0: res = _mm_srli_pi16 (res, 8); sl@0: sl@0: return res; sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: expand_alpha (__m64 pixel) sl@0: { sl@0: return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3)); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: expand_alpha_rev (__m64 pixel) sl@0: { sl@0: return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0)); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: invert_colors (__m64 pixel) sl@0: { sl@0: return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); sl@0: } sl@0: sl@0: /* Notes about writing mmx code sl@0: * sl@0: * give memory operands as the second operand. 
If you give it as the sl@0: * first, gcc will first load it into a register, then use that sl@0: * register sl@0: * sl@0: * ie. use sl@0: * sl@0: * _mm_mullo_pi16 (x, mmx_constant); sl@0: * sl@0: * not sl@0: * sl@0: * _mm_mullo_pi16 (mmx_constant, x); sl@0: * sl@0: * Also try to minimize dependencies. i.e. when you need a value, try sl@0: * to calculate it from a value that was calculated as early as sl@0: * possible. sl@0: */ sl@0: sl@0: static __inline__ __m64 sl@0: over (__m64 src, __m64 srca, __m64 dest) sl@0: { sl@0: return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: over_rev_non_pre (__m64 src, __m64 dest) sl@0: { sl@0: __m64 srca = expand_alpha (src); sl@0: __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); sl@0: sl@0: return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: in (__m64 src, sl@0: __m64 mask) sl@0: { sl@0: return pix_multiply (src, mask); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: in_over (__m64 src, sl@0: __m64 srca, sl@0: __m64 mask, sl@0: __m64 dest) sl@0: { sl@0: return over(in(src, mask), pix_multiply(srca, mask), dest); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: load8888 (CARD32 v) sl@0: { sl@0: return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: pack8888 (__m64 lo, __m64 hi) sl@0: { sl@0: __m64 r; sl@0: r = _mm_packs_pu16 (lo, hi); sl@0: return r; sl@0: } sl@0: sl@0: static __inline__ CARD32 sl@0: store8888 (__m64 v) sl@0: { sl@0: return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64())); sl@0: } sl@0: sl@0: /* Expand 16 bits positioned at @pos (0-3) of a mmx register into sl@0: * sl@0: * 00RR00GG00BB sl@0: * sl@0: * --- Expanding 565 in the low word --- sl@0: * sl@0: * m = (m << (32 - 3)) | (m << (16 - 5)) | m; sl@0: * m = m & (01f0003f001f); sl@0: * m = m * (008404100840); sl@0: * m = m >> 8; sl@0: * sl@0: * Note the trick here - the top 
word is shifted by another nibble to sl@0: * avoid it bumping into the middle word sl@0: */ sl@0: static __inline__ __m64 sl@0: expand565 (__m64 pixel, int pos) sl@0: { sl@0: __m64 p = pixel; sl@0: __m64 t1, t2; sl@0: sl@0: /* move pixel to low 16 bit and zero the rest */ sl@0: p = shift (shift (p, (3 - pos) * 16), -48); sl@0: sl@0: t1 = shift (p, 36 - 11); sl@0: t2 = shift (p, 16 - 5); sl@0: sl@0: p = _mm_or_si64 (t1, p); sl@0: p = _mm_or_si64 (t2, p); sl@0: p = _mm_and_si64 (p, MC(565_rgb)); sl@0: sl@0: pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); sl@0: return _mm_srli_pi16 (pixel, 8); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: expand8888 (__m64 in, int pos) sl@0: { sl@0: if (pos == 0) sl@0: return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); sl@0: else sl@0: return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); sl@0: } sl@0: sl@0: static __inline__ __m64 sl@0: pack565 (__m64 pixel, __m64 target, int pos) sl@0: { sl@0: __m64 p = pixel; sl@0: __m64 t = target; sl@0: __m64 r, g, b; sl@0: sl@0: r = _mm_and_si64 (p, MC(565_r)); sl@0: g = _mm_and_si64 (p, MC(565_g)); sl@0: b = _mm_and_si64 (p, MC(565_b)); sl@0: sl@0: r = shift (r, - (32 - 8) + pos * 16); sl@0: g = shift (g, - (16 - 3) + pos * 16); sl@0: b = shift (b, - (0 + 3) + pos * 16); sl@0: sl@0: if (pos == 0) sl@0: t = _mm_and_si64 (t, MC(mask_0)); sl@0: else if (pos == 1) sl@0: t = _mm_and_si64 (t, MC(mask_1)); sl@0: else if (pos == 2) sl@0: t = _mm_and_si64 (t, MC(mask_2)); sl@0: else if (pos == 3) sl@0: t = _mm_and_si64 (t, MC(mask_3)); sl@0: sl@0: p = _mm_or_si64 (r, t); sl@0: p = _mm_or_si64 (g, p); sl@0: sl@0: return _mm_or_si64 (b, p); sl@0: } sl@0: sl@0: #ifdef ENABLE_BROKEN_IMPLS sl@0: /* broken. 
See Debian bug #340932 */ sl@0: static void sl@0: fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w) sl@0: { sl@0: __m64 vsrc, vsrca; sl@0: sl@0: vsrc = load8888 (*src); sl@0: vsrca = expand_alpha (vsrc); sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: *dst = store8888(over(vsrc, vsrca, load8888(*dst))); sl@0: sl@0: w--; sl@0: dst++; sl@0: } sl@0: sl@0: while (w >= 2) sl@0: { sl@0: __m64 vdest; sl@0: __m64 dest0, dest1; sl@0: sl@0: vdest = *(__m64 *)dst; sl@0: sl@0: dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); sl@0: dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); sl@0: sl@0: *(__m64 *)dst = pack8888(dest0, dest1); sl@0: sl@0: dst += 2; sl@0: w -= 2; sl@0: } sl@0: sl@0: while (w) sl@0: { sl@0: *dst = store8888(over(vsrc, vsrca, load8888(*dst))); sl@0: sl@0: w--; sl@0: dst++; sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src, sl@0: OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT); sl@0: #endif sl@0: sl@0: #if 0 sl@0: void sl@0: fbCompositeSolid_nx0565mmx (CARD8 op, sl@0: PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 ySrc, sl@0: INT16 xMask, sl@0: INT16 yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { sl@0: CARD32 src; sl@0: CARD16 *dstLine, *dst; sl@0: CARD16 w; sl@0: FbStride dstStride; sl@0: __m64 vsrc, vsrca; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: fbComposeGetSolid(pSrc, src, pDst->format); sl@0: sl@0: if (src >> 24 == 0) sl@0: return; sl@0: sl@0: fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); sl@0: sl@0: vsrc = load8888 (src); sl@0: vsrca = expand_alpha (vsrc); sl@0: sl@0: while (height--) sl@0: { sl@0: dst = dstLine; sl@0: dstLine += dstStride; sl@0: w = width; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: ullong d = *dst; sl@0: __m64 vdest = expand565 ((__m64)d, 0); sl@0: vdest = pack565(over(vsrc, vsrca, 
vdest), vdest, 0); sl@0: *dst = (ullong)vdest; sl@0: sl@0: w--; sl@0: dst++; sl@0: } sl@0: sl@0: while (w >= 4) sl@0: { sl@0: __m64 vdest; sl@0: sl@0: vdest = *(__m64 *)dst; sl@0: sl@0: vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); sl@0: vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); sl@0: vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); sl@0: vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); sl@0: sl@0: *(__m64 *)dst = vdest; sl@0: sl@0: dst += 4; sl@0: w -= 4; sl@0: } sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w) sl@0: { sl@0: ullong d = *dst; sl@0: __m64 vdest = expand565 ((__m64)d, 0); sl@0: vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); sl@0: *dst = (ullong)vdest; sl@0: sl@0: w--; sl@0: dst++; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: #if 0 sl@0: static void sl@0: fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w) sl@0: { sl@0: CARD32 src, srca; sl@0: CARD32 *dstLine; sl@0: CARD32 *maskLine; sl@0: FbStride dstStride, maskStride; sl@0: __m64 vsrc, vsrca; sl@0: sl@0: sl@0: while (twidth && (unsigned long)q & 7) sl@0: { sl@0: CARD32 m = *(CARD32 *)p; sl@0: sl@0: if (m) sl@0: { sl@0: __m64 vdest = load8888(*q); sl@0: vdest = in_over(vsrc, vsrca, load8888(m), vdest); sl@0: *q = (ullong)pack8888(vdest, _mm_setzero_si64()); sl@0: } sl@0: sl@0: twidth--; sl@0: p++; sl@0: q++; sl@0: } sl@0: sl@0: while (twidth >= 2) sl@0: { sl@0: CARD32 m0, m1; sl@0: m0 = *p; sl@0: m1 = *(p + 1); sl@0: sl@0: if (m0 | m1) sl@0: { sl@0: __m64 dest0, dest1; sl@0: __m64 vdest = *(__m64 *)q; sl@0: sl@0: dest0 = in_over(vsrc, vsrca, load8888(m0), sl@0: expand8888 (vdest, 0)); sl@0: dest1 = in_over(vsrc, vsrca, load8888(m1), sl@0: expand8888 (vdest, 1)); sl@0: sl@0: *(__m64 *)q = pack8888(dest0, dest1); sl@0: } sl@0: sl@0: p += 2; sl@0: q += 2; sl@0: twidth -= 2; sl@0: } sl@0: sl@0: while (twidth) sl@0: { sl@0: CARD32 m = *(CARD32 *)p; 
sl@0: sl@0: if (m) sl@0: { sl@0: __m64 vdest = load8888(*q); sl@0: vdest = in_over(vsrc, vsrca, load8888(m), vdest); sl@0: *q = (ullong)pack8888(vdest, _mm_setzero_si64()); sl@0: } sl@0: sl@0: twidth--; sl@0: p++; sl@0: q++; sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: #endif sl@0: sl@0: #if 0 sl@0: static void sl@0: fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask, sl@0: int width) sl@0: { sl@0: sl@0: mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; sl@0: vmask = load8888 (mask); sl@0: srca = MC(4x00ff); sl@0: sl@0: while (height--) sl@0: { sl@0: dst = dstLine; sl@0: dstLine += dstStride; sl@0: src = srcLine; sl@0: srcLine += srcStride; sl@0: w = width; sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: __m64 s = load8888 (*src); sl@0: __m64 d = load8888 (*dst); sl@0: sl@0: *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: sl@0: while (w >= 16) sl@0: { sl@0: __m64 vd0 = *(__m64 *)(dst + 0); sl@0: __m64 vd1 = *(__m64 *)(dst + 2); sl@0: __m64 vd2 = *(__m64 *)(dst + 4); sl@0: __m64 vd3 = *(__m64 *)(dst + 6); sl@0: __m64 vd4 = *(__m64 *)(dst + 8); sl@0: __m64 vd5 = *(__m64 *)(dst + 10); sl@0: __m64 vd6 = *(__m64 *)(dst + 12); sl@0: __m64 vd7 = *(__m64 *)(dst + 14); sl@0: sl@0: __m64 vs0 = *(__m64 *)(src + 0); sl@0: __m64 vs1 = *(__m64 *)(src + 2); sl@0: __m64 vs2 = *(__m64 *)(src + 4); sl@0: __m64 vs3 = *(__m64 *)(src + 6); sl@0: __m64 vs4 = *(__m64 *)(src + 8); sl@0: __m64 vs5 = *(__m64 *)(src + 10); sl@0: __m64 vs6 = *(__m64 *)(src + 12); sl@0: __m64 vs7 = *(__m64 *)(src + 14); sl@0: sl@0: vd0 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), sl@0: in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); sl@0: sl@0: vd1 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), sl@0: in_over (expand8888 (vs1, 1), srca, vmask, expand8888 
(vd1, 1))); sl@0: sl@0: vd2 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), sl@0: in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); sl@0: sl@0: vd3 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), sl@0: in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); sl@0: sl@0: vd4 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), sl@0: in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); sl@0: sl@0: vd5 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), sl@0: in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); sl@0: sl@0: vd6 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), sl@0: in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); sl@0: sl@0: vd7 = (__m64)pack8888 ( sl@0: in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), sl@0: in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); sl@0: sl@0: *(__m64 *)(dst + 0) = vd0; sl@0: *(__m64 *)(dst + 2) = vd1; sl@0: *(__m64 *)(dst + 4) = vd2; sl@0: *(__m64 *)(dst + 6) = vd3; sl@0: *(__m64 *)(dst + 8) = vd4; sl@0: *(__m64 *)(dst + 10) = vd5; sl@0: *(__m64 *)(dst + 12) = vd6; sl@0: *(__m64 *)(dst + 14) = vd7; sl@0: sl@0: w -= 16; sl@0: dst += 16; sl@0: src += 16; sl@0: } sl@0: sl@0: while (w) sl@0: { sl@0: __m64 s = load8888 (*src); sl@0: __m64 d = load8888 (*dst); sl@0: sl@0: *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: sl@0: void sl@0: fbCompositeSrc_8888x8888mmx (CARD8 op, sl@0: PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 ySrc, sl@0: INT16 xMask, sl@0: INT16 yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { 
sl@0: CARD32 *dstLine, *dst; sl@0: CARD32 *srcLine, *src; sl@0: FbStride dstStride, srcStride; sl@0: CARD16 w; sl@0: __m64 srca; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); sl@0: fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); sl@0: sl@0: srca = MC (4x00ff); sl@0: sl@0: while (height--) sl@0: { sl@0: dst = dstLine; sl@0: dstLine += dstStride; sl@0: src = srcLine; sl@0: srcLine += srcStride; sl@0: w = width; sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: __m64 s = load8888 (*src); sl@0: __m64 d = load8888 (*dst); sl@0: sl@0: *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64()); sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: sl@0: while (w >= 2) sl@0: { sl@0: __m64 vd = *(__m64 *)(dst + 0); sl@0: __m64 vs = *(__m64 *)(src + 0); sl@0: __m64 vs0 = expand8888 (vs, 0); sl@0: __m64 vs1 = expand8888 (vs, 1); sl@0: sl@0: *(__m64 *)dst = (__m64)pack8888 ( sl@0: over (vs0, expand_alpha (vs0), expand8888 (vd, 0)), sl@0: over (vs1, expand_alpha (vs1), expand8888 (vd, 1))); sl@0: sl@0: w -= 2; sl@0: dst += 2; sl@0: src += 2; sl@0: } sl@0: sl@0: while (w) sl@0: { sl@0: __m64 s = load8888 (*src); sl@0: __m64 d = load8888 (*dst); sl@0: sl@0: *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), sl@0: (__m64)_mm_setzero_si64()); sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: sl@0: void sl@0: fbCompositeSolidMask_nx8x8888mmx (CARD8 op, sl@0: PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 ySrc, sl@0: INT16 xMask, sl@0: INT16 yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { sl@0: CARD32 src, srca; sl@0: CARD32 *dstLine, *dst; sl@0: CARD8 *maskLine, *mask; sl@0: FbStride dstStride, maskStride; sl@0: CARD16 w; sl@0: __m64 vsrc, vsrca; sl@0: ullong srcsrc; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: 
fbComposeGetSolid(pSrc, src, pDst->format); sl@0: sl@0: srca = src >> 24; sl@0: if (srca == 0) sl@0: return; sl@0: sl@0: srcsrc = (unsigned long long)src << 32 | src; sl@0: sl@0: fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); sl@0: fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); sl@0: sl@0: vsrc = load8888 (src); sl@0: vsrca = expand_alpha (vsrc); sl@0: sl@0: while (height--) sl@0: { sl@0: dst = dstLine; sl@0: dstLine += dstStride; sl@0: mask = maskLine; sl@0: maskLine += maskStride; sl@0: w = width; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: ullong m = *mask; sl@0: sl@0: if (m) sl@0: { sl@0: __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst)); sl@0: *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); sl@0: } sl@0: sl@0: w--; sl@0: mask++; sl@0: dst++; sl@0: } sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w >= 2) sl@0: { sl@0: ullong m0, m1; sl@0: m0 = *mask; sl@0: m1 = *(mask + 1); sl@0: sl@0: if (srca == 0xff && (m0 & m1) == 0xff) sl@0: { sl@0: *(unsigned long long *)dst = srcsrc; sl@0: } sl@0: else if (m0 | m1) sl@0: { sl@0: __m64 vdest; sl@0: __m64 dest0, dest1; sl@0: sl@0: vdest = *(__m64 *)dst; sl@0: sl@0: dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0)); sl@0: dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1)); sl@0: sl@0: *(__m64 *)dst = pack8888(dest0, dest1); sl@0: } sl@0: sl@0: mask += 2; sl@0: dst += 2; sl@0: w -= 2; sl@0: } sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w) sl@0: { sl@0: ullong m = *mask; sl@0: sl@0: if (m) sl@0: { sl@0: __m64 vdest = load8888(*dst); sl@0: vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest); sl@0: *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); sl@0: } sl@0: sl@0: w--; sl@0: mask++; sl@0: dst++; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: sl@0: sl@0: void sl@0: fbCompositeSolidMask_nx8x0565mmx (CARD8 op, sl@0: 
PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 ySrc, sl@0: INT16 xMask, sl@0: INT16 yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { sl@0: CARD32 src, srca; sl@0: CARD16 *dstLine, *dst; sl@0: CARD8 *maskLine, *mask; sl@0: FbStride dstStride, maskStride; sl@0: CARD16 w; sl@0: __m64 vsrc, vsrca; sl@0: unsigned long long srcsrcsrcsrc, src16; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: fbComposeGetSolid(pSrc, src, pDst->format); sl@0: sl@0: srca = src >> 24; sl@0: if (srca == 0) sl@0: return; sl@0: sl@0: fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); sl@0: fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); sl@0: sl@0: vsrc = load8888 (src); sl@0: vsrca = expand_alpha (vsrc); sl@0: sl@0: src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0); sl@0: sl@0: srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | sl@0: (ullong)src16 << 16 | (ullong)src16; sl@0: sl@0: while (height--) sl@0: { sl@0: dst = dstLine; sl@0: dstLine += dstStride; sl@0: mask = maskLine; sl@0: maskLine += maskStride; sl@0: w = width; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: ullong m = *mask; sl@0: sl@0: if (m) sl@0: { sl@0: ullong d = *dst; sl@0: __m64 vd = (__m64)d; sl@0: __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); sl@0: *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); sl@0: } sl@0: sl@0: w--; sl@0: mask++; sl@0: dst++; sl@0: } sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w >= 4) sl@0: { sl@0: ullong m0, m1, m2, m3; sl@0: m0 = *mask; sl@0: m1 = *(mask + 1); sl@0: m2 = *(mask + 2); sl@0: m3 = *(mask + 3); sl@0: sl@0: if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) sl@0: { sl@0: *(unsigned long long *)dst = srcsrcsrcsrc; sl@0: } sl@0: else if (m0 | m1 | m2 | m3) sl@0: { sl@0: __m64 vdest; sl@0: __m64 vm0, vm1, vm2, vm3; sl@0: sl@0: vdest = *(__m64 *)dst; sl@0: sl@0: vm0 = (__m64)m0; 
sl@0: vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); sl@0: vm1 = (__m64)m1; sl@0: vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); sl@0: vm2 = (__m64)m2; sl@0: vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); sl@0: vm3 = (__m64)m3; sl@0: vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); sl@0: sl@0: *(__m64 *)dst = vdest; sl@0: } sl@0: sl@0: w -= 4; sl@0: mask += 4; sl@0: dst += 4; sl@0: } sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w) sl@0: { sl@0: ullong m = *mask; sl@0: sl@0: if (m) sl@0: { sl@0: ullong d = *dst; sl@0: __m64 vd = (__m64)d; sl@0: __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); sl@0: *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); sl@0: } sl@0: sl@0: w--; sl@0: mask++; sl@0: dst++; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: sl@0: void sl@0: fbCompositeSrc_8888RevNPx0565mmx (CARD8 op, sl@0: PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 ySrc, sl@0: INT16 xMask, sl@0: INT16 yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { sl@0: CARD16 *dstLine, *dst; sl@0: CARD32 *srcLine, *src; sl@0: FbStride dstStride, srcStride; sl@0: CARD16 w; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); sl@0: fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); sl@0: sl@0: assert (pSrc->pDrawable == pMask->pDrawable); sl@0: sl@0: while (height--) sl@0: { sl@0: dst = dstLine; sl@0: dstLine += dstStride; sl@0: src = srcLine; sl@0: srcLine += srcStride; sl@0: w = width; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: __m64 vsrc = load8888 (*src); sl@0: ullong d = *dst; sl@0: __m64 vdest = expand565 ((__m64)d, 0); sl@0: sl@0: vdest = 
pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); sl@0: sl@0: *dst = (ullong)vdest; sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w >= 4) sl@0: { sl@0: CARD32 s0, s1, s2, s3; sl@0: unsigned char a0, a1, a2, a3; sl@0: sl@0: s0 = *src; sl@0: s1 = *(src + 1); sl@0: s2 = *(src + 2); sl@0: s3 = *(src + 3); sl@0: sl@0: a0 = (s0 >> 24); sl@0: a1 = (s1 >> 24); sl@0: a2 = (s2 >> 24); sl@0: a3 = (s3 >> 24); sl@0: sl@0: if ((a0 & a1 & a2 & a3) == 0xFF) sl@0: { sl@0: __m64 vdest; sl@0: vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); sl@0: vdest = pack565(invert_colors(load8888(s1)), vdest, 1); sl@0: vdest = pack565(invert_colors(load8888(s2)), vdest, 2); sl@0: vdest = pack565(invert_colors(load8888(s3)), vdest, 3); sl@0: sl@0: *(__m64 *)dst = vdest; sl@0: } sl@0: else if (a0 | a1 | a2 | a3) sl@0: { sl@0: __m64 vdest = *(__m64 *)dst; sl@0: sl@0: vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); sl@0: vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); sl@0: vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); sl@0: vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); sl@0: sl@0: *(__m64 *)dst = vdest; sl@0: } sl@0: sl@0: w -= 4; sl@0: dst += 4; sl@0: src += 4; sl@0: } sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: while (w) sl@0: { sl@0: __m64 vsrc = load8888 (*src); sl@0: ullong d = *dst; sl@0: __m64 vdest = expand565 ((__m64)d, 0); sl@0: sl@0: vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); sl@0: sl@0: *dst = (ullong)vdest; sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: sl@0: /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ sl@0: sl@0: void sl@0: fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, sl@0: PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 ySrc, sl@0: INT16 xMask, sl@0: INT16 
yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { sl@0: CARD32 *dstLine, *dst; sl@0: CARD32 *srcLine, *src; sl@0: FbStride dstStride, srcStride; sl@0: CARD16 w; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); sl@0: fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); sl@0: sl@0: assert (pSrc->pDrawable == pMask->pDrawable); sl@0: sl@0: while (height--) sl@0: { sl@0: dst = dstLine; sl@0: dstLine += dstStride; sl@0: src = srcLine; sl@0: srcLine += srcStride; sl@0: w = width; sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: __m64 s = load8888 (*src); sl@0: __m64 d = load8888 (*dst); sl@0: sl@0: *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: sl@0: while (w >= 2) sl@0: { sl@0: ullong s0, s1; sl@0: unsigned char a0, a1; sl@0: __m64 d0, d1; sl@0: sl@0: s0 = *src; sl@0: s1 = *(src + 1); sl@0: sl@0: a0 = (s0 >> 24); sl@0: a1 = (s1 >> 24); sl@0: sl@0: if ((a0 & a1) == 0xFF) sl@0: { sl@0: d0 = invert_colors(load8888(s0)); sl@0: d1 = invert_colors(load8888(s1)); sl@0: sl@0: *(__m64 *)dst = pack8888 (d0, d1); sl@0: } sl@0: else if (a0 | a1) sl@0: { sl@0: __m64 vdest = *(__m64 *)dst; sl@0: sl@0: d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); sl@0: d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); sl@0: sl@0: *(__m64 *)dst = pack8888 (d0, d1); sl@0: } sl@0: sl@0: w -= 2; sl@0: dst += 2; sl@0: src += 2; sl@0: } sl@0: sl@0: while (w) sl@0: { sl@0: __m64 s = load8888 (*src); sl@0: __m64 d = load8888 (*dst); sl@0: sl@0: *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); sl@0: sl@0: w--; sl@0: dst++; sl@0: src++; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: sl@0: void sl@0: fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, sl@0: PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 
ySrc, sl@0: INT16 xMask, sl@0: INT16 yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { sl@0: CARD32 src, srca; sl@0: CARD16 *dstLine; sl@0: CARD32 *maskLine; sl@0: FbStride dstStride, maskStride; sl@0: __m64 vsrc, vsrca; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: fbComposeGetSolid(pSrc, src, pDst->format); sl@0: sl@0: srca = src >> 24; sl@0: if (srca == 0) sl@0: return; sl@0: sl@0: fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); sl@0: fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1); sl@0: sl@0: vsrc = load8888 (src); sl@0: vsrca = expand_alpha (vsrc); sl@0: sl@0: while (height--) sl@0: { sl@0: int twidth = width; sl@0: CARD32 *p = (CARD32 *)maskLine; sl@0: CARD16 *q = (CARD16 *)dstLine; sl@0: sl@0: while (twidth && ((unsigned long)q & 7)) sl@0: { sl@0: CARD32 m = *(CARD32 *)p; sl@0: sl@0: if (m) sl@0: { sl@0: ullong d = *q; sl@0: __m64 vdest = expand565 ((__m64)d, 0); sl@0: vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); sl@0: *q = (ullong)vdest; sl@0: } sl@0: sl@0: twidth--; sl@0: p++; sl@0: q++; sl@0: } sl@0: sl@0: while (twidth >= 4) sl@0: { sl@0: CARD32 m0, m1, m2, m3; sl@0: sl@0: m0 = *p; sl@0: m1 = *(p + 1); sl@0: m2 = *(p + 2); sl@0: m3 = *(p + 3); sl@0: sl@0: if ((m0 | m1 | m2 | m3)) sl@0: { sl@0: __m64 vdest = *(__m64 *)q; sl@0: sl@0: vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); sl@0: vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); sl@0: vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); sl@0: vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); sl@0: sl@0: *(__m64 *)q = vdest; sl@0: } sl@0: twidth -= 4; sl@0: p += 4; sl@0: q += 4; sl@0: } sl@0: sl@0: while (twidth) sl@0: { sl@0: CARD32 m; sl@0: sl@0: m = *(CARD32 *)p; sl@0: if (m) sl@0: { sl@0: ullong d = *q; sl@0: __m64 vdest = expand565((__m64)d, 0); sl@0: vdest 
= pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); sl@0: *q = (ullong)vdest; sl@0: } sl@0: sl@0: twidth--; sl@0: p++; sl@0: q++; sl@0: } sl@0: sl@0: maskLine += maskStride; sl@0: dstLine += dstStride; sl@0: } sl@0: sl@0: _mm_empty (); sl@0: } sl@0: #endif sl@0: sl@0: static void sl@0: fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w) sl@0: { sl@0: int s; sl@0: int d; sl@0: int t; sl@0: sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: s = *src; sl@0: d = *dst; sl@0: t = d + s; sl@0: s = t | (0 - (t >> 8)); sl@0: *dst = s; sl@0: sl@0: dst++; sl@0: src++; sl@0: w--; sl@0: } sl@0: sl@0: while (w >= 8) sl@0: { sl@0: *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); sl@0: dst += 8; sl@0: src += 8; sl@0: w -= 8; sl@0: } sl@0: sl@0: while (w) sl@0: { sl@0: s = *src; sl@0: d = *dst; sl@0: t = d + s; sl@0: s = t | (0 - (t >> 8)); sl@0: *dst = s; sl@0: sl@0: dst++; sl@0: src++; sl@0: w--; sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX); sl@0: sl@0: static void sl@0: fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w) sl@0: { sl@0: while (w && (unsigned long)dst & 7) sl@0: { sl@0: *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), sl@0: _mm_cvtsi32_si64(*dst))); sl@0: dst++; sl@0: src++; sl@0: w--; sl@0: } sl@0: sl@0: while (w >= 2) sl@0: { sl@0: *(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); sl@0: dst += 2; sl@0: src += 2; sl@0: w -= 2; sl@0: } sl@0: sl@0: if (w) sl@0: { sl@0: *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), sl@0: _mm_cvtsi32_si64(*dst))); sl@0: sl@0: } sl@0: sl@0: _mm_empty(); sl@0: } sl@0: OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE); sl@0: sl@0: #if 0 sl@0: #define GetStart(drw,x,y,type,stride,line,bpp) {\ sl@0: FbBits *__bits__; \ sl@0: FbStride __stride__; \ sl@0: int __xoff__,__yoff__; \ sl@0: \ sl@0: 
fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__); \ sl@0: (stride) = __stride__ * sizeof (FbBits) / sizeof (type); \ sl@0: (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \ sl@0: } sl@0: sl@0: Bool sl@0: fbSolidFillmmx (DrawablePtr pDraw, sl@0: int x, sl@0: int y, sl@0: int width, sl@0: int height, sl@0: FbBits xor) sl@0: { sl@0: FbStride stride; sl@0: int bpp; sl@0: ullong fill; sl@0: __m64 vfill; sl@0: CARD32 byte_width; sl@0: CARD8 *byte_line; sl@0: FbBits *bits; sl@0: int xoff, yoff; sl@0: sl@0: CHECKPOINT(); sl@0: sl@0: fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); sl@0: sl@0: if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) sl@0: return FALSE; sl@0: sl@0: if (bpp != 16 && bpp != 32) sl@0: return FALSE; sl@0: sl@0: if (bpp == 16) sl@0: { sl@0: stride = stride * sizeof (FbBits) / 2; sl@0: byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff)); sl@0: byte_width = 2 * width; sl@0: stride *= 2; sl@0: } sl@0: else sl@0: { sl@0: stride = stride * sizeof (FbBits) / 4; sl@0: byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff)); sl@0: byte_width = 4 * width; sl@0: stride *= 4; sl@0: } sl@0: sl@0: fill = ((ullong)xor << 32) | xor; sl@0: vfill = (__m64)fill; sl@0: sl@0: while (height--) sl@0: { sl@0: int w; sl@0: CARD8 *d = byte_line; sl@0: byte_line += stride; sl@0: w = byte_width; sl@0: sl@0: while (w >= 2 && ((unsigned long)d & 3)) sl@0: { sl@0: *(CARD16 *)d = xor; sl@0: w -= 2; sl@0: d += 2; sl@0: } sl@0: sl@0: while (w >= 4 && ((unsigned long)d & 7)) sl@0: { sl@0: *(CARD32 *)d = xor; sl@0: sl@0: w -= 4; sl@0: d += 4; sl@0: } sl@0: sl@0: while (w >= 64) sl@0: { sl@0: *(__m64*) (d + 0) = vfill; sl@0: *(__m64*) (d + 8) = vfill; sl@0: *(__m64*) (d + 16) = vfill; sl@0: *(__m64*) (d + 24) = vfill; sl@0: *(__m64*) (d + 32) = vfill; sl@0: *(__m64*) (d + 40) = vfill; sl@0: *(__m64*) (d + 48) = vfill; sl@0: *(__m64*) (d + 56) = vfill; sl@0: sl@0: w -= 64; sl@0: d += 64; sl@0: } sl@0: 
while (w >= 4) sl@0: { sl@0: *(CARD32 *)d = xor; sl@0: sl@0: w -= 4; sl@0: d += 4; sl@0: } sl@0: if (w >= 2) sl@0: { sl@0: *(CARD16 *)d = xor; sl@0: w -= 2; sl@0: d += 2; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: return TRUE; sl@0: } sl@0: sl@0: Bool sl@0: fbCopyAreammx (DrawablePtr pSrc, sl@0: DrawablePtr pDst, sl@0: int src_x, sl@0: int src_y, sl@0: int dst_x, sl@0: int dst_y, sl@0: int width, sl@0: int height) sl@0: { sl@0: FbBits * src_bits; sl@0: FbStride src_stride; sl@0: int src_bpp; sl@0: int src_xoff; sl@0: int src_yoff; sl@0: sl@0: FbBits * dst_bits; sl@0: FbStride dst_stride; sl@0: int dst_bpp; sl@0: int dst_xoff; sl@0: int dst_yoff; sl@0: sl@0: CARD8 * src_bytes; sl@0: CARD8 * dst_bytes; sl@0: int byte_width; sl@0: sl@0: fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff); sl@0: fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff); sl@0: sl@0: if (src_bpp != 16 && src_bpp != 32) sl@0: return FALSE; sl@0: sl@0: if (dst_bpp != 16 && dst_bpp != 32) sl@0: return FALSE; sl@0: sl@0: if (src_bpp != dst_bpp) sl@0: { sl@0: return FALSE; sl@0: } sl@0: sl@0: if (src_bpp == 16) sl@0: { sl@0: src_stride = src_stride * sizeof (FbBits) / 2; sl@0: dst_stride = dst_stride * sizeof (FbBits) / 2; sl@0: src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); sl@0: dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); sl@0: byte_width = 2 * width; sl@0: src_stride *= 2; sl@0: dst_stride *= 2; sl@0: } sl@0: else sl@0: { sl@0: src_stride = src_stride * sizeof (FbBits) / 4; sl@0: dst_stride = dst_stride * sizeof (FbBits) / 4; sl@0: src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); sl@0: dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); sl@0: byte_width = 4 * width; sl@0: src_stride *= 4; sl@0: dst_stride *= 4; sl@0: } sl@0: sl@0: while (height--) 
sl@0: { sl@0: int w; sl@0: CARD8 *s = src_bytes; sl@0: CARD8 *d = dst_bytes; sl@0: src_bytes += src_stride; sl@0: dst_bytes += dst_stride; sl@0: w = byte_width; sl@0: sl@0: while (w >= 2 && ((unsigned long)d & 3)) sl@0: { sl@0: *(CARD16 *)d = *(CARD16 *)s; sl@0: w -= 2; sl@0: s += 2; sl@0: d += 2; sl@0: } sl@0: sl@0: while (w >= 4 && ((unsigned long)d & 7)) sl@0: { sl@0: *(CARD32 *)d = *(CARD32 *)s; sl@0: sl@0: w -= 4; sl@0: s += 4; sl@0: d += 4; sl@0: } sl@0: sl@0: while (w >= 64) sl@0: { sl@0: *(__m64 *)(d + 0) = *(__m64 *)(s + 0); sl@0: *(__m64 *)(d + 8) = *(__m64 *)(s + 8); sl@0: *(__m64 *)(d + 16) = *(__m64 *)(s + 16); sl@0: *(__m64 *)(d + 24) = *(__m64 *)(s + 24); sl@0: *(__m64 *)(d + 32) = *(__m64 *)(s + 32); sl@0: *(__m64 *)(d + 40) = *(__m64 *)(s + 40); sl@0: *(__m64 *)(d + 48) = *(__m64 *)(s + 48); sl@0: *(__m64 *)(d + 56) = *(__m64 *)(s + 56); sl@0: w -= 64; sl@0: s += 64; sl@0: d += 64; sl@0: } sl@0: while (w >= 4) sl@0: { sl@0: *(CARD32 *)d = *(CARD32 *)s; sl@0: sl@0: w -= 4; sl@0: s += 4; sl@0: d += 4; sl@0: } sl@0: if (w >= 2) sl@0: { sl@0: *(CARD16 *)d = *(CARD16 *)s; sl@0: w -= 2; sl@0: s += 2; sl@0: d += 2; sl@0: } sl@0: } sl@0: sl@0: _mm_empty(); sl@0: return TRUE; sl@0: } sl@0: sl@0: void sl@0: fbCompositeCopyAreammx (CARD8 op, sl@0: PicturePtr pSrc, sl@0: PicturePtr pMask, sl@0: PicturePtr pDst, sl@0: INT16 xSrc, sl@0: INT16 ySrc, sl@0: INT16 xMask, sl@0: INT16 yMask, sl@0: INT16 xDst, sl@0: INT16 yDst, sl@0: CARD16 width, sl@0: CARD16 height) sl@0: { sl@0: fbCopyAreammx (pSrc->pDrawable, sl@0: pDst->pDrawable, sl@0: xSrc, ySrc, sl@0: xDst, yDst, sl@0: width, height); sl@0: } sl@0: sl@0: #if !defined(__amd64__) && !defined(__x86_64__) sl@0: sl@0: enum CPUFeatures { sl@0: NoFeatures = 0, sl@0: MMX = 0x1, sl@0: MMX_Extensions = 0x2, sl@0: SSE = 0x6, sl@0: SSE2 = 0x8, sl@0: CMOV = 0x10 sl@0: }; sl@0: sl@0: static unsigned int detectCPUFeatures(void) { sl@0: unsigned int result; sl@0: char vendor[13]; sl@0: vendor[0] = 0; sl@0: vendor[12] = 0; 
sl@0: /* see p. 118 of amd64 instruction set manual Vol3 */ sl@0: __asm__ ("push %%ebx\n" sl@0: "pushf\n" sl@0: "pop %%eax\n" sl@0: "mov %%eax, %%ebx\n" sl@0: "xor $0x00200000, %%eax\n" sl@0: "push %%eax\n" sl@0: "popf\n" sl@0: "pushf\n" sl@0: "pop %%eax\n" sl@0: "mov $0x0, %%edx\n" sl@0: "xor %%ebx, %%eax\n" sl@0: "jz skip\n" sl@0: sl@0: "mov $0x00000000, %%eax\n" sl@0: "cpuid\n" sl@0: "mov %%ebx, %1\n" sl@0: "mov %%edx, %2\n" sl@0: "mov %%ecx, %3\n" sl@0: "mov $0x00000001, %%eax\n" sl@0: "cpuid\n" sl@0: "skip:\n" sl@0: "pop %%ebx\n" sl@0: "mov %%edx, %0\n" sl@0: : "=r" (result), sl@0: "=m" (vendor[0]), sl@0: "=m" (vendor[4]), sl@0: "=m" (vendor[8]) sl@0: : sl@0: : "%eax", "%ecx", "%edx" sl@0: ); sl@0: sl@0: unsigned int features = 0; sl@0: if (result) { sl@0: /* result now contains the standard feature bits */ sl@0: if (result & (1 << 15)) sl@0: features |= CMOV; sl@0: if (result & (1 << 23)) sl@0: features |= MMX; sl@0: if (result & (1 << 25)) sl@0: features |= SSE; sl@0: if (result & (1 << 26)) sl@0: features |= SSE2; sl@0: if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) { sl@0: /* check for AMD MMX extensions */ sl@0: sl@0: unsigned int result; sl@0: __asm__("push %%ebx\n" sl@0: "mov $0x80000000, %%eax\n" sl@0: "cpuid\n" sl@0: "xor %%edx, %%edx\n" sl@0: "cmp $0x1, %%eax\n" sl@0: "jge skip2\n" sl@0: "mov $0x80000001, %%eax\n" sl@0: "cpuid\n" sl@0: "skip2:\n" sl@0: "mov %%edx, %0\n" sl@0: "pop %%ebx\n" sl@0: : "=r" (result) sl@0: : sl@0: : "%eax", "%ecx", "%edx" sl@0: ); sl@0: if (result & (1<<22)) sl@0: features |= MMX_Extensions; sl@0: } sl@0: } sl@0: return features; sl@0: } sl@0: sl@0: Bool sl@0: fbHaveMMX (void) sl@0: { sl@0: static Bool initialized = FALSE; sl@0: static Bool mmx_present; sl@0: sl@0: if (!initialized) sl@0: { sl@0: unsigned int features = detectCPUFeatures(); sl@0: mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions); sl@0: initialized = TRUE; sl@0: } sl@0: sl@0: return mmx_present; sl@0: } 
sl@0: #endif /* __amd64__ */ sl@0: sl@0: sl@0: #endif sl@0: sl@0: sl@0: #ifdef __SYMBIAN32__ sl@0: sl@0: OilFunctionImpl* __oil_function_impl_mmxCombineOverU, composite_over_argb() { sl@0: return &_oil_function_impl_mmxCombineOverU, composite_over_argb; sl@0: } sl@0: #endif sl@0: sl@0: #ifdef __SYMBIAN32__ sl@0: sl@0: OilFunctionImpl* __oil_function_impl_mmxCombineAddU, composite_add_argb() { sl@0: return &_oil_function_impl_mmxCombineAddU, composite_add_argb; sl@0: } sl@0: #endif sl@0: sl@0: #ifdef __SYMBIAN32__ sl@0: sl@0: OilFunctionImpl* __oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src() { sl@0: return &_oil_function_impl_fbCompositeSolid_nx8888mmx, composite_over_argb_const_src; sl@0: } sl@0: #endif sl@0: sl@0: #ifdef __SYMBIAN32__ sl@0: sl@0: OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8() { sl@0: return &_oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, composite_add_u8; sl@0: } sl@0: #endif sl@0: sl@0: #ifdef __SYMBIAN32__ sl@0: sl@0: OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb() { sl@0: return &_oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, composite_add_argb; sl@0: } sl@0: #endif sl@0: