[meego-commits] 8989: Changes to Trunk:Testing/pixman

Liu Xinyun no_reply at build.meego.com
Thu Oct 28 00:59:15 UTC 2010


Hi,
I have made the following changes to pixman in project Trunk:Testing. Please review and accept ASAP.

Thank You,
Liu Xinyun

[This message was auto-generated]

---

Request #8989:

  submit:   home:xyl:branches:Trunk:Testing/pixman(r3)(cleanup) -> Trunk:Testing/pixman


Message:
    Update to 0.20.0, the latest stable version.

State:   new          2010-10-27T17:59:14 xyl
Comment: None



changes files:
--------------
--- pixman.changes
+++ pixman.changes
@@ -0,0 +1,3 @@
+* Thu Oct 28 2010 Liu Xinyun <xinyun.liu at intel.com> - 0.20.0
+- Update to 0.20.0
+

old:
----
  pixman-0.19.4.tar.bz2

new:
----
  pixman-0.20.0.tar.bz2

spec files:
-----------
--- pixman.spec
+++ pixman.spec
@@ -1,13 +1,13 @@
 # 
 # Do NOT Edit the Auto-generated Part!
-# Generated by: spectacle version 0.19
+# Generated by: spectacle version 0.20
 # 
 # >> macros
 # << macros
 
 Name:       pixman
 Summary:    Pixel manipulation library
-Version:    0.19.4
+Version:    0.20.0
 Release:    1
 Group:      System/Libraries
 License:    MIT

other changes:
--------------

++++++ pixman-0.19.4.tar.bz2 -> pixman-0.20.0.tar.bz2
--- configure
+++ configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.65 for pixman 0.19.4.
+# Generated by GNU Autoconf 2.65 for pixman 0.20.0.
 #
 # Report bugs to <"pixman at lists.freedesktop.org">.
 #
@@ -701,8 +701,8 @@
 # Identity of this package.
 PACKAGE_NAME='pixman'
 PACKAGE_TARNAME='pixman'
-PACKAGE_VERSION='0.19.4'
-PACKAGE_STRING='pixman 0.19.4'
+PACKAGE_VERSION='0.20.0'
+PACKAGE_STRING='pixman 0.20.0'
 PACKAGE_BUGREPORT='"pixman at lists.freedesktop.org"'
 PACKAGE_URL=''
 
@@ -1473,7 +1473,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures pixman 0.19.4 to adapt to many kinds of systems.
+\`configure' configures pixman 0.20.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1543,7 +1543,7 @@
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of pixman 0.19.4:";;
+     short | recursive ) echo "Configuration of pixman 0.20.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1663,7 +1663,7 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-pixman configure 0.19.4
+pixman configure 0.20.0
 generated by GNU Autoconf 2.65
 
 Copyright (C) 2009 Free Software Foundation, Inc.
@@ -2251,7 +2251,7 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by pixman $as_me 0.19.4, which was
+It was created by pixman $as_me 0.20.0, which was
 generated by GNU Autoconf 2.65.  Invocation command line was
 
   $ $0 $@
@@ -3059,7 +3059,7 @@
 
 # Define the identity of the package.
  PACKAGE='pixman'
- VERSION='0.19.4'
+ VERSION='0.20.0'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -11239,13 +11239,13 @@
 
 
 
-LT_VERSION_INFO="19:4:19"
+LT_VERSION_INFO="20:0:20"
 
 PIXMAN_VERSION_MAJOR=0
 
-PIXMAN_VERSION_MINOR=19
+PIXMAN_VERSION_MINOR=20
 
-PIXMAN_VERSION_MICRO=4
+PIXMAN_VERSION_MICRO=0
 
 
 
@@ -12757,6 +12757,78 @@
 
     fi
 
+        if test "z$support_for_pthread_setspecific" != "zyes"; then
+		save_CFLAGS="$CFLAGS"
+	save_LDFLAGS="$LDFLAGS"
+	save_LIBS="$LIBS"
+	CFLAGS=""
+	LDFLAGS=""
+	LIBS=""
+	CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+    return 0;
+}
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=yes
+else
+  pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+	if test "x$pixman_cc_stderr" != "x"; then
+		pixman_cc_flag=no
+	fi
+
+	if test "x$pixman_cc_flag" = "xyes"; then
+		PTHREAD_CFLAGS="$CFLAGS"
+		 PTHREAD_LIBS="$LIBS"
+		 PTHREAD_LDFLAGS="$LDFLAGS"
+		 support_for_pthread_setspecific=yes
+	else
+		:
+	fi
+	CFLAGS="$save_CFLAGS"
+	LDFLAGS="$save_LDFLAGS"
+	LIBS="$save_LIBS"
+
+    fi
+
 
     if test $support_for_pthread_setspecific = yes; then
 	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
@@ -13330,7 +13402,7 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by pixman $as_me 0.19.4, which was
+This file was extended by pixman $as_me 0.20.0, which was
 generated by GNU Autoconf 2.65.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -13396,7 +13468,7 @@
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-pixman config.status 0.19.4
+pixman config.status 0.20.0
 configured by $0, generated by GNU Autoconf 2.65,
   with options \\"\$ac_cs_config\\"
 
--- configure.ac
+++ configure.ac
@@ -53,8 +53,8 @@
 #
 
 m4_define([pixman_major], 0)
-m4_define([pixman_minor], 19)
-m4_define([pixman_micro], 4)
+m4_define([pixman_minor], 20)
+m4_define([pixman_micro], 0)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
@@ -726,7 +726,8 @@
 
     PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
     PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
-
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+    
     if test $support_for_pthread_setspecific = yes; then
 	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
 	AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
--- pixman/pixman-arm-neon-asm.S
+++ pixman/pixman-arm-neon-asm.S
@@ -388,6 +388,7 @@
     vld1.16     {d4, d5}, [DST_R, :128]!
     vst1.16     {d28, d29}, [DST_W, :128]!
     pixman_composite_over_n_0565_process_pixblock_head
+    cache_preload 8, 8
 .endm
 
 .macro pixman_composite_over_n_0565_init
@@ -495,15 +496,15 @@
 
 /******************************************************************************/
 
-.macro pixman_composite_add_8000_8000_process_pixblock_head
+.macro pixman_composite_add_8_8_process_pixblock_head
     vqadd.u8    q14, q0, q2
     vqadd.u8    q15, q1, q3
 .endm
 
-.macro pixman_composite_add_8000_8000_process_pixblock_tail
+.macro pixman_composite_add_8_8_process_pixblock_tail
 .endm
 
-.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
     vld1.8      {d0, d1, d2, d3}, [SRC]!
                                     PF add PF_X, PF_X, #32
                                     PF tst PF_CTL, #0xF
@@ -523,15 +524,15 @@
 .endm
 
 generate_composite_function \
-    pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
+    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
     FLAG_DST_READWRITE, \
     32, /* number of pixels, processed in a single block */ \
     10, /* prefetch distance */ \
     default_init, \
     default_cleanup, \
-    pixman_composite_add_8000_8000_process_pixblock_head, \
-    pixman_composite_add_8000_8000_process_pixblock_tail, \
-    pixman_composite_add_8000_8000_process_pixblock_tail_head
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_tail_head
 
 /******************************************************************************/
 
@@ -561,8 +562,8 @@
     10, /* prefetch distance */ \
     default_init, \
     default_cleanup, \
-    pixman_composite_add_8000_8000_process_pixblock_head, \
-    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
     pixman_composite_add_8888_8888_process_pixblock_tail_head
 
 generate_composite_function_single_scanline \
@@ -571,8 +572,8 @@
     8, /* number of pixels, processed in a single block */ \
     default_init, \
     default_cleanup, \
-    pixman_composite_add_8000_8000_process_pixblock_head, \
-    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
     pixman_composite_add_8888_8888_process_pixblock_tail_head
 
 /******************************************************************************/
@@ -710,6 +711,7 @@
     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
     vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
     pixman_composite_over_8888_8888_process_pixblock_head
+    cache_preload 8, 8
 .endm
 
 .macro pixman_composite_over_n_8888_init
@@ -1870,3 +1872,104 @@
     10,  /* dst_r_basereg */ \
     8,  /* src_basereg   */ \
     15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmull.u8    q6,  d15, d2
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+    vqadd.u8    q0,  q0,  q2
+    vqadd.u8    q1,  q1,  q3
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+    vld1.8     {d15}, [MASK]!
+    pixman_composite_add_0565_8_0565_process_pixblock_tail
+    vld1.16    {d8, d9}, [SRC]!
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_add_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_add_0565_8_0565_process_pixblock_head, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* destination pixel data is in {d4, d5, d6, xx} */
+    vmvn.8      d24, d15 /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vraddhn.u16 d0, q14, q8
+    vraddhn.u16 d1, q15, q9
+    vraddhn.u16 d2, q12, q10
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+    vld1.8     {d15}, [SRC]!
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    15, /* src_basereg   */ \
+    0   /* mask_basereg  */
--- pixman/pixman-arm-neon.c
+++ pixman/pixman-arm-neon.c
@@ -52,7 +52,7 @@
                                    uint8_t, 3, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
                                    uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8000_8000,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
@@ -60,6 +60,8 @@
                                    uint32_t, 1, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
+                                   uint8_t, 1, uint16_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_n_0565,
                                  uint16_t, 1)
@@ -82,6 +84,8 @@
 
 PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
                                         uint8_t, 1, uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
                                         uint32_t, 1, uint32_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
@@ -256,12 +260,16 @@
     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
     PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
     PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
 
     { PIXMAN_OP_NONE },
 };
@@ -347,9 +355,13 @@
 pixman_implementation_t *
 _pixman_implementation_create_arm_neon (void)
 {
-    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
+#ifdef USE_ARM_SIMD
+    pixman_implementation_t *fallback = _pixman_implementation_create_arm_simd ();
+#else
+    pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
+#endif
     pixman_implementation_t *imp =
-	_pixman_implementation_create (general, arm_neon_fast_paths);
+	_pixman_implementation_create (fallback, arm_neon_fast_paths);
 
     imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
     imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
--- pixman/pixman-arm-simd-asm.S
+++ pixman/pixman-arm-simd-asm.S
@@ -56,7 +56,7 @@
  * aliases for better readability and maintainability.
  */
 
-pixman_asm_function pixman_composite_add_8000_8000_asm_armv6
+pixman_asm_function pixman_composite_add_8_8_asm_armv6
 	push	{r4, r5, r6, r7, r8, r9, r10, r11}
 	mov	r10, r1
 	sub	sp, sp, #4
--- pixman/pixman-arm-simd.c
+++ pixman/pixman-arm-simd.c
@@ -33,12 +33,12 @@
 #if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
 
 void
-pixman_composite_add_8000_8000_asm_armv6 (int32_t  width,
-                                          int32_t  height,
-                                          uint8_t *dst_line,
-                                          int32_t  dst_stride,
-                                          uint8_t *src_line,
-                                          int32_t  src_stride)
+pixman_composite_add_8_8_asm_armv6 (int32_t  width,
+				    int32_t  height,
+				    uint8_t *dst_line,
+				    int32_t  dst_stride,
+				    uint8_t *src_line,
+				    int32_t  src_stride)
 {
     uint8_t *dst, *src;
     int32_t w;
@@ -375,7 +375,7 @@
 
 #endif
 
-PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8000_8000,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
@@ -397,7 +397,7 @@
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
 
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
 
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
--- pixman/pixman-bits-image.c
+++ pixman/pixman-bits-image.c
@@ -944,7 +944,8 @@
 					  convert_ ## format,		\
 					  PIXMAN_ ## format,		\
 					  repeat_mode);			\
-    }
+    }									\
+    extern int no_such_variable
 
 MAKE_BILINEAR_FETCHER (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD);
 MAKE_BILINEAR_FETCHER (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE);
--- pixman/pixman-combine.c.template
+++ pixman/pixman-combine.c.template
@@ -133,6 +133,17 @@
 }
 
 static void
+combine_dst (pixman_implementation_t *imp,
+	     pixman_op_t	      op,
+	     comp4_t *		      dest,
+	     const comp4_t *	      src,
+	     const comp4_t *          mask,
+	     int		      width)
+{
+    return;
+}
+
+static void
 combine_src_u (pixman_implementation_t *imp,
                pixman_op_t              op,
                comp4_t *                dest,
@@ -1296,17 +1307,13 @@
 	comp4_t s = combine_mask (src, mask, i);
 	comp2_t a = s >> A_SHIFT;
 
-	if (a != 0x00)
+	if (s != 0x00)
 	{
-	    if (a != MASK)
-	    {
-		comp4_t d = *(dest + i);
-		a = combine_disjoint_out_part (d >> A_SHIFT, a);
-		UNcx4_MUL_UNc_ADD_UNcx4 (d, a, s);
-		s = d;
-	    }
+	    comp4_t d = *(dest + i);
+	    a = combine_disjoint_out_part (d >> A_SHIFT, a);
+	    UNcx4_MUL_UNc_ADD_UNcx4 (d, a, s);
 
-	    *(dest + i) = s;
+	    *(dest + i) = d;
 	}
     }
 }
@@ -2314,7 +2321,7 @@
     /* Unified alpha */
     imp->combine_width[PIXMAN_OP_CLEAR] = combine_clear;
     imp->combine_width[PIXMAN_OP_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_width[PIXMAN_OP_DST] = combine_dst;
     imp->combine_width[PIXMAN_OP_OVER] = combine_over_u;
     imp->combine_width[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
     imp->combine_width[PIXMAN_OP_IN] = combine_in_u;
@@ -2330,7 +2337,7 @@
     /* Disjoint, unified */
     imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
     imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_width[PIXMAN_OP_DISJOINT_DST] = combine_dst;
     imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
     imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
     imp->combine_width[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
@@ -2344,7 +2351,7 @@
     /* Conjoint, unified */
     imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
     imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_width[PIXMAN_OP_CONJOINT_DST] = combine_dst;
     imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
     imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
     imp->combine_width[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
@@ -2390,7 +2397,7 @@
     /* Disjoint CA */
     imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
     imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
-    /* dest */
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
     imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
     imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
     imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
@@ -2404,7 +2411,7 @@
     /* Conjoint CA */
     imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
     imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
-    /* dest */
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
     imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
     imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
     imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
@@ -2427,10 +2434,10 @@
     imp->combine_width_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
     imp->combine_width_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
 
-    /* It is not clear that these make sense, so leave them out for now */
-    imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = NULL;
-    imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = NULL;
-    imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = NULL;
-    imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = NULL;
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
 }
 
--- pixman/pixman-combine32.c
+++ pixman/pixman-combine32.c
@@ -137,6 +137,17 @@
 }
 
 static void
+combine_dst (pixman_implementation_t *imp,
+	     pixman_op_t	      op,
+	     uint32_t *		      dest,
+	     const uint32_t *	      src,
+	     const uint32_t *          mask,
+	     int		      width)
+{
+    return;
+}
+
+static void
 combine_src_u (pixman_implementation_t *imp,
                pixman_op_t              op,
                uint32_t *                dest,
@@ -1300,17 +1311,13 @@
 	uint32_t s = combine_mask (src, mask, i);
 	uint16_t a = s >> A_SHIFT;
 
-	if (a != 0x00)
+	if (s != 0x00)
 	{
-	    if (a != MASK)
-	    {
-		uint32_t d = *(dest + i);
-		a = combine_disjoint_out_part (d >> A_SHIFT, a);
-		UN8x4_MUL_UN8_ADD_UN8x4 (d, a, s);
-		s = d;
-	    }
+	    uint32_t d = *(dest + i);
+	    a = combine_disjoint_out_part (d >> A_SHIFT, a);
+	    UN8x4_MUL_UN8_ADD_UN8x4 (d, a, s);
 
-	    *(dest + i) = s;
+	    *(dest + i) = d;
 	}
     }
 }
@@ -2318,7 +2325,7 @@
     /* Unified alpha */
     imp->combine_32[PIXMAN_OP_CLEAR] = combine_clear;
     imp->combine_32[PIXMAN_OP_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_32[PIXMAN_OP_DST] = combine_dst;
     imp->combine_32[PIXMAN_OP_OVER] = combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
     imp->combine_32[PIXMAN_OP_IN] = combine_in_u;
@@ -2334,7 +2341,7 @@
     /* Disjoint, unified */
     imp->combine_32[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
     imp->combine_32[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_32[PIXMAN_OP_DISJOINT_DST] = combine_dst;
     imp->combine_32[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
     imp->combine_32[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
     imp->combine_32[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
@@ -2348,7 +2355,7 @@
     /* Conjoint, unified */
     imp->combine_32[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
     imp->combine_32[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_32[PIXMAN_OP_CONJOINT_DST] = combine_dst;
     imp->combine_32[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
     imp->combine_32[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
     imp->combine_32[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
@@ -2394,7 +2401,7 @@
     /* Disjoint CA */
     imp->combine_32_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
     imp->combine_32_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
-    /* dest */
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
     imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
     imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
     imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
@@ -2408,7 +2415,7 @@
     /* Conjoint CA */
     imp->combine_32_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
     imp->combine_32_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
-    /* dest */
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
     imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
     imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
     imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
@@ -2431,10 +2438,10 @@
     imp->combine_32_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
     imp->combine_32_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
 
-    /* It is not clear that these make sense, so leave them out for now */
-    imp->combine_32_ca[PIXMAN_OP_HSL_HUE] = NULL;
-    imp->combine_32_ca[PIXMAN_OP_HSL_SATURATION] = NULL;
-    imp->combine_32_ca[PIXMAN_OP_HSL_COLOR] = NULL;
-    imp->combine_32_ca[PIXMAN_OP_HSL_LUMINOSITY] = NULL;
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_32_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
 }
 
--- pixman/pixman-combine64.c
+++ pixman/pixman-combine64.c
@@ -137,6 +137,17 @@
 }
 
 static void
+combine_dst (pixman_implementation_t *imp,
+	     pixman_op_t	      op,
+	     uint64_t *		      dest,
+	     const uint64_t *	      src,
+	     const uint64_t *          mask,
+	     int		      width)
+{
+    return;
+}
+
+static void
 combine_src_u (pixman_implementation_t *imp,
                pixman_op_t              op,
                uint64_t *                dest,
@@ -1300,17 +1311,13 @@
 	uint64_t s = combine_mask (src, mask, i);
 	uint32_t a = s >> A_SHIFT;
 
-	if (a != 0x00)
+	if (s != 0x00)
 	{
-	    if (a != MASK)
-	    {
-		uint64_t d = *(dest + i);
-		a = combine_disjoint_out_part (d >> A_SHIFT, a);
-		UN16x4_MUL_UN16_ADD_UN16x4 (d, a, s);
-		s = d;
-	    }
+	    uint64_t d = *(dest + i);
+	    a = combine_disjoint_out_part (d >> A_SHIFT, a);
+	    UN16x4_MUL_UN16_ADD_UN16x4 (d, a, s);
 
-	    *(dest + i) = s;
+	    *(dest + i) = d;
 	}
     }
 }
@@ -2318,7 +2325,7 @@
     /* Unified alpha */
     imp->combine_64[PIXMAN_OP_CLEAR] = combine_clear;
     imp->combine_64[PIXMAN_OP_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_64[PIXMAN_OP_DST] = combine_dst;
     imp->combine_64[PIXMAN_OP_OVER] = combine_over_u;
     imp->combine_64[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
     imp->combine_64[PIXMAN_OP_IN] = combine_in_u;
@@ -2334,7 +2341,7 @@
     /* Disjoint, unified */
     imp->combine_64[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
     imp->combine_64[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_64[PIXMAN_OP_DISJOINT_DST] = combine_dst;
     imp->combine_64[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
     imp->combine_64[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
     imp->combine_64[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
@@ -2348,7 +2355,7 @@
     /* Conjoint, unified */
     imp->combine_64[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
     imp->combine_64[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
-    /* dest */
+    imp->combine_64[PIXMAN_OP_CONJOINT_DST] = combine_dst;
     imp->combine_64[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
     imp->combine_64[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
     imp->combine_64[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
@@ -2394,7 +2401,7 @@
     /* Disjoint CA */
     imp->combine_64_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
     imp->combine_64_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
-    /* dest */
+    imp->combine_64_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
     imp->combine_64_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
     imp->combine_64_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
     imp->combine_64_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
@@ -2408,7 +2415,7 @@
     /* Conjoint CA */
     imp->combine_64_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
     imp->combine_64_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
-    /* dest */
+    imp->combine_64_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
     imp->combine_64_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
     imp->combine_64_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
     imp->combine_64_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
@@ -2431,10 +2438,10 @@
     imp->combine_64_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
     imp->combine_64_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
 
-    /* It is not clear that these make sense, so leave them out for now */
-    imp->combine_64_ca[PIXMAN_OP_HSL_HUE] = NULL;
-    imp->combine_64_ca[PIXMAN_OP_HSL_SATURATION] = NULL;
-    imp->combine_64_ca[PIXMAN_OP_HSL_COLOR] = NULL;
-    imp->combine_64_ca[PIXMAN_OP_HSL_LUMINOSITY] = NULL;
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_64_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
+    imp->combine_64_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
+    imp->combine_64_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
+    imp->combine_64_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
 }
 
--- pixman/pixman-compiler.h
+++ pixman/pixman-compiler.h
@@ -191,7 +191,8 @@
 		value = tls_ ## name ## _alloc ();			\
 	}								\
 	return value;							\
-    }
+    }									\
+    extern int no_such_variable						
 
 #   define PIXMAN_GET_THREAD_LOCAL(name)				\
     tls_ ## name ## _get ()
--- pixman/pixman-fast-path.c
+++ pixman/pixman-fast-path.c
@@ -910,19 +910,19 @@
 }
 
 static void
-fast_composite_add_8000_8000 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_op_t              op,
+			pixman_image_t *         src_image,
+			pixman_image_t *         mask_image,
+			pixman_image_t *         dst_image,
+			int32_t                  src_x,
+			int32_t                  src_y,
+			int32_t                  mask_x,
+			int32_t                  mask_y,
+			int32_t                  dest_x,
+			int32_t                  dest_y,
+			int32_t                  width,
+			int32_t                  height)
 {
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
@@ -1399,15 +1399,60 @@
 FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE);
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (565_565_cover, 0565, 0565, uint16_t, uint16_t, SRC, COVER);
-FAST_NEAREST (565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, NONE);
-FAST_NEAREST (565_565_pad, 0565, 0565, uint16_t, uint16_t, SRC, PAD);
 FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
 FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE);
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
 
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t *      dst,
+				     uint16_t *      src,
+				     int32_t         w,
+				     pixman_fixed_t  vx,
+				     pixman_fixed_t  unit_x,
+				     pixman_fixed_t  max_vx)
+{
+    uint16_t tmp1, tmp2, tmp3, tmp4;
+    while ((w -= 4) >= 0)
+    {
+	tmp1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp3 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp4 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	*dst++ = tmp1;
+	*dst++ = tmp2;
+	*dst++ = tmp3;
+	*dst++ = tmp4;
+    }
+    if (w & 2)
+    {
+	tmp1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	*dst++ = tmp1;
+	*dst++ = tmp2;
+    }
+    if (w & 1)
+	*dst++ = src[pixman_fixed_to_int (vx)];
+}
+
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, COVER);
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, NONE);
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, PAD);
+
 static force_inline uint32_t
 fetch_nearest (pixman_repeat_t src_repeat,
 	       pixman_format_code_t format,
@@ -1602,7 +1647,7 @@
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
     PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
--- pixman/pixman-fast-path.h
+++ pixman/pixman-fast-path.h
@@ -381,7 +381,9 @@
 			  OP, repeat_mode)						\
     FAST_NEAREST_MAINLOOP(scale_func_name##_##OP,					\
 			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
-			  src_type_t, dst_type_t, repeat_mode)
+			  src_type_t, dst_type_t, repeat_mode)				\
+											\
+    extern int no_such_variable
 
 
 #define SCALED_NEAREST_FLAGS						\
--- pixman/pixman-image.c
+++ pixman/pixman-image.c
@@ -48,7 +48,6 @@
     gradient->n_stops = n_stops;
 
     gradient->stop_range = 0xffff;
-    gradient->common.class = SOURCE_IMAGE_CLASS_UNKNOWN;
 
     return TRUE;
 }
@@ -363,24 +362,21 @@
 	flags |=
 	    FAST_PATH_NO_PAD_REPEAT		|
 	    FAST_PATH_NO_NONE_REPEAT		|
-	    FAST_PATH_NO_NORMAL_REPEAT		|
-	    FAST_PATH_COVERS_CLIP;
+	    FAST_PATH_NO_NORMAL_REPEAT;
 	break;
 
     case PIXMAN_REPEAT_PAD:
 	flags |=
 	    FAST_PATH_NO_REFLECT_REPEAT		|
 	    FAST_PATH_NO_NONE_REPEAT		|
-	    FAST_PATH_NO_NORMAL_REPEAT		|
-	    FAST_PATH_COVERS_CLIP;
+	    FAST_PATH_NO_NORMAL_REPEAT;
 	break;
 
     default:
 	flags |=
 	    FAST_PATH_NO_REFLECT_REPEAT		|
 	    FAST_PATH_NO_PAD_REPEAT		|
-	    FAST_PATH_NO_NONE_REPEAT		|
-	    FAST_PATH_COVERS_CLIP;
+	    FAST_PATH_NO_NONE_REPEAT;
 	break;
     }
 
@@ -400,8 +396,6 @@
 
 	if (image->solid.color.alpha == 0xffff)
 	    flags |= FAST_PATH_IS_OPAQUE;
-
-	flags |= FAST_PATH_COVERS_CLIP;
 	break;
 
     case BITS:
@@ -414,12 +408,6 @@
 	else
 	{
 	    code = image->bits.format;
-
-	    if (!image->common.transform &&
-		image->common.repeat == PIXMAN_REPEAT_NORMAL)
-	    {
-		flags |= FAST_PATH_SIMPLE_REPEAT;
-	    }
 	}
 
 	if (!PIXMAN_FORMAT_A (image->bits.format)				&&
--- pixman/pixman-linear-gradient.c
+++ pixman/pixman-linear-gradient.c
@@ -1,3 +1,4 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
 /*
  * Copyright © 2000 SuSE, Inc.
  * Copyright © 2007 Red Hat, Inc.
@@ -37,58 +38,58 @@
                           int             width,
                           int             height)
 {
+    source_image_t *source = (source_image_t *)image;
     linear_gradient_t *linear = (linear_gradient_t *)image;
     pixman_vector_t v;
     pixman_fixed_32_32_t l;
-    pixman_fixed_48_16_t dx, dy, a, b, off;
-    pixman_fixed_48_16_t factors[4];
-    int i;
-
-    image->source.class = SOURCE_IMAGE_CLASS_UNKNOWN;
+    pixman_fixed_48_16_t dx, dy;
+    double inc;
+    source_image_class_t class;
 
-    dx = linear->p2.x - linear->p1.x;
-    dy = linear->p2.y - linear->p1.y;
+    class = SOURCE_IMAGE_CLASS_UNKNOWN;
 
-    l = dx * dx + dy * dy;
-
-    if (l)
+    if (source->common.transform)
     {
-	a = (dx << 32) / l;
-	b = (dy << 32) / l;
+	/* projective transformation */
+	if (source->common.transform->matrix[2][0] != 0 ||
+	    source->common.transform->matrix[2][1] != 0 ||
+	    source->common.transform->matrix[2][2] == 0)
+	{
+	    return class;
+	}
+
+	v.vector[0] = source->common.transform->matrix[0][1];
+	v.vector[1] = source->common.transform->matrix[1][1];
+	v.vector[2] = source->common.transform->matrix[2][2];
     }
     else
     {
-	a = b = 0;
-    }
-
-    off = (-a * linear->p1.x
-           -b * linear->p1.y) >> 16;
-
-    for (i = 0; i < 3; i++)
-    {
-	v.vector[0] = pixman_int_to_fixed ((i % 2) * (width  - 1) + x);
-	v.vector[1] = pixman_int_to_fixed ((i / 2) * (height - 1) + y);
+	v.vector[0] = 0;
+	v.vector[1] = pixman_fixed_1;
 	v.vector[2] = pixman_fixed_1;
+    }
 
-	if (image->common.transform)
-	{
-	    if (!pixman_transform_point_3d (image->common.transform, &v))
-	    {
-		image->source.class = SOURCE_IMAGE_CLASS_UNKNOWN;
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
 
-		return image->source.class;
-	    }
-	}
+    l = dx * dx + dy * dy;
 
-	factors[i] = ((a * v.vector[0] + b * v.vector[1]) >> 16) + off;
-    }
+    if (l == 0)
+	return class;	
 
-    if (factors[2] == factors[0])
-	image->source.class = SOURCE_IMAGE_CLASS_HORIZONTAL;
-    else if (factors[1] == factors[0])
-	image->source.class = SOURCE_IMAGE_CLASS_VERTICAL;
+    /*
+     * compute how much the input of the gradient walker changes
+     * when moving vertically through the whole image
+     */
+    inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
+	(dx * v.vector[0] + dy * v.vector[1]) /
+	(v.vector[2] * (double) l);
+
+    /* check that casting to integer would result in 0 */
+    if (-1 < inc && inc < 1)
+	class = SOURCE_IMAGE_CLASS_HORIZONTAL;
 
-    return image->source.class;
+    return class;
 }
 
 static void
@@ -101,7 +102,7 @@
 {
     pixman_vector_t v, unit;
     pixman_fixed_32_32_t l;
-    pixman_fixed_48_16_t dx, dy, a, b, off;
+    pixman_fixed_48_16_t dx, dy;
     gradient_t *gradient = (gradient_t *)image;
     source_image_t *source = (source_image_t *)image;
     linear_gradient_t *linear = (linear_gradient_t *)image;
@@ -136,31 +137,31 @@
 
     l = dx * dx + dy * dy;
 
-    if (l != 0)
+    if (l == 0 || unit.vector[2] == 0)
     {
-	a = (dx << 32) / l;
-	b = (dy << 32) / l;
-	off = (-a * linear->p1.x
-	       -b * linear->p1.y) >> 16;
-    }
-
-    if (l == 0 || (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1))
-    {
-	pixman_fixed_48_16_t inc, t;
-
 	/* affine transformation only */
-	if (l == 0)
+        pixman_fixed_32_32_t t, next_inc;
+	double inc;
+
+	if (l == 0 || v.vector[2] == 0)
 	{
 	    t = 0;
 	    inc = 0;
 	}
 	else
 	{
-	    t = ((a * v.vector[0] + b * v.vector[1]) >> 16) + off;
-	    inc = (a * unit.vector[0] + b * unit.vector[1]) >> 16;
+	    double invden, v2;
+
+	    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+		(l * (double) v.vector[2]);
+	    v2 = v.vector[2] * (1. / pixman_fixed_1);
+	    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+		 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+	    inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
 	}
+	next_inc = 0;
 
-	if (source->class == SOURCE_IMAGE_CLASS_VERTICAL)
+	if (((pixman_fixed_32_32_t )(inc * width)) == 0)
 	{
 	    register uint32_t color;
 
@@ -170,81 +171,52 @@
 	}
 	else
 	{
-	    if (!mask)
-	    {
-		while (buffer < end)
-		{
-		    *buffer++ = _pixman_gradient_walker_pixel (&walker, t);
-		    
-		    t += inc;
-		}
-	    }
-	    else
+	    int i;
+
+	    i = 0;
+	    while (buffer < end)
 	    {
-		while (buffer < end)
+		if (!mask || *mask++)
 		{
-		    if (*mask++)
-			*buffer = _pixman_gradient_walker_pixel (&walker, t);
-
-		    buffer++;
-		    t += inc;
+		    *buffer = _pixman_gradient_walker_pixel (&walker,
+							     t + next_inc);
 		}
+		i++;
+		next_inc = inc * i;
+		buffer++;
 	    }
 	}
     }
     else
     {
 	/* projective transformation */
-	pixman_fixed_48_16_t t;
-
-	if (source->class == SOURCE_IMAGE_CLASS_VERTICAL)
-	{
-	    register uint32_t color;
-
-	    if (v.vector[2] == 0)
-	    {
-		t = 0;
-	    }
-	    else
-	    {
-		pixman_fixed_48_16_t x, y;
+        double t;
 
-		x = ((pixman_fixed_48_16_t) v.vector[0] << 16) / v.vector[2];
-		y = ((pixman_fixed_48_16_t) v.vector[1] << 16) / v.vector[2];
-		t = ((a * x + b * y) >> 16) + off;
-	    }
+	t = 0;
 
-	    color = _pixman_gradient_walker_pixel (&walker, t);
-	    while (buffer < end)
-		*buffer++ = color;
-	}
-	else
+	while (buffer < end)
 	{
-	    while (buffer < end)
+	    if (!mask || *mask++)
 	    {
-		if (!mask || *mask++)
+	        if (v.vector[2] != 0)
 		{
-		    if (v.vector[2] == 0)
-		    {
-			t = 0;
-		    }
-		    else
-		    {
-			pixman_fixed_48_16_t x, y;
-			x = ((pixman_fixed_48_16_t)v.vector[0] << 16) / v.vector[2];
-			y = ((pixman_fixed_48_16_t)v.vector[1] << 16) / v.vector[2];
-			t = ((a * x + b * y) >> 16) + off;
-		    }
+		    double invden, v2;
 
-		    *buffer = _pixman_gradient_walker_pixel (&walker, t);
+		    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+			(l * (double) v.vector[2]);
+		    v2 = v.vector[2] * (1. / pixman_fixed_1);
+		    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+			 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
 		}
 
-		++buffer;
-
-		v.vector[0] += unit.vector[0];
-		v.vector[1] += unit.vector[1];
-		v.vector[2] += unit.vector[2];
+		*buffer = _pixman_gradient_walker_pixel (&walker, t);
 	    }
+
+	    ++buffer;
+
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
 	}
     }
 }
@@ -282,7 +254,6 @@
     linear->p2 = *p2;
 
     image->type = LINEAR;
-    image->source.class = SOURCE_IMAGE_CLASS_UNKNOWN;
     image->common.classify = linear_gradient_classify;
     image->common.property_changed = linear_gradient_property_changed;
 
--- pixman/pixman-mmx.c
+++ pixman/pixman-mmx.c
@@ -2845,19 +2845,19 @@
 }
 
 static void
-mmx_composite_add_8000_8000 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
+mmx_composite_add_8_8 (pixman_implementation_t *imp,
+		       pixman_op_t              op,
+		       pixman_image_t *         src_image,
+		       pixman_image_t *         mask_image,
+		       pixman_image_t *         dst_image,
+		       int32_t                  src_x,
+		       int32_t                  src_y,
+		       int32_t                  mask_x,
+		       int32_t                  mask_y,
+		       int32_t                  dest_x,
+		       int32_t                  dest_y,
+		       int32_t                  width,
+		       int32_t                  height)
 {
     uint8_t *dst_line, *dst;
     uint8_t *src_line, *src;
@@ -3268,7 +3268,7 @@
 
     PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
     PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
-    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8000_8000       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
     PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
 
     PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
--- pixman/pixman-private.h
+++ pixman/pixman-private.h
@@ -65,7 +65,6 @@
 {
     SOURCE_IMAGE_CLASS_UNKNOWN,
     SOURCE_IMAGE_CLASS_HORIZONTAL,
-    SOURCE_IMAGE_CLASS_VERTICAL,
 } source_image_class_t;
 
 typedef source_image_class_t (*classify_func_t) (pixman_image_t *image,
@@ -112,7 +111,6 @@
 struct source_image
 {
     image_common_t common;
-    source_image_class_t class;
 };
 
 struct solid_fill
@@ -152,10 +150,11 @@
 
     circle_t   c1;
     circle_t   c2;
-    double     cdx;
-    double     cdy;
-    double     dr;
-    double     A;
+
+    circle_t   delta;
+    double     a;
+    double     inva;
+    double     mindr;
 };
 
 struct conical_gradient
@@ -554,13 +553,13 @@
 #define FAST_PATH_NO_PAD_REPEAT			(1 <<  3)
 #define FAST_PATH_NO_REFLECT_REPEAT		(1 <<  4)
 #define FAST_PATH_NO_ACCESSORS			(1 <<  5)
-#define FAST_PATH_NARROW_FORMAT		(1 <<  6)
-#define FAST_PATH_COVERS_CLIP			(1 <<  7)
+#define FAST_PATH_NARROW_FORMAT			(1 <<  6)
 #define FAST_PATH_COMPONENT_ALPHA		(1 <<  8)
+#define FAST_PATH_SAMPLES_OPAQUE		(1 <<  7)
 #define FAST_PATH_UNIFIED_ALPHA			(1 <<  9)
 #define FAST_PATH_SCALE_TRANSFORM		(1 << 10)
 #define FAST_PATH_NEAREST_FILTER		(1 << 11)
-#define FAST_PATH_SIMPLE_REPEAT			(1 << 12)
+#define FAST_PATH_HAS_TRANSFORM			(1 << 12)
 #define FAST_PATH_IS_OPAQUE			(1 << 13)
 #define FAST_PATH_NEEDS_WORKAROUND		(1 << 14)
 #define FAST_PATH_NO_NONE_REPEAT		(1 << 15)
@@ -570,8 +569,6 @@
 #define FAST_PATH_Y_UNIT_ZERO			(1 << 19)
 #define FAST_PATH_BILINEAR_FILTER		(1 << 20)
 #define FAST_PATH_NO_NORMAL_REPEAT		(1 << 21)
-#define FAST_PATH_HAS_TRANSFORM			(1 << 22)
-#define FAST_PATH_SAMPLES_OPAQUE		(1 << 23)
 
 #define FAST_PATH_PAD_REPEAT						\
     (FAST_PATH_NO_NONE_REPEAT		|				\
@@ -593,29 +590,25 @@
      FAST_PATH_NO_NORMAL_REPEAT		|				\
      FAST_PATH_NO_PAD_REPEAT)
 
-#define _FAST_PATH_STANDARD_FLAGS					\
-    (FAST_PATH_ID_TRANSFORM		|				\
-     FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_CONVOLUTION_FILTER	|				\
-     FAST_PATH_NO_PAD_REPEAT		|				\
-     FAST_PATH_NO_REFLECT_REPEAT	|				\
+#define FAST_PATH_STANDARD_FLAGS					\
+    (FAST_PATH_NO_CONVOLUTION_FILTER	|				\
      FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_NARROW_FORMAT		|				\
-     FAST_PATH_COVERS_CLIP)
+     FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NARROW_FORMAT)
 
-#define FAST_PATH_STD_SRC_FLAGS						\
-    _FAST_PATH_STANDARD_FLAGS
-#define FAST_PATH_STD_MASK_U_FLAGS					\
-    (_FAST_PATH_STANDARD_FLAGS		|				\
-     FAST_PATH_UNIFIED_ALPHA)
-#define FAST_PATH_STD_MASK_CA_FLAGS					\
-    (_FAST_PATH_STANDARD_FLAGS		|				\
-     FAST_PATH_COMPONENT_ALPHA)
 #define FAST_PATH_STD_DEST_FLAGS					\
     (FAST_PATH_NO_ACCESSORS		|				\
      FAST_PATH_NO_ALPHA_MAP		|				\
      FAST_PATH_NARROW_FORMAT)
 
+#define SOURCE_FLAGS(format)						\
+    (FAST_PATH_STANDARD_FLAGS |						\
+     ((PIXMAN_ ## format == PIXMAN_solid) ?				\
+      0 : (FAST_PATH_SAMPLES_COVER_CLIP | FAST_PATH_ID_TRANSFORM)))
+
+#define MASK_FLAGS(format, extra)					\
+    ((PIXMAN_ ## format == PIXMAN_null) ? 0 : (SOURCE_FLAGS (format) | extra))
+
 #define FAST_PATH(op, src, src_flags, mask, mask_flags, dest, dest_flags, func) \
     PIXMAN_OP_ ## op,							\
     PIXMAN_ ## src,							\
@@ -628,19 +621,19 @@
 
 #define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func)			\
     { FAST_PATH (							\
-	  op,								\
-	  src, FAST_PATH_STD_SRC_FLAGS,					\
-	  mask, (PIXMAN_ ## mask) ? FAST_PATH_STD_MASK_U_FLAGS : 0,	\
-	  dest, FAST_PATH_STD_DEST_FLAGS,				\
-	  func) }
+	    op,								\
+	    src,  SOURCE_FLAGS (src),					\
+	    mask, MASK_FLAGS (mask, FAST_PATH_UNIFIED_ALPHA),		\
+	    dest, FAST_PATH_STD_DEST_FLAGS,				\
+	    func) }
 
 #define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func)		\
     { FAST_PATH (							\
-	  op,								\
-	  src, FAST_PATH_STD_SRC_FLAGS,					\
-	  mask, FAST_PATH_STD_MASK_CA_FLAGS,				\
-	  dest, FAST_PATH_STD_DEST_FLAGS,				\
-	  func) }
+	    op,								\
+	    src,  SOURCE_FLAGS (src),					\
+	    mask, MASK_FLAGS (mask, FAST_PATH_COMPONENT_ALPHA),		\
+	    dest, FAST_PATH_STD_DEST_FLAGS,				\
+	    func) }
 
 /* Memory allocation helpers */
 void *
--- pixman/pixman-radial-gradient.c
+++ pixman/pixman-radial-gradient.c
@@ -1,3 +1,4 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
 /*
  *
  * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
@@ -33,6 +34,100 @@
 #include <math.h>
 #include "pixman-private.h"
 
+static inline pixman_fixed_32_32_t
+dot (pixman_fixed_48_16_t x1,
+     pixman_fixed_48_16_t y1,
+     pixman_fixed_48_16_t z1,
+     pixman_fixed_48_16_t x2,
+     pixman_fixed_48_16_t y2,
+     pixman_fixed_48_16_t z2)
+{
+    /*
+     * Exact computation, assuming that the input values can
+     * be represented as pixman_fixed_16_16_t
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static inline double
+fdot (double x1,
+      double y1,
+      double z1,
+      double x2,
+      double y2,
+      double z2)
+{
+    /*
+     * Error can be unbounded in some special cases.
+     * Using clever dot product algorithms (for example compensated
+     * dot product) would improve this but make the code much less
+     * obvious
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static uint32_t
+radial_compute_color (double                    a,
+		      double                    b,
+		      double                    c,
+		      double                    inva,
+		      double                    dr,
+		      double                    mindr,
+		      pixman_gradient_walker_t *walker,
+		      pixman_repeat_t           repeat)
+{
+    /*
+     * In this function error propagation can lead to bad results:
+     *  - det can have an unbounded error (if b*b-a*c is very small),
+     *    potentially making it the opposite sign of what it should have been
+     *    (thus clearing a pixel that would have been colored or vice-versa)
+     *    or propagating the error to sqrtdet;
+     *    if det has the wrong sign or b is very small, this can lead to bad
+     *    results
+     *
+     *  - the algorithm used to compute the solutions of the quadratic
+     *    equation is not numerically stable (but saves one division compared
+     *    to the numerically stable one);
+     *    this can be a problem if a*c is much smaller than b*b
+     *
+     *  - the above problems are worse if a is small (as inva becomes bigger)
+     */
+    double det;
+
+    if (a == 0)
+    {
+	return _pixman_gradient_walker_pixel (walker,
+					      pixman_fixed_1 / 2 * c / b);
+    }
+
+    det = fdot (b, a, 0, b, -c, 0);
+    if (det >= 0)
+    {
+	double sqrtdet, t0, t1;
+
+	sqrtdet = sqrt (det);
+	t0 = (b + sqrtdet) * inva;
+	t1 = (b - sqrtdet) * inva;
+
+	if (repeat == PIXMAN_REPEAT_NONE)
+	{
+	    if (0 <= t0 && t0 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (0 <= t1 && t1 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+	else
+	{
+	    if (t0 * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (t1 * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+    }
+
+    return 0;
+}
+
 static void
 radial_gradient_get_scanline_32 (pixman_image_t *image,
                                  int             x,
@@ -42,118 +137,85 @@
                                  const uint32_t *mask)
 {
     /*
+     * Implementation of radial gradients following the PDF specification.
+     * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
+     * Manual (PDF 32000-1:2008 at the time of this writing).
+     * 
      * In the radial gradient problem we are given two circles (c₁,r₁) and
-     * (c₂,r₂) that define the gradient itself. Then, for any point p, we
-     * must compute the value(s) of t within [0.0, 1.0] representing the
-     * circle(s) that would color the point.
-     *
-     * There are potentially two values of t since the point p can be
-     * colored by both sides of the circle, (which happens whenever one
-     * circle is not entirely contained within the other).
-     *
-     * If we solve for a value of t that is outside of [0.0, 1.0] then we
-     * use the extend mode (NONE, REPEAT, REFLECT, or PAD) to map to a
-     * value within [0.0, 1.0].
+     * (c₂,r₂) that define the gradient itself.
      *
-     * Here is an illustration of the problem:
+     * Mathematically the gradient can be defined as the family of circles
      *
-     *              p₂
-     *           p  •
-     *           •   ╲
-     *        ·       ╲r₂
-     *  p₁ ·           ╲
-     *  •              θ╲
-     *   ╲             ╌╌•
-     *    ╲r₁        ·   c₂
-     *    θ╲    ·
-     *    ╌╌•
-     *      c₁
+     *     ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂)
      *
-     * Given (c₁,r₁), (c₂,r₂) and p, we must find an angle θ such that two
-     * points p₁ and p₂ on the two circles are collinear with p. Then, the
-     * desired value of t is the ratio of the length of p₁p to the length
-     * of p₁p₂.
+     * excluding those circles whose radius would be < 0. When a point
+     * belongs to more than one circle, the one with a bigger t is the only
+     * one that contributes to its color. When a point does not belong
+     * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
+     * Further limitations on the range of values for t are imposed when
+     * the gradient is not repeated, namely t must belong to [0,1].
      *
-     * So, we have six unknown values: (p₁x, p₁y), (p₂x, p₂y), θ and t.
-     * We can also write six equations that constrain the problem:
+     * The graphical result is the same as drawing the valid (radius > 0)
+     * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
+     * is not repeated) using SOURCE operator composition.
      *
-     * Point p₁ is a distance r₁ from c₁ at an angle of θ:
+     * It looks like a cone pointing towards the viewer if the ending circle
+     * is smaller than the starting one, a cone pointing inside the page if
+     * the starting circle is the smaller one and like a cylinder if they
+     * have the same radius.
      *
-     *	1. p₁x = c₁x + r₁·cos θ
-     *	2. p₁y = c₁y + r₁·sin θ
+     * What we actually do is, given the point whose color we are interested
+     * in, compute the t values for that point, solving for t in:
      *
-     * Point p₂ is a distance r₂ from c₂ at an angle of θ:
+     *     length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂
+     * 
+     * Let's rewrite it in a simpler way, by defining some auxiliary
+     * variables:
      *
-     *	3. p₂x = c₂x + r2·cos θ
-     *	4. p₂y = c₂y + r2·sin θ
+     *     cd = c₂ - c₁
+     *     pd = p - c₁
+     *     dr = r₂ - r₁
+     *     length(t·cd - pd) = r₁ + t·dr
      *
-     * Point p lies at a fraction t along the line segment p₁p₂:
+     * which actually means
      *
-     *	5. px = t·p₂x + (1-t)·p₁x
-     *	6. py = t·p₂y + (1-t)·p₁y
+     *     hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
      *
-     * To solve, first subtitute 1-4 into 5 and 6:
+     * or
      *
-     * px = t·(c₂x + r₂·cos θ) + (1-t)·(c₁x + r₁·cos θ)
-     * py = t·(c₂y + r₂·sin θ) + (1-t)·(c₁y + r₁·sin θ)
+     *     ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
      *
-     * Then solve each for cos θ and sin θ expressed as a function of t:
+     * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
      *
-     * cos θ = (-(c₂x - c₁x)·t + (px - c₁x)) / ((r₂-r₁)·t + r₁)
-     * sin θ = (-(c₂y - c₁y)·t + (py - c₁y)) / ((r₂-r₁)·t + r₁)
+     *     (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
      *
-     * To simplify this a bit, we define new variables for several of the
-     * common terms as shown below:
+     * where we can actually expand the squares and solve for t:
      *
-     *              p₂
-     *           p  •
-     *           •   ╲
-     *        ·  ┆    ╲r₂
-     *  p₁ ·     ┆     ╲
-     *  •     pdy┆      ╲
-     *   ╲       ┆       •c₂
-     *    ╲r₁    ┆   ·   ┆
-     *     ╲    ·┆       ┆cdy
-     *      •╌╌╌╌┴╌╌╌╌╌╌╌┘
-     *    c₁  pdx   cdx
+     *     t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
+     *       = r₁² + 2·r₁·t·dr + t²·dr²
      *
-     * cdx = (c₂x - c₁x)
-     * cdy = (c₂y - c₁y)
-     *  dr =  r₂-r₁
-     * pdx =  px - c₁x
-     * pdy =  py - c₁y
+     *     (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
+     *         (pdx² + pdy² - r₁²) = 0
      *
-     * Note that cdx, cdy, and dr do not depend on point p at all, so can
-     * be pre-computed for the entire gradient. The simplifed equations
-     * are now:
+     *     A = cdx² + cdy² - dr²
+     *     B = pdx·cdx + pdy·cdy + r₁·dr
+     *     C = pdx² + pdy² - r₁²
+     *     At² - 2Bt + C = 0
+     * 
+     * The solutions (unless the equation degenerates because of A = 0) are:
      *
-     * cos θ = (-cdx·t + pdx) / (dr·t + r₁)
-     * sin θ = (-cdy·t + pdy) / (dr·t + r₁)
+     *     t = (B ± ⎷(B² - A·C)) / A
      *
-     * Finally, to get a single function of t and eliminate the last
-     * unknown θ, we use the identity sin²θ + cos²θ = 1. First, square
-     * each equation, (we knew a quadratic was coming since it must be
-     * possible to obtain two solutions in some cases):
+     * The solution we are going to prefer is the bigger one, unless the
+     * radius associated to it is negative (or it falls outside the valid t
+     * range).
      *
-     * cos²θ = (cdx²t² - 2·cdx·pdx·t + pdx²) / (dr²·t² + 2·r₁·dr·t + r₁²)
-     * sin²θ = (cdy²t² - 2·cdy·pdy·t + pdy²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+     * Additional observations (useful for optimizations):
+     * A does not depend on p
      *
-     * Then add both together, set the result equal to 1, and express as a
-     * standard quadratic equation in t of the form At² + Bt + C = 0
-     *
-     * (cdx² + cdy² - dr²)·t² - 2·(cdx·pdx + cdy·pdy + r₁·dr)·t + (pdx² + pdy² - r₁²) = 0
-     *
-     * In other words:
-     *
-     * A = cdx² + cdy² - dr²
-     * B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
-     * C = pdx² + pdy² - r₁²
-     *
-     * And again, notice that A does not depend on p, so can be
-     * precomputed. From here we just use the quadratic formula to solve
-     * for t:
-     *
-     * t = (-2·B ± ⎷(B² - 4·A·C)) / 2·A
+     * A < 0 <=> one of the two circles completely contains the other one
+     *   <=> for every p, the radiuses associated with the two t solutions
+     *       have opposite sign
      */
 
     gradient_t *gradient = (gradient_t *)image;
@@ -161,153 +223,150 @@
     radial_gradient_t *radial = (radial_gradient_t *)image;
     uint32_t *end = buffer + width;
     pixman_gradient_walker_t walker;
-    pixman_bool_t affine = TRUE;
-    double cx = 1.;
-    double cy = 0.;
-    double cz = 0.;
-    double rx = x + 0.5;
-    double ry = y + 0.5;
-    double rz = 1.;
+    pixman_vector_t v, unit;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
 
     _pixman_gradient_walker_init (&walker, gradient, source->common.repeat);
 
     if (source->common.transform)
     {
-	pixman_vector_t v;
-	/* reference point is the center of the pixel */
-	v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
-	v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
-	v.vector[2] = pixman_fixed_1;
-	
 	if (!pixman_transform_point_3d (source->common.transform, &v))
 	    return;
-
-	cx = source->common.transform->matrix[0][0] / 65536.;
-	cy = source->common.transform->matrix[1][0] / 65536.;
-	cz = source->common.transform->matrix[2][0] / 65536.;
 	
-	rx = v.vector[0] / 65536.;
-	ry = v.vector[1] / 65536.;
-	rz = v.vector[2] / 65536.;
-
-	affine =
-	    source->common.transform->matrix[2][0] == 0 &&
-	    v.vector[2] == pixman_fixed_1;
+	unit.vector[0] = source->common.transform->matrix[0][0];
+	unit.vector[1] = source->common.transform->matrix[1][0];
+	unit.vector[2] = source->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
     }
 
-    if (affine)
+    if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
     {
-	/* When computing t over a scanline, we notice that some expressions
-	 * are constant so we can compute them just once. Given:
+	/*
+	 * Given:
 	 *
-	 * t = (-2·B ± ⎷(B² - 4·A·C)) / 2·A
+	 * t = (B ± ⎷(B² - A·C)) / A
 	 *
 	 * where
 	 *
-	 * A = cdx² + cdy² - dr² [precomputed as radial->A]
-	 * B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+	 * A = cdx² + cdy² - dr²
+	 * B = pdx·cdx + pdy·cdy + r₁·dr
 	 * C = pdx² + pdy² - r₁²
+	 * det = B² - A·C
 	 *
 	 * Since we have an affine transformation, we know that (pdx, pdy)
 	 * increase linearly with each pixel,
 	 *
-	 * pdx = pdx₀ + n·cx,
-	 * pdy = pdy₀ + n·cy,
-	 *
-	 * we can then express B in terms of an linear increment along
-	 * the scanline:
+	 * pdx = pdx₀ + n·ux,
+	 * pdy = pdy₀ + n·uy,
 	 *
-	 * B = B₀ + n·cB, with
-	 * B₀ = -2·(pdx₀·cdx + pdy₀·cdy + r₁·dr) and
-	 * cB = -2·(cx·cdx + cy·cdy)
-	 *
-	 * Thus we can replace the full evaluation of B per-pixel (4 multiplies,
-	 * 2 additions) with a single addition.
+	 * we can then express B, C and det through multiple differentiation.
+	 */
+	pixman_fixed_32_32_t b, db, c, dc, ddc;
+
+	/* warning: this computation may overflow */
+	v.vector[0] -= radial->c1.x;
+	v.vector[1] -= radial->c1.y;
+
+	/*
+	 * B and C are computed and updated exactly.
+	 * If fdot was used instead of dot, in the worst case it would
+	 * lose 11 bits of precision in each of the multiplications, and
+	 * summing up would zero out all the bits that were preserved,
+	 * thus making the result 0 instead of the correct one.
+	 * This would mean a worst case of unbounded relative error or
+	 * about 2^10 absolute error
 	 */
-	double r1   = radial->c1.radius / 65536.;
-	double r1sq = r1 * r1;
-	double pdx  = rx - radial->c1.x / 65536.;
-	double pdy  = ry - radial->c1.y / 65536.;
-	double A = radial->A;
-	double invA = -65536. / (2. * A);
-	double A4 = -4. * A;
-	double B  = -2. * (pdx*radial->cdx + pdy*radial->cdy + r1*radial->dr);
-	double cB = -2. *  (cx*radial->cdx +  cy*radial->cdy);
-	pixman_bool_t invert = A * radial->dr < 0;
+	b = dot (v.vector[0], v.vector[1], radial->c1.radius,
+		 radial->delta.x, radial->delta.y, radial->delta.radius);
+	db = dot (unit.vector[0], unit.vector[1], 0,
+		  radial->delta.x, radial->delta.y, 0);
+
+	c = dot (v.vector[0], v.vector[1],
+		 -((pixman_fixed_48_16_t) radial->c1.radius),
+		 v.vector[0], v.vector[1], radial->c1.radius);
+	dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
+		  2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
+		  0,
+		  unit.vector[0], unit.vector[1], 0);
+	ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
+		       unit.vector[0], unit.vector[1], 0);
 
 	while (buffer < end)
 	{
 	    if (!mask || *mask++)
 	    {
-		pixman_fixed_48_16_t t;
-		double det = B * B + A4 * (pdx * pdx + pdy * pdy - r1sq);
-		if (det <= 0.)
-		    t = (pixman_fixed_48_16_t) (B * invA);
-		else if (invert)
-		    t = (pixman_fixed_48_16_t) ((B + sqrt (det)) * invA);
-		else
-		    t = (pixman_fixed_48_16_t) ((B - sqrt (det)) * invA);
-
-		*buffer = _pixman_gradient_walker_pixel (&walker, t);
+		*buffer = radial_compute_color (radial->a, b, c,
+						radial->inva,
+						radial->delta.radius,
+						radial->mindr,
+						&walker,
+						source->common.repeat);
 	    }
-	    ++buffer;
 
-	    pdx += cx;
-	    pdy += cy;
-	    B += cB;
+	    b += db;
+	    c += dc;
+	    dc += ddc;
+	    ++buffer;
 	}
     }
     else
     {
 	/* projective */
+	/* Warning:
+	 * error propagation guarantees are much looser than in the affine case
+	 */
 	while (buffer < end)
 	{
 	    if (!mask || *mask++)
 	    {
-		double pdx, pdy;
-		double B, C;
-		double det;
-		double c1x = radial->c1.x / 65536.0;
-		double c1y = radial->c1.y / 65536.0;
-		double r1  = radial->c1.radius / 65536.0;
-		pixman_fixed_48_16_t t;
-		double x, y;
-
-		if (rz != 0)
-		{
-		    x = rx / rz;
-		    y = ry / rz;
-		}
-		else
+		if (v.vector[2] != 0)
 		{
-		    x = y = 0.;
-		}
+		    double pdx, pdy, invv2, b, c;
 
-		pdx = x - c1x;
-		pdy = y - c1y;
+		    invv2 = 1. * pixman_fixed_1 / v.vector[2];
 
-		B = -2 * (pdx * radial->cdx +
-			  pdy * radial->cdy +
-			  r1 * radial->dr);
-		C = (pdx * pdx + pdy * pdy - r1 * r1);
-
-		det = (B * B) - (4 * radial->A * C);
-		if (det < 0.0)
-		    det = 0.0;
+		    pdx = v.vector[0] * invv2 - radial->c1.x;
+		    /*    / pixman_fixed_1 */
 
-		if (radial->A * radial->dr < 0)
-		    t = (pixman_fixed_48_16_t) ((-B - sqrt (det)) / (2.0 * radial->A) * 65536);
+		    pdy = v.vector[1] * invv2 - radial->c1.y;
+		    /*    / pixman_fixed_1 */
+
+		    b = fdot (pdx, pdy, radial->c1.radius,
+			      radial->delta.x, radial->delta.y,
+			      radial->delta.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    c = fdot (pdx, pdy, -radial->c1.radius,
+			      pdx, pdy, radial->c1.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    *buffer = radial_compute_color (radial->a, b, c,
+						    radial->inva,
+						    radial->delta.radius,
+						    radial->mindr,
+						    &walker,
+						    source->common.repeat);
+		}
 		else
-		    t = (pixman_fixed_48_16_t) ((-B + sqrt (det)) / (2.0 * radial->A) * 65536);
-
-		*buffer = _pixman_gradient_walker_pixel (&walker, t);
+		{
+		    *buffer = 0;
+		}
 	    }
 	    
 	    ++buffer;
 
-	    rx += cx;
-	    ry += cy;
-	    rz += cz;
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
 	}
     }
 }
@@ -351,12 +410,20 @@
     radial->c2.x = outer->x;
     radial->c2.y = outer->y;
     radial->c2.radius = outer_radius;
-    radial->cdx = pixman_fixed_to_double (radial->c2.x - radial->c1.x);
-    radial->cdy = pixman_fixed_to_double (radial->c2.y - radial->c1.y);
-    radial->dr = pixman_fixed_to_double (radial->c2.radius - radial->c1.radius);
-    radial->A = (radial->cdx * radial->cdx +
-		 radial->cdy * radial->cdy -
-		 radial->dr  * radial->dr);
+
+    /* warning: these computations may overflow */
+    radial->delta.x = radial->c2.x - radial->c1.x;
+    radial->delta.y = radial->c2.y - radial->c1.y;
+    radial->delta.radius = radial->c2.radius - radial->c1.radius;
+
+    /* computed exactly, then cast to double -> every bit of the double
+       representation is correct (53 bits) */
+    radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius,
+		     radial->delta.x, radial->delta.y, radial->delta.radius);
+    if (radial->a != 0)
+	radial->inva = 1. * pixman_fixed_1 / radial->a;
+
+    radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius;
 
     image->common.property_changed = radial_gradient_property_changed;
 
--- pixman/pixman-solid-fill.c
+++ pixman/pixman-solid-fill.c
@@ -66,7 +66,7 @@
                      int             width,
                      int             height)
 {
-    return (image->source.class = SOURCE_IMAGE_CLASS_HORIZONTAL);
+    return SOURCE_IMAGE_CLASS_HORIZONTAL;
 }
 
 static void
@@ -109,7 +109,6 @@
     img->solid.color_32 = color_to_uint32 (color);
     img->solid.color_64 = color_to_uint64 (color);
 
-    img->source.class = SOURCE_IMAGE_CLASS_UNKNOWN;
     img->common.classify = solid_fill_classify;
     img->common.property_changed = solid_fill_property_changed;
 
--- pixman/pixman-sse2.c
+++ pixman/pixman-sse2.c
@@ -357,34 +357,6 @@
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }
 
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
-    if (addr)
-	cache_prefetch_next (addr);
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
@@ -667,18 +634,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure about
 	 * the address alignment.
 	 */
@@ -740,11 +697,6 @@
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w &&
            ((unsigned long)pd & 15))
@@ -759,18 +711,8 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	/* I'm loading unaligned because I'm not sure
 	 * about the address alignment.
 	 */
@@ -842,11 +784,6 @@
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -859,18 +796,8 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
 
@@ -916,11 +843,6 @@
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -933,18 +855,8 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 
@@ -985,11 +897,6 @@
                                  const uint32_t* pm,
                                  int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1067,11 +964,6 @@
                          const uint32_t* pm,
                          int             w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i xmm_src_lo, xmm_src_hi;
 	__m128i xmm_dst_lo, xmm_dst_hi;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1167,11 +1049,6 @@
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1264,11 +1131,6 @@
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1365,11 +1217,6 @@
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
 	s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
 	xmm_dst = load_128_aligned ((__m128i*) pd);
 
@@ -1450,11 +1287,6 @@
     const uint32_t* ps = src;
     const uint32_t* pm = mask;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
 	__m128i s;
 
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
 	save_128_aligned (
@@ -1536,11 +1358,6 @@
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@
 	    pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    maybe_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	maybe_prefetch_next ((__m128i*)pm);
-
 	xmm_dst = load_128_aligned  ((__m128i*)pd);
 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1637,11 +1444,6 @@
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1651,18 +1453,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
@@ -1718,11 +1510,6 @@
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1733,18 +1520,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1822,18 +1594,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1904,18 +1661,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -1992,18 +1734,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2078,18 +1805,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2168,18 +1880,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2273,18 +1970,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2379,18 +2061,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2488,18 +2155,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
 	s = *ps++;
@@ -2580,18 +2232,8 @@
 	w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-	/* fill cache line with next memory */
-	cache_prefetch_next ((__m128i*)ps);
-	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
-
 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -2986,13 +2625,8 @@
 	    w--;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -3079,14 +2710,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3200,16 +2821,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3316,10 +2929,6 @@
 	dst_line += dst_stride;
 	mask_line += mask_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w && (unsigned long)pd & 15)
 	{
 	    m = *pm++;
@@ -3340,16 +2949,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)pd);
-	cache_prefetch ((__m128i*)pm);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)pd);
-	    cache_prefetch_next ((__m128i*)pm);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
 
 	    pack_cmp =
@@ -3447,10 +3048,6 @@
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = *src++;
@@ -3467,16 +3064,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -3556,25 +3145,16 @@
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    *dst++ = *src++ | 0xff000000;
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 16)
 	{
 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
 	    
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)src);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)src);
-
 	    xmm_src = _mm_or_si128 (
 		load_128_unaligned ((__m128i*)src), mask_ff000000);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -3834,17 +3398,9 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	/* It's a 8 pixel loop */
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* I'm loading unaligned because I'm not sure
 	     * about the address alignment.
 	     */
@@ -3954,10 +3510,6 @@
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);
 
     while (height--)
@@ -4109,8 +3652,6 @@
 	byte_line += stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 1 && ((unsigned long)d & 1))
 	{
 	    *(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 128)
 	{
-	    cache_prefetch (((__m128i*)d) + 12);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4154,8 +3691,6 @@
 
 	if (w >= 64)
 	{
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4165,8 +3700,6 @@
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	if (w >= 32)
 	{
 	    save_128_aligned ((__m128i*)(d),     xmm_def);
@@ -4184,8 +3717,6 @@
 	    w -= 16;
 	}
 
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    m = *((uint32_t*)mask);
 
 	    if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    m = *mask++;
@@ -4434,16 +3949,8 @@
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*) dst);
 	    unpack_565_128_4x128 (xmm_dst,
 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4587,16 +4090,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
@@ -4715,10 +4210,6 @@
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && (unsigned long)dst & 15)
 	{
 	    s = *src++;
@@ -4731,16 +4222,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
 
 	    opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@
 	mask_line += mask_stride;
 	dst_line += dst_stride;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@
 	    mask++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 8)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    /* First round */
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5121,9 +4580,6 @@
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@
 	src_line += src_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_src = load_128_unaligned ((__m128i*)src);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5321,10 +4759,6 @@
 	mask_line += mask_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@
 	    w--;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)mask);
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)mask);
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5440,9 +4866,6 @@
 	dst_line += dst_stride;
 	w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w && ((unsigned long)dst & 15))
 	{
 	    *dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@
 	    dst++;
 	}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 16)
 	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)dst);
-
 	    save_128_aligned (
 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
 
@@ -5485,23 +4902,23 @@
 }
 
 /* ----------------------------------------------------------------------
- * composite_add_8000_8000
+ * composite_add_8_8
  */
 
 static void
-sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_op_t              op,
+			pixman_image_t *         src_image,
+			pixman_image_t *         mask_image,
+			pixman_image_t *         dst_image,
+			int32_t                  src_x,
+			int32_t                  src_y,
+			int32_t                  mask_x,
+			int32_t                  mask_y,
+			int32_t                  dest_x,
+			int32_t                  dest_y,
+			int32_t                  width,
+			int32_t                  height)
 {
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
@@ -5519,10 +4936,6 @@
 	dst = dst_line;
 	src = src_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	src_line += src_stride;
 	w = width;
@@ -5644,9 +5057,6 @@
 	return FALSE;
     }
 
-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
 	int w;
@@ -5656,9 +5066,6 @@
 	dst_bytes += dst_stride;
 	w = byte_width;
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 2 && ((unsigned long)d & 3))
 	{
 	    *(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@
 	    d += 4;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 64)
 	{
 	    __m128i xmm0, xmm1, xmm2, xmm3;
 
-	    /* 128 bytes ahead */
-	    cache_prefetch (((__m128i*)s) + 8);
-	    cache_prefetch (((__m128i*)d) + 8);
-
 	    xmm0 = load_128_unaligned ((__m128i*)(s));
 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@
 	    w -= 64;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 16)
 	{
 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@
 	    s += 16;
 	}
 
-	cache_prefetch_next ((__m128i*)s);
-	cache_prefetch_next ((__m128i*)d);
-
 	while (w >= 4)
 	{
 	    *(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;
             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
 
@@ -5955,11 +5334,6 @@
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -5994,18 +5368,8 @@
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
             m = *(uint32_t *) mask;
 
 	    if (m)
@@ -6117,9 +5481,6 @@
     {
 	dst = dst_line;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)dst);
-
 	dst_line += dst_stride;
 	w = width;
 
@@ -6135,15 +5496,10 @@
 	    dst++;
 	}
 
-	cache_prefetch ((__m128i*)dst);
-
 	while (w >= 4)
 	{
 	    __m128i tmp_lo, tmp_hi;
 
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)(dst + 4));
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w && (unsigned long)dst & 15)
         {
 	    uint32_t sa;
@@ -6263,18 +5614,8 @@
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i *)src);
-        cache_prefetch ((__m128i *)dst);
-        cache_prefetch ((__m128i *)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i *)src);
-            cache_prefetch_next ((__m128i *)dst);
-            cache_prefetch_next ((__m128i *)mask);
-
 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
 
 	    if (!is_transparent (xmm_mask))
@@ -6504,7 +5845,7 @@
 
     /* PIXMAN_OP_ADD */
     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
-    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
--- pixman/pixman-version.h
+++ pixman/pixman-version.h
@@ -32,10 +32,10 @@
 #endif
 
 #define PIXMAN_VERSION_MAJOR 0
-#define PIXMAN_VERSION_MINOR 19
-#define PIXMAN_VERSION_MICRO 4
+#define PIXMAN_VERSION_MINOR 20
+#define PIXMAN_VERSION_MICRO 0
 
-#define PIXMAN_VERSION_STRING "0.19.4"
+#define PIXMAN_VERSION_STRING "0.20.0"
 
 #define PIXMAN_VERSION_ENCODE(major, minor, micro) (	\
 	  ((major) * 10000)				\
--- pixman/pixman.c
+++ pixman/pixman.c
@@ -377,126 +377,6 @@
     return TRUE;
 }
 
-static void
-walk_region_internal (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      pixman_image_t *         src_image,
-                      pixman_image_t *         mask_image,
-                      pixman_image_t *         dst_image,
-                      int32_t                  src_x,
-                      int32_t                  src_y,
-                      int32_t                  mask_x,
-                      int32_t                  mask_y,
-                      int32_t                  dest_x,
-                      int32_t                  dest_y,
-                      int32_t                  width,
-                      int32_t                  height,
-                      pixman_bool_t            src_repeat,
-                      pixman_bool_t            mask_repeat,
-                      pixman_region32_t *      region,
-                      pixman_composite_func_t  composite_rect)
-{
-    int w, h, w_this, h_this;
-    int x_msk, y_msk, x_src, y_src, x_dst, y_dst;
-    int src_dy = src_y - dest_y;
-    int src_dx = src_x - dest_x;
-    int mask_dy = mask_y - dest_y;
-    int mask_dx = mask_x - dest_x;
-    const pixman_box32_t *pbox;
-    int n;
-
-    pbox = pixman_region32_rectangles (region, &n);
-
-    /* Fast path for non-repeating sources */
-    if (!src_repeat && !mask_repeat)
-    {
-       while (n--)
-       {
-           (*composite_rect) (imp, op,
-                              src_image, mask_image, dst_image,
-                              pbox->x1 + src_dx,
-                              pbox->y1 + src_dy,
-                              pbox->x1 + mask_dx,
-                              pbox->y1 + mask_dy,
-                              pbox->x1,
-                              pbox->y1,
-                              pbox->x2 - pbox->x1,
-                              pbox->y2 - pbox->y1);
-           
-           pbox++;
-       }
-
-       return;
-    }
-    
-    while (n--)
-    {
-	h = pbox->y2 - pbox->y1;
-	y_src = pbox->y1 + src_dy;
-	y_msk = pbox->y1 + mask_dy;
-	y_dst = pbox->y1;
-
-	while (h)
-	{
-	    h_this = h;
-	    w = pbox->x2 - pbox->x1;
-	    x_src = pbox->x1 + src_dx;
-	    x_msk = pbox->x1 + mask_dx;
-	    x_dst = pbox->x1;
-
-	    if (mask_repeat)
-	    {
-		y_msk = MOD (y_msk, mask_image->bits.height);
-		if (h_this > mask_image->bits.height - y_msk)
-		    h_this = mask_image->bits.height - y_msk;
-	    }
-
-	    if (src_repeat)
-	    {
-		y_src = MOD (y_src, src_image->bits.height);
-		if (h_this > src_image->bits.height - y_src)
-		    h_this = src_image->bits.height - y_src;
-	    }
-
-	    while (w)
-	    {
-		w_this = w;
-
-		if (mask_repeat)
-		{
-		    x_msk = MOD (x_msk, mask_image->bits.width);
-		    if (w_this > mask_image->bits.width - x_msk)
-			w_this = mask_image->bits.width - x_msk;
-		}
-
-		if (src_repeat)
-		{
-		    x_src = MOD (x_src, src_image->bits.width);
-		    if (w_this > src_image->bits.width - x_src)
-			w_this = src_image->bits.width - x_src;
-		}
-
-		(*composite_rect) (imp, op,
-				   src_image, mask_image, dst_image,
-				   x_src, y_src, x_msk, y_msk, x_dst, y_dst,
-				   w_this, h_this);
-		w -= w_this;
-
-		x_src += w_this;
-		x_msk += w_this;
-		x_dst += w_this;
-	    }
-
-	    h -= h_this;
-	    y_src += h_this;
-	    y_msk += h_this;
-	    y_dst += h_this;
-	}
-
-	pbox++;
-    }
-}
-
 #define N_CACHED_FAST_PATHS 8
 
 typedef struct
@@ -746,7 +626,7 @@
 	    extents->x2 - x <= image->bits.width &&
 	    extents->y2 - y <= image->bits.height)
 	{
-	    *flags |= (FAST_PATH_SAMPLES_COVER_CLIP | FAST_PATH_COVERS_CLIP);
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP;
 	    return TRUE;
 	}
     
@@ -789,7 +669,7 @@
 	    ex.x1 >= 0 && ex.y1 >= 0 &&
 	    ex.x2 <= image->bits.width && ex.y2 <= image->bits.height)
 	{
-	    *flags |= (FAST_PATH_SAMPLES_COVER_CLIP | FAST_PATH_COVERS_CLIP);
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP;
 	}
     }
     else
@@ -949,14 +829,26 @@
 				   dest_format, dest_flags,
 				   &imp, &func))
     {
-	walk_region_internal (imp, op,
-			      src, mask, dest,
-			      src_x, src_y, mask_x, mask_y,
-			      dest_x, dest_y,
-			      width, height,
-			      (src_flags & FAST_PATH_SIMPLE_REPEAT),
-			      (mask_flags & FAST_PATH_SIMPLE_REPEAT),
-			      &region, func);
+	const pixman_box32_t *pbox;
+	int n;
+
+	pbox = pixman_region32_rectangles (&region, &n);
+	
+	while (n--)
+	{
+	    func (imp, op,
+		  src, mask, dest,
+		  pbox->x1 + src_x - dest_x,
+		  pbox->y1 + src_y - dest_y,
+		  pbox->x1 + mask_x - dest_x,
+		  pbox->y1 + mask_y - dest_y,
+		  pbox->x1,
+		  pbox->y1,
+		  pbox->x2 - pbox->x1,
+		  pbox->y2 - pbox->y1);
+	    
+	    pbox++;
+	}
     }
 
 out:
--- test/Makefile.am
+++ test/Makefile.am
@@ -23,7 +23,6 @@
 
 a1_trap_test_LDADD = $(TEST_LDADD)
 fetch_test_LDADD = $(TEST_LDADD)
-composite_LDADD = $(TEST_LDADD)
 gradient_crash_test_LDADD = $(TEST_LDADD)
 trap_crasher_LDADD = $(TEST_LDADD)
 oob_test_LDADD = $(TEST_LDADD)
@@ -49,6 +48,9 @@
 alpha_loop_LDADD = $(TEST_LDADD)
 alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
 
+composite_LDADD = $(TEST_LDADD)
+composite_SOURCES = composite.c utils.c utils.h
+
 # GTK using test programs
 
 if HAVE_GTK
--- test/Makefile.in
+++ test/Makefile.in
@@ -96,8 +96,8 @@
 @HAVE_GTK_TRUE@	$(am__objects_1)
 clip_test_OBJECTS = $(am_clip_test_OBJECTS)
 @HAVE_GTK_TRUE@clip_test_DEPENDENCIES = $(am__DEPENDENCIES_2)
-composite_SOURCES = composite.c
-composite_OBJECTS = composite.$(OBJEXT)
+am_composite_OBJECTS = composite.$(OBJEXT) utils.$(OBJEXT)
+composite_OBJECTS = $(am_composite_OBJECTS)
 composite_DEPENDENCIES = $(TEST_LDADD)
 am__composite_test_SOURCES_DIST = composite-test.c gtk-utils.c \
 	gtk-utils.h
@@ -187,10 +187,11 @@
 SOURCES = a1-trap-test.c $(affine_test_SOURCES) $(alpha_loop_SOURCES) \
 	$(alpha_test_SOURCES) $(alphamap_SOURCES) \
 	$(blitters_test_SOURCES) $(clip_in_SOURCES) \
-	$(clip_test_SOURCES) composite.c $(composite_test_SOURCES) \
-	$(convolution_test_SOURCES) fetch-test.c gradient-crash-test.c \
-	$(gradient_test_SOURCES) $(lowlevel_blt_bench_SOURCES) \
-	oob-test.c $(region_test_SOURCES) region-translate-test.c \
+	$(clip_test_SOURCES) $(composite_SOURCES) \
+	$(composite_test_SOURCES) $(convolution_test_SOURCES) \
+	fetch-test.c gradient-crash-test.c $(gradient_test_SOURCES) \
+	$(lowlevel_blt_bench_SOURCES) oob-test.c \
+	$(region_test_SOURCES) region-translate-test.c \
 	scaling-crash-test.c $(scaling_test_SOURCES) \
 	$(screen_test_SOURCES) trap-crasher.c $(trap_test_SOURCES) \
 	window-test.c
@@ -198,7 +199,7 @@
 	$(alpha_loop_SOURCES) $(am__alpha_test_SOURCES_DIST) \
 	$(alphamap_SOURCES) $(blitters_test_SOURCES) \
 	$(am__clip_in_SOURCES_DIST) $(am__clip_test_SOURCES_DIST) \
-	composite.c $(am__composite_test_SOURCES_DIST) \
+	$(composite_SOURCES) $(am__composite_test_SOURCES_DIST) \
 	$(am__convolution_test_SOURCES_DIST) fetch-test.c \
 	gradient-crash-test.c $(am__gradient_test_SOURCES_DIST) \
 	$(lowlevel_blt_bench_SOURCES) oob-test.c \
@@ -373,7 +374,6 @@
 
 a1_trap_test_LDADD = $(TEST_LDADD)
 fetch_test_LDADD = $(TEST_LDADD)
-composite_LDADD = $(TEST_LDADD)
 gradient_crash_test_LDADD = $(TEST_LDADD)
 trap_crasher_LDADD = $(TEST_LDADD)
 oob_test_LDADD = $(TEST_LDADD)
@@ -392,6 +392,8 @@
 alphamap_SOURCES = alphamap.c utils.c utils.h
 alpha_loop_LDADD = $(TEST_LDADD)
 alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
+composite_LDADD = $(TEST_LDADD)
+composite_SOURCES = composite.c utils.c utils.h
 
 # GTK using test programs
 @HAVE_GTK_TRUE@GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS)
--- test/alphamap.c
+++ test/alphamap.c
@@ -45,15 +45,29 @@
     return "<unknown - bug in alphamap.c>";
 }
 
+static void
+on_destroy (pixman_image_t *image, void *data)
+{
+    uint32_t *bits = pixman_image_get_data (image);
+
+    fence_free (bits);
+}
+
 static pixman_image_t *
 make_image (pixman_format_code_t format)
 {
     uint32_t *bits;
     uint8_t bpp = PIXMAN_FORMAT_BPP (format) / 8;
+    pixman_image_t *image;
 
     bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
 
-    return pixman_image_create_bits (format, WIDTH, HEIGHT, bits, WIDTH * bpp);
+    image = pixman_image_create_bits (format, WIDTH, HEIGHT, bits, WIDTH * bpp);
+
+    if (image && bits)
+	pixman_image_set_destroy_function (image, on_destroy, NULL);
+
+    return image;
 }
 
 static pixman_image_t *
@@ -68,6 +82,7 @@
 
 	pixman_image_set_alpha_map (image, alpha,
 				    alpha_origin_x, alpha_origin_y);
+	pixman_image_unref (alpha);
     }
 
     return image;
@@ -203,6 +218,10 @@
 	}
     }
 
+    pixman_image_set_alpha_map (src, NULL, 0, 0);
+    pixman_image_set_alpha_map (dst, NULL, 0, 0);
+    pixman_image_set_alpha_map (orig_dst, NULL, 0, 0);
+
     pixman_image_unref (src);
     pixman_image_unref (dst);
     pixman_image_unref (orig_dst);
--- test/blitters-test.c
+++ test/blitters-test.c
@@ -465,6 +465,6 @@
     }
 
     return fuzzer_test_main("blitters", 2000000,
-			    0x217CF14A,
+			    0x1DB8BDF8,
 			    test_composite, argc, argv);
 }
--- test/composite.c
+++ test/composite.c
@@ -1,6 +1,8 @@
 /*
  * Copyright © 2005 Eric Anholt
  * Copyright © 2009 Chris Wilson
+ * Copyright © 2010 Soeren Sandmann
+ * Copyright © 2010 Red Hat, Inc.
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -20,15 +22,14 @@
  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  * PERFORMANCE OF THIS SOFTWARE.
  */
-
+#define PIXMAN_USE_INTERNAL_API
 #include <pixman.h>
 #include <stdio.h>
 #include <stdlib.h> /* abort() */
 #include <math.h>
 #include <config.h>
-
-#define FALSE 0
-#define TRUE !FALSE
+#include <time.h>
+#include "utils.h"
 
 #define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
 #define min(a,b) ((a) <= (b) ? (a) : (b))
@@ -50,14 +51,15 @@
     const char *name;
 };
 
-static color_t colors[] =
+static const color_t colors[] =
 {
-    /* these are premultiplied in main() */
     { 1.0, 1.0, 1.0, 1.0 },
+    { 1.0, 1.0, 1.0, 0.0 },
+    { 0.0, 0.0, 0.0, 1.0 },
+    { 0.0, 0.0, 0.0, 0.0 },
     { 1.0, 0.0, 0.0, 1.0 },
     { 0.0, 1.0, 0.0, 1.0 },
     { 0.0, 0.0, 1.0, 1.0 },
-    { 0.0, 0.0, 0.0, 1.0 },
     { 0.5, 0.0, 0.0, 0.5 },
 };
 
@@ -82,62 +84,66 @@
     out->alpha = _color_double_to_short (color->a);
 }
 
+#define REPEAT 0x01000000
+#define FLAGS  0xff000000
+
+static const int sizes[] =
+{
+    0,
+    1,
+    1 | REPEAT,
+    10
+};
+
 static const format_t formats[] =
 {
 #define P(x) { PIXMAN_##x, #x }
-    P(a8),
 
-    /* 32bpp formats */
+    /* 32 bpp formats */
     P(a8r8g8b8),
     P(x8r8g8b8),
     P(a8b8g8r8),
     P(x8b8g8r8),
     P(b8g8r8a8),
     P(b8g8r8x8),
-
-    /* XXX: and here the errors begin! */
-#if 0
     P(x2r10g10b10),
-    P(a2r10g10b10),
     P(x2b10g10r10),
+    P(a2r10g10b10),
     P(a2b10g10r10),
 
-    /* 24bpp formats */
+    /* 24 bpp formats */
     P(r8g8b8),
     P(b8g8r8),
-
-    /* 16bpp formats */
     P(r5g6b5),
     P(b5g6r5),
 
-    P(a1r5g5b5),
+    /* 16 bpp formats */
     P(x1r5g5b5),
-    P(a1b5g5r5),
     P(x1b5g5r5),
-    P(a4r4g4b4),
-    P(x4r4g4b4),
+    P(a1r5g5b5),
+    P(a1b5g5r5),
     P(a4b4g4r4),
     P(x4b4g4r4),
+    P(a4r4g4b4),
+    P(x4r4g4b4),
 
-    /* 8bpp formats */
+    /* 8 bpp formats */
     P(a8),
     P(r3g3b2),
     P(b2g3r3),
     P(a2r2g2b2),
     P(a2b2g2r2),
-
     P(x4a4),
 
-    /* 4bpp formats */
+    /* 4 bpp formats */
     P(a4),
     P(r1g2b1),
     P(b1g2r1),
     P(a1r1g1b1),
     P(a1b1g1r1),
 
-    /* 1bpp formats */
+    /* 1 bpp formats */
     P(a1)
-#endif
 #undef P
 };
 
@@ -482,8 +488,9 @@
 color_correct (pixman_format_code_t format,
 	       color_t *color)
 {
-#define round_pix(pix, mask) \
-    ((int)((pix) * (mask) + .5) / (double) (mask))
+#define MASK(x) ((1 << (x)) - 1)
+#define round_pix(pix, m)						\
+    ((int)((pix) * (MASK(m)) + .5) / (double) (MASK(m)))
 
     if (PIXMAN_FORMAT_R (format) == 0)
     {
@@ -504,6 +511,7 @@
 	color->a = round_pix (color->a, PIXMAN_FORMAT_A (format));
 
 #undef round_pix
+#undef MASK
 }
 
 static void
@@ -594,18 +602,15 @@
 }
 
 static double
-eval_diff (color_t *expected, color_t *test)
+eval_diff (color_t *expected, color_t *test, pixman_format_code_t format)
 {
     double rscale, gscale, bscale, ascale;
     double rdiff, gdiff, bdiff, adiff;
 
-    /* XXX: Need to be provided mask shifts so we can produce useful error
-     * values.
-     */
-    rscale = 1.0 * (1 << 5);
-    gscale = 1.0 * (1 << 6);
-    bscale = 1.0 * (1 << 5);
-    ascale = 1.0 * 32;
+    rscale = 1.0 * ((1 << PIXMAN_FORMAT_R (format)) - 1);
+    gscale = 1.0 * ((1 << PIXMAN_FORMAT_G (format)) - 1);
+    bscale = 1.0 * ((1 << PIXMAN_FORMAT_B (format)) - 1);
+    ascale = 1.0 * ((1 << PIXMAN_FORMAT_A (format)) - 1);
 
     rdiff = fabs (test->r - expected->r) * rscale;
     bdiff = fabs (test->g - expected->g) * gscale;
@@ -699,7 +704,12 @@
 		  &expected, component_alpha);
     color_correct (dst->format->format, &expected);
 
-    diff = eval_diff (&expected, &result);
+    diff = eval_diff (&expected, &result, dst->format->format);
+
+    /* FIXME: We should find out what deviation is acceptable. 3.0
+     * is clearly absurd for 2 bit formats for example. On the other
+     * hand currently 1.0 does not work.
+     */
     if (diff > 3.0)
     {
 	char buf[40];
@@ -717,7 +727,7 @@
 		result.r, result.g, result.b, result.a,
 		*(unsigned long *) pixman_image_get_data (dst->image),
 		expected.r, expected.g, expected.b, expected.a);
-	
+
 	if (mask != NULL)
 	{
 	    printf ("src color: %.2f %.2f %.2f %.2f\n"
@@ -751,9 +761,6 @@
     return success;
 }
 
-#define REPEAT 0x01000000
-#define FLAGS  0xff000000
-
 static void
 image_init (image_t *info,
 	    int color,
@@ -766,7 +773,7 @@
     compute_pixman_color (info->color, &fill);
 
     info->format = &formats[format];
-    info->size = size & ~FLAGS;
+    info->size = sizes[size] & ~FLAGS;
     info->repeat = PIXMAN_REPEAT_NONE;
 
     if (info->size)
@@ -800,103 +807,105 @@
     pixman_image_unref (info->image);
 }
 
-int
-main (void)
+static int
+random_size (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (sizes));
+}
+
+static int
+random_color (void)
 {
-    pixman_bool_t ok, group_ok = TRUE, ca;
-    int i, d, m, s;
-    int tests_passed = 0, tests_total = 0;
-    int sizes[] = { 1, 1 | REPEAT, 10 };
-    int num_tests;
+    return lcg_rand_n (ARRAY_LENGTH (colors));
+}
+
+static int
+random_format (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (formats));
+}
+
+static pixman_bool_t
+run_test (uint32_t seed)
+{
+    image_t src, mask, dst;
+    const operator_t *op;
+    int ca;
+    int ok;
+
+    lcg_srand (seed);
+    
+    image_init (&dst, random_color(), random_format(), 1);
+    image_init (&src, random_color(), random_format(), random_size());
+    image_init (&mask, random_color(), random_format(), random_size());
+
+    op = &(operators [lcg_rand_n (ARRAY_LENGTH (operators))]);
+
+    ca = lcg_rand_n (3);
 
-    for (i = 0; i < ARRAY_LENGTH (colors); i++)
+    switch (ca)
     {
-	colors[i].r *= colors[i].a;
-	colors[i].g *= colors[i].a;
-	colors[i].b *= colors[i].a;
+    case 0:
+	ok = composite_test (&dst, op, &src, NULL, FALSE);
+	break;
+    case 1:
+	ok = composite_test (&dst, op, &src, &mask, FALSE);
+	break;
+    case 2:
+	ok = composite_test (&dst, op, &src, &mask,
+			     mask.size? TRUE : FALSE);
+	break;
+    default:
+	ok = FALSE;
+	break;
     }
 
-    num_tests = ARRAY_LENGTH (colors) * ARRAY_LENGTH (formats);
+    image_fini (&src);
+    image_fini (&mask);
+    image_fini (&dst);
 
-    for (d = 0; d < num_tests; d++)
-    {
-	image_t dst;
+    return ok;
+}
 
-	image_init (
-	    &dst, d / ARRAY_LENGTH (formats), d % ARRAY_LENGTH (formats), 1);
+int
+main (int argc, char **argv)
+{
+#define N_TESTS (8 * 1024 * 1024)
+    int result = 0;
+    int i;
 
+    if (argc > 1)
+    {
+	char *end;
+	
+	i = strtol (argv[1], &end, 0);
 
-	for (s = -ARRAY_LENGTH (colors);
-	     s < ARRAY_LENGTH (sizes) * num_tests;
-	     s++)
+	if (end != argv[1])
 	{
-	    image_t src;
-
-	    if (s < 0)
-	    {
-		image_init (&src, -s - 1, 0, 0);
-	    }
+	    if (!run_test (i))
+		return 1;
 	    else
-	    {
-		image_init (&src,
-			    s / ARRAY_LENGTH (sizes) / ARRAY_LENGTH (formats),
-			    s / ARRAY_LENGTH (sizes) % ARRAY_LENGTH (formats),
-			    sizes[s % ARRAY_LENGTH (sizes)]);
-	    }
-
-	    for (m = -ARRAY_LENGTH (colors);
-		 m < ARRAY_LENGTH (sizes) * num_tests;
-		 m++)
-	    {
-		image_t mask;
+		return 0;
+	}
+	else
+	{
+	    printf ("Usage:\n\n   %s <number>\n\n", argv[0]);
+	    return -1;
+	}
+    }
 
-		if (m < 0)
-		{
-		    image_init (&mask, -m - 1, 0, 0);
-		}
-		else
-		{
-		    image_init (
-			&mask,
-			m / ARRAY_LENGTH (sizes) / ARRAY_LENGTH (formats),
-			m / ARRAY_LENGTH (sizes) % ARRAY_LENGTH (formats),
-			sizes[m % ARRAY_LENGTH (sizes)]);
-		}
-
-		for (ca = -1; ca <= 1; ca++)
-		{
-		    for (i = 0; i < ARRAY_LENGTH (operators); i++)
-		    {
-			const operator_t *op = &operators[i];
-
-			switch (ca)
-			{
-			case -1:
-			    ok = composite_test (&dst, op, &src, NULL, FALSE);
-			    break;
-			case 0:
-			    ok = composite_test (&dst, op, &src, &mask, FALSE);
-			    break;
-			case 1:
-			    ok = composite_test (&dst, op, &src, &mask,
-						 mask.size? TRUE : FALSE);
-			    break;
-                        default:
-			    ok = FALSE; /* Silence GCC */
-                            break;
-			}
-			group_ok = group_ok && ok;
-			tests_passed += ok;
-			tests_total++;
-		    }
-		}
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(result) shared(argv) 
+#endif
+    for (i = 1; i <= N_TESTS; ++i)
+    {
+	if (!result && !run_test (i))
+	{
+	    printf ("Test %d failed.\n", i);
 
-		image_fini (&mask);
-	    }
-	    image_fini (&src);
+	    result = i;
 	}
-	image_fini (&dst);
     }
-
-    return group_ok == FALSE;
+    
+    return result;
 }
--- test/lowlevel-blt-bench.c
+++ test/lowlevel-blt-bench.c
@@ -544,7 +544,7 @@
 tests_tbl[] =
 {
     { "add_8_8_8",             PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
-    { "add_n_8_8000",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8",             PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
     { "add_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
     { "add_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
     { "add_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
@@ -553,7 +553,7 @@
     { "add_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
     { "add_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
     { "add_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
-    { "add_n_8000",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_n_8",               PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
     { "add_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "add_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
     { "add_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
@@ -562,7 +562,7 @@
     { "add_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
     { "add_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
     { "add_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
-    { "add_8000_8000",         PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_8_8",               PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
     { "add_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
     { "add_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "add_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
--- test/utils.c
+++ test/utils.c
@@ -218,7 +218,12 @@
     int n_bytes;
 } info_t;
 
-#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE)
+#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H)
+
+/* This is apparently necessary on at least OS X */
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
 
 void *
 fence_malloc (uint32_t len)
@@ -238,7 +243,7 @@
     addr = mmap (NULL, n_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
 		 -1, 0);
 
-    if (addr == (void *)MAP_FAILED)
+    if (addr == MAP_FAILED)
     {
 	printf ("mmap failed on %u %u\n", len, n_bytes);
 	return NULL;
@@ -254,20 +259,12 @@
     ((info_t *)initial_page)->trailing = trailing_protected;
     ((info_t *)initial_page)->n_bytes = n_bytes;
 
-    if (mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
-		  PROT_NONE) == -1)
-    {
-	free (addr);
-	return NULL;
-    }
-
-    if (mprotect (trailing_protected, N_TRAILING_PROTECTED * page_size,
-		  PROT_NONE) == -1)
+    if ((mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
+		  PROT_NONE) == -1) ||
+	(mprotect (trailing_protected, N_TRAILING_PROTECTED * page_size,
+		  PROT_NONE) == -1))
     {
-	mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
-		  PROT_READ | PROT_WRITE);
-
-	free (addr);
+	munmap (addr, n_bytes);
 	return NULL;
     }
 
@@ -282,13 +279,6 @@
     uint8_t *leading_protected = payload - N_LEADING_PROTECTED * page_size;
     uint8_t *initial_page = leading_protected - page_size;
     info_t *info = (info_t *)initial_page;
-    uint8_t *trailing_protected = info->trailing;
-
-    mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
-	      PROT_READ | PROT_WRITE);
-
-    mprotect (trailing_protected, N_LEADING_PROTECTED * page_size,
-	      PROT_READ | PROT_WRITE);
 
     munmap (info->addr, info->n_bytes);
 }

++++++ pixman.yaml
--- pixman.yaml
+++ pixman.yaml
@@ -1,6 +1,6 @@
 Name: pixman
 Summary: Pixel manipulation library
-Version: 0.19.4
+Version: 0.20.0
 Release: 1
 Group: System/Libraries
 License: MIT



More information about the MeeGo-commits mailing list