+ /*
+ * The rounding may not be quite the same with PIPE_ARCH_SSE
+ * (util_iround right now only does nearest/even on x87,
+ * otherwise nearest/away-from-zero).
+ * Both should be acceptable, I think.
+ */
+#if defined(PIPE_ARCH_SSE)
+ __m128 v0r, v1r;
+ __m128 vxy0xy2, vxy1xy0;
+ __m128i vxy0xy2i, vxy1xy0i;
+ __m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
+ __m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
+ __m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
+ v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0]));
+ vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]);
+ v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0]));
+ vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2);
+ vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
+ vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
+ vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);
+ vxy1xy0 = _mm_mul_ps(vxy1xy0, fixed_one);
+ vxy0xy2i = _mm_cvtps_epi32(vxy0xy2);
+ vxy1xy0i = _mm_cvtps_epi32(vxy1xy0);
+ dxdy0120 = _mm_sub_epi32(vxy0xy2i, vxy1xy0i);
+ _mm_store_si128((__m128i *)&position->dx01, dxdy0120);
+ /*
+ * For the mul, would need some more shuffles, plus emulation
+ * for the signed mul (without sse41), so don't bother.
+ */
+ x0x2y0y2 = _mm_shuffle_epi32(vxy0xy2i, _MM_SHUFFLE(3,1,2,0));
+ x1x0y1y0 = _mm_shuffle_epi32(vxy1xy0i, _MM_SHUFFLE(3,1,2,0));
+ x0120 = _mm_unpacklo_epi32(x0x2y0y2, x1x0y1y0);
+ y0120 = _mm_unpackhi_epi32(x0x2y0y2, x1x0y1y0);
+ _mm_store_si128((__m128i *)&position->x[0], x0120);
+ _mm_store_si128((__m128i *)&position->y[0], y0120);
+
+#else