i965: Fix streaming loads for intel_tiled_memcpy
author: Chris Wilson <chris@chris-wilson.co.uk>
Fri, 25 May 2018 23:33:56 +0000 (00:33 +0100)
committer: Kenneth Graunke <kenneth@whitecape.org>
Sat, 26 May 2018 04:35:50 +0000 (21:35 -0700)
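We stream from a tiled and aligned source into an unaligned user buffer.
_mm_store_si128 requires a 16-byte-aligned destination, so the stores need
to use _mm_storeu_si128 instead; the streaming loads from the aligned
source are unchanged.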

Fixes: d21c086d819d78fb3f6abcbb14aa492970f442aa (i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/mesa/drivers/dri/i965/intel_tiled_memcpy.c

index fac5427d2ed685131636446b638a690f9f034b46..6440dceac362806ad9eec8a40c747d97bc4bd502 100644 (file)
@@ -223,17 +223,17 @@ _memcpy_streaming_load(void *dest, const void *src, size_t count)
 {
    if (count == 16) {
       __m128i val = _mm_stream_load_si128((__m128i *)src);
-      _mm_store_si128((__m128i *)dest, val);
+      _mm_storeu_si128((__m128i *)dest, val);
       return dest;
    } else if (count == 64) {
       __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
       __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
       __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
       __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
-      _mm_store_si128(((__m128i *)dest) + 0, val0);
-      _mm_store_si128(((__m128i *)dest) + 1, val1);
-      _mm_store_si128(((__m128i *)dest) + 2, val2);
-      _mm_store_si128(((__m128i *)dest) + 3, val3);
+      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
+      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
+      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
+      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
       return dest;
    } else {
       assert(count < 64); /* and (count < 16) for ytiled */