#include "brw_context.h"
#include "intel_tiled_memcpy.h"
+#if defined(USE_SSE41)
+#include "main/streaming-load-memcpy.h"
+#include <smmintrin.h>
+#endif
#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
return dst;
}
+#if defined(USE_SSE41)
+static ALWAYS_INLINE void *
+_memcpy_streaming_load(void *dest, const void *src, size_t count)
+{
+ if (count == 16) {
+ __m128i val = _mm_stream_load_si128((__m128i *)src);
+ _mm_store_si128((__m128i *)dest, val);
+ return dest;
+ } else if (count == 64) {
+ __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
+ __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
+ __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
+ __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
+ _mm_store_si128(((__m128i *)dest) + 0, val0);
+ _mm_store_si128(((__m128i *)dest) + 1, val1);
+ _mm_store_si128(((__m128i *)dest) + 2, val2);
+ _mm_store_si128(((__m128i *)dest) + 3, val3);
+ return dest;
+ } else {
+ assert(count < 64); /* and (count < 16) for ytiled */
+ return memcpy(dest, src, count);
+ }
+}
+#endif
+
/**
* Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
* These ranges are in bytes, i.e. pixels * bytes-per-pixel.
return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
} else {
return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
}
return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
} else {
return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
}
unreachable("unsupported tiling");
}
+#if defined(USE_SSE41)
+ if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) {
+ /* The hidden cacheline sized register used by movntdqa can apparently
+ * give you stale data, so do an mfence to invalidate it.
+ */
+ _mm_mfence();
+ }
+#endif
+
/* Round out to tile boundaries. */
xt0 = ALIGN_DOWN(xt1, tw);
xt3 = ALIGN_UP (xt2, tw);
'intel_tex_image.c',
'intel_tex_obj.h',
'intel_tex_validate.c',
- 'intel_tiled_memcpy.c',
- 'intel_tiled_memcpy.h',
'intel_upload.c',
'libdrm_macros.h',
)
+files_intel_tiled_memcpy = files(
+ 'intel_tiled_memcpy.c',
+ 'intel_tiled_memcpy.h',
+)
+
i965_gen_libs = []
foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110']
i965_gen_libs += static_library(
],
)
+intel_tiled_memcpy = static_library(
+ 'intel_tiled_memcpy',
+ [files_intel_tiled_memcpy],
+ include_directories : [
+ inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+ ],
+ c_args : [c_vis_args, no_override_init_args, '-msse2', sse41_args],
+)
+
libi965 = static_library(
'i965',
[files_i965, i965_oa_sources, ir_expression_operation_h,
cpp_args : [cpp_vis_args, '-msse2'],
link_with : [
i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
- libblorp,
+ libblorp, intel_tiled_memcpy,
],
dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
)