gallium/u_threaded: align batches and call slots to 16 bytes
author Marek Olšák <marek.olsak@amd.com>
Tue, 30 May 2017 23:32:01 +0000 (01:32 +0200)
committer Marek Olšák <marek.olsak@amd.com>
Mon, 5 Jun 2017 16:25:57 +0000 (18:25 +0200)
not sure if this helps

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
src/gallium/auxiliary/util/u_threaded_context.c
src/gallium/auxiliary/util/u_threaded_context.h

index 8ea7f8aa260d956461a9e1e59f3f2d3ca1bc04d8..34206bfbf40927eb88b4b895e584d3ab204deb68 100644 (file)
@@ -2125,7 +2125,7 @@ tc_destroy(struct pipe_context *_pipe)
 
    slab_destroy_child(&tc->pool_transfers);
    pipe->destroy(pipe);
-   FREE(tc);
+   os_free_aligned(tc);
 }
 
 static const tc_execute execute_func[TC_NUM_CALLS] = {
@@ -2165,11 +2165,18 @@ threaded_context_create(struct pipe_context *pipe,
    if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1))
       return pipe;
 
-   tc = CALLOC_STRUCT(threaded_context);
+   tc = os_malloc_aligned(sizeof(struct threaded_context), 16);
    if (!tc) {
       pipe->destroy(pipe);
       return NULL;
    }
+   memset(tc, 0, sizeof(*tc));
+
+   assert((uintptr_t)tc % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0]) % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[0]) % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
+   STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[1].call[0]) % 16 == 0);
 
    /* The driver context isn't wrapped, so set its "priv" to NULL. */
    pipe->priv = NULL;
index f13923050a07f46db182b907e382bc0c940a6dd5..5d2a10cb125958b66bbb7c7b0c66165526a3ca48 100644 (file)
@@ -273,7 +273,14 @@ union tc_payload {
    uint64_t __use_8_bytes;
 };
 
-struct tc_call {
+#ifdef _MSC_VER
+#define ALIGN16 __declspec(align(16))
+#else
+#define ALIGN16 __attribute__((aligned(16)))
+#endif
+
+/* Each call slot should be aligned to its own size for optimal cache usage. */
+struct ALIGN16 tc_call {
    unsigned sentinel;
    ushort num_call_slots;
    ushort call_id;