i965: Split shader_time entries into separate cachelines.
author Eric Anholt <eric@anholt.net>
Mon, 11 Mar 2013 19:59:06 +0000 (12:59 -0700)
committer Eric Anholt <eric@anholt.net>
Thu, 14 Mar 2013 19:30:39 +0000 (12:30 -0700)
This avoids some cache-snooping overhead between EUs processing separate
shaders (e.g. VS versus FS), since their entries no longer share a cacheline.

Improves performance of a minecraft trace with shader_time by 28.9% +/-
18.3% (n=7), and performance of my old GLSL demo by 93.7% +/- 0.8% (n=4).

v2: Add a define for the stride with a comment explaining its units and
    why.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_fs.cpp
src/mesa/drivers/dri/i965/brw_program.c
src/mesa/drivers/dri/i965/brw_vec4.cpp

index c34d6b108a2eca14416d69798bca89af1cb28e3c..d042dd640c91caedc35ff806fdd069d1b51695a5 100644 (file)
@@ -571,6 +571,14 @@ struct brw_vs_prog_data {
 #define SURF_INDEX_SOL_BINDING(t)    ((t))
 #define BRW_MAX_GS_SURFACES          SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS)
 
+/**
+ * Stride in bytes between shader_time entries.
+ *
+ * We separate entries by a cacheline to reduce traffic between EUs writing to
+ * different entries.
+ */
+#define SHADER_TIME_STRIDE 64
+
 enum brw_cache_id {
    BRW_BLEND_STATE,
    BRW_DEPTH_STENCIL_STATE,
index 8ce39543002de8ecf7424262eb8e884543d904e6..8476bb51086106860d1dbe80b5c1bceabad73512 100644 (file)
@@ -621,7 +621,7 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 
    fs_reg offset_mrf = fs_reg(MRF, base_mrf);
    offset_mrf.type = BRW_REGISTER_TYPE_UD;
-   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
+   emit(MOV(offset_mrf, fs_reg(shader_time_index * SHADER_TIME_STRIDE)));
 
    fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
    time_mrf.type = BRW_REGISTER_TYPE_UD;
index 75eb6bc66899056e59e93ffc134ea587cbda2a0d..62954d3cf7863f2eaa6225e5d02d11bcf95cc21f 100644 (file)
@@ -228,7 +228,8 @@ brw_init_shader_time(struct brw_context *brw)
 
    const int max_entries = 4096;
    brw->shader_time.bo = drm_intel_bo_alloc(intel->bufmgr, "shader time",
-                                            max_entries * 4, 4096);
+                                            max_entries * SHADER_TIME_STRIDE,
+                                            4096);
    brw->shader_time.programs = rzalloc_array(brw, struct gl_shader_program *,
                                              max_entries);
    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
@@ -409,7 +410,7 @@ brw_collect_shader_time(struct brw_context *brw)
    uint32_t *times = brw->shader_time.bo->virtual;
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {
-      brw->shader_time.cumulative[i] += times[i];
+      brw->shader_time.cumulative[i] += times[i * SHADER_TIME_STRIDE / 4];
    }
 
    /* Zero the BO out to clear it out for our next collection.
index f319f32c2cc236e183c0853c7696fc80951995ab..d759710af3cc75f171ac12f5d8716caf8b5f9f43 100644 (file)
@@ -1225,7 +1225,7 @@ vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 
    dst_reg offset_mrf = dst_reg(MRF, base_mrf);
    offset_mrf.type = BRW_REGISTER_TYPE_UD;
-   emit(MOV(offset_mrf, src_reg(shader_time_index * 4)));
+   emit(MOV(offset_mrf, src_reg(shader_time_index * SHADER_TIME_STRIDE)));
 
    dst_reg time_mrf = dst_reg(MRF, base_mrf + 1);
    time_mrf.type = BRW_REGISTER_TYPE_UD;