* data) in return. Objects in the cache may not have relocations
* (pointers to other BOs) in them.
*
- * The inner workings are a simple hash table based on a CRC of the
+ * The inner workings are a simple hash table based on an FNV-1a hash of the
* key data.
*
* Replacement is not implemented. Instead, when the cache gets too
* big we throw out all of the cache data and let it get regenerated.
*/
-#include "main/imports.h"
+#include "main/streaming-load-memcpy.h"
+#include "x86/common_x86_asm.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
-#include "brw_vs.h"
#include "brw_wm.h"
#include "brw_gs.h"
#include "brw_cs.h"
#include "brw_program.h"
+#include "compiler/brw_eu.h"
+#include "util/u_memory.h"
#define FILE_DEBUG_FLAG DEBUG_STATE
-static unsigned
-get_program_string_id(enum brw_cache_id cache_id, const void *key)
+struct brw_cache_item {
+ /**
+ * Effectively part of the key, cache_id identifies what kind of state
+ * buffer is involved, and also which dirty flag should be set.
+ */
+ enum brw_cache_id cache_id;
+
+ /** 32-bit hash of the key data */
+ GLuint hash;
+
+ /** for variable-sized keys */
+ GLuint key_size;
+ GLuint prog_data_size;
+ const struct brw_base_prog_key *key;
+
+ uint32_t offset;
+ uint32_t size;
+
+ struct brw_cache_item *next;
+};
+
+enum brw_cache_id
+brw_stage_cache_id(gl_shader_stage stage)
{
- switch (cache_id) {
- case BRW_CACHE_VS_PROG:
- return ((struct brw_vs_prog_key *) key)->program_string_id;
- case BRW_CACHE_TCS_PROG:
- return ((struct brw_tcs_prog_key *) key)->program_string_id;
- case BRW_CACHE_TES_PROG:
- return ((struct brw_tes_prog_key *) key)->program_string_id;
- case BRW_CACHE_GS_PROG:
- return ((struct brw_gs_prog_key *) key)->program_string_id;
- case BRW_CACHE_CS_PROG:
- return ((struct brw_cs_prog_key *) key)->program_string_id;
- case BRW_CACHE_FS_PROG:
- return ((struct brw_wm_prog_key *) key)->program_string_id;
- default:
- unreachable("no program string id for this kind of program");
- }
+ static const enum brw_cache_id stage_ids[] = {
+ BRW_CACHE_VS_PROG,
+ BRW_CACHE_TCS_PROG,
+ BRW_CACHE_TES_PROG,
+ BRW_CACHE_GS_PROG,
+ BRW_CACHE_FS_PROG,
+ BRW_CACHE_CS_PROG,
+ };
+ assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
+ return stage_ids[stage];
}
static GLuint
hash_key(struct brw_cache_item *item)
{
- GLuint *ikey = (GLuint *)item->key;
- GLuint hash = item->cache_id, i;
-
- assert(item->key_size % 4 == 0);
-
- /* I'm sure this can be improved on:
- */
- for (i = 0; i < item->key_size/4; i++) {
- hash ^= ikey[i];
- hash = (hash << 5) | (hash >> 27);
- }
+ uint32_t hash = _mesa_fnv32_1a_offset_bias;
+ hash = _mesa_fnv32_1a_accumulate(hash, item->cache_id);
+ hash = _mesa_fnv32_1a_accumulate_block(hash, item->key, item->key_size);
return hash;
}
* Returns the buffer object matching cache_id and key, or NULL.
*/
bool
-brw_search_cache(struct brw_cache *cache,
- enum brw_cache_id cache_id,
- const void *key, GLuint key_size,
- uint32_t *inout_offset, void *inout_aux)
+brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
+ const void *key, GLuint key_size, uint32_t *inout_offset,
+ void *inout_prog_data, bool flag_state)
{
- struct brw_context *brw = cache->brw;
struct brw_cache_item *item;
struct brw_cache_item lookup;
GLuint hash;
if (item == NULL)
return false;
- void *aux = ((char *) item->key) + item->key_size;
+ void *prog_data = ((char *) item->key) + item->key_size;
- if (item->offset != *inout_offset || aux != *((void **) inout_aux)) {
- brw->ctx.NewDriverState |= (1 << cache_id);
+ if (item->offset != *inout_offset ||
+ prog_data != *((void **) inout_prog_data)) {
+ if (likely(flag_state))
+ cache->brw->ctx.NewDriverState |= (1 << cache_id);
*inout_offset = item->offset;
- *((void **) inout_aux) = aux;
+ *((void **) inout_prog_data) = prog_data;
}
return true;
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
struct brw_context *brw = cache->brw;
- drm_intel_bo *new_bo;
+ struct brw_bo *new_bo;
- new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
- if (brw->has_llc)
- drm_intel_gem_bo_map_unsynchronized(new_bo);
+ perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
+ (unsigned) cache->bo->size / 1024, new_size / 1024);
+
+ new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
+ BRW_MEMZONE_SHADER);
+ if (can_do_exec_capture(brw->screen))
+ new_bo->kflags |= EXEC_OBJECT_CAPTURE;
+
+ void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
+ MAP_ASYNC | MAP_PERSISTENT);
/* Copy any existing data that needs to be saved. */
if (cache->next_offset != 0) {
- if (brw->has_llc) {
- memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
- } else {
- drm_intel_bo_map(cache->bo, false);
- drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
- cache->bo->virtual);
- drm_intel_bo_unmap(cache->bo);
- }
+#ifdef USE_SSE41
+ if (!cache->bo->cache_coherent && cpu_has_sse4_1)
+ _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
+ else
+#endif
+ memcpy(map, cache->map, cache->next_offset);
}
- if (brw->has_llc)
- drm_intel_bo_unmap(cache->bo);
- drm_intel_bo_unreference(cache->bo);
+ brw_bo_unmap(cache->bo);
+ brw_bo_unreference(cache->bo);
cache->bo = new_bo;
- cache->bo_used_by_gpu = false;
+ cache->map = map;
/* Since we have a new BO in place, we need to signal the units
* that depend on it (state base address on gen5+, or unit state before).
enum brw_cache_id cache_id,
const void *data, unsigned data_size)
{
- const struct brw_context *brw = cache->brw;
unsigned i;
const struct brw_cache_item *item;
for (i = 0; i < cache->size; i++) {
for (item = cache->items[i]; item; item = item->next) {
- int ret;
-
- if (item->cache_id != cache_id || item->size != data_size)
- continue;
-
- if (!brw->has_llc)
- drm_intel_bo_map(cache->bo, false);
- ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
- if (!brw->has_llc)
- drm_intel_bo_unmap(cache->bo);
- if (ret)
+ if (item->cache_id != cache_id || item->size != data_size ||
+ memcmp(cache->map + item->offset, data, item->size) != 0)
continue;
return item;
brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
{
uint32_t offset;
- struct brw_context *brw = cache->brw;
/* Allocate space in the cache BO for our new program. */
if (cache->next_offset + size > cache->bo->size) {
brw_cache_new_bo(cache, new_size);
}
- /* If we would block on writing to an in-use program BO, just
- * recreate it.
- */
- if (!brw->has_llc && cache->bo_used_by_gpu) {
- perf_debug("Copying busy program cache buffer.\n");
- brw_cache_new_bo(cache, cache->bo->size);
- }
-
offset = cache->next_offset;
/* Programs are always 64-byte aligned, so set up the next one now */
for (unsigned i = 0; i < cache->size; i++) {
for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
if (c->cache_id == cache_id &&
- get_program_string_id(cache_id, c->key) == program_string_id) {
+ c->key->program_string_id == program_string_id) {
return c->key;
}
}
GLuint key_size,
const void *data,
GLuint data_size,
- const void *aux,
- GLuint aux_size,
+ const void *prog_data,
+ GLuint prog_data_size,
uint32_t *out_offset,
- void *out_aux)
+ void *out_prog_data)
{
- struct brw_context *brw = cache->brw;
struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
const struct brw_cache_item *matching_data =
brw_lookup_prog(cache, cache_id, data, data_size);
item->size = data_size;
item->key = key;
item->key_size = key_size;
- item->aux_size = aux_size;
+ item->prog_data_size = prog_data_size;
hash = hash_key(item);
item->hash = hash;
item->offset = brw_alloc_item_data(cache, data_size);
/* Copy data to the buffer */
- if (brw->has_llc) {
- memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
- } else {
- drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
- }
+ memcpy(cache->map + item->offset, data, data_size);
}
- /* Set up the memory containing the key and aux_data */
- tmp = malloc(key_size + aux_size);
+ /* Set up the memory containing the key and prog_data */
+ tmp = malloc(key_size + prog_data_size);
memcpy(tmp, key, key_size);
- memcpy(tmp + key_size, aux, aux_size);
+ memcpy(tmp + key_size, prog_data, prog_data_size);
item->key = tmp;
cache->n_items++;
*out_offset = item->offset;
- *(void **)out_aux = (void *)((char *)item->key + item->key_size);
+ *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
cache->brw->ctx.NewDriverState |= 1 << cache_id;
}
cache->items =
calloc(cache->size, sizeof(struct brw_cache_item *));
- cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
- if (brw->has_llc)
- drm_intel_gem_bo_map_unsynchronized(cache->bo);
+ cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
+ BRW_MEMZONE_SHADER);
+ if (can_do_exec_capture(brw->screen))
+ cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
+
+ cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
+ MAP_ASYNC | MAP_PERSISTENT);
}
static void
c->cache_id == BRW_CACHE_GS_PROG ||
c->cache_id == BRW_CACHE_FS_PROG ||
c->cache_id == BRW_CACHE_CS_PROG) {
- const void *item_aux = c->key + c->key_size;
- brw_stage_prog_data_free(item_aux);
+ const void *item_prog_data = ((char *)c->key) + c->key_size;
+ brw_stage_prog_data_free(item_prog_data);
}
free((void *)c->key);
free(c);
perf_debug("Exceeded state cache size limit. Clearing the set "
"of compiled programs, which will trigger recompiles\n");
brw_clear_cache(brw, &brw->cache);
+ brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
}
}
DBG("%s\n", __func__);
- if (brw->has_llc)
- drm_intel_bo_unmap(cache->bo);
- drm_intel_bo_unreference(cache->bo);
- cache->bo = NULL;
+ /* This can be NULL if context creation failed early on */
+ if (cache->bo) {
+ brw_bo_unmap(cache->bo);
+ brw_bo_unreference(cache->bo);
+ cache->bo = NULL;
+ cache->map = NULL;
+ }
brw_clear_cache(brw, cache);
free(cache->items);
cache->items = NULL;
{
brw_destroy_cache(brw, &brw->cache);
}
+
+static const char *
+cache_name(enum brw_cache_id cache_id)
+{
+ switch (cache_id) {
+ case BRW_CACHE_VS_PROG:
+ return "VS kernel";
+ case BRW_CACHE_TCS_PROG:
+ return "TCS kernel";
+ case BRW_CACHE_TES_PROG:
+ return "TES kernel";
+ case BRW_CACHE_FF_GS_PROG:
+ return "Fixed-function GS kernel";
+ case BRW_CACHE_GS_PROG:
+ return "GS kernel";
+ case BRW_CACHE_CLIP_PROG:
+ return "CLIP kernel";
+ case BRW_CACHE_SF_PROG:
+ return "SF kernel";
+ case BRW_CACHE_FS_PROG:
+ return "FS kernel";
+ case BRW_CACHE_CS_PROG:
+ return "CS kernel";
+ default:
+ return "unknown";
+ }
+}
+
+void
+brw_print_program_cache(struct brw_context *brw)
+{
+   const struct brw_cache *cache = &brw->cache;
+   struct brw_cache_item *item;
+
+   for (unsigned i = 0; i < cache->size; i++) {
+      for (item = cache->items[i]; item; item = item->next) {
+         /* 'i' is the hash-bucket index, not a cache id; use the item's
+          * own cache_id so the label matches the program being dumped.
+          */
+         fprintf(stderr, "%s:\n", cache_name(item->cache_id));
+         brw_disassemble(&brw->screen->devinfo, cache->map,
+                         item->offset, item->size, stderr);
+      }
+   }
+}