/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
 /*
  * Authors:
  *   Keith Whitwell <keithw@vmware.com>
  */

/** @file brw_state_cache.c
 *
 * This file implements a simple static state cache for 965.  Consumers
 * query the hash table with a cache_id and opaque key data, and receive
 * back the offset of the matching state in the cache's buffer object
 * (plus associated auxiliary data).  Objects in the cache may not have
 * relocations (pointers to other BOs) in them.
 *
 * The inner workings are a simple hash table keyed on a hash of the key
 * data.  A sketch of the typical consumer pattern follows the includes
 * below.
 *
 * Replacement is not implemented.  Instead, when the cache gets too
 * big we throw out all of the cache data and let it get regenerated.
 */

#include "main/imports.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
#include "brw_vs.h"
#include "brw_wm.h"
#include "brw_gs.h"
#include "brw_cs.h"

#define FILE_DEBUG_FLAG DEBUG_STATE

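/* A minimal sketch of the typical consumer pattern, using the vertex shader
 * stage as an example: look the compiled program up by key, and on a miss
 * compile it and add it to the cache.  compile_vs_program() is a
 * hypothetical stand-in for the real per-stage compile path;
 * brw_search_cache(), brw_upload_cache() and BRW_CACHE_VS_PROG are the
 * interfaces defined by this cache, and the rest follows the shape of the
 * per-stage upload code.
 */
#if 0
static void
example_upload_vs_prog(struct brw_context *brw,
                       const struct brw_vs_prog_key *key)
{
   /* Cache hit: prog_offset and prog_data are filled in, nothing to do. */
   if (brw_search_cache(&brw->cache, BRW_CACHE_VS_PROG,
                        key, sizeof(*key),
                        &brw->vs.base.prog_offset, &brw->vs.prog_data))
      return;

   /* Cache miss: compile (hypothetical helper), then upload the program
    * along with its prog_data as aux data.
    */
   struct brw_vs_prog_data prog_data;
   unsigned program_size;
   const unsigned *program =
      compile_vs_program(brw, key, &prog_data, &program_size);

   brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG,
                    key, sizeof(*key),
                    program, program_size,
                    &prog_data, sizeof(prog_data),
                    &brw->vs.base.prog_offset, &brw->vs.prog_data);
}
#endif
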
static GLuint
hash_key(struct brw_cache_item *item)
{
   GLuint *ikey = (GLuint *)item->key;
   GLuint hash = item->cache_id, i;

   assert(item->key_size % 4 == 0);

   /* I'm sure this can be improved on:
    */
   for (i = 0; i < item->key_size/4; i++) {
      hash ^= ikey[i];
      hash = (hash << 5) | (hash >> 27);
   }

   return hash;
}

static int
brw_cache_item_equals(const struct brw_cache_item *a,
                      const struct brw_cache_item *b)
{
   return a->cache_id == b->cache_id &&
          a->hash == b->hash &&
          a->key_size == b->key_size &&
          (memcmp(a->key, b->key, a->key_size) == 0);
}

static struct brw_cache_item *
search_cache(struct brw_cache *cache, GLuint hash,
             struct brw_cache_item *lookup)
{
   struct brw_cache_item *c;

#if 0
   int bucketcount = 0;

   for (c = cache->items[hash % cache->size]; c; c = c->next)
      bucketcount++;

   fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
           cache->size, bucketcount, cache->n_items);
#endif

   for (c = cache->items[hash % cache->size]; c; c = c->next) {
      if (brw_cache_item_equals(lookup, c))
         return c;
   }

   return NULL;
}


static void
rehash(struct brw_cache *cache)
{
   struct brw_cache_item **items;
   struct brw_cache_item *c, *next;
   GLuint size, i;

   size = cache->size * 3;
   items = calloc(size, sizeof(*items));

   for (i = 0; i < cache->size; i++)
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         c->next = items[c->hash % size];
         items[c->hash % size] = c;
      }

   free(cache->items);
   cache->items = items;
   cache->size = size;
}


/**
 * Looks up the cache entry matching cache_id and key.
 *
 * On a hit, stores the entry's offset in the cache BO in *inout_offset,
 * points *out_aux at the entry's auxiliary data, and returns true.
 * Returns false on a miss.
 */
bool
brw_search_cache(struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *key, GLuint key_size,
                 uint32_t *inout_offset, void *out_aux)
{
   struct brw_context *brw = cache->brw;
   struct brw_cache_item *item;
   struct brw_cache_item lookup;
   GLuint hash;

   lookup.cache_id = cache_id;
   lookup.key = key;
   lookup.key_size = key_size;
   hash = hash_key(&lookup);
   lookup.hash = hash;

   item = search_cache(cache, hash, &lookup);

   if (item == NULL)
      return false;

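   /* The aux data is stored contiguously after the key (see
    * brw_upload_cache()), so out_aux can simply point into the item's key
    * allocation.
    */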
   *(void **)out_aux = ((char *)item->key + item->key_size);

   if (item->offset != *inout_offset) {
      brw->ctx.NewDriverState |= (1 << cache_id);
      *inout_offset = item->offset;
   }

   return true;
}

static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   drm_intel_bo *new_bo;

   new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
   if (brw->has_llc)
      drm_intel_gem_bo_map_unsynchronized(new_bo);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
      if (brw->has_llc) {
         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
      } else {
         drm_intel_bo_map(cache->bo, false);
         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
                              cache->bo->virtual);
         drm_intel_bo_unmap(cache->bo);
      }
   }

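   /* In the LLC case the cache BO is kept persistently mapped (see the
    * map_unsynchronized calls above and in brw_init_caches()), so the old
    * BO has to be unmapped before it is released; without LLC it was only
    * mapped transiently above.
    */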
   if (brw->has_llc)
      drm_intel_bo_unmap(cache->bo);
   drm_intel_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->bo_used_by_gpu = false;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
}

/**
 * Attempts to find an item in the cache with identical data and aux
 * data, so that the new item can reuse the existing program's offset
 * instead of uploading a duplicate copy.
 */
static bool
brw_try_upload_using_copy(struct brw_cache *cache,
                          struct brw_cache_item *result_item,
                          const void *data,
                          const void *aux)
{
   struct brw_context *brw = cache->brw;
   int i;
   struct brw_cache_item *item;

   for (i = 0; i < cache->size; i++) {
      for (item = cache->items[i]; item; item = item->next) {
         const void *item_aux = item->key + item->key_size;
         int ret;

         if (item->cache_id != result_item->cache_id ||
             item->size != result_item->size ||
             item->aux_size != result_item->aux_size) {
            continue;
         }

         if (cache->aux_compare[result_item->cache_id]) {
            if (!cache->aux_compare[result_item->cache_id](item_aux, aux))
               continue;
         } else if (memcmp(item_aux, aux, item->aux_size) != 0) {
            continue;
         }

         if (!brw->has_llc)
            drm_intel_bo_map(cache->bo, false);
         ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
         if (!brw->has_llc)
            drm_intel_bo_unmap(cache->bo);
         if (ret)
            continue;

         result_item->offset = item->offset;

         return true;
      }
   }

   return false;
}

static void
brw_upload_item_data(struct brw_cache *cache,
                     struct brw_cache_item *item,
                     const void *data)
{
   struct brw_context *brw = cache->brw;

   /* Allocate space in the cache BO for our new program. */
   if (cache->next_offset + item->size > cache->bo->size) {
      uint32_t new_size = cache->bo->size * 2;

      while (cache->next_offset + item->size > new_size)
         new_size *= 2;

      brw_cache_new_bo(cache, new_size);
   }

   /* If we would block on writing to an in-use program BO, just
    * recreate it.
    */
   if (!brw->has_llc && cache->bo_used_by_gpu) {
      perf_debug("Copying busy program cache buffer.\n");
      brw_cache_new_bo(cache, cache->bo->size);
   }

   item->offset = cache->next_offset;

   /* Programs are always 64-byte aligned, so set up the next one now */
   cache->next_offset = ALIGN(item->offset + item->size, 64);
}

void
brw_upload_cache(struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *key,
                 GLuint key_size,
                 const void *data,
                 GLuint data_size,
                 const void *aux,
                 GLuint aux_size,
                 uint32_t *out_offset,
                 void *out_aux)
{
   struct brw_context *brw = cache->brw;
   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
   GLuint hash;
   void *tmp;

   item->cache_id = cache_id;
   item->size = data_size;
   item->key = key;
   item->key_size = key_size;
   item->aux_size = aux_size;
   hash = hash_key(item);
   item->hash = hash;

   /* If we can find a matching prog/prog_data combo in the cache
    * already, then reuse the existing stuff.  This will mean not
    * flagging CACHE_NEW_* when transitioning between the two
    * equivalent hash keys.  This is notably useful for programs
    * generating shaders at runtime, where multiple shaders may
    * compile to the same thing in our backend.
    */
   if (!brw_try_upload_using_copy(cache, item, data, aux)) {
      brw_upload_item_data(cache, item, data);
   }

   /* Set up the memory containing the key and aux_data */
   tmp = malloc(key_size + aux_size);

   memcpy(tmp, key, key_size);
   memcpy(tmp + key_size, aux, aux_size);

   item->key = tmp;

   if (cache->n_items > cache->size * 1.5)
      rehash(cache);

   hash %= cache->size;
   item->next = cache->items[hash];
   cache->items[hash] = item;
   cache->n_items++;

   /* Copy data to the buffer */
   if (brw->has_llc) {
      memcpy((char *) cache->bo->virtual + item->offset, data, data_size);
   } else {
      drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
   }

   *out_offset = item->offset;
   *(void **)out_aux = (void *)((char *)item->key + item->key_size);
   cache->brw->ctx.NewDriverState |= 1 << cache_id;
}

void
brw_init_caches(struct brw_context *brw)
{
   struct brw_cache *cache = &brw->cache;

   cache->brw = brw;

   cache->size = 7;
   cache->n_items = 0;
   cache->items =
      calloc(cache->size, sizeof(struct brw_cache_item *));

   cache->bo = drm_intel_bo_alloc(brw->bufmgr,
                                  "program cache",
                                  4096, 64);
   if (brw->has_llc)
      drm_intel_gem_bo_map_unsynchronized(cache->bo);

   cache->aux_compare[BRW_CACHE_VS_PROG] = brw_vs_prog_data_compare;
   cache->aux_compare[BRW_CACHE_GS_PROG] = brw_gs_prog_data_compare;
   cache->aux_compare[BRW_CACHE_FS_PROG] = brw_wm_prog_data_compare;
   cache->aux_compare[BRW_CACHE_CS_PROG] = brw_cs_prog_data_compare;
   cache->aux_free[BRW_CACHE_VS_PROG] = brw_stage_prog_data_free;
   cache->aux_free[BRW_CACHE_GS_PROG] = brw_stage_prog_data_free;
   cache->aux_free[BRW_CACHE_FS_PROG] = brw_stage_prog_data_free;
   cache->aux_free[BRW_CACHE_CS_PROG] = brw_stage_prog_data_free;
}

static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __func__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         if (cache->aux_free[c->cache_id]) {
            const void *item_aux = c->key + c->key_size;
            cache->aux_free[c->cache_id](item_aux);
         }
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->NewGLState |= ~0;
   brw->ctx.NewDriverState |= ~0ull;
   intel_batchbuffer_flush(brw);
}

void
brw_state_cache_check_size(struct brw_context *brw)
{
   /* An un-tuned guess.  Each object is generally a page, so 2000 of them
    * is 8 MB of state cache.
    */
   if (brw->cache.n_items > 2000) {
      perf_debug("Exceeded state cache size limit.  Clearing the set "
                 "of compiled programs, which will trigger recompiles\n");
      brw_clear_cache(brw, &brw->cache);
   }
}


static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{
   DBG("%s\n", __func__);

   if (cache->bo == NULL)
      return;

   if (brw->has_llc)
      drm_intel_bo_unmap(cache->bo);
   drm_intel_bo_unreference(cache->bo);
   cache->bo = NULL;
   brw_clear_cache(brw, cache);
   free(cache->items);
   cache->items = NULL;
   cache->size = 0;
}


void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}