i965: Make the param pointer arrays for the VS dynamically sized.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_state_cache.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */

/** @file brw_state_cache.c
 *
 * This file implements a simple static state cache for 965.  The
 * consumers can query the hash table of state using a cache_id and
 * opaque key data, and receive the corresponding state buffer object
 * (plus associated auxiliary data) in return.  Objects in the cache
 * may not have relocations (pointers to other BOs) in them.
 *
 * The inner workings are a simple hash table based on a CRC of the
 * key data.
 *
 * Replacement is not implemented.  Instead, when the cache gets too
 * big we throw out all of the cache data and let it get regenerated.
 */

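/* A sketch of the typical consumer flow, for orientation only.  The
 * "key", "program", "program_size", and "local_prog_data" names below are
 * hypothetical stand-ins for the per-stage types the real callers use:
 *
 *    uint32_t offset = 0;
 *    struct brw_wm_prog_data *prog_data;
 *
 *    if (!brw_search_cache(&brw->cache, BRW_WM_PROG,
 *                          &key, sizeof(key),
 *                          &offset, &prog_data)) {
 *       ...compile the program, then add it to the cache:
 *       brw_upload_cache(&brw->cache, BRW_WM_PROG,
 *                        &key, sizeof(key),
 *                        program, program_size,
 *                        &local_prog_data, sizeof(local_prog_data),
 *                        &offset, &prog_data);
 *    }
 *
 * Both paths hand back the program's offset within the cache BO and a
 * pointer to the cache's copy of the aux data; brw_search_cache() also
 * flags the cache_id's dirty bit whenever the returned offset changes.
 */
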
#include "main/imports.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
#include "brw_vs.h"
#include "brw_wm.h"

#define FILE_DEBUG_FLAG DEBUG_STATE

static GLuint
hash_key(struct brw_cache_item *item)
{
   GLuint *ikey = (GLuint *)item->key;
   GLuint hash = item->cache_id, i;

   assert(item->key_size % 4 == 0);

   /* I'm sure this can be improved on:
    */
   for (i = 0; i < item->key_size / 4; i++) {
      hash ^= ikey[i];
      hash = (hash << 5) | (hash >> 27);
   }

   return hash;
}

static int
brw_cache_item_equals(const struct brw_cache_item *a,
                      const struct brw_cache_item *b)
{
   return a->cache_id == b->cache_id &&
      a->hash == b->hash &&
      a->key_size == b->key_size &&
      (memcmp(a->key, b->key, a->key_size) == 0);
}

static struct brw_cache_item *
search_cache(struct brw_cache *cache, GLuint hash,
             struct brw_cache_item *lookup)
{
   struct brw_cache_item *c;

#if 0
   int bucketcount = 0;

   for (c = cache->items[hash % cache->size]; c; c = c->next)
      bucketcount++;

   fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
           cache->size, bucketcount, cache->n_items);
#endif

   for (c = cache->items[hash % cache->size]; c; c = c->next) {
      if (brw_cache_item_equals(lookup, c))
         return c;
   }

   return NULL;
}


static void
rehash(struct brw_cache *cache)
{
   struct brw_cache_item **items;
   struct brw_cache_item *c, *next;
   GLuint size, i;

   size = cache->size * 3;
   items = calloc(1, size * sizeof(*items));

   for (i = 0; i < cache->size; i++)
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         c->next = items[c->hash % size];
         items[c->hash % size] = c;
      }

   free(cache->items);
   cache->items = items;
   cache->size = size;
}


/**
 * Looks for an item matching cache_id and key.  Returns true on a hit,
 * filling in *inout_offset with the item's offset in the cache BO and
 * *out_aux with a pointer to its auxiliary data.
 */
bool
brw_search_cache(struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *key, GLuint key_size,
                 uint32_t *inout_offset, void *out_aux)
{
   struct brw_context *brw = cache->brw;
   struct brw_cache_item *item;
   struct brw_cache_item lookup;
   GLuint hash;

   lookup.cache_id = cache_id;
   lookup.key = key;
   lookup.key_size = key_size;
   hash = hash_key(&lookup);
   lookup.hash = hash;

   item = search_cache(cache, hash, &lookup);

   if (item == NULL)
      return false;

   /* The item's aux data is stored contiguously after its key (see
    * brw_upload_cache()).
    */
   *(void **)out_aux = ((char *)item->key + item->key_size);

   if (item->offset != *inout_offset) {
      brw->state.dirty.cache |= (1 << cache_id);
      *inout_offset = item->offset;
   }

   return true;
}

static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   struct intel_context *intel = &brw->intel;
   drm_intel_bo *new_bo;

   new_bo = drm_intel_bo_alloc(intel->bufmgr, "program cache", new_size, 64);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
      drm_intel_bo_map(cache->bo, false);
      drm_intel_bo_subdata(new_bo, 0, cache->next_offset, cache->bo->virtual);
      drm_intel_bo_unmap(cache->bo);
   }

   drm_intel_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->bo_used_by_gpu = false;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->state.dirty.brw |= BRW_NEW_PROGRAM_CACHE;
}

/**
 * Attempts to find an item in the cache with identical data and aux
 * data to reuse, instead of uploading the program a second time.
 */
static bool
brw_try_upload_using_copy(struct brw_cache *cache,
                          struct brw_cache_item *result_item,
                          const void *data,
                          const void *aux)
{
   int i;
   struct brw_cache_item *item;

   for (i = 0; i < cache->size; i++) {
      for (item = cache->items[i]; item; item = item->next) {
         const void *item_aux = item->key + item->key_size;
         int ret;

         if (item->cache_id != result_item->cache_id ||
             item->size != result_item->size ||
             item->aux_size != result_item->aux_size) {
            continue;
         }

         if (cache->aux_compare[result_item->cache_id]) {
            if (!cache->aux_compare[result_item->cache_id](item_aux, aux,
                                                           item->aux_size,
                                                           item->key))
               continue;
         } else if (memcmp(item_aux, aux, item->aux_size) != 0) {
            continue;
         }

         drm_intel_bo_map(cache->bo, false);
         ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
         drm_intel_bo_unmap(cache->bo);
         if (ret)
            continue;

         result_item->offset = item->offset;

         return true;
      }
   }

   return false;
}

static void
brw_upload_item_data(struct brw_cache *cache,
                     struct brw_cache_item *item,
                     const void *data)
{
   /* Allocate space in the cache BO for our new program. */
   if (cache->next_offset + item->size > cache->bo->size) {
      uint32_t new_size = cache->bo->size * 2;

      while (cache->next_offset + item->size > new_size)
         new_size *= 2;

      brw_cache_new_bo(cache, new_size);
   }

   /* If we would block on writing to an in-use program BO, just
    * recreate it.
    */
   if (cache->bo_used_by_gpu) {
      brw_cache_new_bo(cache, cache->bo->size);
   }

   item->offset = cache->next_offset;

   /* Programs are always 64-byte aligned, so set up the next one now */
   cache->next_offset = ALIGN(item->offset + item->size, 64);
}

void
brw_upload_cache(struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *key,
                 GLuint key_size,
                 const void *data,
                 GLuint data_size,
                 const void *aux,
                 GLuint aux_size,
                 uint32_t *out_offset,
                 void *out_aux)
{
   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
   GLuint hash;
   void *tmp;

   item->cache_id = cache_id;
   item->size = data_size;
   item->key = key;
   item->key_size = key_size;
   item->aux_size = aux_size;
   hash = hash_key(item);
   item->hash = hash;

   /* If we can find a matching prog/prog_data combo in the cache
    * already, then reuse the existing stuff.  This will mean not
    * flagging CACHE_NEW_* when transitioning between the two
    * equivalent hash keys.  This is notably useful for programs
    * generating shaders at runtime, where multiple shaders may
    * compile to the same thing in our backend.
    */
   if (!brw_try_upload_using_copy(cache, item, data, aux)) {
      brw_upload_item_data(cache, item, data);
   }

   /* Set up the memory containing the key and aux_data */
   tmp = malloc(key_size + aux_size);

   memcpy(tmp, key, key_size);
   memcpy(tmp + key_size, aux, aux_size);

   item->key = tmp;

   if (cache->n_items > cache->size * 1.5)
      rehash(cache);

   hash %= cache->size;
   item->next = cache->items[hash];
   cache->items[hash] = item;
   cache->n_items++;

   /* Copy data to the buffer */
   drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);

   *out_offset = item->offset;
   *(void **)out_aux = (void *)((char *)item->key + item->key_size);
   cache->brw->state.dirty.cache |= 1 << cache_id;
}

void
brw_init_caches(struct brw_context *brw)
{
   struct intel_context *intel = &brw->intel;
   struct brw_cache *cache = &brw->cache;

   cache->brw = brw;

   cache->size = 7;
   cache->n_items = 0;
   cache->items =
      calloc(1, cache->size * sizeof(*cache->items));

   cache->bo = drm_intel_bo_alloc(intel->bufmgr,
                                  "program cache",
                                  4096, 64);

   cache->aux_compare[BRW_VS_PROG] = brw_vs_prog_data_compare;
   cache->aux_compare[BRW_WM_PROG] = brw_wm_prog_data_compare;
   cache->aux_free[BRW_VS_PROG] = brw_vs_prog_data_free;
   cache->aux_free[BRW_WM_PROG] = brw_wm_prog_data_free;
}

static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct intel_context *intel = &brw->intel;
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __FUNCTION__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         if (cache->aux_free[c->cache_id]) {
            const void *item_aux = c->key + c->key_size;
            cache->aux_free[c->cache_id](item_aux);
         }
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->state.dirty.mesa |= ~0;
   brw->state.dirty.brw |= ~0;
   brw->state.dirty.cache |= ~0;
   intel_batchbuffer_flush(intel);
}

void
brw_state_cache_check_size(struct brw_context *brw)
{
   /* un-tuned guess.  Each object is generally a page, so 2000 of them is
    * 8 MB of state cache.
    */
   if (brw->cache.n_items > 2000) {
      perf_debug("Exceeded state cache size limit.  Clearing the set "
                 "of compiled programs, which will trigger recompiles\n");
      brw_clear_cache(brw, &brw->cache);
   }
}


static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{

   DBG("%s\n", __FUNCTION__);

   drm_intel_bo_unreference(cache->bo);
   cache->bo = NULL;
   brw_clear_cache(brw, cache);
   free(cache->items);
   cache->items = NULL;
   cache->size = 0;
}


void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}