i965: Make a helper for finding an existing shader variant.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program_cache.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32 /** @file brw_program_cache.c
33 *
34 * This file implements a simple program cache for 965. The consumers can
35 * query the hash table of programs using a cache_id and program key, and
36 * receive the corresponding program buffer object (plus associated auxiliary
37 * data) in return. Objects in the cache may not have relocations
38 * (pointers to other BOs) in them.
39 *
40 * The inner workings are a simple hash table based on a CRC of the
41 * key data.
42 *
43 * Replacement is not implemented. Instead, when the cache gets too
44 * big we throw out all of the cache data and let it get regenerated.
45 */
46
47 #include "main/imports.h"
48 #include "intel_batchbuffer.h"
49 #include "brw_state.h"
50 #include "brw_vs.h"
51 #include "brw_wm.h"
52 #include "brw_gs.h"
53 #include "brw_cs.h"
54 #include "brw_program.h"
55
56 #define FILE_DEBUG_FLAG DEBUG_STATE
57
58 static unsigned
59 get_program_string_id(enum brw_cache_id cache_id, const void *key)
60 {
61 switch (cache_id) {
62 case BRW_CACHE_VS_PROG:
63 return ((struct brw_vs_prog_key *) key)->program_string_id;
64 case BRW_CACHE_TCS_PROG:
65 return ((struct brw_tcs_prog_key *) key)->program_string_id;
66 case BRW_CACHE_TES_PROG:
67 return ((struct brw_tes_prog_key *) key)->program_string_id;
68 case BRW_CACHE_GS_PROG:
69 return ((struct brw_gs_prog_key *) key)->program_string_id;
70 case BRW_CACHE_CS_PROG:
71 return ((struct brw_cs_prog_key *) key)->program_string_id;
72 case BRW_CACHE_FS_PROG:
73 return ((struct brw_wm_prog_key *) key)->program_string_id;
74 default:
75 unreachable("no program string id for this kind of program");
76 }
77 }
78
79 static GLuint
80 hash_key(struct brw_cache_item *item)
81 {
82 GLuint *ikey = (GLuint *)item->key;
83 GLuint hash = item->cache_id, i;
84
85 assert(item->key_size % 4 == 0);
86
87 /* I'm sure this can be improved on:
88 */
89 for (i = 0; i < item->key_size/4; i++) {
90 hash ^= ikey[i];
91 hash = (hash << 5) | (hash >> 27);
92 }
93
94 return hash;
95 }
96
97 static int
98 brw_cache_item_equals(const struct brw_cache_item *a,
99 const struct brw_cache_item *b)
100 {
101 return a->cache_id == b->cache_id &&
102 a->hash == b->hash &&
103 a->key_size == b->key_size &&
104 (memcmp(a->key, b->key, a->key_size) == 0);
105 }
106
107 static struct brw_cache_item *
108 search_cache(struct brw_cache *cache, GLuint hash,
109 struct brw_cache_item *lookup)
110 {
111 struct brw_cache_item *c;
112
113 #if 0
114 int bucketcount = 0;
115
116 for (c = cache->items[hash % cache->size]; c; c = c->next)
117 bucketcount++;
118
119 fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
120 cache->size, bucketcount, cache->n_items);
121 #endif
122
123 for (c = cache->items[hash % cache->size]; c; c = c->next) {
124 if (brw_cache_item_equals(lookup, c))
125 return c;
126 }
127
128 return NULL;
129 }
130
131
132 static void
133 rehash(struct brw_cache *cache)
134 {
135 struct brw_cache_item **items;
136 struct brw_cache_item *c, *next;
137 GLuint size, i;
138
139 size = cache->size * 3;
140 items = calloc(size, sizeof(*items));
141
142 for (i = 0; i < cache->size; i++)
143 for (c = cache->items[i]; c; c = next) {
144 next = c->next;
145 c->next = items[c->hash % size];
146 items[c->hash % size] = c;
147 }
148
149 free(cache->items);
150 cache->items = items;
151 cache->size = size;
152 }
153
154
155 /**
156 * Returns the buffer object matching cache_id and key, or NULL.
157 */
158 bool
159 brw_search_cache(struct brw_cache *cache,
160 enum brw_cache_id cache_id,
161 const void *key, GLuint key_size,
162 uint32_t *inout_offset, void *inout_aux)
163 {
164 struct brw_context *brw = cache->brw;
165 struct brw_cache_item *item;
166 struct brw_cache_item lookup;
167 GLuint hash;
168
169 lookup.cache_id = cache_id;
170 lookup.key = key;
171 lookup.key_size = key_size;
172 hash = hash_key(&lookup);
173 lookup.hash = hash;
174
175 item = search_cache(cache, hash, &lookup);
176
177 if (item == NULL)
178 return false;
179
180 void *aux = ((char *) item->key) + item->key_size;
181
182 if (item->offset != *inout_offset || aux != *((void **) inout_aux)) {
183 brw->ctx.NewDriverState |= (1 << cache_id);
184 *inout_offset = item->offset;
185 *((void **) inout_aux) = aux;
186 }
187
188 return true;
189 }
190
/**
 * Replace the cache's backing BO with a new one of new_size bytes, copying
 * over all programs already uploaded ([0, next_offset)).
 *
 * Because every cached program offset now refers to a different buffer,
 * state base address (gen5+) or unit state must be re-emitted, so
 * BRW_NEW_PROGRAM_CACHE is flagged and state_base_address_emitted cleared.
 */
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   drm_intel_bo *new_bo;

   new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
   /* On LLC systems the cache BO is kept persistently mapped; map the new
    * one without stalling on the GPU.
    */
   if (brw->has_llc)
      drm_intel_gem_bo_map_unsynchronized(new_bo);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
      if (brw->has_llc) {
         /* Both BOs are CPU-mapped: plain memcpy. */
         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
      } else {
         /* Map the old BO for reading, then stream its contents into the
          * new BO via subdata.
          */
         drm_intel_bo_map(cache->bo, false);
         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
                              cache->bo->virtual);
         drm_intel_bo_unmap(cache->bo);
      }
   }

   /* Drop the old BO's persistent mapping before releasing our reference. */
   if (brw->has_llc)
      drm_intel_bo_unmap(cache->bo);
   drm_intel_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->bo_used_by_gpu = false;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}
225
226 /**
227 * Attempts to find an item in the cache with identical data.
228 */
229 static const struct brw_cache_item *
230 brw_lookup_prog(const struct brw_cache *cache,
231 enum brw_cache_id cache_id,
232 const void *data, unsigned data_size)
233 {
234 const struct brw_context *brw = cache->brw;
235 unsigned i;
236 const struct brw_cache_item *item;
237
238 for (i = 0; i < cache->size; i++) {
239 for (item = cache->items[i]; item; item = item->next) {
240 int ret;
241
242 if (item->cache_id != cache_id || item->size != data_size)
243 continue;
244
245 if (!brw->has_llc)
246 drm_intel_bo_map(cache->bo, false);
247 ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
248 if (!brw->has_llc)
249 drm_intel_bo_unmap(cache->bo);
250 if (ret)
251 continue;
252
253 return item;
254 }
255 }
256
257 return NULL;
258 }
259
260 static uint32_t
261 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
262 {
263 uint32_t offset;
264 struct brw_context *brw = cache->brw;
265
266 /* Allocate space in the cache BO for our new program. */
267 if (cache->next_offset + size > cache->bo->size) {
268 uint32_t new_size = cache->bo->size * 2;
269
270 while (cache->next_offset + size > new_size)
271 new_size *= 2;
272
273 brw_cache_new_bo(cache, new_size);
274 }
275
276 /* If we would block on writing to an in-use program BO, just
277 * recreate it.
278 */
279 if (!brw->has_llc && cache->bo_used_by_gpu) {
280 perf_debug("Copying busy program cache buffer.\n");
281 brw_cache_new_bo(cache, cache->bo->size);
282 }
283
284 offset = cache->next_offset;
285
286 /* Programs are always 64-byte aligned, so set up the next one now */
287 cache->next_offset = ALIGN(offset + size, 64);
288
289 return offset;
290 }
291
292 const void *
293 brw_find_previous_compile(struct brw_cache *cache,
294 enum brw_cache_id cache_id,
295 unsigned program_string_id)
296 {
297 for (unsigned i = 0; i < cache->size; i++) {
298 for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
299 if (c->cache_id == cache_id &&
300 get_program_string_id(cache_id, c->key) == program_string_id) {
301 return c->key;
302 }
303 }
304 }
305
306 return NULL;
307 }
308
/**
 * Upload a newly-compiled program into the cache.
 *
 * The program bytes go into the cache BO (deduplicated against identical
 * existing programs via brw_lookup_prog); the key and aux data are copied
 * into a single malloc'd allocation (aux immediately follows the key).
 * On return, *out_offset is the program's BO offset and *out_aux points at
 * the item's copy of the aux data.  The cache_id's driver state is flagged
 * dirty.
 */
void
brw_upload_cache(struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *key,
                 GLuint key_size,
                 const void *data,
                 GLuint data_size,
                 const void *aux,
                 GLuint aux_size,
                 uint32_t *out_offset,
                 void *out_aux)
{
   struct brw_context *brw = cache->brw;
   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
   const struct brw_cache_item *matching_data =
      brw_lookup_prog(cache, cache_id, data, data_size);
   GLuint hash;
   void *tmp;

   item->cache_id = cache_id;
   item->size = data_size;
   /* Point at the caller's key just long enough to hash it; replaced by
    * our own copy (tmp) below.
    */
   item->key = key;
   item->key_size = key_size;
   item->aux_size = aux_size;
   hash = hash_key(item);
   item->hash = hash;

   /* If we can find a matching prog in the cache already, then reuse the
    * existing stuff without creating new copy into the underlying buffer
    * object. This is notably useful for programs generating shaders at
    * runtime, where multiple shaders may compile to the same thing in our
    * backend.
    */
   if (matching_data) {
      item->offset = matching_data->offset;
   } else {
      item->offset = brw_alloc_item_data(cache, data_size);

      /* Copy data to the buffer */
      if (brw->has_llc) {
         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
      } else {
         drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
      }
   }

   /* Set up the memory containing the key and aux_data */
   tmp = malloc(key_size + aux_size);

   memcpy(tmp, key, key_size);
   memcpy(tmp + key_size, aux, aux_size);

   item->key = tmp;

   /* Keep the load factor below 1.5 items per bucket. */
   if (cache->n_items > cache->size * 1.5f)
      rehash(cache);

   /* Insert at the head of the item's bucket. */
   hash %= cache->size;
   item->next = cache->items[hash];
   cache->items[hash] = item;
   cache->n_items++;

   *out_offset = item->offset;
   *(void **)out_aux = (void *)((char *)item->key + item->key_size);
   cache->brw->ctx.NewDriverState |= 1 << cache_id;
}
375
376 void
377 brw_init_caches(struct brw_context *brw)
378 {
379 struct brw_cache *cache = &brw->cache;
380
381 cache->brw = brw;
382
383 cache->size = 7;
384 cache->n_items = 0;
385 cache->items =
386 calloc(cache->size, sizeof(struct brw_cache_item *));
387
388 cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
389 if (brw->has_llc)
390 drm_intel_gem_bo_map_unsynchronized(cache->bo);
391 }
392
/**
 * Throw away every item in the cache and reset the BO allocator, forcing
 * all programs to be recompiled and re-uploaded.
 *
 * All GL and driver state is flagged dirty because any program offsets
 * held in brw_context are now meaningless, and the batch is flushed so the
 * GPU finishes with the old programs before the BO space is reused.
 */
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __func__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         /* Shader program items own their prog_data (stored as the aux
          * data after the key); free it through the stage helper.
          */
         if (c->cache_id == BRW_CACHE_VS_PROG ||
             c->cache_id == BRW_CACHE_TCS_PROG ||
             c->cache_id == BRW_CACHE_TES_PROG ||
             c->cache_id == BRW_CACHE_GS_PROG ||
             c->cache_id == BRW_CACHE_FS_PROG ||
             c->cache_id == BRW_CACHE_CS_PROG) {
            const void *item_aux = c->key + c->key_size;
            brw_stage_prog_data_free(item_aux);
         }
         /* c->key is the single malloc'd key+aux allocation. */
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->NewGLState = ~0;
   brw->ctx.NewDriverState = ~0ull;
   brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;

   /* Also, NULL out any stale program pointers. */
   brw->vs.base.prog_data = NULL;
   brw->tcs.base.prog_data = NULL;
   brw->tes.base.prog_data = NULL;
   brw->gs.base.prog_data = NULL;
   brw->wm.base.prog_data = NULL;
   brw->cs.base.prog_data = NULL;

   intel_batchbuffer_flush(brw);
}
446
447 void
448 brw_program_cache_check_size(struct brw_context *brw)
449 {
450 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
451 * state cache.
452 */
453 if (brw->cache.n_items > 2000) {
454 perf_debug("Exceeded state cache size limit. Clearing the set "
455 "of compiled programs, which will trigger recompiles\n");
456 brw_clear_cache(brw, &brw->cache);
457 }
458 }
459
460
461 static void
462 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
463 {
464
465 DBG("%s\n", __func__);
466
467 if (brw->has_llc)
468 drm_intel_bo_unmap(cache->bo);
469 drm_intel_bo_unreference(cache->bo);
470 cache->bo = NULL;
471 brw_clear_cache(brw, cache);
472 free(cache->items);
473 cache->items = NULL;
474 cache->size = 0;
475 }
476
477
/**
 * Tear down all program caches owned by the context.
 */
void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}