30be02fa253523ca472b67ac3ba08145079071bc
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program_cache.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32 /** @file brw_program_cache.c
33 *
34 * This file implements a simple program cache for 965. The consumers can
35 * query the hash table of programs using a cache_id and program key, and
36 * receive the corresponding program buffer object (plus associated auxiliary
37 * data) in return. Objects in the cache may not have relocations
38 * (pointers to other BOs) in them.
39 *
40 * The inner workings are a simple hash table based on a FNV-1a of the
41 * key data.
42 *
43 * Replacement is not implemented. Instead, when the cache gets too
44 * big we throw out all of the cache data and let it get regenerated.
45 */
46
47 #include "util/imports.h"
48 #include "main/streaming-load-memcpy.h"
49 #include "x86/common_x86_asm.h"
50 #include "intel_batchbuffer.h"
51 #include "brw_state.h"
52 #include "brw_wm.h"
53 #include "brw_gs.h"
54 #include "brw_cs.h"
55 #include "brw_program.h"
56 #include "compiler/brw_eu.h"
57 #include "util/u_memory.h"
58
59 #define FILE_DEBUG_FLAG DEBUG_STATE
60
/** A single cached program, chained in one bucket of the hash table. */
struct brw_cache_item {
   /**
    * Effectively part of the key, cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should be set.
    */
   enum brw_cache_id cache_id;

   /** 32-bit hash of the key data */
   GLuint hash;

   /** for variable-sized keys */
   GLuint key_size;
   /** Size of the prog_data stored immediately after the key. */
   GLuint prog_data_size;
   /** Key data; the prog_data copy follows it in the same allocation. */
   const struct brw_base_prog_key *key;

   /** Byte offset of the program within the cache BO. */
   uint32_t offset;
   /** Size in bytes of the program within the cache BO. */
   uint32_t size;

   /** Next item in this hash bucket's chain. */
   struct brw_cache_item *next;
};
81
82 enum brw_cache_id
83 brw_stage_cache_id(gl_shader_stage stage)
84 {
85 static const enum brw_cache_id stage_ids[] = {
86 BRW_CACHE_VS_PROG,
87 BRW_CACHE_TCS_PROG,
88 BRW_CACHE_TES_PROG,
89 BRW_CACHE_GS_PROG,
90 BRW_CACHE_FS_PROG,
91 BRW_CACHE_CS_PROG,
92 };
93 assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
94 return stage_ids[stage];
95 }
96
97 static GLuint
98 hash_key(struct brw_cache_item *item)
99 {
100 uint32_t hash = _mesa_fnv32_1a_offset_bias;
101 hash = _mesa_fnv32_1a_accumulate(hash, item->cache_id);
102 hash = _mesa_fnv32_1a_accumulate_block(hash, item->key, item->key_size);
103
104 return hash;
105 }
106
107 static int
108 brw_cache_item_equals(const struct brw_cache_item *a,
109 const struct brw_cache_item *b)
110 {
111 return a->cache_id == b->cache_id &&
112 a->hash == b->hash &&
113 a->key_size == b->key_size &&
114 (memcmp(a->key, b->key, a->key_size) == 0);
115 }
116
117 static struct brw_cache_item *
118 search_cache(struct brw_cache *cache, GLuint hash,
119 struct brw_cache_item *lookup)
120 {
121 struct brw_cache_item *c;
122
123 #if 0
124 int bucketcount = 0;
125
126 for (c = cache->items[hash % cache->size]; c; c = c->next)
127 bucketcount++;
128
129 fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
130 cache->size, bucketcount, cache->n_items);
131 #endif
132
133 for (c = cache->items[hash % cache->size]; c; c = c->next) {
134 if (brw_cache_item_equals(lookup, c))
135 return c;
136 }
137
138 return NULL;
139 }
140
141
142 static void
143 rehash(struct brw_cache *cache)
144 {
145 struct brw_cache_item **items;
146 struct brw_cache_item *c, *next;
147 GLuint size, i;
148
149 size = cache->size * 3;
150 items = calloc(size, sizeof(*items));
151
152 for (i = 0; i < cache->size; i++)
153 for (c = cache->items[i]; c; c = next) {
154 next = c->next;
155 c->next = items[c->hash % size];
156 items[c->hash % size] = c;
157 }
158
159 free(cache->items);
160 cache->items = items;
161 cache->size = size;
162 }
163
164
165 /**
166 * Returns the buffer object matching cache_id and key, or NULL.
167 */
168 bool
169 brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
170 const void *key, GLuint key_size, uint32_t *inout_offset,
171 void *inout_prog_data, bool flag_state)
172 {
173 struct brw_cache_item *item;
174 struct brw_cache_item lookup;
175 GLuint hash;
176
177 lookup.cache_id = cache_id;
178 lookup.key = key;
179 lookup.key_size = key_size;
180 hash = hash_key(&lookup);
181 lookup.hash = hash;
182
183 item = search_cache(cache, hash, &lookup);
184
185 if (item == NULL)
186 return false;
187
188 void *prog_data = ((char *) item->key) + item->key_size;
189
190 if (item->offset != *inout_offset ||
191 prog_data != *((void **) inout_prog_data)) {
192 if (likely(flag_state))
193 cache->brw->ctx.NewDriverState |= (1 << cache_id);
194 *inout_offset = item->offset;
195 *((void **) inout_prog_data) = prog_data;
196 }
197
198 return true;
199 }
200
/**
 * Replace the program cache BO with a larger one of \p new_size bytes,
 * copying over any programs already uploaded, and flag the state that
 * depends on the cache BO's address.
 */
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   struct brw_bo *new_bo;

   perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
              (unsigned) cache->bo->size / 1024, new_size / 1024);

   new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
                         BRW_MEMZONE_SHADER);
   if (can_do_exec_capture(brw->screen))
      new_bo->kflags |= EXEC_OBJECT_CAPTURE;

   /* Persistent async map: the cache writes programs through this pointer
    * for the BO's whole lifetime.
    */
   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
                                       MAP_ASYNC | MAP_PERSISTENT);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
#ifdef USE_SSE41
      /* For a non-cache-coherent source map, use SSE 4.1 streaming loads
       * when available; reads of such memory are slow via plain memcpy.
       */
      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
      else
#endif
         memcpy(map, cache->map, cache->next_offset);
   }

   /* Drop our mapping and reference to the old BO before installing the
    * replacement.
    */
   brw_bo_unmap(cache->bo);
   brw_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->map = map;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}
239
240 /**
241 * Attempts to find an item in the cache with identical data.
242 */
243 static const struct brw_cache_item *
244 brw_lookup_prog(const struct brw_cache *cache,
245 enum brw_cache_id cache_id,
246 const void *data, unsigned data_size)
247 {
248 unsigned i;
249 const struct brw_cache_item *item;
250
251 for (i = 0; i < cache->size; i++) {
252 for (item = cache->items[i]; item; item = item->next) {
253 if (item->cache_id != cache_id || item->size != data_size ||
254 memcmp(cache->map + item->offset, data, item->size) != 0)
255 continue;
256
257 return item;
258 }
259 }
260
261 return NULL;
262 }
263
264 static uint32_t
265 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
266 {
267 uint32_t offset;
268
269 /* Allocate space in the cache BO for our new program. */
270 if (cache->next_offset + size > cache->bo->size) {
271 uint32_t new_size = cache->bo->size * 2;
272
273 while (cache->next_offset + size > new_size)
274 new_size *= 2;
275
276 brw_cache_new_bo(cache, new_size);
277 }
278
279 offset = cache->next_offset;
280
281 /* Programs are always 64-byte aligned, so set up the next one now */
282 cache->next_offset = ALIGN(offset + size, 64);
283
284 return offset;
285 }
286
287 const void *
288 brw_find_previous_compile(struct brw_cache *cache,
289 enum brw_cache_id cache_id,
290 unsigned program_string_id)
291 {
292 for (unsigned i = 0; i < cache->size; i++) {
293 for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
294 if (c->cache_id == cache_id &&
295 c->key->program_string_id == program_string_id) {
296 return c->key;
297 }
298 }
299 }
300
301 return NULL;
302 }
303
304 void
305 brw_upload_cache(struct brw_cache *cache,
306 enum brw_cache_id cache_id,
307 const void *key,
308 GLuint key_size,
309 const void *data,
310 GLuint data_size,
311 const void *prog_data,
312 GLuint prog_data_size,
313 uint32_t *out_offset,
314 void *out_prog_data)
315 {
316 struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
317 const struct brw_cache_item *matching_data =
318 brw_lookup_prog(cache, cache_id, data, data_size);
319 GLuint hash;
320 void *tmp;
321
322 item->cache_id = cache_id;
323 item->size = data_size;
324 item->key = key;
325 item->key_size = key_size;
326 item->prog_data_size = prog_data_size;
327 hash = hash_key(item);
328 item->hash = hash;
329
330 /* If we can find a matching prog in the cache already, then reuse the
331 * existing stuff without creating new copy into the underlying buffer
332 * object. This is notably useful for programs generating shaders at
333 * runtime, where multiple shaders may compile to the same thing in our
334 * backend.
335 */
336 if (matching_data) {
337 item->offset = matching_data->offset;
338 } else {
339 item->offset = brw_alloc_item_data(cache, data_size);
340
341 /* Copy data to the buffer */
342 memcpy(cache->map + item->offset, data, data_size);
343 }
344
345 /* Set up the memory containing the key and prog_data */
346 tmp = malloc(key_size + prog_data_size);
347
348 memcpy(tmp, key, key_size);
349 memcpy(tmp + key_size, prog_data, prog_data_size);
350
351 item->key = tmp;
352
353 if (cache->n_items > cache->size * 1.5f)
354 rehash(cache);
355
356 hash %= cache->size;
357 item->next = cache->items[hash];
358 cache->items[hash] = item;
359 cache->n_items++;
360
361 *out_offset = item->offset;
362 *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
363 cache->brw->ctx.NewDriverState |= 1 << cache_id;
364 }
365
366 void
367 brw_init_caches(struct brw_context *brw)
368 {
369 struct brw_cache *cache = &brw->cache;
370
371 cache->brw = brw;
372
373 cache->size = 7;
374 cache->n_items = 0;
375 cache->items =
376 calloc(cache->size, sizeof(struct brw_cache_item *));
377
378 cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
379 BRW_MEMZONE_SHADER);
380 if (can_do_exec_capture(brw->screen))
381 cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
382
383 cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
384 MAP_ASYNC | MAP_PERSISTENT);
385 }
386
/**
 * Free every item in the cache and reset it to empty, then flag all state
 * dirty so the programs get regenerated.  The BO itself is kept; uploads
 * simply restart at offset 0.
 */
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
   struct brw_cache_item *c, *next;
   GLuint i;

   DBG("%s\n", __func__);

   for (i = 0; i < cache->size; i++) {
      for (c = cache->items[i]; c; c = next) {
         next = c->next;
         /* Stage programs store their prog_data immediately after the key
          * in the same allocation; release its internal allocations first.
          */
         if (c->cache_id == BRW_CACHE_VS_PROG ||
             c->cache_id == BRW_CACHE_TCS_PROG ||
             c->cache_id == BRW_CACHE_TES_PROG ||
             c->cache_id == BRW_CACHE_GS_PROG ||
             c->cache_id == BRW_CACHE_FS_PROG ||
             c->cache_id == BRW_CACHE_CS_PROG) {
            const void *item_prog_data = ((char *)c->key) + c->key_size;
            brw_stage_prog_data_free(item_prog_data);
         }
         free((void *)c->key);
         free(c);
      }
      cache->items[i] = NULL;
   }

   cache->n_items = 0;

   /* Start putting programs into the start of the BO again, since
    * we'll never find the old results.
    */
   cache->next_offset = 0;

   /* We need to make sure that the programs get regenerated, since
    * any offsets leftover in brw_context will no longer be valid.
    */
   brw->NewGLState = ~0;
   brw->ctx.NewDriverState = ~0ull;
   brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
   brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;

   /* Also, NULL out any stale program pointers. */
   brw->vs.base.prog_data = NULL;
   brw->tcs.base.prog_data = NULL;
   brw->tes.base.prog_data = NULL;
   brw->gs.base.prog_data = NULL;
   brw->wm.base.prog_data = NULL;
   brw->cs.base.prog_data = NULL;

   /* NOTE(review): presumably the flush ensures no in-flight batch still
    * references the freed programs — confirm against callers.
    */
   intel_batchbuffer_flush(brw);
}
440
441 void
442 brw_program_cache_check_size(struct brw_context *brw)
443 {
444 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
445 * state cache.
446 */
447 if (brw->cache.n_items > 2000) {
448 perf_debug("Exceeded state cache size limit. Clearing the set "
449 "of compiled programs, which will trigger recompiles\n");
450 brw_clear_cache(brw, &brw->cache);
451 brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
452 }
453 }
454
455
456 static void
457 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
458 {
459
460 DBG("%s\n", __func__);
461
462 /* This can be NULL if context creation failed early on */
463 if (cache->bo) {
464 brw_bo_unmap(cache->bo);
465 brw_bo_unreference(cache->bo);
466 cache->bo = NULL;
467 cache->map = NULL;
468 }
469 brw_clear_cache(brw, cache);
470 free(cache->items);
471 cache->items = NULL;
472 cache->size = 0;
473 }
474
475
476 void
477 brw_destroy_caches(struct brw_context *brw)
478 {
479 brw_destroy_cache(brw, &brw->cache);
480 }
481
482 static const char *
483 cache_name(enum brw_cache_id cache_id)
484 {
485 switch (cache_id) {
486 case BRW_CACHE_VS_PROG:
487 return "VS kernel";
488 case BRW_CACHE_TCS_PROG:
489 return "TCS kernel";
490 case BRW_CACHE_TES_PROG:
491 return "TES kernel";
492 case BRW_CACHE_FF_GS_PROG:
493 return "Fixed-function GS kernel";
494 case BRW_CACHE_GS_PROG:
495 return "GS kernel";
496 case BRW_CACHE_CLIP_PROG:
497 return "CLIP kernel";
498 case BRW_CACHE_SF_PROG:
499 return "SF kernel";
500 case BRW_CACHE_FS_PROG:
501 return "FS kernel";
502 case BRW_CACHE_CS_PROG:
503 return "CS kernel";
504 default:
505 return "unknown";
506 }
507 }
508
509 void
510 brw_print_program_cache(struct brw_context *brw)
511 {
512 const struct brw_cache *cache = &brw->cache;
513 struct brw_cache_item *item;
514
515 for (unsigned i = 0; i < cache->size; i++) {
516 for (item = cache->items[i]; item; item = item->next) {
517 fprintf(stderr, "%s:\n", cache_name(i));
518 brw_disassemble(&brw->screen->devinfo, cache->map,
519 item->offset, item->size, stderr);
520 }
521 }
522 }