Move compiler.h and imports.h/c from src/mesa/main into src/util
[mesa.git] / src / mesa / drivers / dri / i965 / brw_program_cache.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32 /** @file brw_program_cache.c
33 *
34 * This file implements a simple program cache for 965. The consumers can
35 * query the hash table of programs using a cache_id and program key, and
36 * receive the corresponding program buffer object (plus associated auxiliary
37 * data) in return. Objects in the cache may not have relocations
38 * (pointers to other BOs) in them.
39 *
40 * The inner workings are a simple hash table based on a FNV-1a of the
41 * key data.
42 *
43 * Replacement is not implemented. Instead, when the cache gets too
44 * big we throw out all of the cache data and let it get regenerated.
45 */
46
47 #include "util/imports.h"
48 #include "main/streaming-load-memcpy.h"
49 #include "x86/common_x86_asm.h"
50 #include "intel_batchbuffer.h"
51 #include "brw_state.h"
52 #include "brw_wm.h"
53 #include "brw_gs.h"
54 #include "brw_cs.h"
55 #include "brw_program.h"
56 #include "compiler/brw_eu.h"
57
58 #define FILE_DEBUG_FLAG DEBUG_STATE
59
struct brw_cache_item {
   /**
    * Effectively part of the key: cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should be set when the
    * item is (re)bound.
    */
   enum brw_cache_id cache_id;

   /** 32-bit FNV-1a hash of cache_id + key data (see hash_key()). */
   GLuint hash;

   /** Size in bytes of the variable-sized key pointed to by \c key. */
   GLuint key_size;
   /** Size in bytes of the prog_data stored immediately after the key. */
   GLuint prog_data_size;
   /* Heap allocation holding the key bytes followed by the prog_data bytes;
    * owned by the cache and freed in brw_clear_cache().
    */
   const struct brw_base_prog_key *key;

   /** Byte offset of the program data within the cache BO. */
   uint32_t offset;
   /** Size in bytes of the program data in the cache BO. */
   uint32_t size;

   /** Next item in the same hash bucket (separate chaining). */
   struct brw_cache_item *next;
};
80
81 enum brw_cache_id
82 brw_stage_cache_id(gl_shader_stage stage)
83 {
84 static const enum brw_cache_id stage_ids[] = {
85 BRW_CACHE_VS_PROG,
86 BRW_CACHE_TCS_PROG,
87 BRW_CACHE_TES_PROG,
88 BRW_CACHE_GS_PROG,
89 BRW_CACHE_FS_PROG,
90 BRW_CACHE_CS_PROG,
91 };
92 assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
93 return stage_ids[stage];
94 }
95
96 static GLuint
97 hash_key(struct brw_cache_item *item)
98 {
99 uint32_t hash = _mesa_fnv32_1a_offset_bias;
100 hash = _mesa_fnv32_1a_accumulate(hash, item->cache_id);
101 hash = _mesa_fnv32_1a_accumulate_block(hash, item->key, item->key_size);
102
103 return hash;
104 }
105
106 static int
107 brw_cache_item_equals(const struct brw_cache_item *a,
108 const struct brw_cache_item *b)
109 {
110 return a->cache_id == b->cache_id &&
111 a->hash == b->hash &&
112 a->key_size == b->key_size &&
113 (memcmp(a->key, b->key, a->key_size) == 0);
114 }
115
116 static struct brw_cache_item *
117 search_cache(struct brw_cache *cache, GLuint hash,
118 struct brw_cache_item *lookup)
119 {
120 struct brw_cache_item *c;
121
122 #if 0
123 int bucketcount = 0;
124
125 for (c = cache->items[hash % cache->size]; c; c = c->next)
126 bucketcount++;
127
128 fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
129 cache->size, bucketcount, cache->n_items);
130 #endif
131
132 for (c = cache->items[hash % cache->size]; c; c = c->next) {
133 if (brw_cache_item_equals(lookup, c))
134 return c;
135 }
136
137 return NULL;
138 }
139
140
141 static void
142 rehash(struct brw_cache *cache)
143 {
144 struct brw_cache_item **items;
145 struct brw_cache_item *c, *next;
146 GLuint size, i;
147
148 size = cache->size * 3;
149 items = calloc(size, sizeof(*items));
150
151 for (i = 0; i < cache->size; i++)
152 for (c = cache->items[i]; c; c = next) {
153 next = c->next;
154 c->next = items[c->hash % size];
155 items[c->hash % size] = c;
156 }
157
158 free(cache->items);
159 cache->items = items;
160 cache->size = size;
161 }
162
163
164 /**
165 * Returns the buffer object matching cache_id and key, or NULL.
166 */
167 bool
168 brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
169 const void *key, GLuint key_size, uint32_t *inout_offset,
170 void *inout_prog_data, bool flag_state)
171 {
172 struct brw_cache_item *item;
173 struct brw_cache_item lookup;
174 GLuint hash;
175
176 lookup.cache_id = cache_id;
177 lookup.key = key;
178 lookup.key_size = key_size;
179 hash = hash_key(&lookup);
180 lookup.hash = hash;
181
182 item = search_cache(cache, hash, &lookup);
183
184 if (item == NULL)
185 return false;
186
187 void *prog_data = ((char *) item->key) + item->key_size;
188
189 if (item->offset != *inout_offset ||
190 prog_data != *((void **) inout_prog_data)) {
191 if (likely(flag_state))
192 cache->brw->ctx.NewDriverState |= (1 << cache_id);
193 *inout_offset = item->offset;
194 *((void **) inout_prog_data) = prog_data;
195 }
196
197 return true;
198 }
199
/* Replace the cache's program BO with a freshly-allocated one of new_size
 * bytes, copying across all bytes currently in use (cache->next_offset).
 * Existing items keep their offsets; only the backing BO changes.
 */
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   struct brw_bo *new_bo;

   perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
              (unsigned) cache->bo->size / 1024, new_size / 1024);

   new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
                         BRW_MEMZONE_SHADER);
   if (can_do_exec_capture(brw->screen))
      new_bo->kflags |= EXEC_OBJECT_CAPTURE;

   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
                          MAP_ASYNC | MAP_PERSISTENT);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
#ifdef USE_SSE41
      /* Presumably the old map is write-combined when not cache-coherent,
       * so use the SSE4.1 streaming-load copy there — TODO confirm.
       */
      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
      else
#endif
         memcpy(map, cache->map, cache->next_offset);
   }

   /* Release the old BO only after the copy above has completed. */
   brw_bo_unmap(cache->bo);
   brw_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->map = map;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}
238
239 /**
240 * Attempts to find an item in the cache with identical data.
241 */
242 static const struct brw_cache_item *
243 brw_lookup_prog(const struct brw_cache *cache,
244 enum brw_cache_id cache_id,
245 const void *data, unsigned data_size)
246 {
247 unsigned i;
248 const struct brw_cache_item *item;
249
250 for (i = 0; i < cache->size; i++) {
251 for (item = cache->items[i]; item; item = item->next) {
252 if (item->cache_id != cache_id || item->size != data_size ||
253 memcmp(cache->map + item->offset, data, item->size) != 0)
254 continue;
255
256 return item;
257 }
258 }
259
260 return NULL;
261 }
262
263 static uint32_t
264 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
265 {
266 uint32_t offset;
267
268 /* Allocate space in the cache BO for our new program. */
269 if (cache->next_offset + size > cache->bo->size) {
270 uint32_t new_size = cache->bo->size * 2;
271
272 while (cache->next_offset + size > new_size)
273 new_size *= 2;
274
275 brw_cache_new_bo(cache, new_size);
276 }
277
278 offset = cache->next_offset;
279
280 /* Programs are always 64-byte aligned, so set up the next one now */
281 cache->next_offset = ALIGN(offset + size, 64);
282
283 return offset;
284 }
285
286 const void *
287 brw_find_previous_compile(struct brw_cache *cache,
288 enum brw_cache_id cache_id,
289 unsigned program_string_id)
290 {
291 for (unsigned i = 0; i < cache->size; i++) {
292 for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
293 if (c->cache_id == cache_id &&
294 c->key->program_string_id == program_string_id) {
295 return c->key;
296 }
297 }
298 }
299
300 return NULL;
301 }
302
303 void
304 brw_upload_cache(struct brw_cache *cache,
305 enum brw_cache_id cache_id,
306 const void *key,
307 GLuint key_size,
308 const void *data,
309 GLuint data_size,
310 const void *prog_data,
311 GLuint prog_data_size,
312 uint32_t *out_offset,
313 void *out_prog_data)
314 {
315 struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
316 const struct brw_cache_item *matching_data =
317 brw_lookup_prog(cache, cache_id, data, data_size);
318 GLuint hash;
319 void *tmp;
320
321 item->cache_id = cache_id;
322 item->size = data_size;
323 item->key = key;
324 item->key_size = key_size;
325 item->prog_data_size = prog_data_size;
326 hash = hash_key(item);
327 item->hash = hash;
328
329 /* If we can find a matching prog in the cache already, then reuse the
330 * existing stuff without creating new copy into the underlying buffer
331 * object. This is notably useful for programs generating shaders at
332 * runtime, where multiple shaders may compile to the same thing in our
333 * backend.
334 */
335 if (matching_data) {
336 item->offset = matching_data->offset;
337 } else {
338 item->offset = brw_alloc_item_data(cache, data_size);
339
340 /* Copy data to the buffer */
341 memcpy(cache->map + item->offset, data, data_size);
342 }
343
344 /* Set up the memory containing the key and prog_data */
345 tmp = malloc(key_size + prog_data_size);
346
347 memcpy(tmp, key, key_size);
348 memcpy(tmp + key_size, prog_data, prog_data_size);
349
350 item->key = tmp;
351
352 if (cache->n_items > cache->size * 1.5f)
353 rehash(cache);
354
355 hash %= cache->size;
356 item->next = cache->items[hash];
357 cache->items[hash] = item;
358 cache->n_items++;
359
360 *out_offset = item->offset;
361 *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
362 cache->brw->ctx.NewDriverState |= 1 << cache_id;
363 }
364
365 void
366 brw_init_caches(struct brw_context *brw)
367 {
368 struct brw_cache *cache = &brw->cache;
369
370 cache->brw = brw;
371
372 cache->size = 7;
373 cache->n_items = 0;
374 cache->items =
375 calloc(cache->size, sizeof(struct brw_cache_item *));
376
377 cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
378 BRW_MEMZONE_SHADER);
379 if (can_do_exec_capture(brw->screen))
380 cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
381
382 cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
383 MAP_ASYNC | MAP_PERSISTENT);
384 }
385
386 static void
387 brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
388 {
389 struct brw_cache_item *c, *next;
390 GLuint i;
391
392 DBG("%s\n", __func__);
393
394 for (i = 0; i < cache->size; i++) {
395 for (c = cache->items[i]; c; c = next) {
396 next = c->next;
397 if (c->cache_id == BRW_CACHE_VS_PROG ||
398 c->cache_id == BRW_CACHE_TCS_PROG ||
399 c->cache_id == BRW_CACHE_TES_PROG ||
400 c->cache_id == BRW_CACHE_GS_PROG ||
401 c->cache_id == BRW_CACHE_FS_PROG ||
402 c->cache_id == BRW_CACHE_CS_PROG) {
403 const void *item_prog_data = ((char *)c->key) + c->key_size;
404 brw_stage_prog_data_free(item_prog_data);
405 }
406 free((void *)c->key);
407 free(c);
408 }
409 cache->items[i] = NULL;
410 }
411
412 cache->n_items = 0;
413
414 /* Start putting programs into the start of the BO again, since
415 * we'll never find the old results.
416 */
417 cache->next_offset = 0;
418
419 /* We need to make sure that the programs get regenerated, since
420 * any offsets leftover in brw_context will no longer be valid.
421 */
422 brw->NewGLState = ~0;
423 brw->ctx.NewDriverState = ~0ull;
424 brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
425 brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
426 brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
427 brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;
428
429 /* Also, NULL out any stale program pointers. */
430 brw->vs.base.prog_data = NULL;
431 brw->tcs.base.prog_data = NULL;
432 brw->tes.base.prog_data = NULL;
433 brw->gs.base.prog_data = NULL;
434 brw->wm.base.prog_data = NULL;
435 brw->cs.base.prog_data = NULL;
436
437 intel_batchbuffer_flush(brw);
438 }
439
440 void
441 brw_program_cache_check_size(struct brw_context *brw)
442 {
443 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
444 * state cache.
445 */
446 if (brw->cache.n_items > 2000) {
447 perf_debug("Exceeded state cache size limit. Clearing the set "
448 "of compiled programs, which will trigger recompiles\n");
449 brw_clear_cache(brw, &brw->cache);
450 brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
451 }
452 }
453
454
455 static void
456 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
457 {
458
459 DBG("%s\n", __func__);
460
461 /* This can be NULL if context creation failed early on */
462 if (cache->bo) {
463 brw_bo_unmap(cache->bo);
464 brw_bo_unreference(cache->bo);
465 cache->bo = NULL;
466 cache->map = NULL;
467 }
468 brw_clear_cache(brw, cache);
469 free(cache->items);
470 cache->items = NULL;
471 cache->size = 0;
472 }
473
474
/* Destroy all program caches owned by the context (currently just one). */
void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}
480
481 static const char *
482 cache_name(enum brw_cache_id cache_id)
483 {
484 switch (cache_id) {
485 case BRW_CACHE_VS_PROG:
486 return "VS kernel";
487 case BRW_CACHE_TCS_PROG:
488 return "TCS kernel";
489 case BRW_CACHE_TES_PROG:
490 return "TES kernel";
491 case BRW_CACHE_FF_GS_PROG:
492 return "Fixed-function GS kernel";
493 case BRW_CACHE_GS_PROG:
494 return "GS kernel";
495 case BRW_CACHE_CLIP_PROG:
496 return "CLIP kernel";
497 case BRW_CACHE_SF_PROG:
498 return "SF kernel";
499 case BRW_CACHE_FS_PROG:
500 return "FS kernel";
501 case BRW_CACHE_CS_PROG:
502 return "CS kernel";
503 default:
504 return "unknown";
505 }
506 }
507
508 void
509 brw_print_program_cache(struct brw_context *brw)
510 {
511 const struct brw_cache *cache = &brw->cache;
512 struct brw_cache_item *item;
513
514 for (unsigned i = 0; i < cache->size; i++) {
515 for (item = cache->items[i]; item; item = item->next) {
516 fprintf(stderr, "%s:\n", cache_name(i));
517 brw_disassemble(&brw->screen->devinfo, cache->map,
518 item->offset, item->size, stderr);
519 }
520 }
521 }