/* mesa.git: src/mesa/drivers/dri/i965/brw_program_cache.c
 * (from commit "i965/gen9: Optimize slice and subslice load balancing behavior.")
 */
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32 /** @file brw_program_cache.c
33 *
34 * This file implements a simple program cache for 965. The consumers can
35 * query the hash table of programs using a cache_id and program key, and
36 * receive the corresponding program buffer object (plus associated auxiliary
37 * data) in return. Objects in the cache may not have relocations
38 * (pointers to other BOs) in them.
39 *
40 * The inner workings are a simple hash table based on a CRC of the
41 * key data.
42 *
43 * Replacement is not implemented. Instead, when the cache gets too
44 * big we throw out all of the cache data and let it get regenerated.
45 */
46
47 #include "main/imports.h"
48 #include "main/streaming-load-memcpy.h"
49 #include "x86/common_x86_asm.h"
50 #include "intel_batchbuffer.h"
51 #include "brw_state.h"
52 #include "brw_wm.h"
53 #include "brw_gs.h"
54 #include "brw_cs.h"
55 #include "brw_program.h"
56 #include "compiler/brw_eu.h"
57
58 #define FILE_DEBUG_FLAG DEBUG_STATE
59
/** One entry in the program cache's chained hash table. */
struct brw_cache_item {
   /**
    * Effectively part of the key, cache_id identifies what kind of state
    * buffer is involved, and also which dirty flag should set.
    */
   enum brw_cache_id cache_id;

   /** 32-bit hash of the key data */
   GLuint hash;

   /** for variable-sized keys */
   GLuint key_size;
   GLuint prog_data_size;
   /* Points at a single malloc'd allocation holding the key immediately
    * followed by the prog_data copy (see brw_upload_cache).
    */
   const struct brw_base_prog_key *key;

   /* Location and size of the assembled program within the cache BO. */
   uint32_t offset;
   uint32_t size;

   /* Next item in the same hash bucket (separate chaining). */
   struct brw_cache_item *next;
};
80
81 enum brw_cache_id
82 brw_stage_cache_id(gl_shader_stage stage)
83 {
84 static const enum brw_cache_id stage_ids[] = {
85 BRW_CACHE_VS_PROG,
86 BRW_CACHE_TCS_PROG,
87 BRW_CACHE_TES_PROG,
88 BRW_CACHE_GS_PROG,
89 BRW_CACHE_FS_PROG,
90 BRW_CACHE_CS_PROG,
91 };
92 assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
93 return stage_ids[stage];
94 }
95
96 static GLuint
97 hash_key(struct brw_cache_item *item)
98 {
99 GLuint *ikey = (GLuint *)item->key;
100 GLuint hash = item->cache_id, i;
101
102 assert(item->key_size % 4 == 0);
103
104 /* I'm sure this can be improved on:
105 */
106 for (i = 0; i < item->key_size/4; i++) {
107 hash ^= ikey[i];
108 hash = (hash << 5) | (hash >> 27);
109 }
110
111 return hash;
112 }
113
114 static int
115 brw_cache_item_equals(const struct brw_cache_item *a,
116 const struct brw_cache_item *b)
117 {
118 return a->cache_id == b->cache_id &&
119 a->hash == b->hash &&
120 a->key_size == b->key_size &&
121 (memcmp(a->key, b->key, a->key_size) == 0);
122 }
123
124 static struct brw_cache_item *
125 search_cache(struct brw_cache *cache, GLuint hash,
126 struct brw_cache_item *lookup)
127 {
128 struct brw_cache_item *c;
129
130 #if 0
131 int bucketcount = 0;
132
133 for (c = cache->items[hash % cache->size]; c; c = c->next)
134 bucketcount++;
135
136 fprintf(stderr, "bucket %d/%d = %d/%d items\n", hash % cache->size,
137 cache->size, bucketcount, cache->n_items);
138 #endif
139
140 for (c = cache->items[hash % cache->size]; c; c = c->next) {
141 if (brw_cache_item_equals(lookup, c))
142 return c;
143 }
144
145 return NULL;
146 }
147
148
149 static void
150 rehash(struct brw_cache *cache)
151 {
152 struct brw_cache_item **items;
153 struct brw_cache_item *c, *next;
154 GLuint size, i;
155
156 size = cache->size * 3;
157 items = calloc(size, sizeof(*items));
158
159 for (i = 0; i < cache->size; i++)
160 for (c = cache->items[i]; c; c = next) {
161 next = c->next;
162 c->next = items[c->hash % size];
163 items[c->hash % size] = c;
164 }
165
166 free(cache->items);
167 cache->items = items;
168 cache->size = size;
169 }
170
171
172 /**
173 * Returns the buffer object matching cache_id and key, or NULL.
174 */
175 bool
176 brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
177 const void *key, GLuint key_size, uint32_t *inout_offset,
178 void *inout_prog_data, bool flag_state)
179 {
180 struct brw_cache_item *item;
181 struct brw_cache_item lookup;
182 GLuint hash;
183
184 lookup.cache_id = cache_id;
185 lookup.key = key;
186 lookup.key_size = key_size;
187 hash = hash_key(&lookup);
188 lookup.hash = hash;
189
190 item = search_cache(cache, hash, &lookup);
191
192 if (item == NULL)
193 return false;
194
195 void *prog_data = ((char *) item->key) + item->key_size;
196
197 if (item->offset != *inout_offset ||
198 prog_data != *((void **) inout_prog_data)) {
199 if (likely(flag_state))
200 cache->brw->ctx.NewDriverState |= (1 << cache_id);
201 *inout_offset = item->offset;
202 *((void **) inout_prog_data) = prog_data;
203 }
204
205 return true;
206 }
207
/**
 * Replace the cache's backing BO with a new one of new_size bytes, copying
 * over all program data uploaded so far (cache->next_offset bytes).
 *
 * Also flags the state that embeds the BO's address for re-emission.
 */
static void
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
   struct brw_context *brw = cache->brw;
   struct brw_bo *new_bo;

   perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
              (unsigned) cache->bo->size / 1024, new_size / 1024);

   new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
                         BRW_MEMZONE_SHADER);
   /* Mark the BO for inclusion in kernel error-state dumps if supported. */
   if (can_do_exec_capture(brw->screen))
      new_bo->kflags |= EXEC_OBJECT_CAPTURE;

   void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
                                       MAP_ASYNC | MAP_PERSISTENT);

   /* Copy any existing data that needs to be saved. */
   if (cache->next_offset != 0) {
#ifdef USE_SSE41
      /* Streaming loads speed up reading back from a non-coherent map. */
      if (!cache->bo->cache_coherent && cpu_has_sse4_1)
         _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
      else
#endif
         memcpy(map, cache->map, cache->next_offset);
   }

   brw_bo_unmap(cache->bo);
   brw_bo_unreference(cache->bo);
   cache->bo = new_bo;
   cache->map = map;

   /* Since we have a new BO in place, we need to signal the units
    * that depend on it (state base address on gen5+, or unit state before).
    */
   brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE;
   brw->batch.state_base_address_emitted = false;
}
246
247 /**
248 * Attempts to find an item in the cache with identical data.
249 */
250 static const struct brw_cache_item *
251 brw_lookup_prog(const struct brw_cache *cache,
252 enum brw_cache_id cache_id,
253 const void *data, unsigned data_size)
254 {
255 unsigned i;
256 const struct brw_cache_item *item;
257
258 for (i = 0; i < cache->size; i++) {
259 for (item = cache->items[i]; item; item = item->next) {
260 if (item->cache_id != cache_id || item->size != data_size ||
261 memcmp(cache->map + item->offset, data, item->size) != 0)
262 continue;
263
264 return item;
265 }
266 }
267
268 return NULL;
269 }
270
271 static uint32_t
272 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
273 {
274 uint32_t offset;
275
276 /* Allocate space in the cache BO for our new program. */
277 if (cache->next_offset + size > cache->bo->size) {
278 uint32_t new_size = cache->bo->size * 2;
279
280 while (cache->next_offset + size > new_size)
281 new_size *= 2;
282
283 brw_cache_new_bo(cache, new_size);
284 }
285
286 offset = cache->next_offset;
287
288 /* Programs are always 64-byte aligned, so set up the next one now */
289 cache->next_offset = ALIGN(offset + size, 64);
290
291 return offset;
292 }
293
294 const void *
295 brw_find_previous_compile(struct brw_cache *cache,
296 enum brw_cache_id cache_id,
297 unsigned program_string_id)
298 {
299 for (unsigned i = 0; i < cache->size; i++) {
300 for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
301 if (c->cache_id == cache_id &&
302 c->key->program_string_id == program_string_id) {
303 return c->key;
304 }
305 }
306 }
307
308 return NULL;
309 }
310
311 void
312 brw_upload_cache(struct brw_cache *cache,
313 enum brw_cache_id cache_id,
314 const void *key,
315 GLuint key_size,
316 const void *data,
317 GLuint data_size,
318 const void *prog_data,
319 GLuint prog_data_size,
320 uint32_t *out_offset,
321 void *out_prog_data)
322 {
323 struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
324 const struct brw_cache_item *matching_data =
325 brw_lookup_prog(cache, cache_id, data, data_size);
326 GLuint hash;
327 void *tmp;
328
329 item->cache_id = cache_id;
330 item->size = data_size;
331 item->key = key;
332 item->key_size = key_size;
333 item->prog_data_size = prog_data_size;
334 hash = hash_key(item);
335 item->hash = hash;
336
337 /* If we can find a matching prog in the cache already, then reuse the
338 * existing stuff without creating new copy into the underlying buffer
339 * object. This is notably useful for programs generating shaders at
340 * runtime, where multiple shaders may compile to the same thing in our
341 * backend.
342 */
343 if (matching_data) {
344 item->offset = matching_data->offset;
345 } else {
346 item->offset = brw_alloc_item_data(cache, data_size);
347
348 /* Copy data to the buffer */
349 memcpy(cache->map + item->offset, data, data_size);
350 }
351
352 /* Set up the memory containing the key and prog_data */
353 tmp = malloc(key_size + prog_data_size);
354
355 memcpy(tmp, key, key_size);
356 memcpy(tmp + key_size, prog_data, prog_data_size);
357
358 item->key = tmp;
359
360 if (cache->n_items > cache->size * 1.5f)
361 rehash(cache);
362
363 hash %= cache->size;
364 item->next = cache->items[hash];
365 cache->items[hash] = item;
366 cache->n_items++;
367
368 *out_offset = item->offset;
369 *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
370 cache->brw->ctx.NewDriverState |= 1 << cache_id;
371 }
372
373 void
374 brw_init_caches(struct brw_context *brw)
375 {
376 struct brw_cache *cache = &brw->cache;
377
378 cache->brw = brw;
379
380 cache->size = 7;
381 cache->n_items = 0;
382 cache->items =
383 calloc(cache->size, sizeof(struct brw_cache_item *));
384
385 cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
386 BRW_MEMZONE_SHADER);
387 if (can_do_exec_capture(brw->screen))
388 cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
389
390 cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
391 MAP_ASYNC | MAP_PERSISTENT);
392 }
393
394 static void
395 brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
396 {
397 struct brw_cache_item *c, *next;
398 GLuint i;
399
400 DBG("%s\n", __func__);
401
402 for (i = 0; i < cache->size; i++) {
403 for (c = cache->items[i]; c; c = next) {
404 next = c->next;
405 if (c->cache_id == BRW_CACHE_VS_PROG ||
406 c->cache_id == BRW_CACHE_TCS_PROG ||
407 c->cache_id == BRW_CACHE_TES_PROG ||
408 c->cache_id == BRW_CACHE_GS_PROG ||
409 c->cache_id == BRW_CACHE_FS_PROG ||
410 c->cache_id == BRW_CACHE_CS_PROG) {
411 const void *item_prog_data = ((char *)c->key) + c->key_size;
412 brw_stage_prog_data_free(item_prog_data);
413 }
414 free((void *)c->key);
415 free(c);
416 }
417 cache->items[i] = NULL;
418 }
419
420 cache->n_items = 0;
421
422 /* Start putting programs into the start of the BO again, since
423 * we'll never find the old results.
424 */
425 cache->next_offset = 0;
426
427 /* We need to make sure that the programs get regenerated, since
428 * any offsets leftover in brw_context will no longer be valid.
429 */
430 brw->NewGLState = ~0;
431 brw->ctx.NewDriverState = ~0ull;
432 brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0;
433 brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull;
434 brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0;
435 brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull;
436
437 /* Also, NULL out any stale program pointers. */
438 brw->vs.base.prog_data = NULL;
439 brw->tcs.base.prog_data = NULL;
440 brw->tes.base.prog_data = NULL;
441 brw->gs.base.prog_data = NULL;
442 brw->wm.base.prog_data = NULL;
443 brw->cs.base.prog_data = NULL;
444
445 intel_batchbuffer_flush(brw);
446 }
447
448 void
449 brw_program_cache_check_size(struct brw_context *brw)
450 {
451 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
452 * state cache.
453 */
454 if (brw->cache.n_items > 2000) {
455 perf_debug("Exceeded state cache size limit. Clearing the set "
456 "of compiled programs, which will trigger recompiles\n");
457 brw_clear_cache(brw, &brw->cache);
458 brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
459 }
460 }
461
462
463 static void
464 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
465 {
466
467 DBG("%s\n", __func__);
468
469 /* This can be NULL if context creation failed early on */
470 if (cache->bo) {
471 brw_bo_unmap(cache->bo);
472 brw_bo_unreference(cache->bo);
473 cache->bo = NULL;
474 cache->map = NULL;
475 }
476 brw_clear_cache(brw, cache);
477 free(cache->items);
478 cache->items = NULL;
479 cache->size = 0;
480 }
481
482
/** Tear down the context's program cache (called at context destruction). */
void
brw_destroy_caches(struct brw_context *brw)
{
   brw_destroy_cache(brw, &brw->cache);
}
488
/** Human-readable name for a cache id, used in debug dumps. */
static const char *
cache_name(enum brw_cache_id cache_id)
{
   switch (cache_id) {
   case BRW_CACHE_VS_PROG:
      return "VS kernel";
   case BRW_CACHE_TCS_PROG:
      return "TCS kernel";
   case BRW_CACHE_TES_PROG:
      return "TES kernel";
   case BRW_CACHE_FF_GS_PROG:
      return "Fixed-function GS kernel";
   case BRW_CACHE_GS_PROG:
      return "GS kernel";
   case BRW_CACHE_CLIP_PROG:
      return "CLIP kernel";
   case BRW_CACHE_SF_PROG:
      return "SF kernel";
   case BRW_CACHE_FS_PROG:
      return "FS kernel";
   case BRW_CACHE_CS_PROG:
      return "CS kernel";
   default:
      return "unknown";
   }
}
515
516 void
517 brw_print_program_cache(struct brw_context *brw)
518 {
519 const struct brw_cache *cache = &brw->cache;
520 struct brw_cache_item *item;
521
522 for (unsigned i = 0; i < cache->size; i++) {
523 for (item = cache->items[i]; item; item = item->next) {
524 fprintf(stderr, "%s:\n", cache_name(i));
525 brw_disassemble(&brw->screen->devinfo, cache->map,
526 item->offset, item->size, stderr);
527 }
528 }
529 }