2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
32 /** @file brw_program_cache.c
34 * This file implements a simple program cache for 965. The consumers can
35 * query the hash table of programs using a cache_id and program key, and
36 * receive the corresponding program buffer object (plus associated auxiliary
37 * data) in return. Objects in the cache may not have relocations
38 * (pointers to other BOs) in them.
40 * The inner workings are a simple hash table based on a FNV-1a of the
43 * Replacement is not implemented. Instead, when the cache gets too
44 * big we throw out all of the cache data and let it get regenerated.
47 #include "util/imports.h"
48 #include "main/streaming-load-memcpy.h"
49 #include "x86/common_x86_asm.h"
50 #include "intel_batchbuffer.h"
51 #include "brw_state.h"
55 #include "brw_program.h"
56 #include "compiler/brw_eu.h"
58 #define FILE_DEBUG_FLAG DEBUG_STATE
60 struct brw_cache_item
{
62 * Effectively part of the key, cache_id identifies what kind of state
63 * buffer is involved, and also which dirty flag should set.
65 enum brw_cache_id cache_id
;
67 /** 32-bit hash of the key data */
70 /** for variable-sized keys */
72 GLuint prog_data_size
;
73 const struct brw_base_prog_key
*key
;
78 struct brw_cache_item
*next
;
82 brw_stage_cache_id(gl_shader_stage stage
)
84 static const enum brw_cache_id stage_ids
[] = {
92 assert((int)stage
>= 0 && stage
< ARRAY_SIZE(stage_ids
));
93 return stage_ids
[stage
];
97 hash_key(struct brw_cache_item
*item
)
99 uint32_t hash
= _mesa_fnv32_1a_offset_bias
;
100 hash
= _mesa_fnv32_1a_accumulate(hash
, item
->cache_id
);
101 hash
= _mesa_fnv32_1a_accumulate_block(hash
, item
->key
, item
->key_size
);
107 brw_cache_item_equals(const struct brw_cache_item
*a
,
108 const struct brw_cache_item
*b
)
110 return a
->cache_id
== b
->cache_id
&&
111 a
->hash
== b
->hash
&&
112 a
->key_size
== b
->key_size
&&
113 (memcmp(a
->key
, b
->key
, a
->key_size
) == 0);
116 static struct brw_cache_item
*
117 search_cache(struct brw_cache
*cache
, GLuint hash
,
118 struct brw_cache_item
*lookup
)
120 struct brw_cache_item
*c
;
125 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
)
128 fprintf(stderr
, "bucket %d/%d = %d/%d items\n", hash
% cache
->size
,
129 cache
->size
, bucketcount
, cache
->n_items
);
132 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
) {
133 if (brw_cache_item_equals(lookup
, c
))
142 rehash(struct brw_cache
*cache
)
144 struct brw_cache_item
**items
;
145 struct brw_cache_item
*c
, *next
;
148 size
= cache
->size
* 3;
149 items
= calloc(size
, sizeof(*items
));
151 for (i
= 0; i
< cache
->size
; i
++)
152 for (c
= cache
->items
[i
]; c
; c
= next
) {
154 c
->next
= items
[c
->hash
% size
];
155 items
[c
->hash
% size
] = c
;
159 cache
->items
= items
;
165 * Returns the buffer object matching cache_id and key, or NULL.
168 brw_search_cache(struct brw_cache
*cache
, enum brw_cache_id cache_id
,
169 const void *key
, GLuint key_size
, uint32_t *inout_offset
,
170 void *inout_prog_data
, bool flag_state
)
172 struct brw_cache_item
*item
;
173 struct brw_cache_item lookup
;
176 lookup
.cache_id
= cache_id
;
178 lookup
.key_size
= key_size
;
179 hash
= hash_key(&lookup
);
182 item
= search_cache(cache
, hash
, &lookup
);
187 void *prog_data
= ((char *) item
->key
) + item
->key_size
;
189 if (item
->offset
!= *inout_offset
||
190 prog_data
!= *((void **) inout_prog_data
)) {
191 if (likely(flag_state
))
192 cache
->brw
->ctx
.NewDriverState
|= (1 << cache_id
);
193 *inout_offset
= item
->offset
;
194 *((void **) inout_prog_data
) = prog_data
;
201 brw_cache_new_bo(struct brw_cache
*cache
, uint32_t new_size
)
203 struct brw_context
*brw
= cache
->brw
;
204 struct brw_bo
*new_bo
;
206 perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
207 (unsigned) cache
->bo
->size
/ 1024, new_size
/ 1024);
209 new_bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", new_size
,
211 if (can_do_exec_capture(brw
->screen
))
212 new_bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
214 void *map
= brw_bo_map(brw
, new_bo
, MAP_READ
| MAP_WRITE
|
215 MAP_ASYNC
| MAP_PERSISTENT
);
217 /* Copy any existing data that needs to be saved. */
218 if (cache
->next_offset
!= 0) {
220 if (!cache
->bo
->cache_coherent
&& cpu_has_sse4_1
)
221 _mesa_streaming_load_memcpy(map
, cache
->map
, cache
->next_offset
);
224 memcpy(map
, cache
->map
, cache
->next_offset
);
227 brw_bo_unmap(cache
->bo
);
228 brw_bo_unreference(cache
->bo
);
232 /* Since we have a new BO in place, we need to signal the units
233 * that depend on it (state base address on gen5+, or unit state before).
235 brw
->ctx
.NewDriverState
|= BRW_NEW_PROGRAM_CACHE
;
236 brw
->batch
.state_base_address_emitted
= false;
240 * Attempts to find an item in the cache with identical data.
242 static const struct brw_cache_item
*
243 brw_lookup_prog(const struct brw_cache
*cache
,
244 enum brw_cache_id cache_id
,
245 const void *data
, unsigned data_size
)
248 const struct brw_cache_item
*item
;
250 for (i
= 0; i
< cache
->size
; i
++) {
251 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
252 if (item
->cache_id
!= cache_id
|| item
->size
!= data_size
||
253 memcmp(cache
->map
+ item
->offset
, data
, item
->size
) != 0)
264 brw_alloc_item_data(struct brw_cache
*cache
, uint32_t size
)
268 /* Allocate space in the cache BO for our new program. */
269 if (cache
->next_offset
+ size
> cache
->bo
->size
) {
270 uint32_t new_size
= cache
->bo
->size
* 2;
272 while (cache
->next_offset
+ size
> new_size
)
275 brw_cache_new_bo(cache
, new_size
);
278 offset
= cache
->next_offset
;
280 /* Programs are always 64-byte aligned, so set up the next one now */
281 cache
->next_offset
= ALIGN(offset
+ size
, 64);
287 brw_find_previous_compile(struct brw_cache
*cache
,
288 enum brw_cache_id cache_id
,
289 unsigned program_string_id
)
291 for (unsigned i
= 0; i
< cache
->size
; i
++) {
292 for (struct brw_cache_item
*c
= cache
->items
[i
]; c
; c
= c
->next
) {
293 if (c
->cache_id
== cache_id
&&
294 c
->key
->program_string_id
== program_string_id
) {
304 brw_upload_cache(struct brw_cache
*cache
,
305 enum brw_cache_id cache_id
,
310 const void *prog_data
,
311 GLuint prog_data_size
,
312 uint32_t *out_offset
,
315 struct brw_cache_item
*item
= CALLOC_STRUCT(brw_cache_item
);
316 const struct brw_cache_item
*matching_data
=
317 brw_lookup_prog(cache
, cache_id
, data
, data_size
);
321 item
->cache_id
= cache_id
;
322 item
->size
= data_size
;
324 item
->key_size
= key_size
;
325 item
->prog_data_size
= prog_data_size
;
326 hash
= hash_key(item
);
329 /* If we can find a matching prog in the cache already, then reuse the
330 * existing stuff without creating new copy into the underlying buffer
331 * object. This is notably useful for programs generating shaders at
332 * runtime, where multiple shaders may compile to the same thing in our
336 item
->offset
= matching_data
->offset
;
338 item
->offset
= brw_alloc_item_data(cache
, data_size
);
340 /* Copy data to the buffer */
341 memcpy(cache
->map
+ item
->offset
, data
, data_size
);
344 /* Set up the memory containing the key and prog_data */
345 tmp
= malloc(key_size
+ prog_data_size
);
347 memcpy(tmp
, key
, key_size
);
348 memcpy(tmp
+ key_size
, prog_data
, prog_data_size
);
352 if (cache
->n_items
> cache
->size
* 1.5f
)
356 item
->next
= cache
->items
[hash
];
357 cache
->items
[hash
] = item
;
360 *out_offset
= item
->offset
;
361 *(void **)out_prog_data
= (void *)((char *)item
->key
+ item
->key_size
);
362 cache
->brw
->ctx
.NewDriverState
|= 1 << cache_id
;
366 brw_init_caches(struct brw_context
*brw
)
368 struct brw_cache
*cache
= &brw
->cache
;
375 calloc(cache
->size
, sizeof(struct brw_cache_item
*));
377 cache
->bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", 16384,
379 if (can_do_exec_capture(brw
->screen
))
380 cache
->bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
382 cache
->map
= brw_bo_map(brw
, cache
->bo
, MAP_READ
| MAP_WRITE
|
383 MAP_ASYNC
| MAP_PERSISTENT
);
387 brw_clear_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
389 struct brw_cache_item
*c
, *next
;
392 DBG("%s\n", __func__
);
394 for (i
= 0; i
< cache
->size
; i
++) {
395 for (c
= cache
->items
[i
]; c
; c
= next
) {
397 if (c
->cache_id
== BRW_CACHE_VS_PROG
||
398 c
->cache_id
== BRW_CACHE_TCS_PROG
||
399 c
->cache_id
== BRW_CACHE_TES_PROG
||
400 c
->cache_id
== BRW_CACHE_GS_PROG
||
401 c
->cache_id
== BRW_CACHE_FS_PROG
||
402 c
->cache_id
== BRW_CACHE_CS_PROG
) {
403 const void *item_prog_data
= ((char *)c
->key
) + c
->key_size
;
404 brw_stage_prog_data_free(item_prog_data
);
406 free((void *)c
->key
);
409 cache
->items
[i
] = NULL
;
414 /* Start putting programs into the start of the BO again, since
415 * we'll never find the old results.
417 cache
->next_offset
= 0;
419 /* We need to make sure that the programs get regenerated, since
420 * any offsets leftover in brw_context will no longer be valid.
422 brw
->NewGLState
= ~0;
423 brw
->ctx
.NewDriverState
= ~0ull;
424 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].mesa
= ~0;
425 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].brw
= ~0ull;
426 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].mesa
= ~0;
427 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].brw
= ~0ull;
429 /* Also, NULL out any stale program pointers. */
430 brw
->vs
.base
.prog_data
= NULL
;
431 brw
->tcs
.base
.prog_data
= NULL
;
432 brw
->tes
.base
.prog_data
= NULL
;
433 brw
->gs
.base
.prog_data
= NULL
;
434 brw
->wm
.base
.prog_data
= NULL
;
435 brw
->cs
.base
.prog_data
= NULL
;
437 intel_batchbuffer_flush(brw
);
441 brw_program_cache_check_size(struct brw_context
*brw
)
443 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
446 if (brw
->cache
.n_items
> 2000) {
447 perf_debug("Exceeded state cache size limit. Clearing the set "
448 "of compiled programs, which will trigger recompiles\n");
449 brw_clear_cache(brw
, &brw
->cache
);
450 brw_cache_new_bo(&brw
->cache
, brw
->cache
.bo
->size
);
456 brw_destroy_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
459 DBG("%s\n", __func__
);
461 /* This can be NULL if context creation failed early on */
463 brw_bo_unmap(cache
->bo
);
464 brw_bo_unreference(cache
->bo
);
468 brw_clear_cache(brw
, cache
);
476 brw_destroy_caches(struct brw_context
*brw
)
478 brw_destroy_cache(brw
, &brw
->cache
);
482 cache_name(enum brw_cache_id cache_id
)
485 case BRW_CACHE_VS_PROG
:
487 case BRW_CACHE_TCS_PROG
:
489 case BRW_CACHE_TES_PROG
:
491 case BRW_CACHE_FF_GS_PROG
:
492 return "Fixed-function GS kernel";
493 case BRW_CACHE_GS_PROG
:
495 case BRW_CACHE_CLIP_PROG
:
496 return "CLIP kernel";
497 case BRW_CACHE_SF_PROG
:
499 case BRW_CACHE_FS_PROG
:
501 case BRW_CACHE_CS_PROG
:
509 brw_print_program_cache(struct brw_context
*brw
)
511 const struct brw_cache
*cache
= &brw
->cache
;
512 struct brw_cache_item
*item
;
514 for (unsigned i
= 0; i
< cache
->size
; i
++) {
515 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
516 fprintf(stderr
, "%s:\n", cache_name(i
));
517 brw_disassemble(&brw
->screen
->devinfo
, cache
->map
,
518 item
->offset
, item
->size
, stderr
);