2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
/** @file brw_program_cache.c
 *
 * This file implements a simple program cache for 965.  The consumers can
 * query the hash table of programs using a cache_id and program key, and
 * receive the corresponding program buffer object (plus associated auxiliary
 * data) in return.  Objects in the cache may not have relocations
 * (pointers to other BOs) in them.
 *
 * The inner workings are a simple hash table based on a FNV-1a of the
 * key data.
 *
 * Replacement is not implemented.  Instead, when the cache gets too
 * big we throw out all of the cache data and let it get regenerated.
 */
47 #include "util/imports.h"
48 #include "main/streaming-load-memcpy.h"
49 #include "x86/common_x86_asm.h"
50 #include "intel_batchbuffer.h"
51 #include "brw_state.h"
55 #include "brw_program.h"
56 #include "compiler/brw_eu.h"
57 #include "util/u_memory.h"
59 #define FILE_DEBUG_FLAG DEBUG_STATE
61 struct brw_cache_item
{
63 * Effectively part of the key, cache_id identifies what kind of state
64 * buffer is involved, and also which dirty flag should set.
66 enum brw_cache_id cache_id
;
68 /** 32-bit hash of the key data */
71 /** for variable-sized keys */
73 GLuint prog_data_size
;
74 const struct brw_base_prog_key
*key
;
79 struct brw_cache_item
*next
;
83 brw_stage_cache_id(gl_shader_stage stage
)
85 static const enum brw_cache_id stage_ids
[] = {
93 assert((int)stage
>= 0 && stage
< ARRAY_SIZE(stage_ids
));
94 return stage_ids
[stage
];
98 hash_key(struct brw_cache_item
*item
)
100 uint32_t hash
= _mesa_fnv32_1a_offset_bias
;
101 hash
= _mesa_fnv32_1a_accumulate(hash
, item
->cache_id
);
102 hash
= _mesa_fnv32_1a_accumulate_block(hash
, item
->key
, item
->key_size
);
108 brw_cache_item_equals(const struct brw_cache_item
*a
,
109 const struct brw_cache_item
*b
)
111 return a
->cache_id
== b
->cache_id
&&
112 a
->hash
== b
->hash
&&
113 a
->key_size
== b
->key_size
&&
114 (memcmp(a
->key
, b
->key
, a
->key_size
) == 0);
117 static struct brw_cache_item
*
118 search_cache(struct brw_cache
*cache
, GLuint hash
,
119 struct brw_cache_item
*lookup
)
121 struct brw_cache_item
*c
;
126 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
)
129 fprintf(stderr
, "bucket %d/%d = %d/%d items\n", hash
% cache
->size
,
130 cache
->size
, bucketcount
, cache
->n_items
);
133 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
) {
134 if (brw_cache_item_equals(lookup
, c
))
143 rehash(struct brw_cache
*cache
)
145 struct brw_cache_item
**items
;
146 struct brw_cache_item
*c
, *next
;
149 size
= cache
->size
* 3;
150 items
= calloc(size
, sizeof(*items
));
152 for (i
= 0; i
< cache
->size
; i
++)
153 for (c
= cache
->items
[i
]; c
; c
= next
) {
155 c
->next
= items
[c
->hash
% size
];
156 items
[c
->hash
% size
] = c
;
160 cache
->items
= items
;
166 * Returns the buffer object matching cache_id and key, or NULL.
169 brw_search_cache(struct brw_cache
*cache
, enum brw_cache_id cache_id
,
170 const void *key
, GLuint key_size
, uint32_t *inout_offset
,
171 void *inout_prog_data
, bool flag_state
)
173 struct brw_cache_item
*item
;
174 struct brw_cache_item lookup
;
177 lookup
.cache_id
= cache_id
;
179 lookup
.key_size
= key_size
;
180 hash
= hash_key(&lookup
);
183 item
= search_cache(cache
, hash
, &lookup
);
188 void *prog_data
= ((char *) item
->key
) + item
->key_size
;
190 if (item
->offset
!= *inout_offset
||
191 prog_data
!= *((void **) inout_prog_data
)) {
192 if (likely(flag_state
))
193 cache
->brw
->ctx
.NewDriverState
|= (1 << cache_id
);
194 *inout_offset
= item
->offset
;
195 *((void **) inout_prog_data
) = prog_data
;
202 brw_cache_new_bo(struct brw_cache
*cache
, uint32_t new_size
)
204 struct brw_context
*brw
= cache
->brw
;
205 struct brw_bo
*new_bo
;
207 perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
208 (unsigned) cache
->bo
->size
/ 1024, new_size
/ 1024);
210 new_bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", new_size
,
212 if (can_do_exec_capture(brw
->screen
))
213 new_bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
215 void *map
= brw_bo_map(brw
, new_bo
, MAP_READ
| MAP_WRITE
|
216 MAP_ASYNC
| MAP_PERSISTENT
);
218 /* Copy any existing data that needs to be saved. */
219 if (cache
->next_offset
!= 0) {
221 if (!cache
->bo
->cache_coherent
&& cpu_has_sse4_1
)
222 _mesa_streaming_load_memcpy(map
, cache
->map
, cache
->next_offset
);
225 memcpy(map
, cache
->map
, cache
->next_offset
);
228 brw_bo_unmap(cache
->bo
);
229 brw_bo_unreference(cache
->bo
);
233 /* Since we have a new BO in place, we need to signal the units
234 * that depend on it (state base address on gen5+, or unit state before).
236 brw
->ctx
.NewDriverState
|= BRW_NEW_PROGRAM_CACHE
;
237 brw
->batch
.state_base_address_emitted
= false;
241 * Attempts to find an item in the cache with identical data.
243 static const struct brw_cache_item
*
244 brw_lookup_prog(const struct brw_cache
*cache
,
245 enum brw_cache_id cache_id
,
246 const void *data
, unsigned data_size
)
249 const struct brw_cache_item
*item
;
251 for (i
= 0; i
< cache
->size
; i
++) {
252 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
253 if (item
->cache_id
!= cache_id
|| item
->size
!= data_size
||
254 memcmp(cache
->map
+ item
->offset
, data
, item
->size
) != 0)
265 brw_alloc_item_data(struct brw_cache
*cache
, uint32_t size
)
269 /* Allocate space in the cache BO for our new program. */
270 if (cache
->next_offset
+ size
> cache
->bo
->size
) {
271 uint32_t new_size
= cache
->bo
->size
* 2;
273 while (cache
->next_offset
+ size
> new_size
)
276 brw_cache_new_bo(cache
, new_size
);
279 offset
= cache
->next_offset
;
281 /* Programs are always 64-byte aligned, so set up the next one now */
282 cache
->next_offset
= ALIGN(offset
+ size
, 64);
288 brw_find_previous_compile(struct brw_cache
*cache
,
289 enum brw_cache_id cache_id
,
290 unsigned program_string_id
)
292 for (unsigned i
= 0; i
< cache
->size
; i
++) {
293 for (struct brw_cache_item
*c
= cache
->items
[i
]; c
; c
= c
->next
) {
294 if (c
->cache_id
== cache_id
&&
295 c
->key
->program_string_id
== program_string_id
) {
305 brw_upload_cache(struct brw_cache
*cache
,
306 enum brw_cache_id cache_id
,
311 const void *prog_data
,
312 GLuint prog_data_size
,
313 uint32_t *out_offset
,
316 struct brw_cache_item
*item
= CALLOC_STRUCT(brw_cache_item
);
317 const struct brw_cache_item
*matching_data
=
318 brw_lookup_prog(cache
, cache_id
, data
, data_size
);
322 item
->cache_id
= cache_id
;
323 item
->size
= data_size
;
325 item
->key_size
= key_size
;
326 item
->prog_data_size
= prog_data_size
;
327 hash
= hash_key(item
);
330 /* If we can find a matching prog in the cache already, then reuse the
331 * existing stuff without creating new copy into the underlying buffer
332 * object. This is notably useful for programs generating shaders at
333 * runtime, where multiple shaders may compile to the same thing in our
337 item
->offset
= matching_data
->offset
;
339 item
->offset
= brw_alloc_item_data(cache
, data_size
);
341 /* Copy data to the buffer */
342 memcpy(cache
->map
+ item
->offset
, data
, data_size
);
345 /* Set up the memory containing the key and prog_data */
346 tmp
= malloc(key_size
+ prog_data_size
);
348 memcpy(tmp
, key
, key_size
);
349 memcpy(tmp
+ key_size
, prog_data
, prog_data_size
);
353 if (cache
->n_items
> cache
->size
* 1.5f
)
357 item
->next
= cache
->items
[hash
];
358 cache
->items
[hash
] = item
;
361 *out_offset
= item
->offset
;
362 *(void **)out_prog_data
= (void *)((char *)item
->key
+ item
->key_size
);
363 cache
->brw
->ctx
.NewDriverState
|= 1 << cache_id
;
367 brw_init_caches(struct brw_context
*brw
)
369 struct brw_cache
*cache
= &brw
->cache
;
376 calloc(cache
->size
, sizeof(struct brw_cache_item
*));
378 cache
->bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", 16384,
380 if (can_do_exec_capture(brw
->screen
))
381 cache
->bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
383 cache
->map
= brw_bo_map(brw
, cache
->bo
, MAP_READ
| MAP_WRITE
|
384 MAP_ASYNC
| MAP_PERSISTENT
);
388 brw_clear_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
390 struct brw_cache_item
*c
, *next
;
393 DBG("%s\n", __func__
);
395 for (i
= 0; i
< cache
->size
; i
++) {
396 for (c
= cache
->items
[i
]; c
; c
= next
) {
398 if (c
->cache_id
== BRW_CACHE_VS_PROG
||
399 c
->cache_id
== BRW_CACHE_TCS_PROG
||
400 c
->cache_id
== BRW_CACHE_TES_PROG
||
401 c
->cache_id
== BRW_CACHE_GS_PROG
||
402 c
->cache_id
== BRW_CACHE_FS_PROG
||
403 c
->cache_id
== BRW_CACHE_CS_PROG
) {
404 const void *item_prog_data
= ((char *)c
->key
) + c
->key_size
;
405 brw_stage_prog_data_free(item_prog_data
);
407 free((void *)c
->key
);
410 cache
->items
[i
] = NULL
;
415 /* Start putting programs into the start of the BO again, since
416 * we'll never find the old results.
418 cache
->next_offset
= 0;
420 /* We need to make sure that the programs get regenerated, since
421 * any offsets leftover in brw_context will no longer be valid.
423 brw
->NewGLState
= ~0;
424 brw
->ctx
.NewDriverState
= ~0ull;
425 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].mesa
= ~0;
426 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].brw
= ~0ull;
427 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].mesa
= ~0;
428 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].brw
= ~0ull;
430 /* Also, NULL out any stale program pointers. */
431 brw
->vs
.base
.prog_data
= NULL
;
432 brw
->tcs
.base
.prog_data
= NULL
;
433 brw
->tes
.base
.prog_data
= NULL
;
434 brw
->gs
.base
.prog_data
= NULL
;
435 brw
->wm
.base
.prog_data
= NULL
;
436 brw
->cs
.base
.prog_data
= NULL
;
438 intel_batchbuffer_flush(brw
);
442 brw_program_cache_check_size(struct brw_context
*brw
)
444 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
447 if (brw
->cache
.n_items
> 2000) {
448 perf_debug("Exceeded state cache size limit. Clearing the set "
449 "of compiled programs, which will trigger recompiles\n");
450 brw_clear_cache(brw
, &brw
->cache
);
451 brw_cache_new_bo(&brw
->cache
, brw
->cache
.bo
->size
);
457 brw_destroy_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
460 DBG("%s\n", __func__
);
462 /* This can be NULL if context creation failed early on */
464 brw_bo_unmap(cache
->bo
);
465 brw_bo_unreference(cache
->bo
);
469 brw_clear_cache(brw
, cache
);
477 brw_destroy_caches(struct brw_context
*brw
)
479 brw_destroy_cache(brw
, &brw
->cache
);
483 cache_name(enum brw_cache_id cache_id
)
486 case BRW_CACHE_VS_PROG
:
488 case BRW_CACHE_TCS_PROG
:
490 case BRW_CACHE_TES_PROG
:
492 case BRW_CACHE_FF_GS_PROG
:
493 return "Fixed-function GS kernel";
494 case BRW_CACHE_GS_PROG
:
496 case BRW_CACHE_CLIP_PROG
:
497 return "CLIP kernel";
498 case BRW_CACHE_SF_PROG
:
500 case BRW_CACHE_FS_PROG
:
502 case BRW_CACHE_CS_PROG
:
510 brw_print_program_cache(struct brw_context
*brw
)
512 const struct brw_cache
*cache
= &brw
->cache
;
513 struct brw_cache_item
*item
;
515 for (unsigned i
= 0; i
< cache
->size
; i
++) {
516 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
517 fprintf(stderr
, "%s:\n", cache_name(i
));
518 brw_disassemble(&brw
->screen
->devinfo
, cache
->map
,
519 item
->offset
, item
->size
, stderr
);