2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
32 /** @file brw_program_cache.c
34 * This file implements a simple program cache for 965. The consumers can
35 * query the hash table of programs using a cache_id and program key, and
36 * receive the corresponding program buffer object (plus associated auxiliary
37 * data) in return. Objects in the cache may not have relocations
38 * (pointers to other BOs) in them.
40 * The inner workings are a simple hash table based on an XXH32 hash of the cache_id and key data.
43 * Replacement is not implemented. Instead, when the cache gets too
44 * big we throw out all of the cache data and let it get regenerated.
47 #include "main/streaming-load-memcpy.h"
48 #include "x86/common_x86_asm.h"
49 #include "intel_batchbuffer.h"
50 #include "brw_state.h"
54 #include "brw_program.h"
55 #include "compiler/brw_eu.h"
56 #include "util/u_memory.h"
57 #define XXH_INLINE_ALL
58 #include "util/xxhash.h"
60 #define FILE_DEBUG_FLAG DEBUG_STATE
62 struct brw_cache_item
{
64 * Effectively part of the key, cache_id identifies what kind of state
65 * buffer is involved, and also which dirty flag should set.
67 enum brw_cache_id cache_id
;
69 /** 32-bit hash of the key data */
72 /** for variable-sized keys */
74 GLuint prog_data_size
;
75 const struct brw_base_prog_key
*key
;
80 struct brw_cache_item
*next
;
84 brw_stage_cache_id(gl_shader_stage stage
)
86 static const enum brw_cache_id stage_ids
[] = {
94 assert((int)stage
>= 0 && stage
< ARRAY_SIZE(stage_ids
));
95 return stage_ids
[stage
];
99 hash_key(struct brw_cache_item
*item
)
102 hash
= XXH32(&item
->cache_id
, sizeof(item
->cache_id
), hash
);
103 hash
= XXH32(item
->key
, item
->key_size
, hash
);
109 brw_cache_item_equals(const struct brw_cache_item
*a
,
110 const struct brw_cache_item
*b
)
112 return a
->cache_id
== b
->cache_id
&&
113 a
->hash
== b
->hash
&&
114 a
->key_size
== b
->key_size
&&
115 (memcmp(a
->key
, b
->key
, a
->key_size
) == 0);
118 static struct brw_cache_item
*
119 search_cache(struct brw_cache
*cache
, GLuint hash
,
120 struct brw_cache_item
*lookup
)
122 struct brw_cache_item
*c
;
127 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
)
130 fprintf(stderr
, "bucket %d/%d = %d/%d items\n", hash
% cache
->size
,
131 cache
->size
, bucketcount
, cache
->n_items
);
134 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
) {
135 if (brw_cache_item_equals(lookup
, c
))
144 rehash(struct brw_cache
*cache
)
146 struct brw_cache_item
**items
;
147 struct brw_cache_item
*c
, *next
;
150 size
= cache
->size
* 3;
151 items
= calloc(size
, sizeof(*items
));
153 for (i
= 0; i
< cache
->size
; i
++)
154 for (c
= cache
->items
[i
]; c
; c
= next
) {
156 c
->next
= items
[c
->hash
% size
];
157 items
[c
->hash
% size
] = c
;
161 cache
->items
= items
;
167 * Returns the buffer object matching cache_id and key, or NULL.
170 brw_search_cache(struct brw_cache
*cache
, enum brw_cache_id cache_id
,
171 const void *key
, GLuint key_size
, uint32_t *inout_offset
,
172 void *inout_prog_data
, bool flag_state
)
174 struct brw_cache_item
*item
;
175 struct brw_cache_item lookup
;
178 lookup
.cache_id
= cache_id
;
180 lookup
.key_size
= key_size
;
181 hash
= hash_key(&lookup
);
184 item
= search_cache(cache
, hash
, &lookup
);
189 void *prog_data
= ((char *) item
->key
) + item
->key_size
;
191 if (item
->offset
!= *inout_offset
||
192 prog_data
!= *((void **) inout_prog_data
)) {
193 if (likely(flag_state
))
194 cache
->brw
->ctx
.NewDriverState
|= (1 << cache_id
);
195 *inout_offset
= item
->offset
;
196 *((void **) inout_prog_data
) = prog_data
;
203 brw_cache_new_bo(struct brw_cache
*cache
, uint32_t new_size
)
205 struct brw_context
*brw
= cache
->brw
;
206 struct brw_bo
*new_bo
;
208 perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
209 (unsigned) cache
->bo
->size
/ 1024, new_size
/ 1024);
211 new_bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", new_size
,
213 if (can_do_exec_capture(brw
->screen
))
214 new_bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
216 void *map
= brw_bo_map(brw
, new_bo
, MAP_READ
| MAP_WRITE
|
217 MAP_ASYNC
| MAP_PERSISTENT
);
219 /* Copy any existing data that needs to be saved. */
220 if (cache
->next_offset
!= 0) {
222 if (!cache
->bo
->cache_coherent
&& cpu_has_sse4_1
)
223 _mesa_streaming_load_memcpy(map
, cache
->map
, cache
->next_offset
);
226 memcpy(map
, cache
->map
, cache
->next_offset
);
229 brw_bo_unmap(cache
->bo
);
230 brw_bo_unreference(cache
->bo
);
234 /* Since we have a new BO in place, we need to signal the units
235 * that depend on it (state base address on gen5+, or unit state before).
237 brw
->ctx
.NewDriverState
|= BRW_NEW_PROGRAM_CACHE
;
238 brw
->batch
.state_base_address_emitted
= false;
242 * Attempts to find an item in the cache with identical data.
244 static const struct brw_cache_item
*
245 brw_lookup_prog(const struct brw_cache
*cache
,
246 enum brw_cache_id cache_id
,
247 const void *data
, unsigned data_size
)
250 const struct brw_cache_item
*item
;
252 for (i
= 0; i
< cache
->size
; i
++) {
253 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
254 if (item
->cache_id
!= cache_id
|| item
->size
!= data_size
||
255 memcmp(cache
->map
+ item
->offset
, data
, item
->size
) != 0)
266 brw_alloc_item_data(struct brw_cache
*cache
, uint32_t size
)
270 /* Allocate space in the cache BO for our new program. */
271 if (cache
->next_offset
+ size
> cache
->bo
->size
) {
272 uint32_t new_size
= cache
->bo
->size
* 2;
274 while (cache
->next_offset
+ size
> new_size
)
277 brw_cache_new_bo(cache
, new_size
);
280 offset
= cache
->next_offset
;
282 /* Programs are always 64-byte aligned, so set up the next one now */
283 cache
->next_offset
= ALIGN(offset
+ size
, 64);
289 brw_find_previous_compile(struct brw_cache
*cache
,
290 enum brw_cache_id cache_id
,
291 unsigned program_string_id
)
293 for (unsigned i
= 0; i
< cache
->size
; i
++) {
294 for (struct brw_cache_item
*c
= cache
->items
[i
]; c
; c
= c
->next
) {
295 if (c
->cache_id
== cache_id
&&
296 c
->key
->program_string_id
== program_string_id
) {
306 brw_upload_cache(struct brw_cache
*cache
,
307 enum brw_cache_id cache_id
,
312 const void *prog_data
,
313 GLuint prog_data_size
,
314 uint32_t *out_offset
,
317 struct brw_cache_item
*item
= CALLOC_STRUCT(brw_cache_item
);
318 const struct brw_cache_item
*matching_data
=
319 brw_lookup_prog(cache
, cache_id
, data
, data_size
);
323 item
->cache_id
= cache_id
;
324 item
->size
= data_size
;
326 item
->key_size
= key_size
;
327 item
->prog_data_size
= prog_data_size
;
328 hash
= hash_key(item
);
331 /* If we can find a matching prog in the cache already, then reuse the
332 * existing stuff without creating new copy into the underlying buffer
333 * object. This is notably useful for programs generating shaders at
334 * runtime, where multiple shaders may compile to the same thing in our
338 item
->offset
= matching_data
->offset
;
340 item
->offset
= brw_alloc_item_data(cache
, data_size
);
342 /* Copy data to the buffer */
343 memcpy(cache
->map
+ item
->offset
, data
, data_size
);
346 /* Set up the memory containing the key and prog_data */
347 tmp
= malloc(key_size
+ prog_data_size
);
349 memcpy(tmp
, key
, key_size
);
350 memcpy(tmp
+ key_size
, prog_data
, prog_data_size
);
354 if (cache
->n_items
> cache
->size
* 1.5f
)
358 item
->next
= cache
->items
[hash
];
359 cache
->items
[hash
] = item
;
362 *out_offset
= item
->offset
;
363 *(void **)out_prog_data
= (void *)((char *)item
->key
+ item
->key_size
);
364 cache
->brw
->ctx
.NewDriverState
|= 1 << cache_id
;
368 brw_init_caches(struct brw_context
*brw
)
370 struct brw_cache
*cache
= &brw
->cache
;
377 calloc(cache
->size
, sizeof(struct brw_cache_item
*));
379 cache
->bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", 16384,
381 if (can_do_exec_capture(brw
->screen
))
382 cache
->bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
384 cache
->map
= brw_bo_map(brw
, cache
->bo
, MAP_READ
| MAP_WRITE
|
385 MAP_ASYNC
| MAP_PERSISTENT
);
389 brw_clear_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
391 struct brw_cache_item
*c
, *next
;
394 DBG("%s\n", __func__
);
396 for (i
= 0; i
< cache
->size
; i
++) {
397 for (c
= cache
->items
[i
]; c
; c
= next
) {
399 if (c
->cache_id
== BRW_CACHE_VS_PROG
||
400 c
->cache_id
== BRW_CACHE_TCS_PROG
||
401 c
->cache_id
== BRW_CACHE_TES_PROG
||
402 c
->cache_id
== BRW_CACHE_GS_PROG
||
403 c
->cache_id
== BRW_CACHE_FS_PROG
||
404 c
->cache_id
== BRW_CACHE_CS_PROG
) {
405 const void *item_prog_data
= ((char *)c
->key
) + c
->key_size
;
406 brw_stage_prog_data_free(item_prog_data
);
408 free((void *)c
->key
);
411 cache
->items
[i
] = NULL
;
416 /* Start putting programs into the start of the BO again, since
417 * we'll never find the old results.
419 cache
->next_offset
= 0;
421 /* We need to make sure that the programs get regenerated, since
422 * any offsets leftover in brw_context will no longer be valid.
424 brw
->NewGLState
= ~0;
425 brw
->ctx
.NewDriverState
= ~0ull;
426 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].mesa
= ~0;
427 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].brw
= ~0ull;
428 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].mesa
= ~0;
429 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].brw
= ~0ull;
431 /* Also, NULL out any stale program pointers. */
432 brw
->vs
.base
.prog_data
= NULL
;
433 brw
->tcs
.base
.prog_data
= NULL
;
434 brw
->tes
.base
.prog_data
= NULL
;
435 brw
->gs
.base
.prog_data
= NULL
;
436 brw
->wm
.base
.prog_data
= NULL
;
437 brw
->cs
.base
.prog_data
= NULL
;
439 intel_batchbuffer_flush(brw
);
443 brw_program_cache_check_size(struct brw_context
*brw
)
445 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
448 if (brw
->cache
.n_items
> 2000) {
449 perf_debug("Exceeded state cache size limit. Clearing the set "
450 "of compiled programs, which will trigger recompiles\n");
451 brw_clear_cache(brw
, &brw
->cache
);
452 brw_cache_new_bo(&brw
->cache
, brw
->cache
.bo
->size
);
458 brw_destroy_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
461 DBG("%s\n", __func__
);
463 /* This can be NULL if context creation failed early on */
465 brw_bo_unmap(cache
->bo
);
466 brw_bo_unreference(cache
->bo
);
470 brw_clear_cache(brw
, cache
);
478 brw_destroy_caches(struct brw_context
*brw
)
480 brw_destroy_cache(brw
, &brw
->cache
);
484 cache_name(enum brw_cache_id cache_id
)
487 case BRW_CACHE_VS_PROG
:
489 case BRW_CACHE_TCS_PROG
:
491 case BRW_CACHE_TES_PROG
:
493 case BRW_CACHE_FF_GS_PROG
:
494 return "Fixed-function GS kernel";
495 case BRW_CACHE_GS_PROG
:
497 case BRW_CACHE_CLIP_PROG
:
498 return "CLIP kernel";
499 case BRW_CACHE_SF_PROG
:
501 case BRW_CACHE_FS_PROG
:
503 case BRW_CACHE_CS_PROG
:
511 brw_print_program_cache(struct brw_context
*brw
)
513 const struct brw_cache
*cache
= &brw
->cache
;
514 struct brw_cache_item
*item
;
516 for (unsigned i
= 0; i
< cache
->size
; i
++) {
517 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
518 fprintf(stderr
, "%s:\n", cache_name(i
));
519 brw_disassemble(&brw
->screen
->devinfo
, cache
->map
,
520 item
->offset
, item
->size
, stderr
);