/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */

/** @file brw_program_cache.c
 *
 * This file implements a simple program cache for 965.  The consumers can
 * query the hash table of programs using a cache_id and program key, and
 * receive the corresponding program buffer object (plus associated auxiliary
 * data) in return.  Objects in the cache may not have relocations
 * (pointers to other BOs) in them.
 *
 * The inner workings are a simple hash table based on a CRC of the
 * program key.
 *
 * Replacement is not implemented.  Instead, when the cache gets too
 * big we throw out all of the cache data and let it get regenerated.
 */
#include "main/imports.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
#include "brw_program.h"

#define FILE_DEBUG_FLAG DEBUG_STATE
59 get_program_string_id(enum brw_cache_id cache_id
, const void *key
)
62 case BRW_CACHE_VS_PROG
:
63 return ((struct brw_vs_prog_key
*) key
)->program_string_id
;
64 case BRW_CACHE_TCS_PROG
:
65 return ((struct brw_tcs_prog_key
*) key
)->program_string_id
;
66 case BRW_CACHE_TES_PROG
:
67 return ((struct brw_tes_prog_key
*) key
)->program_string_id
;
68 case BRW_CACHE_GS_PROG
:
69 return ((struct brw_gs_prog_key
*) key
)->program_string_id
;
70 case BRW_CACHE_CS_PROG
:
71 return ((struct brw_cs_prog_key
*) key
)->program_string_id
;
72 case BRW_CACHE_FS_PROG
:
73 return ((struct brw_wm_prog_key
*) key
)->program_string_id
;
75 unreachable("no program string id for this kind of program");
80 hash_key(struct brw_cache_item
*item
)
82 GLuint
*ikey
= (GLuint
*)item
->key
;
83 GLuint hash
= item
->cache_id
, i
;
85 assert(item
->key_size
% 4 == 0);
87 /* I'm sure this can be improved on:
89 for (i
= 0; i
< item
->key_size
/4; i
++) {
91 hash
= (hash
<< 5) | (hash
>> 27);
98 brw_cache_item_equals(const struct brw_cache_item
*a
,
99 const struct brw_cache_item
*b
)
101 return a
->cache_id
== b
->cache_id
&&
102 a
->hash
== b
->hash
&&
103 a
->key_size
== b
->key_size
&&
104 (memcmp(a
->key
, b
->key
, a
->key_size
) == 0);
107 static struct brw_cache_item
*
108 search_cache(struct brw_cache
*cache
, GLuint hash
,
109 struct brw_cache_item
*lookup
)
111 struct brw_cache_item
*c
;
116 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
)
119 fprintf(stderr
, "bucket %d/%d = %d/%d items\n", hash
% cache
->size
,
120 cache
->size
, bucketcount
, cache
->n_items
);
123 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
) {
124 if (brw_cache_item_equals(lookup
, c
))
133 rehash(struct brw_cache
*cache
)
135 struct brw_cache_item
**items
;
136 struct brw_cache_item
*c
, *next
;
139 size
= cache
->size
* 3;
140 items
= calloc(size
, sizeof(*items
));
142 for (i
= 0; i
< cache
->size
; i
++)
143 for (c
= cache
->items
[i
]; c
; c
= next
) {
145 c
->next
= items
[c
->hash
% size
];
146 items
[c
->hash
% size
] = c
;
150 cache
->items
= items
;
156 * Returns the buffer object matching cache_id and key, or NULL.
159 brw_search_cache(struct brw_cache
*cache
,
160 enum brw_cache_id cache_id
,
161 const void *key
, GLuint key_size
,
162 uint32_t *inout_offset
, void *inout_aux
)
164 struct brw_context
*brw
= cache
->brw
;
165 struct brw_cache_item
*item
;
166 struct brw_cache_item lookup
;
169 lookup
.cache_id
= cache_id
;
171 lookup
.key_size
= key_size
;
172 hash
= hash_key(&lookup
);
175 item
= search_cache(cache
, hash
, &lookup
);
180 void *aux
= ((char *) item
->key
) + item
->key_size
;
182 if (item
->offset
!= *inout_offset
|| aux
!= *((void **) inout_aux
)) {
183 brw
->ctx
.NewDriverState
|= (1 << cache_id
);
184 *inout_offset
= item
->offset
;
185 *((void **) inout_aux
) = aux
;
192 brw_cache_new_bo(struct brw_cache
*cache
, uint32_t new_size
)
194 struct brw_context
*brw
= cache
->brw
;
195 drm_intel_bo
*new_bo
;
197 new_bo
= drm_intel_bo_alloc(brw
->bufmgr
, "program cache", new_size
, 64);
199 drm_intel_gem_bo_map_unsynchronized(new_bo
);
201 /* Copy any existing data that needs to be saved. */
202 if (cache
->next_offset
!= 0) {
204 memcpy(new_bo
->virtual, cache
->bo
->virtual, cache
->next_offset
);
206 drm_intel_bo_map(cache
->bo
, false);
207 drm_intel_bo_subdata(new_bo
, 0, cache
->next_offset
,
209 drm_intel_bo_unmap(cache
->bo
);
214 drm_intel_bo_unmap(cache
->bo
);
215 drm_intel_bo_unreference(cache
->bo
);
217 cache
->bo_used_by_gpu
= false;
219 /* Since we have a new BO in place, we need to signal the units
220 * that depend on it (state base address on gen5+, or unit state before).
222 brw
->ctx
.NewDriverState
|= BRW_NEW_PROGRAM_CACHE
;
223 brw
->batch
.state_base_address_emitted
= false;
227 * Attempts to find an item in the cache with identical data.
229 static const struct brw_cache_item
*
230 brw_lookup_prog(const struct brw_cache
*cache
,
231 enum brw_cache_id cache_id
,
232 const void *data
, unsigned data_size
)
234 const struct brw_context
*brw
= cache
->brw
;
236 const struct brw_cache_item
*item
;
238 for (i
= 0; i
< cache
->size
; i
++) {
239 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
242 if (item
->cache_id
!= cache_id
|| item
->size
!= data_size
)
246 drm_intel_bo_map(cache
->bo
, false);
247 ret
= memcmp(cache
->bo
->virtual + item
->offset
, data
, item
->size
);
249 drm_intel_bo_unmap(cache
->bo
);
261 brw_alloc_item_data(struct brw_cache
*cache
, uint32_t size
)
264 struct brw_context
*brw
= cache
->brw
;
266 /* Allocate space in the cache BO for our new program. */
267 if (cache
->next_offset
+ size
> cache
->bo
->size
) {
268 uint32_t new_size
= cache
->bo
->size
* 2;
270 while (cache
->next_offset
+ size
> new_size
)
273 brw_cache_new_bo(cache
, new_size
);
276 /* If we would block on writing to an in-use program BO, just
279 if (!brw
->has_llc
&& cache
->bo_used_by_gpu
) {
280 perf_debug("Copying busy program cache buffer.\n");
281 brw_cache_new_bo(cache
, cache
->bo
->size
);
284 offset
= cache
->next_offset
;
286 /* Programs are always 64-byte aligned, so set up the next one now */
287 cache
->next_offset
= ALIGN(offset
+ size
, 64);
293 brw_find_previous_compile(struct brw_cache
*cache
,
294 enum brw_cache_id cache_id
,
295 unsigned program_string_id
)
297 for (unsigned i
= 0; i
< cache
->size
; i
++) {
298 for (struct brw_cache_item
*c
= cache
->items
[i
]; c
; c
= c
->next
) {
299 if (c
->cache_id
== cache_id
&&
300 get_program_string_id(cache_id
, c
->key
) == program_string_id
) {
310 brw_upload_cache(struct brw_cache
*cache
,
311 enum brw_cache_id cache_id
,
318 uint32_t *out_offset
,
321 struct brw_context
*brw
= cache
->brw
;
322 struct brw_cache_item
*item
= CALLOC_STRUCT(brw_cache_item
);
323 const struct brw_cache_item
*matching_data
=
324 brw_lookup_prog(cache
, cache_id
, data
, data_size
);
328 item
->cache_id
= cache_id
;
329 item
->size
= data_size
;
331 item
->key_size
= key_size
;
332 item
->aux_size
= aux_size
;
333 hash
= hash_key(item
);
336 /* If we can find a matching prog in the cache already, then reuse the
337 * existing stuff without creating new copy into the underlying buffer
338 * object. This is notably useful for programs generating shaders at
339 * runtime, where multiple shaders may compile to the same thing in our
343 item
->offset
= matching_data
->offset
;
345 item
->offset
= brw_alloc_item_data(cache
, data_size
);
347 /* Copy data to the buffer */
349 memcpy((char *)cache
->bo
->virtual + item
->offset
, data
, data_size
);
351 drm_intel_bo_subdata(cache
->bo
, item
->offset
, data_size
, data
);
355 /* Set up the memory containing the key and aux_data */
356 tmp
= malloc(key_size
+ aux_size
);
358 memcpy(tmp
, key
, key_size
);
359 memcpy(tmp
+ key_size
, aux
, aux_size
);
363 if (cache
->n_items
> cache
->size
* 1.5f
)
367 item
->next
= cache
->items
[hash
];
368 cache
->items
[hash
] = item
;
371 *out_offset
= item
->offset
;
372 *(void **)out_aux
= (void *)((char *)item
->key
+ item
->key_size
);
373 cache
->brw
->ctx
.NewDriverState
|= 1 << cache_id
;
377 brw_init_caches(struct brw_context
*brw
)
379 struct brw_cache
*cache
= &brw
->cache
;
386 calloc(cache
->size
, sizeof(struct brw_cache_item
*));
388 cache
->bo
= drm_intel_bo_alloc(brw
->bufmgr
, "program cache", 4096, 64);
390 drm_intel_gem_bo_map_unsynchronized(cache
->bo
);
394 brw_clear_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
396 struct brw_cache_item
*c
, *next
;
399 DBG("%s\n", __func__
);
401 for (i
= 0; i
< cache
->size
; i
++) {
402 for (c
= cache
->items
[i
]; c
; c
= next
) {
404 if (c
->cache_id
== BRW_CACHE_VS_PROG
||
405 c
->cache_id
== BRW_CACHE_TCS_PROG
||
406 c
->cache_id
== BRW_CACHE_TES_PROG
||
407 c
->cache_id
== BRW_CACHE_GS_PROG
||
408 c
->cache_id
== BRW_CACHE_FS_PROG
||
409 c
->cache_id
== BRW_CACHE_CS_PROG
) {
410 const void *item_aux
= c
->key
+ c
->key_size
;
411 brw_stage_prog_data_free(item_aux
);
413 free((void *)c
->key
);
416 cache
->items
[i
] = NULL
;
421 /* Start putting programs into the start of the BO again, since
422 * we'll never find the old results.
424 cache
->next_offset
= 0;
426 /* We need to make sure that the programs get regenerated, since
427 * any offsets leftover in brw_context will no longer be valid.
429 brw
->NewGLState
= ~0;
430 brw
->ctx
.NewDriverState
= ~0ull;
431 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].mesa
= ~0;
432 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].brw
= ~0ull;
433 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].mesa
= ~0;
434 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].brw
= ~0ull;
436 /* Also, NULL out any stale program pointers. */
437 brw
->vs
.base
.prog_data
= NULL
;
438 brw
->tcs
.base
.prog_data
= NULL
;
439 brw
->tes
.base
.prog_data
= NULL
;
440 brw
->gs
.base
.prog_data
= NULL
;
441 brw
->wm
.base
.prog_data
= NULL
;
442 brw
->cs
.base
.prog_data
= NULL
;
444 intel_batchbuffer_flush(brw
);
448 brw_program_cache_check_size(struct brw_context
*brw
)
450 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
453 if (brw
->cache
.n_items
> 2000) {
454 perf_debug("Exceeded state cache size limit. Clearing the set "
455 "of compiled programs, which will trigger recompiles\n");
456 brw_clear_cache(brw
, &brw
->cache
);
462 brw_destroy_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
465 DBG("%s\n", __func__
);
468 drm_intel_bo_unmap(cache
->bo
);
469 drm_intel_bo_unreference(cache
->bo
);
471 brw_clear_cache(brw
, cache
);
479 brw_destroy_caches(struct brw_context
*brw
)
481 brw_destroy_cache(brw
, &brw
->cache
);
485 cache_name(enum brw_cache_id cache_id
)
488 case BRW_CACHE_VS_PROG
:
490 case BRW_CACHE_TCS_PROG
:
492 case BRW_CACHE_TES_PROG
:
494 case BRW_CACHE_FF_GS_PROG
:
495 return "Fixed-function GS kernel";
496 case BRW_CACHE_GS_PROG
:
498 case BRW_CACHE_CLIP_PROG
:
499 return "CLIP kernel";
500 case BRW_CACHE_SF_PROG
:
502 case BRW_CACHE_FS_PROG
:
504 case BRW_CACHE_CS_PROG
:
512 brw_print_program_cache(struct brw_context
*brw
)
514 const struct brw_cache
*cache
= &brw
->cache
;
515 struct brw_cache_item
*item
;
517 drm_intel_bo_map(cache
->bo
, false);
519 for (unsigned i
= 0; i
< cache
->size
; i
++) {
520 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
521 fprintf(stderr
, "%s:\n", cache_name(i
));
522 brw_disassemble(&brw
->screen
->devinfo
, cache
->bo
->virtual,
523 item
->offset
, item
->size
, stderr
);
527 drm_intel_bo_unmap(cache
->bo
);