2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
32 /** @file brw_program_cache.c
34 * This file implements a simple program cache for 965. The consumers can
35 * query the hash table of programs using a cache_id and program key, and
36 * receive the corresponding program buffer object (plus associated auxiliary
37 * data) in return. Objects in the cache may not have relocations
38 * (pointers to other BOs) in them.
40 * The inner workings are a simple hash table based on a CRC of the
43 * Replacement is not implemented. Instead, when the cache gets too
44 * big we throw out all of the cache data and let it get regenerated.
47 #include "main/imports.h"
48 #include "main/streaming-load-memcpy.h"
49 #include "x86/common_x86_asm.h"
50 #include "intel_batchbuffer.h"
51 #include "brw_state.h"
55 #include "brw_program.h"
56 #include "compiler/brw_eu.h"
58 #define FILE_DEBUG_FLAG DEBUG_STATE
60 struct brw_cache_item
{
62 * Effectively part of the key, cache_id identifies what kind of state
63 * buffer is involved, and also which dirty flag should set.
65 enum brw_cache_id cache_id
;
67 /** 32-bit hash of the key data */
70 /** for variable-sized keys */
72 GLuint prog_data_size
;
73 const struct brw_base_prog_key
*key
;
78 struct brw_cache_item
*next
;
82 brw_stage_cache_id(gl_shader_stage stage
)
84 static const enum brw_cache_id stage_ids
[] = {
92 assert((int)stage
>= 0 && stage
< ARRAY_SIZE(stage_ids
));
93 return stage_ids
[stage
];
97 hash_key(struct brw_cache_item
*item
)
99 GLuint
*ikey
= (GLuint
*)item
->key
;
100 GLuint hash
= item
->cache_id
, i
;
102 assert(item
->key_size
% 4 == 0);
104 /* I'm sure this can be improved on:
106 for (i
= 0; i
< item
->key_size
/4; i
++) {
108 hash
= (hash
<< 5) | (hash
>> 27);
115 brw_cache_item_equals(const struct brw_cache_item
*a
,
116 const struct brw_cache_item
*b
)
118 return a
->cache_id
== b
->cache_id
&&
119 a
->hash
== b
->hash
&&
120 a
->key_size
== b
->key_size
&&
121 (memcmp(a
->key
, b
->key
, a
->key_size
) == 0);
124 static struct brw_cache_item
*
125 search_cache(struct brw_cache
*cache
, GLuint hash
,
126 struct brw_cache_item
*lookup
)
128 struct brw_cache_item
*c
;
133 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
)
136 fprintf(stderr
, "bucket %d/%d = %d/%d items\n", hash
% cache
->size
,
137 cache
->size
, bucketcount
, cache
->n_items
);
140 for (c
= cache
->items
[hash
% cache
->size
]; c
; c
= c
->next
) {
141 if (brw_cache_item_equals(lookup
, c
))
150 rehash(struct brw_cache
*cache
)
152 struct brw_cache_item
**items
;
153 struct brw_cache_item
*c
, *next
;
156 size
= cache
->size
* 3;
157 items
= calloc(size
, sizeof(*items
));
159 for (i
= 0; i
< cache
->size
; i
++)
160 for (c
= cache
->items
[i
]; c
; c
= next
) {
162 c
->next
= items
[c
->hash
% size
];
163 items
[c
->hash
% size
] = c
;
167 cache
->items
= items
;
173 * Returns the buffer object matching cache_id and key, or NULL.
176 brw_search_cache(struct brw_cache
*cache
, enum brw_cache_id cache_id
,
177 const void *key
, GLuint key_size
, uint32_t *inout_offset
,
178 void *inout_prog_data
, bool flag_state
)
180 struct brw_cache_item
*item
;
181 struct brw_cache_item lookup
;
184 lookup
.cache_id
= cache_id
;
186 lookup
.key_size
= key_size
;
187 hash
= hash_key(&lookup
);
190 item
= search_cache(cache
, hash
, &lookup
);
195 void *prog_data
= ((char *) item
->key
) + item
->key_size
;
197 if (item
->offset
!= *inout_offset
||
198 prog_data
!= *((void **) inout_prog_data
)) {
199 if (likely(flag_state
))
200 cache
->brw
->ctx
.NewDriverState
|= (1 << cache_id
);
201 *inout_offset
= item
->offset
;
202 *((void **) inout_prog_data
) = prog_data
;
209 brw_cache_new_bo(struct brw_cache
*cache
, uint32_t new_size
)
211 struct brw_context
*brw
= cache
->brw
;
212 struct brw_bo
*new_bo
;
214 perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
215 (unsigned) cache
->bo
->size
/ 1024, new_size
/ 1024);
217 new_bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", new_size
,
219 if (can_do_exec_capture(brw
->screen
))
220 new_bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
222 void *map
= brw_bo_map(brw
, new_bo
, MAP_READ
| MAP_WRITE
|
223 MAP_ASYNC
| MAP_PERSISTENT
);
225 /* Copy any existing data that needs to be saved. */
226 if (cache
->next_offset
!= 0) {
228 if (!cache
->bo
->cache_coherent
&& cpu_has_sse4_1
)
229 _mesa_streaming_load_memcpy(map
, cache
->map
, cache
->next_offset
);
232 memcpy(map
, cache
->map
, cache
->next_offset
);
235 brw_bo_unmap(cache
->bo
);
236 brw_bo_unreference(cache
->bo
);
240 /* Since we have a new BO in place, we need to signal the units
241 * that depend on it (state base address on gen5+, or unit state before).
243 brw
->ctx
.NewDriverState
|= BRW_NEW_PROGRAM_CACHE
;
244 brw
->batch
.state_base_address_emitted
= false;
248 * Attempts to find an item in the cache with identical data.
250 static const struct brw_cache_item
*
251 brw_lookup_prog(const struct brw_cache
*cache
,
252 enum brw_cache_id cache_id
,
253 const void *data
, unsigned data_size
)
256 const struct brw_cache_item
*item
;
258 for (i
= 0; i
< cache
->size
; i
++) {
259 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
260 if (item
->cache_id
!= cache_id
|| item
->size
!= data_size
||
261 memcmp(cache
->map
+ item
->offset
, data
, item
->size
) != 0)
272 brw_alloc_item_data(struct brw_cache
*cache
, uint32_t size
)
276 /* Allocate space in the cache BO for our new program. */
277 if (cache
->next_offset
+ size
> cache
->bo
->size
) {
278 uint32_t new_size
= cache
->bo
->size
* 2;
280 while (cache
->next_offset
+ size
> new_size
)
283 brw_cache_new_bo(cache
, new_size
);
286 offset
= cache
->next_offset
;
288 /* Programs are always 64-byte aligned, so set up the next one now */
289 cache
->next_offset
= ALIGN(offset
+ size
, 64);
295 brw_find_previous_compile(struct brw_cache
*cache
,
296 enum brw_cache_id cache_id
,
297 unsigned program_string_id
)
299 for (unsigned i
= 0; i
< cache
->size
; i
++) {
300 for (struct brw_cache_item
*c
= cache
->items
[i
]; c
; c
= c
->next
) {
301 if (c
->cache_id
== cache_id
&&
302 c
->key
->program_string_id
== program_string_id
) {
312 brw_upload_cache(struct brw_cache
*cache
,
313 enum brw_cache_id cache_id
,
318 const void *prog_data
,
319 GLuint prog_data_size
,
320 uint32_t *out_offset
,
323 struct brw_cache_item
*item
= CALLOC_STRUCT(brw_cache_item
);
324 const struct brw_cache_item
*matching_data
=
325 brw_lookup_prog(cache
, cache_id
, data
, data_size
);
329 item
->cache_id
= cache_id
;
330 item
->size
= data_size
;
332 item
->key_size
= key_size
;
333 item
->prog_data_size
= prog_data_size
;
334 hash
= hash_key(item
);
337 /* If we can find a matching prog in the cache already, then reuse the
338 * existing stuff without creating new copy into the underlying buffer
339 * object. This is notably useful for programs generating shaders at
340 * runtime, where multiple shaders may compile to the same thing in our
344 item
->offset
= matching_data
->offset
;
346 item
->offset
= brw_alloc_item_data(cache
, data_size
);
348 /* Copy data to the buffer */
349 memcpy(cache
->map
+ item
->offset
, data
, data_size
);
352 /* Set up the memory containing the key and prog_data */
353 tmp
= malloc(key_size
+ prog_data_size
);
355 memcpy(tmp
, key
, key_size
);
356 memcpy(tmp
+ key_size
, prog_data
, prog_data_size
);
360 if (cache
->n_items
> cache
->size
* 1.5f
)
364 item
->next
= cache
->items
[hash
];
365 cache
->items
[hash
] = item
;
368 *out_offset
= item
->offset
;
369 *(void **)out_prog_data
= (void *)((char *)item
->key
+ item
->key_size
);
370 cache
->brw
->ctx
.NewDriverState
|= 1 << cache_id
;
374 brw_init_caches(struct brw_context
*brw
)
376 struct brw_cache
*cache
= &brw
->cache
;
383 calloc(cache
->size
, sizeof(struct brw_cache_item
*));
385 cache
->bo
= brw_bo_alloc(brw
->bufmgr
, "program cache", 16384,
387 if (can_do_exec_capture(brw
->screen
))
388 cache
->bo
->kflags
|= EXEC_OBJECT_CAPTURE
;
390 cache
->map
= brw_bo_map(brw
, cache
->bo
, MAP_READ
| MAP_WRITE
|
391 MAP_ASYNC
| MAP_PERSISTENT
);
395 brw_clear_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
397 struct brw_cache_item
*c
, *next
;
400 DBG("%s\n", __func__
);
402 for (i
= 0; i
< cache
->size
; i
++) {
403 for (c
= cache
->items
[i
]; c
; c
= next
) {
405 if (c
->cache_id
== BRW_CACHE_VS_PROG
||
406 c
->cache_id
== BRW_CACHE_TCS_PROG
||
407 c
->cache_id
== BRW_CACHE_TES_PROG
||
408 c
->cache_id
== BRW_CACHE_GS_PROG
||
409 c
->cache_id
== BRW_CACHE_FS_PROG
||
410 c
->cache_id
== BRW_CACHE_CS_PROG
) {
411 const void *item_prog_data
= ((char *)c
->key
) + c
->key_size
;
412 brw_stage_prog_data_free(item_prog_data
);
414 free((void *)c
->key
);
417 cache
->items
[i
] = NULL
;
422 /* Start putting programs into the start of the BO again, since
423 * we'll never find the old results.
425 cache
->next_offset
= 0;
427 /* We need to make sure that the programs get regenerated, since
428 * any offsets leftover in brw_context will no longer be valid.
430 brw
->NewGLState
= ~0;
431 brw
->ctx
.NewDriverState
= ~0ull;
432 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].mesa
= ~0;
433 brw
->state
.pipelines
[BRW_RENDER_PIPELINE
].brw
= ~0ull;
434 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].mesa
= ~0;
435 brw
->state
.pipelines
[BRW_COMPUTE_PIPELINE
].brw
= ~0ull;
437 /* Also, NULL out any stale program pointers. */
438 brw
->vs
.base
.prog_data
= NULL
;
439 brw
->tcs
.base
.prog_data
= NULL
;
440 brw
->tes
.base
.prog_data
= NULL
;
441 brw
->gs
.base
.prog_data
= NULL
;
442 brw
->wm
.base
.prog_data
= NULL
;
443 brw
->cs
.base
.prog_data
= NULL
;
445 intel_batchbuffer_flush(brw
);
449 brw_program_cache_check_size(struct brw_context
*brw
)
451 /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
454 if (brw
->cache
.n_items
> 2000) {
455 perf_debug("Exceeded state cache size limit. Clearing the set "
456 "of compiled programs, which will trigger recompiles\n");
457 brw_clear_cache(brw
, &brw
->cache
);
458 brw_cache_new_bo(&brw
->cache
, brw
->cache
.bo
->size
);
464 brw_destroy_cache(struct brw_context
*brw
, struct brw_cache
*cache
)
467 DBG("%s\n", __func__
);
469 /* This can be NULL if context creation failed early on */
471 brw_bo_unmap(cache
->bo
);
472 brw_bo_unreference(cache
->bo
);
476 brw_clear_cache(brw
, cache
);
484 brw_destroy_caches(struct brw_context
*brw
)
486 brw_destroy_cache(brw
, &brw
->cache
);
490 cache_name(enum brw_cache_id cache_id
)
493 case BRW_CACHE_VS_PROG
:
495 case BRW_CACHE_TCS_PROG
:
497 case BRW_CACHE_TES_PROG
:
499 case BRW_CACHE_FF_GS_PROG
:
500 return "Fixed-function GS kernel";
501 case BRW_CACHE_GS_PROG
:
503 case BRW_CACHE_CLIP_PROG
:
504 return "CLIP kernel";
505 case BRW_CACHE_SF_PROG
:
507 case BRW_CACHE_FS_PROG
:
509 case BRW_CACHE_CS_PROG
:
517 brw_print_program_cache(struct brw_context
*brw
)
519 const struct brw_cache
*cache
= &brw
->cache
;
520 struct brw_cache_item
*item
;
522 for (unsigned i
= 0; i
< cache
->size
; i
++) {
523 for (item
= cache
->items
[i
]; item
; item
= item
->next
) {
524 fprintf(stderr
, "%s:\n", cache_name(i
));
525 brw_disassemble(&brw
->screen
->devinfo
, cache
->map
,
526 item
->offset
, item
->size
, stderr
);