anv/pipeline_cache: Allow for a zero-sized cache
mesa.git: src/intel/vulkan/anv_pipeline_cache.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/mesa-sha1.h"
#include "util/debug.h"
#include "anv_private.h"

/* Remaining work:
 *
 * - Compact binding table layout so it's tight and not dependent on
 *   descriptor set layout.
 *
 * - Review prog_data struct for size and cacheability: struct
 *   brw_stage_prog_data has binding_table which uses a lot of uint32_t for
 *   8-bit quantities, etc.; param, pull_param, and image_params are pointers,
 *   we just need the compaction map. Use bit fields for all bools, e.g.
 *   dual_src_blend.
 */

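/* Initialize a pipeline cache.  The cache is backed by the device-wide
 * instruction block pool through its own state stream.  If the hash table
 * allocation fails, or the cache is disabled with
 * ANV_ENABLE_PIPELINE_CACHE=false, we fall back to a zero-sized table:
 * lookups always miss and compiled kernels are still uploaded to the stream,
 * just never tracked in the table.
 */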
void
anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
                        struct anv_device *device)
{
   cache->device = device;
   anv_state_stream_init(&cache->program_stream,
                         &device->instruction_block_pool);
   pthread_mutex_init(&cache->mutex, NULL);

   cache->kernel_count = 0;
   cache->total_size = 0;
   cache->table_size = 1024;
   const size_t byte_size = cache->table_size * sizeof(cache->hash_table[0]);
   cache->hash_table = malloc(byte_size);

   /* We don't consider allocation failure fatal; we just start with a
    * 0-sized cache.
    */
   if (cache->hash_table == NULL ||
       !env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true))
      cache->table_size = 0;
   else
      memset(cache->hash_table, 0xff, byte_size);
}

void
anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
{
   anv_state_stream_finish(&cache->program_stream);
   pthread_mutex_destroy(&cache->mutex);
   free(cache->hash_table);
}

struct cache_entry {
   unsigned char sha1[20];
   uint32_t prog_data_size;
   uint32_t kernel_size;
   uint32_t surface_count;
   uint32_t sampler_count;
   uint32_t image_count;

   char prog_data[0];

   /* kernel follows prog_data at next 64 byte aligned address */
};

static uint32_t
entry_size(struct cache_entry *entry)
{
   /* This returns the number of bytes needed to serialize an entry, which
    * doesn't include the alignment padding bytes.
    */

   struct brw_stage_prog_data *prog_data = (void *)entry->prog_data;
   const uint32_t param_size =
      prog_data->nr_params * sizeof(*prog_data->param);

   const uint32_t map_size =
      entry->surface_count * sizeof(struct anv_pipeline_binding) +
      entry->sampler_count * sizeof(struct anv_pipeline_binding);

   return sizeof(*entry) + entry->prog_data_size + param_size + map_size;
}

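/* Compute the SHA1 cache key for a single shader stage.  The hash covers the
 * caller-provided compile key, the shader module's own SHA1, the entrypoint
 * name, and any specialization constants.
 */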
void
anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
                struct anv_shader_module *module,
                const char *entrypoint,
                const VkSpecializationInfo *spec_info)
{
   struct mesa_sha1 *ctx;

   ctx = _mesa_sha1_init();
   _mesa_sha1_update(ctx, key, key_size);
   _mesa_sha1_update(ctx, module->sha1, sizeof(module->sha1));
   _mesa_sha1_update(ctx, entrypoint, strlen(entrypoint));
   /* hash in shader stage, pipeline layout? */
   if (spec_info) {
      _mesa_sha1_update(ctx, spec_info->pMapEntries,
                        spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
      _mesa_sha1_update(ctx, spec_info->pData, spec_info->dataSize);
   }
   _mesa_sha1_final(ctx, hash);
}

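/* The cache is an open-addressed hash table with linear probing.  Each slot
 * holds the offset of a cache_entry within the program stream's block pool,
 * or ~0 if the slot is empty.  On a hit, this returns the offset of the
 * kernel code itself (the entry offset plus the 64-byte-aligned metadata
 * preamble) and, if requested, fills in prog_data and the bind map from the
 * serialized entry.
 */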
static uint32_t
anv_pipeline_cache_search_unlocked(struct anv_pipeline_cache *cache,
                                   const unsigned char *sha1,
                                   const struct brw_stage_prog_data **prog_data,
                                   struct anv_pipeline_bind_map *map)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) sha1);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      const uint32_t offset = cache->hash_table[index];

      if (offset == ~0)
         return NO_KERNEL;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      if (memcmp(entry->sha1, sha1, sizeof(entry->sha1)) == 0) {
         if (prog_data) {
            assert(map);
            void *p = entry->prog_data;
            *prog_data = p;
            p += entry->prog_data_size;
            p += (*prog_data)->nr_params * sizeof(*(*prog_data)->param);
            map->surface_count = entry->surface_count;
            map->sampler_count = entry->sampler_count;
            map->image_count = entry->image_count;
            map->surface_to_descriptor = p;
            p += map->surface_count * sizeof(struct anv_pipeline_binding);
            map->sampler_to_descriptor = p;
         }

         return offset + align_u32(entry_size(entry), 64);
      }
   }

   /* This can happen if the pipeline cache is disabled via
    * ANV_ENABLE_PIPELINE_CACHE=false
    */
   return NO_KERNEL;
}

uint32_t
anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
                          const unsigned char *sha1,
                          const struct brw_stage_prog_data **prog_data,
                          struct anv_pipeline_bind_map *map)
{
   uint32_t kernel;

   pthread_mutex_lock(&cache->mutex);

   kernel = anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);

   pthread_mutex_unlock(&cache->mutex);

   return kernel;
}

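/* Insert an already-serialized entry into the hash table.  The table is kept
 * at most half full, so linear probing is guaranteed to find a free slot.
 */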
static void
anv_pipeline_cache_set_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) entry->sha1);

   /* We'll always be able to insert when we get here. */
   assert(cache->kernel_count < cache->table_size / 2);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      if (cache->hash_table[index] == ~0) {
         cache->hash_table[index] = entry_offset;
         break;
      }
   }

   cache->total_size += entry_size(entry) + entry->kernel_size;
   cache->kernel_count++;
}

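/* Double the hash table and re-insert every existing entry.  Only the table
 * of offsets is reallocated; the serialized entries themselves stay where
 * they are in the program stream.
 */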
static VkResult
anv_pipeline_cache_grow(struct anv_pipeline_cache *cache)
{
   const uint32_t table_size = cache->table_size * 2;
   const uint32_t old_table_size = cache->table_size;
   const size_t byte_size = table_size * sizeof(cache->hash_table[0]);
   uint32_t *table;
   uint32_t *old_table = cache->hash_table;

   table = malloc(byte_size);
   if (table == NULL)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   cache->hash_table = table;
   cache->table_size = table_size;
   cache->kernel_count = 0;
   cache->total_size = 0;

   memset(cache->hash_table, 0xff, byte_size);
   for (uint32_t i = 0; i < old_table_size; i++) {
      const uint32_t offset = old_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      anv_pipeline_cache_set_entry(cache, entry, offset);
   }

   free(old_table);

   return VK_SUCCESS;
}

static void
anv_pipeline_cache_add_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   if (cache->kernel_count == cache->table_size / 2)
      anv_pipeline_cache_grow(cache);

   /* Failing to grow the hash table isn't fatal, but it may mean we don't
    * have enough space to add this new kernel. Only add it if there's room.
    */
   if (cache->kernel_count < cache->table_size / 2)
      anv_pipeline_cache_set_entry(cache, entry, entry_offset);
}

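/* Upload a compiled kernel and its metadata into the program stream.  The
 * allocation is laid out as:
 *
 *    struct cache_entry | prog_data | params | surface map | sampler map
 *    | (pad to 64 bytes) | kernel code
 *
 * and the return value is the block-pool offset of the kernel code itself.
 * If sha1 is non-NULL, the cache is first re-checked under the mutex in case
 * another thread uploaded the same shader while we were compiling it, and on
 * a miss the new entry is added to the hash table.  On return, *prog_data
 * and the bind map point at the uploaded copies.
 */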
uint32_t
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
                                 const unsigned char *sha1,
                                 const void *kernel, size_t kernel_size,
                                 const struct brw_stage_prog_data **prog_data,
                                 size_t prog_data_size,
                                 struct anv_pipeline_bind_map *map)
{
   pthread_mutex_lock(&cache->mutex);

   /* Before uploading, check again that another thread didn't upload this
    * shader while we were compiling it.
    */
   if (sha1) {
      uint32_t cached_kernel =
         anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);
      if (cached_kernel != NO_KERNEL) {
         pthread_mutex_unlock(&cache->mutex);
         return cached_kernel;
      }
   }

   struct cache_entry *entry;

   assert((*prog_data)->nr_pull_params == 0);
   assert((*prog_data)->nr_image_params == 0);

   const uint32_t param_size =
      (*prog_data)->nr_params * sizeof(*(*prog_data)->param);

   const uint32_t map_size =
      map->surface_count * sizeof(struct anv_pipeline_binding) +
      map->sampler_count * sizeof(struct anv_pipeline_binding);

   const uint32_t preamble_size =
      align_u32(sizeof(*entry) + prog_data_size + param_size + map_size, 64);

   const uint32_t size = preamble_size + kernel_size;

   assert(size < cache->program_stream.block_pool->block_size);
   const struct anv_state state =
      anv_state_stream_alloc(&cache->program_stream, size, 64);

   entry = state.map;
   entry->prog_data_size = prog_data_size;
   entry->surface_count = map->surface_count;
   entry->sampler_count = map->sampler_count;
   entry->image_count = map->image_count;
   entry->kernel_size = kernel_size;

   void *p = entry->prog_data;
   memcpy(p, *prog_data, prog_data_size);
   p += prog_data_size;

   memcpy(p, (*prog_data)->param, param_size);
   ((struct brw_stage_prog_data *)entry->prog_data)->param = p;
   p += param_size;

   memcpy(p, map->surface_to_descriptor,
          map->surface_count * sizeof(struct anv_pipeline_binding));
   map->surface_to_descriptor = p;
   p += map->surface_count * sizeof(struct anv_pipeline_binding);

   memcpy(p, map->sampler_to_descriptor,
          map->sampler_count * sizeof(struct anv_pipeline_binding));
   map->sampler_to_descriptor = p;

   if (sha1) {
      assert(anv_pipeline_cache_search_unlocked(cache, sha1,
                                                NULL, NULL) == NO_KERNEL);

      memcpy(entry->sha1, sha1, sizeof(entry->sha1));
      anv_pipeline_cache_add_entry(cache, entry, state.offset);
   }

   pthread_mutex_unlock(&cache->mutex);

   memcpy(state.map + preamble_size, kernel, kernel_size);

   if (!cache->device->info.has_llc)
      anv_state_clflush(state);

   *prog_data = (const struct brw_stage_prog_data *) entry->prog_data;

   return state.offset + preamble_size;
}

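/* Header layout that the Vulkan spec requires at the start of
 * vkGetPipelineCacheData blobs: the sizes and IDs let a loader reject data
 * written by a different driver or GPU.
 */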
struct cache_header {
   uint32_t header_size;
   uint32_t header_version;
   uint32_t vendor_id;
   uint32_t device_id;
   uint8_t uuid[VK_UUID_SIZE];
};

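/* Populate the cache from application-provided initial data.  The header is
 * validated against this device first; then each serialized entry is
 * replayed through anv_pipeline_cache_upload_kernel(), which copies it into
 * the program stream and re-hashes it.
 */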
static void
anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
                        const void *data, size_t size)
{
   struct anv_device *device = cache->device;
   struct cache_header header;
   uint8_t uuid[VK_UUID_SIZE];

   if (size < sizeof(header))
      return;
   memcpy(&header, data, sizeof(header));
   if (header.header_size < sizeof(header))
      return;
   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
      return;
   if (header.vendor_id != 0x8086)
      return;
   if (header.device_id != device->chipset_id)
      return;
   anv_device_get_cache_uuid(uuid);
   if (memcmp(header.uuid, uuid, VK_UUID_SIZE) != 0)
      return;

   void *end = (void *) data + size;
   void *p = (void *) data + header.header_size;

   while (p < end) {
      struct cache_entry *entry = p;

      void *data = entry->prog_data;

      /* Make a copy of prog_data so that it's mutable */
      uint8_t prog_data_tmp[512];
      assert(entry->prog_data_size <= sizeof(prog_data_tmp));
      memcpy(prog_data_tmp, data, entry->prog_data_size);
      struct brw_stage_prog_data *prog_data = (void *)prog_data_tmp;
      data += entry->prog_data_size;

      prog_data->param = data;
      data += prog_data->nr_params * sizeof(*prog_data->param);

      struct anv_pipeline_binding *surface_to_descriptor = data;
      data += entry->surface_count * sizeof(struct anv_pipeline_binding);
      struct anv_pipeline_binding *sampler_to_descriptor = data;
      data += entry->sampler_count * sizeof(struct anv_pipeline_binding);
      void *kernel = data;

      struct anv_pipeline_bind_map map = {
         .surface_count = entry->surface_count,
         .sampler_count = entry->sampler_count,
         .image_count = entry->image_count,
         .surface_to_descriptor = surface_to_descriptor,
         .sampler_to_descriptor = sampler_to_descriptor
      };

      const struct brw_stage_prog_data *const_prog_data = prog_data;

      anv_pipeline_cache_upload_kernel(cache, entry->sha1,
                                       kernel, entry->kernel_size,
                                       &const_prog_data,
                                       entry->prog_data_size, &map);
      p = kernel + entry->kernel_size;
   }
}

VkResult anv_CreatePipelineCache(
    VkDevice                                    _device,
    const VkPipelineCacheCreateInfo*            pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipelineCache*                            pPipelineCache)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_pipeline_cache *cache;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
   assert(pCreateInfo->flags == 0);

   cache = anv_alloc2(&device->alloc, pAllocator,
                      sizeof(*cache), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cache == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   anv_pipeline_cache_init(cache, device);

   if (pCreateInfo->initialDataSize > 0)
      anv_pipeline_cache_load(cache,
                              pCreateInfo->pInitialData,
                              pCreateInfo->initialDataSize);

   *pPipelineCache = anv_pipeline_cache_to_handle(cache);

   return VK_SUCCESS;
}

void anv_DestroyPipelineCache(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);

   anv_pipeline_cache_finish(cache);

   anv_free2(&device->alloc, pAllocator, cache);
}

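/* Serialize the cache: a cache_header followed by each entry's metadata and
 * kernel code, in hash-table order.  With pData == NULL only the required
 * size is reported; otherwise we write as many whole entries as fit in
 * *pDataSize bytes.
 */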
VkResult anv_GetPipelineCacheData(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    size_t*                                     pDataSize,
    void*                                       pData)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
   struct cache_header *header;

   const size_t size = sizeof(*header) + cache->total_size;

   if (pData == NULL) {
      *pDataSize = size;
      return VK_SUCCESS;
   }

   if (*pDataSize < sizeof(*header)) {
      *pDataSize = 0;
      return VK_INCOMPLETE;
   }

   void *p = pData, *end = pData + *pDataSize;
   header = p;
   header->header_size = sizeof(*header);
   header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
   header->vendor_id = 0x8086;
   header->device_id = device->chipset_id;
   anv_device_get_cache_uuid(header->uuid);
   p += header->header_size;

   struct cache_entry *entry;
   for (uint32_t i = 0; i < cache->table_size; i++) {
      if (cache->hash_table[i] == ~0)
         continue;

      entry = cache->program_stream.block_pool->map + cache->hash_table[i];
      const uint32_t size = entry_size(entry);
      if (end < p + size + entry->kernel_size)
         break;

      memcpy(p, entry, size);
      p += size;

      void *kernel = (void *) entry + align_u32(size, 64);

      memcpy(p, kernel, entry->kernel_size);
      p += entry->kernel_size;
   }

   *pDataSize = p - pData;

   return VK_SUCCESS;
}

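/* Add every entry from src that dst doesn't already have.  No data is
 * copied: all caches on a device share the same instruction block pool, so
 * the stored offsets remain valid in dst.
 */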
static void
anv_pipeline_cache_merge(struct anv_pipeline_cache *dst,
                         struct anv_pipeline_cache *src)
{
   for (uint32_t i = 0; i < src->table_size; i++) {
      const uint32_t offset = src->hash_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         src->program_stream.block_pool->map + offset;

      if (anv_pipeline_cache_search(dst, entry->sha1, NULL, NULL) != NO_KERNEL)
         continue;

      anv_pipeline_cache_add_entry(dst, entry, offset);
   }
}

VkResult anv_MergePipelineCaches(
    VkDevice                                    _device,
    VkPipelineCache                             destCache,
    uint32_t                                    srcCacheCount,
    const VkPipelineCache*                      pSrcCaches)
{
   ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);

   for (uint32_t i = 0; i < srcCacheCount; i++) {
      ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);

      anv_pipeline_cache_merge(dst, src);
   }

   return VK_SUCCESS;
}