anv/pipeline: Add support for caching the push constant map
src/intel/vulkan/anv_pipeline_cache.c (mesa.git)
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/mesa-sha1.h"
#include "util/debug.h"
#include "anv_private.h"

/* Remaining work:
 *
 * - Compact the binding table layout so it's tight and not dependent on the
 *   descriptor set layout.
 *
 * - Review the prog_data struct for size and cacheability: struct
 *   brw_stage_prog_data has a binding_table that uses a lot of uint32_t for
 *   8-bit quantities, etc.; param, pull_param, and image_params are
 *   pointers, and we just need the compaction map.  Use bit fields for all
 *   bools, e.g. dual_src_blend.
 */
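
/* For instance (an illustrative sketch only, not the current struct
 * definition), a flag such as dual_src_blend that is stored as a plain bool
 * today could shrink to a single-bit field:
 *
 *    bool dual_src_blend:1;
 */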

void
anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
                        struct anv_device *device)
{
   cache->device = device;
   anv_state_stream_init(&cache->program_stream,
                         &device->instruction_block_pool);
   pthread_mutex_init(&cache->mutex, NULL);

   cache->kernel_count = 0;
   cache->total_size = 0;
   cache->table_size = 1024;
   const size_t byte_size = cache->table_size * sizeof(cache->hash_table[0]);
   cache->hash_table = malloc(byte_size);

   /* We don't consider allocation failure fatal; we just start with a
    * 0-sized cache.
    */
   if (cache->hash_table == NULL ||
       !env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true))
      cache->table_size = 0;
   else
      memset(cache->hash_table, 0xff, byte_size);
}
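
/* Implementation note: the cache keys shaders by a SHA-1 of their inputs.
 * The hash table is open-addressed with linear probing; each slot holds the
 * offset of a cache_entry in the program stream, and empty slots are marked
 * with ~0 (hence the 0xff memset above).  Setting
 * ANV_ENABLE_PIPELINE_CACHE=false leaves table_size at 0, so no entries are
 * ever added.
 */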

void
anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
{
   anv_state_stream_finish(&cache->program_stream);
   pthread_mutex_destroy(&cache->mutex);
   free(cache->hash_table);
}

struct cache_entry {
   unsigned char sha1[20];
   uint32_t prog_data_size;
   uint32_t kernel_size;
   uint32_t surface_count;
   uint32_t sampler_count;
   uint32_t image_count;

   char prog_data[0];

   /* The kernel follows prog_data at the next 64-byte-aligned address. */
};
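
/* A cache_entry and its trailing data are laid out in the program stream as
 * follows (derived from entry_size() and anv_pipeline_cache_upload_kernel()):
 *
 *    struct cache_entry          fixed-size header above
 *    prog_data                   prog_data_size bytes
 *    prog_data->param[]          nr_params * sizeof(*param) bytes
 *    surface_to_descriptor[]     surface_count * sizeof(struct anv_pipeline_binding)
 *    sampler_to_descriptor[]     sampler_count * sizeof(struct anv_pipeline_binding)
 *    <padding to a 64-byte boundary>
 *    kernel                      kernel_size bytes
 */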

static uint32_t
entry_size(struct cache_entry *entry)
{
   /* This returns the number of bytes needed to serialize an entry, which
    * doesn't include the alignment padding bytes.
    */

   struct brw_stage_prog_data *prog_data = (void *)entry->prog_data;
   const uint32_t param_size =
      prog_data->nr_params * sizeof(*prog_data->param);

   const uint32_t map_size =
      entry->surface_count * sizeof(struct anv_pipeline_binding) +
      entry->sampler_count * sizeof(struct anv_pipeline_binding);

   return sizeof(*entry) + entry->prog_data_size + param_size + map_size;
}
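
/* In the program stream the kernel therefore starts at
 *
 *    entry_offset + align_u32(entry_size(entry), 64)
 *
 * whereas cache->total_size accumulates entry_size(entry) + kernel_size
 * without the padding, matching the tightly packed layout that
 * anv_GetPipelineCacheData() serializes.
 */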

void
anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
                struct anv_shader_module *module,
                const char *entrypoint,
                const VkSpecializationInfo *spec_info)
{
   struct mesa_sha1 *ctx;

   ctx = _mesa_sha1_init();
   _mesa_sha1_update(ctx, key, key_size);
   _mesa_sha1_update(ctx, module->sha1, sizeof(module->sha1));
   _mesa_sha1_update(ctx, entrypoint, strlen(entrypoint));
   /* hash in shader stage, pipeline layout? */
   if (spec_info) {
      _mesa_sha1_update(ctx, spec_info->pMapEntries,
                        spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]);
      _mesa_sha1_update(ctx, spec_info->pData, spec_info->dataSize);
   }
   _mesa_sha1_final(ctx, hash);
}
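
/* A minimal sketch of how a compile path might use the hash; the key struct,
 * module, and spec_info here stand in for whatever the caller hashes.
 *
 *    unsigned char sha1[20];
 *    anv_hash_shader(sha1, &key, sizeof(key), module, "main", spec_info);
 *
 *    const struct brw_stage_prog_data *prog_data;
 *    struct anv_pipeline_bind_map map;
 *    uint32_t kernel = anv_pipeline_cache_search(cache, sha1, &prog_data, &map);
 *    if (kernel == NO_KERNEL) {
 *       // compile the shader, then call anv_pipeline_cache_upload_kernel()
 *    }
 */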

static uint32_t
anv_pipeline_cache_search_unlocked(struct anv_pipeline_cache *cache,
                                   const unsigned char *sha1,
                                   const struct brw_stage_prog_data **prog_data,
                                   struct anv_pipeline_bind_map *map)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) sha1);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      const uint32_t offset = cache->hash_table[index];

      if (offset == ~0)
         return NO_KERNEL;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      if (memcmp(entry->sha1, sha1, sizeof(entry->sha1)) == 0) {
         if (prog_data) {
            assert(map);
            void *p = entry->prog_data;
            *prog_data = p;
            p += entry->prog_data_size;
            p += (*prog_data)->nr_params * sizeof(*(*prog_data)->param);
            map->surface_count = entry->surface_count;
            map->sampler_count = entry->sampler_count;
            map->image_count = entry->image_count;
            map->surface_to_descriptor = p;
            p += map->surface_count * sizeof(struct anv_pipeline_binding);
            map->sampler_to_descriptor = p;
         }

         return offset + align_u32(entry_size(entry), 64);
      }
   }

   unreachable("hash table should never be full");
}

uint32_t
anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
                          const unsigned char *sha1,
                          const struct brw_stage_prog_data **prog_data,
                          struct anv_pipeline_bind_map *map)
{
   uint32_t kernel;

   pthread_mutex_lock(&cache->mutex);

   kernel = anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);

   pthread_mutex_unlock(&cache->mutex);

   return kernel;
}

static void
anv_pipeline_cache_set_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   const uint32_t mask = cache->table_size - 1;
   const uint32_t start = (*(uint32_t *) entry->sha1);

   /* We'll always be able to insert when we get here. */
   assert(cache->kernel_count < cache->table_size / 2);

   for (uint32_t i = 0; i < cache->table_size; i++) {
      const uint32_t index = (start + i) & mask;
      if (cache->hash_table[index] == ~0) {
         cache->hash_table[index] = entry_offset;
         break;
      }
   }

   cache->total_size += entry_size(entry) + entry->kernel_size;
   cache->kernel_count++;
}

static VkResult
anv_pipeline_cache_grow(struct anv_pipeline_cache *cache)
{
   const uint32_t table_size = cache->table_size * 2;
   const uint32_t old_table_size = cache->table_size;
   const size_t byte_size = table_size * sizeof(cache->hash_table[0]);
   uint32_t *table;
   uint32_t *old_table = cache->hash_table;

   table = malloc(byte_size);
   if (table == NULL)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   cache->hash_table = table;
   cache->table_size = table_size;
   cache->kernel_count = 0;
   cache->total_size = 0;
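   /* Both counters are rebuilt below: anv_pipeline_cache_set_entry()
    * re-increments kernel_count and total_size for every entry that gets
    * re-inserted into the new table.
    */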

   memset(cache->hash_table, 0xff, byte_size);
   for (uint32_t i = 0; i < old_table_size; i++) {
      const uint32_t offset = old_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         cache->program_stream.block_pool->map + offset;
      anv_pipeline_cache_set_entry(cache, entry, offset);
   }

   free(old_table);

   return VK_SUCCESS;
}

static void
anv_pipeline_cache_add_entry(struct anv_pipeline_cache *cache,
                             struct cache_entry *entry, uint32_t entry_offset)
{
   if (cache->kernel_count == cache->table_size / 2)
      anv_pipeline_cache_grow(cache);

   /* Failing to grow the hash table isn't fatal, but it may mean we don't
    * have enough space to add this new kernel.  Only add it if there's room.
    */
   if (cache->kernel_count < cache->table_size / 2)
      anv_pipeline_cache_set_entry(cache, entry, entry_offset);
}

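/* Copy a compiled kernel along with its prog_data, params, and bind map into
 * the cache's program stream and, if sha1 is non-NULL, register it in the
 * hash table (unless another thread already uploaded the same shader).
 * Returns the offset of the kernel in the instruction block pool; on return,
 * *prog_data and the pointers in *map refer to the cached copies.
 */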
uint32_t
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
                                 const unsigned char *sha1,
                                 const void *kernel, size_t kernel_size,
                                 const struct brw_stage_prog_data **prog_data,
                                 size_t prog_data_size,
                                 struct anv_pipeline_bind_map *map)
{
   pthread_mutex_lock(&cache->mutex);

   /* Before uploading, check again that another thread didn't upload this
    * shader while we were compiling it.
    */
   if (sha1) {
      uint32_t cached_kernel =
         anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);
      if (cached_kernel != NO_KERNEL) {
         pthread_mutex_unlock(&cache->mutex);
         return cached_kernel;
      }
   }

   struct cache_entry *entry;

   assert((*prog_data)->nr_pull_params == 0);
   assert((*prog_data)->nr_image_params == 0);

   const uint32_t param_size =
      (*prog_data)->nr_params * sizeof(*(*prog_data)->param);

   const uint32_t map_size =
      map->surface_count * sizeof(struct anv_pipeline_binding) +
      map->sampler_count * sizeof(struct anv_pipeline_binding);

   const uint32_t preamble_size =
      align_u32(sizeof(*entry) + prog_data_size + param_size + map_size, 64);

   const uint32_t size = preamble_size + kernel_size;

   assert(size < cache->program_stream.block_pool->block_size);
   const struct anv_state state =
      anv_state_stream_alloc(&cache->program_stream, size, 64);

   entry = state.map;
   entry->prog_data_size = prog_data_size;
   entry->surface_count = map->surface_count;
   entry->sampler_count = map->sampler_count;
   entry->image_count = map->image_count;
   entry->kernel_size = kernel_size;

   void *p = entry->prog_data;
   memcpy(p, *prog_data, prog_data_size);
   p += prog_data_size;

   memcpy(p, (*prog_data)->param, param_size);
   ((struct brw_stage_prog_data *)entry->prog_data)->param = p;
   p += param_size;

   memcpy(p, map->surface_to_descriptor,
          map->surface_count * sizeof(struct anv_pipeline_binding));
   map->surface_to_descriptor = p;
   p += map->surface_count * sizeof(struct anv_pipeline_binding);

   memcpy(p, map->sampler_to_descriptor,
          map->sampler_count * sizeof(struct anv_pipeline_binding));
   map->sampler_to_descriptor = p;

   if (sha1) {
      assert(anv_pipeline_cache_search_unlocked(cache, sha1,
                                                NULL, NULL) == NO_KERNEL);

      memcpy(entry->sha1, sha1, sizeof(entry->sha1));
      anv_pipeline_cache_add_entry(cache, entry, state.offset);
   }

   pthread_mutex_unlock(&cache->mutex);

   memcpy(state.map + preamble_size, kernel, kernel_size);

   if (!cache->device->info.has_llc)
      anv_state_clflush(state);

   *prog_data = (const struct brw_stage_prog_data *) entry->prog_data;

   return state.offset + preamble_size;
}

struct cache_header {
   uint32_t header_size;
   uint32_t header_version;
   uint32_t vendor_id;
   uint32_t device_id;
   uint8_t  uuid[VK_UUID_SIZE];
};

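/* The blob accepted here is the format anv_GetPipelineCacheData() produces:
 * a struct cache_header (the header layout Vulkan requires for pipeline
 * cache data), followed by tightly packed entries.  Each entry is
 * entry_size(entry) bytes of metadata immediately followed by kernel_size
 * bytes of shader binary, with no alignment padding in between.
 */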
static void
anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
                        const void *data, size_t size)
{
   struct anv_device *device = cache->device;
   struct cache_header header;
   uint8_t uuid[VK_UUID_SIZE];

   if (size < sizeof(header))
      return;
   memcpy(&header, data, sizeof(header));
   if (header.header_size < sizeof(header))
      return;
   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
      return;
   if (header.vendor_id != 0x8086)
      return;
   if (header.device_id != device->chipset_id)
      return;
   anv_device_get_cache_uuid(uuid);
   if (memcmp(header.uuid, uuid, VK_UUID_SIZE) != 0)
      return;

   void *end = (void *) data + size;
   void *p = (void *) data + header.header_size;

   while (p < end) {
      struct cache_entry *entry = p;

      void *data = entry->prog_data;

      /* Make a copy of prog_data so that it's mutable */
      uint8_t prog_data_tmp[512];
      assert(entry->prog_data_size <= sizeof(prog_data_tmp));
      memcpy(prog_data_tmp, data, entry->prog_data_size);
      struct brw_stage_prog_data *prog_data = (void *)prog_data_tmp;
      data += entry->prog_data_size;

      prog_data->param = data;
      data += prog_data->nr_params * sizeof(*prog_data->param);

      struct anv_pipeline_binding *surface_to_descriptor = data;
      data += entry->surface_count * sizeof(struct anv_pipeline_binding);
      struct anv_pipeline_binding *sampler_to_descriptor = data;
      data += entry->sampler_count * sizeof(struct anv_pipeline_binding);
      void *kernel = data;

      struct anv_pipeline_bind_map map = {
         .surface_count = entry->surface_count,
         .sampler_count = entry->sampler_count,
         .image_count = entry->image_count,
         .surface_to_descriptor = surface_to_descriptor,
         .sampler_to_descriptor = sampler_to_descriptor
      };

      const struct brw_stage_prog_data *const_prog_data = prog_data;

      anv_pipeline_cache_upload_kernel(cache, entry->sha1,
                                       kernel, entry->kernel_size,
                                       &const_prog_data,
                                       entry->prog_data_size, &map);
      p = kernel + entry->kernel_size;
   }
}

VkResult anv_CreatePipelineCache(
    VkDevice                                    _device,
    const VkPipelineCacheCreateInfo*            pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipelineCache*                            pPipelineCache)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_pipeline_cache *cache;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
   assert(pCreateInfo->flags == 0);

   cache = anv_alloc2(&device->alloc, pAllocator,
                      sizeof(*cache), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cache == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   anv_pipeline_cache_init(cache, device);

   if (pCreateInfo->initialDataSize > 0)
      anv_pipeline_cache_load(cache,
                              pCreateInfo->pInitialData,
                              pCreateInfo->initialDataSize);

   *pPipelineCache = anv_pipeline_cache_to_handle(cache);

   return VK_SUCCESS;
}

void anv_DestroyPipelineCache(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);

   anv_pipeline_cache_finish(cache);

   anv_free2(&device->alloc, pAllocator, cache);
}

VkResult anv_GetPipelineCacheData(
    VkDevice                                    _device,
    VkPipelineCache                             _cache,
    size_t*                                     pDataSize,
    void*                                       pData)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
   struct cache_header *header;

   const size_t size = sizeof(*header) + cache->total_size;

   if (pData == NULL) {
      *pDataSize = size;
      return VK_SUCCESS;
   }

   if (*pDataSize < sizeof(*header)) {
      *pDataSize = 0;
      return VK_INCOMPLETE;
   }

   void *p = pData, *end = pData + *pDataSize;
   header = p;
   header->header_size = sizeof(*header);
   header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
   header->vendor_id = 0x8086;
   header->device_id = device->chipset_id;
   anv_device_get_cache_uuid(header->uuid);
   p += header->header_size;

   struct cache_entry *entry;
   for (uint32_t i = 0; i < cache->table_size; i++) {
      if (cache->hash_table[i] == ~0)
         continue;

      entry = cache->program_stream.block_pool->map + cache->hash_table[i];
      const uint32_t size = entry_size(entry);
      if (end < p + size + entry->kernel_size)
         break;

      memcpy(p, entry, size);
      p += size;

      void *kernel = (void *) entry + align_u32(size, 64);

      memcpy(p, kernel, entry->kernel_size);
      p += entry->kernel_size;
   }

   *pDataSize = p - pData;

   return VK_SUCCESS;
}
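
/* A sketch of the standard two-call pattern an application would use with
 * this entry point (all names local to the sketch):
 *
 *    size_t size = 0;
 *    vkGetPipelineCacheData(device, pipeline_cache, &size, NULL);
 *    void *blob = malloc(size);
 *    vkGetPipelineCacheData(device, pipeline_cache, &size, blob);
 *    // ... save blob to disk, feed it back as pInitialData next run ...
 */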

static void
anv_pipeline_cache_merge(struct anv_pipeline_cache *dst,
                         struct anv_pipeline_cache *src)
{
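   /* Every anv_pipeline_cache on a device allocates its entries from the
    * device's shared instruction block pool (see anv_pipeline_cache_init),
    * so an entry can be shared between caches by copying just its offset
    * into the destination hash table; no data needs to be copied.
    */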
   for (uint32_t i = 0; i < src->table_size; i++) {
      const uint32_t offset = src->hash_table[i];
      if (offset == ~0)
         continue;

      struct cache_entry *entry =
         src->program_stream.block_pool->map + offset;

      if (anv_pipeline_cache_search(dst, entry->sha1, NULL, NULL) != NO_KERNEL)
         continue;

      anv_pipeline_cache_add_entry(dst, entry, offset);
   }
}

VkResult anv_MergePipelineCaches(
    VkDevice                                    _device,
    VkPipelineCache                             destCache,
    uint32_t                                    srcCacheCount,
    const VkPipelineCache*                      pSrcCaches)
{
   ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);

   for (uint32_t i = 0; i < srcCacheCount; i++) {
      ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);

      anv_pipeline_cache_merge(dst, src);
   }

   return VK_SUCCESS;
}