/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * This file implements VkQueue, VkFence, and VkSemaphore
 */

#include "anv_private.h"
#include "util/vk_util.h"

#include "genxml/gen7_pack.h"

VkResult
anv_device_execbuf(struct anv_device *device,
                   struct drm_i915_gem_execbuffer2 *execbuf,
                   struct anv_bo **execbuf_bos)
{
   int ret = anv_gem_execbuffer(device, execbuf);
   if (ret != 0) {
      /* We don't know the real error. */
      device->lost = true;
      return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
   }

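   /* On success, the kernel writes the final GPU offset of each exec object
    * back into the array we passed in.  Record those offsets on our anv_bo
    * structs so that future submissions can provide valid presumed offsets
    * and keep benefiting from I915_EXEC_NO_RELOC.
    */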
   struct drm_i915_gem_exec_object2 *objects =
      (void *)(uintptr_t)execbuf->buffers_ptr;
   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
      execbuf_bos[k]->offset = objects[k].offset;

   return VK_SUCCESS;
}

VkResult
anv_device_submit_simple_batch(struct anv_device *device,
                               struct anv_batch *batch)
{
   struct drm_i915_gem_execbuffer2 execbuf;
   struct drm_i915_gem_exec_object2 exec2_objects[1];
   struct anv_bo bo, *exec_bos[1];
   VkResult result = VK_SUCCESS;
   uint32_t size;

   /* Kernel driver requires 8 byte aligned batch length */
   size = align_u32(batch->next - batch->start, 8);
   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
   if (result != VK_SUCCESS)
      return result;

   memcpy(bo.map, batch->start, size);
   if (!device->info.has_llc)
      anv_flush_range(bo.map, size);

   exec_bos[0] = &bo;
   exec2_objects[0].handle = bo.gem_handle;
   exec2_objects[0].relocation_count = 0;
   exec2_objects[0].relocs_ptr = 0;
   exec2_objects[0].alignment = 0;
   exec2_objects[0].offset = bo.offset;
   exec2_objects[0].flags = 0;
   exec2_objects[0].rsvd1 = 0;
   exec2_objects[0].rsvd2 = 0;

   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
   execbuf.buffer_count = 1;
   execbuf.batch_start_offset = 0;
   execbuf.batch_len = size;
   execbuf.cliprects_ptr = 0;
   execbuf.num_cliprects = 0;
   execbuf.DR1 = 0;
   execbuf.DR4 = 0;

   execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   execbuf.rsvd1 = device->context_id;
   execbuf.rsvd2 = 0;

   result = anv_device_execbuf(device, &execbuf, exec_bos);
   if (result != VK_SUCCESS)
      goto fail;

   result = anv_device_wait(device, &bo, INT64_MAX);

fail:
   anv_bo_pool_free(&device->batch_bo_pool, &bo);

   return result;
}
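
/* A minimal sketch (illustrative only, not used by the driver) of how a
 * caller might feed this helper a batch built in temporary storage.  The
 * buffer size is arbitrary and only the struct anv_batch fields this file
 * itself touches are initialized:
 *
 *    uint32_t cmds[64];
 *    struct anv_batch batch = {
 *       .start = cmds,
 *       .next  = cmds,
 *       .end   = cmds + ARRAY_SIZE(cmds),
 *    };
 *    anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
 *
 *    VkResult result = anv_device_submit_simple_batch(device, &batch);
 */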

VkResult anv_QueueSubmit(
    VkQueue                                     _queue,
    uint32_t                                    submitCount,
    const VkSubmitInfo*                         pSubmits,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);
   struct anv_device *device = queue->device;

   /* Query for device status prior to submitting.  Technically, we don't
    * need to do this.  However, if we have a client that's submitting piles
    * of garbage, we would rather break as early as possible so that any GPU
    * hangs stay contained.  If we don't check here, we'll either be waiting
    * for the kernel to kick us or we'll have to wait until the client waits
    * on a fence before we actually know whether or not we've hung.
    */
   VkResult result = anv_device_query_status(device);
   if (result != VK_SUCCESS)
      return result;

   /* We lock around QueueSubmit for three main reasons:
    *
    * 1) When a block pool is resized, we create a new gem handle with a
    *    different size and, in the case of surface states, possibly a
    *    different center offset but we re-use the same anv_bo struct when
    *    we do so.  If this happens in the middle of setting up an execbuf,
    *    we could end up with our list of BOs out of sync with our list of
    *    gem handles.
    *
    * 2) The algorithm we use for building the list of unique buffers isn't
    *    thread-safe.  While the client is supposed to synchronize around
    *    QueueSubmit, this would be extremely difficult to debug if it ever
    *    came up in the wild due to a broken app.  It's better to play it
    *    safe and just lock around QueueSubmit.
    *
    * 3) The anv_cmd_buffer_execbuf function may perform relocations in
    *    userspace.  Due to the fact that the surface state buffer is shared
    *    between batches, we can't afford to have that happen from multiple
    *    threads at the same time.  Even though the user is supposed to
    *    ensure this doesn't happen, we play it safe as in (2) above.
    *
    * Since the only other operations that take the device lock, such as
    * block pool resizes, happen only rarely, the lock is almost never
    * contended and taking it is not an expensive operation in this case.
    */
   pthread_mutex_lock(&device->mutex);

   for (uint32_t i = 0; i < submitCount; i++) {
      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
                         pSubmits[i].pCommandBuffers[j]);
         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
         assert(!anv_batch_has_error(&cmd_buffer->batch));

         result = anv_cmd_buffer_execbuf(device, cmd_buffer);
         if (result != VK_SUCCESS)
            goto out;
      }
   }

   if (fence) {
      struct anv_bo *fence_bo = &fence->bo;
      result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
      if (result != VK_SUCCESS)
         goto out;

      /* Update the fence and wake up any waiters */
      assert(fence->state == ANV_FENCE_STATE_RESET);
      fence->state = ANV_FENCE_STATE_SUBMITTED;
      pthread_cond_broadcast(&device->queue_submit);
   }

out:
   if (result != VK_SUCCESS) {
      /* In the case that something has gone wrong we may end up with an
       * inconsistent state from which it may not be trivial to recover.
       * For example, we might have computed address relocations and
       * any future attempt to re-submit this job will need to know about
       * this and avoid computing relocation addresses again.
       *
       * To avoid this sort of issue, we assume that if something was
       * wrong during submission we must already be in a really bad situation
       * anyway (such as being out of memory) and return
       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
       * submit the same job again to this device.
       */
      result = VK_ERROR_DEVICE_LOST;
      device->lost = true;

      /* If we return VK_ERROR_DEVICE_LOST here, we need to ensure that
       * vkWaitForFences() and vkGetFenceStatus() return a valid result
       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
       * Setting the fence status to SIGNALED ensures this will happen in
       * any case.
       */
      if (fence)
         fence->state = ANV_FENCE_STATE_SIGNALED;
   }

   pthread_mutex_unlock(&device->mutex);

   return result;
}
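
/* For reference, a minimal client-side sketch (application code, not part of
 * the driver) of the submit-and-wait pattern this entrypoint implements.
 * The command buffer and fence are assumed to have been created and recorded
 * by the application beforehand:
 *
 *    VkSubmitInfo submit_info = {
 *       .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
 *       .commandBufferCount = 1,
 *       .pCommandBuffers = &cmd_buffer,
 *    };
 *    VkResult result = vkQueueSubmit(queue, 1, &submit_info, fence);
 *    if (result == VK_SUCCESS)
 *       result = vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
 *
 * A VK_ERROR_DEVICE_LOST from either call would typically correspond to one
 * of the device->lost paths above.
 */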

VkResult anv_QueueWaitIdle(
    VkQueue                                     _queue)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);

   return anv_DeviceWaitIdle(anv_device_to_handle(queue->device));
}

VkResult anv_CreateFence(
    VkDevice                                    _device,
    const VkFenceCreateInfo*                    pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkFence*                                    pFence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_bo fence_bo;
   struct anv_fence *fence;
   struct anv_batch batch;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);

   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
   if (result != VK_SUCCESS)
      return result;

   /* Fences are small.  Just store the CPU data structure in the BO. */
   fence = fence_bo.map;
   fence->bo = fence_bo;

   /* Place the batch after the CPU data but on its own cache line. */
   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
   batch.next = batch.start = fence->bo.map + batch_offset;
   batch.end = fence->bo.map + fence->bo.size;
   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);

   if (!device->info.has_llc) {
      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
      assert(batch.next - batch.start <= CACHELINE_SIZE);
      __builtin_ia32_mfence();
      __builtin_ia32_clflush(batch.start);
   }

   fence->exec2_objects[0].handle = fence->bo.gem_handle;
   fence->exec2_objects[0].relocation_count = 0;
   fence->exec2_objects[0].relocs_ptr = 0;
   fence->exec2_objects[0].alignment = 0;
   fence->exec2_objects[0].offset = fence->bo.offset;
   fence->exec2_objects[0].flags = 0;
   fence->exec2_objects[0].rsvd1 = 0;
   fence->exec2_objects[0].rsvd2 = 0;

   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
   fence->execbuf.buffer_count = 1;
   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
   fence->execbuf.batch_len = batch.next - batch.start;
   fence->execbuf.cliprects_ptr = 0;
   fence->execbuf.num_cliprects = 0;
   fence->execbuf.DR1 = 0;
   fence->execbuf.DR4 = 0;

   fence->execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   fence->execbuf.rsvd1 = device->context_id;
   fence->execbuf.rsvd2 = 0;

   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
      fence->state = ANV_FENCE_STATE_SIGNALED;
   } else {
      fence->state = ANV_FENCE_STATE_RESET;
   }

   *pFence = anv_fence_to_handle(fence);

   return VK_SUCCESS;
}
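
/* A small client-side sketch (application code) of the two initial states
 * handled by the VK_FENCE_CREATE_SIGNALED_BIT check above:
 *
 *    VkFenceCreateInfo fence_info = {
 *       .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
 *       .flags = 0,                              // starts out RESET
 *    };
 *    VkFence fence;
 *    vkCreateFence(device, &fence_info, NULL, &fence);
 *
 *    fence_info.flags = VK_FENCE_CREATE_SIGNALED_BIT;  // starts out SIGNALED
 *    VkFence signaled_fence;
 *    vkCreateFence(device, &fence_info, NULL, &signaled_fence);
 */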

void anv_DestroyFence(
    VkDevice                                    _device,
    VkFence                                     _fence,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (!fence)
      return;

   assert(fence->bo.map == fence);
   anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
}

VkResult anv_ResetFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences)
{
   for (uint32_t i = 0; i < fenceCount; i++) {
      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
      fence->state = ANV_FENCE_STATE_RESET;
   }

   return VK_SUCCESS;
}

VkResult anv_GetFenceStatus(
    VkDevice                                    _device,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   switch (fence->state) {
   case ANV_FENCE_STATE_RESET:
      /* If it hasn't even been sent off to the GPU yet, it's not ready */
      return VK_NOT_READY;

   case ANV_FENCE_STATE_SIGNALED:
      /* It's been signaled, return success */
      return VK_SUCCESS;

   case ANV_FENCE_STATE_SUBMITTED: {
      VkResult result = anv_device_bo_busy(device, &fence->bo);
      if (result == VK_SUCCESS) {
         fence->state = ANV_FENCE_STATE_SIGNALED;
         return VK_SUCCESS;
      } else {
         return result;
      }
   }
   default:
      unreachable("Invalid fence status");
   }
}
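
/* From the application's point of view, the states above surface through
 * vkGetFenceStatus() as VK_NOT_READY or VK_SUCCESS.  A sketch of a simple
 * polling loop (real code should usually prefer vkWaitForFences() over
 * busy-polling):
 *
 *    VkResult status;
 *    do {
 *       status = vkGetFenceStatus(device, fence);
 *    } while (status == VK_NOT_READY);
 *    // status is now VK_SUCCESS or an error such as VK_ERROR_DEVICE_LOST
 */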

#define NSEC_PER_SEC 1000000000
#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)
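
/* INT_TYPE_MAX(type) evaluates to the largest value a signed integer with
 * the same size as 'type' can hold.  For example, with a 32-bit time_t,
 * sizeof(type) * 8 - 1 == 31, so the macro yields (1ull << 31) - 1 ==
 * 0x7fffffff (INT32_MAX); with a 64-bit time_t it yields INT64_MAX.  It is
 * used below to clamp tv_sec so the timespec cannot overflow.
 */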

VkResult anv_WaitForFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences,
    VkBool32                                    waitAll,
    uint64_t                                    _timeout)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   int ret;

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
    * to block indefinitely on timeouts <= 0.  Unfortunately, this was broken
    * for a couple of kernel releases.  Since there's no way to know
    * whether or not the kernel we're using is one of the broken ones, the
    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
    * maximum timeout from 584 years to 292 years, which is likely not a big
    * deal.
    */
   int64_t timeout = MIN2(_timeout, INT64_MAX);

   VkResult result = VK_SUCCESS;
   uint32_t pending_fences = fenceCount;
   while (pending_fences) {
      pending_fences = 0;
      bool signaled_fences = false;
      for (uint32_t i = 0; i < fenceCount; i++) {
         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
         switch (fence->state) {
         case ANV_FENCE_STATE_RESET:
            /* This fence hasn't been submitted yet, we'll catch it the next
             * time around.  Yes, this may mean we dead-loop but, short of
             * lots of locking and a condition variable, there's not much that
             * we can do about that.
             */
            pending_fences++;
            continue;

         case ANV_FENCE_STATE_SIGNALED:
            /* This fence is not pending.  If waitAll isn't set, we can return
             * early.  Otherwise, we have to keep going.
             */
            if (!waitAll) {
               result = VK_SUCCESS;
               goto done;
            }
            continue;

         case ANV_FENCE_STATE_SUBMITTED:
            /* These are the fences we really care about.  Go ahead and wait
             * on them until we hit a timeout.
             */
            result = anv_device_wait(device, &fence->bo, timeout);
            switch (result) {
            case VK_SUCCESS:
               fence->state = ANV_FENCE_STATE_SIGNALED;
               signaled_fences = true;
               if (!waitAll)
                  goto done;
               break;

            case VK_TIMEOUT:
               goto done;

            default:
               return result;
            }
         }
      }

      if (pending_fences && !signaled_fences) {
         /* If we've hit this then someone decided to vkWaitForFences before
          * they've actually submitted any of them to a queue.  This is a
          * fairly pessimal case, so it's ok to lock here and use a standard
          * pthreads condition variable.
          */
         pthread_mutex_lock(&device->mutex);

         /* It's possible that some of the fences have changed state since
          * the last time we checked.  Now that we have the lock, check for
          * pending fences again and don't wait if anything has changed.
          */
         uint32_t now_pending_fences = 0;
         for (uint32_t i = 0; i < fenceCount; i++) {
            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
            if (fence->state == ANV_FENCE_STATE_RESET)
               now_pending_fences++;
         }
         assert(now_pending_fences <= pending_fences);

         if (now_pending_fences == pending_fences) {
            struct timespec before;
            clock_gettime(CLOCK_MONOTONIC, &before);

            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
                               (timeout / NSEC_PER_SEC);
            abs_nsec %= NSEC_PER_SEC;

            /* Avoid roll-over in tv_sec on 32-bit systems if the
             * user-provided timeout is UINT64_MAX.
             */
            struct timespec abstime;
            abstime.tv_nsec = abs_nsec;
            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));

            ret = pthread_cond_timedwait(&device->queue_submit,
                                         &device->mutex, &abstime);
            assert(ret != EINVAL);

            struct timespec after;
            clock_gettime(CLOCK_MONOTONIC, &after);
            uint64_t time_elapsed =
               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);

            if (time_elapsed >= timeout) {
               pthread_mutex_unlock(&device->mutex);
               result = VK_TIMEOUT;
               goto done;
            }

            timeout -= time_elapsed;
         }

         pthread_mutex_unlock(&device->mutex);
      }
   }

done:
   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   return result;
}
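
/* The relative-to-absolute timeout conversion above, pulled out into a
 * stand-alone sketch for clarity.  This helper is hypothetical (it is not
 * part of the driver) and assumes, as the code above does, that the
 * condition variable is waited on against CLOCK_MONOTONIC:
 *
 *    static struct timespec
 *    relative_to_abstime(int64_t timeout_ns)
 *    {
 *       struct timespec now, abstime;
 *       clock_gettime(CLOCK_MONOTONIC, &now);
 *
 *       uint32_t abs_nsec = now.tv_nsec + timeout_ns % NSEC_PER_SEC;
 *       uint64_t abs_sec = now.tv_sec + (abs_nsec / NSEC_PER_SEC) +
 *                          (timeout_ns / NSEC_PER_SEC);
 *
 *       abstime.tv_nsec = abs_nsec % NSEC_PER_SEC;
 *       abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));
 *       return abstime;
 *    }
 */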

// Queue semaphore functions

VkResult anv_CreateSemaphore(
    VkDevice                                    _device,
    const VkSemaphoreCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkSemaphore*                                pSemaphore)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_semaphore *semaphore;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);

   semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (semaphore == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   /* The DRM execbuffer ioctl always executes in order, even between
    * different rings.  As such, a dummy no-op semaphore is a perfectly
    * valid implementation.
    */
   semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DUMMY;
   semaphore->temporary.type = ANV_SEMAPHORE_TYPE_NONE;

   *pSemaphore = anv_semaphore_to_handle(semaphore);

   return VK_SUCCESS;
}
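
/* A client-side sketch (application code) of the usual way a semaphore is
 * used: ordering two submits on the same queue.  Handles are assumed to have
 * been created by the application:
 *
 *    VkPipelineStageFlags wait_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 *
 *    VkSubmitInfo first = {
 *       .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
 *       .commandBufferCount = 1,
 *       .pCommandBuffers = &cmd_buffer_a,
 *       .signalSemaphoreCount = 1,
 *       .pSignalSemaphores = &semaphore,
 *    };
 *    VkSubmitInfo second = {
 *       .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
 *       .waitSemaphoreCount = 1,
 *       .pWaitSemaphores = &semaphore,
 *       .pWaitDstStageMask = &wait_stage,
 *       .commandBufferCount = 1,
 *       .pCommandBuffers = &cmd_buffer_b,
 *    };
 *    vkQueueSubmit(queue, 1, &first, VK_NULL_HANDLE);
 *    vkQueueSubmit(queue, 1, &second, VK_NULL_HANDLE);
 *
 * Because the kernel executes the two execbufs in order, the semaphore wait
 * is already satisfied by submission order and the dummy implementation
 * above is sufficient.
 */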

void anv_DestroySemaphore(
    VkDevice                                    _device,
    VkSemaphore                                 _semaphore,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);

   if (semaphore == NULL)
      return;

   vk_free2(&device->alloc, pAllocator, semaphore);
}