/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * This file implements VkQueue, VkFence, and VkSemaphore
 */

#include "anv_private.h"
#include "util/vk_util.h"

#include "genxml/gen7_pack.h"

VkResult
anv_device_execbuf(struct anv_device *device,
                   struct drm_i915_gem_execbuffer2 *execbuf,
                   struct anv_bo **execbuf_bos)
{
   int ret = anv_gem_execbuffer(device, execbuf);
   if (ret != 0) {
      /* We don't know the real error. */
      return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
   }
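
   /* On success the kernel writes the final GPU offset of every buffer back
    * into the exec object list; mirror it into our anv_bo structs so later
    * submissions can pass up-to-date presumed offsets.
    */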
   struct drm_i915_gem_exec_object2 *objects =
      (void *)(uintptr_t)execbuf->buffers_ptr;
   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
      execbuf_bos[k]->offset = objects[k].offset;

   return VK_SUCCESS;
}

VkResult
anv_device_submit_simple_batch(struct anv_device *device,
                               struct anv_batch *batch)
{
   struct drm_i915_gem_execbuffer2 execbuf;
   struct drm_i915_gem_exec_object2 exec2_objects[1];
   struct anv_bo bo, *exec_bos[1];
   VkResult result = VK_SUCCESS;
   uint32_t size;

   /* Kernel driver requires 8 byte aligned batch length */
   size = align_u32(batch->next - batch->start, 8);
   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
   if (result != VK_SUCCESS)
      return result;
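
   /* Copy the batch into the freshly allocated BO.  Without LLC, the CPU
    * cache is not coherent with the GPU, so the range has to be flushed
    * explicitly.
    */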
   memcpy(bo.map, batch->start, size);
   if (!device->info.has_llc)
      anv_flush_range(bo.map, size);

   exec_bos[0] = &bo;
   exec2_objects[0].handle = bo.gem_handle;
   exec2_objects[0].relocation_count = 0;
   exec2_objects[0].relocs_ptr = 0;
   exec2_objects[0].alignment = 0;
   exec2_objects[0].offset = bo.offset;
   exec2_objects[0].flags = 0;
   exec2_objects[0].rsvd1 = 0;
   exec2_objects[0].rsvd2 = 0;

   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
   execbuf.buffer_count = 1;
   execbuf.batch_start_offset = 0;
   execbuf.batch_len = size;
   execbuf.cliprects_ptr = 0;
   execbuf.num_cliprects = 0;
   execbuf.DR1 = 0;
   execbuf.DR4 = 0;
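
   /* The offsets above are already valid, so the kernel does not need to
    * perform any relocations; submit the batch to the render ring.
    */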
   execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   execbuf.rsvd1 = device->context_id;
   execbuf.rsvd2 = 0;

   result = anv_device_execbuf(device, &execbuf, exec_bos);
   if (result != VK_SUCCESS)
      goto fail;

   result = anv_device_wait(device, &bo, INT64_MAX);

fail:
   anv_bo_pool_free(&device->batch_bo_pool, &bo);

   return result;
}

VkResult anv_QueueSubmit(
    VkQueue                                     _queue,
    uint32_t                                    submitCount,
    const VkSubmitInfo*                         pSubmits,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);
   struct anv_device *device = queue->device;

   /* Query for device status prior to submitting.  Technically, we don't need
    * to do this.  However, if we have a client that's submitting piles of
    * garbage, we would rather break as early as possible to keep the GPU
    * hanging contained.  If we don't check here, we'll either be waiting for
    * the kernel to kick us or we'll have to wait until the client waits on a
    * fence before we actually know whether or not we've hung.
    */
   VkResult result = anv_device_query_status(device);
   if (result != VK_SUCCESS)
      return result;

   /* We lock around QueueSubmit for three main reasons:
    *
    *  1) When a block pool is resized, we create a new gem handle with a
    *     different size and, in the case of surface states, possibly a
    *     different center offset but we re-use the same anv_bo struct when
    *     we do so.  If this happens in the middle of setting up an execbuf,
    *     we could end up with our list of BOs out of sync with our list of
    *     gem handles.
    *
    *  2) The algorithm we use for building the list of unique buffers isn't
    *     thread-safe.  While the client is supposed to synchronize around
    *     QueueSubmit, this would be extremely difficult to debug if it ever
    *     came up in the wild due to a broken app.  It's better to play it
    *     safe and just lock around QueueSubmit.
    *
    *  3) The anv_cmd_buffer_execbuf function may perform relocations in
    *     userspace.  Due to the fact that the surface state buffer is shared
    *     between batches, we can't afford to have that happen from multiple
    *     threads at the same time.  Even though the user is supposed to
    *     ensure this doesn't happen, we play it safe as in (2) above.
    *
    * Since the only other things that ever take the device lock, such as block
    * pool resize, only rarely happen, this will almost never be contended so
    * taking a lock isn't really an expensive operation in this case.
    */
   pthread_mutex_lock(&device->mutex);
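
   /* Every primary command buffer in every VkSubmitInfo becomes its own
    * execbuf2 call, submitted in order.
    */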
   for (uint32_t i = 0; i < submitCount; i++) {
      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
                         pSubmits[i].pCommandBuffers[j]);
         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
         assert(!anv_batch_has_error(&cmd_buffer->batch));

         result = anv_cmd_buffer_execbuf(device, cmd_buffer);
         if (result != VK_SUCCESS)
            goto out;
      }
   }

   if (fence) {
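      /* The fence's own tiny batch is submitted last; because execbuf
       * executes in order, its BO only goes idle once all of the work above
       * has completed, which is what anv_GetFenceStatus() and
       * anv_WaitForFences() test for.
       */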
      struct anv_bo *fence_bo = &fence->bo;
      result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
      if (result != VK_SUCCESS)
         goto out;

      /* Update the fence and wake up any waiters */
      assert(fence->state == ANV_FENCE_STATE_RESET);
      fence->state = ANV_FENCE_STATE_SUBMITTED;
      pthread_cond_broadcast(&device->queue_submit);
   }

out:
   if (result != VK_SUCCESS) {
      /* In the case that something has gone wrong we may end up with an
       * inconsistent state from which it may not be trivial to recover.
       * For example, we might have computed address relocations and
       * any future attempt to re-submit this job will need to know about
       * this and avoid computing relocation addresses again.
       *
       * To avoid this sort of issue, we assume that if something was
       * wrong during submission we must already be in a really bad situation
       * anyway (such as being out of memory) and return
       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
       * submit the same job again to this device.
       */
      result = VK_ERROR_DEVICE_LOST;
      device->lost = true;

      /* If we return VK_ERROR_DEVICE_LOST here, we need to ensure that
       * vkWaitForFences() and vkGetFenceStatus() return a valid result
       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
       * Setting the fence status to SIGNALED ensures this will happen in
       * any case.
       */
      if (fence)
         fence->state = ANV_FENCE_STATE_SIGNALED;
   }

   pthread_mutex_unlock(&device->mutex);

   return result;
}

VkResult anv_QueueWaitIdle(
    VkQueue                                     _queue)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);

   return anv_DeviceWaitIdle(anv_device_to_handle(queue->device));
}

VkResult anv_CreateFence(
    VkDevice                                    _device,
    const VkFenceCreateInfo*                    pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkFence*                                    pFence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_bo fence_bo;
   struct anv_fence *fence;
   struct anv_batch batch;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);

   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
   if (result != VK_SUCCESS)
      return result;

   /* Fences are small.  Just store the CPU data structure in the BO. */
   fence = fence_bo.map;
   fence->bo = fence_bo;

   /* Place the batch after the CPU data but on its own cache line. */
   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
   batch.next = batch.start = fence->bo.map + batch_offset;
   batch.end = fence->bo.map + fence->bo.size;
   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);

   if (!device->info.has_llc) {
      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
      assert(batch.next - batch.start <= CACHELINE_SIZE);
      __builtin_ia32_mfence();
      __builtin_ia32_clflush(batch.start);
   }
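
   /* Pre-bake the execbuf for the fence's batch so that vkQueueSubmit() only
    * has to hand it to the kernel.
    */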
   fence->exec2_objects[0].handle = fence->bo.gem_handle;
   fence->exec2_objects[0].relocation_count = 0;
   fence->exec2_objects[0].relocs_ptr = 0;
   fence->exec2_objects[0].alignment = 0;
   fence->exec2_objects[0].offset = fence->bo.offset;
   fence->exec2_objects[0].flags = 0;
   fence->exec2_objects[0].rsvd1 = 0;
   fence->exec2_objects[0].rsvd2 = 0;

   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
   fence->execbuf.buffer_count = 1;
   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
   fence->execbuf.batch_len = batch.next - batch.start;
   fence->execbuf.cliprects_ptr = 0;
   fence->execbuf.num_cliprects = 0;
   fence->execbuf.DR1 = 0;
   fence->execbuf.DR4 = 0;

   fence->execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   fence->execbuf.rsvd1 = device->context_id;
   fence->execbuf.rsvd2 = 0;

   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
      fence->state = ANV_FENCE_STATE_SIGNALED;
   } else {
      fence->state = ANV_FENCE_STATE_RESET;
   }

   *pFence = anv_fence_to_handle(fence);

   return VK_SUCCESS;
}

void anv_DestroyFence(
    VkDevice                                    _device,
    VkFence                                     _fence,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (!fence)
      return;

   assert(fence->bo.map == fence);
   anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
}

VkResult anv_ResetFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences)
{
   for (uint32_t i = 0; i < fenceCount; i++) {
      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
      fence->state = ANV_FENCE_STATE_RESET;
   }

   return VK_SUCCESS;
}

VkResult anv_GetFenceStatus(
    VkDevice                                    _device,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   switch (fence->state) {
   case ANV_FENCE_STATE_RESET:
      /* If it hasn't even been sent off to the GPU yet, it's not ready */
      return VK_NOT_READY;

   case ANV_FENCE_STATE_SIGNALED:
      /* It's been signaled, return success */
      return VK_SUCCESS;

   case ANV_FENCE_STATE_SUBMITTED: {
      VkResult result = anv_device_bo_busy(device, &fence->bo);
      if (result == VK_SUCCESS) {
         fence->state = ANV_FENCE_STATE_SIGNALED;
         return VK_SUCCESS;
      } else {
         return result;
      }
   }
   default:
      unreachable("Invalid fence status");
   }
}

#define NSEC_PER_SEC 1000000000
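
/* INT_TYPE_MAX() evaluates to the largest value representable by the given
 * signed integer type; it is used below to keep tv_sec from overflowing on
 * 32-bit systems.
 */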
#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)

VkResult anv_WaitForFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences,
    VkBool32                                    waitAll,
    uint64_t                                    _timeout)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   int ret;

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
    * to block indefinitely for timeouts <= 0.  Unfortunately, this was broken
    * for a couple of kernel releases.  Since there's no way to know
    * whether or not the kernel we're using is one of the broken ones, the
    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
    * maximum timeout from 584 years to 292 years - likely not a big deal.
    */
   int64_t timeout = MIN2(_timeout, INT64_MAX);

   VkResult result = VK_SUCCESS;
   uint32_t pending_fences = fenceCount;
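
   /* Poll the fences: wait on the ones that have been submitted, and keep
    * looping while any are still in the RESET state (i.e. not yet submitted
    * to a queue).
    */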
   while (pending_fences) {
      pending_fences = 0;
      bool signaled_fences = false;
      for (uint32_t i = 0; i < fenceCount; i++) {
         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
         switch (fence->state) {
         case ANV_FENCE_STATE_RESET:
            /* This fence hasn't been submitted yet, we'll catch it the next
             * time around.  Yes, this may mean we dead-loop but, short of
             * lots of locking and a condition variable, there's not much that
             * we can do about that.
             */
            pending_fences++;
            continue;

         case ANV_FENCE_STATE_SIGNALED:
            /* This fence is not pending.  If waitAll isn't set, we can return
             * early.  Otherwise, we have to keep going.
             */
            if (!waitAll) {
               result = VK_SUCCESS;
               goto done;
            }
            continue;

         case ANV_FENCE_STATE_SUBMITTED:
            /* These are the fences we really care about.  Go ahead and wait
             * on them until we hit a timeout.
             */
            result = anv_device_wait(device, &fence->bo, timeout);
            switch (result) {
            case VK_SUCCESS:
               fence->state = ANV_FENCE_STATE_SIGNALED;
               signaled_fences = true;
               if (!waitAll)
                  goto done;
               break;

            case VK_TIMEOUT:
               goto done;

            default:
               return result;
            }
         }
      }

      if (pending_fences && !signaled_fences) {
         /* If we've hit this then someone decided to vkWaitForFences before
          * they've actually submitted any of them to a queue.  This is a
          * fairly pessimal case, so it's ok to lock here and use a standard
          * pthreads condition variable.
          */
         pthread_mutex_lock(&device->mutex);

         /* It's possible that some of the fences have changed state since the
          * last time we checked.  Now that we have the lock, check for
          * pending fences again and don't wait if it's changed.
          */
         uint32_t now_pending_fences = 0;
         for (uint32_t i = 0; i < fenceCount; i++) {
            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
            if (fence->state == ANV_FENCE_STATE_RESET)
               now_pending_fences++;
         }
         assert(now_pending_fences <= pending_fences);

         if (now_pending_fences == pending_fences) {
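            /* pthread_cond_timedwait() takes an absolute deadline, so turn
             * the remaining relative timeout into one based on the monotonic
             * clock.
             */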
            struct timespec before;
            clock_gettime(CLOCK_MONOTONIC, &before);

            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
                               (timeout / NSEC_PER_SEC);
            abs_nsec %= NSEC_PER_SEC;

            /* Avoid roll-over in tv_sec on 32-bit systems if the user
             * provided timeout is UINT64_MAX
             */
            struct timespec abstime;
            abstime.tv_nsec = abs_nsec;
            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));

            ret = pthread_cond_timedwait(&device->queue_submit,
                                         &device->mutex, &abstime);
            assert(ret != EINVAL);

            struct timespec after;
            clock_gettime(CLOCK_MONOTONIC, &after);
            uint64_t time_elapsed =
               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);

            if (time_elapsed >= timeout) {
               pthread_mutex_unlock(&device->mutex);
               result = VK_TIMEOUT;
               goto done;
            }

            timeout -= time_elapsed;
         }

         pthread_mutex_unlock(&device->mutex);
      }
   }

done:
   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   return result;
}

// Queue semaphore functions

VkResult anv_CreateSemaphore(
    VkDevice                                    _device,
    const VkSemaphoreCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkSemaphore*                                pSemaphore)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_semaphore *semaphore;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);

   semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (semaphore == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   /* The DRM execbuffer ioctl always executes in order, even between
    * different rings.  As such, a dummy no-op semaphore is a perfectly
    * valid implementation.
    */
   semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DUMMY;
   semaphore->temporary.type = ANV_SEMAPHORE_TYPE_NONE;

   *pSemaphore = anv_semaphore_to_handle(semaphore);

   return VK_SUCCESS;
}

void anv_DestroySemaphore(
    VkDevice                                    _device,
    VkSemaphore                                 _semaphore,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);

   if (semaphore == NULL)
      return;

   vk_free2(&device->alloc, pAllocator, semaphore);
}