/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include "anv_private.h"

#include "genxml/gen7_pack.h"
#include "genxml/gen8_pack.h"
/** \file anv_batch_chain.c
 *
 * This file contains functions related to anv_cmd_buffer as a data
 * structure.  This involves everything required to create and destroy
 * the actual batch buffers as well as link them together and handle
 * relocations and surface state.  It specifically does *not* contain any
 * handling of actual vkCmd calls beyond vkCmdExecuteCommands.
 */
/*-----------------------------------------------------------------------*
 * Functions related to anv_reloc_list
 *-----------------------------------------------------------------------*/
static VkResult
anv_reloc_list_init_clone(struct anv_reloc_list *list,
                          const VkAllocationCallbacks *alloc,
                          const struct anv_reloc_list *other_list)
{
   if (other_list) {
      list->num_relocs = other_list->num_relocs;
      list->array_length = other_list->array_length;
   } else {
      list->num_relocs = 0;
      list->array_length = 256;
   }

   list->relocs =
      vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (list->relocs == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   list->reloc_bos =
      vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (list->reloc_bos == NULL) {
      vk_free(alloc, list->relocs);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   if (other_list) {
      memcpy(list->relocs, other_list->relocs,
             list->array_length * sizeof(*list->relocs));
      memcpy(list->reloc_bos, other_list->reloc_bos,
             list->array_length * sizeof(*list->reloc_bos));
   }

   return VK_SUCCESS;
}
VkResult
anv_reloc_list_init(struct anv_reloc_list *list,
                    const VkAllocationCallbacks *alloc)
{
   return anv_reloc_list_init_clone(list, alloc, NULL);
}
void
anv_reloc_list_finish(struct anv_reloc_list *list,
                      const VkAllocationCallbacks *alloc)
{
   vk_free(alloc, list->relocs);
   vk_free(alloc, list->reloc_bos);
}
static VkResult
anv_reloc_list_grow(struct anv_reloc_list *list,
                    const VkAllocationCallbacks *alloc,
                    size_t num_additional_relocs)
{
   if (list->num_relocs + num_additional_relocs <= list->array_length)
      return VK_SUCCESS;

   size_t new_length = list->array_length * 2;
   while (new_length < list->num_relocs + num_additional_relocs)
      new_length *= 2;

   struct drm_i915_gem_relocation_entry *new_relocs =
      vk_alloc(alloc, new_length * sizeof(*list->relocs), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (new_relocs == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   struct anv_bo **new_reloc_bos =
      vk_alloc(alloc, new_length * sizeof(*list->reloc_bos), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (new_reloc_bos == NULL) {
      vk_free(alloc, new_relocs);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   memcpy(new_relocs, list->relocs, list->num_relocs * sizeof(*list->relocs));
   memcpy(new_reloc_bos, list->reloc_bos,
          list->num_relocs * sizeof(*list->reloc_bos));

   vk_free(alloc, list->relocs);
   vk_free(alloc, list->reloc_bos);

   list->array_length = new_length;
   list->relocs = new_relocs;
   list->reloc_bos = new_reloc_bos;

   return VK_SUCCESS;
}
uint64_t
anv_reloc_list_add(struct anv_reloc_list *list,
                   const VkAllocationCallbacks *alloc,
                   uint32_t offset, struct anv_bo *target_bo, uint32_t delta)
{
   struct drm_i915_gem_relocation_entry *entry;
   int index;

   const uint32_t domain =
      target_bo->is_winsys_bo ? I915_GEM_DOMAIN_RENDER : 0;

   anv_reloc_list_grow(list, alloc, 1);
   /* TODO: Handle failure */

   /* XXX: Can we use I915_EXEC_HANDLE_LUT? */
   index = list->num_relocs++;
   list->reloc_bos[index] = target_bo;
   entry = &list->relocs[index];
   entry->target_handle = target_bo->gem_handle;
   entry->delta = delta;
   entry->offset = offset;
   entry->presumed_offset = target_bo->offset;
   entry->read_domains = domain;
   entry->write_domain = domain;
   VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));

   return target_bo->offset + delta;
}
static void
anv_reloc_list_append(struct anv_reloc_list *list,
                      const VkAllocationCallbacks *alloc,
                      struct anv_reloc_list *other, uint32_t offset)
{
   anv_reloc_list_grow(list, alloc, other->num_relocs);
   /* TODO: Handle failure */

   memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
          other->num_relocs * sizeof(other->relocs[0]));
   memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],
          other->num_relocs * sizeof(other->reloc_bos[0]));

   for (uint32_t i = 0; i < other->num_relocs; i++)
      list->relocs[i + list->num_relocs].offset += offset;

   list->num_relocs += other->num_relocs;
}
/*-----------------------------------------------------------------------*
 * Functions related to anv_batch
 *-----------------------------------------------------------------------*/
void *
anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
{
   if (batch->next + num_dwords * 4 > batch->end)
      batch->extend_cb(batch, batch->user_data);

   void *p = batch->next;

   batch->next += num_dwords * 4;
   assert(batch->next <= batch->end);

   return p;
}
uint64_t
anv_batch_emit_reloc(struct anv_batch *batch,
                     void *location, struct anv_bo *bo, uint32_t delta)
{
   return anv_reloc_list_add(batch->relocs, batch->alloc,
                             location - batch->start, bo, delta);
}
void
anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
{
   uint32_t size, offset;

   size = other->next - other->start;
   assert(size % 4 == 0);

   if (batch->next + size > batch->end)
      batch->extend_cb(batch, batch->user_data);

   assert(batch->next + size <= batch->end);

   VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
   memcpy(batch->next, other->start, size);

   offset = batch->next - batch->start;
   anv_reloc_list_append(batch->relocs, batch->alloc,
                         other->relocs, offset);

   batch->next += size;
}
/*-----------------------------------------------------------------------*
 * Functions related to anv_batch_bo
 *-----------------------------------------------------------------------*/
static VkResult
anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_batch_bo **bbo_out)
{
   VkResult result;

   struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
                                       8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (bbo == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo,
                              ANV_CMD_BUFFER_BATCH_SIZE);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->pool->alloc);
   if (result != VK_SUCCESS)
      goto fail_bo_alloc;

   *bbo_out = bbo;

   return VK_SUCCESS;

 fail_bo_alloc:
   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
 fail_alloc:
   vk_free(&cmd_buffer->pool->alloc, bbo);

   return result;
}
static VkResult
anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
                   const struct anv_batch_bo *other_bbo,
                   struct anv_batch_bo **bbo_out)
{
   VkResult result;

   struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
                                       8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (bbo == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, &bbo->bo,
                              other_bbo->bo.size);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->pool->alloc,
                                      &other_bbo->relocs);
   if (result != VK_SUCCESS)
      goto fail_bo_alloc;

   bbo->length = other_bbo->length;
   memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length);

   bbo->last_ss_pool_bo_offset = other_bbo->last_ss_pool_bo_offset;

   *bbo_out = bbo;

   return VK_SUCCESS;

 fail_bo_alloc:
   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
 fail_alloc:
   vk_free(&cmd_buffer->pool->alloc, bbo);

   return result;
}
static void
anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch,
                   size_t batch_padding)
{
   batch->next = batch->start = bbo->bo.map;
   batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
   batch->relocs = &bbo->relocs;
   bbo->last_ss_pool_bo_offset = 0;
   bbo->relocs.num_relocs = 0;
}
static void
anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch,
                      size_t batch_padding)
{
   batch->start = bbo->bo.map;
   batch->next = bbo->bo.map + bbo->length;
   batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
   batch->relocs = &bbo->relocs;
}
static void
anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
{
   assert(batch->start == bbo->bo.map);
   bbo->length = batch->next - batch->start;
   VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
}
static VkResult
anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo,
                  struct anv_batch *batch, size_t additional,
                  size_t batch_padding)
{
   assert(batch->start == bbo->bo.map);
   bbo->length = batch->next - batch->start;

   size_t new_size = bbo->bo.size;
   while (new_size <= bbo->length + additional + batch_padding)
      new_size *= 2;

   if (new_size == bbo->bo.size)
      return VK_SUCCESS;

   struct anv_bo new_bo;
   VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
                                       &new_bo, new_size);
   if (result != VK_SUCCESS)
      return result;

   memcpy(new_bo.map, bbo->bo.map, bbo->length);

   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);

   bbo->bo = new_bo;
   anv_batch_bo_continue(bbo, batch, batch_padding);

   return VK_SUCCESS;
}
static void
anv_batch_bo_destroy(struct anv_batch_bo *bbo,
                     struct anv_cmd_buffer *cmd_buffer)
{
   anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc);
   anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, &bbo->bo);
   vk_free(&cmd_buffer->pool->alloc, bbo);
}
static VkResult
anv_batch_bo_list_clone(const struct list_head *list,
                        struct anv_cmd_buffer *cmd_buffer,
                        struct list_head *new_list)
{
   VkResult result = VK_SUCCESS;

   list_inithead(new_list);

   struct anv_batch_bo *prev_bbo = NULL;
   list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
      struct anv_batch_bo *new_bbo = NULL;
      result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo);
      if (result != VK_SUCCESS)
         break;
      list_addtail(&new_bbo->link, new_list);

      if (prev_bbo) {
         /* As we clone this list of batch_bo's, they chain one to the
          * other using MI_BATCH_BUFFER_START commands.  We need to fix up
          * those relocations as we go.  Fortunately, this is pretty easy
          * as it will always be the last relocation in the list.
          */
         uint32_t last_idx = prev_bbo->relocs.num_relocs - 1;
         assert(prev_bbo->relocs.reloc_bos[last_idx] == &bbo->bo);
         prev_bbo->relocs.reloc_bos[last_idx] = &new_bbo->bo;
      }

      prev_bbo = new_bbo;
   }

   if (result != VK_SUCCESS) {
      list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link)
         anv_batch_bo_destroy(bbo, cmd_buffer);
   }

   return result;
}
/*-----------------------------------------------------------------------*
 * Functions related to anv_cmd_buffer
 *-----------------------------------------------------------------------*/
static inline struct anv_batch_bo *
anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
{
   return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);
}
struct anv_address
anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
{
   return (struct anv_address) {
      .bo = &cmd_buffer->device->surface_state_block_pool.bo,
      .offset = *(int32_t *)u_vector_head(&cmd_buffer->bt_blocks),
   };
}
static void
emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   /* In gen8+ the address field grew to two dwords to accommodate 48 bit
    * offsets.  The high 16 bits are in the last dword, so we can use the gen8
    * version in either case, as long as we set the instruction length in the
    * header accordingly.  This means that we always emit three dwords here
    * and all the padding and adjustment we do in this file works for all
    * gens.
    */

   const uint32_t gen7_length =
      GEN7_MI_BATCH_BUFFER_START_length - GEN7_MI_BATCH_BUFFER_START_length_bias;
   const uint32_t gen8_length =
      GEN8_MI_BATCH_BUFFER_START_length - GEN8_MI_BATCH_BUFFER_START_length_bias;

   anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_START, bbs) {
      bbs.DWordLength = cmd_buffer->device->info.gen < 8 ?
                        gen7_length : gen8_length;
      bbs._2ndLevelBatchBuffer = _1stlevelbatch;
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset };
   }
}
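
/* A worked illustration of the length arithmetic above (not part of the
 * original code; assumes the usual genxml values): MI_BATCH_BUFFER_START is
 * 2 dwords on gen7 and 3 dwords on gen8+, both with a length bias of 2, so
 * gen7_length = 0 and gen8_length = 1.  We emit the three-dword gen8 layout
 * even on gen7; because MI_BATCH_BUFFER_START transfers control away from
 * this buffer, the trailing dword is never parsed there.
 */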
static void
cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_batch_bo *bbo)
{
   struct anv_batch *batch = &cmd_buffer->batch;
   struct anv_batch_bo *current_bbo =
      anv_cmd_buffer_current_batch_bo(cmd_buffer);

   /* We set the end of the batch a little short so we would be sure we
    * have room for the chaining command.  Since we're about to emit the
    * chaining command, let's set it back where it should go.
    */
   batch->end += GEN8_MI_BATCH_BUFFER_START_length * 4;
   assert(batch->end == current_bbo->bo.map + current_bbo->bo.size);

   emit_batch_buffer_start(cmd_buffer, &bbo->bo, 0);

   anv_batch_bo_finish(current_bbo, batch);
}
static VkResult
anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data)
{
   struct anv_cmd_buffer *cmd_buffer = _data;
   struct anv_batch_bo *new_bbo;

   VkResult result = anv_batch_bo_create(cmd_buffer, &new_bbo);
   if (result != VK_SUCCESS)
      return result;

   struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
   if (seen_bbo == NULL) {
      anv_batch_bo_destroy(new_bbo, cmd_buffer);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }
   *seen_bbo = new_bbo;

   cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo);

   list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);

   anv_batch_bo_start(new_bbo, batch, GEN8_MI_BATCH_BUFFER_START_length * 4);

   return VK_SUCCESS;
}
static VkResult
anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data)
{
   struct anv_cmd_buffer *cmd_buffer = _data;
   struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);

   anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096,
                     GEN8_MI_BATCH_BUFFER_START_length * 4);

   return VK_SUCCESS;
}
/** Allocate a binding table
 *
 * This function allocates a binding table.  This is a bit more complicated
 * than one would think due to a combination of Vulkan driver design and some
 * unfortunate hardware restrictions.
 *
 * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for
 * the binding table pointer which means that all binding tables need to live
 * in the bottom 64k of surface state base address.  The way the GL driver has
 * classically dealt with this restriction is to emit all surface states
 * on-the-fly into the batch and have a batch buffer smaller than 64k.  This
 * isn't really an option in Vulkan for a couple of reasons:
 *
 *  1) In Vulkan, we have growing (or chaining) batches so surface states have
 *     to live in their own buffer and we have to be able to re-emit
 *     STATE_BASE_ADDRESS as needed which requires a full pipeline stall.  In
 *     order to avoid emitting STATE_BASE_ADDRESS any more often than needed
 *     (it's not that hard to hit 64k of just binding tables), we allocate
 *     surface state objects up-front when VkImageView is created.  In order
 *     for this to work, surface state objects need to be allocated from a
 *     global buffer.
 *
 *  2) We tried to design the surface state system in such a way that it's
 *     already ready for bindless texturing.  The way bindless texturing works
 *     on our hardware is that you have a big pool of surface state objects
 *     (with its own state base address) and the bindless handles are simply
 *     offsets into that pool.  With the architecture we chose, we already
 *     have that pool and it's exactly the same pool that we use for regular
 *     surface states so we should already be ready for bindless.
 *
 *  3) For render targets, we need to be able to fill out the surface states
 *     later in vkBeginRenderPass so that we can assign clear colors
 *     correctly.  One way to do this would be to just create the surface
 *     state data and then repeatedly copy it into the surface state BO every
 *     time we have to re-emit STATE_BASE_ADDRESS.  While this works, it's
 *     rather annoying; it's much nicer to be able to allocate the surface
 *     states up-front and re-use them for the entire render pass.
 *
 * While none of these are technically blockers for emitting state on the fly
 * like we do in GL, the ability to have a single surface state pool
 * simplifies things greatly.  Unfortunately, it comes at a cost...
 *
 * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't
 * place the binding tables just anywhere in surface state base address.
 * Because 64k isn't a whole lot of space, we can't simply restrict the
 * surface state buffer to 64k, we have to be more clever.  The solution we've
 * chosen is to have a block pool with a maximum size of 2G that starts at
 * zero and grows in both directions.  All surface states are allocated from
 * the top of the pool (positive offsets) and we allocate blocks (< 64k) of
 * binding tables from the bottom of the pool (negative offsets).  Every time
 * we allocate a new binding table block, we set surface state base address to
 * point to the bottom of the binding table block.  This way all of the
 * binding tables in the block are in the bottom 64k of surface state base
 * address.  When we fill out the binding table, we add the distance between
 * the bottom of our binding table block and zero of the block pool to the
 * surface state offsets so that they are correct relative to our new surface
 * state base address at the bottom of the binding table block.
 *
 * \see adjust_relocations_from_state_pool()
 * \see adjust_relocations_to_state_pool()
 *
 * \param[in]  entries        The number of surface state entries the binding
 *                            table should be able to hold.
 *
 * \param[out] state_offset   The offset from surface state base address
 *                            where the surface states live.  This must be
 *                            added to the surface state offset when it is
 *                            written into the binding table entry.
 *
 * \return An anv_state representing the binding table
 */
struct anv_state
anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                   uint32_t entries, uint32_t *state_offset)
{
   struct anv_block_pool *block_pool =
      &cmd_buffer->device->surface_state_block_pool;
   int32_t *bt_block = u_vector_head(&cmd_buffer->bt_blocks);
   struct anv_state state;

   state.alloc_size = align_u32(entries * 4, 32);

   if (cmd_buffer->bt_next + state.alloc_size > block_pool->block_size)
      return (struct anv_state) { 0 };

   state.offset = cmd_buffer->bt_next;
   state.map = block_pool->map + *bt_block + state.offset;

   cmd_buffer->bt_next += state.alloc_size;

   assert(*bt_block < 0);
   *state_offset = -(*bt_block);

   return state;
}
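
/* A worked example of the two-sided pool arithmetic above, with illustrative
 * numbers only: suppose the current binding table block sits at
 * *bt_block == -8192, i.e. 8k below the block pool's center.  Surface state
 * base address then points 8192 bytes below the center, so a surface state
 * allocated at positive pool offset 256 must be written into the binding
 * table as 256 + 8192 = 8448.  Returning *state_offset = -(*bt_block) = 8192
 * and adding it to each surface state offset accomplishes exactly that.
 */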
struct anv_state
anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)
{
   return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
}
struct anv_state
anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
                                   uint32_t size, uint32_t alignment)
{
   return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
                                 size, alignment);
}
VkResult
anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_block_pool *block_pool =
      &cmd_buffer->device->surface_state_block_pool;

   int32_t *offset = u_vector_add(&cmd_buffer->bt_blocks);
   if (offset == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   *offset = anv_block_pool_alloc_back(block_pool);
   cmd_buffer->bt_next = 0;

   return VK_SUCCESS;
}
static void
anv_execbuf_init(struct anv_execbuf *exec)
{
   memset(exec, 0, sizeof(*exec));
}
static void
anv_execbuf_finish(struct anv_execbuf *exec,
                   const VkAllocationCallbacks *alloc)
{
   vk_free(alloc, exec->objects);
   vk_free(alloc, exec->bos);
}
VkResult
anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_batch_bo *batch_bo;
   VkResult result;

   list_inithead(&cmd_buffer->batch_bos);

   result = anv_batch_bo_create(cmd_buffer, &batch_bo);
   if (result != VK_SUCCESS)
      return result;

   list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);

   cmd_buffer->batch.alloc = &cmd_buffer->pool->alloc;
   cmd_buffer->batch.user_data = cmd_buffer;

   if (cmd_buffer->device->can_chain_batches) {
      cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
   } else {
      cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch;
   }

   anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
                      GEN8_MI_BATCH_BUFFER_START_length * 4);

   int success = u_vector_init(&cmd_buffer->seen_bbos,
                               sizeof(struct anv_bo *),
                               8 * sizeof(struct anv_bo *));
   if (!success) {
      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail_batch_bo;
   }

   *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;

   success = u_vector_init(&cmd_buffer->bt_blocks, sizeof(int32_t),
                           8 * sizeof(int32_t));
   if (!success) {
      result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail_seen_bbos;
   }

   result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
                                &cmd_buffer->pool->alloc);
   if (result != VK_SUCCESS)
      goto fail_bt_blocks;

   anv_cmd_buffer_new_binding_table_block(cmd_buffer);

   anv_execbuf_init(&cmd_buffer->execbuf2);

   return VK_SUCCESS;

 fail_bt_blocks:
   u_vector_finish(&cmd_buffer->bt_blocks);
 fail_seen_bbos:
   u_vector_finish(&cmd_buffer->seen_bbos);
 fail_batch_bo:
   anv_batch_bo_destroy(batch_bo, cmd_buffer);

   return result;
}
void
anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
   int32_t *bt_block;
   u_vector_foreach(bt_block, &cmd_buffer->bt_blocks) {
      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
                          *bt_block);
   }
   u_vector_finish(&cmd_buffer->bt_blocks);

   anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);

   u_vector_finish(&cmd_buffer->seen_bbos);

   /* Destroy all of the batch buffers */
   list_for_each_entry_safe(struct anv_batch_bo, bbo,
                            &cmd_buffer->batch_bos, link) {
      anv_batch_bo_destroy(bbo, cmd_buffer);
   }

   anv_execbuf_finish(&cmd_buffer->execbuf2, &cmd_buffer->pool->alloc);
}
void
anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
   /* Delete all but the first batch bo */
   assert(!list_empty(&cmd_buffer->batch_bos));
   while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) {
      struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
      list_del(&bbo->link);
      anv_batch_bo_destroy(bbo, cmd_buffer);
   }
   assert(!list_empty(&cmd_buffer->batch_bos));

   anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
                      &cmd_buffer->batch,
                      GEN8_MI_BATCH_BUFFER_START_length * 4);

   while (u_vector_length(&cmd_buffer->bt_blocks) > 1) {
      int32_t *bt_block = u_vector_remove(&cmd_buffer->bt_blocks);
      anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
                          *bt_block);
   }
   assert(u_vector_length(&cmd_buffer->bt_blocks) == 1);
   cmd_buffer->bt_next = 0;

   cmd_buffer->surface_relocs.num_relocs = 0;

   /* Reset the list of seen buffers */
   cmd_buffer->seen_bbos.head = 0;
   cmd_buffer->seen_bbos.tail = 0;

   *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) =
      anv_cmd_buffer_current_batch_bo(cmd_buffer);
}
void
anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      /* When we start a batch buffer, we subtract a certain amount of
       * padding from the end to ensure that we always have room to emit a
       * BATCH_BUFFER_START to chain to the next BO.  We need to remove
       * that padding before we end the batch; otherwise, we may end up
       * with our BATCH_BUFFER_END in another BO.
       */
      cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4;
      assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size);

      anv_batch_emit(&cmd_buffer->batch, GEN7_MI_BATCH_BUFFER_END, bbe);

      /* Round batch up to an even number of dwords. */
      if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
         anv_batch_emit(&cmd_buffer->batch, GEN7_MI_NOOP, noop);

      cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
   }

   anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);

   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      /* If this is a secondary command buffer, we need to determine the
       * mode in which it will be executed with vkExecuteCommands.  We
       * determine this statically here so that this stays in sync with the
       * actual ExecuteCommands implementation.
       */
      if (!cmd_buffer->device->can_chain_batches) {
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT;
      } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
                 (batch_bo->length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) {
         /* If the secondary has exactly one batch buffer in its list *and*
          * that batch buffer is less than half of the maximum size, we're
          * probably better off simply copying it into our batch.
          */
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
      } else if (!(cmd_buffer->usage_flags &
                   VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;

         /* When we chain, we need to add an MI_BATCH_BUFFER_START command
          * with its relocation.  In order to handle this we'll increment here
          * so we can unconditionally decrement right before adding the
          * MI_BATCH_BUFFER_START command.
          */
         batch_bo->relocs.num_relocs++;
         cmd_buffer->batch.next += GEN8_MI_BATCH_BUFFER_START_length * 4;
      } else {
         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
      }
   }
}
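
/* For reference, the decision above restated as a table (an illustrative
 * summary, not code from the original):
 *
 *    !can_chain_batches                            -> GROW_AND_EMIT
 *    one batch bo shorter than half the max size   -> EMIT (copy into primary)
 *    !SIMULTANEOUS_USE                             -> CHAIN (patch BBS in place)
 *    otherwise                                     -> COPY_AND_CHAIN (clone bos)
 */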
static inline VkResult
anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
                             struct list_head *list)
{
   list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
      struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
      if (bbo_ptr == NULL)
         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

      *bbo_ptr = bbo;
   }

   return VK_SUCCESS;
}
void
anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
                             struct anv_cmd_buffer *secondary)
{
   switch (secondary->exec_mode) {
   case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
      anv_batch_emit_batch(&primary->batch, &secondary->batch);
      break;
   case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: {
      struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary);
      unsigned length = secondary->batch.end - secondary->batch.start;
      anv_batch_bo_grow(primary, bbo, &primary->batch, length,
                        GEN8_MI_BATCH_BUFFER_START_length * 4);
      anv_batch_emit_batch(&primary->batch, &secondary->batch);
      break;
   }
   case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
      struct anv_batch_bo *first_bbo =
         list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
      struct anv_batch_bo *last_bbo =
         list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);

      emit_batch_buffer_start(primary, &first_bbo->bo, 0);

      struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
      assert(primary->batch.start == this_bbo->bo.map);
      uint32_t offset = primary->batch.next - primary->batch.start;
      const uint32_t inst_size = GEN8_MI_BATCH_BUFFER_START_length * 4;

      /* Roll back the previous MI_BATCH_BUFFER_START and its relocation so we
       * can emit a new command and relocation for the current splice.  In
       * order to handle the initial-use case, we incremented next and
       * num_relocs in end_batch_buffer() so we can always just subtract
       * here.
       */
      last_bbo->relocs.num_relocs--;
      secondary->batch.next -= inst_size;
      emit_batch_buffer_start(secondary, &this_bbo->bo, offset);
      anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);

      /* After patching up the secondary buffer, we need to clflush the
       * modified instruction in case we're on a !llc platform.  We use a
       * little loop to handle the case where the instruction crosses a cache
       * line boundary.
       */
      if (!primary->device->info.has_llc) {
         void *inst = secondary->batch.next - inst_size;
         void *p = (void *) (((uintptr_t) inst) & ~CACHELINE_MASK);
         __builtin_ia32_mfence();
         while (p < secondary->batch.next) {
            __builtin_ia32_clflush(p);
            p += CACHELINE_SIZE;
         }
      }
      break;
   }
   case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
      struct list_head copy_list;
      VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos,
                                                primary,
                                                &copy_list);
      if (result != VK_SUCCESS)
         return; /* FIXME */

      anv_cmd_buffer_add_seen_bbos(primary, &copy_list);

      struct anv_batch_bo *first_bbo =
         list_first_entry(&copy_list, struct anv_batch_bo, link);
      struct anv_batch_bo *last_bbo =
         list_last_entry(&copy_list, struct anv_batch_bo, link);

      cmd_buffer_chain_to_batch_bo(primary, first_bbo);

      list_splicetail(&copy_list, &primary->batch_bos);

      anv_batch_bo_continue(last_bbo, &primary->batch,
                            GEN8_MI_BATCH_BUFFER_START_length * 4);
      break;
   }
   default:
      assert(!"Invalid execution mode");
   }

   anv_reloc_list_append(&primary->surface_relocs, &primary->pool->alloc,
                         &secondary->surface_relocs, 0);
}
static VkResult
anv_execbuf_add_bo(struct anv_execbuf *exec,
                   struct anv_bo *bo,
                   struct anv_reloc_list *relocs,
                   const VkAllocationCallbacks *alloc)
{
   struct drm_i915_gem_exec_object2 *obj = NULL;

   if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)
      obj = &exec->objects[bo->index];

   if (obj == NULL) {
      /* We've never seen this one before.  Add it to the list and assign
       * an id that we can use later.
       */
      if (exec->bo_count >= exec->array_length) {
         uint32_t new_len = exec->objects ? exec->array_length * 2 : 64;

         struct drm_i915_gem_exec_object2 *new_objects =
            vk_alloc(alloc, new_len * sizeof(*new_objects),
                     8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
         if (new_objects == NULL)
            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

         struct anv_bo **new_bos =
            vk_alloc(alloc, new_len * sizeof(*new_bos),
                     8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
         if (new_bos == NULL) {
            vk_free(alloc, new_objects);
            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
         }

         if (exec->objects) {
            memcpy(new_objects, exec->objects,
                   exec->bo_count * sizeof(*new_objects));
            memcpy(new_bos, exec->bos,
                   exec->bo_count * sizeof(*new_bos));
         }

         vk_free(alloc, exec->objects);
         vk_free(alloc, exec->bos);

         exec->objects = new_objects;
         exec->bos = new_bos;
         exec->array_length = new_len;
      }

      assert(exec->bo_count < exec->array_length);

      bo->index = exec->bo_count++;
      obj = &exec->objects[bo->index];
      exec->bos[bo->index] = bo;

      obj->handle = bo->gem_handle;
      obj->relocation_count = 0;
      obj->relocs_ptr = 0;
      obj->alignment = 0;
      obj->offset = bo->offset;
      obj->flags = bo->is_winsys_bo ? EXEC_OBJECT_WRITE : 0;
      obj->rsvd1 = 0;
      obj->rsvd2 = 0;
   }

   if (relocs != NULL && obj->relocation_count == 0) {
      /* This is the first time we've ever seen a list of relocations for
       * this BO.  Go ahead and set the relocations and then walk the list
       * of relocations and add them all.
       */
      obj->relocation_count = relocs->num_relocs;
      obj->relocs_ptr = (uintptr_t) relocs->relocs;

      for (size_t i = 0; i < relocs->num_relocs; i++) {
         /* A quick sanity check on relocations */
         assert(relocs->relocs[i].offset < bo->size);
         anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL, alloc);
      }
   }

   return VK_SUCCESS;
}
static void
anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
                              struct anv_reloc_list *list)
{
   for (size_t i = 0; i < list->num_relocs; i++)
      list->relocs[i].target_handle = list->reloc_bos[i]->index;
}
static void
write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
{
   unsigned reloc_size = 0;
   if (device->info.gen >= 8) {
      /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress:
       *
       *    "This field specifies the address of the memory location where the
       *    register value specified in the DWord above will read from.  The
       *    address specifies the DWord location of the data.  Range =
       *    GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress
       *    [63:48] are ignored by the HW and assumed to be in correct
       *    canonical form [63:48] == [47]."
       */
      const int shift = 63 - 47;
      reloc_size = sizeof(uint64_t);
      *(uint64_t *)p = (((int64_t)v) << shift) >> shift;
   } else {
      reloc_size = sizeof(uint32_t);
      *(uint32_t *)p = v;
   }

   if (flush && !device->info.has_llc)
      anv_clflush_range(p, reloc_size);
}
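
/* A worked example of the canonical-form shift above (illustrative): with
 * shift = 16, an address with bit 47 set, say v = 0x0000800000000000,
 * becomes 0xffff800000000000 after the signed shift-left/shift-right pair.
 * Bits [63:48] are filled with copies of bit 47, which is exactly the
 * "[63:48] == [47]" canonical form the PRM requires.
 */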
static void
adjust_relocations_from_state_pool(struct anv_block_pool *pool,
                                   struct anv_reloc_list *relocs)
{
   for (size_t i = 0; i < relocs->num_relocs; i++) {
      /* All of the relocations from this block pool to other BO's should
       * have been emitted relative to the surface block pool center.  We
       * need to add the center offset to make them relative to the
       * beginning of the actual GEM bo.
       */
      relocs->relocs[i].offset += pool->center_bo_offset;
   }
}
static void
adjust_relocations_to_state_pool(struct anv_block_pool *pool,
                                 struct anv_bo *from_bo,
                                 struct anv_reloc_list *relocs,
                                 uint32_t *last_pool_center_bo_offset)
{
   assert(*last_pool_center_bo_offset <= pool->center_bo_offset);
   uint32_t delta = pool->center_bo_offset - *last_pool_center_bo_offset;

   /* When we initially emit relocations into a block pool, we don't
    * actually know what the final center_bo_offset will be so we just emit
    * it as if center_bo_offset == 0.  Now that we know what the center
    * offset is, we need to walk the list of relocations and adjust any
    * relocations that point to the pool bo with the correct offset.
    */
   for (size_t i = 0; i < relocs->num_relocs; i++) {
      if (relocs->reloc_bos[i] == &pool->bo) {
         /* Adjust the delta value in the relocation to correctly
          * correspond to the new delta.  Initially, this value may have
          * been negative (if treated as unsigned), but we trust in
          * uint32_t roll-over to fix that for us at this point.
          */
         relocs->relocs[i].delta += delta;

         /* Since the delta has changed, we need to update the actual
          * relocated value with the new presumed value.  This function
          * should only be called on batch buffers, so we know it isn't in
          * use by the GPU at the moment.
          */
         assert(relocs->relocs[i].offset < from_bo->size);
         write_reloc(pool->device, from_bo->map + relocs->relocs[i].offset,
                     relocs->relocs[i].presumed_offset +
                     relocs->relocs[i].delta, false);
      }
   }

   *last_pool_center_bo_offset = pool->center_bo_offset;
}
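
/* A worked example of the uint32_t roll-over mentioned above (illustrative
 * numbers): a relocation emitted at 4096 bytes below the pool center starts
 * with relocs[i].delta == (uint32_t)-4096 == 0xfffff000.  If the pool center
 * later moves by delta == 8192, the addition wraps the value around to 4096,
 * the correct offset from the start of the GEM bo.
 */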
void
anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_batch *batch = &cmd_buffer->batch;
   struct anv_block_pool *ss_pool =
      &cmd_buffer->device->surface_state_block_pool;

   cmd_buffer->execbuf2.bo_count = 0;

   adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs);
   anv_execbuf_add_bo(&cmd_buffer->execbuf2, &ss_pool->bo,
                      &cmd_buffer->surface_relocs,
                      &cmd_buffer->pool->alloc);

   /* First, we walk over all of the bos we've seen and add them and their
    * relocations to the validate list.
    */
   struct anv_batch_bo **bbo;
   u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
      adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
                                       &(*bbo)->last_ss_pool_bo_offset);

      anv_execbuf_add_bo(&cmd_buffer->execbuf2, &(*bbo)->bo, &(*bbo)->relocs,
                         &cmd_buffer->pool->alloc);
   }

   struct anv_batch_bo *first_batch_bo =
      list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);

   /* The kernel requires that the last entry in the validation list be the
    * batch buffer to execute.  We can simply swap the element
    * corresponding to the first batch_bo in the chain with the last
    * element in the list.
    */
   if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) {
      uint32_t idx = first_batch_bo->bo.index;
      uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1;

      struct drm_i915_gem_exec_object2 tmp_obj =
         cmd_buffer->execbuf2.objects[idx];
      assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo);

      cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx];
      cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx];
      cmd_buffer->execbuf2.bos[idx]->index = idx;

      cmd_buffer->execbuf2.objects[last_idx] = tmp_obj;
      cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo;
      first_batch_bo->bo.index = last_idx;
   }

   /* Now we go through and fixup all of the relocation lists to point to
    * the correct indices in the object array.  We have to do this after we
    * reorder the list above as some of the indices may have changed.
    */
   u_vector_foreach(bbo, &cmd_buffer->seen_bbos)
      anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs);

   anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs);

   if (!cmd_buffer->device->info.has_llc) {
      __builtin_ia32_mfence();
      u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
         for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
            __builtin_ia32_clflush((*bbo)->bo.map + i);
      }
   }

   cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects,
      .buffer_count = cmd_buffer->execbuf2.bo_count,
      .batch_start_offset = 0,
      .batch_len = batch->next - batch->start,
      .cliprects_ptr = 0,
      .num_cliprects = 0,
      .DR1 = 0,
      .DR4 = 0,
      .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER |
               I915_EXEC_CONSTANTS_REL_GENERAL,
      .rsvd1 = cmd_buffer->device->context_id,
      .rsvd2 = 0,
   };
}
VkResult
anv_cmd_buffer_execbuf(struct anv_device *device,
                       struct anv_cmd_buffer *cmd_buffer)
{
   /* Since surface states are shared between command buffers and we don't
    * know what order they will be submitted to the kernel, we don't know what
    * address is actually written in the surface state object at any given
    * time.  The only option is to set a bogus presumed offset and let
    * relocations do their job.
    */
   for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
      cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;

   return anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf,
                             cmd_buffer->execbuf2.bos);
}