anv: Improve flushing around STATE_BASE_ADDRESS
[mesa.git] / src/intel/vulkan/genX_cmd_buffer.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "vk_format_info.h"
29
30 #include "common/gen_l3_config.h"
31 #include "genxml/gen_macros.h"
32 #include "genxml/genX_pack.h"
33
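/* Load a 32-bit hardware register from a buffer object location via
 * MI_LOAD_REGISTER_MEM.
 */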
34 static void
35 emit_lrm(struct anv_batch *batch,
36 uint32_t reg, struct anv_bo *bo, uint32_t offset)
37 {
38 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
39 lrm.RegisterAddress = reg;
40 lrm.MemoryAddress = (struct anv_address) { bo, offset };
41 }
42 }
43
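/* Load an immediate 32-bit value into a hardware register via
 * MI_LOAD_REGISTER_IMM.
 */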
44 static void
45 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
46 {
47 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
48 lri.RegisterOffset = reg;
49 lri.DataDWord = imm;
50 }
51 }
52
53 void
54 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
55 {
56 struct anv_device *device = cmd_buffer->device;
57
58 /* Emit a render target cache flush.
59 *
60 * This isn't documented anywhere in the PRM. However, it seems to be
61 * necessary prior to changing the surface state base address. Without
62 * this, we get GPU hangs when using multi-level command buffers which
63 * clear depth, reset state base address, and then go render stuff.
64 */
65 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
66 pc.DCFlushEnable = true;
67 pc.RenderTargetCacheFlushEnable = true;
68 pc.CommandStreamerStallEnable = true;
69 }
70
71 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
72 sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
73 sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
74 sba.GeneralStateBaseAddressModifyEnable = true;
75
76 sba.SurfaceStateBaseAddress =
77 anv_cmd_buffer_surface_base_address(cmd_buffer);
78 sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
79 sba.SurfaceStateBaseAddressModifyEnable = true;
80
81 sba.DynamicStateBaseAddress =
82 (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
83 sba.DynamicStateMemoryObjectControlState = GENX(MOCS);
84 sba.DynamicStateBaseAddressModifyEnable = true;
85
86 sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
87 sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
88 sba.IndirectObjectBaseAddressModifyEnable = true;
89
90 sba.InstructionBaseAddress =
91 (struct anv_address) { &device->instruction_block_pool.bo, 0 };
92 sba.InstructionMemoryObjectControlState = GENX(MOCS);
93 sba.InstructionBaseAddressModifyEnable = true;
94
95 # if (GEN_GEN >= 8)
96 /* Broadwell requires that we specify a buffer size for a bunch of
97 * these fields. However, since we will be growing the BOs live, we
98 * just set them all to the maximum.
99 */
100 sba.GeneralStateBufferSize = 0xfffff;
101 sba.GeneralStateBufferSizeModifyEnable = true;
102 sba.DynamicStateBufferSize = 0xfffff;
103 sba.DynamicStateBufferSizeModifyEnable = true;
104 sba.IndirectObjectBufferSize = 0xfffff;
105 sba.IndirectObjectBufferSizeModifyEnable = true;
106 sba.InstructionBufferSize = 0xfffff;
107 sba.InstructionBuffersizeModifyEnable = true;
108 # endif
109 }
110
111 /* After re-setting the surface state base address, we have to do some
112 * cache flushing so that the sampler engine will pick up the new
113 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
114 * Shared Function > 3D Sampler > State > State Caching (page 96):
115 *
116 * Coherency with system memory in the state cache, like the texture
117 * cache is handled partially by software. It is expected that the
118 * command stream or shader will issue Cache Flush operation or
119 * Cache_Flush sampler message to ensure that the L1 cache remains
120 * coherent with system memory.
121 *
122 * [...]
123 *
124 * Whenever the value of the Dynamic_State_Base_Addr,
125 * Surface_State_Base_Addr are altered, the L1 state cache must be
126 * invalidated to ensure the new surface or sampler state is fetched
127 * from system memory.
128 *
129 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
130 * which, according to the PIPE_CONTROL instruction documentation in the
131 * Broadwell PRM:
132 *
133 * Setting this bit is independent of any other bit in this packet.
134 * This bit controls the invalidation of the L1 and L2 state caches
135 * at the top of the pipe i.e. at the parsing time.
136 *
137 * Unfortunately, experimentation seems to indicate that state cache
138 * invalidation through a PIPE_CONTROL does nothing whatsoever with
139 * regard to surface state and binding tables. Instead, it seems that
140 * invalidating the texture cache is what is actually needed.
141 *
142 * XXX: As far as we have been able to determine through
143 * experimentation, flushing the texture cache appears to be
144 * sufficient. The theory here is that all of the sampling/rendering
145 * units cache the binding table in the texture cache. However, we have
146 * yet to be able to actually confirm this.
147 */
148 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
149 pc.TextureCacheInvalidationEnable = true;
150 pc.ConstantCacheInvalidationEnable = true;
151 pc.StateCacheInvalidationEnable = true;
152 }
153 }
154
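/* Record a relocation for the address field of a surface state entry so
 * that it gets patched with the BO's final address at submit time.
 */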
155 static void
156 add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
157 struct anv_state state,
158 struct anv_bo *bo, uint32_t offset)
159 {
160 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
161
162 anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
163 state.offset + isl_dev->ss.addr_offset, bo, offset);
164 }
165
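/* Add relocations for an image view's surface state: one for the main
 * surface address and, if an auxiliary surface is in use, one for the aux
 * address. The low 12 bits of the aux address dword carry other state and
 * are preserved.
 */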
166 static void
167 add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
168 const struct anv_image_view *iview,
169 enum isl_aux_usage aux_usage,
170 struct anv_state state)
171 {
172 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
173
174 anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
175 state.offset + isl_dev->ss.addr_offset,
176 iview->bo, iview->offset);
177
178 if (aux_usage != ISL_AUX_USAGE_NONE) {
179 uint32_t aux_offset = iview->offset + iview->image->aux_surface.offset;
180
181 /* On gen7 and prior, the bottom 12 bits of the MCS base address are
182 * used to store other information. This should be ok, however, because
183 * surface buffer addresses are always 4K page aligned.
184 */
185 assert((aux_offset & 0xfff) == 0);
186 uint32_t *aux_addr_dw = state.map + isl_dev->ss.aux_addr_offset;
187 aux_offset += *aux_addr_dw & 0xfff;
188
189 anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
190 state.offset + isl_dev->ss.aux_addr_offset,
191 iview->bo, aux_offset);
192 }
193 }
194
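/* Returns true if every channel of the clear color is exactly zero or one,
 * the only clear colors that gen8 and earlier can fast-clear.
 */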
195 static bool
196 color_is_zero_one(VkClearColorValue value, enum isl_format format)
197 {
198 if (isl_format_has_int_channel(format)) {
199 for (unsigned i = 0; i < 4; i++) {
200 if (value.int32[i] != 0 && value.int32[i] != 1)
201 return false;
202 }
203 } else {
204 for (unsigned i = 0; i < 4; i++) {
205 if (value.float32[i] != 0.0f && value.float32[i] != 1.0f)
206 return false;
207 }
208 }
209
210 return true;
211 }
212
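/* Decide which auxiliary surface usage (CCS_E, CCS_D, or none) a color
 * attachment gets and whether its pending clear can be done as a fast
 * clear.
 */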
213 static void
214 color_attachment_compute_aux_usage(struct anv_device *device,
215 struct anv_attachment_state *att_state,
216 struct anv_image_view *iview,
217 VkRect2D render_area,
218 union isl_color_value *fast_clear_color)
219 {
220 if (iview->image->aux_surface.isl.size == 0) {
221 att_state->aux_usage = ISL_AUX_USAGE_NONE;
222 att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
223 att_state->fast_clear = false;
224 return;
225 }
226
227 assert(iview->image->aux_surface.isl.usage & ISL_SURF_USAGE_CCS_BIT);
228
229 att_state->clear_color_is_zero_one =
230 color_is_zero_one(att_state->clear_value.color, iview->isl.format);
231
232 if (att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
233 /* Start off assuming fast clears are possible */
234 att_state->fast_clear = true;
235
236 /* Potentially, we could do partial fast-clears but doing so has crazy
237 * alignment restrictions. It's easier to just restrict to full size
238 * fast clears for now.
239 */
240 if (render_area.offset.x != 0 ||
241 render_area.offset.y != 0 ||
242 render_area.extent.width != iview->extent.width ||
243 render_area.extent.height != iview->extent.height)
244 att_state->fast_clear = false;
245
246 if (GEN_GEN <= 7) {
247 /* On gen7, we can't do multi-LOD or multi-layer fast-clears. We
248 * technically can, but it comes with crazy restrictions that we
249 * don't want to deal with now.
250 */
251 if (iview->isl.base_level > 0 ||
252 iview->isl.base_array_layer > 0 ||
253 iview->isl.array_len > 1)
254 att_state->fast_clear = false;
255 }
256
257 /* On Broadwell and earlier, we can only handle 0/1 clear colors */
258 if (GEN_GEN <= 8 && !att_state->clear_color_is_zero_one)
259 att_state->fast_clear = false;
260
261 if (att_state->fast_clear) {
262 memcpy(fast_clear_color->u32, att_state->clear_value.color.uint32,
263 sizeof(fast_clear_color->u32));
264 }
265 } else {
266 att_state->fast_clear = false;
267 }
268
269 if (isl_format_supports_lossless_compression(&device->info,
270 iview->isl.format)) {
271 att_state->aux_usage = ISL_AUX_USAGE_CCS_E;
272 att_state->input_aux_usage = ISL_AUX_USAGE_CCS_E;
273 } else if (att_state->fast_clear) {
274 att_state->aux_usage = ISL_AUX_USAGE_CCS_D;
275 if (GEN_GEN >= 9) {
276 /* From the Sky Lake PRM, RENDER_SURFACE_STATE::AuxiliarySurfaceMode:
277 *
278 * "If Number of Multisamples is MULTISAMPLECOUNT_1, AUX_CCS_D
279 * setting is only allowed if Surface Format supported for Fast
280 * Clear. In addition, if the surface is bound to the sampling
281 * engine, Surface Format must be supported for Render Target
282 * Compression for surfaces bound to the sampling engine."
283 *
284 * In other words, we can't sample from a fast-cleared image if it
285 * doesn't also support color compression.
286 */
287 att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
288 } else if (GEN_GEN == 8) {
289 /* Broadwell can sample from fast-cleared images */
290 att_state->input_aux_usage = ISL_AUX_USAGE_CCS_D;
291 } else {
292 /* Ivy Bridge and Haswell cannot */
293 att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
294 }
295 } else {
296 att_state->aux_usage = ISL_AUX_USAGE_NONE;
297 att_state->input_aux_usage = ISL_AUX_USAGE_NONE;
298 }
299 }
300
301 static bool
302 need_input_attachment_state(const struct anv_render_pass_attachment *att)
303 {
304 if (!(att->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
305 return false;
306
307 /* We only allocate input attachment states for color surfaces. Compression
308 * is not yet enabled for depth textures and stencil doesn't allow
309 * compression, so we can just use the texture surface state from the view.
310 */
311 return vk_format_is_color(att->format);
312 }
313
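/* Returns ISL_AUX_USAGE_HIZ if a depth buffer in the given layout can be
 * used with HiZ enabled (taking into account whether this gen can sample
 * from HiZ), otherwise ISL_AUX_USAGE_NONE.
 */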
314 static enum isl_aux_usage
315 layout_to_hiz_usage(VkImageLayout layout, uint8_t samples)
316 {
317 switch (layout) {
318 case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
319 return ISL_AUX_USAGE_HIZ;
320 case VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL:
321 case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
322 if (anv_can_sample_with_hiz(GEN_GEN, samples))
323 return ISL_AUX_USAGE_HIZ;
324 /* Fall-through */
325 case VK_IMAGE_LAYOUT_GENERAL:
326 /* This buffer could be used as a source or destination in a transfer
327 * operation. Transfer operations currently don't perform HiZ-enabled reads
328 * and writes.
329 */
330 default:
331 return ISL_AUX_USAGE_NONE;
332 }
333 }
334
335 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
336 * the initial layout is undefined, the HiZ buffer and depth buffer will
337 * represent the same data at the end of this operation.
338 */
339 static void
340 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
341 const struct anv_image *image,
342 VkImageLayout initial_layout,
343 VkImageLayout final_layout)
344 {
345 assert(image);
346
347 if (image->aux_usage != ISL_AUX_USAGE_HIZ || final_layout == initial_layout)
348 return;
349
350 const bool hiz_enabled = layout_to_hiz_usage(initial_layout, image->samples) ==
351 ISL_AUX_USAGE_HIZ;
352 const bool enable_hiz = layout_to_hiz_usage(final_layout, image->samples) ==
353 ISL_AUX_USAGE_HIZ;
354
355 enum blorp_hiz_op hiz_op;
356 if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
357 /* We've already initialized the aux HiZ buffer at BindImageMemory time,
358 * so there's no need to perform a HIZ resolve or clear to avoid GPU hangs.
359 * This initial layout indicates that the user doesn't care about the data
360 * that's currently in the buffer, so resolves are not necessary except
361 * for the special case noted below.
362 */
363 hiz_op = BLORP_HIZ_OP_NONE;
364 } else if (hiz_enabled && !enable_hiz) {
365 hiz_op = BLORP_HIZ_OP_DEPTH_RESOLVE;
366 } else if (!hiz_enabled && enable_hiz) {
367 hiz_op = BLORP_HIZ_OP_HIZ_RESOLVE;
368 } else {
369 assert(hiz_enabled == enable_hiz);
370 /* If the same buffer will be used, no resolves are necessary except for
371 * the special case noted below.
372 */
373 hiz_op = BLORP_HIZ_OP_NONE;
374 }
375
376 if (hiz_op != BLORP_HIZ_OP_NONE)
377 anv_gen8_hiz_op_resolve(cmd_buffer, image, hiz_op);
378
379 /* Images that have sampling with HiZ enabled cause all shader sampling to
380 * load data from the HiZ buffer. Therefore, in the case of transitioning to
381 * the general layout - which currently routes all writes to the depth
382 * buffer - we must ensure that the HiZ buffer remains consistent with the
383 * depth buffer by performing an additional HIZ resolve if the operation
384 * required by this transition was not already a HiZ resolve.
385 */
386 if (final_layout == VK_IMAGE_LAYOUT_GENERAL &&
387 anv_can_sample_with_hiz(GEN_GEN, image->samples) &&
388 hiz_op != BLORP_HIZ_OP_HIZ_RESOLVE) {
389 anv_gen8_hiz_op_resolve(cmd_buffer, image, BLORP_HIZ_OP_HIZ_RESOLVE);
390 }
391 }
392
393
394 /**
395 * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
396 */
397 static void
398 genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
399 struct anv_render_pass *pass,
400 const VkRenderPassBeginInfo *begin)
401 {
402 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
403 struct anv_cmd_state *state = &cmd_buffer->state;
404
405 vk_free(&cmd_buffer->pool->alloc, state->attachments);
406
407 if (pass->attachment_count == 0) {
408 state->attachments = NULL;
409 return;
410 }
411
412 state->attachments = vk_alloc(&cmd_buffer->pool->alloc,
413 pass->attachment_count *
414 sizeof(state->attachments[0]),
415 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
416 if (state->attachments == NULL) {
417 /* FIXME: Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
418 abort();
419 }
420
421 bool need_null_state = false;
422 unsigned num_states = 0;
423 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
424 if (vk_format_is_color(pass->attachments[i].format)) {
425 num_states++;
426 } else {
427 /* We need a null state for any depth-stencil-only subpasses.
428 * Importantly, this includes depth/stencil clears, so we create one
429 * whenever we have depth or stencil.
430 */
431 need_null_state = true;
432 }
433
434 if (need_input_attachment_state(&pass->attachments[i]))
435 num_states++;
436 }
437 num_states += need_null_state;
438
439 const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
440 state->render_pass_states =
441 anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
442 num_states * ss_stride, isl_dev->ss.align);
443
444 struct anv_state next_state = state->render_pass_states;
445 next_state.alloc_size = isl_dev->ss.size;
446
447 if (need_null_state) {
448 state->null_surface_state = next_state;
449 next_state.offset += ss_stride;
450 next_state.map += ss_stride;
451 }
452
453 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
454 if (vk_format_is_color(pass->attachments[i].format)) {
455 state->attachments[i].color_rt_state = next_state;
456 next_state.offset += ss_stride;
457 next_state.map += ss_stride;
458 }
459
460 if (need_input_attachment_state(&pass->attachments[i])) {
461 state->attachments[i].input_att_state = next_state;
462 next_state.offset += ss_stride;
463 next_state.map += ss_stride;
464 }
465 }
466 assert(next_state.offset == state->render_pass_states.offset +
467 state->render_pass_states.alloc_size);
468
469 if (begin) {
470 ANV_FROM_HANDLE(anv_framebuffer, framebuffer, begin->framebuffer);
471 assert(pass->attachment_count == framebuffer->attachment_count);
472
473 if (need_null_state) {
474 struct GENX(RENDER_SURFACE_STATE) null_ss = {
475 .SurfaceType = SURFTYPE_NULL,
476 .SurfaceArray = framebuffer->layers > 0,
477 .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
478 #if GEN_GEN >= 8
479 .TileMode = YMAJOR,
480 #else
481 .TiledSurface = true,
482 #endif
483 .Width = framebuffer->width - 1,
484 .Height = framebuffer->height - 1,
485 .Depth = framebuffer->layers - 1,
486 .RenderTargetViewExtent = framebuffer->layers - 1,
487 };
488 GENX(RENDER_SURFACE_STATE_pack)(NULL, state->null_surface_state.map,
489 &null_ss);
490 }
491
492 for (uint32_t i = 0; i < pass->attachment_count; ++i) {
493 struct anv_render_pass_attachment *att = &pass->attachments[i];
494 VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
495 VkImageAspectFlags clear_aspects = 0;
496
497 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
498 /* color attachment */
499 if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
500 clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
501 }
502 } else {
503 /* depthstencil attachment */
504 if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
505 att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
506 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
507 }
508 if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
509 att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
510 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
511 }
512 }
513
514 state->attachments[i].current_layout = att->initial_layout;
515 state->attachments[i].pending_clear_aspects = clear_aspects;
516 if (clear_aspects)
517 state->attachments[i].clear_value = begin->pClearValues[i];
518
519 struct anv_image_view *iview = framebuffer->attachments[i];
520 assert(iview->vk_format == att->format);
521
522 union isl_color_value clear_color = { .u32 = { 0, } };
523 if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
524 color_attachment_compute_aux_usage(cmd_buffer->device,
525 &state->attachments[i],
526 iview, begin->renderArea,
527 &clear_color);
528
529 struct isl_view view = iview->isl;
530 view.usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
531 isl_surf_fill_state(isl_dev,
532 state->attachments[i].color_rt_state.map,
533 .surf = &iview->image->color_surface.isl,
534 .view = &view,
535 .aux_surf = &iview->image->aux_surface.isl,
536 .aux_usage = state->attachments[i].aux_usage,
537 .clear_color = clear_color,
538 .mocs = cmd_buffer->device->default_mocs);
539
540 add_image_view_relocs(cmd_buffer, iview,
541 state->attachments[i].aux_usage,
542 state->attachments[i].color_rt_state);
543 } else {
544 if (iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
545 state->attachments[i].aux_usage =
546 layout_to_hiz_usage(att->initial_layout, iview->image->samples);
547 } else {
548 state->attachments[i].aux_usage = ISL_AUX_USAGE_NONE;
549 }
550 state->attachments[i].input_aux_usage = ISL_AUX_USAGE_NONE;
551 }
552
553 if (need_input_attachment_state(&pass->attachments[i])) {
554 struct isl_view view = iview->isl;
555 view.usage |= ISL_SURF_USAGE_TEXTURE_BIT;
556 isl_surf_fill_state(isl_dev,
557 state->attachments[i].input_att_state.map,
558 .surf = &iview->image->color_surface.isl,
559 .view = &view,
560 .aux_surf = &iview->image->aux_surface.isl,
561 .aux_usage = state->attachments[i].input_aux_usage,
562 .clear_color = clear_color,
563 .mocs = cmd_buffer->device->default_mocs);
564
565 add_image_view_relocs(cmd_buffer, iview,
566 state->attachments[i].input_aux_usage,
567 state->attachments[i].input_att_state);
568 }
569 }
570
571 if (!cmd_buffer->device->info.has_llc)
572 anv_state_clflush(state->render_pass_states);
573 }
574 }
575
576 VkResult
577 genX(BeginCommandBuffer)(
578 VkCommandBuffer commandBuffer,
579 const VkCommandBufferBeginInfo* pBeginInfo)
580 {
581 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
582
583 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
584 * command buffer's state. Otherwise, we must *reset* its state. In both
585 * cases we reset it.
586 *
587 * From the Vulkan 1.0 spec:
588 *
589 * If a command buffer is in the executable state and the command buffer
590 * was allocated from a command pool with the
591 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
592 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
593 * as if vkResetCommandBuffer had been called with
594 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
595 * the command buffer in the recording state.
596 */
597 anv_cmd_buffer_reset(cmd_buffer);
598
599 cmd_buffer->usage_flags = pBeginInfo->flags;
600
601 assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
602 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
603
604 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
605
606 if (cmd_buffer->usage_flags &
607 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
608 cmd_buffer->state.pass =
609 anv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
610 cmd_buffer->state.subpass =
611 &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
612 cmd_buffer->state.framebuffer = NULL;
613
614 genX(cmd_buffer_setup_attachments)(cmd_buffer, cmd_buffer->state.pass,
615 NULL);
616
617 cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
618 }
619
620 return VK_SUCCESS;
621 }
622
623 VkResult
624 genX(EndCommandBuffer)(
625 VkCommandBuffer commandBuffer)
626 {
627 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
628
629 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
630
631 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
632
633 return VK_SUCCESS;
634 }
635
636 void
637 genX(CmdExecuteCommands)(
638 VkCommandBuffer commandBuffer,
639 uint32_t commandBufferCount,
640 const VkCommandBuffer* pCmdBuffers)
641 {
642 ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
643
644 assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
645
646 for (uint32_t i = 0; i < commandBufferCount; i++) {
647 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
648
649 assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
650
651 if (secondary->usage_flags &
652 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
653 /* If we're continuing a render pass from the primary, we need to
654 * copy the surface states for the current subpass into the storage
655 * we allocated for them in BeginCommandBuffer.
656 */
657 struct anv_bo *ss_bo = &primary->device->surface_state_block_pool.bo;
658 struct anv_state src_state = primary->state.render_pass_states;
659 struct anv_state dst_state = secondary->state.render_pass_states;
660 assert(src_state.alloc_size == dst_state.alloc_size);
661
662 genX(cmd_buffer_gpu_memcpy)(primary, ss_bo, dst_state.offset,
663 ss_bo, src_state.offset,
664 src_state.alloc_size);
665 }
666
667 anv_cmd_buffer_add_secondary(primary, secondary);
668 }
669
670 /* Each of the secondary command buffers will use its own state base
671 * address. We need to re-emit state base address for the primary after
672 * all of the secondaries are done.
673 *
674 * TODO: Maybe we want to make this a dirty bit to avoid extra state base
675 * address calls?
676 */
677 genX(cmd_buffer_emit_state_base_address)(primary);
678 }
679
680 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
681 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
682 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000
683
684 /**
685 * Program the hardware to use the specified L3 configuration.
686 */
687 void
688 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
689 const struct gen_l3_config *cfg)
690 {
691 assert(cfg);
692 if (cfg == cmd_buffer->state.current_l3_config)
693 return;
694
695 if (unlikely(INTEL_DEBUG & DEBUG_L3)) {
696 fprintf(stderr, "L3 config transition: ");
697 gen_dump_l3_config(cfg, stderr);
698 }
699
700 const bool has_slm = cfg->n[GEN_L3P_SLM];
701
702 /* According to the hardware docs, the L3 partitioning can only be changed
703 * while the pipeline is completely drained and the caches are flushed,
704 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
705 */
706 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
707 pc.DCFlushEnable = true;
708 pc.PostSyncOperation = NoWrite;
709 pc.CommandStreamerStallEnable = true;
710 }
711
712 /* ...followed by a second pipelined PIPE_CONTROL that initiates
713 * invalidation of the relevant caches. Note that because RO invalidation
714 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
715 * command is processed by the CS) we cannot combine it with the previous
716 * stalling flush as the hardware documentation suggests, because that
717 * would cause the CS to stall on previous rendering *after* RO
718 * invalidation and wouldn't prevent the RO caches from being polluted by
719 * concurrent rendering before the stall completes. This intentionally
720 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
721 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
722 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
723 * already guarantee that there is no concurrent GPGPU kernel execution
724 * (see SKL HSD 2132585).
725 */
726 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
727 pc.TextureCacheInvalidationEnable = true;
728 pc.ConstantCacheInvalidationEnable = true;
729 pc.InstructionCacheInvalidateEnable = true;
730 pc.StateCacheInvalidationEnable = true;
731 pc.PostSyncOperation = NoWrite;
732 }
733
734 /* Now send a third stalling flush to make sure that invalidation is
735 * complete when the L3 configuration registers are modified.
736 */
737 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
738 pc.DCFlushEnable = true;
739 pc.PostSyncOperation = NoWrite;
740 pc.CommandStreamerStallEnable = true;
741 }
742
743 #if GEN_GEN >= 8
744
745 assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);
746
747 uint32_t l3cr;
748 anv_pack_struct(&l3cr, GENX(L3CNTLREG),
749 .SLMEnable = has_slm,
750 .URBAllocation = cfg->n[GEN_L3P_URB],
751 .ROAllocation = cfg->n[GEN_L3P_RO],
752 .DCAllocation = cfg->n[GEN_L3P_DC],
753 .AllAllocation = cfg->n[GEN_L3P_ALL]);
754
755 /* Set up the L3 partitioning. */
756 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG_num), l3cr);
757
758 #else
759
760 const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
761 const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
762 cfg->n[GEN_L3P_ALL];
763 const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
764 cfg->n[GEN_L3P_ALL];
765 const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
766 cfg->n[GEN_L3P_ALL];
767
768 assert(!cfg->n[GEN_L3P_ALL]);
769
770 /* When enabled, SLM only uses a portion of the L3 on half of the banks,
771 * the matching space on the remaining banks has to be allocated to a
772 * client (URB for all validated configurations) set to the
773 * lower-bandwidth 2-bank address hashing mode.
774 */
775 const struct gen_device_info *devinfo = &cmd_buffer->device->info;
776 const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
777 assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);
778
779 /* Minimum number of ways that can be allocated to the URB. */
780 MAYBE_UNUSED const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0;
781 assert(cfg->n[GEN_L3P_URB] >= n0_urb);
782
783 uint32_t l3sqcr1, l3cr2, l3cr3;
784 anv_pack_struct(&l3sqcr1, GENX(L3SQCREG1),
785 .ConvertDC_UC = !has_dc,
786 .ConvertIS_UC = !has_is,
787 .ConvertC_UC = !has_c,
788 .ConvertT_UC = !has_t);
789 l3sqcr1 |=
790 GEN_IS_HASWELL ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
791 devinfo->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
792 IVB_L3SQCREG1_SQGHPCI_DEFAULT;
793
794 anv_pack_struct(&l3cr2, GENX(L3CNTLREG2),
795 .SLMEnable = has_slm,
796 .URBLowBandwidth = urb_low_bw,
797 .URBAllocation = cfg->n[GEN_L3P_URB],
798 #if !GEN_IS_HASWELL
799 .ALLAllocation = cfg->n[GEN_L3P_ALL],
800 #endif
801 .ROAllocation = cfg->n[GEN_L3P_RO],
802 .DCAllocation = cfg->n[GEN_L3P_DC]);
803
804 anv_pack_struct(&l3cr3, GENX(L3CNTLREG3),
805 .ISAllocation = cfg->n[GEN_L3P_IS],
806 .ISLowBandwidth = 0,
807 .CAllocation = cfg->n[GEN_L3P_C],
808 .CLowBandwidth = 0,
809 .TAllocation = cfg->n[GEN_L3P_T],
810 .TLowBandwidth = 0);
811
812 /* Set up the L3 partitioning. */
813 emit_lri(&cmd_buffer->batch, GENX(L3SQCREG1_num), l3sqcr1);
814 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG2_num), l3cr2);
815 emit_lri(&cmd_buffer->batch, GENX(L3CNTLREG3_num), l3cr3);
816
817 #if GEN_IS_HASWELL
818 if (cmd_buffer->device->instance->physicalDevice.cmd_parser_version >= 4) {
819 /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
820 * them disabled to avoid crashing the system hard.
821 */
822 uint32_t scratch1, chicken3;
823 anv_pack_struct(&scratch1, GENX(SCRATCH1),
824 .L3AtomicDisable = !has_dc);
825 anv_pack_struct(&chicken3, GENX(CHICKEN3),
826 .L3AtomicDisableMask = true,
827 .L3AtomicDisable = !has_dc);
828 emit_lri(&cmd_buffer->batch, GENX(SCRATCH1_num), scratch1);
829 emit_lri(&cmd_buffer->batch, GENX(CHICKEN3_num), chicken3);
830 }
831 #endif
832
833 #endif
834
835 cmd_buffer->state.current_l3_config = cfg;
836 }
837
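/* Turn the accumulated pending_pipe_bits into actual PIPE_CONTROLs: cache
 * flushes (plus any required CS stall) go in one packet and cache
 * invalidations in a second.
 */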
838 void
839 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
840 {
841 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
842
843 /* Flushes are pipelined while invalidations are handled immediately.
844 * Therefore, if we're flushing anything then we need to schedule a stall
845 * before any invalidations can happen.
846 */
847 if (bits & ANV_PIPE_FLUSH_BITS)
848 bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
849
850 /* If we're going to do an invalidate and we have a pending CS stall that
851 * has yet to be resolved, we do the CS stall now.
852 */
853 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
854 (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
855 bits |= ANV_PIPE_CS_STALL_BIT;
856 bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
857 }
858
859 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
860 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
861 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
862 pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
863 pipe.RenderTargetCacheFlushEnable =
864 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
865
866 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
867 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
868 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
869
870 /*
871 * According to the Broadwell documentation, any PIPE_CONTROL with the
872 * "Command Streamer Stall" bit set must also have another bit set,
873 * with six different options:
874 *
875 * - Render Target Cache Flush
876 * - Depth Cache Flush
877 * - Stall at Pixel Scoreboard
878 * - Post-Sync Operation
879 * - Depth Stall
880 * - DC Flush Enable
881 *
882 * I chose "Stall at Pixel Scoreboard" since that's what we use in
883 * mesa and it seems to work fine. The choice is fairly arbitrary.
884 */
885 if ((bits & ANV_PIPE_CS_STALL_BIT) &&
886 !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
887 ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
888 pipe.StallAtPixelScoreboard = true;
889 }
890
891 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
892 }
893
894 if (bits & ANV_PIPE_INVALIDATE_BITS) {
895 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
896 pipe.StateCacheInvalidationEnable =
897 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
898 pipe.ConstantCacheInvalidationEnable =
899 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
900 pipe.VFCacheInvalidationEnable =
901 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
902 pipe.TextureCacheInvalidationEnable =
903 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
904 pipe.InstructionCacheInvalidateEnable =
905 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
906 }
907
908 bits &= ~ANV_PIPE_INVALIDATE_BITS;
909 }
910
911 cmd_buffer->state.pending_pipe_bits = bits;
912 }
913
914 void genX(CmdPipelineBarrier)(
915 VkCommandBuffer commandBuffer,
916 VkPipelineStageFlags srcStageMask,
917 VkPipelineStageFlags destStageMask,
918 VkBool32 byRegion,
919 uint32_t memoryBarrierCount,
920 const VkMemoryBarrier* pMemoryBarriers,
921 uint32_t bufferMemoryBarrierCount,
922 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
923 uint32_t imageMemoryBarrierCount,
924 const VkImageMemoryBarrier* pImageMemoryBarriers)
925 {
926 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
927 uint32_t b;
928
929 /* XXX: Right now, we're really dumb and just flush whatever categories
930 * the app asks for. One of these days we may make this a bit better
931 * but right now that's all the hardware allows for in most areas.
932 */
933 VkAccessFlags src_flags = 0;
934 VkAccessFlags dst_flags = 0;
935
936 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
937 src_flags |= pMemoryBarriers[i].srcAccessMask;
938 dst_flags |= pMemoryBarriers[i].dstAccessMask;
939 }
940
941 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
942 src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
943 dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
944 }
945
946 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
947 src_flags |= pImageMemoryBarriers[i].srcAccessMask;
948 dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
949 ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
950 if (pImageMemoryBarriers[i].subresourceRange.aspectMask &
951 VK_IMAGE_ASPECT_DEPTH_BIT) {
952 transition_depth_buffer(cmd_buffer, image,
953 pImageMemoryBarriers[i].oldLayout,
954 pImageMemoryBarriers[i].newLayout);
955 }
956 }
957
958 enum anv_pipe_bits pipe_bits = 0;
959
960 for_each_bit(b, src_flags) {
961 switch ((VkAccessFlagBits)(1 << b)) {
962 case VK_ACCESS_SHADER_WRITE_BIT:
963 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
964 break;
965 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
966 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
967 break;
968 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
969 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
970 break;
971 case VK_ACCESS_TRANSFER_WRITE_BIT:
972 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
973 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
974 break;
975 default:
976 break; /* Nothing to do */
977 }
978 }
979
980 for_each_bit(b, dst_flags) {
981 switch ((VkAccessFlagBits)(1 << b)) {
982 case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
983 case VK_ACCESS_INDEX_READ_BIT:
984 case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
985 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
986 break;
987 case VK_ACCESS_UNIFORM_READ_BIT:
988 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
989 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
990 break;
991 case VK_ACCESS_SHADER_READ_BIT:
992 case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
993 case VK_ACCESS_TRANSFER_READ_BIT:
994 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
995 break;
996 default:
997 break; /* Nothing to do */
998 }
999 }
1000
1001 cmd_buffer->state.pending_pipe_bits |= pipe_bits;
1002 }
1003
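/* Partition the push constant space among the active graphics stages with
 * 3DSTATE_PUSH_CONSTANT_ALLOC_*, giving the fragment stage whatever is
 * left over.
 */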
1004 static void
1005 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
1006 {
1007 VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
1008
1009 /* In order to avoid thrash, we assume that vertex and fragment stages
1010 * always exist. In the rare case where one is missing *and* the other
1011 * uses push constants, this may be suboptimal. However, avoiding stalls
1012 * seems more important.
1013 */
1014 stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
1015
1016 if (stages == cmd_buffer->state.push_constant_stages)
1017 return;
1018
1019 #if GEN_GEN >= 8
1020 const unsigned push_constant_kb = 32;
1021 #elif GEN_IS_HASWELL
1022 const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
1023 #else
1024 const unsigned push_constant_kb = 16;
1025 #endif
1026
1027 const unsigned num_stages =
1028 _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
1029 unsigned size_per_stage = push_constant_kb / num_stages;
1030
1031 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
1032 * units of 2KB. Incidentally, these are the same platforms that have
1033 * 32KB worth of push constant space.
1034 */
1035 if (push_constant_kb == 32)
1036 size_per_stage &= ~1u;
1037
1038 uint32_t kb_used = 0;
1039 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
1040 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
1041 anv_batch_emit(&cmd_buffer->batch,
1042 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1043 alloc._3DCommandSubOpcode = 18 + i;
1044 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
1045 alloc.ConstantBufferSize = push_size;
1046 }
1047 kb_used += push_size;
1048 }
1049
1050 anv_batch_emit(&cmd_buffer->batch,
1051 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
1052 alloc.ConstantBufferOffset = kb_used;
1053 alloc.ConstantBufferSize = push_constant_kb - kb_used;
1054 }
1055
1056 cmd_buffer->state.push_constant_stages = stages;
1057
1058 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
1059 *
1060 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
1061 * the next 3DPRIMITIVE command after programming the
1062 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
1063 *
1064 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
1065 * pipeline setup, we need to dirty push constants.
1066 */
1067 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1068 }
1069
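/* Build the binding table for one shader stage: surface states for color
 * attachments, the compute num_workgroups buffer (if used), and all bound
 * descriptors.
 */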
1070 static VkResult
1071 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
1072 gl_shader_stage stage,
1073 struct anv_state *bt_state)
1074 {
1075 struct anv_subpass *subpass = cmd_buffer->state.subpass;
1076 struct anv_pipeline *pipeline;
1077 uint32_t bias, state_offset;
1078
1079 switch (stage) {
1080 case MESA_SHADER_COMPUTE:
1081 pipeline = cmd_buffer->state.compute_pipeline;
1082 bias = 1;
1083 break;
1084 default:
1085 pipeline = cmd_buffer->state.pipeline;
1086 bias = 0;
1087 break;
1088 }
1089
1090 if (!anv_pipeline_has_stage(pipeline, stage)) {
1091 *bt_state = (struct anv_state) { 0, };
1092 return VK_SUCCESS;
1093 }
1094
1095 struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
1096 if (bias + map->surface_count == 0) {
1097 *bt_state = (struct anv_state) { 0, };
1098 return VK_SUCCESS;
1099 }
1100
1101 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
1102 bias + map->surface_count,
1103 &state_offset);
1104 uint32_t *bt_map = bt_state->map;
1105
1106 if (bt_state->map == NULL)
1107 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1108
1109 if (stage == MESA_SHADER_COMPUTE &&
1110 get_cs_prog_data(cmd_buffer->state.compute_pipeline)->uses_num_work_groups) {
1111 struct anv_bo *bo = cmd_buffer->state.num_workgroups_bo;
1112 uint32_t bo_offset = cmd_buffer->state.num_workgroups_offset;
1113
1114 struct anv_state surface_state;
1115 surface_state =
1116 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
1117
1118 const enum isl_format format =
1119 anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1120 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
1121 format, bo_offset, 12, 1);
1122
1123 bt_map[0] = surface_state.offset + state_offset;
1124 add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
1125 }
1126
1127 if (map->surface_count == 0)
1128 goto out;
1129
1130 if (map->image_count > 0) {
1131 VkResult result =
1132 anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, stage, images);
1133 if (result != VK_SUCCESS)
1134 return result;
1135
1136 cmd_buffer->state.push_constants_dirty |= 1 << stage;
1137 }
1138
1139 uint32_t image = 0;
1140 for (uint32_t s = 0; s < map->surface_count; s++) {
1141 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
1142
1143 struct anv_state surface_state;
1144
1145 if (binding->set == ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) {
1146 /* Color attachment binding */
1147 assert(stage == MESA_SHADER_FRAGMENT);
1148 assert(binding->binding == 0);
1149 if (binding->index < subpass->color_count) {
1150 const unsigned att = subpass->color_attachments[binding->index];
1151 surface_state = cmd_buffer->state.attachments[att].color_rt_state;
1152 } else {
1153 surface_state = cmd_buffer->state.null_surface_state;
1154 }
1155
1156 bt_map[bias + s] = surface_state.offset + state_offset;
1157 continue;
1158 }
1159
1160 struct anv_descriptor_set *set =
1161 cmd_buffer->state.descriptors[binding->set];
1162 uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
1163 struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
1164
1165 switch (desc->type) {
1166 case VK_DESCRIPTOR_TYPE_SAMPLER:
1167 /* Nothing for us to do here */
1168 continue;
1169
1170 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
1171 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
1172 surface_state = desc->image_view->sampler_surface_state;
1173 assert(surface_state.alloc_size);
1174 add_image_view_relocs(cmd_buffer, desc->image_view,
1175 desc->image_view->image->aux_usage,
1176 surface_state);
1177 break;
1178
1179 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
1180 assert(stage == MESA_SHADER_FRAGMENT);
1181 if (desc->image_view->aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) {
1182 /* For depth and stencil input attachments, we treat them like any
1183 * old texture that a user may have bound.
1184 */
1185 surface_state = desc->image_view->sampler_surface_state;
1186 assert(surface_state.alloc_size);
1187 add_image_view_relocs(cmd_buffer, desc->image_view,
1188 desc->image_view->image->aux_usage,
1189 surface_state);
1190 } else {
1191 /* For color input attachments, we create the surface state at
1192 * vkBeginRenderPass time so that we can include aux and clear
1193 * color information.
1194 */
1195 assert(binding->input_attachment_index < subpass->input_count);
1196 const unsigned subpass_att = binding->input_attachment_index;
1197 const unsigned att = subpass->input_attachments[subpass_att];
1198 surface_state = cmd_buffer->state.attachments[att].input_att_state;
1199 }
1200 break;
1201
1202 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
1203 surface_state = desc->image_view->storage_surface_state;
1204 assert(surface_state.alloc_size);
1205 add_image_view_relocs(cmd_buffer, desc->image_view,
1206 desc->image_view->image->aux_usage,
1207 surface_state);
1208
1209 struct brw_image_param *image_param =
1210 &cmd_buffer->state.push_constants[stage]->images[image++];
1211
1212 *image_param = desc->image_view->storage_image_param;
1213 image_param->surface_idx = bias + s;
1214 break;
1215 }
1216
1217 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
1218 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
1219 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
1220 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
1221 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
1222 surface_state = desc->buffer_view->surface_state;
1223 assert(surface_state.alloc_size);
1224 add_surface_state_reloc(cmd_buffer, surface_state,
1225 desc->buffer_view->bo,
1226 desc->buffer_view->offset);
1227 break;
1228
1229 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
1230 surface_state = desc->buffer_view->storage_surface_state;
1231 assert(surface_state.alloc_size);
1232 add_surface_state_reloc(cmd_buffer, surface_state,
1233 desc->buffer_view->bo,
1234 desc->buffer_view->offset);
1235
1236 struct brw_image_param *image_param =
1237 &cmd_buffer->state.push_constants[stage]->images[image++];
1238
1239 *image_param = desc->buffer_view->storage_image_param;
1240 image_param->surface_idx = bias + s;
1241 break;
1242
1243 default:
1244 assert(!"Invalid descriptor type");
1245 continue;
1246 }
1247
1248 bt_map[bias + s] = surface_state.offset + state_offset;
1249 }
1250 assert(image == map->image_count);
1251
1252 out:
1253 if (!cmd_buffer->device->info.has_llc)
1254 anv_state_clflush(*bt_state);
1255
1256 return VK_SUCCESS;
1257 }
1258
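/* Allocate and fill the SAMPLER_STATE table for one shader stage by copying
 * each bound sampler's packed state.
 */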
1259 static VkResult
1260 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
1261 gl_shader_stage stage,
1262 struct anv_state *state)
1263 {
1264 struct anv_pipeline *pipeline;
1265
1266 if (stage == MESA_SHADER_COMPUTE)
1267 pipeline = cmd_buffer->state.compute_pipeline;
1268 else
1269 pipeline = cmd_buffer->state.pipeline;
1270
1271 if (!anv_pipeline_has_stage(pipeline, stage)) {
1272 *state = (struct anv_state) { 0, };
1273 return VK_SUCCESS;
1274 }
1275
1276 struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
1277 if (map->sampler_count == 0) {
1278 *state = (struct anv_state) { 0, };
1279 return VK_SUCCESS;
1280 }
1281
1282 uint32_t size = map->sampler_count * 16;
1283 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
1284
1285 if (state->map == NULL)
1286 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1287
1288 for (uint32_t s = 0; s < map->sampler_count; s++) {
1289 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
1290 struct anv_descriptor_set *set =
1291 cmd_buffer->state.descriptors[binding->set];
1292 uint32_t offset = set->layout->binding[binding->binding].descriptor_index;
1293 struct anv_descriptor *desc = &set->descriptors[offset + binding->index];
1294
1295 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
1296 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
1297 continue;
1298
1299 struct anv_sampler *sampler = desc->sampler;
1300
1301 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
1302 * happens to be zero.
1303 */
1304 if (sampler == NULL)
1305 continue;
1306
1307 memcpy(state->map + (s * 16),
1308 sampler->state, sizeof(sampler->state));
1309 }
1310
1311 if (!cmd_buffer->device->info.has_llc)
1312 anv_state_clflush(*state);
1313
1314 return VK_SUCCESS;
1315 }
1316
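/* Emit sampler and binding tables for every dirty stage. If we run out of
 * binding table space, grab a new binding table block, re-emit
 * STATE_BASE_ADDRESS, and re-emit the tables for all active stages.
 * Returns the mask of stages whose tables were emitted.
 */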
1317 static uint32_t
1318 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer)
1319 {
1320 VkShaderStageFlags dirty = cmd_buffer->state.descriptors_dirty &
1321 cmd_buffer->state.pipeline->active_stages;
1322
1323 VkResult result = VK_SUCCESS;
1324 anv_foreach_stage(s, dirty) {
1325 result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
1326 if (result != VK_SUCCESS)
1327 break;
1328 result = emit_binding_table(cmd_buffer, s,
1329 &cmd_buffer->state.binding_tables[s]);
1330 if (result != VK_SUCCESS)
1331 break;
1332 }
1333
1334 if (result != VK_SUCCESS) {
1335 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
1336
1337 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
1338 assert(result == VK_SUCCESS);
1339
1340 /* Re-emit state base addresses so we get the new surface state base
1341 * address before we start emitting binding tables etc.
1342 */
1343 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1344
1345 /* Re-emit all active binding tables */
1346 dirty |= cmd_buffer->state.pipeline->active_stages;
1347 anv_foreach_stage(s, dirty) {
1348 result = emit_samplers(cmd_buffer, s, &cmd_buffer->state.samplers[s]);
1349 if (result != VK_SUCCESS)
1350 return result;
1351 result = emit_binding_table(cmd_buffer, s,
1352 &cmd_buffer->state.binding_tables[s]);
1353 if (result != VK_SUCCESS)
1354 return result;
1355 }
1356 }
1357
1358 cmd_buffer->state.descriptors_dirty &= ~dirty;
1359
1360 return dirty;
1361 }
1362
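/* Emit 3DSTATE_SAMPLER_STATE_POINTERS_* and 3DSTATE_BINDING_TABLE_POINTERS_*
 * for each stage in the given mask.
 */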
1363 static void
1364 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
1365 uint32_t stages)
1366 {
1367 static const uint32_t sampler_state_opcodes[] = {
1368 [MESA_SHADER_VERTEX] = 43,
1369 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
1370 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
1371 [MESA_SHADER_GEOMETRY] = 46,
1372 [MESA_SHADER_FRAGMENT] = 47,
1373 [MESA_SHADER_COMPUTE] = 0,
1374 };
1375
1376 static const uint32_t binding_table_opcodes[] = {
1377 [MESA_SHADER_VERTEX] = 38,
1378 [MESA_SHADER_TESS_CTRL] = 39,
1379 [MESA_SHADER_TESS_EVAL] = 40,
1380 [MESA_SHADER_GEOMETRY] = 41,
1381 [MESA_SHADER_FRAGMENT] = 42,
1382 [MESA_SHADER_COMPUTE] = 0,
1383 };
1384
1385 anv_foreach_stage(s, stages) {
1386 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
1387 anv_batch_emit(&cmd_buffer->batch,
1388 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
1389 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
1390 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
1391 }
1392 }
1393
1394 /* Always emit binding table pointers if we're asked to, since on SKL
1395 * this is what flushes push constants. */
1396 anv_batch_emit(&cmd_buffer->batch,
1397 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
1398 btp._3DCommandSubOpcode = binding_table_opcodes[s];
1399 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
1400 }
1401 }
1402 }
1403
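/* Emit 3DSTATE_CONSTANT_* for every graphics stage with dirty push constants
 * and return the mask of stages that were flushed.
 */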
1404 static uint32_t
1405 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
1406 {
1407 static const uint32_t push_constant_opcodes[] = {
1408 [MESA_SHADER_VERTEX] = 21,
1409 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
1410 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
1411 [MESA_SHADER_GEOMETRY] = 22,
1412 [MESA_SHADER_FRAGMENT] = 23,
1413 [MESA_SHADER_COMPUTE] = 0,
1414 };
1415
1416 VkShaderStageFlags flushed = 0;
1417
1418 anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
1419 if (stage == MESA_SHADER_COMPUTE)
1420 continue;
1421
1422 struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
1423
1424 if (state.offset == 0) {
1425 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
1426 c._3DCommandSubOpcode = push_constant_opcodes[stage];
1427 } else {
1428 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
1429 c._3DCommandSubOpcode = push_constant_opcodes[stage],
1430 c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
1431 #if GEN_GEN >= 9
1432 .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
1433 .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
1434 #else
1435 .PointerToConstantBuffer0 = { .offset = state.offset },
1436 .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
1437 #endif
1438 };
1439 }
1440 }
1441
1442 flushed |= mesa_to_vk_shader_stage(stage);
1443 }
1444
1445 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
1446
1447 return flushed;
1448 }
1449
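/* Flush all dirty 3D state before a draw: vertex buffers, the pipeline,
 * descriptors, push constants, viewport/scissor, dynamic state, and any
 * pending pipe flushes.
 */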
1450 void
1451 genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
1452 {
1453 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1454 uint32_t *p;
1455
1456 uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;
1457
1458 assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
1459
1460 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1461
1462 genX(flush_pipeline_select_3d)(cmd_buffer);
1463
1464 if (vb_emit) {
1465 const uint32_t num_buffers = __builtin_popcount(vb_emit);
1466 const uint32_t num_dwords = 1 + num_buffers * 4;
1467
1468 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
1469 GENX(3DSTATE_VERTEX_BUFFERS));
1470 uint32_t vb, i = 0;
1471 for_each_bit(vb, vb_emit) {
1472 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
1473 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
1474
1475 struct GENX(VERTEX_BUFFER_STATE) state = {
1476 .VertexBufferIndex = vb,
1477
1478 #if GEN_GEN >= 8
1479 .MemoryObjectControlState = GENX(MOCS),
1480 #else
1481 .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
1482 .InstanceDataStepRate = 1,
1483 .VertexBufferMemoryObjectControlState = GENX(MOCS),
1484 #endif
1485
1486 .AddressModifyEnable = true,
1487 .BufferPitch = pipeline->binding_stride[vb],
1488 .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
1489
1490 #if GEN_GEN >= 8
1491 .BufferSize = buffer->size - offset
1492 #else
1493 .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
1494 #endif
1495 };
1496
1497 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
1498 i++;
1499 }
1500 }
1501
1502 cmd_buffer->state.vb_dirty &= ~vb_emit;
1503
1504 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
1505 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1506
1507 /* The exact descriptor layout is pulled from the pipeline, so we need
1508 * to re-emit binding tables on every pipeline change.
1509 */
1510 cmd_buffer->state.descriptors_dirty |=
1511 cmd_buffer->state.pipeline->active_stages;
1512
1513 /* If the pipeline changed, we may need to re-allocate push constant
1514 * space in the URB.
1515 */
1516 cmd_buffer_alloc_push_constants(cmd_buffer);
1517 }
1518
1519 #if GEN_GEN <= 7
1520 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
1521 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
1522 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
1523 *
1524 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
1525 * stall needs to be sent just prior to any 3DSTATE_VS,
1526 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
1527 * 3DSTATE_BINDING_TABLE_POINTER_VS,
1528 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one
1529 * PIPE_CONTROL needs to be sent before any combination of VS
1530 * associated 3DSTATE."
1531 */
1532 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1533 pc.DepthStallEnable = true;
1534 pc.PostSyncOperation = WriteImmediateData;
1535 pc.Address =
1536 (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
1537 }
1538 }
1539 #endif
1540
1541 /* Render targets live in the same binding table as fragment descriptors */
1542 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
1543 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
1544
1545 /* We emit the binding tables and sampler tables first, then emit push
1546 * constants and then finally emit binding table and sampler table
1547 * pointers. It has to happen in this order, since emitting the binding
1548 * tables may change the push constants (in case of storage images). After
1549 * emitting push constants, on SKL+ we have to emit the corresponding
1550 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
1551 */
1552 uint32_t dirty = 0;
1553 if (cmd_buffer->state.descriptors_dirty)
1554 dirty = flush_descriptor_sets(cmd_buffer);
1555
1556 if (cmd_buffer->state.push_constants_dirty) {
1557 #if GEN_GEN >= 9
1558 /* On Sky Lake and later, the binding table pointers commands are
1559 * what actually flush the changes to push constant state so we need
1560 * to dirty them so they get re-emitted below.
1561 */
1562 dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
1563 #else
1564 cmd_buffer_flush_push_constants(cmd_buffer);
1565 #endif
1566 }
1567
1568 if (dirty)
1569 cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
1570
1571 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
1572 gen8_cmd_buffer_emit_viewport(cmd_buffer);
1573
1574 if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
1575 ANV_CMD_DIRTY_PIPELINE)) {
1576 gen8_cmd_buffer_emit_depth_viewport(cmd_buffer,
1577 pipeline->depth_clamp_enable);
1578 }
1579
1580 if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
1581 gen7_cmd_buffer_emit_scissor(cmd_buffer);
1582
1583 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
1584
1585 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1586 }
1587
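/* Bind a small vertex buffer at index 32 (reserved by the driver) containing
 * the base vertex and base instance values, which the vertex shader fetches
 * as extra vertex data when it uses them.
 */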
1588 static void
1589 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
1590 struct anv_bo *bo, uint32_t offset)
1591 {
1592 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
1593 GENX(3DSTATE_VERTEX_BUFFERS));
1594
1595 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
1596 &(struct GENX(VERTEX_BUFFER_STATE)) {
1597          .VertexBufferIndex = 32, /* Reserved for base vertex/instance */
1598 .AddressModifyEnable = true,
1599 .BufferPitch = 0,
1600 #if (GEN_GEN >= 8)
1601 .MemoryObjectControlState = GENX(MOCS),
1602 .BufferStartingAddress = { bo, offset },
1603 .BufferSize = 8
1604 #else
1605 .VertexBufferMemoryObjectControlState = GENX(MOCS),
1606 .BufferStartingAddress = { bo, offset },
1607 .EndAddress = { bo, offset + 8 },
1608 #endif
1609 });
1610 }
1611
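/* Draws that use base vertex or base instance read the two values from a
 * small buffer bound at vertex buffer index 32 (reserved above).  The helper
 * below writes that pair of dwords into dynamic state and points the
 * reserved vertex buffer at it; emit_base_vertex_instance_bo() is used
 * directly for indirect draws, where the values already live in the
 * application's indirect buffer.
 */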
1612 static void
1613 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
1614 uint32_t base_vertex, uint32_t base_instance)
1615 {
1616 struct anv_state id_state =
1617 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
1618
1619 ((uint32_t *)id_state.map)[0] = base_vertex;
1620 ((uint32_t *)id_state.map)[1] = base_instance;
1621
1622 if (!cmd_buffer->device->info.has_llc)
1623 anv_state_clflush(id_state);
1624
1625 emit_base_vertex_instance_bo(cmd_buffer,
1626 &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
1627 }
1628
1629 void genX(CmdDraw)(
1630 VkCommandBuffer commandBuffer,
1631 uint32_t vertexCount,
1632 uint32_t instanceCount,
1633 uint32_t firstVertex,
1634 uint32_t firstInstance)
1635 {
1636 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1637 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1638 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1639
1640 genX(cmd_buffer_flush_state)(cmd_buffer);
1641
1642 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1643 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1644
1645 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1646 prim.VertexAccessType = SEQUENTIAL;
1647 prim.PrimitiveTopologyType = pipeline->topology;
1648 prim.VertexCountPerInstance = vertexCount;
1649 prim.StartVertexLocation = firstVertex;
1650 prim.InstanceCount = instanceCount;
1651 prim.StartInstanceLocation = firstInstance;
1652 prim.BaseVertexLocation = 0;
1653 }
1654 }
1655
1656 void genX(CmdDrawIndexed)(
1657 VkCommandBuffer commandBuffer,
1658 uint32_t indexCount,
1659 uint32_t instanceCount,
1660 uint32_t firstIndex,
1661 int32_t vertexOffset,
1662 uint32_t firstInstance)
1663 {
1664 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1665 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1666 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1667
1668 genX(cmd_buffer_flush_state)(cmd_buffer);
1669
1670 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1671 emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
1672
1673 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1674 prim.VertexAccessType = RANDOM;
1675 prim.PrimitiveTopologyType = pipeline->topology;
1676 prim.VertexCountPerInstance = indexCount;
1677 prim.StartVertexLocation = firstIndex;
1678 prim.InstanceCount = instanceCount;
1679 prim.StartInstanceLocation = firstInstance;
1680 prim.BaseVertexLocation = vertexOffset;
1681 }
1682 }
1683
1684 /* Auto-Draw / Indirect Registers */
1685 #define GEN7_3DPRIM_END_OFFSET 0x2420
1686 #define GEN7_3DPRIM_START_VERTEX 0x2430
1687 #define GEN7_3DPRIM_VERTEX_COUNT 0x2434
1688 #define GEN7_3DPRIM_INSTANCE_COUNT 0x2438
1689 #define GEN7_3DPRIM_START_INSTANCE 0x243C
1690 #define GEN7_3DPRIM_BASE_VERTEX 0x2440
1691
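/* The indirect draw paths load the fields of the application's indirect
 * command straight into the 3DPRIM registers above using
 * MI_LOAD_REGISTER_MEM.  The layouts are fixed by the Vulkan spec:
 *
 *    VkDrawIndirectCommand          VkDrawIndexedIndirectCommand
 *       +0  vertexCount                +0  indexCount
 *       +4  instanceCount              +4  instanceCount
 *       +8  firstVertex                +8  firstIndex
 *       +12 firstInstance              +12 vertexOffset
 *                                      +16 firstInstance
 */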
1692 void genX(CmdDrawIndirect)(
1693 VkCommandBuffer commandBuffer,
1694 VkBuffer _buffer,
1695 VkDeviceSize offset,
1696 uint32_t drawCount,
1697 uint32_t stride)
1698 {
1699 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1700 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1701 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1702 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1703 struct anv_bo *bo = buffer->bo;
1704 uint32_t bo_offset = buffer->offset + offset;
1705
1706 genX(cmd_buffer_flush_state)(cmd_buffer);
1707
1708 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1709 emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
1710
1711 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1712 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1713 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1714 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
1715 emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
1716
1717 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1718 prim.IndirectParameterEnable = true;
1719 prim.VertexAccessType = SEQUENTIAL;
1720 prim.PrimitiveTopologyType = pipeline->topology;
1721 }
1722 }
1723
1724 void genX(CmdDrawIndexedIndirect)(
1725 VkCommandBuffer commandBuffer,
1726 VkBuffer _buffer,
1727 VkDeviceSize offset,
1728 uint32_t drawCount,
1729 uint32_t stride)
1730 {
1731 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1732 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1733 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
1734 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1735 struct anv_bo *bo = buffer->bo;
1736 uint32_t bo_offset = buffer->offset + offset;
1737
1738 genX(cmd_buffer_flush_state)(cmd_buffer);
1739
1740 /* TODO: We need to stomp base vertex to 0 somehow */
1741 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
1742 emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
1743
1744 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
1745 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
1746 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
1747 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
1748 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
1749
1750 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1751 prim.IndirectParameterEnable = true;
1752 prim.VertexAccessType = RANDOM;
1753 prim.PrimitiveTopologyType = pipeline->topology;
1754 }
1755 }
1756
1757 static VkResult
1758 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
1759 {
1760 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1761 struct anv_state surfaces = { 0, }, samplers = { 0, };
1762 VkResult result;
1763
1764 result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
1765 if (result != VK_SUCCESS) {
1766 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
1767 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
1768 assert(result == VK_SUCCESS);
1769
1770 /* Re-emit state base addresses so we get the new surface state base
1771 * address before we start emitting binding tables etc.
1772 */
1773 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1774
1775 result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces);
1776 assert(result == VK_SUCCESS);
1777 }
1778
1779 result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers);
1780 assert(result == VK_SUCCESS);
1781
1782 uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
1783 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
1784 .BindingTablePointer = surfaces.offset,
1785 .SamplerStatePointer = samplers.offset,
1786 };
1787 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
1788
1789 struct anv_state state =
1790 anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
1791 pipeline->interface_descriptor_data,
1792 GENX(INTERFACE_DESCRIPTOR_DATA_length),
1793 64);
1794
1795 uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
1796 anv_batch_emit(&cmd_buffer->batch,
1797 GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
1798 mid.InterfaceDescriptorTotalLength = size;
1799 mid.InterfaceDescriptorDataStartAddress = state.offset;
1800 }
1801
1802 return VK_SUCCESS;
1803 }
1804
1805 void
1806 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
1807 {
1808 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1809 MAYBE_UNUSED VkResult result;
1810
1811 assert(pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
1812
1813 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
1814
1815 genX(flush_pipeline_select_gpgpu)(cmd_buffer);
1816
1817 if (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE) {
1818 /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
1819 *
1820 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
1821 * the only bits that are changed are scoreboard related: Scoreboard
1822      *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
1823 * these scoreboard related states, a MEDIA_STATE_FLUSH is
1824 * sufficient."
1825 */
1826 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
1827 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1828
1829 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
1830 }
1831
1832 if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
1833 (cmd_buffer->state.compute_dirty & ANV_CMD_DIRTY_PIPELINE)) {
1834 /* FIXME: figure out descriptors for gen7 */
1835 result = flush_compute_descriptor_set(cmd_buffer);
1836 assert(result == VK_SUCCESS);
1837 cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
1838 }
1839
1840 if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
1841 struct anv_state push_state =
1842 anv_cmd_buffer_cs_push_constants(cmd_buffer);
1843
1844 if (push_state.alloc_size) {
1845 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
1846 curbe.CURBETotalDataLength = push_state.alloc_size;
1847 curbe.CURBEDataStartAddress = push_state.offset;
1848 }
1849 }
1850 }
1851
1852 cmd_buffer->state.compute_dirty = 0;
1853
1854 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1855 }
1856
1857 #if GEN_GEN == 7
1858
1859 static bool
1860 verify_cmd_parser(const struct anv_device *device,
1861 int required_version,
1862 const char *function)
1863 {
1864 if (device->instance->physicalDevice.cmd_parser_version < required_version) {
1865 vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
1866 "cmd parser version %d is required for %s",
1867 required_version, function);
1868 return false;
1869 } else {
1870 return true;
1871 }
1872 }
1873
1874 #endif
1875
1876 void genX(CmdDispatch)(
1877 VkCommandBuffer commandBuffer,
1878 uint32_t x,
1879 uint32_t y,
1880 uint32_t z)
1881 {
1882 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1883 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1884 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1885
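   /* If the shader reads the number of work groups (gl_NumWorkGroups), stash
    * the three dispatch dimensions in dynamic state and record where they
    * live so the rest of the driver can expose them to the shader.
    */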
1886 if (prog_data->uses_num_work_groups) {
1887 struct anv_state state =
1888 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
1889 uint32_t *sizes = state.map;
1890 sizes[0] = x;
1891 sizes[1] = y;
1892 sizes[2] = z;
1893 if (!cmd_buffer->device->info.has_llc)
1894 anv_state_clflush(state);
1895 cmd_buffer->state.num_workgroups_offset = state.offset;
1896 cmd_buffer->state.num_workgroups_bo =
1897 &cmd_buffer->device->dynamic_state_block_pool.bo;
1898 }
1899
1900 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1901
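   /* GPGPU_WALKER's SIMDSize field is an enum (0 = SIMD8, 1 = SIMD16,
    * 2 = SIMD32), which is why simd_size is divided by 16 below.
    */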
1902 anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
1903 ggw.SIMDSize = prog_data->simd_size / 16;
1904 ggw.ThreadDepthCounterMaximum = 0;
1905 ggw.ThreadHeightCounterMaximum = 0;
1906 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
1907 ggw.ThreadGroupIDXDimension = x;
1908 ggw.ThreadGroupIDYDimension = y;
1909 ggw.ThreadGroupIDZDimension = z;
1910 ggw.RightExecutionMask = pipeline->cs_right_mask;
1911 ggw.BottomExecutionMask = 0xffffffff;
1912 }
1913
1914 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
1915 }
1916
1917 #define GPGPU_DISPATCHDIMX 0x2500
1918 #define GPGPU_DISPATCHDIMY 0x2504
1919 #define GPGPU_DISPATCHDIMZ 0x2508
1920
1921 #define MI_PREDICATE_SRC0 0x2400
1922 #define MI_PREDICATE_SRC1 0x2408
1923
1924 void genX(CmdDispatchIndirect)(
1925 VkCommandBuffer commandBuffer,
1926 VkBuffer _buffer,
1927 VkDeviceSize offset)
1928 {
1929 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1930 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1931 struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
1932 const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
1933 struct anv_bo *bo = buffer->bo;
1934 uint32_t bo_offset = buffer->offset + offset;
1935 struct anv_batch *batch = &cmd_buffer->batch;
1936
1937 #if GEN_GEN == 7
1938 /* Linux 4.4 added command parser version 5 which allows the GPGPU
1939 * indirect dispatch registers to be written.
1940 */
1941 if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
1942 return;
1943 #endif
1944
1945 if (prog_data->uses_num_work_groups) {
1946 cmd_buffer->state.num_workgroups_offset = bo_offset;
1947 cmd_buffer->state.num_workgroups_bo = bo;
1948 }
1949
1950 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
1951
1952 emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
1953 emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
1954 emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
1955
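   /* On gen7 the GPGPU_WALKER below is predicated so that the dispatch is
    * skipped entirely when any of the three indirect dimensions is zero.
    * The MI_PREDICATE sequence computes
    *
    *    predicate = !(x == 0 || y == 0 || z == 0)
    *
    * by OR-ing together three SRCS_EQUAL comparisons against a zeroed SRC1
    * and then inverting the accumulated result.
    */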
1956 #if GEN_GEN <= 7
1957 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
1958 emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
1959 emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
1960 emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
1961
1962 /* Load compute_dispatch_indirect_x_size into SRC0 */
1963 emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
1964
1965 /* predicate = (compute_dispatch_indirect_x_size == 0); */
1966 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1967 mip.LoadOperation = LOAD_LOAD;
1968 mip.CombineOperation = COMBINE_SET;
1969 mip.CompareOperation = COMPARE_SRCS_EQUAL;
1970 }
1971
1972 /* Load compute_dispatch_indirect_y_size into SRC0 */
1973 emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
1974
1975 /* predicate |= (compute_dispatch_indirect_y_size == 0); */
1976 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1977 mip.LoadOperation = LOAD_LOAD;
1978 mip.CombineOperation = COMBINE_OR;
1979 mip.CompareOperation = COMPARE_SRCS_EQUAL;
1980 }
1981
1982 /* Load compute_dispatch_indirect_z_size into SRC0 */
1983 emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
1984
1985 /* predicate |= (compute_dispatch_indirect_z_size == 0); */
1986 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1987 mip.LoadOperation = LOAD_LOAD;
1988 mip.CombineOperation = COMBINE_OR;
1989 mip.CompareOperation = COMPARE_SRCS_EQUAL;
1990 }
1991
1992 /* predicate = !predicate; */
1993 #define COMPARE_FALSE 1
1994 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
1995 mip.LoadOperation = LOAD_LOADINV;
1996 mip.CombineOperation = COMBINE_OR;
1997 mip.CompareOperation = COMPARE_FALSE;
1998 }
1999 #endif
2000
2001 anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
2002 ggw.IndirectParameterEnable = true;
2003 ggw.PredicateEnable = GEN_GEN <= 7;
2004 ggw.SIMDSize = prog_data->simd_size / 16;
2005 ggw.ThreadDepthCounterMaximum = 0;
2006 ggw.ThreadHeightCounterMaximum = 0;
2007 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
2008 ggw.RightExecutionMask = pipeline->cs_right_mask;
2009 ggw.BottomExecutionMask = 0xffffffff;
2010 }
2011
2012 anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
2013 }
2014
2015 static void
2016 flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
2017 uint32_t pipeline)
2018 {
2019 #if GEN_GEN >= 8 && GEN_GEN < 10
2020 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
2021 *
2022 * Software must clear the COLOR_CALC_STATE Valid field in
2023 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
2024 * with Pipeline Select set to GPGPU.
2025 *
2026 * The internal hardware docs recommend the same workaround for Gen9
2027 * hardware too.
2028 */
2029 if (pipeline == GPGPU)
2030 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
2031 #elif GEN_GEN <= 7
2032 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
2033 * PIPELINE_SELECT [DevBWR+]":
2034 *
2035 * Project: DEVSNB+
2036 *
2037 * Software must ensure all the write caches are flushed through a
2038 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
2039 * command to invalidate read only caches prior to programming
2040 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
2041 */
2042 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2043 pc.RenderTargetCacheFlushEnable = true;
2044 pc.DepthCacheFlushEnable = true;
2045 pc.DCFlushEnable = true;
2046 pc.PostSyncOperation = NoWrite;
2047 pc.CommandStreamerStallEnable = true;
2048 }
2049
2050 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2051 pc.TextureCacheInvalidationEnable = true;
2052 pc.ConstantCacheInvalidationEnable = true;
2053 pc.StateCacheInvalidationEnable = true;
2054 pc.InstructionCacheInvalidateEnable = true;
2055 pc.PostSyncOperation = NoWrite;
2056 }
2057 #endif
2058 }
2059
2060 void
2061 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
2062 {
2063 if (cmd_buffer->state.current_pipeline != _3D) {
2064 flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
2065
2066 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
2067 #if GEN_GEN >= 9
2068 ps.MaskBits = 3;
2069 #endif
2070 ps.PipelineSelection = _3D;
2071 }
2072
2073 cmd_buffer->state.current_pipeline = _3D;
2074 }
2075 }
2076
2077 void
2078 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
2079 {
2080 if (cmd_buffer->state.current_pipeline != GPGPU) {
2081 flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
2082
2083 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
2084 #if GEN_GEN >= 9
2085 ps.MaskBits = 3;
2086 #endif
2087 ps.PipelineSelection = GPGPU;
2088 }
2089
2090 cmd_buffer->state.current_pipeline = GPGPU;
2091 }
2092 }
2093
2094 void
2095 genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
2096 {
2097 if (GEN_GEN >= 8)
2098 return;
2099
2100 /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
2101 *
2102 * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
2103 * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
2104 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
2105 * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
2106 * set), followed by a pipelined depth cache flush (PIPE_CONTROL with
2107     *    Depth Flush Bit set), followed by another pipelined depth stall
2108 * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
2109 * guarantee that the pipeline from WM onwards is already flushed (e.g.,
2110 * via a preceding MI_FLUSH)."
2111 */
2112 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2113 pipe.DepthStallEnable = true;
2114 }
2115 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2116 pipe.DepthCacheFlushEnable = true;
2117 }
2118 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
2119 pipe.DepthStallEnable = true;
2120 }
2121 }
2122
2123 static uint32_t
2124 depth_stencil_surface_type(enum isl_surf_dim dim)
2125 {
2126 switch (dim) {
2127 case ISL_SURF_DIM_1D:
2128 if (GEN_GEN >= 9) {
2129       /* From the Sky Lake PRM, 3DSTATE_DEPTH_BUFFER::SurfaceType
2130 *
2131 * Programming Notes:
2132 * The Surface Type of the depth buffer must be the same as the
2133 * Surface Type of the render target(s) (defined in
2134 * SURFACE_STATE), unless either the depth buffer or render
2135 * targets are SURFTYPE_NULL (see exception below for SKL). 1D
2136 * surface type not allowed for depth surface and stencil surface.
2137 *
2138 * Workaround:
2139 * If depth/stencil is enabled with 1D render target,
2140 * depth/stencil surface type needs to be set to 2D surface type
2141 * and height set to 1. Depth will use (legacy) TileY and stencil
2142 * will use TileW. For this case only, the Surface Type of the
2143 * depth buffer can be 2D while the Surface Type of the render
2144 * target(s) are 1D, representing an exception to a programming
2145 * note above.
2146 */
2147 return SURFTYPE_2D;
2148 } else {
2149 return SURFTYPE_1D;
2150 }
2151 case ISL_SURF_DIM_2D:
2152 return SURFTYPE_2D;
2153 case ISL_SURF_DIM_3D:
2154 if (GEN_GEN >= 9) {
2155 /* The Sky Lake docs list the value for 3D as "Reserved". However,
2156 * they have the exact same layout as 2D arrays on gen9+, so we can
2157 * just use 2D here.
2158 */
2159 return SURFTYPE_2D;
2160 } else {
2161 return SURFTYPE_3D;
2162 }
2163 default:
2164 unreachable("Invalid surface dimension");
2165 }
2166 }
2167
2168 static void
2169 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
2170 {
2171 struct anv_device *device = cmd_buffer->device;
2172 const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
2173 const struct anv_image_view *iview =
2174 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
2175 const struct anv_image *image = iview ? iview->image : NULL;
2176 const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
2177 const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment;
2178 const bool has_hiz = image != NULL &&
2179 cmd_buffer->state.attachments[ds].aux_usage == ISL_AUX_USAGE_HIZ;
2180 const bool has_stencil =
2181 image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
2182
2183 /* FIXME: Implement the PMA stall W/A */
2184 /* FIXME: Width and Height are wrong */
2185
2186 genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
2187
2188 /* Emit 3DSTATE_DEPTH_BUFFER */
2189 if (has_depth) {
2190 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
2191 db.SurfaceType =
2192 depth_stencil_surface_type(image->depth_surface.isl.dim);
2193 db.DepthWriteEnable = true;
2194 db.StencilWriteEnable = has_stencil;
2195 db.HierarchicalDepthBufferEnable = has_hiz;
2196
2197 db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
2198 &image->depth_surface.isl);
2199
2200 db.SurfaceBaseAddress = (struct anv_address) {
2201 .bo = image->bo,
2202 .offset = image->offset + image->depth_surface.offset,
2203 };
2204 db.DepthBufferObjectControlState = GENX(MOCS);
2205
2206 db.SurfacePitch = image->depth_surface.isl.row_pitch - 1;
2207 db.Height = image->extent.height - 1;
2208 db.Width = image->extent.width - 1;
2209 db.LOD = iview->isl.base_level;
2210 db.MinimumArrayElement = iview->isl.base_array_layer;
2211
2212 assert(image->depth_surface.isl.dim != ISL_SURF_DIM_3D);
2213 db.Depth =
2214 db.RenderTargetViewExtent =
2215 iview->isl.array_len - iview->isl.base_array_layer - 1;
2216
2217 #if GEN_GEN >= 8
2218 db.SurfaceQPitch =
2219 isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2;
2220 #endif
2221 }
2222 } else {
2223 /* Even when no depth buffer is present, the hardware requires that
2224 * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
2225 *
2226 * If a null depth buffer is bound, the driver must instead bind depth as:
2227 * 3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
2228 * 3DSTATE_DEPTH.Width = 1
2229 * 3DSTATE_DEPTH.Height = 1
2230       *    3DSTATE_DEPTH.SurfaceFormat = D16_UNORM
2231 * 3DSTATE_DEPTH.SurfaceBaseAddress = 0
2232 * 3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
2233 * 3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
2234 * 3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
2235 *
2236       * The PRM is wrong, though. The width and height must be programmed to the
2237       * actual framebuffer's width and height, even when neither depth buffer
2238 * nor stencil buffer is present. Also, D16_UNORM is not allowed to
2239 * be combined with a stencil buffer so we use D32_FLOAT instead.
2240 */
2241 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
2242 if (has_stencil) {
2243 db.SurfaceType =
2244 depth_stencil_surface_type(image->stencil_surface.isl.dim);
2245 } else {
2246 db.SurfaceType = SURFTYPE_2D;
2247 }
2248 db.SurfaceFormat = D32_FLOAT;
2249 db.Width = MAX2(fb->width, 1) - 1;
2250 db.Height = MAX2(fb->height, 1) - 1;
2251 db.StencilWriteEnable = has_stencil;
2252 }
2253 }
2254
2255 if (has_hiz) {
2256 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) {
2257 hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS);
2258 hdb.SurfacePitch = image->aux_surface.isl.row_pitch - 1;
2259 hdb.SurfaceBaseAddress = (struct anv_address) {
2260 .bo = image->bo,
2261 .offset = image->offset + image->aux_surface.offset,
2262 };
2263 #if GEN_GEN >= 8
2264 /* From the SKL PRM Vol2a:
2265 *
2266 * The interpretation of this field is dependent on Surface Type
2267 * as follows:
2268 * - SURFTYPE_1D: distance in pixels between array slices
2269 * - SURFTYPE_2D/CUBE: distance in rows between array slices
2270 * - SURFTYPE_3D: distance in rows between R - slices
2271 *
2272 * Unfortunately, the docs aren't 100% accurate here. They fail to
2273 * mention that the 1-D rule only applies to linear 1-D images.
2274 * Since depth and HiZ buffers are always tiled, they are treated as
2275 * 2-D images. Prior to Sky Lake, this field is always in rows.
2276 */
2277 hdb.SurfaceQPitch =
2278 isl_surf_get_array_pitch_sa_rows(&image->aux_surface.isl) >> 2;
2279 #endif
2280 }
2281 } else {
2282 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb);
2283 }
2284
2285 /* Emit 3DSTATE_STENCIL_BUFFER */
2286 if (has_stencil) {
2287 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
2288 #if GEN_GEN >= 8 || GEN_IS_HASWELL
2289 sb.StencilBufferEnable = true;
2290 #endif
2291 sb.StencilBufferObjectControlState = GENX(MOCS);
2292
2293 sb.SurfacePitch = image->stencil_surface.isl.row_pitch - 1;
2294
2295 #if GEN_GEN >= 8
2296 sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2;
2297 #endif
2298 sb.SurfaceBaseAddress = (struct anv_address) {
2299 .bo = image->bo,
2300 .offset = image->offset + image->stencil_surface.offset,
2301 };
2302 }
2303 } else {
2304 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
2305 }
2306
2307 /* From the IVB PRM Vol2P1, 11.5.5.4 3DSTATE_CLEAR_PARAMS:
2308 *
2309    *    3DSTATE_CLEAR_PARAMS must always be programmed along with the other
2310    *    Depth/Stencil state commands (i.e. 3DSTATE_DEPTH_BUFFER,
2311    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER).
2312 *
2313    * Testing also shows that some variant of this restriction may exist on HSW+.
2314 * On BDW+, it is not possible to emit 2 of these packets consecutively when
2315 * both have DepthClearValueValid set. An analysis of such state programming
2316 * on SKL showed that the GPU doesn't register the latter packet's clear
2317 * value.
2318 */
2319 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp) {
2320 if (has_hiz) {
2321 cp.DepthClearValueValid = true;
2322 cp.DepthClearValue = ANV_HZ_FC_VAL;
2323 }
2324 }
2325 }
2326
2327 static void
2328 genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
2329 struct anv_subpass *subpass)
2330 {
2331 cmd_buffer->state.subpass = subpass;
2332
2333 cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
2334
2335 const struct anv_image_view *iview =
2336 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
2337
2338 if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
2339 const uint32_t ds = subpass->depth_stencil_attachment;
2340 transition_depth_buffer(cmd_buffer, iview->image,
2341 cmd_buffer->state.attachments[ds].current_layout,
2342 cmd_buffer->state.subpass->depth_stencil_layout);
2343 cmd_buffer->state.attachments[ds].current_layout =
2344 cmd_buffer->state.subpass->depth_stencil_layout;
2345 cmd_buffer->state.attachments[ds].aux_usage =
2346 layout_to_hiz_usage(cmd_buffer->state.subpass->depth_stencil_layout,
2347 iview->image->samples);
2348 }
2349
2350 cmd_buffer_emit_depth_stencil(cmd_buffer);
2351
2352 anv_cmd_buffer_clear_subpass(cmd_buffer);
2353 }
2354
2355 void genX(CmdBeginRenderPass)(
2356 VkCommandBuffer commandBuffer,
2357 const VkRenderPassBeginInfo* pRenderPassBegin,
2358 VkSubpassContents contents)
2359 {
2360 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2361 ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
2362 ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
2363
2364 cmd_buffer->state.framebuffer = framebuffer;
2365 cmd_buffer->state.pass = pass;
2366 cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
2367 genX(cmd_buffer_setup_attachments)(cmd_buffer, pass, pRenderPassBegin);
2368
2369 genX(flush_pipeline_select_3d)(cmd_buffer);
2370
2371 genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
2372 }
2373
2374 void genX(CmdNextSubpass)(
2375 VkCommandBuffer commandBuffer,
2376 VkSubpassContents contents)
2377 {
2378 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2379
2380 assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
2381
2382 const struct anv_image_view *iview =
2383 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
2384
2385 if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
2386 const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment;
2387
2388 if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses ==
2389 cmd_buffer->state.pass->attachments[ds].last_subpass_idx) {
2390 transition_depth_buffer(cmd_buffer, iview->image,
2391 cmd_buffer->state.attachments[ds].current_layout,
2392 cmd_buffer->state.pass->attachments[ds].final_layout);
2393 }
2394 }
2395
2396 anv_cmd_buffer_resolve_subpass(cmd_buffer);
2397 genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
2398 }
2399
2400 void genX(CmdEndRenderPass)(
2401 VkCommandBuffer commandBuffer)
2402 {
2403 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2404
2405 const struct anv_image_view *iview =
2406 anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
2407
2408 if (iview && iview->image->aux_usage == ISL_AUX_USAGE_HIZ) {
2409 const uint32_t ds = cmd_buffer->state.subpass->depth_stencil_attachment;
2410
2411 if (cmd_buffer->state.subpass - cmd_buffer->state.pass->subpasses ==
2412 cmd_buffer->state.pass->attachments[ds].last_subpass_idx) {
2413 transition_depth_buffer(cmd_buffer, iview->image,
2414 cmd_buffer->state.attachments[ds].current_layout,
2415 cmd_buffer->state.pass->attachments[ds].final_layout);
2416 }
2417 }
2418
2419 anv_cmd_buffer_resolve_subpass(cmd_buffer);
2420
2421 #ifndef NDEBUG
2422 anv_dump_add_framebuffer(cmd_buffer, cmd_buffer->state.framebuffer);
2423 #endif
2424 }
2425
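/* Occlusion and timestamp queries share a common slot layout in the query
 * pool BO: occlusion queries store the "begin" PS depth count at slot
 * offset 0 and the "end" count at offset 8, timestamps store the 64-bit
 * value at offset 0, and both keep a one-dword availability flag at
 * offset 16.  The helpers below write the depth count and the availability
 * flag respectively.
 */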
2426 static void
2427 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
2428 struct anv_bo *bo, uint32_t offset)
2429 {
2430 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2431 pc.DestinationAddressType = DAT_PPGTT;
2432 pc.PostSyncOperation = WritePSDepthCount;
2433 pc.DepthStallEnable = true;
2434 pc.Address = (struct anv_address) { bo, offset };
2435
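      /* Skylake GT4 appears to need the command streamer stalled when a
       * post-sync operation is used here; the same workaround appears in
       * CmdWriteTimestamp below.
       */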
2436 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
2437 pc.CommandStreamerStallEnable = true;
2438 }
2439 }
2440
2441 static void
2442 emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
2443 struct anv_bo *bo, uint32_t offset)
2444 {
2445 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2446 pc.DestinationAddressType = DAT_PPGTT;
2447 pc.PostSyncOperation = WriteImmediateData;
2448 pc.Address = (struct anv_address) { bo, offset };
2449 pc.ImmediateData = 1;
2450 }
2451 }
2452
2453 void genX(CmdBeginQuery)(
2454 VkCommandBuffer commandBuffer,
2455 VkQueryPool queryPool,
2456 uint32_t query,
2457 VkQueryControlFlags flags)
2458 {
2459 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2460 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2461
2462 /* Workaround: When meta uses the pipeline with the VS disabled, it seems
2463 * that the pipelining of the depth write breaks. What we see is that
2464    * samples from the render pass clear leak into the first query
2465    * immediately after the clear. Doing a PIPE_CONTROL with a post-sync
2466 * operation and DepthStallEnable seems to work around the issue.
2467 */
2468 if (cmd_buffer->state.need_query_wa) {
2469 cmd_buffer->state.need_query_wa = false;
2470 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2471 pc.DepthCacheFlushEnable = true;
2472 pc.DepthStallEnable = true;
2473 }
2474 }
2475
2476 switch (pool->type) {
2477 case VK_QUERY_TYPE_OCCLUSION:
2478 emit_ps_depth_count(cmd_buffer, &pool->bo,
2479 query * sizeof(struct anv_query_pool_slot));
2480 break;
2481
2482 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2483 default:
2484       unreachable("unsupported query type");
2485 }
2486 }
2487
2488 void genX(CmdEndQuery)(
2489 VkCommandBuffer commandBuffer,
2490 VkQueryPool queryPool,
2491 uint32_t query)
2492 {
2493 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2494 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2495
2496 switch (pool->type) {
2497 case VK_QUERY_TYPE_OCCLUSION:
2498 emit_ps_depth_count(cmd_buffer, &pool->bo,
2499 query * sizeof(struct anv_query_pool_slot) + 8);
2500
2501 emit_query_availability(cmd_buffer, &pool->bo,
2502 query * sizeof(struct anv_query_pool_slot) + 16);
2503 break;
2504
2505 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
2506 default:
2507       unreachable("unsupported query type");
2508 }
2509 }
2510
2511 #define TIMESTAMP 0x2358
2512
2513 void genX(CmdWriteTimestamp)(
2514 VkCommandBuffer commandBuffer,
2515 VkPipelineStageFlagBits pipelineStage,
2516 VkQueryPool queryPool,
2517 uint32_t query)
2518 {
2519 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2520 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2521 uint32_t offset = query * sizeof(struct anv_query_pool_slot);
2522
2523 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
2524
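   /* TOP_OF_PIPE timestamps are captured by storing the 64-bit TIMESTAMP
    * register directly from the command streamer (two 32-bit
    * MI_STORE_REGISTER_MEMs).  Every other stage is treated as
    * bottom-of-pipe and uses a PIPE_CONTROL with a WriteTimestamp
    * post-sync operation.
    */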
2525 switch (pipelineStage) {
2526 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
2527 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2528 srm.RegisterAddress = TIMESTAMP;
2529 srm.MemoryAddress = (struct anv_address) { &pool->bo, offset };
2530 }
2531 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2532 srm.RegisterAddress = TIMESTAMP + 4;
2533 srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 4 };
2534 }
2535 break;
2536
2537 default:
2538 /* Everything else is bottom-of-pipe */
2539 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2540 pc.DestinationAddressType = DAT_PPGTT;
2541 pc.PostSyncOperation = WriteTimestamp;
2542 pc.Address = (struct anv_address) { &pool->bo, offset };
2543
2544 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
2545 pc.CommandStreamerStallEnable = true;
2546 }
2547 break;
2548 }
2549
2550    emit_query_availability(cmd_buffer, &pool->bo, offset + 16);
2551 }
2552
2553 #if GEN_GEN > 7 || GEN_IS_HASWELL
2554
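/* MI_MATH ALU instructions are a single dword: the opcode lives in bits
 * 31:20, operand1 in bits 19:10 and operand2 in bits 9:0.  The alu() macro
 * below assembles those dwords from the opcode and operand encodings that
 * follow.
 */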
2555 #define alu_opcode(v) __gen_uint((v), 20, 31)
2556 #define alu_operand1(v) __gen_uint((v), 10, 19)
2557 #define alu_operand2(v) __gen_uint((v), 0, 9)
2558 #define alu(opcode, operand1, operand2) \
2559 alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
2560
2561 #define OPCODE_NOOP 0x000
2562 #define OPCODE_LOAD 0x080
2563 #define OPCODE_LOADINV 0x480
2564 #define OPCODE_LOAD0 0x081
2565 #define OPCODE_LOAD1 0x481
2566 #define OPCODE_ADD 0x100
2567 #define OPCODE_SUB 0x101
2568 #define OPCODE_AND 0x102
2569 #define OPCODE_OR 0x103
2570 #define OPCODE_XOR 0x104
2571 #define OPCODE_STORE 0x180
2572 #define OPCODE_STOREINV 0x580
2573
2574 #define OPERAND_R0 0x00
2575 #define OPERAND_R1 0x01
2576 #define OPERAND_R2 0x02
2577 #define OPERAND_R3 0x03
2578 #define OPERAND_R4 0x04
2579 #define OPERAND_SRCA 0x20
2580 #define OPERAND_SRCB 0x21
2581 #define OPERAND_ACCU 0x31
2582 #define OPERAND_ZF 0x32
2583 #define OPERAND_CF 0x33
2584
2585 #define CS_GPR(n) (0x2600 + (n) * 8)
2586
2587 static void
2588 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
2589 struct anv_bo *bo, uint32_t offset)
2590 {
2591 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2592 lrm.RegisterAddress = reg,
2593 lrm.MemoryAddress = (struct anv_address) { bo, offset };
2594 }
2595 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2596 lrm.RegisterAddress = reg + 4;
2597 lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
2598 }
2599 }
2600
2601 static void
2602 store_query_result(struct anv_batch *batch, uint32_t reg,
2603 struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
2604 {
2605 anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2606 srm.RegisterAddress = reg;
2607 srm.MemoryAddress = (struct anv_address) { bo, offset };
2608 }
2609
2610 if (flags & VK_QUERY_RESULT_64_BIT) {
2611 anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2612 srm.RegisterAddress = reg + 4;
2613 srm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
2614 }
2615 }
2616 }
2617
2618 void genX(CmdCopyQueryPoolResults)(
2619 VkCommandBuffer commandBuffer,
2620 VkQueryPool queryPool,
2621 uint32_t firstQuery,
2622 uint32_t queryCount,
2623 VkBuffer destBuffer,
2624 VkDeviceSize destOffset,
2625 VkDeviceSize destStride,
2626 VkQueryResultFlags flags)
2627 {
2628 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2629 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
2630 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
2631 uint32_t slot_offset, dst_offset;
2632
2633 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
2634 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
2635 pc.CommandStreamerStallEnable = true;
2636 pc.StallAtPixelScoreboard = true;
2637 }
2638 }
2639
2640 dst_offset = buffer->offset + destOffset;
2641 for (uint32_t i = 0; i < queryCount; i++) {
2642
2643 slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
2644 switch (pool->type) {
2645 case VK_QUERY_TYPE_OCCLUSION:
2646 emit_load_alu_reg_u64(&cmd_buffer->batch,
2647 CS_GPR(0), &pool->bo, slot_offset);
2648 emit_load_alu_reg_u64(&cmd_buffer->batch,
2649 CS_GPR(1), &pool->bo, slot_offset + 8);
2650
2651 /* FIXME: We need to clamp the result for 32 bit. */
2652
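         /* occlusion result = end - begin: load the end count (R1) and the
          * begin count (R0) into the ALU sources, subtract, and store the
          * accumulator into R2, which store_query_result() then writes out.
          */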
2653 uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
2654 dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
2655 dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
2656 dw[3] = alu(OPCODE_SUB, 0, 0);
2657 dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
2658 break;
2659
2660 case VK_QUERY_TYPE_TIMESTAMP:
2661 emit_load_alu_reg_u64(&cmd_buffer->batch,
2662 CS_GPR(2), &pool->bo, slot_offset);
2663 break;
2664
2665 default:
2666 unreachable("unhandled query type");
2667 }
2668
2669 store_query_result(&cmd_buffer->batch,
2670 CS_GPR(2), buffer->bo, dst_offset, flags);
2671
2672 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
2673 emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
2674 &pool->bo, slot_offset + 16);
2675 if (flags & VK_QUERY_RESULT_64_BIT)
2676 store_query_result(&cmd_buffer->batch,
2677 CS_GPR(0), buffer->bo, dst_offset + 8, flags);
2678 else
2679 store_query_result(&cmd_buffer->batch,
2680 CS_GPR(0), buffer->bo, dst_offset + 4, flags);
2681 }
2682
2683 dst_offset += destStride;
2684 }
2685 }
2686
2687 #else
2688 void genX(CmdCopyQueryPoolResults)(
2689 VkCommandBuffer commandBuffer,
2690 VkQueryPool queryPool,
2691 uint32_t firstQuery,
2692 uint32_t queryCount,
2693 VkBuffer destBuffer,
2694 VkDeviceSize destOffset,
2695 VkDeviceSize destStride,
2696 VkQueryResultFlags flags)
2697 {
2698 anv_finishme("Queries not yet supported on Ivy Bridge");
2699 }
2700 #endif