anv/cmd: Dirty descriptor sets when a new pipeline is bound
[mesa.git] / src / intel / vulkan / genX_cmd_buffer.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28
29 #include "genxml/gen_macros.h"
30 #include "genxml/genX_pack.h"
31
32 void
33 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
34 {
35 struct anv_device *device = cmd_buffer->device;
36 struct anv_bo *scratch_bo = NULL;
37
38 cmd_buffer->state.scratch_size =
39 anv_block_pool_size(&device->scratch_block_pool);
40 if (cmd_buffer->state.scratch_size > 0)
41 scratch_bo = &device->scratch_block_pool.bo;
42
43 /* XXX: Do we need this on more than just BDW? */
44 #if (GEN_GEN >= 8)
45 /* Emit a render target cache flush.
46 *
47 * This isn't documented anywhere in the PRM. However, it seems to be
48 * necessary prior to changing the surface state base adress. Without
49 * this, we get GPU hangs when using multi-level command buffers which
50 * clear depth, reset state base address, and then go render stuff.
51 */
52 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
53 pc.RenderTargetCacheFlushEnable = true;
54 }
55 #endif
56
57 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
58 sba.GeneralStateBaseAddress = (struct anv_address) { scratch_bo, 0 };
59 sba.GeneralStateMemoryObjectControlState = GENX(MOCS);
60 sba.GeneralStateBaseAddressModifyEnable = true;
61
62 sba.SurfaceStateBaseAddress =
63 anv_cmd_buffer_surface_base_address(cmd_buffer);
64 sba.SurfaceStateMemoryObjectControlState = GENX(MOCS);
65 sba.SurfaceStateBaseAddressModifyEnable = true;
66
67 sba.DynamicStateBaseAddress =
68 (struct anv_address) { &device->dynamic_state_block_pool.bo, 0 };
69 sba.DynamicStateMemoryObjectControlState = GENX(MOCS),
70 sba.DynamicStateBaseAddressModifyEnable = true,
71
72 sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
73 sba.IndirectObjectMemoryObjectControlState = GENX(MOCS);
74 sba.IndirectObjectBaseAddressModifyEnable = true;
75
76 sba.InstructionBaseAddress =
77 (struct anv_address) { &device->instruction_block_pool.bo, 0 };
78 sba.InstructionMemoryObjectControlState = GENX(MOCS);
79 sba.InstructionBaseAddressModifyEnable = true;
80
81 # if (GEN_GEN >= 8)
82 /* Broadwell requires that we specify a buffer size for a bunch of
83 * these fields. However, since we will be growing the BO's live, we
84 * just set them all to the maximum.
85 */
86 sba.GeneralStateBufferSize = 0xfffff;
87 sba.GeneralStateBufferSizeModifyEnable = true;
88 sba.DynamicStateBufferSize = 0xfffff;
89 sba.DynamicStateBufferSizeModifyEnable = true;
90 sba.IndirectObjectBufferSize = 0xfffff;
91 sba.IndirectObjectBufferSizeModifyEnable = true;
92 sba.InstructionBufferSize = 0xfffff;
93 sba.InstructionBuffersizeModifyEnable = true;
94 # endif
95 }
96
97 /* After re-setting the surface state base address, we have to do some
98 * cache flusing so that the sampler engine will pick up the new
99 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
100 * Shared Function > 3D Sampler > State > State Caching (page 96):
101 *
102 * Coherency with system memory in the state cache, like the texture
103 * cache is handled partially by software. It is expected that the
104 * command stream or shader will issue Cache Flush operation or
105 * Cache_Flush sampler message to ensure that the L1 cache remains
106 * coherent with system memory.
107 *
108 * [...]
109 *
110 * Whenever the value of the Dynamic_State_Base_Addr,
111 * Surface_State_Base_Addr are altered, the L1 state cache must be
112 * invalidated to ensure the new surface or sampler state is fetched
113 * from system memory.
114 *
115 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
116 * which, according the PIPE_CONTROL instruction documentation in the
117 * Broadwell PRM:
118 *
119 * Setting this bit is independent of any other bit in this packet.
120 * This bit controls the invalidation of the L1 and L2 state caches
121 * at the top of the pipe i.e. at the parsing time.
122 *
123 * Unfortunately, experimentation seems to indicate that state cache
124 * invalidation through a PIPE_CONTROL does nothing whatsoever in
125 * regards to surface state and binding tables. In stead, it seems that
126 * invalidating the texture cache is what is actually needed.
127 *
128 * XXX: As far as we have been able to determine through
129 * experimentation, shows that flush the texture cache appears to be
130 * sufficient. The theory here is that all of the sampling/rendering
131 * units cache the binding table in the texture cache. However, we have
132 * yet to be able to actually confirm this.
133 */
134 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
135 pc.TextureCacheInvalidationEnable = true;
136 }
137 }
138
139 void
140 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
141 {
142 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
143
144 /* Flushes are pipelined while invalidations are handled immediately.
145 * Therefore, if we're flushing anything then we need to schedule a stall
146 * before any invalidations can happen.
147 */
148 if (bits & ANV_PIPE_FLUSH_BITS)
149 bits |= ANV_PIPE_NEEDS_CS_STALL_BIT;
150
151 /* If we're going to do an invalidate and we have a pending CS stall that
152 * has yet to be resolved, we do the CS stall now.
153 */
154 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
155 (bits & ANV_PIPE_NEEDS_CS_STALL_BIT)) {
156 bits |= ANV_PIPE_CS_STALL_BIT;
157 bits &= ~ANV_PIPE_NEEDS_CS_STALL_BIT;
158 }
159
160 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT)) {
161 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
162 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
163 pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
164 pipe.RenderTargetCacheFlushEnable =
165 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
166
167 pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
168 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
169 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
170
171 /*
172 * According to the Broadwell documentation, any PIPE_CONTROL with the
173 * "Command Streamer Stall" bit set must also have another bit set,
174 * with five different options:
175 *
176 * - Render Target Cache Flush
177 * - Depth Cache Flush
178 * - Stall at Pixel Scoreboard
179 * - Post-Sync Operation
180 * - Depth Stall
181 * - DC Flush Enable
182 *
183 * I chose "Stall at Pixel Scoreboard" since that's what we use in
184 * mesa and it seems to work fine. The choice is fairly arbitrary.
185 */
186 if ((bits & ANV_PIPE_CS_STALL_BIT) &&
187 !(bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_DEPTH_STALL_BIT |
188 ANV_PIPE_STALL_AT_SCOREBOARD_BIT)))
189 pipe.StallAtPixelScoreboard = true;
190 }
191
192 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT);
193 }
194
195 if (bits & ANV_PIPE_INVALIDATE_BITS) {
196 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
197 pipe.StateCacheInvalidationEnable =
198 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
199 pipe.ConstantCacheInvalidationEnable =
200 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
201 pipe.VFCacheInvalidationEnable =
202 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
203 pipe.TextureCacheInvalidationEnable =
204 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
205 pipe.InstructionCacheInvalidateEnable =
206 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
207 }
208
209 bits &= ~ANV_PIPE_INVALIDATE_BITS;
210 }
211
212 cmd_buffer->state.pending_pipe_bits = bits;
213 }
214
215 void genX(CmdPipelineBarrier)(
216 VkCommandBuffer commandBuffer,
217 VkPipelineStageFlags srcStageMask,
218 VkPipelineStageFlags destStageMask,
219 VkBool32 byRegion,
220 uint32_t memoryBarrierCount,
221 const VkMemoryBarrier* pMemoryBarriers,
222 uint32_t bufferMemoryBarrierCount,
223 const VkBufferMemoryBarrier* pBufferMemoryBarriers,
224 uint32_t imageMemoryBarrierCount,
225 const VkImageMemoryBarrier* pImageMemoryBarriers)
226 {
227 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
228 uint32_t b;
229
230 /* XXX: Right now, we're really dumb and just flush whatever categories
231 * the app asks for. One of these days we may make this a bit better
232 * but right now that's all the hardware allows for in most areas.
233 */
234 VkAccessFlags src_flags = 0;
235 VkAccessFlags dst_flags = 0;
236
237 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
238 src_flags |= pMemoryBarriers[i].srcAccessMask;
239 dst_flags |= pMemoryBarriers[i].dstAccessMask;
240 }
241
242 for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
243 src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
244 dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
245 }
246
247 for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
248 src_flags |= pImageMemoryBarriers[i].srcAccessMask;
249 dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
250 }
251
252 enum anv_pipe_bits pipe_bits = 0;
253
254 for_each_bit(b, src_flags) {
255 switch ((VkAccessFlagBits)(1 << b)) {
256 case VK_ACCESS_SHADER_WRITE_BIT:
257 pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
258 break;
259 case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
260 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
261 break;
262 case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
263 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
264 break;
265 case VK_ACCESS_TRANSFER_WRITE_BIT:
266 pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
267 pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
268 break;
269 default:
270 break; /* Nothing to do */
271 }
272 }
273
274 for_each_bit(b, dst_flags) {
275 switch ((VkAccessFlagBits)(1 << b)) {
276 case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
277 case VK_ACCESS_INDEX_READ_BIT:
278 case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
279 pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
280 break;
281 case VK_ACCESS_UNIFORM_READ_BIT:
282 pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
283 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
284 break;
285 case VK_ACCESS_SHADER_READ_BIT:
286 case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
287 case VK_ACCESS_TRANSFER_READ_BIT:
288 pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
289 break;
290 default:
291 break; /* Nothing to do */
292 }
293 }
294
295 cmd_buffer->state.pending_pipe_bits |= pipe_bits;
296 }
297
298 static void
299 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
300 {
301 VkShaderStageFlags stages = cmd_buffer->state.pipeline->active_stages;
302
303 /* In order to avoid thrash, we assume that vertex and fragment stages
304 * always exist. In the rare case where one is missing *and* the other
305 * uses push concstants, this may be suboptimal. However, avoiding stalls
306 * seems more important.
307 */
308 stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
309
310 if (stages == cmd_buffer->state.push_constant_stages)
311 return;
312
313 #if GEN_GEN >= 8
314 const unsigned push_constant_kb = 32;
315 #elif GEN_IS_HASWELL
316 const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
317 #else
318 const unsigned push_constant_kb = 16;
319 #endif
320
321 const unsigned num_stages =
322 _mesa_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
323 unsigned size_per_stage = push_constant_kb / num_stages;
324
325 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
326 * units of 2KB. Incidentally, these are the same platforms that have
327 * 32KB worth of push constant space.
328 */
329 if (push_constant_kb == 32)
330 size_per_stage &= ~1u;
331
332 uint32_t kb_used = 0;
333 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
334 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
335 anv_batch_emit(&cmd_buffer->batch,
336 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
337 alloc._3DCommandSubOpcode = 18 + i;
338 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
339 alloc.ConstantBufferSize = push_size;
340 }
341 kb_used += push_size;
342 }
343
344 anv_batch_emit(&cmd_buffer->batch,
345 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
346 alloc.ConstantBufferOffset = kb_used;
347 alloc.ConstantBufferSize = push_constant_kb - kb_used;
348 }
349
350 cmd_buffer->state.push_constant_stages = stages;
351
352 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
353 *
354 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
355 * the next 3DPRIMITIVE command after programming the
356 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
357 *
358 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
359 * pipeline setup, we need to dirty push constants.
360 */
361 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
362 }
363
364 static void
365 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
366 uint32_t stages)
367 {
368 static const uint32_t sampler_state_opcodes[] = {
369 [MESA_SHADER_VERTEX] = 43,
370 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
371 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
372 [MESA_SHADER_GEOMETRY] = 46,
373 [MESA_SHADER_FRAGMENT] = 47,
374 [MESA_SHADER_COMPUTE] = 0,
375 };
376
377 static const uint32_t binding_table_opcodes[] = {
378 [MESA_SHADER_VERTEX] = 38,
379 [MESA_SHADER_TESS_CTRL] = 39,
380 [MESA_SHADER_TESS_EVAL] = 40,
381 [MESA_SHADER_GEOMETRY] = 41,
382 [MESA_SHADER_FRAGMENT] = 42,
383 [MESA_SHADER_COMPUTE] = 0,
384 };
385
386 anv_foreach_stage(s, stages) {
387 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
388 anv_batch_emit(&cmd_buffer->batch,
389 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
390 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
391 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
392 }
393 }
394
395 /* Always emit binding table pointers if we're asked to, since on SKL
396 * this is what flushes push constants. */
397 anv_batch_emit(&cmd_buffer->batch,
398 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
399 btp._3DCommandSubOpcode = binding_table_opcodes[s];
400 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
401 }
402 }
403 }
404
405 static uint32_t
406 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer)
407 {
408 static const uint32_t push_constant_opcodes[] = {
409 [MESA_SHADER_VERTEX] = 21,
410 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
411 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
412 [MESA_SHADER_GEOMETRY] = 22,
413 [MESA_SHADER_FRAGMENT] = 23,
414 [MESA_SHADER_COMPUTE] = 0,
415 };
416
417 VkShaderStageFlags flushed = 0;
418
419 anv_foreach_stage(stage, cmd_buffer->state.push_constants_dirty) {
420 if (stage == MESA_SHADER_COMPUTE)
421 continue;
422
423 struct anv_state state = anv_cmd_buffer_push_constants(cmd_buffer, stage);
424
425 if (state.offset == 0) {
426 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c)
427 c._3DCommandSubOpcode = push_constant_opcodes[stage];
428 } else {
429 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
430 c._3DCommandSubOpcode = push_constant_opcodes[stage],
431 c.ConstantBody = (struct GENX(3DSTATE_CONSTANT_BODY)) {
432 #if GEN_GEN >= 9
433 .PointerToConstantBuffer2 = { &cmd_buffer->device->dynamic_state_block_pool.bo, state.offset },
434 .ConstantBuffer2ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
435 #else
436 .PointerToConstantBuffer0 = { .offset = state.offset },
437 .ConstantBuffer0ReadLength = DIV_ROUND_UP(state.alloc_size, 32),
438 #endif
439 };
440 }
441 }
442
443 flushed |= mesa_to_vk_shader_stage(stage);
444 }
445
446 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
447
448 return flushed;
449 }
450
/* Flush all dirty 3D state into the batch before a draw.
 *
 * The emission order below is deliberate: L3 config and pipeline select
 * first, then vertex buffers, then (on a pipeline change) the pipeline
 * batch and push constant allocation, then descriptor sets, push
 * constants, and descriptor pointers, then viewport/scissor/dynamic
 * state, and finally any pending pipe flushes.
 */
void
genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   uint32_t *p;

   /* Only re-emit vertex buffers that are both dirty and actually used by
    * the current pipeline.
    */
   uint32_t vb_emit = cmd_buffer->state.vb_dirty & pipeline->vb_used;

   assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   if (vb_emit) {
      const uint32_t num_buffers = __builtin_popcount(vb_emit);
      /* 1 DWord of 3DSTATE_VERTEX_BUFFERS header plus 4 DWords of
       * VERTEX_BUFFER_STATE per buffer.
       */
      const uint32_t num_dwords = 1 + num_buffers * 4;

      p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
                          GENX(3DSTATE_VERTEX_BUFFERS));
      uint32_t vb, i = 0;
      for_each_bit(vb, vb_emit) {
         struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
         uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;

         struct GENX(VERTEX_BUFFER_STATE) state = {
            .VertexBufferIndex = vb,

#if GEN_GEN >= 8
            .MemoryObjectControlState = GENX(MOCS),
#else
            /* Pre-gen8 encodes per-buffer instancing here. */
            .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
            .InstanceDataStepRate = 1,
            .VertexBufferMemoryObjectControlState = GENX(MOCS),
#endif

            .AddressModifyEnable = true,
            .BufferPitch = pipeline->binding_stride[vb],
            .BufferStartingAddress = { buffer->bo, buffer->offset + offset },

#if GEN_GEN >= 8
            .BufferSize = buffer->size - offset
#else
            /* Pre-gen8 takes an inclusive end address instead of a size. */
            .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
#endif
         };

         GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
         i++;
      }
   }

   cmd_buffer->state.vb_dirty &= ~vb_emit;

   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_PIPELINE) {
      /* If somebody compiled a pipeline after starting a command buffer the
       * scratch bo may have grown since we started this cmd buffer (and
       * emitted STATE_BASE_ADDRESS).  If we're binding that pipeline now,
       * reemit STATE_BASE_ADDRESS so that we use the bigger scratch bo. */
      if (cmd_buffer->state.scratch_size < pipeline->total_scratch)
         anv_cmd_buffer_emit_state_base_address(cmd_buffer);

      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);

      /* The exact descriptor layout is pulled from the pipeline, so we need
       * to re-emit binding tables on every pipeline change.
       */
      cmd_buffer->state.descriptors_dirty |=
         cmd_buffer->state.pipeline->active_stages;

      /* If the pipeline changed, we may need to re-allocate push constant
       * space in the URB.
       */
      cmd_buffer_alloc_push_constants(cmd_buffer);
   }

#if GEN_GEN <= 7
   if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
       cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
      /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
       *
       *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
       *    stall needs to be sent just prior to any 3DSTATE_VS,
       *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
       *    3DSTATE_BINDING_TABLE_POINTER_VS,
       *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
       *    PIPE_CONTROL needs to be sent before any combination of VS
       *    associated 3DSTATE."
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthStallEnable = true;
         pc.PostSyncOperation = WriteImmediateData;
         pc.Address =
            (struct anv_address) { &cmd_buffer->device->workaround_bo, 0 };
      }
   }
#endif

   /* We emit the binding tables and sampler tables first, then emit push
    * constants and then finally emit binding table and sampler table
    * pointers.  It has to happen in this order, since emitting the binding
    * tables may change the push constants (in case of storage images).  After
    * emitting push constants, on SKL+ we have to emit the corresponding
    * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
    */
   uint32_t dirty = 0;
   if (cmd_buffer->state.descriptors_dirty)
      dirty = anv_cmd_buffer_flush_descriptor_sets(cmd_buffer);

   if (cmd_buffer->state.push_constants_dirty) {
#if GEN_GEN >= 9
      /* On Sky Lake and later, the binding table pointers commands are
       * what actually flush the changes to push constant state so we need
       * to dirty them so they get re-emitted below.
       */
      dirty |= cmd_buffer_flush_push_constants(cmd_buffer);
#else
      cmd_buffer_flush_push_constants(cmd_buffer);
#endif
   }

   if (dirty)
      cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);

   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
      gen8_cmd_buffer_emit_viewport(cmd_buffer);

   /* The depth viewport depends on the pipeline's depth_clamp_enable, so a
    * pipeline change alone also triggers a re-emit.
    */
   if (cmd_buffer->state.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
                                  ANV_CMD_DIRTY_PIPELINE)) {
      gen8_cmd_buffer_emit_depth_viewport(cmd_buffer,
                                          pipeline->depth_clamp_enable);
   }

   if (cmd_buffer->state.dirty & ANV_CMD_DIRTY_DYNAMIC_SCISSOR)
      gen7_cmd_buffer_emit_scissor(cmd_buffer);

   genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
591
/* Bind 8 bytes at bo+offset as vertex buffer index 32, which this driver
 * reserves for the base vertex / base instance pair (see the callers for
 * what the two dwords contain).
 */
static void
emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_bo *bo, uint32_t offset)
{
   /* 1 DWord of 3DSTATE_VERTEX_BUFFERS header + 4 DWords of state. */
   uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
                                 GENX(3DSTATE_VERTEX_BUFFERS));

   GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
      &(struct GENX(VERTEX_BUFFER_STATE)) {
         .VertexBufferIndex = 32, /* Reserved for this */
         .AddressModifyEnable = true,
         .BufferPitch = 0,
#if (GEN_GEN >= 8)
         .MemoryObjectControlState = GENX(MOCS),
         .BufferStartingAddress = { bo, offset },
         .BufferSize = 8
#else
         /* Pre-gen8 takes an inclusive end address instead of a size. */
         .VertexBufferMemoryObjectControlState = GENX(MOCS),
         .BufferStartingAddress = { bo, offset },
         .EndAddress = { bo, offset + 8 },
#endif
      });
}
615
616 static void
617 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
618 uint32_t base_vertex, uint32_t base_instance)
619 {
620 struct anv_state id_state =
621 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
622
623 ((uint32_t *)id_state.map)[0] = base_vertex;
624 ((uint32_t *)id_state.map)[1] = base_instance;
625
626 if (!cmd_buffer->device->info.has_llc)
627 anv_state_clflush(id_state);
628
629 emit_base_vertex_instance_bo(cmd_buffer,
630 &cmd_buffer->device->dynamic_state_block_pool.bo, id_state.offset);
631 }
632
633 void genX(CmdDraw)(
634 VkCommandBuffer commandBuffer,
635 uint32_t vertexCount,
636 uint32_t instanceCount,
637 uint32_t firstVertex,
638 uint32_t firstInstance)
639 {
640 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
641 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
642 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
643
644 genX(cmd_buffer_flush_state)(cmd_buffer);
645
646 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
647 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
648
649 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
650 prim.VertexAccessType = SEQUENTIAL;
651 prim.PrimitiveTopologyType = pipeline->topology;
652 prim.VertexCountPerInstance = vertexCount;
653 prim.StartVertexLocation = firstVertex;
654 prim.InstanceCount = instanceCount;
655 prim.StartInstanceLocation = firstInstance;
656 prim.BaseVertexLocation = 0;
657 }
658 }
659
660 void genX(CmdDrawIndexed)(
661 VkCommandBuffer commandBuffer,
662 uint32_t indexCount,
663 uint32_t instanceCount,
664 uint32_t firstIndex,
665 int32_t vertexOffset,
666 uint32_t firstInstance)
667 {
668 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
669 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
670 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
671
672 genX(cmd_buffer_flush_state)(cmd_buffer);
673
674 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
675 emit_base_vertex_instance(cmd_buffer, vertexOffset, firstInstance);
676
677 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
678 prim.VertexAccessType = RANDOM;
679 prim.PrimitiveTopologyType = pipeline->topology;
680 prim.VertexCountPerInstance = indexCount;
681 prim.StartVertexLocation = firstIndex;
682 prim.InstanceCount = instanceCount;
683 prim.StartInstanceLocation = firstInstance;
684 prim.BaseVertexLocation = vertexOffset;
685 }
686 }
687
/* Auto-Draw / Indirect Registers
 *
 * MMIO registers the command streamer reads 3DPRIMITIVE parameters from
 * when Indirect Parameter Enable is set; loaded below via emit_lrm/emit_lri.
 */
#define GEN7_3DPRIM_END_OFFSET 0x2420
#define GEN7_3DPRIM_START_VERTEX 0x2430
#define GEN7_3DPRIM_VERTEX_COUNT 0x2434
#define GEN7_3DPRIM_INSTANCE_COUNT 0x2438
#define GEN7_3DPRIM_START_INSTANCE 0x243C
#define GEN7_3DPRIM_BASE_VERTEX 0x2440
695
696 static void
697 emit_lrm(struct anv_batch *batch,
698 uint32_t reg, struct anv_bo *bo, uint32_t offset)
699 {
700 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
701 lrm.RegisterAddress = reg;
702 lrm.MemoryAddress = (struct anv_address) { bo, offset };
703 }
704 }
705
706 static void
707 emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
708 {
709 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
710 lri.RegisterOffset = reg;
711 lri.DataDWord = imm;
712 }
713 }
714
715 void genX(CmdDrawIndirect)(
716 VkCommandBuffer commandBuffer,
717 VkBuffer _buffer,
718 VkDeviceSize offset,
719 uint32_t drawCount,
720 uint32_t stride)
721 {
722 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
723 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
724 struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
725 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
726 struct anv_bo *bo = buffer->bo;
727 uint32_t bo_offset = buffer->offset + offset;
728
729 genX(cmd_buffer_flush_state)(cmd_buffer);
730
731 if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
732 emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
733
734 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
735 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
736 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
737 emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
738 emit_lri(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, 0);
739
740 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
741 prim.IndirectParameterEnable = true;
742 prim.VertexAccessType = SEQUENTIAL;
743 prim.PrimitiveTopologyType = pipeline->topology;
744 }
745 }
746
/* vkCmdDrawIndexedIndirect: indexed 3DPRIMITIVE whose parameters the
 * command streamer loads from a VkDrawIndexedIndirectCommand in `buffer`
 * at `offset`:
 *    { indexCount, instanceCount, firstIndex, vertexOffset, firstInstance }
 */
void genX(CmdDrawIndexedIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    uint32_t                                    drawCount,
    uint32_t                                    stride)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.pipeline;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   struct anv_bo *bo = buffer->bo;
   uint32_t bo_offset = buffer->offset + offset;

   genX(cmd_buffer_flush_state)(cmd_buffer);

   /* TODO: We need to stomp base vertex to 0 somehow */
   /* vertexOffset/firstInstance sit at +12 in the indirect command; the
    * VS sees vertexOffset as its base vertex here (see TODO above).
    */
   if (vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
      emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);

   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
   emit_lrm(&cmd_buffer->batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);

   anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
      prim.IndirectParameterEnable = true;
      prim.VertexAccessType = RANDOM; /* indexed */
      prim.PrimitiveTopologyType = pipeline->topology;
   }
}
779
#if GEN_GEN == 7

/* Return true when the kernel command parser version is at least
 * required_version.  Otherwise report VK_ERROR_FEATURE_NOT_PRESENT (naming
 * the calling entry point) and return false.
 */
static bool
verify_cmd_parser(const struct anv_device *device,
                  int required_version,
                  const char *function)
{
   if (device->instance->physicalDevice.cmd_parser_version >= required_version)
      return true;

   vk_errorf(VK_ERROR_FEATURE_NOT_PRESENT,
             "cmd parser version %d is required for %s",
             required_version, function);
   return false;
}

#endif
798
/* vkCmdDispatch: launch an x * y * z grid of compute workgroups. */
void genX(CmdDispatch)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    x,
    uint32_t                                    y,
    uint32_t                                    z)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   /* If the shader reads gl_NumWorkGroups, stash the dispatch size in
    * dynamic state and record where it lives in cmd_buffer->state before
    * the compute state flush below (which presumably consumes it — it is
    * the only call between here and the walker; confirm in
    * cmd_buffer_flush_compute_state).
    */
   if (prog_data->uses_num_work_groups) {
      struct anv_state state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
      uint32_t *sizes = state.map;
      sizes[0] = x;
      sizes[1] = y;
      sizes[2] = z;
      /* Without LLC the GPU does not snoop the CPU cache, so flush. */
      if (!cmd_buffer->device->info.has_llc)
         anv_state_clflush(state);
      cmd_buffer->state.num_workgroups_offset = state.offset;
      cmd_buffer->state.num_workgroups_bo =
         &cmd_buffer->device->dynamic_state_block_pool.bo;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
      ggw.SIMDSize = prog_data->simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
      ggw.ThreadGroupIDXDimension = x;
      ggw.ThreadGroupIDYDimension = y;
      ggw.ThreadGroupIDZDimension = z;
      ggw.RightExecutionMask = pipeline->cs_right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
}
839
/* MMIO registers the command streamer loads indirect GPGPU_WALKER
 * thread-group counts from.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* MI_PREDICATE source registers, used below to skip zero-sized dispatches
 * on gen7.
 */
#define MI_PREDICATE_SRC0 0x2400
#define MI_PREDICATE_SRC1 0x2408
846
/* vkCmdDispatchIndirect: GPGPU_WALKER with thread-group counts loaded from
 * `buffer` at `offset` ({x, y, z}, one dword each).  On gen7 the walker is
 * additionally predicated so it is skipped whenever any dimension is zero
 * (see the MI_PREDICATE sequence below).
 */
void genX(CmdDispatchIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct anv_bo *bo = buffer->bo;
   uint32_t bo_offset = buffer->offset + offset;
   struct anv_batch *batch = &cmd_buffer->batch;

#if GEN_GEN == 7
   /* Linux 4.4 added command parser version 5 which allows the GPGPU
    * indirect dispatch registers to be written.
    */
   if (!verify_cmd_parser(cmd_buffer->device, 5, "vkCmdDispatchIndirect"))
      return;
#endif

   /* If the shader reads gl_NumWorkGroups, point the driver state straight
    * at the indirect buffer (no copy needed — the sizes already live there).
    */
   if (prog_data->uses_num_work_groups) {
      cmd_buffer->state.num_workgroups_offset = bo_offset;
      cmd_buffer->state.num_workgroups_bo = bo;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
   emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
   emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);

#if GEN_GEN <= 7
   /* Build predicate = (x == 0 || y == 0 || z == 0), then invert it, so
    * the predicated walker below only runs for non-empty dispatches.
    */

   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
   emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
   emit_lri(batch, MI_PREDICATE_SRC1 + 0, 0);
   emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);

   /* Load compute_dispatch_indirect_x_size into SRC0 */
   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);

   /* predicate = (compute_dispatch_indirect_x_size == 0); */
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* Load compute_dispatch_indirect_y_size into SRC0 */
   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);

   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* Load compute_dispatch_indirect_z_size into SRC0 */
   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);

   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate = !predicate; */
#define COMPARE_FALSE 1
   anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_FALSE;
   }
#endif

   anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = true;
      ggw.PredicateEnable = GEN_GEN <= 7;
      ggw.SIMDSize = prog_data->simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = prog_data->threads - 1;
      ggw.RightExecutionMask = pipeline->cs_right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   anv_batch_emit(batch, GENX(MEDIA_STATE_FLUSH), msf);
}
937
/* Emit the hardware-required flushes that must precede a PIPELINE_SELECT
 * that switches between the 3D and GPGPU pipelines.  The exact workaround
 * differs per generation, as documented inline below.
 */
static void
flush_pipeline_before_pipeline_select(struct anv_cmd_buffer *cmd_buffer,
                                      uint32_t pipeline)
{
#if GEN_GEN >= 8 && GEN_GEN < 10
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gen9
    * hardware too.
    */
   if (pipeline == GPGPU)
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#elif GEN_GEN <= 7
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: DEVSNB+
    *
    *   Software must ensure all the write caches are flushed through a
    *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *   command to invalidate read only caches prior to programming
    *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
    */
   /* First PIPE_CONTROL: stall and flush the write caches. */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.RenderTargetCacheFlushEnable = true;
      pc.DepthCacheFlushEnable = true;
      pc.DCFlushEnable = true;
      pc.PostSyncOperation = NoWrite;
      pc.CommandStreamerStallEnable = true;
   }

   /* Second PIPE_CONTROL: invalidate the read-only caches. */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.TextureCacheInvalidationEnable = true;
      pc.ConstantCacheInvalidationEnable = true;
      pc.StateCacheInvalidationEnable = true;
      pc.InstructionCacheInvalidateEnable = true;
      pc.PostSyncOperation = NoWrite;
   }
#endif
}
982
983 void
984 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
985 {
986 if (cmd_buffer->state.current_pipeline != _3D) {
987 flush_pipeline_before_pipeline_select(cmd_buffer, _3D);
988
989 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
990 #if GEN_GEN >= 9
991 ps.MaskBits = 3;
992 #endif
993 ps.PipelineSelection = _3D;
994 }
995
996 cmd_buffer->state.current_pipeline = _3D;
997 }
998 }
999
1000 void
1001 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
1002 {
1003 if (cmd_buffer->state.current_pipeline != GPGPU) {
1004 flush_pipeline_before_pipeline_select(cmd_buffer, GPGPU);
1005
1006 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
1007 #if GEN_GEN >= 9
1008 ps.MaskBits = 3;
1009 #endif
1010 ps.PipelineSelection = GPGPU;
1011 }
1012
1013 cmd_buffer->state.current_pipeline = GPGPU;
1014 }
1015 }
1016
1017 struct anv_state
1018 genX(cmd_buffer_alloc_null_surface_state)(struct anv_cmd_buffer *cmd_buffer,
1019 struct anv_framebuffer *fb)
1020 {
1021 struct anv_state state =
1022 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
1023
1024 struct GENX(RENDER_SURFACE_STATE) null_ss = {
1025 .SurfaceType = SURFTYPE_NULL,
1026 .SurfaceArray = fb->layers > 0,
1027 .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
1028 #if GEN_GEN >= 8
1029 .TileMode = YMAJOR,
1030 #else
1031 .TiledSurface = true,
1032 #endif
1033 .Width = fb->width - 1,
1034 .Height = fb->height - 1,
1035 .Depth = fb->layers - 1,
1036 .RenderTargetViewExtent = fb->layers - 1,
1037 };
1038
1039 GENX(RENDER_SURFACE_STATE_pack)(NULL, state.map, &null_ss);
1040
1041 if (!cmd_buffer->device->info.has_llc)
1042 anv_state_clflush(state);
1043
1044 return state;
1045 }
1046
/* Emit the depth/stencil buffer state (3DSTATE_DEPTH_BUFFER,
 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS)
 * for the current subpass's depth/stencil attachment, or the required null
 * programming when no such attachment exists.
 */
static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_device *device = cmd_buffer->device;
   const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
   const struct anv_image_view *iview =
      anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
   const struct anv_image *image = iview ? iview->image : NULL;
   const bool has_depth = image && (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
   const bool has_stencil =
      image && (image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);

   /* FIXME: Implement the PMA stall W/A */
   /* FIXME: Width and Height are wrong */

   /* Emit 3DSTATE_DEPTH_BUFFER */
   if (has_depth) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         db.SurfaceType                   = SURFTYPE_2D;
         db.DepthWriteEnable              = true;
         db.StencilWriteEnable            = has_stencil;
         db.HierarchicalDepthBufferEnable = false;

         db.SurfaceFormat = isl_surf_get_depth_format(&device->isl_dev,
                                                      &image->depth_surface.isl);

         db.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->depth_surface.offset,
         };
         db.DepthBufferObjectControlState = GENX(MOCS),

         /* Pitch and dimension fields are programmed as value-minus-one. */
         db.SurfacePitch         = image->depth_surface.isl.row_pitch - 1;
         db.Height               = image->extent.height - 1;
         db.Width                = image->extent.width - 1;
         db.LOD                  = iview->base_mip;
         db.Depth                = image->array_size - 1; /* FIXME: 3-D */
         db.MinimumArrayElement  = iview->base_layer;

#if GEN_GEN >= 8
         db.SurfaceQPitch =
            isl_surf_get_array_pitch_el_rows(&image->depth_surface.isl) >> 2,
#endif
         db.RenderTargetViewExtent = 1 - 1;
      }
   } else {
      /* Even when no depth buffer is present, the hardware requires that
       * 3DSTATE_DEPTH_BUFFER be programmed correctly. The Broadwell PRM says:
       *
       *    If a null depth buffer is bound, the driver must instead bind depth as:
       *       3DSTATE_DEPTH.SurfaceType = SURFTYPE_2D
       *       3DSTATE_DEPTH.Width = 1
       *       3DSTATE_DEPTH.Height = 1
       *       3DSTATE_DEPTH.SuraceFormat = D16_UNORM
       *       3DSTATE_DEPTH.SurfaceBaseAddress = 0
       *       3DSTATE_DEPTH.HierarchicalDepthBufferEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthTestEnable = 0
       *       3DSTATE_WM_DEPTH_STENCIL.DepthBufferWriteEnable = 0
       *
       * The PRM is wrong, though. The width and height must be programmed to
       * actual framebuffer's width and height, even when neither depth buffer
       * nor stencil buffer is present.  Also, D16_UNORM is not allowed to
       * be combined with a stencil buffer so we use D32_FLOAT instead.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BUFFER), db) {
         db.SurfaceType          = SURFTYPE_2D;
         db.SurfaceFormat        = D32_FLOAT;
         db.Width                = fb->width - 1;
         db.Height               = fb->height - 1;
         db.StencilWriteEnable   = has_stencil;
      }
   }

   /* Emit 3DSTATE_STENCIL_BUFFER */
   if (has_stencil) {
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb) {
#if GEN_GEN >= 8 || GEN_IS_HASWELL
         sb.StencilBufferEnable = true,
#endif
         sb.StencilBufferObjectControlState = GENX(MOCS),

         /* Stencil buffers have strange pitch. The PRM says:
          *
          *    The pitch must be set to 2x the value computed based on width,
          *    as the stencil buffer is stored with two rows interleaved.
          */
         sb.SurfacePitch = 2 * image->stencil_surface.isl.row_pitch - 1,

#if GEN_GEN >= 8
         sb.SurfaceQPitch = isl_surf_get_array_pitch_el_rows(&image->stencil_surface.isl) >> 2,
#endif
         sb.SurfaceBaseAddress = (struct anv_address) {
            .bo = image->bo,
            .offset = image->offset + image->stencil_surface.offset,
         };
      }
   } else {
      /* Zero-length packet: disables the stencil buffer. */
      anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STENCIL_BUFFER), sb);
   }

   /* Disable hierarchial depth buffers. */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hz);

   /* Clear the clear params. */
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLEAR_PARAMS), cp);
}
1153
/**
 * Switch the command buffer to recording commands for @subpass.
 *
 * Fragment-stage descriptors are dirtied because descriptor contents can
 * depend on the current subpass (presumably input attachments — confirm
 * against anv_cmd_buffer_flush_descriptor_sets), and the depth/stencil
 * buffer state is re-emitted for the new subpass's attachment.
 *
 * @see anv_cmd_buffer_set_subpass()
 */
void
genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
                             struct anv_subpass *subpass)
{
   cmd_buffer->state.subpass = subpass;

   cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;

   cmd_buffer_emit_depth_stencil(cmd_buffer);
}
1167
/* vkCmdBeginRenderPass: record render pass / framebuffer state into the
 * command buffer, switch to the 3D pipeline, enter the first subpass, and
 * perform the load-op clears for its attachments.
 */
void genX(CmdBeginRenderPass)(
    VkCommandBuffer                             commandBuffer,
    const VkRenderPassBeginInfo*                pRenderPassBegin,
    VkSubpassContents                           contents)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBegin->renderPass);
   ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   cmd_buffer->state.framebuffer = framebuffer;
   cmd_buffer->state.pass = pass;
   cmd_buffer->state.render_area = pRenderPassBegin->renderArea;
   anv_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   /* Enter the first subpass, then run its clears. */
   genX(cmd_buffer_set_subpass)(cmd_buffer, pass->subpasses);
   anv_cmd_buffer_clear_subpass(cmd_buffer);
}
1187
/* vkCmdNextSubpass: resolve the subpass being left, advance to the next
 * subpass, and perform the new subpass's load-op clears, in that order.
 */
void genX(CmdNextSubpass)(
    VkCommandBuffer                             commandBuffer,
    VkSubpassContents                           contents)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   /* Subpasses only advance within a primary command buffer here. */
   assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

   anv_cmd_buffer_resolve_subpass(cmd_buffer);
   genX(cmd_buffer_set_subpass)(cmd_buffer, cmd_buffer->state.subpass + 1);
   anv_cmd_buffer_clear_subpass(cmd_buffer);
}
1200
/* vkCmdEndRenderPass: perform the final subpass's resolves before the
 * render pass instance ends.
 */
void genX(CmdEndRenderPass)(
    VkCommandBuffer                             commandBuffer)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   anv_cmd_buffer_resolve_subpass(cmd_buffer);
}
1208
1209 static void
1210 emit_ps_depth_count(struct anv_batch *batch,
1211 struct anv_bo *bo, uint32_t offset)
1212 {
1213 anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1214 pc.DestinationAddressType = DAT_PPGTT;
1215 pc.PostSyncOperation = WritePSDepthCount;
1216 pc.DepthStallEnable = true;
1217 pc.Address = (struct anv_address) { bo, offset };
1218 }
1219 }
1220
1221 static void
1222 emit_query_availability(struct anv_batch *batch,
1223 struct anv_bo *bo, uint32_t offset)
1224 {
1225 anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
1226 pc.DestinationAddressType = DAT_PPGTT;
1227 pc.PostSyncOperation = WriteImmediateData;
1228 pc.Address = (struct anv_address) { bo, offset };
1229 pc.ImmediateData = 1;
1230 }
1231 }
1232
/* vkCmdBeginQuery: snapshot the starting counter value into the query
 * pool slot.  Only occlusion queries are implemented here; pipeline
 * statistics queries hit the unreachable below.
 */
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render pass clear leaks into the first query
    * immediately after the clear. Doing a pipecontrol with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable = true;
         pc.DepthStallEnable = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Write the begin depth count at offset 0 of the slot. */
      emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
                          query * sizeof(struct anv_query_pool_slot));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   default:
      unreachable("");
   }
}
1267
/* vkCmdEndQuery: snapshot the ending counter value into the query pool
 * slot and mark the slot available.  Slot layout (see the offsets below):
 * begin value at +0, end value at +8, availability at +16.
 */
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* End depth count at +8 of the slot... */
      emit_ps_depth_count(&cmd_buffer->batch, &pool->bo,
                          query * sizeof(struct anv_query_pool_slot) + 8);

      /* ...then availability at +16, written after the counter so a
       * waiting consumer sees a complete result.
       */
      emit_query_availability(&cmd_buffer->batch, &pool->bo,
                              query * sizeof(struct anv_query_pool_slot) + 16);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   default:
      unreachable("");
   }
}
1290
1291 #define TIMESTAMP 0x2358
1292
1293 void genX(CmdWriteTimestamp)(
1294 VkCommandBuffer commandBuffer,
1295 VkPipelineStageFlagBits pipelineStage,
1296 VkQueryPool queryPool,
1297 uint32_t query)
1298 {
1299 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1300 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1301 uint32_t offset = query * sizeof(struct anv_query_pool_slot);
1302
1303 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1304
1305 switch (pipelineStage) {
1306 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
1307 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1308 srm.RegisterAddress = TIMESTAMP;
1309 srm.MemoryAddress = (struct anv_address) { &pool->bo, offset };
1310 }
1311 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1312 srm.RegisterAddress = TIMESTAMP + 4;
1313 srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 4 };
1314 }
1315 break;
1316
1317 default:
1318 /* Everything else is bottom-of-pipe */
1319 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1320 pc.DestinationAddressType = DAT_PPGTT,
1321 pc.PostSyncOperation = WriteTimestamp,
1322 pc.Address = (struct anv_address) { &pool->bo, offset };
1323 }
1324 break;
1325 }
1326
1327 emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16);
1328 }
1329
1330 #if GEN_GEN > 7 || GEN_IS_HASWELL
1331
1332 #define alu_opcode(v) __gen_uint((v), 20, 31)
1333 #define alu_operand1(v) __gen_uint((v), 10, 19)
1334 #define alu_operand2(v) __gen_uint((v), 0, 9)
1335 #define alu(opcode, operand1, operand2) \
1336 alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
1337
1338 #define OPCODE_NOOP 0x000
1339 #define OPCODE_LOAD 0x080
1340 #define OPCODE_LOADINV 0x480
1341 #define OPCODE_LOAD0 0x081
1342 #define OPCODE_LOAD1 0x481
1343 #define OPCODE_ADD 0x100
1344 #define OPCODE_SUB 0x101
1345 #define OPCODE_AND 0x102
1346 #define OPCODE_OR 0x103
1347 #define OPCODE_XOR 0x104
1348 #define OPCODE_STORE 0x180
1349 #define OPCODE_STOREINV 0x580
1350
1351 #define OPERAND_R0 0x00
1352 #define OPERAND_R1 0x01
1353 #define OPERAND_R2 0x02
1354 #define OPERAND_R3 0x03
1355 #define OPERAND_R4 0x04
1356 #define OPERAND_SRCA 0x20
1357 #define OPERAND_SRCB 0x21
1358 #define OPERAND_ACCU 0x31
1359 #define OPERAND_ZF 0x32
1360 #define OPERAND_CF 0x33
1361
1362 #define CS_GPR(n) (0x2600 + (n) * 8)
1363
1364 static void
1365 emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
1366 struct anv_bo *bo, uint32_t offset)
1367 {
1368 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1369 lrm.RegisterAddress = reg,
1370 lrm.MemoryAddress = (struct anv_address) { bo, offset };
1371 }
1372 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1373 lrm.RegisterAddress = reg + 4;
1374 lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
1375 }
1376 }
1377
1378 static void
1379 store_query_result(struct anv_batch *batch, uint32_t reg,
1380 struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
1381 {
1382 anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1383 srm.RegisterAddress = reg;
1384 srm.MemoryAddress = (struct anv_address) { bo, offset };
1385 }
1386
1387 if (flags & VK_QUERY_RESULT_64_BIT) {
1388 anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
1389 srm.RegisterAddress = reg + 4;
1390 srm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
1391 }
1392 }
1393 }
1394
/* vkCmdCopyQueryPoolResults: copy query results from the pool into a
 * buffer on the GPU, using the CS ALU (MI_MATH) to compute occlusion
 * deltas.  Register usage: R0/R1 hold the begin/end counters, R2 holds
 * the value to store.
 */
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset, dst_offset;

   /* WAIT_BIT: stall the command streamer so all prior query writes have
    * landed before we read the pool.
    */
   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard     = true;
      }
   }

   dst_offset = buffer->offset + destOffset;
   for (uint32_t i = 0; i < queryCount; i++) {

      slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         /* Begin count into R0, end count into R1 (slot layout: begin at
          * +0, end at +8, availability at +16).
          */
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(0), &pool->bo, slot_offset);
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(1), &pool->bo, slot_offset + 8);

         /* FIXME: We need to clamp the result for 32 bit. */

         /* R2 = R1 - R0, i.e. samples passed between begin and end. */
         uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
         dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[3] = alu(OPCODE_SUB, 0, 0);
         dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         /* Timestamps are stored directly; just load into R2. */
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset);
         break;

      default:
         unreachable("unhandled query type");
      }

      store_query_result(&cmd_buffer->batch,
                         CS_GPR(2), buffer->bo, dst_offset, flags);

      /* Availability follows the result value, at +8 for 64-bit results
       * and +4 for 32-bit ones.
       */
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset + 16);
         if (flags & VK_QUERY_RESULT_64_BIT)
            store_query_result(&cmd_buffer->batch,
                               CS_GPR(0), buffer->bo, dst_offset + 8, flags);
         else
            store_query_result(&cmd_buffer->batch,
                               CS_GPR(0), buffer->bo, dst_offset + 4, flags);
      }

      dst_offset += destStride;
   }
}
1463
1464 #endif