/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "si_build_pm4.h"

#include "util/u_memory.h"
#include "util/u_suballoc.h"
static void si_set_streamout_enable(struct si_context *sctx, bool enable);
static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}
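
/* Create a streamout target. Besides the gallium state, each target carries
 * a small BUFFER_FILLED_SIZE counter sub-allocated from zeroed memory
 * (8 bytes for NGG streamout, 4 bytes for the legacy VGT path). */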
static struct pipe_stream_output_target *
si_create_so_target(struct pipe_context *ctx,
                    struct pipe_resource *buffer,
                    unsigned buffer_offset,
                    unsigned buffer_size)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
   u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                        &t->buf_filled_size_offset,
                        (struct pipe_resource **)&t->buf_filled_size);
   if (!t->buf_filled_size) {
      FREE(t);
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
   return &t->b;
}
static void si_so_target_destroy(struct pipe_context *ctx,
                                 struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}
void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}
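
/* Bind a new set of streamout targets: flush/invalidate caches for the old
 * buffers, stop any streamout in progress, reference the new targets, and
 * expose them to the shaders as RW buffers. */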
static void si_set_streamout_targets(struct pipe_context *ctx,
                                     unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases which require flushing it are VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE |
                     SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      if (sctx->screen->use_ngg_streamout)
         si_allocate_gds(sctx);

      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                     SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources. */
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset +
                               targets[i]->buffer_size;
         }

         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
      } else {
         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx);
}
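
/* NGG streamout (GFX10): the buffer-filled sizes live in GDS. Upload the
 * saved counters (or zeros when not appending) into GDS with DMA_DATA
 * packets before streamout starts. */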
static void gfx10_emit_streamout_begin(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned last_target = 0;

   /* Only the packet for the last enabled target syncs (CP_SYNC) and
    * confirms the write. */
   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (t[i])
         last_target = i;
   }

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      bool append = sctx->streamout.append_bitmask & (1 << i);
      uint64_t va = 0;

      if (append) {
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                   t[i]->buf_filled_size,
                                   RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);

         va = t[i]->buf_filled_size->gpu_address +
              t[i]->buf_filled_size_offset;
      }

      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                      S_411_DST_SEL(V_411_GDS) |
                      S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
                      S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   sctx->streamout.begin_emitted = true;
}
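
/* NGG streamout: copy the final counters from GDS back into each target's
 * BUFFER_FILLED_SIZE allocation with a PS_DONE release_mem event. */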
static void gfx10_emit_streamout_end(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
                        EOP_DST_SEL_TC_L2,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                        EOP_DATA_SEL_GDS,
                        t[i]->buf_filled_size, va,
                        EOP_DATA_GDS(i, 1), 0);

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
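
/* Make the CP wait until VGT has finished updating the streamout offsets
 * (CP_STRMOUT_CNTL.OFFSET_UPDATE_DONE) before the VGT_STRMOUT registers
 * are changed. */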
static void si_flush_vgt_streamout(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned reg_strmout_cntl;

   /* The register is at different places on different ASICs. */
   if (sctx->chip_class >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
   radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
   radeon_emit(cs, 0);
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(cs, 4); /* poll interval */
}
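
/* Legacy (non-NGG) streamout begin: program the VGT_STRMOUT registers for
 * each bound target and either load the saved buffer offset from memory
 * (append) or start from offset 0. */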
static void si_emit_streamout_begin(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
   unsigned i;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = stride_in_dw[i];

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader
       * through SGPRs what to do. */
      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
      radeon_emit(cs, (t[i]->b.buffer_offset +
                       t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
      radeon_emit(cs, stride_in_dw[i]);            /* VTX_STRIDE (in DW) */

      if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
         uint64_t va = t[i]->buf_filled_size->gpu_address +
                       t[i]->buf_filled_size_offset;

         /* Append. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                         STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, va); /* src address lo */
         radeon_emit(cs, va >> 32); /* src address hi */

         radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                   t[i]->buf_filled_size,
                                   RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);
      } else {
         /* Start from the beginning. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                         STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, 0); /* unused */
         radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
         radeon_emit(cs, 0); /* unused */
      }
   }

   sctx->streamout.begin_emitted = true;
}
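
/* Stop streamout. On NGG this defers to gfx10_emit_streamout_end; on the
 * legacy path it stores BUFFER_FILLED_SIZE to memory and zeroes
 * VGT_STRMOUT_BUFFER_SIZE_0 so the primitives-emitted query stops
 * incrementing. */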
void si_emit_streamout_end(struct si_context *sctx)
{
   if (sctx->screen->use_ngg_streamout) {
      gfx10_emit_streamout_end(sctx);
      return;
   }

   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   unsigned i;
   uint64_t va;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
      radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                      STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                      STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
      radeon_emit(cs, va);       /* dst address lo */
      radeon_emit(cs, va >> 32); /* dst address hi */
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, 0); /* unused */

      radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
                                t[i]->buf_filled_size,
                                RADEON_USAGE_WRITE,
                                RADEON_PRIO_SO_FILLED_SIZE);

      /* Zero the buffer size. The counters (primitives generated,
       * primitives emitted) may be enabled even if there is no
       * buffer bound. This ensures that the primitives-emitted query
       * won't increment. */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
      sctx->context_roll = true;

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */

static void si_emit_streamout_enable(struct si_context *sctx)
{
   assert(!sctx->screen->use_ngg_streamout);

   radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(sctx->gfx_cs,
               S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
               S_028B94_RAST_STREAM(0) |
               S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
   radeon_emit(sctx->gfx_cs,
               sctx->streamout.hw_enabled_mask &
               sctx->streamout.enabled_stream_buffers_mask);
}
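
/* Enable/disable streamout. hw_enabled_mask replicates the 4-bit buffer
 * enable mask across the 4 vertex streams; si_emit_streamout_enable ANDs it
 * with the mask of buffers actually written by the shaders. */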
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
   bool old_strmout_en = si_get_strmout_en(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

   sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
                                     (sctx->streamout.enabled_mask << 4) |
                                     (sctx->streamout.enabled_mask << 8) |
                                     (sctx->streamout.enabled_mask << 12);

   if (!sctx->screen->use_ngg_streamout &&
       ((old_strmout_en != si_get_strmout_en(sctx)) ||
        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
void si_update_prims_generated_query_state(struct si_context *sctx,
                                           unsigned type, int diff)
{
   if (!sctx->screen->use_ngg_streamout &&
       type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_strmout_en(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
      assert(sctx->streamout.num_prims_gen_queries >= 0);

      sctx->streamout.prims_gen_query_enabled =
         sctx->streamout.num_prims_gen_queries != 0;

      if (old_strmout_en != si_get_strmout_en(sctx))
         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

      if (si_update_ngg(sctx)) {
         si_shader_change_notify(sctx);
         sctx->do_update_shaders = true;
      }
   }
}
void si_init_streamout_functions(struct si_context *sctx)
{
   sctx->b.create_stream_output_target = si_create_so_target;
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;

   if (sctx->screen->use_ngg_streamout) {
      sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
   } else {
      sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
   }
}