/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "si_build_pm4.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
static void si_set_streamout_enable(struct si_context *sctx, bool enable);
static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}
static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
                                                             struct pipe_resource *buffer,
                                                             unsigned buffer_offset,
                                                             unsigned buffer_size)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
   u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                        &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size);
   if (!t->buf_filled_size) {
      FREE(t);
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
   return &t->b;
}
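/* Note (added commentary, hedged): the filled-size scratch allocation above
 * differs by path. Legacy streamout stores a single 32-bit BUFFER_FILLED_SIZE
 * that the CP writes with STRMOUT_BUFFER_UPDATE, hence 4 bytes; the NGG path
 * copies the counter out of GDS with an end-of-pipe release_mem event and,
 * judging from this file, reserves 8 bytes for that write. Both come from the
 * zeroed-memory suballocator, so a freshly created target reads back as empty.
 */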
static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}
void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}
static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases which require flushing it are VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      if (sctx->screen->use_ngg_streamout)
         si_allocate_gds(sctx);

      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources. */
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
         }

         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
      } else {
         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx);
}
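/* Usage sketch (added commentary; a hypothetical gallium caller, not part of
 * this driver): a state tracker drives the hooks above through pipe_context,
 * with "pipe" a pipe_context and "buf" a previously created pipe_resource:
 *
 *    struct pipe_stream_output_target *tgt =
 *       pipe->create_stream_output_target(pipe, buf, 0, buf->width0);
 *    unsigned offset = (unsigned)-1;  // -1 requests appending to the
 *                                     // previously saved filled size
 *    pipe->set_stream_output_targets(pipe, 1, &tgt, &offset);
 *    // ... draw with streamout active ...
 *    pipe->set_stream_output_targets(pipe, 0, NULL, NULL);  // unbind
 *    pipe->stream_output_target_destroy(pipe, tgt);
 */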
static void gfx10_emit_streamout_begin(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned last_target = 0;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (t[i])
         last_target = i;
   }

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      bool append = sctx->streamout.append_bitmask & (1 << i);
      uint64_t va = 0;

      if (append) {
         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);

         va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      }

      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, 4 * i); /* destination in GDS */
      radeon_emit(cs, 0);
      radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }

   sctx->streamout.begin_emitted = true;
}
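/* Note (added commentary, hedged): in this NGG path each target's running
 * filled size lives in GDS at dword offset i (byte offset 4 * i above). The
 * DMA_DATA packet seeds that dword either from the saved filled size in
 * memory (append) or with the literal 0 (start from the beginning). Write
 * confirmation and CP_SYNC are only requested on the last copy, so the CP
 * stalls once for the whole group rather than once per buffer.
 */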
static void gfx10_emit_streamout_end(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
                        t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
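/* Note (added commentary, hedged): the PS_DONE release_mem above asks the CP,
 * once pixel work has drained, to copy one dword out of GDS into the
 * buf_filled_size buffer through TC L2; EOP_DATA_GDS(i, 1) appears to select
 * GDS dword i with a count of 1, matching the per-target layout used by the
 * begin path. Setting buf_filled_size_valid afterwards is what allows a later
 * begin to append from the saved value.
 */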
static void si_flush_vgt_streamout(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   unsigned reg_strmout_cntl;

   /* The register is at different places on different ASICs. */
   if (sctx->chip_class >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
   radeon_emit(cs, reg_strmout_cntl >> 2);          /* register */
   radeon_emit(cs, 0);                              /* unused */
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(cs, 4);                              /* poll interval */
}
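/* Note (added commentary, hedged): this reads as a classic CP handshake.
 * Clearing CP_STRMOUT_CNTL and raising the SO_VGTSTREAMOUT_FLUSH event makes
 * VGT set OFFSET_UPDATE_DONE once its streamout bookkeeping is written back;
 * WAIT_REG_MEM then polls that bit so packets emitted afterwards observe
 * up-to-date buffer offsets.
 */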
static void si_emit_streamout_begin(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
   unsigned i;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = stride_in_dw[i];

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader
       * through SGPRs what to do. */
      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
      radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
      radeon_emit(cs, stride_in_dw[i]);                                    /* VTX_STRIDE (in DW) */

      if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
         uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

         /* Append. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, 0);        /* unused */
         radeon_emit(cs, va);       /* src address lo */
         radeon_emit(cs, va >> 32); /* src address hi */

         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);
      } else {
         /* Start from the beginning. */
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(cs, 0);                          /* unused */
         radeon_emit(cs, 0);                          /* unused */
         radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
         radeon_emit(cs, 0);                          /* unused */
      }
   }

   sctx->streamout.begin_emitted = true;
}
void si_emit_streamout_end(struct si_context *sctx)
{
   if (sctx->screen->use_ngg_streamout) {
      gfx10_emit_streamout_end(sctx);
      return;
   }

   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   unsigned i;
   uint64_t va;

   si_flush_vgt_streamout(sctx);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
      radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                         STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
      radeon_emit(cs, va);       /* dst address lo */
      radeon_emit(cs, va >> 32); /* dst address hi */
      radeon_emit(cs, 0);        /* unused */
      radeon_emit(cs, 0);        /* unused */

      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE,
                                RADEON_PRIO_SO_FILLED_SIZE);

      /* Zero the buffer size. The counters (primitives generated,
       * primitives emitted) may be enabled even if there is no
       * buffer bound. This ensures that the primitives-emitted query
       * won't increment. */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
      sctx->context_roll = true;

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}
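/* Note (added commentary, hedged): on this legacy path the filled size saved
 * by STRMOUT_BUFFER_UPDATE above is exactly what the FROM_MEM branch of
 * si_emit_streamout_begin reloads when append_bitmask requests appending,
 * which is why buf_filled_size_valid is only set once this store has been
 * emitted.
 */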
/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */
static void si_emit_streamout_enable(struct si_context *sctx)
{
   assert(!sctx->screen->use_ngg_streamout);

   radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(sctx->gfx_cs,
               S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
                  S_028B94_RAST_STREAM(0) |
                  S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
                  S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
                  S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
   radeon_emit(sctx->gfx_cs,
               sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
}
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
   bool old_strmout_en = si_get_strmout_en(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

   sctx->streamout.hw_enabled_mask =
      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
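   /* Worked example (added commentary): the buffer-config register written in
    * si_emit_streamout_enable takes one 4-bit buffer mask per stream, so the
    * bound-buffer mask is replicated into all four nibbles; e.g.
    * enabled_mask = 0x3 (buffers 0-1 bound) yields
    * hw_enabled_mask = 0x3333. */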
   if (!sctx->screen->use_ngg_streamout &&
       ((old_strmout_en != si_get_strmout_en(sctx)) ||
        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
{
   if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_strmout_en(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
      assert(sctx->streamout.num_prims_gen_queries >= 0);

      sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;

      if (old_strmout_en != si_get_strmout_en(sctx))
         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

      if (si_update_ngg(sctx)) {
         si_shader_change_notify(sctx);
         sctx->do_update_shaders = true;
      }
   }
}
void si_init_streamout_functions(struct si_context *sctx)
{
   sctx->b.create_stream_output_target = si_create_so_target;
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;

   if (sctx->screen->use_ngg_streamout) {
      sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
   } else {
      sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
   }
}