/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
25 #include "si_build_pm4.h"
27 #include "util/u_memory.h"
28 #include "util/u_suballoc.h"
30 static void si_set_streamout_enable(struct si_context
*sctx
, bool enable
);
/* Update *dst to point at src, adjusting reference counts.
 * Thin, type-safe wrapper over pipe_so_target_reference() for the
 * driver-private si_streamout_target type. */
static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}
38 static struct pipe_stream_output_target
*
39 si_create_so_target(struct pipe_context
*ctx
,
40 struct pipe_resource
*buffer
,
41 unsigned buffer_offset
,
44 struct si_context
*sctx
= (struct si_context
*)ctx
;
45 struct si_streamout_target
*t
;
46 struct si_resource
*buf
= si_resource(buffer
);
48 t
= CALLOC_STRUCT(si_streamout_target
);
53 u_suballocator_alloc(sctx
->allocator_zeroed_memory
, 4, 4,
54 &t
->buf_filled_size_offset
,
55 (struct pipe_resource
**)&t
->buf_filled_size
);
56 if (!t
->buf_filled_size
) {
61 t
->b
.reference
.count
= 1;
63 pipe_resource_reference(&t
->b
.buffer
, buffer
);
64 t
->b
.buffer_offset
= buffer_offset
;
65 t
->b
.buffer_size
= buffer_size
;
67 util_range_add(&buf
->valid_buffer_range
, buffer_offset
,
68 buffer_offset
+ buffer_size
);
72 static void si_so_target_destroy(struct pipe_context
*ctx
,
73 struct pipe_stream_output_target
*target
)
75 struct si_streamout_target
*t
= (struct si_streamout_target
*)target
;
76 pipe_resource_reference(&t
->b
.buffer
, NULL
);
77 si_resource_reference(&t
->buf_filled_size
, NULL
);
81 void si_streamout_buffers_dirty(struct si_context
*sctx
)
83 if (!sctx
->streamout
.enabled_mask
)
86 si_mark_atom_dirty(sctx
, &sctx
->atoms
.s
.streamout_begin
);
87 si_set_streamout_enable(sctx
, true);
90 static void si_set_streamout_targets(struct pipe_context
*ctx
,
92 struct pipe_stream_output_target
**targets
,
93 const unsigned *offsets
)
95 struct si_context
*sctx
= (struct si_context
*)ctx
;
96 unsigned old_num_targets
= sctx
->streamout
.num_targets
;
99 /* We are going to unbind the buffers. Mark which caches need to be flushed. */
100 if (sctx
->streamout
.num_targets
&& sctx
->streamout
.begin_emitted
) {
101 /* Since streamout uses vector writes which go through TC L2
102 * and most other clients can use TC L2 as well, we don't need
105 * The only cases which requires flushing it is VGT DMA index
106 * fetching (on <= GFX7) and indirect draw data, which are rare
107 * cases. Thus, flag the TC L2 dirtiness in the resource and
108 * handle it at draw call time.
110 for (i
= 0; i
< sctx
->streamout
.num_targets
; i
++)
111 if (sctx
->streamout
.targets
[i
])
112 si_resource(sctx
->streamout
.targets
[i
]->b
.buffer
)->TC_L2_dirty
= true;
114 /* Invalidate the scalar cache in case a streamout buffer is
115 * going to be used as a constant buffer.
117 * Invalidate vL1, because streamout bypasses it (done by
118 * setting GLC=1 in the store instruction), but vL1 in other
119 * CUs can contain outdated data of streamout buffers.
121 * VS_PARTIAL_FLUSH is required if the buffers are going to be
122 * used as an input immediately.
124 sctx
->flags
|= SI_CONTEXT_INV_SMEM_L1
|
125 SI_CONTEXT_INV_VMEM_L1
|
126 SI_CONTEXT_VS_PARTIAL_FLUSH
;
129 /* All readers of the streamout targets need to be finished before we can
130 * start writing to the targets.
133 sctx
->flags
|= SI_CONTEXT_PS_PARTIAL_FLUSH
|
134 SI_CONTEXT_CS_PARTIAL_FLUSH
;
136 /* Streamout buffers must be bound in 2 places:
137 * 1) in VGT by setting the VGT_STRMOUT registers
138 * 2) as shader resources
141 /* Stop streamout. */
142 if (sctx
->streamout
.num_targets
&& sctx
->streamout
.begin_emitted
)
143 si_emit_streamout_end(sctx
);
145 /* Set the new targets. */
146 unsigned enabled_mask
= 0, append_bitmask
= 0;
147 for (i
= 0; i
< num_targets
; i
++) {
148 si_so_target_reference(&sctx
->streamout
.targets
[i
], targets
[i
]);
152 si_context_add_resource_size(sctx
, targets
[i
]->buffer
);
153 enabled_mask
|= 1 << i
;
155 if (offsets
[i
] == ((unsigned)-1))
156 append_bitmask
|= 1 << i
;
159 for (; i
< sctx
->streamout
.num_targets
; i
++)
160 si_so_target_reference(&sctx
->streamout
.targets
[i
], NULL
);
162 sctx
->streamout
.enabled_mask
= enabled_mask
;
163 sctx
->streamout
.num_targets
= num_targets
;
164 sctx
->streamout
.append_bitmask
= append_bitmask
;
166 /* Update dirty state bits. */
168 si_streamout_buffers_dirty(sctx
);
170 si_set_atom_dirty(sctx
, &sctx
->atoms
.s
.streamout_begin
, false);
171 si_set_streamout_enable(sctx
, false);
174 /* Set the shader resources.*/
175 for (i
= 0; i
< num_targets
; i
++) {
177 struct pipe_shader_buffer sbuf
;
178 sbuf
.buffer
= targets
[i
]->buffer
;
179 sbuf
.buffer_offset
= 0;
180 sbuf
.buffer_size
= targets
[i
]->buffer_offset
+
181 targets
[i
]->buffer_size
;
182 si_set_rw_shader_buffer(sctx
, SI_VS_STREAMOUT_BUF0
+ i
, &sbuf
);
183 si_resource(targets
[i
]->buffer
)->bind_history
|= PIPE_BIND_STREAM_OUTPUT
;
185 si_set_rw_shader_buffer(sctx
, SI_VS_STREAMOUT_BUF0
+ i
, NULL
);
188 for (; i
< old_num_targets
; i
++)
189 si_set_rw_shader_buffer(sctx
, SI_VS_STREAMOUT_BUF0
+ i
, NULL
);
192 static void si_flush_vgt_streamout(struct si_context
*sctx
)
194 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
195 unsigned reg_strmout_cntl
;
197 /* The register is at different places on different ASICs. */
198 if (sctx
->chip_class
>= GFX7
) {
199 reg_strmout_cntl
= R_0300FC_CP_STRMOUT_CNTL
;
200 radeon_set_uconfig_reg(cs
, reg_strmout_cntl
, 0);
202 reg_strmout_cntl
= R_0084FC_CP_STRMOUT_CNTL
;
203 radeon_set_config_reg(cs
, reg_strmout_cntl
, 0);
206 radeon_emit(cs
, PKT3(PKT3_EVENT_WRITE
, 0, 0));
207 radeon_emit(cs
, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH
) | EVENT_INDEX(0));
209 radeon_emit(cs
, PKT3(PKT3_WAIT_REG_MEM
, 5, 0));
210 radeon_emit(cs
, WAIT_REG_MEM_EQUAL
); /* wait until the register is equal to the reference value */
211 radeon_emit(cs
, reg_strmout_cntl
>> 2); /* register */
213 radeon_emit(cs
, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
214 radeon_emit(cs
, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
215 radeon_emit(cs
, 4); /* poll interval */
218 static void si_emit_streamout_begin(struct si_context
*sctx
)
220 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
221 struct si_streamout_target
**t
= sctx
->streamout
.targets
;
222 uint16_t *stride_in_dw
= sctx
->streamout
.stride_in_dw
;
225 si_flush_vgt_streamout(sctx
);
227 for (i
= 0; i
< sctx
->streamout
.num_targets
; i
++) {
231 t
[i
]->stride_in_dw
= stride_in_dw
[i
];
233 /* AMD GCN binds streamout buffers as shader resources.
234 * VGT only counts primitives and tells the shader
235 * through SGPRs what to do. */
236 radeon_set_context_reg_seq(cs
, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0
+ 16*i
, 2);
237 radeon_emit(cs
, (t
[i
]->b
.buffer_offset
+
238 t
[i
]->b
.buffer_size
) >> 2); /* BUFFER_SIZE (in DW) */
239 radeon_emit(cs
, stride_in_dw
[i
]); /* VTX_STRIDE (in DW) */
241 if (sctx
->streamout
.append_bitmask
& (1 << i
) && t
[i
]->buf_filled_size_valid
) {
242 uint64_t va
= t
[i
]->buf_filled_size
->gpu_address
+
243 t
[i
]->buf_filled_size_offset
;
246 radeon_emit(cs
, PKT3(PKT3_STRMOUT_BUFFER_UPDATE
, 4, 0));
247 radeon_emit(cs
, STRMOUT_SELECT_BUFFER(i
) |
248 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM
)); /* control */
249 radeon_emit(cs
, 0); /* unused */
250 radeon_emit(cs
, 0); /* unused */
251 radeon_emit(cs
, va
); /* src address lo */
252 radeon_emit(cs
, va
>> 32); /* src address hi */
254 radeon_add_to_buffer_list(sctx
, sctx
->gfx_cs
,
255 t
[i
]->buf_filled_size
,
257 RADEON_PRIO_SO_FILLED_SIZE
);
259 /* Start from the beginning. */
260 radeon_emit(cs
, PKT3(PKT3_STRMOUT_BUFFER_UPDATE
, 4, 0));
261 radeon_emit(cs
, STRMOUT_SELECT_BUFFER(i
) |
262 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET
)); /* control */
263 radeon_emit(cs
, 0); /* unused */
264 radeon_emit(cs
, 0); /* unused */
265 radeon_emit(cs
, t
[i
]->b
.buffer_offset
>> 2); /* buffer offset in DW */
266 radeon_emit(cs
, 0); /* unused */
270 sctx
->streamout
.begin_emitted
= true;
273 void si_emit_streamout_end(struct si_context
*sctx
)
275 struct radeon_cmdbuf
*cs
= sctx
->gfx_cs
;
276 struct si_streamout_target
**t
= sctx
->streamout
.targets
;
280 si_flush_vgt_streamout(sctx
);
282 for (i
= 0; i
< sctx
->streamout
.num_targets
; i
++) {
286 va
= t
[i
]->buf_filled_size
->gpu_address
+ t
[i
]->buf_filled_size_offset
;
287 radeon_emit(cs
, PKT3(PKT3_STRMOUT_BUFFER_UPDATE
, 4, 0));
288 radeon_emit(cs
, STRMOUT_SELECT_BUFFER(i
) |
289 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE
) |
290 STRMOUT_STORE_BUFFER_FILLED_SIZE
); /* control */
291 radeon_emit(cs
, va
); /* dst address lo */
292 radeon_emit(cs
, va
>> 32); /* dst address hi */
293 radeon_emit(cs
, 0); /* unused */
294 radeon_emit(cs
, 0); /* unused */
296 radeon_add_to_buffer_list(sctx
, sctx
->gfx_cs
,
297 t
[i
]->buf_filled_size
,
299 RADEON_PRIO_SO_FILLED_SIZE
);
301 /* Zero the buffer size. The counters (primitives generated,
302 * primitives emitted) may be enabled even if there is not
303 * buffer bound. This ensures that the primitives-emitted query
304 * won't increment. */
305 radeon_set_context_reg(cs
, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0
+ 16*i
, 0);
306 sctx
->context_roll
= true;
308 t
[i
]->buf_filled_size_valid
= true;
311 sctx
->streamout
.begin_emitted
= false;
/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */
321 static void si_emit_streamout_enable(struct si_context
*sctx
)
323 radeon_set_context_reg_seq(sctx
->gfx_cs
, R_028B94_VGT_STRMOUT_CONFIG
, 2);
324 radeon_emit(sctx
->gfx_cs
,
325 S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx
)) |
326 S_028B94_RAST_STREAM(0) |
327 S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx
)) |
328 S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx
)) |
329 S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx
)));
330 radeon_emit(sctx
->gfx_cs
,
331 sctx
->streamout
.hw_enabled_mask
&
332 sctx
->streamout
.enabled_stream_buffers_mask
);
335 static void si_set_streamout_enable(struct si_context
*sctx
, bool enable
)
337 bool old_strmout_en
= si_get_strmout_en(sctx
);
338 unsigned old_hw_enabled_mask
= sctx
->streamout
.hw_enabled_mask
;
340 sctx
->streamout
.streamout_enabled
= enable
;
342 sctx
->streamout
.hw_enabled_mask
= sctx
->streamout
.enabled_mask
|
343 (sctx
->streamout
.enabled_mask
<< 4) |
344 (sctx
->streamout
.enabled_mask
<< 8) |
345 (sctx
->streamout
.enabled_mask
<< 12);
347 if ((old_strmout_en
!= si_get_strmout_en(sctx
)) ||
348 (old_hw_enabled_mask
!= sctx
->streamout
.hw_enabled_mask
))
349 si_mark_atom_dirty(sctx
, &sctx
->atoms
.s
.streamout_enable
);
352 void si_update_prims_generated_query_state(struct si_context
*sctx
,
353 unsigned type
, int diff
)
355 if (type
== PIPE_QUERY_PRIMITIVES_GENERATED
) {
356 bool old_strmout_en
= si_get_strmout_en(sctx
);
358 sctx
->streamout
.num_prims_gen_queries
+= diff
;
359 assert(sctx
->streamout
.num_prims_gen_queries
>= 0);
361 sctx
->streamout
.prims_gen_query_enabled
=
362 sctx
->streamout
.num_prims_gen_queries
!= 0;
364 if (old_strmout_en
!= si_get_strmout_en(sctx
))
365 si_mark_atom_dirty(sctx
, &sctx
->atoms
.s
.streamout_enable
);
369 void si_init_streamout_functions(struct si_context
*sctx
)
371 sctx
->b
.create_stream_output_target
= si_create_so_target
;
372 sctx
->b
.stream_output_target_destroy
= si_so_target_destroy
;
373 sctx
->b
.set_stream_output_targets
= si_set_streamout_targets
;
374 sctx
->atoms
.s
.streamout_begin
.emit
= si_emit_streamout_begin
;
375 sctx
->atoms
.s
.streamout_enable
.emit
= si_emit_streamout_enable
;