radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB
[mesa.git] / src / gallium / drivers / radeonsi / si_compute_blit.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
static enum si_cache_policy get_cache_policy(struct si_context *sctx,
					     enum si_coherency coher,
					     uint64_t size)
{
	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
					  coher == SI_COHERENCY_CP)) ||
	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

	return L2_BYPASS;
}

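/* Return the cache flush flags needed to make data written with the given
 * coherency usage visible: nothing for the CP, L1/SMEM invalidation (plus L2
 * when L2 is bypassed) for shaders, and a CB flush for CB metadata.
 */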
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
			    enum si_cache_policy cache_policy)
{
	switch (coher) {
	default:
	case SI_COHERENCY_NONE:
	case SI_COHERENCY_CP:
		return 0;
	case SI_COHERENCY_SHADER:
		return SI_CONTEXT_INV_SMEM_L1 |
		       SI_CONTEXT_INV_VMEM_L1 |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
	case SI_COHERENCY_CB_META:
		return SI_CONTEXT_FLUSH_AND_INV_CB;
	}
}

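/* Clear (src == NULL) or copy (src != NULL) a buffer range with a compute
 * shader. Offsets and the size must be dword-aligned; the clear value is
 * 4, 8, or 16 bytes and is replicated up to 16 bytes of user data.
 */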
static void si_compute_do_clear_or_copy(struct si_context *sctx,
					struct pipe_resource *dst,
					unsigned dst_offset,
					struct pipe_resource *src,
					unsigned src_offset,
					unsigned size,
					const uint32_t *clear_value,
					unsigned clear_value_size,
					enum si_coherency coher)
{
	struct pipe_context *ctx = &sctx->b;

	assert(src_offset % 4 == 0);
	assert(dst_offset % 4 == 0);
	assert(size % 4 == 0);

	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
	assert(!src || src_offset + size <= src->width0);

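	/* Wait for previous shader work and flush the caches required by the
	 * destination coherency before the compute shader writes the buffer.
	 */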
	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		       SI_CONTEXT_CS_PARTIAL_FLUSH |
		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
	si_emit_cache_flush(sctx);

	/* Save states. */
	void *saved_cs = sctx->cs_shader_state.program;
	struct pipe_shader_buffer saved_sb[2] = {};
	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

	/* The memory accesses are coalesced, meaning that the 1st instruction writes
	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
	 * writes the 2nd contiguous block of data, etc.
	 */
	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
	unsigned dwords_per_wave = dwords_per_thread * 64;

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

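	/* Launch one 64-thread workgroup (one wave) per dwords_per_wave dwords
	 * of the range; each thread processes dwords_per_thread dwords.
	 */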
	struct pipe_grid_info info = {};
	info.block[0] = MIN2(64, num_instructions);
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
	info.grid[1] = 1;
	info.grid[2] = 1;

	struct pipe_shader_buffer sb[2] = {};
	sb[0].buffer = dst;
	sb[0].buffer_offset = dst_offset;
	sb[0].buffer_size = size;

	if (src) {
		sb[1].buffer = src;
		sb[1].buffer_offset = src_offset;
		sb[1].buffer_size = size;

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
	} else {
		assert(clear_value_size >= 4 &&
		       clear_value_size <= 16 &&
		       util_is_power_of_two_or_zero(clear_value_size));

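		/* Replicate the clear value to fill all 16 bytes of user data
		 * consumed by the clear shader.
		 */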
		for (unsigned i = 0; i < 4; i++)
			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
	}

	ctx->launch_grid(ctx, &info);

	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

	if (cache_policy != L2_BYPASS)
		r600_resource(dst)->TC_L2_dirty = true;

	/* Restore states. */
	ctx->bind_compute_state(ctx, saved_cs);
	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
}

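/* Clear "size" bytes of "dst" at "offset" with the given clear value. The
 * value is reduced or replicated to dwords where possible, and the clear is
 * routed to a compute shader, CP DMA, or a streamout-based blitter clear
 * depending on the value size, alignment, and clear size.
 */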
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
		     uint64_t offset, uint64_t size, uint32_t *clear_value,
		     uint32_t clear_value_size, enum si_coherency coher)
{
	if (!size)
		return;

	unsigned clear_alignment = MIN2(clear_value_size, 4);

	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
	assert(offset % clear_alignment == 0);
	assert(size % clear_alignment == 0);
	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

	/* Reduce a large clear value size if possible. */
	if (clear_value_size > 4) {
		bool clear_dword_duplicated = true;

		/* See if we can lower large fills to dword fills. */
		for (unsigned i = 1; i < clear_value_size / 4; i++) {
			if (clear_value[0] != clear_value[i]) {
				clear_dword_duplicated = false;
				break;
			}
		}
		if (clear_dword_duplicated)
			clear_value_size = 4;
	}

	/* Expand a small clear value size. */
	uint32_t tmp_clear_value;
	if (clear_value_size <= 2) {
		if (clear_value_size == 1) {
			tmp_clear_value = *(uint8_t*)clear_value;
			tmp_clear_value |= (tmp_clear_value << 8) |
					   (tmp_clear_value << 16) |
					   (tmp_clear_value << 24);
		} else {
			tmp_clear_value = *(uint16_t*)clear_value;
			tmp_clear_value |= tmp_clear_value << 16;
		}
		clear_value = &tmp_clear_value;
		clear_value_size = 4;
	}

	/* Use transform feedback for 12-byte clears. */
	/* TODO: Use compute. */
	if (clear_value_size == 12) {
		union pipe_color_union streamout_clear_value;

		memcpy(&streamout_clear_value, clear_value, clear_value_size);
		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
		util_blitter_clear_buffer(sctx->blitter, dst, offset,
					  size, clear_value_size / 4,
					  &streamout_clear_value);
		si_blitter_end(sctx);
		return;
	}

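	/* Clear the dword-aligned prefix with compute or CP DMA; any 1-3
	 * trailing bytes are written with pipe_buffer_write below.
	 */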
	uint64_t aligned_size = size & ~3ull;
	if (aligned_size >= 4) {
		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
		 * use CP DMA clears on those chips, because we can't be certain
		 * about buffer placements.
		 */
		if (clear_value_size > 4 ||
		    (clear_value_size == 4 &&
		     offset % 4 == 0 &&
		     (size > 32*1024 || sctx->chip_class <= VI))) {
			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
						    aligned_size, clear_value,
						    clear_value_size, coher);
		} else {
			assert(clear_value_size == 4);
			si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
					       aligned_size, *clear_value, 0, coher,
					       get_cache_policy(sctx, coher, size));
		}

		offset += aligned_size;
		size -= aligned_size;
	}

	/* Handle non-dword alignment. */
	if (size) {
		assert(dst);
		assert(dst->target == PIPE_BUFFER);
		assert(size < 4);

		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
	}
}

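/* pipe_context::clear_buffer entry point. */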
static void si_pipe_clear_buffer(struct pipe_context *ctx,
				 struct pipe_resource *dst,
				 unsigned offset, unsigned size,
				 const void *clear_value,
				 int clear_value_size)
{
	enum si_coherency coher;

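	/* Streamout filled-size buffers are written by the CP rather than by
	 * shaders, so CP coherency is sufficient for them.
	 */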
	if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
		coher = SI_COHERENCY_CP;
	else
		coher = SI_COHERENCY_SHADER;

	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
			clear_value_size, coher);
}

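/* Copy "size" bytes from "src" to "dst", using a compute shader for large,
 * dword-aligned VRAM-to-VRAM copies on dGPUs and CP DMA otherwise.
 */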
void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
	if (!size)
		return;

	enum si_coherency coher = SI_COHERENCY_SHADER;
	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

	/* Only use compute for VRAM copies on dGPUs. */
	if (sctx->screen->info.has_dedicated_vram &&
	    r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
	    r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
	    size > 32 * 1024 &&
	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
					    size, NULL, 0, coher);
	} else {
		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
				      0, coher, cache_policy);
	}
}

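/* Hook the compute-based clear into the gallium context. */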
void si_init_compute_blit_functions(struct si_context *sctx)
{
	sctx->b.clear_buffer = si_pipe_clear_buffer;
}