radeonsi: remove redundant call to emit_cache_flush in compute clear/copy
[mesa.git] / src/gallium/drivers/radeonsi/si_compute_blit.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
static enum si_cache_policy get_cache_policy(struct si_context *sctx,
                                             enum si_coherency coher,
                                             uint64_t size)
{
        if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
                                          coher == SI_COHERENCY_CP)) ||
            (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
                return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

        return L2_BYPASS;
}

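/* Return the cache invalidation/flush flags needed so that a buffer access
 * with the given coherency usage and cache policy sees coherent data.
 */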
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
        switch (coher) {
        default:
        case SI_COHERENCY_NONE:
        case SI_COHERENCY_CP:
                return 0;
        case SI_COHERENCY_SHADER:
                return SI_CONTEXT_INV_SMEM_L1 |
                       SI_CONTEXT_INV_VMEM_L1 |
                       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
        case SI_COHERENCY_CB_META:
                return SI_CONTEXT_FLUSH_AND_INV_CB;
        }
}

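/* Clear dst with a compute shader when src is NULL (replicating clear_value
 * of clear_value_size bytes), otherwise copy src into dst.  Offsets and size
 * must be dword-aligned.
 */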
static void si_compute_do_clear_or_copy(struct si_context *sctx,
                                        struct pipe_resource *dst,
                                        unsigned dst_offset,
                                        struct pipe_resource *src,
                                        unsigned src_offset,
                                        unsigned size,
                                        const uint32_t *clear_value,
                                        unsigned clear_value_size,
                                        enum si_coherency coher)
{
        struct pipe_context *ctx = &sctx->b;

        assert(src_offset % 4 == 0);
        assert(dst_offset % 4 == 0);
        assert(size % 4 == 0);

        assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
        assert(!src || src_offset + size <= src->width0);

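        /* Wait for prior work and flush the caches needed for coherency with
         * the previous users of the buffers.  The flags are only accumulated
         * in sctx->flags here; the pending flush is emitted as part of the
         * launch_grid call below, so no separate emit_cache_flush call is
         * needed.
         */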
        sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                       SI_CONTEXT_CS_PARTIAL_FLUSH |
                       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

        /* Save states. */
        void *saved_cs = sctx->cs_shader_state.program;
        struct pipe_shader_buffer saved_sb[2] = {};
        si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

        /* The memory accesses are coalesced, meaning that the 1st instruction writes
         * the 1st contiguous block of data for the whole wave, the 2nd instruction
         * writes the 2nd contiguous block of data, etc.
         */
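        /* For example, assuming SI_COMPUTE_CLEAR_DW_PER_THREAD is 4: each
         * thread stores 4 dwords with a single instruction, a 64-thread wave
         * covers 256 dwords (1 KiB), and a 1 MiB clear therefore launches
         * 1024 single-wave workgroups.
         */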
        unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
                                           SI_COMPUTE_CLEAR_DW_PER_THREAD;
        unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
        unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
        unsigned dwords_per_wave = dwords_per_thread * 64;

        unsigned num_dwords = size / 4;
        unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

        struct pipe_grid_info info = {};
        info.block[0] = MIN2(64, num_instructions);
        info.block[1] = 1;
        info.block[2] = 1;
        info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
        info.grid[1] = 1;
        info.grid[2] = 1;

        struct pipe_shader_buffer sb[2] = {};
        sb[0].buffer = dst;
        sb[0].buffer_offset = dst_offset;
        sb[0].buffer_size = size;

        if (src) {
                sb[1].buffer = src;
                sb[1].buffer_offset = src_offset;
                sb[1].buffer_size = size;

                ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
                ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
        } else {
                assert(clear_value_size >= 4 &&
                       clear_value_size <= 16 &&
                       util_is_power_of_two_or_zero(clear_value_size));

                for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

                ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
                ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
        }

        ctx->launch_grid(ctx, &info);

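        /* Make the result visible to the intended consumer.  The shader wrote
         * the data through L2 (see the note at the top of the file), so if
         * the consumer bypasses L2, the data must be written back to memory;
         * otherwise it is enough to mark the buffer as dirty in L2.
         */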
        enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
        sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

        if (cache_policy != L2_BYPASS)
                r600_resource(dst)->TC_L2_dirty = true;

        /* Restore states. */
        ctx->bind_compute_state(ctx, saved_cs);
        ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
}

void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                     uint64_t offset, uint64_t size, uint32_t *clear_value,
                     uint32_t clear_value_size, enum si_coherency coher)
{
        if (!size)
                return;

        unsigned clear_alignment = MIN2(clear_value_size, 4);

        assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
        assert(offset % clear_alignment == 0);
        assert(size % clear_alignment == 0);
        assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

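        /* Normalize the clear value to a single dword where possible: e.g. a
         * 16-byte value made of four identical dwords becomes a 4-byte fill,
         * and a 1-byte value 0xAB is replicated to 0xABABABAB.
         */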
        /* Reduce a large clear value size if possible. */
        if (clear_value_size > 4) {
                bool clear_dword_duplicated = true;

                /* See if we can lower large fills to dword fills. */
                for (unsigned i = 1; i < clear_value_size / 4; i++) {
                        if (clear_value[0] != clear_value[i]) {
                                clear_dword_duplicated = false;
                                break;
                        }
                }
                if (clear_dword_duplicated)
                        clear_value_size = 4;
        }

        /* Expand a small clear value size. */
        uint32_t tmp_clear_value;
        if (clear_value_size <= 2) {
                if (clear_value_size == 1) {
                        tmp_clear_value = *(uint8_t*)clear_value;
                        tmp_clear_value |= (tmp_clear_value << 8) |
                                           (tmp_clear_value << 16) |
                                           (tmp_clear_value << 24);
                } else {
                        tmp_clear_value = *(uint16_t*)clear_value;
                        tmp_clear_value |= tmp_clear_value << 16;
                }
                clear_value = &tmp_clear_value;
                clear_value_size = 4;
        }

        /* Use transform feedback for 12-byte clears. */
        /* TODO: Use compute. */
        if (clear_value_size == 12) {
                union pipe_color_union streamout_clear_value;

                memcpy(&streamout_clear_value, clear_value, clear_value_size);
                si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
                util_blitter_clear_buffer(sctx->blitter, dst, offset,
                                          size, clear_value_size / 4,
                                          &streamout_clear_value);
                si_blitter_end(sctx);
                return;
        }

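        /* Clear the dword-aligned prefix either with the compute shader
         * (clear values wider than a dword, clears larger than 32 KiB, or
         * pre-GFX9 chips where CP DMA into GTT is slow) or with CP DMA.
         * Any 1-3 trailing bytes are written with the CPU below.
         */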
        uint64_t aligned_size = size & ~3ull;
        if (aligned_size >= 4) {
                /* Before GFX9, CP DMA was very slow when clearing GTT, so never
                 * use CP DMA clears on those chips, because we can't be certain
                 * about buffer placements.
                 */
                if (clear_value_size > 4 ||
                    (clear_value_size == 4 &&
                     offset % 4 == 0 &&
                     (size > 32*1024 || sctx->chip_class <= VI))) {
                        si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
                                                    aligned_size, clear_value,
                                                    clear_value_size, coher);
                } else {
                        assert(clear_value_size == 4);
                        si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
                                               aligned_size, *clear_value, 0, coher,
                                               get_cache_policy(sctx, coher, size));
                }

                offset += aligned_size;
                size -= aligned_size;
        }

        /* Handle non-dword alignment. */
        if (size) {
                assert(dst);
                assert(dst->target == PIPE_BUFFER);
                assert(size < 4);

                pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
        }
}

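/* Entry point for pipe_context::clear_buffer.  Buffers flagged as streamout
 * filled-size buffers are accessed by the command processor, so CP coherency
 * is used for them; everything else uses shader coherency.
 */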
static void si_pipe_clear_buffer(struct pipe_context *ctx,
                                 struct pipe_resource *dst,
                                 unsigned offset, unsigned size,
                                 const void *clear_value,
                                 int clear_value_size)
{
        enum si_coherency coher;

        if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
                coher = SI_COHERENCY_CP;
        else
                coher = SI_COHERENCY_SHADER;

        si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
                        clear_value_size, coher);
}

void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
        if (!size)
                return;

        enum si_coherency coher = SI_COHERENCY_SHADER;
        enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

        /* Only use compute for VRAM copies on dGPUs. */
        if (sctx->screen->info.has_dedicated_vram &&
            r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
            r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
            size > 32 * 1024 &&
            dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
                si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
                                            size, NULL, 0, coher);
        } else {
                si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
                                      0, coher, cache_policy);
        }
}

void si_init_compute_blit_functions(struct si_context *sctx)
{
        sctx->b.clear_buffer = si_pipe_clear_buffer;
}