src/gallium/drivers/radeonsi/si_compute_blit.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"

/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
 * and L2_STREAM for src.
 */
static enum si_cache_policy get_cache_policy(struct si_context *sctx,
                                             enum si_coherency coher,
                                             uint64_t size)
{
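        /* Keep small transfers cached in L2 (LRU) when the consumer can see
         * them through L2: shaders on CIK and newer, and the CP or CB
         * metadata on GFX9 and newer. Larger transfers stream through L2,
         * and everything else bypasses it.
         */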
        if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
                                          coher == SI_COHERENCY_CP)) ||
            (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
                return size <= 256 * 1024 ? L2_LRU : L2_STREAM;

        return L2_BYPASS;
}

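/* Return the cache-flush flags needed so that other blocks can safely read
 * data written with the given coherency mode: shader coherency invalidates
 * the scalar and vector L1 caches (plus L2 when the write bypassed L2),
 * CB-metadata coherency flushes and invalidates the CB caches, and CP or
 * no coherency needs no cache flushes.
 */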
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
{
        switch (coher) {
        default:
        case SI_COHERENCY_NONE:
        case SI_COHERENCY_CP:
                return 0;
        case SI_COHERENCY_SHADER:
                return SI_CONTEXT_INV_SMEM_L1 |
                       SI_CONTEXT_INV_VMEM_L1 |
                       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
        case SI_COHERENCY_CB_META:
                return SI_CONTEXT_FLUSH_AND_INV_CB;
        }
}

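/* Clear or copy a buffer range with a compute shader. If src is NULL, the
 * range [dst_offset, dst_offset + size) is filled with the repeating
 * clear_value pattern of clear_value_size bytes; otherwise size bytes are
 * copied from src_offset in src to dst_offset in dst. All offsets and the
 * size must be multiples of 4.
 */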
static void si_compute_do_clear_or_copy(struct si_context *sctx,
                                        struct pipe_resource *dst,
                                        unsigned dst_offset,
                                        struct pipe_resource *src,
                                        unsigned src_offset,
                                        unsigned size,
                                        const uint32_t *clear_value,
                                        unsigned clear_value_size,
                                        enum si_coherency coher)
{
        struct pipe_context *ctx = &sctx->b;

        assert(src_offset % 4 == 0);
        assert(dst_offset % 4 == 0);
        assert(size % 4 == 0);

        assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
        assert(!src || src_offset + size <= src->width0);

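        /* Wait for previous graphics/compute work and invalidate/flush the
         * caches as needed, so the dispatch below sees up-to-date buffer
         * contents.
         */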
        sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                       SI_CONTEXT_CS_PARTIAL_FLUSH |
                       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

        /* Save states. */
        void *saved_cs = sctx->cs_shader_state.program;
        struct pipe_shader_buffer saved_sb[2] = {};
        si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);

        /* The memory accesses are coalesced, meaning that the 1st instruction writes
         * the 1st contiguous block of data for the whole wave, the 2nd instruction
         * writes the 2nd contiguous block of data, etc.
         */
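        /* Each thread handles dwords_per_thread dwords, split into dword4
         * (or smaller) loads/stores, so one 64-thread wave covers
         * dwords_per_thread * 64 dwords. For example, if the per-thread
         * constant were 16 dwords, each thread would issue 4 dword4
         * instructions and one wave would process 4 KiB.
         */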
        unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
                                           SI_COMPUTE_CLEAR_DW_PER_THREAD;
        unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
        unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
        unsigned dwords_per_wave = dwords_per_thread * 64;

        unsigned num_dwords = size / 4;
        unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

        struct pipe_grid_info info = {};
        info.block[0] = MIN2(64, num_instructions);
        info.block[1] = 1;
        info.block[2] = 1;
        info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
        info.grid[1] = 1;
        info.grid[2] = 1;

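        /* The destination is bound as shader buffer slot 0 and, for copies,
         * the source as slot 1.
         */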
        struct pipe_shader_buffer sb[2] = {};
        sb[0].buffer = dst;
        sb[0].buffer_offset = dst_offset;
        sb[0].buffer_size = size;

        bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;

        if (src) {
                sb[1].buffer = src;
                sb[1].buffer_offset = src_offset;
                sb[1].buffer_size = size;

                ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);

                if (!sctx->cs_copy_buffer) {
                        sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
                                                       SI_COMPUTE_COPY_DW_PER_THREAD,
                                                       shader_dst_stream_policy, true);
                }
                ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
        } else {
                assert(clear_value_size >= 4 &&
                       clear_value_size <= 16 &&
                       util_is_power_of_two_or_zero(clear_value_size));

                for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];

                ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);

                if (!sctx->cs_clear_buffer) {
                        sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
                                                       SI_COMPUTE_CLEAR_DW_PER_THREAD,
                                                       shader_dst_stream_policy, false);
                }
                ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
        }

        ctx->launch_grid(ctx, &info);

        enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
        sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
                       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);

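        /* If the result may stay in L2, remember that L2 has to be written
         * back before engines that bypass it (e.g. SDMA) read this buffer.
         */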
        if (cache_policy != L2_BYPASS)
                r600_resource(dst)->TC_L2_dirty = true;

        /* Restore states. */
        ctx->bind_compute_state(ctx, saved_cs);
        ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
}

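/* Fill a buffer range with a repeating clear value pattern, choosing between
 * compute, CP DMA, streamout and a CPU write based on the pattern size, the
 * clear size and the chip. clear_value_size is expected to be 1, 2, 4, 8, 12
 * or 16 bytes; offset and size only need to be aligned to the pattern size
 * (at most 4).
 *
 * A minimal, hypothetical usage example that clears 64 KiB of a buffer to 0:
 *
 *    uint32_t zero = 0;
 *    si_clear_buffer(sctx, buf, 0, 64 * 1024, &zero, 4, SI_COHERENCY_SHADER);
 */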
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                     uint64_t offset, uint64_t size, uint32_t *clear_value,
                     uint32_t clear_value_size, enum si_coherency coher)
{
        if (!size)
                return;

        unsigned clear_alignment = MIN2(clear_value_size, 4);

        assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
        assert(offset % clear_alignment == 0);
        assert(size % clear_alignment == 0);
        assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */

        /* Reduce a large clear value size if possible. */
        if (clear_value_size > 4) {
                bool clear_dword_duplicated = true;

                /* See if we can lower large fills to dword fills. */
                for (unsigned i = 1; i < clear_value_size / 4; i++) {
                        if (clear_value[0] != clear_value[i]) {
                                clear_dword_duplicated = false;
                                break;
                        }
                }
                if (clear_dword_duplicated)
                        clear_value_size = 4;
        }

        /* Expand a small clear value size. */
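        /* E.g. a 1-byte value 0xAB is replicated to the dword 0xABABABAB and
         * a 2-byte value 0x1234 to 0x12341234, so the dword paths below can
         * be used.
         */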
        uint32_t tmp_clear_value;
        if (clear_value_size <= 2) {
                if (clear_value_size == 1) {
                        tmp_clear_value = *(uint8_t*)clear_value;
                        tmp_clear_value |= (tmp_clear_value << 8) |
                                           (tmp_clear_value << 16) |
                                           (tmp_clear_value << 24);
                } else {
                        tmp_clear_value = *(uint16_t*)clear_value;
                        tmp_clear_value |= tmp_clear_value << 16;
                }
                clear_value = &tmp_clear_value;
                clear_value_size = 4;
        }

        /* Use transform feedback for 12-byte clears. */
        /* TODO: Use compute. */
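        /* A 12-byte pattern (e.g. an RGB32 color) is not a power of two, so
         * neither the compute shader nor CP DMA can replicate it; fall back
         * to the streamout-based clear in util_blitter.
         */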
        if (clear_value_size == 12) {
                union pipe_color_union streamout_clear_value;

                memcpy(&streamout_clear_value, clear_value, clear_value_size);
                si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
                util_blitter_clear_buffer(sctx->blitter, dst, offset,
                                          size, clear_value_size / 4,
                                          &streamout_clear_value);
                si_blitter_end(sctx);
                return;
        }

        uint64_t aligned_size = size & ~3ull;
        if (aligned_size >= 4) {
                /* Before GFX9, CP DMA was very slow when clearing GTT, so never
                 * use CP DMA clears on those chips, because we can't be certain
                 * about buffer placements.
                 */
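                /* Use the compute clear for 8-16 byte patterns and for large
                 * dword clears (or any dword clear on VI and older); keep
                 * small dword clears on GFX9+ on CP DMA.
                 */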
                if (clear_value_size > 4 ||
                    (clear_value_size == 4 &&
                     offset % 4 == 0 &&
                     (size > 32*1024 || sctx->chip_class <= VI))) {
                        si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
                                                    aligned_size, clear_value,
                                                    clear_value_size, coher);
                } else {
                        assert(clear_value_size == 4);
                        si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
                                               aligned_size, *clear_value, 0, coher,
                                               get_cache_policy(sctx, coher, size));
                }

                offset += aligned_size;
                size -= aligned_size;
        }

        /* Handle non-dword alignment. */
        if (size) {
                assert(dst);
                assert(dst->target == PIPE_BUFFER);
                assert(size < 4);

                pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
        }
}

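/* pipe_context::clear_buffer entry point. Streamout "buffer filled size"
 * buffers are maintained by the CP, so they only need CP coherency;
 * everything else is assumed to be read by shaders.
 */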
static void si_pipe_clear_buffer(struct pipe_context *ctx,
                                 struct pipe_resource *dst,
                                 unsigned offset, unsigned size,
                                 const void *clear_value,
                                 int clear_value_size)
{
        enum si_coherency coher;

        if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
                coher = SI_COHERENCY_CP;
        else
                coher = SI_COHERENCY_SHADER;

        si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
                        clear_value_size, coher);
}

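/* Copy size bytes from src to dst. Large, dword-aligned VRAM-to-VRAM copies
 * on dGPUs go through the compute path; everything else uses CP DMA.
 *
 * A minimal, hypothetical usage example copying the first 1 MiB of a buffer:
 *
 *    si_copy_buffer(sctx, dst_buf, src_buf, 0, 0, 1024 * 1024);
 */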
void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size)
{
        if (!size)
                return;

        enum si_coherency coher = SI_COHERENCY_SHADER;
        enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

        /* Only use compute for VRAM copies on dGPUs. */
        if (sctx->screen->info.has_dedicated_vram &&
            r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
            r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
            size > 32 * 1024 &&
            dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
                si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
                                            size, NULL, 0, coher);
        } else {
                si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
                                      0, coher, cache_policy);
        }
}

void si_init_compute_blit_functions(struct si_context *sctx)
{
        sctx->b.clear_buffer = si_pipe_clear_buffer;
}