/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_format.h"

#include "freedreno_gmem.h"
#include "freedreno_context.h"
#include "freedreno_resource.h"
#include "freedreno_query_hw.h"
#include "freedreno_util.h"

/*
 * GMEM is the small (i.e. 256KiB for a200, 512KiB for a220, etc) tile buffer
 * inside the GPU.  All rendering happens to GMEM.  Larger render targets
 * are split into tiles that are small enough for the color (and depth and/or
 * stencil, if enabled) buffers to fit within GMEM.  Before rendering a tile,
 * if there was not a clear invalidating the previous tile's contents, we
 * need to restore those contents (system mem -> GMEM), and after all the
 * draw calls, before moving to the next tile, we need to save the tile
 * contents (GMEM -> system mem).
 *
 * The code in this file handles dealing with GMEM and tiling.
 *
 * The structure of the ringbuffer ends up being:
 *
 *     +--<---<-- IB ---<---+---<---+---<---<---<--+
 *     |                    |       |              |
 *     v                    ^       ^              ^
 *   ------------------------------------------------------
 *     | clear/draw cmds | Tile0 | Tile1 | .... | TileN |
 *   ------------------------------------------------------
 *                       ^
 *                       |
 *                       address submitted in issueibcmds
 *
 * Where the per-tile section handles scissor setup, mem2gmem restore (if
 * needed), IB to draw cmds earlier in the ringbuffer, and then gmem2mem
 * resolve.
 */

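/* Maximum bin width supported for the GPU generation; bin height is
 * constrained only by the GMEM-size check in calculate_tiles() below:
 */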
static uint32_t bin_width(struct fd_context *ctx)
{
	if (is_a4xx(ctx->screen))
		return 1024;
	if (is_a3xx(ctx->screen))
		return 992;
	return 512;
}

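/* Compute the GMEM footprint of a bin_w x bin_h bin, assigning each
 * enabled buffer a base offset aligned to 0x4000 bytes.  Worked example
 * (hypothetical numbers): one RGBA8 cbuf (cpp=4) plus 32-bit z (cpp=4)
 * with 256x128 bins gives cbuf_base[0] = 0 using 4*256*128 = 128KiB,
 * then zsbuf_base = align(128KiB, 16KiB) = 128KiB, so 256KiB total.
 */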
static uint32_t
total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp,
		uint32_t bin_w, uint32_t bin_h, struct fd_gmem_stateobj *gmem)
{
	uint32_t total = 0, i;

	for (i = 0; i < 4; i++) {
		if (cbuf_cpp[i]) {
			gmem->cbuf_base[i] = align(total, 0x4000);
			total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h;
		}
	}

	if (zsbuf_cpp) {
		gmem->zsbuf_base = align(total, 0x4000);
		total = gmem->zsbuf_base + zsbuf_cpp * bin_w * bin_h;
	}

	return total;
}

static void
calculate_tiles(struct fd_context *ctx)
{
	struct fd_gmem_stateobj *gmem = &ctx->gmem;
	struct pipe_scissor_state *scissor = &ctx->max_scissor;
	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
	uint32_t gmem_size = ctx->screen->gmemsize_bytes;
	uint32_t minx, miny, width, height;
	uint32_t nbins_x = 1, nbins_y = 1;
	uint32_t bin_w, bin_h;
	uint32_t max_width = bin_width(ctx);
	uint8_t cbuf_cpp[4] = {0}, zsbuf_cpp = 0;
	uint32_t i, j, t, xoff, yoff;
	uint32_t tpp_x, tpp_y;
	bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));

	if (has_zs)
		zsbuf_cpp = util_format_get_blocksize(pfb->zsbuf->format);
	for (i = 0; i < pfb->nr_cbufs; i++) {
		if (pfb->cbufs[i])
			cbuf_cpp[i] = util_format_get_blocksize(pfb->cbufs[i]->format);
		else
			cbuf_cpp[i] = 4;
	}

	if (gmem->zsbuf_cpp == zsbuf_cpp &&
			!memcmp(gmem->cbuf_cpp, cbuf_cpp, sizeof(cbuf_cpp)) &&
			!memcmp(&gmem->scissor, scissor, sizeof(gmem->scissor))) {
		/* everything is up-to-date */
		return;
	}

	if (fd_mesa_debug & FD_DBG_NOSCIS) {
		minx = 0;
		miny = 0;
		width = pfb->width;
		height = pfb->height;
	} else {
		minx = scissor->minx & ~31; /* round down to multiple of 32 */
		miny = scissor->miny & ~31;
		width = scissor->maxx - minx;
		height = scissor->maxy - miny;
	}

	bin_w = align(width, 32);
	bin_h = align(height, 32);

	/* first, find a bin width that satisfies the maximum width
	 * restrictions:
	 */
	while (bin_w > max_width) {
		nbins_x++;
		bin_w = align(width / nbins_x, 32);
	}

	/* then find a bin width/height that satisfies the memory
	 * constraints:
	 */
	DBG("binning input: cbuf cpp: %d %d %d %d, zsbuf cpp: %d; %dx%d",
			cbuf_cpp[0], cbuf_cpp[1], cbuf_cpp[2], cbuf_cpp[3], zsbuf_cpp,
			width, height);
	while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem) > gmem_size) {
		if (bin_w > bin_h) {
			nbins_x++;
			bin_w = align(width / nbins_x, 32);
		} else {
			nbins_y++;
			bin_h = align(height / nbins_y, 32);
		}
	}

	DBG("using %d bins of size %dx%d", nbins_x*nbins_y, bin_w, bin_h);
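
	/* Worked example (hypothetical numbers): a 1920x1080 RGBA8-only
	 * target with 512KiB of GMEM and a3xx's 992px max width starts as
	 * a single 1920x1088 bin, is split in x to satisfy the width limit,
	 * and then the larger dimension is split until the footprint fits,
	 * converging on 5x4 bins of 384x288 (4*384*288 = 432KiB <= 512KiB).
	 */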

	gmem->scissor = *scissor;
	memcpy(gmem->cbuf_cpp, cbuf_cpp, sizeof(cbuf_cpp));
	gmem->zsbuf_cpp = zsbuf_cpp;
	gmem->bin_h = bin_h;
	gmem->bin_w = bin_w;
	gmem->nbins_x = nbins_x;
	gmem->nbins_y = nbins_y;
	gmem->minx = minx;
	gmem->miny = miny;
	gmem->width = width;
	gmem->height = height;

	/*
	 * Assign tiles and pipes:
	 *
	 * At some point it might be worth playing with different
	 * strategies and seeing if that makes much impact on
	 * performance.
	 */

#define div_round_up(v, a)  (((v) + (a) - 1) / (a))
	/* figure out number of tiles per pipe: */
	tpp_x = tpp_y = 1;
	while (div_round_up(nbins_y, tpp_y) > 8)
		tpp_y += 2;
	while ((div_round_up(nbins_y, tpp_y) *
			div_round_up(nbins_x, tpp_x)) > 8)
		tpp_x += 1;
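
	/* e.g. continuing the 5x4 bin example above: tpp_y stays 1 since
	 * div_round_up(4,1) <= 8, then tpp_x grows until
	 * div_round_up(4,1) * div_round_up(5,tpp_x) <= 8, i.e. tpp_x = 3,
	 * using 2*4 = 8 of the available VSC pipes.
	 */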

	/* configure pipes: */
	xoff = yoff = 0;
	for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
		struct fd_vsc_pipe *pipe = &ctx->pipe[i];

		if (xoff >= nbins_x) {
			xoff = 0;
			yoff += tpp_y;
		}

		if (yoff >= nbins_y) {
			break;
		}

		pipe->x = xoff;
		pipe->y = yoff;
		pipe->w = MIN2(tpp_x, nbins_x - xoff);
		pipe->h = MIN2(tpp_y, nbins_y - yoff);

		xoff += tpp_x;
	}

	for (; i < ARRAY_SIZE(ctx->pipe); i++) {
		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
		pipe->x = pipe->y = pipe->w = pipe->h = 0;
	}

#if 0 /* debug */
	printf("%dx%d ... tpp=%dx%d\n", nbins_x, nbins_y, tpp_x, tpp_y);
	for (i = 0; i < 8; i++) {
		struct fd_vsc_pipe *pipe = &ctx->pipe[i];
		printf("pipe[%d]: %ux%u @ %u,%u\n", i,
				pipe->w, pipe->h, pipe->x, pipe->y);
	}
#endif

	/* configure tiles: */
	t = 0;
	yoff = miny;
	for (i = 0; i < nbins_y; i++) {
		uint32_t bw, bh;

		xoff = minx;

		/* clip bin height: */
		bh = MIN2(bin_h, miny + height - yoff);

		for (j = 0; j < nbins_x; j++) {
			struct fd_tile *tile = &ctx->tile[t];
			uint32_t n, p;

			assert(t < ARRAY_SIZE(ctx->tile));

			/* pipe number: */
			p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);

			/* slot number: */
			n = ((i % tpp_y) * tpp_x) + (j % tpp_x);

			/* clip bin width: */
			bw = MIN2(bin_w, minx + width - xoff);

			tile->n = n;
			tile->p = p;
			tile->bin_w = bw;
			tile->bin_h = bh;
			tile->xoff = xoff;
			tile->yoff = yoff;

			t++;

			xoff += bw;
		}

		yoff += bh;
	}
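
	/* e.g. with 5x4 bins of 384x288 covering 1920x1080: the columns
	 * divide 1920 evenly, but the bottom row is clipped to
	 * bh = 1080 - 3*288 = 216, so tiles never extend past the
	 * rendered area.
	 */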

#if 0 /* debug */
	t = 0;
	for (i = 0; i < nbins_y; i++) {
		for (j = 0; j < nbins_x; j++) {
			struct fd_tile *tile = &ctx->tile[t++];
			printf("|p:%u n:%u|", tile->p, tile->n);
		}
		printf("\n");
	}
#endif
}

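/* Walk the tiles, emitting for each one the per-tile setup, the
 * mem2gmem restore (if needed), an IB back into the draw cmds, and
 * the gmem2mem resolve:
 */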
static void
render_tiles(struct fd_context *ctx)
{
	struct fd_gmem_stateobj *gmem = &ctx->gmem;
	int i;

	ctx->emit_tile_init(ctx);

	if (ctx->restore)
		ctx->stats.batch_restore++;

	for (i = 0; i < (gmem->nbins_x * gmem->nbins_y); i++) {
		struct fd_tile *tile = &ctx->tile[i];

		DBG("bin_h=%d, yoff=%d, bin_w=%d, xoff=%d",
				tile->bin_h, tile->yoff, tile->bin_w, tile->xoff);

		ctx->emit_tile_prep(ctx, tile);

		if (ctx->restore) {
			fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_MEM2GMEM);
			ctx->emit_tile_mem2gmem(ctx, tile);
			fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
		}

		ctx->emit_tile_renderprep(ctx, tile);

		fd_hw_query_prepare_tile(ctx, i, ctx->ring);

		/* emit IB to drawcmds: */
		OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
		fd_reset_wfi(ctx);

		/* emit gmem2mem to transfer tile back to system memory: */
		fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_GMEM2MEM);
		ctx->emit_tile_gmem2mem(ctx, tile);
		fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);
	}
}

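/* Single-pass (bypass) rendering directly to system memory, skipping
 * the tiling machinery entirely:
 */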
static void
render_sysmem(struct fd_context *ctx)
{
	ctx->emit_sysmem_prep(ctx);

	fd_hw_query_prepare_tile(ctx, 0, ctx->ring);

	/* emit IB to drawcmds: */
	OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
	fd_reset_wfi(ctx);
}

void
fd_gmem_render_tiles(struct fd_context *ctx)
{
	struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
	uint32_t i, timestamp = 0;
	bool sysmem = false;

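	/* Bypass to sysmem only when it is likely cheaper: a clear, an
	 * explicit gmem_reason, or more than a handful of draws keeps us
	 * on the tiled path:
	 */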
	if (ctx->emit_sysmem_prep) {
		if (ctx->cleared || ctx->gmem_reason || (ctx->num_draws > 5)) {
			DBG("GMEM: cleared=%x, gmem_reason=%x, num_draws=%u",
				ctx->cleared, ctx->gmem_reason, ctx->num_draws);
		} else if (!(fd_mesa_debug & FD_DBG_NOBYPASS)) {
			sysmem = true;
		}
	}

	/* close out the draw cmds by making sure any active queries are
	 * paused:
	 */
	fd_hw_query_set_stage(ctx, ctx->ring, FD_STAGE_NULL);

	/* mark the end of the clear/draw cmds before emitting per-tile cmds: */
	fd_ringmarker_mark(ctx->draw_end);
	fd_ringmarker_mark(ctx->binning_end);

	fd_reset_wfi(ctx);

	ctx->stats.batch_total++;

	if (sysmem) {
		DBG("rendering sysmem (%s/%s)",
			util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
			util_format_short_name(pipe_surface_format(pfb->zsbuf)));
		fd_hw_query_prepare(ctx, 1);
		render_sysmem(ctx);
		ctx->stats.batch_sysmem++;
	} else {
		struct fd_gmem_stateobj *gmem = &ctx->gmem;
		calculate_tiles(ctx);
		DBG("rendering %dx%d tiles (%s/%s)", gmem->nbins_x, gmem->nbins_y,
			util_format_short_name(pipe_surface_format(pfb->cbufs[0])),
			util_format_short_name(pipe_surface_format(pfb->zsbuf)));
		fd_hw_query_prepare(ctx, gmem->nbins_x * gmem->nbins_y);
		render_tiles(ctx);
		ctx->stats.batch_gmem++;
	}

	/* GPU executes starting from tile cmds, which IB back to draw cmds: */
	fd_ringmarker_flush(ctx->draw_end);

	/* mark start for next draw/binning cmds: */
	fd_ringmarker_mark(ctx->draw_start);
	fd_ringmarker_mark(ctx->binning_start);

	fd_reset_wfi(ctx);

	/* update timestamps on render targets: */
	timestamp = fd_ringbuffer_timestamp(ctx->ring);
	for (i = 0; i < pfb->nr_cbufs; i++)
		if (pfb->cbufs[i])
			fd_resource(pfb->cbufs[i]->texture)->timestamp = timestamp;
	if (pfb->zsbuf)
		fd_resource(pfb->zsbuf->texture)->timestamp = timestamp;

	/* reset maximal bounds: */
	ctx->max_scissor.minx = ctx->max_scissor.miny = ~0;
	ctx->max_scissor.maxx = ctx->max_scissor.maxy = 0;

	ctx->dirty = ~0;
}

/* The tile's restore (mem2gmem) can be skipped if the tile is completely
 * contained within the cleared scissor:
 */
static bool
skip_restore(struct pipe_scissor_state *scissor, struct fd_tile *tile)
{
	unsigned minx = tile->xoff;
	unsigned maxx = tile->xoff + tile->bin_w;
	unsigned miny = tile->yoff;
	unsigned maxy = tile->yoff + tile->bin_h;
	return (minx >= scissor->minx) && (maxx <= scissor->maxx) &&
			(miny >= scissor->miny) && (maxy <= scissor->maxy);
}

/* When deciding whether a tile needs mem2gmem, we need to take into
 * account the scissor rect(s) that were cleared.  To simplify we only
 * consider the last scissor rect for each buffer, since the common
 * case would be a single clear.
 */
bool
fd_gmem_needs_restore(struct fd_context *ctx, struct fd_tile *tile,
		uint32_t buffers)
{
	if (!(ctx->restore & buffers))
		return false;

	/* if buffers partially cleared, then slow-path to figure out
	 * if this particular tile needs restoring:
	 */
	if ((buffers & FD_BUFFER_COLOR) &&
			(ctx->partial_cleared & FD_BUFFER_COLOR) &&
			skip_restore(&ctx->cleared_scissor.color, tile))
		return false;
	if ((buffers & FD_BUFFER_DEPTH) &&
			(ctx->partial_cleared & FD_BUFFER_DEPTH) &&
			skip_restore(&ctx->cleared_scissor.depth, tile))
		return false;
	if ((buffers & FD_BUFFER_STENCIL) &&
			(ctx->partial_cleared & FD_BUFFER_STENCIL) &&
			skip_restore(&ctx->cleared_scissor.stencil, tile))
		return false;

	return true;
}