i965: Fix execution size of scalar TCS barrier setup code.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_meta_util.c
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_context.h"
25 #include "intel_fbo.h"
26 #include "brw_meta_util.h"
27 #include "brw_state.h"
28 #include "main/blend.h"
29 #include "main/fbobject.h"
30 #include "util/format_srgb.h"
31
/**
 * Normalizes one axis of a blit rectangle so that coord0 <= coord1.
 *
 * When the pair arrives in descending order the two values are exchanged
 * and the "mirror" flag is toggled, recording that the blit is flipped
 * along this axis.  Otherwise nothing is modified.
 *
 * \param mirror  in/out flag toggled each time a swap happens
 * \param coord0  in/out first coordinate of the pair
 * \param coord1  in/out second coordinate of the pair
 */
static inline void
fixup_mirroring(bool *mirror, float *coord0, float *coord1)
{
   const float first = *coord0;
   const float second = *coord1;

   /* Already ordered (or not comparable): leave everything alone. */
   if (!(first > second))
      return;

   *coord0 = second;
   *coord1 = first;
   *mirror = !*mirror;
}
47
/**
 * Returns how far "outer" extends past "inner", or zero when it does not.
 */
static inline float
clip_overhang(float inner, float outer)
{
   return inner < outer ? outer - inner : 0;
}

/**
 * Works out, for every side of a rectangle, how many pixels stick out of a
 * clipping region.
 *
 * \param x0 The rect's left coordinate
 * \param y0 The rect's bottom coordinate
 * \param x1 The rect's right coordinate
 * \param y1 The rect's top coordinate
 * \param min_x The clipping region's left coordinate
 * \param min_y The clipping region's bottom coordinate
 * \param max_x The clipping region's right coordinate
 * \param max_y The clipping region's top coordinate
 * \param clipped_x0 Receives the number of pixels to clip from the left side
 * \param clipped_y0 Receives the number of pixels to clip from the bottom side
 * \param clipped_x1 Receives the number of pixels to clip from the right side
 * \param clipped_y1 Receives the number of pixels to clip from the top side
 *
 * \return false if the rect and the clipping region do not overlap (or
 *         either one is inverted), in which case none of the outputs are
 *         written; true otherwise.
 */
static inline bool
compute_pixels_clipped(float x0, float y0, float x1, float y1,
                       float min_x, float min_y, float max_x, float max_y,
                       float *clipped_x0, float *clipped_y0,
                       float *clipped_x1, float *clipped_y1)
{
   /* Both the region and the rect must be well-formed, and the rect must
    * touch the region on both axes; otherwise everything is clipped away.
    */
   const bool anything_left = min_x <= max_x &&
                              min_y <= max_y &&
                              x0 <= max_x &&
                              y0 <= max_y &&
                              min_x <= x1 &&
                              min_y <= y1 &&
                              x0 <= x1 &&
                              y0 <= y1;
   if (!anything_left)
      return false;

   /* Horizontal overhang on the left and right sides. */
   *clipped_x0 = clip_overhang(x0, min_x);
   *clipped_x1 = clip_overhang(max_x, x1);

   /* Vertical overhang on the bottom and top sides. */
   *clipped_y0 = clip_overhang(y0, min_y);
   *clipped_y1 = clip_overhang(max_y, y1);

   return true;
}
103
/**
 * Clips one side (left, right, bottom or top) of a blit.
 *
 * Whichever of the src or dst rect needs the larger clip on this side
 * (compared in src pixels, i.e. dst amounts are multiplied by scale) is
 * clipped by exactly that amount, and the matching coordinate of the other
 * rect is moved by the scaled equivalent.  When the blit is mirrored the
 * src side pairs up with the opposite dst side, so dst1/clipped_dst1 are
 * used instead of dst0/clipped_dst0 and the dst adjustment runs in the
 * opposite direction.
 *
 * \param mirror true if mirroring is required
 * \param src the source rect coordinate (for example srcX0)
 * \param dst0 the dst rect coordinate (for example dstX0)
 * \param dst1 the opposite dst rect coordinate (for example dstX1)
 * \param clipped_src0 number of pixels to clip from the src coordinate
 * \param clipped_dst0 number of pixels to clip from the dst coordinate
 * \param clipped_dst1 number of pixels to clip from the opposite dst coordinate
 * \param scale the src vs dst scale involved for that coordinate
 * \param isLeftOrBottom true if we are clipping the left or bottom sides
 *                       of the rect.
 */
static inline void
clip_coordinates(bool mirror,
                 float *src, float *dst0, float *dst1,
                 float clipped_src0,
                 float clipped_dst0,
                 float clipped_dst1,
                 float scale,
                 bool isLeftOrBottom)
{
   /* Left/bottom sides move inwards by adding pixels, right/top sides by
    * subtracting them.
    */
   const int src_sign = isLeftOrBottom ? 1 : -1;

   /* Pick the dst coordinate that corresponds to this src side.  With
    * mirroring it is the opposite one, and it moves the other way.
    */
   float *const dst = mirror ? dst1 : dst0;
   const float clipped_dst = mirror ? clipped_dst1 : clipped_dst0;
   const int dst_sign = mirror ? -src_sign : src_sign;

   if (clipped_src0 >= clipped_dst * scale) {
      /* The src clip dominates: apply it and scale it down for dst. */
      *src += clipped_src0 * src_sign;
      *dst += clipped_src0 / scale * dst_sign;
   } else {
      /* The dst clip dominates: apply it and scale it up for src. */
      *dst += clipped_dst * dst_sign;
      *src += clipped_dst * scale * src_sign;
   }
}
155
156 bool
157 brw_meta_mirror_clip_and_scissor(const struct gl_context *ctx,
158 const struct gl_framebuffer *read_fb,
159 const struct gl_framebuffer *draw_fb,
160 GLfloat *srcX0, GLfloat *srcY0,
161 GLfloat *srcX1, GLfloat *srcY1,
162 GLfloat *dstX0, GLfloat *dstY0,
163 GLfloat *dstX1, GLfloat *dstY1,
164 bool *mirror_x, bool *mirror_y)
165 {
166 *mirror_x = false;
167 *mirror_y = false;
168
169 /* Detect if the blit needs to be mirrored */
170 fixup_mirroring(mirror_x, srcX0, srcX1);
171 fixup_mirroring(mirror_x, dstX0, dstX1);
172 fixup_mirroring(mirror_y, srcY0, srcY1);
173 fixup_mirroring(mirror_y, dstY0, dstY1);
174
175 /* Compute number of pixels to clip for each side of both rects. Return
176 * early if we are going to clip everything away.
177 */
178 float clip_src_x0;
179 float clip_src_x1;
180 float clip_src_y0;
181 float clip_src_y1;
182 float clip_dst_x0;
183 float clip_dst_x1;
184 float clip_dst_y0;
185 float clip_dst_y1;
186
187 if (!compute_pixels_clipped(*srcX0, *srcY0, *srcX1, *srcY1,
188 0, 0, read_fb->Width, read_fb->Height,
189 &clip_src_x0, &clip_src_y0, &clip_src_x1, &clip_src_y1))
190 return true;
191
192 if (!compute_pixels_clipped(*dstX0, *dstY0, *dstX1, *dstY1,
193 draw_fb->_Xmin, draw_fb->_Ymin, draw_fb->_Xmax, draw_fb->_Ymax,
194 &clip_dst_x0, &clip_dst_y0, &clip_dst_x1, &clip_dst_y1))
195 return true;
196
197 /* When clipping any of the two rects we need to adjust the coordinates in
198 * the other rect considering the scaling factor involved. To obtain the best
199 * precision we want to make sure that we only clip once per side to avoid
200 * accumulating errors due to the scaling adjustment.
201 *
202 * For example, if srcX0 and dstX0 need both to be clipped we want to avoid
203 * the situation where we clip srcX0 first, then adjust dstX0 accordingly
204 * but then we realize that the resulting dstX0 still needs to be clipped,
205 * so we clip dstX0 and adjust srcX0 again. Because we are applying scaling
206 * factors to adjust the coordinates in each clipping pass we lose some
207 * precision and that can affect the results of the blorp blit operation
208 * slightly. What we want to do here is detect the rect that we should
209 * clip first for each side so that when we adjust the other rect we ensure
210 * the resulting coordinate does not need to be clipped again.
211 *
212 * The code below implements this by comparing the number of pixels that
213 * we need to clip for each side of both rects considering the scales
214 * involved. For example, clip_src_x0 represents the number of pixels to be
215 * clipped for the src rect's left side, so if clip_src_x0 = 5,
216 * clip_dst_x0 = 4 and scaleX = 2 it means that we are clipping more from
217 * the dst rect so we should clip dstX0 only and adjust srcX0. This is
218 * because clipping 4 pixels in the dst is equivalent to clipping
219 * 4 * 2 = 8 > 5 in the src.
220 */
221
222 float scaleX = (float) (*srcX1 - *srcX0) / (*dstX1 - *dstX0);
223 float scaleY = (float) (*srcY1 - *srcY0) / (*dstY1 - *dstY0);
224
225 /* Clip left side */
226 clip_coordinates(*mirror_x,
227 srcX0, dstX0, dstX1,
228 clip_src_x0, clip_dst_x0, clip_dst_x1,
229 scaleX, true);
230
231 /* Clip right side */
232 clip_coordinates(*mirror_x,
233 srcX1, dstX1, dstX0,
234 clip_src_x1, clip_dst_x1, clip_dst_x0,
235 scaleX, false);
236
237 /* Clip bottom side */
238 clip_coordinates(*mirror_y,
239 srcY0, dstY0, dstY1,
240 clip_src_y0, clip_dst_y0, clip_dst_y1,
241 scaleY, true);
242
243 /* Clip top side */
244 clip_coordinates(*mirror_y,
245 srcY1, dstY1, dstY0,
246 clip_src_y1, clip_dst_y1, clip_dst_y0,
247 scaleY, false);
248
249 /* Account for the fact that in the system framebuffer, the origin is at
250 * the lower left.
251 */
252 if (_mesa_is_winsys_fbo(read_fb)) {
253 GLint tmp = read_fb->Height - *srcY0;
254 *srcY0 = read_fb->Height - *srcY1;
255 *srcY1 = tmp;
256 *mirror_y = !*mirror_y;
257 }
258 if (_mesa_is_winsys_fbo(draw_fb)) {
259 GLint tmp = draw_fb->Height - *dstY0;
260 *dstY0 = draw_fb->Height - *dstY1;
261 *dstY1 = tmp;
262 *mirror_y = !*mirror_y;
263 }
264
265 return false;
266 }
267
268 /**
269 * Creates a new named renderbuffer that wraps the first slice
270 * of an existing miptree.
271 *
272 * Clobbers the current renderbuffer binding (ctx->CurrentRenderbuffer).
273 */
274 struct gl_renderbuffer *
275 brw_get_rb_for_slice(struct brw_context *brw,
276 struct intel_mipmap_tree *mt,
277 unsigned level, unsigned layer, bool flat)
278 {
279 struct gl_context *ctx = &brw->ctx;
280 struct gl_renderbuffer *rb = ctx->Driver.NewRenderbuffer(ctx, 0xDEADBEEF);
281 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
282
283 rb->RefCount = 1;
284 rb->Format = mt->format;
285 rb->_BaseFormat = _mesa_get_format_base_format(mt->format);
286
287 /* Program takes care of msaa and mip-level access manually for stencil.
288 * The surface is also treated as Y-tiled instead of as W-tiled calling for
289 * twice the width and half the height in dimensions.
290 */
291 if (flat) {
292 const unsigned halign_stencil = 8;
293
294 rb->NumSamples = 0;
295 rb->Width = ALIGN(mt->total_width, halign_stencil) * 2;
296 rb->Height = (mt->total_height / mt->physical_depth0) / 2;
297 irb->mt_level = 0;
298 } else {
299 rb->NumSamples = mt->num_samples;
300 rb->Width = mt->logical_width0;
301 rb->Height = mt->logical_height0;
302 irb->mt_level = level;
303 }
304
305 irb->mt_layer = layer;
306
307 intel_miptree_reference(&irb->mt, mt);
308
309 return rb;
310 }
311
312 /**
313 * Determine if fast color clear supports the given clear color.
314 *
315 * Fast color clear can only clear to color values of 1.0 or 0.0. At the
316 * moment we only support floating point, unorm, and snorm buffers.
317 */
318 bool
319 brw_is_color_fast_clear_compatible(struct brw_context *brw,
320 const struct intel_mipmap_tree *mt,
321 const union gl_color_union *color)
322 {
323 const struct gl_context *ctx = &brw->ctx;
324
325 /* If we're mapping the render format to a different format than the
326 * format we use for texturing then it is a bit questionable whether it
327 * should be possible to use a fast clear. Although we only actually
328 * render using a renderable format, without the override workaround it
329 * wouldn't be possible to have a non-renderable surface in a fast clear
330 * state so the hardware probably legitimately doesn't need to support
331 * this case. At least on Gen9 this really does seem to cause problems.
332 */
333 if (brw->gen >= 9 &&
334 brw_format_for_mesa_format(mt->format) !=
335 brw->render_target_format[mt->format])
336 return false;
337
338 /* Gen9 doesn't support fast clear on single-sampled SRGB buffers. When
339 * GL_FRAMEBUFFER_SRGB is enabled any color renderbuffers will be
340 * resolved in intel_update_state. In that case it's pointless to do a
341 * fast clear because it's very likely to be immediately resolved.
342 */
343 if (brw->gen >= 9 &&
344 mt->num_samples <= 1 &&
345 ctx->Color.sRGBEnabled &&
346 _mesa_get_srgb_format_linear(mt->format) != mt->format)
347 return false;
348
349 const mesa_format format = _mesa_get_render_format(ctx, mt->format);
350 if (_mesa_is_format_integer_color(format)) {
351 if (brw->gen >= 8) {
352 perf_debug("Integer fast clear not enabled for (%s)",
353 _mesa_get_format_name(format));
354 }
355 return false;
356 }
357
358 for (int i = 0; i < 4; i++) {
359 if (!_mesa_format_has_color_component(format, i)) {
360 continue;
361 }
362
363 if (brw->gen < 9 &&
364 color->f[i] != 0.0f && color->f[i] != 1.0f) {
365 return false;
366 }
367 }
368 return true;
369 }
370
371 /**
372 * Convert the given color to a bitfield suitable for ORing into DWORD 7 of
373 * SURFACE_STATE (DWORD 12-15 on SKL+).
374 *
375 * Returned boolean tells if the given color differs from the stored.
376 */
377 bool
378 brw_meta_set_fast_clear_color(struct brw_context *brw,
379 struct intel_mipmap_tree *mt,
380 const union gl_color_union *color)
381 {
382 union gl_color_union override_color = *color;
383
384 /* The sampler doesn't look at the format of the surface when the fast
385 * clear color is used so we need to implement luminance, intensity and
386 * missing components manually.
387 */
388 switch (_mesa_get_format_base_format(mt->format)) {
389 case GL_INTENSITY:
390 override_color.ui[3] = override_color.ui[0];
391 /* flow through */
392 case GL_LUMINANCE:
393 case GL_LUMINANCE_ALPHA:
394 override_color.ui[1] = override_color.ui[0];
395 override_color.ui[2] = override_color.ui[0];
396 break;
397 default:
398 for (int i = 0; i < 3; i++) {
399 if (!_mesa_format_has_color_component(mt->format, i))
400 override_color.ui[i] = 0;
401 }
402 break;
403 }
404
405 if (!_mesa_format_has_color_component(mt->format, 3)) {
406 if (_mesa_is_format_integer_color(mt->format))
407 override_color.ui[3] = 1;
408 else
409 override_color.f[3] = 1.0f;
410 }
411
412 /* Handle linear→SRGB conversion */
413 if (brw->ctx.Color.sRGBEnabled &&
414 _mesa_get_srgb_format_linear(mt->format) != mt->format) {
415 for (int i = 0; i < 3; i++) {
416 override_color.f[i] =
417 util_format_linear_to_srgb_float(override_color.f[i]);
418 }
419 }
420
421 bool updated;
422 if (brw->gen >= 9) {
423 updated = memcmp(&mt->gen9_fast_clear_color, &override_color,
424 sizeof(mt->gen9_fast_clear_color));
425 mt->gen9_fast_clear_color = override_color;
426 } else {
427 const uint32_t old_color_value = mt->fast_clear_color_value;
428
429 mt->fast_clear_color_value = 0;
430 for (int i = 0; i < 4; i++) {
431 /* Testing for non-0 works for integer and float colors */
432 if (override_color.f[i] != 0.0f) {
433 mt->fast_clear_color_value |=
434 1 << (GEN7_SURFACE_CLEAR_COLOR_SHIFT + (3 - i));
435 }
436 }
437
438 updated = (old_color_value != mt->fast_clear_color_value);
439 }
440
441 return updated;
442 }
443
444 /* The x0, y0, x1, and y1 parameters must already be populated with the render
445 * area of the framebuffer to be cleared.
446 */
447 void
448 brw_get_fast_clear_rect(const struct brw_context *brw,
449 const struct isl_surf *aux_surf,
450 unsigned *x0, unsigned *y0,
451 unsigned *x1, unsigned *y1)
452 {
453 unsigned int x_align, y_align;
454 unsigned int x_scaledown, y_scaledown;
455
456 /* Only single sampled surfaces need to (and actually can) be resolved. */
457 if (aux_surf->usage == ISL_SURF_USAGE_CCS_BIT) {
458 /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
459 * Target(s)", beneath the "Fast Color Clear" bullet (p327):
460 *
461 * Clear pass must have a clear rectangle that must follow
462 * alignment rules in terms of pixels and lines as shown in the
463 * table below. Further, the clear-rectangle height and width
464 * must be multiple of the following dimensions. If the height
465 * and width of the render target being cleared do not meet these
466 * requirements, an MCS buffer can be created such that it
467 * follows the requirement and covers the RT.
468 *
469 * The alignment size in the table that follows is related to the
470 * alignment size that is baked into the CCS surface format but with X
471 * alignment multiplied by 16 and Y alignment multiplied by 32.
472 */
473 x_align = isl_format_get_layout(aux_surf->format)->bw;
474 y_align = isl_format_get_layout(aux_surf->format)->bh;
475
476 x_align *= 16;
477
478 /* SKL+ line alignment requirement for Y-tiled are half those of the prior
479 * generations.
480 */
481 if (brw->gen >= 9)
482 y_align *= 16;
483 else
484 y_align *= 32;
485
486 /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
487 * Target(s)", beneath the "Fast Color Clear" bullet (p327):
488 *
489 * In order to optimize the performance MCS buffer (when bound to
490 * 1X RT) clear similarly to MCS buffer clear for MSRT case,
491 * clear rect is required to be scaled by the following factors
492 * in the horizontal and vertical directions:
493 *
494 * The X and Y scale down factors in the table that follows are each
495 * equal to half the alignment value computed above.
496 */
497 x_scaledown = x_align / 2;
498 y_scaledown = y_align / 2;
499
500 /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel
501 * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color
502 * Clear of Non-MultiSampled Render Target Restrictions":
503 *
504 * Clear rectangle must be aligned to two times the number of
505 * pixels in the table shown below due to 16x16 hashing across the
506 * slice.
507 */
508 x_align *= 2;
509 y_align *= 2;
510 } else {
511 assert(aux_surf->usage == ISL_SURF_USAGE_MCS_BIT);
512
513 /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
514 * Target(s)", beneath the "MSAA Compression" bullet (p326):
515 *
516 * Clear pass for this case requires that scaled down primitive
517 * is sent down with upper left co-ordinate to coincide with
518 * actual rectangle being cleared. For MSAA, clear rectangle’s
519 * height and width need to as show in the following table in
520 * terms of (width,height) of the RT.
521 *
522 * MSAA Width of Clear Rect Height of Clear Rect
523 * 2X Ceil(1/8*width) Ceil(1/2*height)
524 * 4X Ceil(1/8*width) Ceil(1/2*height)
525 * 8X Ceil(1/2*width) Ceil(1/2*height)
526 * 16X width Ceil(1/2*height)
527 *
528 * The text "with upper left co-ordinate to coincide with actual
529 * rectangle being cleared" is a little confusing--it seems to imply
530 * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to
531 * feed the pipeline using the rectangle (x,y) to
532 * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on
533 * the number of samples. Experiments indicate that this is not
534 * quite correct; actually, what the hardware appears to do is to
535 * align whatever rectangle is sent down the pipeline to the nearest
536 * multiple of 2x2 blocks, and then scale it up by a factor of N
537 * horizontally and 2 vertically. So the resulting alignment is 4
538 * vertically and either 4 or 16 horizontally, and the scaledown
539 * factor is 2 vertically and either 2 or 8 horizontally.
540 */
541 switch (aux_surf->format) {
542 case ISL_FORMAT_MCS_2X:
543 case ISL_FORMAT_MCS_4X:
544 x_scaledown = 8;
545 break;
546 case ISL_FORMAT_MCS_8X:
547 x_scaledown = 2;
548 break;
549 case ISL_FORMAT_MCS_16X:
550 x_scaledown = 1;
551 break;
552 default:
553 unreachable("Unexpected MCS format for fast clear");
554 }
555 y_scaledown = 2;
556 x_align = x_scaledown * 2;
557 y_align = y_scaledown * 2;
558 }
559
560 *x0 = ROUND_DOWN_TO(*x0, x_align) / x_scaledown;
561 *y0 = ROUND_DOWN_TO(*y0, y_align) / y_scaledown;
562 *x1 = ALIGN(*x1, x_align) / x_scaledown;
563 *y1 = ALIGN(*y1, y_align) / y_scaledown;
564 }
565
566 void
567 brw_get_ccs_resolve_rect(const struct isl_device *dev,
568 const struct isl_surf *ccs_surf,
569 unsigned *x0, unsigned *y0,
570 unsigned *x1, unsigned *y1)
571 {
572 unsigned x_scaledown, y_scaledown;
573
574 /* From the Ivy Bridge PRM, Vol2 Part1 11.9 "Render Target Resolve":
575 *
576 * A rectangle primitive must be scaled down by the following factors
577 * with respect to render target being resolved.
578 *
579 * The scaledown factors in the table that follows are related to the block
580 * size of the CCS format. For IVB and HSW, we divide by two, for BDW we
581 * multiply by 8 and 16. On Sky Lake, we multiply by 8.
582 */
583 const struct isl_format_layout *fmtl =
584 isl_format_get_layout(ccs_surf->format);
585 assert(fmtl->txc == ISL_TXC_CCS);
586
587 if (ISL_DEV_GEN(dev) >= 9) {
588 x_scaledown = fmtl->bw * 8;
589 y_scaledown = fmtl->bh * 8;
590 } else if (ISL_DEV_GEN(dev) >= 8) {
591 x_scaledown = fmtl->bw * 8;
592 y_scaledown = fmtl->bh * 16;
593 } else {
594 x_scaledown = fmtl->bw / 2;
595 y_scaledown = fmtl->bh / 2;
596 }
597 *x0 = *y0 = 0;
598 *x1 = ALIGN(ccs_surf->logical_level0_px.width, x_scaledown) / x_scaledown;
599 *y1 = ALIGN(ccs_surf->logical_level0_px.height, y_scaledown) / y_scaledown;
600 }