/*
 * Copyright © 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 #include "main/context.h"
25 #include "main/teximage.h"
26 #include "main/fbobject.h"
28 #include "compiler/nir/nir_builder.h"
30 #include "intel_fbo.h"
32 #include "brw_blorp.h"
33 #include "brw_context.h"
34 #include "brw_state.h"
35 #include "brw_meta_util.h"
37 #define FILE_DEBUG_FLAG DEBUG_BLORP
39 static struct intel_mipmap_tree
*
40 find_miptree(GLbitfield buffer_bit
, struct intel_renderbuffer
*irb
)
42 struct intel_mipmap_tree
*mt
= irb
->mt
;
43 if (buffer_bit
== GL_STENCIL_BUFFER_BIT
&& mt
->stencil_mt
)
49 blorp_get_texture_swizzle(const struct intel_renderbuffer
*irb
)
51 return irb
->Base
.Base
._BaseFormat
== GL_RGB
?
52 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ONE
) :
57 do_blorp_blit(struct brw_context
*brw
, GLbitfield buffer_bit
,
58 struct intel_renderbuffer
*src_irb
, mesa_format src_format
,
59 struct intel_renderbuffer
*dst_irb
, mesa_format dst_format
,
60 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
61 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
62 GLenum filter
, bool mirror_x
, bool mirror_y
)
64 /* Find source/dst miptrees */
65 struct intel_mipmap_tree
*src_mt
= find_miptree(buffer_bit
, src_irb
);
66 struct intel_mipmap_tree
*dst_mt
= find_miptree(buffer_bit
, dst_irb
);
68 const bool es3
= _mesa_is_gles3(&brw
->ctx
);
70 brw_blorp_blit_miptrees(brw
,
71 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
72 src_format
, blorp_get_texture_swizzle(src_irb
),
73 dst_mt
, dst_irb
->mt_level
, dst_irb
->mt_layer
,
75 srcX0
, srcY0
, srcX1
, srcY1
,
76 dstX0
, dstY0
, dstX1
, dstY1
,
77 filter
, mirror_x
, mirror_y
,
80 dst_irb
->need_downsample
= true;
84 try_blorp_blit(struct brw_context
*brw
,
85 const struct gl_framebuffer
*read_fb
,
86 const struct gl_framebuffer
*draw_fb
,
87 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
88 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
89 GLenum filter
, GLbitfield buffer_bit
)
91 struct gl_context
*ctx
= &brw
->ctx
;
93 /* Sync up the state of window system buffers. We need to do this before
94 * we go looking for the buffers.
96 intel_prepare_render(brw
);
98 bool mirror_x
, mirror_y
;
99 if (brw_meta_mirror_clip_and_scissor(ctx
, read_fb
, draw_fb
,
100 &srcX0
, &srcY0
, &srcX1
, &srcY1
,
101 &dstX0
, &dstY0
, &dstX1
, &dstY1
,
102 &mirror_x
, &mirror_y
))
106 struct intel_renderbuffer
*src_irb
;
107 struct intel_renderbuffer
*dst_irb
;
108 struct intel_mipmap_tree
*src_mt
;
109 struct intel_mipmap_tree
*dst_mt
;
110 switch (buffer_bit
) {
111 case GL_COLOR_BUFFER_BIT
:
112 src_irb
= intel_renderbuffer(read_fb
->_ColorReadBuffer
);
113 for (unsigned i
= 0; i
< draw_fb
->_NumColorDrawBuffers
; ++i
) {
114 dst_irb
= intel_renderbuffer(draw_fb
->_ColorDrawBuffers
[i
]);
116 do_blorp_blit(brw
, buffer_bit
,
117 src_irb
, src_irb
->Base
.Base
.Format
,
118 dst_irb
, dst_irb
->Base
.Base
.Format
,
119 srcX0
, srcY0
, srcX1
, srcY1
,
120 dstX0
, dstY0
, dstX1
, dstY1
,
121 filter
, mirror_x
, mirror_y
);
124 case GL_DEPTH_BUFFER_BIT
:
126 intel_renderbuffer(read_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
128 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
129 src_mt
= find_miptree(buffer_bit
, src_irb
);
130 dst_mt
= find_miptree(buffer_bit
, dst_irb
);
132 /* We can't handle format conversions between Z24 and other formats
133 * since we have to lie about the surface format. See the comments in
134 * brw_blorp_surface_info::set().
136 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
137 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
))
140 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
141 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
142 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
143 filter
, mirror_x
, mirror_y
);
145 case GL_STENCIL_BUFFER_BIT
:
147 intel_renderbuffer(read_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
149 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
150 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
151 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
152 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
153 filter
, mirror_x
, mirror_y
);
156 unreachable("not reached");
163 brw_blorp_copytexsubimage(struct brw_context
*brw
,
164 struct gl_renderbuffer
*src_rb
,
165 struct gl_texture_image
*dst_image
,
167 int srcX0
, int srcY0
,
168 int dstX0
, int dstY0
,
169 int width
, int height
)
171 struct gl_context
*ctx
= &brw
->ctx
;
172 struct intel_renderbuffer
*src_irb
= intel_renderbuffer(src_rb
);
173 struct intel_texture_image
*intel_image
= intel_texture_image(dst_image
);
175 /* No pixel transfer operations (zoom, bias, mapping), just a blit */
176 if (brw
->ctx
._ImageTransferState
)
179 /* Sync up the state of window system buffers. We need to do this before
180 * we go looking at the src renderbuffer's miptree.
182 intel_prepare_render(brw
);
184 struct intel_mipmap_tree
*src_mt
= src_irb
->mt
;
185 struct intel_mipmap_tree
*dst_mt
= intel_image
->mt
;
187 /* There is support for only up to eight samples. */
188 if (src_mt
->num_samples
> 8 || dst_mt
->num_samples
> 8)
191 /* BLORP is only supported from Gen6 onwards. */
195 if (_mesa_get_format_base_format(src_rb
->Format
) !=
196 _mesa_get_format_base_format(dst_image
->TexFormat
)) {
200 /* We can't handle format conversions between Z24 and other formats since
201 * we have to lie about the surface format. See the comments in
202 * brw_blorp_surface_info::set().
204 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
205 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
)) {
209 if (!brw
->format_supported_as_render_target
[dst_image
->TexFormat
])
212 /* Source clipping shouldn't be necessary, since copytexsubimage (in
213 * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
216 * Destination clipping shouldn't be necessary since the restrictions on
217 * glCopyTexSubImage prevent the user from specifying a destination rectangle
218 * that falls outside the bounds of the destination texture.
219 * See error_check_subtexture_dimensions().
222 int srcY1
= srcY0
+ height
;
223 int srcX1
= srcX0
+ width
;
224 int dstX1
= dstX0
+ width
;
225 int dstY1
= dstY0
+ height
;
227 /* Account for the fact that in the system framebuffer, the origin is at
230 bool mirror_y
= false;
231 if (_mesa_is_winsys_fbo(ctx
->ReadBuffer
)) {
232 GLint tmp
= src_rb
->Height
- srcY0
;
233 srcY0
= src_rb
->Height
- srcY1
;
238 /* Account for face selection and texture view MinLayer */
239 int dst_slice
= slice
+ dst_image
->TexObject
->MinLayer
+ dst_image
->Face
;
240 int dst_level
= dst_image
->Level
+ dst_image
->TexObject
->MinLevel
;
242 brw_blorp_blit_miptrees(brw
,
243 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
244 src_rb
->Format
, blorp_get_texture_swizzle(src_irb
),
245 dst_mt
, dst_level
, dst_slice
,
246 dst_image
->TexFormat
,
247 srcX0
, srcY0
, srcX1
, srcY1
,
248 dstX0
, dstY0
, dstX1
, dstY1
,
249 GL_NEAREST
, false, mirror_y
,
252 /* If we're copying to a packed depth stencil texture and the source
253 * framebuffer has separate stencil, we need to also copy the stencil data
256 src_rb
= ctx
->ReadBuffer
->Attachment
[BUFFER_STENCIL
].Renderbuffer
;
257 if (_mesa_get_format_bits(dst_image
->TexFormat
, GL_STENCIL_BITS
) > 0 &&
259 src_irb
= intel_renderbuffer(src_rb
);
260 src_mt
= src_irb
->mt
;
262 if (src_mt
->stencil_mt
)
263 src_mt
= src_mt
->stencil_mt
;
264 if (dst_mt
->stencil_mt
)
265 dst_mt
= dst_mt
->stencil_mt
;
267 if (src_mt
!= dst_mt
) {
268 brw_blorp_blit_miptrees(brw
,
269 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
271 blorp_get_texture_swizzle(src_irb
),
272 dst_mt
, dst_level
, dst_slice
,
274 srcX0
, srcY0
, srcX1
, srcY1
,
275 dstX0
, dstY0
, dstX1
, dstY1
,
276 GL_NEAREST
, false, mirror_y
,
286 brw_blorp_framebuffer(struct brw_context
*brw
,
287 struct gl_framebuffer
*readFb
,
288 struct gl_framebuffer
*drawFb
,
289 GLint srcX0
, GLint srcY0
, GLint srcX1
, GLint srcY1
,
290 GLint dstX0
, GLint dstY0
, GLint dstX1
, GLint dstY1
,
291 GLbitfield mask
, GLenum filter
)
293 /* BLORP is not supported before Gen6. */
297 static GLbitfield buffer_bits
[] = {
300 GL_STENCIL_BUFFER_BIT
,
303 for (unsigned int i
= 0; i
< ARRAY_SIZE(buffer_bits
); ++i
) {
304 if ((mask
& buffer_bits
[i
]) &&
305 try_blorp_blit(brw
, readFb
, drawFb
,
306 srcX0
, srcY0
, srcX1
, srcY1
,
307 dstX0
, dstY0
, dstX1
, dstY1
,
308 filter
, buffer_bits
[i
])) {
309 mask
&= ~buffer_bits
[i
];
318 * Enum to specify the order of arguments in a sampler message
320 enum sampler_message_arg
322 SAMPLER_MESSAGE_ARG_U_FLOAT
,
323 SAMPLER_MESSAGE_ARG_V_FLOAT
,
324 SAMPLER_MESSAGE_ARG_U_INT
,
325 SAMPLER_MESSAGE_ARG_V_INT
,
326 SAMPLER_MESSAGE_ARG_R_INT
,
327 SAMPLER_MESSAGE_ARG_SI_INT
,
328 SAMPLER_MESSAGE_ARG_MCS_INT
,
329 SAMPLER_MESSAGE_ARG_ZERO_INT
,
332 struct brw_blorp_blit_vars
{
333 /* Input values from brw_blorp_wm_inputs */
334 nir_variable
*v_discard_rect
;
335 nir_variable
*v_rect_grid
;
336 nir_variable
*v_coord_transform
;
337 nir_variable
*v_src_z
;
340 nir_variable
*frag_coord
;
343 nir_variable
*color_out
;
347 brw_blorp_blit_vars_init(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
348 const struct brw_blorp_blit_prog_key
*key
)
350 /* Blended and scaled blits never use pixel discard. */
351 assert(!key
->use_kill
|| !(key
->blend
&& key
->blit_scaled
));
353 #define LOAD_INPUT(name, type)\
354 v->v_##name = nir_variable_create(b->shader, nir_var_shader_in, \
356 v->v_##name->data.interpolation = INTERP_QUALIFIER_FLAT; \
357 v->v_##name->data.location = VARYING_SLOT_VAR0 + \
358 offsetof(struct brw_blorp_wm_inputs, name) / (4 * sizeof(float));
360 LOAD_INPUT(discard_rect
, glsl_vec4_type())
361 LOAD_INPUT(rect_grid
, glsl_vec4_type())
362 LOAD_INPUT(coord_transform
, glsl_vec4_type())
363 LOAD_INPUT(src_z
, glsl_uint_type())
367 v
->frag_coord
= nir_variable_create(b
->shader
, nir_var_shader_in
,
368 glsl_vec4_type(), "gl_FragCoord");
369 v
->frag_coord
->data
.location
= VARYING_SLOT_POS
;
370 v
->frag_coord
->data
.origin_upper_left
= true;
372 v
->color_out
= nir_variable_create(b
->shader
, nir_var_shader_out
,
373 glsl_vec4_type(), "gl_FragColor");
374 v
->color_out
->data
.location
= FRAG_RESULT_COLOR
;
378 blorp_blit_get_frag_coords(nir_builder
*b
,
379 const struct brw_blorp_blit_prog_key
*key
,
380 struct brw_blorp_blit_vars
*v
)
382 nir_ssa_def
*coord
= nir_f2i(b
, nir_load_var(b
, v
->frag_coord
));
384 if (key
->persample_msaa_dispatch
) {
385 return nir_vec3(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1),
386 nir_load_system_value(b
, nir_intrinsic_load_sample_id
, 0));
388 return nir_vec2(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1));
393 * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
397 blorp_blit_apply_transform(nir_builder
*b
, nir_ssa_def
*src_pos
,
398 struct brw_blorp_blit_vars
*v
)
400 nir_ssa_def
*coord_transform
= nir_load_var(b
, v
->v_coord_transform
);
402 nir_ssa_def
*offset
= nir_vec2(b
, nir_channel(b
, coord_transform
, 1),
403 nir_channel(b
, coord_transform
, 3));
404 nir_ssa_def
*mul
= nir_vec2(b
, nir_channel(b
, coord_transform
, 0),
405 nir_channel(b
, coord_transform
, 2));
407 return nir_ffma(b
, src_pos
, mul
, offset
);
411 blorp_nir_discard_if_outside_rect(nir_builder
*b
, nir_ssa_def
*pos
,
412 struct brw_blorp_blit_vars
*v
)
414 nir_ssa_def
*c0
, *c1
, *c2
, *c3
;
415 nir_ssa_def
*discard_rect
= nir_load_var(b
, v
->v_discard_rect
);
416 nir_ssa_def
*dst_x0
= nir_channel(b
, discard_rect
, 0);
417 nir_ssa_def
*dst_x1
= nir_channel(b
, discard_rect
, 1);
418 nir_ssa_def
*dst_y0
= nir_channel(b
, discard_rect
, 2);
419 nir_ssa_def
*dst_y1
= nir_channel(b
, discard_rect
, 3);
421 c0
= nir_ult(b
, nir_channel(b
, pos
, 0), dst_x0
);
422 c1
= nir_uge(b
, nir_channel(b
, pos
, 0), dst_x1
);
423 c2
= nir_ult(b
, nir_channel(b
, pos
, 1), dst_y0
);
424 c3
= nir_uge(b
, nir_channel(b
, pos
, 1), dst_y1
);
426 nir_ssa_def
*oob
= nir_ior(b
, nir_ior(b
, c0
, c1
), nir_ior(b
, c2
, c3
));
428 nir_intrinsic_instr
*discard
=
429 nir_intrinsic_instr_create(b
->shader
, nir_intrinsic_discard_if
);
430 discard
->src
[0] = nir_src_for_ssa(oob
);
431 nir_builder_instr_insert(b
, &discard
->instr
);
434 static nir_tex_instr
*
435 blorp_create_nir_tex_instr(nir_shader
*shader
, nir_texop op
,
436 nir_ssa_def
*pos
, unsigned num_srcs
,
437 enum brw_reg_type dst_type
)
439 nir_tex_instr
*tex
= nir_tex_instr_create(shader
, num_srcs
);
444 case BRW_REGISTER_TYPE_F
:
445 tex
->dest_type
= nir_type_float
;
447 case BRW_REGISTER_TYPE_D
:
448 tex
->dest_type
= nir_type_int
;
450 case BRW_REGISTER_TYPE_UD
:
451 tex
->dest_type
= nir_type_uint
;
454 unreachable("Invalid texture return type");
457 tex
->is_array
= false;
458 tex
->is_shadow
= false;
460 /* Blorp only has one texture and it's bound at unit 0 */
463 tex
->texture_index
= 0;
464 tex
->sampler_index
= 0;
466 nir_ssa_dest_init(&tex
->instr
, &tex
->dest
, 4, 32, NULL
);
472 blorp_nir_tex(nir_builder
*b
, nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
475 blorp_create_nir_tex_instr(b
->shader
, nir_texop_tex
, pos
, 2, dst_type
);
477 assert(pos
->num_components
== 2);
478 tex
->sampler_dim
= GLSL_SAMPLER_DIM_2D
;
479 tex
->coord_components
= 2;
480 tex
->src
[0].src_type
= nir_tex_src_coord
;
481 tex
->src
[0].src
= nir_src_for_ssa(pos
);
482 tex
->src
[1].src_type
= nir_tex_src_lod
;
483 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
485 nir_builder_instr_insert(b
, &tex
->instr
);
487 return &tex
->dest
.ssa
;
491 blorp_nir_txf(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
492 nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
495 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf
, pos
, 2, dst_type
);
497 /* In order to properly handle 3-D textures, we pull the Z component from
498 * a uniform. TODO: This is a bit magic; we should probably make this
499 * more explicit in the future.
501 assert(pos
->num_components
== 2);
502 pos
= nir_vec3(b
, nir_channel(b
, pos
, 0), nir_channel(b
, pos
, 1),
503 nir_load_var(b
, v
->v_src_z
));
505 tex
->sampler_dim
= GLSL_SAMPLER_DIM_3D
;
506 tex
->coord_components
= 3;
507 tex
->src
[0].src_type
= nir_tex_src_coord
;
508 tex
->src
[0].src
= nir_src_for_ssa(pos
);
509 tex
->src
[1].src_type
= nir_tex_src_lod
;
510 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
512 nir_builder_instr_insert(b
, &tex
->instr
);
514 return &tex
->dest
.ssa
;
518 blorp_nir_txf_ms(nir_builder
*b
, nir_ssa_def
*pos
, nir_ssa_def
*mcs
,
519 enum brw_reg_type dst_type
)
522 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf_ms
, pos
,
523 mcs
!= NULL
? 3 : 2, dst_type
);
525 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
526 tex
->coord_components
= 2;
527 tex
->src
[0].src_type
= nir_tex_src_coord
;
528 tex
->src
[0].src
= nir_src_for_ssa(pos
);
530 tex
->src
[1].src_type
= nir_tex_src_ms_index
;
531 if (pos
->num_components
== 2) {
532 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
534 assert(pos
->num_components
== 3);
535 tex
->src
[1].src
= nir_src_for_ssa(nir_channel(b
, pos
, 2));
539 tex
->src
[2].src_type
= nir_tex_src_ms_mcs
;
540 tex
->src
[2].src
= nir_src_for_ssa(mcs
);
543 nir_builder_instr_insert(b
, &tex
->instr
);
545 return &tex
->dest
.ssa
;
549 blorp_nir_txf_ms_mcs(nir_builder
*b
, nir_ssa_def
*pos
)
552 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf_ms_mcs
,
553 pos
, 1, BRW_REGISTER_TYPE_D
);
555 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
556 tex
->coord_components
= 2;
557 tex
->src
[0].src_type
= nir_tex_src_coord
;
558 tex
->src
[0].src
= nir_src_for_ssa(pos
);
560 nir_builder_instr_insert(b
, &tex
->instr
);
562 return &tex
->dest
.ssa
;
566 nir_mask_shift_or(struct nir_builder
*b
, nir_ssa_def
*dst
, nir_ssa_def
*src
,
567 uint32_t src_mask
, int src_left_shift
)
569 nir_ssa_def
*masked
= nir_iand(b
, src
, nir_imm_int(b
, src_mask
));
571 nir_ssa_def
*shifted
;
572 if (src_left_shift
> 0) {
573 shifted
= nir_ishl(b
, masked
, nir_imm_int(b
, src_left_shift
));
574 } else if (src_left_shift
< 0) {
575 shifted
= nir_ushr(b
, masked
, nir_imm_int(b
, -src_left_shift
));
577 assert(src_left_shift
== 0);
581 return nir_ior(b
, dst
, shifted
);
585 * Emit code to compensate for the difference between Y and W tiling.
587 * This code modifies the X and Y coordinates according to the formula:
589 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
591 * (See brw_blorp_build_nir_shader).
593 static inline nir_ssa_def
*
594 blorp_nir_retile_y_to_w(nir_builder
*b
, nir_ssa_def
*pos
)
596 assert(pos
->num_components
== 2);
597 nir_ssa_def
*x_Y
= nir_channel(b
, pos
, 0);
598 nir_ssa_def
*y_Y
= nir_channel(b
, pos
, 1);
600 /* Given X and Y coordinates that describe an address using Y tiling,
601 * translate to the X and Y coordinates that describe the same address
604 * If we break down the low order bits of X and Y, using a
605 * single letter to represent each low-order bit:
607 * X = A << 7 | 0bBCDEFGH
608 * Y = J << 5 | 0bKLMNP (1)
610 * Then we can apply the Y tiling formula to see the memory offset being
613 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)
615 * If we apply the W detiling formula to this memory location, that the
616 * corresponding X' and Y' coordinates are:
618 * X' = A << 6 | 0bBCDPFH (3)
619 * Y' = J << 6 | 0bKLMNEG
621 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
622 * we need to make the following computation:
624 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)
625 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
627 nir_ssa_def
*x_W
= nir_imm_int(b
, 0);
628 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0xfffffff4, -1);
629 x_W
= nir_mask_shift_or(b
, x_W
, y_Y
, 0x1, 2);
630 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0x1, 0);
632 nir_ssa_def
*y_W
= nir_imm_int(b
, 0);
633 y_W
= nir_mask_shift_or(b
, y_W
, y_Y
, 0xfffffffe, 1);
634 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x8, -2);
635 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x2, -1);
637 return nir_vec2(b
, x_W
, y_W
);
641 * Emit code to compensate for the difference between Y and W tiling.
643 * This code modifies the X and Y coordinates according to the formula:
645 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
647 * (See brw_blorp_build_nir_shader).
649 static inline nir_ssa_def
*
650 blorp_nir_retile_w_to_y(nir_builder
*b
, nir_ssa_def
*pos
)
652 assert(pos
->num_components
== 2);
653 nir_ssa_def
*x_W
= nir_channel(b
, pos
, 0);
654 nir_ssa_def
*y_W
= nir_channel(b
, pos
, 1);
656 /* Applying the same logic as above, but in reverse, we obtain the
659 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
660 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
662 nir_ssa_def
*x_Y
= nir_imm_int(b
, 0);
663 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0xfffffffa, 1);
664 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x2, 2);
665 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x1, 1);
666 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0x1, 0);
668 nir_ssa_def
*y_Y
= nir_imm_int(b
, 0);
669 y_Y
= nir_mask_shift_or(b
, y_Y
, y_W
, 0xfffffffc, -1);
670 y_Y
= nir_mask_shift_or(b
, y_Y
, x_W
, 0x4, -2);
672 return nir_vec2(b
, x_Y
, y_Y
);
676 * Emit code to compensate for the difference between MSAA and non-MSAA
679 * This code modifies the X and Y coordinates according to the formula:
681 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
683 * (See brw_blorp_blit_program).
685 static inline nir_ssa_def
*
686 blorp_nir_encode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
687 unsigned num_samples
, enum intel_msaa_layout layout
)
689 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
692 case INTEL_MSAA_LAYOUT_NONE
:
693 assert(pos
->num_components
== 2);
695 case INTEL_MSAA_LAYOUT_CMS
:
696 /* We can't compensate for compressed layout since at this point in the
697 * program we haven't read from the MCS buffer.
699 unreachable("Bad layout in encode_msaa");
700 case INTEL_MSAA_LAYOUT_UMS
:
701 /* No translation needed */
703 case INTEL_MSAA_LAYOUT_IMS
: {
704 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
705 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
706 nir_ssa_def
*s_in
= pos
->num_components
== 2 ? nir_imm_int(b
, 0) :
707 nir_channel(b
, pos
, 2);
709 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
710 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
711 switch (num_samples
) {
714 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
715 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
718 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
719 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
720 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
722 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 1);
723 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
724 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
725 if (num_samples
== 2) {
728 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
729 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
730 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
735 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
736 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
738 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
740 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
741 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
742 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
743 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
744 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
745 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
746 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
750 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
751 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
753 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10)
756 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
757 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
758 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
759 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
760 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 2);
761 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x8, -1);
762 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
763 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
767 unreachable("Invalid number of samples for IMS layout");
770 return nir_vec2(b
, x_out
, y_out
);
774 unreachable("Invalid MSAA layout");
779 * Emit code to compensate for the difference between MSAA and non-MSAA
782 * This code modifies the X and Y coordinates according to the formula:
784 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
786 * (See brw_blorp_blit_program).
788 static inline nir_ssa_def
*
789 blorp_nir_decode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
790 unsigned num_samples
, enum intel_msaa_layout layout
)
792 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
795 case INTEL_MSAA_LAYOUT_NONE
:
796 /* No translation necessary, and S should already be zero. */
797 assert(pos
->num_components
== 2);
799 case INTEL_MSAA_LAYOUT_CMS
:
800 /* We can't compensate for compressed layout since at this point in the
801 * program we don't have access to the MCS buffer.
803 unreachable("Bad layout in encode_msaa");
804 case INTEL_MSAA_LAYOUT_UMS
:
805 /* No translation necessary. */
807 case INTEL_MSAA_LAYOUT_IMS
: {
808 assert(pos
->num_components
== 2);
810 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
811 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
813 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
814 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
815 nir_ssa_def
*s_out
= nir_imm_int(b
, 0);
816 switch (num_samples
) {
819 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
820 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
821 * S = (X & 0b10) >> 1
823 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
824 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
825 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
826 * S = (Y & 0b10) | (X & 0b10) >> 1
828 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffc, -1);
829 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
830 if (num_samples
== 2) {
832 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
834 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
835 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
836 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
837 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
842 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
843 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
844 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
845 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
847 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
848 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
849 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
850 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
851 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
852 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
853 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
857 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
858 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
859 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
860 * S = (Y & 0b100) << 1 | (X & 0b100) |
861 * (Y & 0b10) | (X & 0b10) >> 1
863 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
864 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
865 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffff8, -2);
866 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
867 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x4, 1);
868 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
869 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
870 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
874 unreachable("Invalid number of samples for IMS layout");
877 return nir_vec3(b
, x_out
, y_out
, s_out
);
881 unreachable("Invalid MSAA layout");
886 * Count the number of trailing 1 bits in the given value. For example:
888 * count_trailing_one_bits(0) == 0
889 * count_trailing_one_bits(7) == 3
890 * count_trailing_one_bits(11) == 2
892 static inline int count_trailing_one_bits(unsigned value
)
894 #ifdef HAVE___BUILTIN_CTZ
895 return __builtin_ctz(~value
);
897 return _mesa_bitcount(value
& ~(value
+ 1));
902 blorp_nir_manual_blend_average(nir_builder
*b
, nir_ssa_def
*pos
,
903 unsigned tex_samples
,
904 enum intel_msaa_layout tex_layout
,
905 enum brw_reg_type dst_type
)
907 /* If non-null, this is the outer-most if statement */
908 nir_if
*outer_if
= NULL
;
910 nir_variable
*color
=
911 nir_local_variable_create(b
->impl
, glsl_vec4_type(), "color");
913 nir_ssa_def
*mcs
= NULL
;
914 if (tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
915 mcs
= blorp_nir_txf_ms_mcs(b
, pos
);
917 /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
919 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
921 * This ensures that when all samples have the same value, no numerical
922 * precision is lost, since each addition operation always adds two equal
923 * values, and summing two equal floating point values does not lose
926 * We perform this computation by treating the texture_data array as a
927 * stack and performing the following operations:
929 * - push sample 0 onto stack
930 * - push sample 1 onto stack
931 * - add top two stack entries
932 * - push sample 2 onto stack
933 * - push sample 3 onto stack
934 * - add top two stack entries
935 * - add top two stack entries
936 * - divide top stack entry by 4
938 * Note that after pushing sample i onto the stack, the number of add
939 * operations we do is equal to the number of trailing 1 bits in i. This
940 * works provided the total number of samples is a power of two, which it
941 * always is for i965.
943 * For integer formats, we replace the add operations with average
944 * operations and skip the final division.
946 nir_ssa_def
*texture_data
[5];
947 unsigned stack_depth
= 0;
948 for (unsigned i
= 0; i
< tex_samples
; ++i
) {
949 assert(stack_depth
== _mesa_bitcount(i
)); /* Loop invariant */
951 /* Push sample i onto the stack */
952 assert(stack_depth
< ARRAY_SIZE(texture_data
));
954 nir_ssa_def
*ms_pos
= nir_vec3(b
, nir_channel(b
, pos
, 0),
955 nir_channel(b
, pos
, 1),
957 texture_data
[stack_depth
++] = blorp_nir_txf_ms(b
, ms_pos
, mcs
, dst_type
);
959 if (i
== 0 && tex_layout
== INTEL_MSAA_LAYOUT_CMS
) {
960 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
961 * suggests an optimization:
963 * "A simple optimization with probable large return in
964 * performance is to compare the MCS value to zero (indicating
965 * all samples are on sample slice 0), and sample only from
966 * sample slice 0 using ld2dss if MCS is zero."
968 * Note that in the case where the MCS value is zero, sampling from
969 * sample slice 0 using ld2dss and sampling from sample 0 using
970 * ld2dms are equivalent (since all samples are on sample slice 0).
971 * Since we have already sampled from sample 0, all we need to do is
972 * skip the remaining fetches and averaging if MCS is zero.
974 nir_ssa_def
*mcs_zero
=
975 nir_ieq(b
, nir_channel(b
, mcs
, 0), nir_imm_int(b
, 0));
976 if (tex_samples
== 16) {
977 mcs_zero
= nir_iand(b
, mcs_zero
,
978 nir_ieq(b
, nir_channel(b
, mcs
, 1), nir_imm_int(b
, 0)));
981 nir_if
*if_stmt
= nir_if_create(b
->shader
);
982 if_stmt
->condition
= nir_src_for_ssa(mcs_zero
);
983 nir_cf_node_insert(b
->cursor
, &if_stmt
->cf_node
);
985 b
->cursor
= nir_after_cf_list(&if_stmt
->then_list
);
986 nir_store_var(b
, color
, texture_data
[0], 0xf);
988 b
->cursor
= nir_after_cf_list(&if_stmt
->else_list
);
992 for (int j
= 0; j
< count_trailing_one_bits(i
); j
++) {
993 assert(stack_depth
>= 2);
996 assert(dst_type
== BRW_REGISTER_TYPE_F
);
997 texture_data
[stack_depth
- 1] =
998 nir_fadd(b
, texture_data
[stack_depth
- 1],
999 texture_data
[stack_depth
]);
1003 /* We should have just 1 sample on the stack now. */
1004 assert(stack_depth
== 1);
1006 texture_data
[0] = nir_fmul(b
, texture_data
[0],
1007 nir_imm_float(b
, 1.0 / tex_samples
));
1009 nir_store_var(b
, color
, texture_data
[0], 0xf);
1012 b
->cursor
= nir_after_cf_node(&outer_if
->cf_node
);
1014 return nir_load_var(b
, color
);
1017 static inline nir_ssa_def
*
1018 nir_imm_vec2(nir_builder
*build
, float x
, float y
)
1022 memset(&v
, 0, sizeof(v
));
1026 return nir_build_imm(build
, 4, 32, v
);
1029 static nir_ssa_def
*
1030 blorp_nir_manual_blend_bilinear(nir_builder
*b
, nir_ssa_def
*pos
,
1031 unsigned tex_samples
,
1032 const brw_blorp_blit_prog_key
*key
,
1033 struct brw_blorp_blit_vars
*v
)
1035 nir_ssa_def
*pos_xy
= nir_channels(b
, pos
, 0x3);
1036 nir_ssa_def
*rect_grid
= nir_load_var(b
, v
->v_rect_grid
);
1037 nir_ssa_def
*scale
= nir_imm_vec2(b
, key
->x_scale
, key
->y_scale
);
1039 /* Translate coordinates to lay out the samples in a rectangular grid
1040 * roughly corresponding to sample locations.
1042 pos_xy
= nir_fmul(b
, pos_xy
, scale
);
1043 /* Adjust coordinates so that integers represent pixel centers rather
1046 pos_xy
= nir_fadd(b
, pos_xy
, nir_imm_float(b
, -0.5));
1047 /* Clamp the X, Y texture coordinates to properly handle the sampling of
1048 * texels on texture edges.
1050 pos_xy
= nir_fmin(b
, nir_fmax(b
, pos_xy
, nir_imm_float(b
, 0.0)),
1051 nir_vec2(b
, nir_channel(b
, rect_grid
, 0),
1052 nir_channel(b
, rect_grid
, 1)));
1054 /* Store the fractional parts to be used as bilinear interpolation
1057 nir_ssa_def
*frac_xy
= nir_ffract(b
, pos_xy
);
1058 /* Round the float coordinates down to nearest integer */
1059 pos_xy
= nir_fdiv(b
, nir_ftrunc(b
, pos_xy
), scale
);
1061 nir_ssa_def
*tex_data
[4];
1062 for (unsigned i
= 0; i
< 4; ++i
) {
1063 float sample_off_x
= (float)(i
& 0x1) / key
->x_scale
;
1064 float sample_off_y
= (float)((i
>> 1) & 0x1) / key
->y_scale
;
1065 nir_ssa_def
*sample_off
= nir_imm_vec2(b
, sample_off_x
, sample_off_y
);
1067 nir_ssa_def
*sample_coords
= nir_fadd(b
, pos_xy
, sample_off
);
1068 nir_ssa_def
*sample_coords_int
= nir_f2i(b
, sample_coords
);
1070 /* The MCS value we fetch has to match up with the pixel that we're
1071 * sampling from. Since we sample from different pixels in each
1072 * iteration of this "for" loop, the call to mcs_fetch() should be
1073 * here inside the loop after computing the pixel coordinates.
1075 nir_ssa_def
*mcs
= NULL
;
1076 if (key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
1077 mcs
= blorp_nir_txf_ms_mcs(b
, sample_coords_int
);
1079 /* Compute sample index and map the sample index to a sample number.
1080 * Sample index layout shows the numbering of slots in a rectangular
1081 * grid of samples with in a pixel. Sample number layout shows the
1082 * rectangular grid of samples roughly corresponding to the real sample
1083 * locations with in a pixel.
1084 * In case of 4x MSAA, layout of sample indices matches the layout of
1092 * In case of 8x MSAA the two layouts don't match.
1093 * sample index layout : --------- sample number layout : ---------
1094 * | 0 | 1 | | 5 | 2 |
1095 * --------- ---------
1096 * | 2 | 3 | | 4 | 6 |
1097 * --------- ---------
1098 * | 4 | 5 | | 0 | 3 |
1099 * --------- ---------
1100 * | 6 | 7 | | 7 | 1 |
1101 * --------- ---------
1103 * Fortunately, this can be done fairly easily as:
1104 * S' = (0x17306425 >> (S * 4)) & 0xf
1106 * In the case of 16x MSAA the two layouts don't match.
1107 * Sample index layout: Sample number layout:
1108 * --------------------- ---------------------
1109 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 |
1110 * --------------------- ---------------------
1111 * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 |
1112 * --------------------- ---------------------
1113 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 |
1114 * --------------------- ---------------------
1115 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 |
1116 * --------------------- ---------------------
1118 * This is equivalent to
1119 * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf
1121 nir_ssa_def
*frac
= nir_ffract(b
, sample_coords
);
1122 nir_ssa_def
*sample
=
1123 nir_fdot2(b
, frac
, nir_imm_vec2(b
, key
->x_scale
,
1124 key
->x_scale
* key
->y_scale
));
1125 sample
= nir_f2i(b
, sample
);
1127 if (tex_samples
== 8) {
1128 sample
= nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0x17306425),
1129 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1130 nir_imm_int(b
, 0xf));
1131 } else if (tex_samples
== 16) {
1132 nir_ssa_def
*sample_low
=
1133 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xd31479af),
1134 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1135 nir_imm_int(b
, 0xf));
1136 nir_ssa_def
*sample_high
=
1137 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xe58b602c),
1138 nir_ishl(b
, nir_iadd(b
, sample
,
1139 nir_imm_int(b
, -8)),
1140 nir_imm_int(b
, 2))),
1141 nir_imm_int(b
, 0xf));
1143 sample
= nir_bcsel(b
, nir_ilt(b
, sample
, nir_imm_int(b
, 8)),
1144 sample_low
, sample_high
);
1146 nir_ssa_def
*pos_ms
= nir_vec3(b
, nir_channel(b
, sample_coords_int
, 0),
1147 nir_channel(b
, sample_coords_int
, 1),
1149 tex_data
[i
] = blorp_nir_txf_ms(b
, pos_ms
, mcs
, key
->texture_data_type
);
1152 nir_ssa_def
*frac_x
= nir_channel(b
, frac_xy
, 0);
1153 nir_ssa_def
*frac_y
= nir_channel(b
, frac_xy
, 1);
1154 return nir_flrp(b
, nir_flrp(b
, tex_data
[0], tex_data
[1], frac_x
),
1155 nir_flrp(b
, tex_data
[2], tex_data
[3], frac_x
),
1160 * Generator for WM programs used in BLORP blits.
1162 * The bulk of the work done by the WM program is to wrap and unwrap the
1163 * coordinate transformations used by the hardware to store surfaces in
1164 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
1165 * sample index for a multisampled surface) to a memory offset by the
1166 * following formulas:
1168 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
1169 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
1171 * For a single-sampled surface, or for a multisampled surface using
1172 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
1175 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1176 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1177 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1178 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1180 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1181 * embeds the sample number into bit 1 of the X and Y coordinates:
1183 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1184 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1185 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
1186 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1187 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
1188 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1189 * S = (Y & 0b10) | (X & 0b10) >> 1
1191 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1192 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
1195 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1196 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
1197 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1198 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1199 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
1200 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1201 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1203 * For X tiling, tile() combines together the low-order bits of the X and Y
1204 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
1205 * bytes wide and 8 rows high:
1207 * tile(x_tiled, X, Y, S) = A
1208 * where A = tile_num << 12 | offset
1209 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
1210 * offset = (Y' & 0b111) << 9
1211 * | (X & 0b111111111)
1213 * Y' = Y + S * qpitch
1214 * detile(x_tiled, A) = (X, Y, S)
1215 * where X = X' / cpp
1218 * Y' = (tile_num / tile_pitch) << 3
1219 * | (A & 0b111000000000) >> 9
1220 * X' = (tile_num % tile_pitch) << 9
1221 * | (A & 0b111111111)
1223 * (In all tiling formulas, cpp is the number of bytes occupied by a single
1224 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
1225 * to fill the width of the surface, and qpitch is the spacing (in rows)
1226 * between array slices).
1228 * For Y tiling, tile() combines together the low-order bits of the X and Y
1229 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
1230 * bytes wide and 32 rows high:
1232 * tile(y_tiled, X, Y, S) = A
1233 * where A = tile_num << 12 | offset
1234 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
1235 * offset = (X' & 0b1110000) << 5
1236 * | (Y' & 0b11111) << 4
1239 * Y' = Y + S * qpitch
1240 * detile(y_tiled, A) = (X, Y, S)
1241 * where X = X' / cpp
1244 * Y' = (tile_num / tile_pitch) << 5
1245 * | (A & 0b111110000) >> 4
1246 * X' = (tile_num % tile_pitch) << 7
1247 * | (A & 0b111000000000) >> 5
1250 * For W tiling, tile() combines together the low-order bits of the X and Y
1251 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
1252 * bytes wide and 64 rows high (note that W tiling is only used for stencil
1253 * buffers, which always have cpp = 1 and S=0):
1255 * tile(w_tiled, X, Y, S) = A
1256 * where A = tile_num << 12 | offset
1257 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
1258 * offset = (X' & 0b111000) << 6
1259 * | (Y' & 0b111100) << 3
1260 * | (X' & 0b100) << 2
1261 * | (Y' & 0b10) << 2
1262 * | (X' & 0b10) << 1
1266 * Y' = Y + S * qpitch
1267 * detile(w_tiled, A) = (X, Y, S)
1268 * where X = X' / cpp = X'
1269 * Y = Y' % qpitch = Y'
1270 * S = Y / qpitch = 0
1271 * Y' = (tile_num / tile_pitch) << 6
1272 * | (A & 0b111100000) >> 3
1273 * | (A & 0b1000) >> 2
1275 * X' = (tile_num % tile_pitch) << 6
1276 * | (A & 0b111000000000) >> 6
1277 * | (A & 0b10000) >> 2
1278 * | (A & 0b100) >> 1
1281 * Finally, for a non-tiled surface, tile() simply combines together the X and
1282 * Y coordinates in the natural way:
1284 * tile(untiled, X, Y, S) = A
1285 * where A = Y * pitch + X'
1287 * Y' = Y + S * qpitch
1288 * detile(untiled, A) = (X, Y, S)
1289 * where X = X' / cpp
1295 * (In these formulas, pitch is the number of bytes occupied by a single row
1299 brw_blorp_build_nir_shader(struct brw_context
*brw
,
1300 const brw_blorp_blit_prog_key
*key
)
1302 nir_ssa_def
*src_pos
, *dst_pos
, *color
;
1305 if (key
->dst_tiled_w
&& key
->rt_samples
> 0) {
1306 /* If the destination image is W tiled and multisampled, then the thread
1307 * must be dispatched once per sample, not once per pixel. This is
1308 * necessary because after conversion between W and Y tiling, there's no
1309 * guarantee that all samples corresponding to a single pixel will still
1312 assert(key
->persample_msaa_dispatch
);
1316 /* We are blending, which means we won't have an opportunity to
1317 * translate the tiling and sample count for the texture surface. So
1318 * the surface state for the texture must be configured with the correct
1319 * tiling and sample count.
1321 assert(!key
->src_tiled_w
);
1322 assert(key
->tex_samples
== key
->src_samples
);
1323 assert(key
->tex_layout
== key
->src_layout
);
1324 assert(key
->tex_samples
> 0);
1327 if (key
->persample_msaa_dispatch
) {
1328 /* It only makes sense to do persample dispatch if the render target is
1329 * configured as multisampled.
1331 assert(key
->rt_samples
> 0);
1334 /* Make sure layout is consistent with sample count */
1335 assert((key
->tex_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1336 (key
->tex_samples
== 0));
1337 assert((key
->rt_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1338 (key
->rt_samples
== 0));
1339 assert((key
->src_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1340 (key
->src_samples
== 0));
1341 assert((key
->dst_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1342 (key
->dst_samples
== 0));
1345 nir_builder_init_simple_shader(&b
, NULL
, MESA_SHADER_FRAGMENT
, NULL
);
1347 struct brw_blorp_blit_vars v
;
1348 brw_blorp_blit_vars_init(&b
, &v
, key
);
1350 dst_pos
= blorp_blit_get_frag_coords(&b
, key
, &v
);
1352 /* Render target and texture hardware don't support W tiling until Gen8. */
1353 const bool rt_tiled_w
= false;
1354 const bool tex_tiled_w
= brw
->gen
>= 8 && key
->src_tiled_w
;
1356 /* The address that data will be written to is determined by the
1357 * coordinates supplied to the WM thread and the tiling and sample count of
1358 * the render target, according to the formula:
1360 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
1362 * If the actual tiling and sample count of the destination surface are not
1363 * the same as the configuration of the render target, then these
1364 * coordinates are wrong and we have to adjust them to compensate for the
1367 if (rt_tiled_w
!= key
->dst_tiled_w
||
1368 key
->rt_samples
!= key
->dst_samples
||
1369 key
->rt_layout
!= key
->dst_layout
) {
1370 dst_pos
= blorp_nir_encode_msaa(&b
, dst_pos
, key
->rt_samples
,
1372 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1373 if (rt_tiled_w
!= key
->dst_tiled_w
)
1374 dst_pos
= blorp_nir_retile_y_to_w(&b
, dst_pos
);
1375 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1376 dst_pos
= blorp_nir_decode_msaa(&b
, dst_pos
, key
->dst_samples
,
1380 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
1382 * That is: X, Y and S now contain the true coordinates and sample index of
1383 * the data that the WM thread should output.
1385 * If we need to kill pixels that are outside the destination rectangle,
1386 * now is the time to do it.
1388 if (key
->use_kill
) {
1389 assert(!(key
->blend
&& key
->blit_scaled
));
1390 blorp_nir_discard_if_outside_rect(&b
, dst_pos
, &v
);
1393 src_pos
= blorp_blit_apply_transform(&b
, nir_i2f(&b
, dst_pos
), &v
);
1394 if (dst_pos
->num_components
== 3) {
1395 /* The sample coordinate is an integer that we want left alone but
1396 * blorp_blit_apply_transform() blindly applies the transform to all
1397 * three coordinates. Grab the original sample index.
1399 src_pos
= nir_vec3(&b
, nir_channel(&b
, src_pos
, 0),
1400 nir_channel(&b
, src_pos
, 1),
1401 nir_channel(&b
, dst_pos
, 2));
1404 /* If the source image is not multisampled, then we want to fetch sample
1405 * number 0, because that's the only sample there is.
1407 if (key
->src_samples
== 0)
1408 src_pos
= nir_channels(&b
, src_pos
, 0x3);
1410 /* X, Y, and S are now the coordinates of the pixel in the source image
1411 * that we want to texture from. Exception: if we are blending, then S is
1412 * irrelevant, because we are going to fetch all samples.
1414 if (key
->blend
&& !key
->blit_scaled
) {
1415 /* Resolves (effecively) use texelFetch, so we need integers and we
1416 * don't care about the sample index if we got one.
1418 src_pos
= nir_f2i(&b
, nir_channels(&b
, src_pos
, 0x3));
1420 if (brw
->gen
== 6) {
1421 /* Because gen6 only supports 4x interleved MSAA, we can do all the
1422 * blending we need with a single linear-interpolated texture lookup
1423 * at the center of the sample. The texture coordinates to be odd
1424 * integers so that they correspond to the center of a 2x2 block
1425 * representing the four samples that maxe up a pixel. So we need
1426 * to multiply our X and Y coordinates each by 2 and then add 1.
1428 src_pos
= nir_ishl(&b
, src_pos
, nir_imm_int(&b
, 1));
1429 src_pos
= nir_iadd(&b
, src_pos
, nir_imm_int(&b
, 1));
1430 src_pos
= nir_i2f(&b
, src_pos
);
1431 color
= blorp_nir_tex(&b
, src_pos
, key
->texture_data_type
);
1433 /* Gen7+ hardware doesn't automaticaly blend. */
1434 color
= blorp_nir_manual_blend_average(&b
, src_pos
, key
->src_samples
,
1436 key
->texture_data_type
);
1438 } else if (key
->blend
&& key
->blit_scaled
) {
1439 assert(!key
->use_kill
);
1440 color
= blorp_nir_manual_blend_bilinear(&b
, src_pos
, key
->src_samples
, key
, &v
);
1442 if (key
->bilinear_filter
) {
1443 color
= blorp_nir_tex(&b
, src_pos
, key
->texture_data_type
);
1445 /* We're going to use texelFetch, so we need integers */
1446 if (src_pos
->num_components
== 2) {
1447 src_pos
= nir_f2i(&b
, src_pos
);
1449 assert(src_pos
->num_components
== 3);
1450 src_pos
= nir_vec3(&b
, nir_channel(&b
, nir_f2i(&b
, src_pos
), 0),
1451 nir_channel(&b
, nir_f2i(&b
, src_pos
), 1),
1452 nir_channel(&b
, src_pos
, 2));
1455 /* We aren't blending, which means we just want to fetch a single
1456 * sample from the source surface. The address that we want to fetch
1457 * from is related to the X, Y and S values according to the formula:
1459 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
1461 * If the actual tiling and sample count of the source surface are
1462 * not the same as the configuration of the texture, then we need to
1463 * adjust the coordinates to compensate for the difference.
1465 if (tex_tiled_w
!= key
->src_tiled_w
||
1466 key
->tex_samples
!= key
->src_samples
||
1467 key
->tex_layout
!= key
->src_layout
) {
1468 src_pos
= blorp_nir_encode_msaa(&b
, src_pos
, key
->src_samples
,
1470 /* Now (X, Y, S) = detile(src_tiling, offset) */
1471 if (tex_tiled_w
!= key
->src_tiled_w
)
1472 src_pos
= blorp_nir_retile_w_to_y(&b
, src_pos
);
1473 /* Now (X, Y, S) = detile(tex_tiling, offset) */
1474 src_pos
= blorp_nir_decode_msaa(&b
, src_pos
, key
->tex_samples
,
1478 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
1480 * In other words: X, Y, and S now contain values which, when passed to
1481 * the texturing unit, will cause data to be read from the correct
1482 * memory location. So we can fetch the texel now.
1484 if (key
->src_samples
== 0) {
1485 color
= blorp_nir_txf(&b
, &v
, src_pos
, key
->texture_data_type
);
1487 nir_ssa_def
*mcs
= NULL
;
1488 if (key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
1489 mcs
= blorp_nir_txf_ms_mcs(&b
, src_pos
);
1491 color
= blorp_nir_txf_ms(&b
, src_pos
, mcs
, key
->texture_data_type
);
1496 nir_store_var(&b
, v
.color_out
, color
, 0xf);
1502 brw_blorp_get_blit_kernel(struct brw_context
*brw
,
1503 struct brw_blorp_params
*params
,
1504 const struct brw_blorp_blit_prog_key
*prog_key
)
1506 if (brw_search_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1507 prog_key
, sizeof(*prog_key
),
1508 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
))
1511 const unsigned *program
;
1512 unsigned program_size
;
1513 struct brw_blorp_prog_data prog_data
;
1515 /* Try and compile with NIR first. If that fails, fall back to the old
1516 * method of building shaders manually.
1518 nir_shader
*nir
= brw_blorp_build_nir_shader(brw
, prog_key
);
1519 struct brw_wm_prog_key wm_key
;
1520 brw_blorp_init_wm_prog_key(&wm_key
);
1521 wm_key
.tex
.compressed_multisample_layout_mask
=
1522 prog_key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
;
1523 wm_key
.tex
.msaa_16
= prog_key
->tex_samples
== 16;
1524 wm_key
.multisample_fbo
= prog_key
->rt_samples
> 1;
1526 program
= brw_blorp_compile_nir_shader(brw
, nir
, &wm_key
, false,
1527 &prog_data
, &program_size
);
1529 brw_upload_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1530 prog_key
, sizeof(*prog_key
),
1531 program
, program_size
,
1532 &prog_data
, sizeof(prog_data
),
1533 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
);
1537 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform
*xform
,
1538 GLfloat src0
, GLfloat src1
,
1539 GLfloat dst0
, GLfloat dst1
,
1542 float scale
= (src1
- src0
) / (dst1
- dst0
);
1544 /* When not mirroring a coordinate (say, X), we need:
1545 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1547 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1549 * blorp program uses "round toward zero" to convert the
1550 * transformed floating point coordinates to integer coordinates,
1551 * whereas the behaviour we actually want is "round to nearest",
1552 * so 0.5 provides the necessary correction.
1554 xform
->multiplier
= scale
;
1555 xform
->offset
= src0
+ (-dst0
+ 0.5f
) * scale
;
1557 /* When mirroring X we need:
1558 * src_x - src_x0 = dst_x1 - dst_x - 0.5
1560 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
1562 xform
->multiplier
= -scale
;
1563 xform
->offset
= src0
+ (dst1
- 0.5f
) * scale
;
1569 * Determine which MSAA layout the GPU pipeline should be configured for,
1570 * based on the chip generation, the number of samples, and the true layout of
1571 * the image in memory.
1573 inline intel_msaa_layout
1574 compute_msaa_layout_for_pipeline(struct brw_context
*brw
, unsigned num_samples
,
1575 intel_msaa_layout true_layout
)
1577 if (num_samples
<= 1) {
1578 /* Layout is used to determine if ld2dms is needed for sampling. In
1579 * single sampled case normal ld is enough avoiding also the need to
1580 * fetch mcs. Therefore simply set the layout to none.
1582 if (brw
->gen
>= 9 && true_layout
== INTEL_MSAA_LAYOUT_CMS
) {
1583 return INTEL_MSAA_LAYOUT_NONE
;
1586 /* When configuring the GPU for non-MSAA, we can still accommodate IMS
1587 * format buffers, by transforming coordinates appropriately.
1589 assert(true_layout
== INTEL_MSAA_LAYOUT_NONE
||
1590 true_layout
== INTEL_MSAA_LAYOUT_IMS
);
1591 return INTEL_MSAA_LAYOUT_NONE
;
1593 assert(true_layout
!= INTEL_MSAA_LAYOUT_NONE
);
1596 /* Prior to Gen7, all MSAA surfaces use IMS layout. */
1597 if (brw
->gen
== 6) {
1598 assert(true_layout
== INTEL_MSAA_LAYOUT_IMS
);
1606 * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
1607 * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
1608 * the physical layer holding sample 0. So, for example, if
1609 * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
1613 brw_blorp_blit_miptrees(struct brw_context
*brw
,
1614 struct intel_mipmap_tree
*src_mt
,
1615 unsigned src_level
, unsigned src_layer
,
1616 mesa_format src_format
, int src_swizzle
,
1617 struct intel_mipmap_tree
*dst_mt
,
1618 unsigned dst_level
, unsigned dst_layer
,
1619 mesa_format dst_format
,
1620 float src_x0
, float src_y0
,
1621 float src_x1
, float src_y1
,
1622 float dst_x0
, float dst_y0
,
1623 float dst_x1
, float dst_y1
,
1624 GLenum filter
, bool mirror_x
, bool mirror_y
,
1625 bool decode_srgb
, bool encode_srgb
)
1627 /* Get ready to blit. This includes depth resolving the src and dst
1628 * buffers if necessary. Note: it's not necessary to do a color resolve on
1629 * the destination buffer because we use the standard render path to render
1630 * to destination color buffers, and the standard render path is
1633 intel_miptree_resolve_color(brw
, src_mt
, INTEL_MIPTREE_IGNORE_CCS_E
);
1634 intel_miptree_slice_resolve_depth(brw
, src_mt
, src_level
, src_layer
);
1635 intel_miptree_slice_resolve_depth(brw
, dst_mt
, dst_level
, dst_layer
);
1637 intel_miptree_prepare_mcs(brw
, dst_mt
);
1639 DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
1640 "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
1642 src_mt
->num_samples
, _mesa_get_format_name(src_mt
->format
), src_mt
,
1643 src_level
, src_layer
, src_x0
, src_y0
, src_x1
, src_y1
,
1644 dst_mt
->num_samples
, _mesa_get_format_name(dst_mt
->format
), dst_mt
,
1645 dst_level
, dst_layer
, dst_x0
, dst_y0
, dst_x1
, dst_y1
,
1646 mirror_x
, mirror_y
);
1648 if (!decode_srgb
&& _mesa_get_format_color_encoding(src_format
) == GL_SRGB
)
1649 src_format
= _mesa_get_srgb_format_linear(src_format
);
1651 if (!encode_srgb
&& _mesa_get_format_color_encoding(dst_format
) == GL_SRGB
)
1652 dst_format
= _mesa_get_srgb_format_linear(dst_format
);
1654 struct brw_blorp_params params
;
1655 brw_blorp_params_init(¶ms
);
1657 brw_blorp_surface_info_init(brw
, ¶ms
.src
, src_mt
, src_level
,
1658 src_layer
, src_format
, false);
1659 brw_blorp_surface_info_init(brw
, ¶ms
.dst
, dst_mt
, dst_level
,
1660 dst_layer
, dst_format
, true);
1662 /* Even though we do multisample resolves at the time of the blit, OpenGL
1663 * specification defines them as if they happen at the time of rendering,
1664 * which means that the type of averaging we do during the resolve should
1665 * only depend on the source format; the destination format should be
1666 * ignored. But, specification doesn't seem to be strict about it.
1668 * It has been observed that mulitisample resolves produce slightly better
1669 * looking images when averaging is done using destination format. NVIDIA's
1670 * proprietary OpenGL driver also follow this approach. So, we choose to
1671 * follow it in our driver.
1673 * When multisampling, if the source and destination formats are equal
1674 * (aside from the color space), we choose to blit in sRGB space to get
1675 * this higher quality image.
1677 if (params
.src
.num_samples
> 1 &&
1678 _mesa_get_format_color_encoding(dst_mt
->format
) == GL_SRGB
&&
1679 _mesa_get_srgb_format_linear(src_mt
->format
) ==
1680 _mesa_get_srgb_format_linear(dst_mt
->format
)) {
1681 assert(brw
->format_supported_as_render_target
[dst_mt
->format
]);
1682 params
.dst
.brw_surfaceformat
= brw
->render_target_format
[dst_mt
->format
];
1683 params
.src
.brw_surfaceformat
= brw_format_for_mesa_format(dst_mt
->format
);
1686 /* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
1687 * texture, the above code configures the source format for L32_FLOAT or
1688 * I32_FLOAT, and the destination format for R32_FLOAT. On Sandy Bridge,
1689 * the SAMPLE message appears to handle multisampled L32_FLOAT and
1690 * I32_FLOAT textures incorrectly, resulting in blocky artifacts. So work
1691 * around the problem by using a source format of R32_FLOAT. This
1692 * shouldn't affect rendering correctness, since the destination format is
1693 * R32_FLOAT, so only the contents of the red channel matters.
1695 if (brw
->gen
== 6 &&
1696 params
.src
.num_samples
> 1 && params
.dst
.num_samples
<= 1 &&
1697 src_mt
->format
== dst_mt
->format
&&
1698 params
.dst
.brw_surfaceformat
== BRW_SURFACEFORMAT_R32_FLOAT
) {
1699 params
.src
.brw_surfaceformat
= params
.dst
.brw_surfaceformat
;
1702 struct brw_blorp_blit_prog_key wm_prog_key
;
1703 memset(&wm_prog_key
, 0, sizeof(wm_prog_key
));
1705 /* texture_data_type indicates the register type that should be used to
1706 * manipulate texture data.
1708 switch (_mesa_get_format_datatype(src_mt
->format
)) {
1709 case GL_UNSIGNED_NORMALIZED
:
1710 case GL_SIGNED_NORMALIZED
:
1712 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1714 case GL_UNSIGNED_INT
:
1715 if (src_mt
->format
== MESA_FORMAT_S_UINT8
) {
1716 /* We process stencil as though it's an unsigned normalized color */
1717 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1719 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_UD
;
1723 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_D
;
1726 unreachable("Unrecognized blorp format");
1730 /* Gen7's rendering hardware only supports the IMS layout for depth and
1731 * stencil render targets. Blorp always maps its destination surface as
1732 * a color render target (even if it's actually a depth or stencil
1733 * buffer). So if the destination is IMS, we'll have to map it as a
1734 * single-sampled texture and interleave the samples ourselves.
1736 if (dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_IMS
)
1737 params
.dst
.num_samples
= 0;
1740 if (params
.dst
.map_stencil_as_y_tiled
&& params
.dst
.num_samples
> 1) {
1741 /* If the destination surface is a W-tiled multisampled stencil buffer
1742 * that we're mapping as Y tiled, then we need to arrange for the WM
1743 * program to run once per sample rather than once per pixel, because
1744 * the memory layout of related samples doesn't match between W and Y
1747 wm_prog_key
.persample_msaa_dispatch
= true;
1750 if (params
.src
.num_samples
> 0 && params
.dst
.num_samples
> 1) {
1751 /* We are blitting from a multisample buffer to a multisample buffer, so
1752 * we must preserve samples within a pixel. This means we have to
1753 * arrange for the WM program to run once per sample rather than once
1756 wm_prog_key
.persample_msaa_dispatch
= true;
1759 /* Scaled blitting or not. */
1760 wm_prog_key
.blit_scaled
=
1761 ((dst_x1
- dst_x0
) == (src_x1
- src_x0
) &&
1762 (dst_y1
- dst_y0
) == (src_y1
- src_y0
)) ? false : true;
1764 /* Scaling factors used for bilinear filtering in multisample scaled
1767 if (src_mt
->num_samples
== 16)
1768 wm_prog_key
.x_scale
= 4.0f
;
1770 wm_prog_key
.x_scale
= 2.0f
;
1771 wm_prog_key
.y_scale
= src_mt
->num_samples
/ wm_prog_key
.x_scale
;
1773 if (filter
== GL_LINEAR
&&
1774 params
.src
.num_samples
<= 1 && params
.dst
.num_samples
<= 1)
1775 wm_prog_key
.bilinear_filter
= true;
1777 GLenum base_format
= _mesa_get_format_base_format(src_mt
->format
);
1778 if (base_format
!= GL_DEPTH_COMPONENT
&& /* TODO: what about depth/stencil? */
1779 base_format
!= GL_STENCIL_INDEX
&&
1780 !_mesa_is_format_integer(src_mt
->format
) &&
1781 src_mt
->num_samples
> 1 && dst_mt
->num_samples
<= 1) {
1782 /* We are downsampling a non-integer color buffer, so blend.
1784 * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1786 * "If the source formats are integer types or stencil values, a
1787 * single sample's value is selected for each pixel."
1789 * This implies we should not blend in that case.
1791 wm_prog_key
.blend
= true;
1794 /* src_samples and dst_samples are the true sample counts */
1795 wm_prog_key
.src_samples
= src_mt
->num_samples
;
1796 wm_prog_key
.dst_samples
= dst_mt
->num_samples
;
1798 /* tex_samples and rt_samples are the sample counts that are set up in
1801 wm_prog_key
.tex_samples
= params
.src
.num_samples
;
1802 wm_prog_key
.rt_samples
= params
.dst
.num_samples
;
1804 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
1805 * use to access the source and destination surfaces.
1807 wm_prog_key
.tex_layout
=
1808 compute_msaa_layout_for_pipeline(brw
, params
.src
.num_samples
,
1809 params
.src
.msaa_layout
);
1810 wm_prog_key
.rt_layout
=
1811 compute_msaa_layout_for_pipeline(brw
, params
.dst
.num_samples
,
1812 params
.dst
.msaa_layout
);
1814 /* src_layout and dst_layout indicate the true MSAA layout used by src and
1817 wm_prog_key
.src_layout
= src_mt
->msaa_layout
;
1818 wm_prog_key
.dst_layout
= dst_mt
->msaa_layout
;
1820 /* On gen9+ compressed single sampled buffers carry the same layout type as
1821 * multisampled. The difference is that they can be sampled using normal
1822 * ld message and as render target behave just like non-compressed surface
1823 * from compiler point of view. Therefore override the type in the program
1826 if (brw
->gen
>= 9 && params
.src
.num_samples
<= 1 &&
1827 src_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_CMS
)
1828 wm_prog_key
.src_layout
= INTEL_MSAA_LAYOUT_NONE
;
1829 if (brw
->gen
>= 9 && params
.dst
.num_samples
<= 1 &&
1830 dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_CMS
)
1831 wm_prog_key
.dst_layout
= INTEL_MSAA_LAYOUT_NONE
;
1833 wm_prog_key
.src_tiled_w
= params
.src
.map_stencil_as_y_tiled
;
1834 wm_prog_key
.dst_tiled_w
= params
.dst
.map_stencil_as_y_tiled
;
1835 /* Round floating point values to nearest integer to avoid "off by one texel"
1836 * kind of errors when blitting.
1838 params
.x0
= params
.wm_inputs
.discard_rect
.x0
= roundf(dst_x0
);
1839 params
.y0
= params
.wm_inputs
.discard_rect
.y0
= roundf(dst_y0
);
1840 params
.x1
= params
.wm_inputs
.discard_rect
.x1
= roundf(dst_x1
);
1841 params
.y1
= params
.wm_inputs
.discard_rect
.y1
= roundf(dst_y1
);
1843 params
.wm_inputs
.rect_grid
.x1
=
1844 minify(src_mt
->logical_width0
, src_level
) * wm_prog_key
.x_scale
- 1.0f
;
1845 params
.wm_inputs
.rect_grid
.y1
=
1846 minify(src_mt
->logical_height0
, src_level
) * wm_prog_key
.y_scale
- 1.0f
;
1848 brw_blorp_setup_coord_transform(¶ms
.wm_inputs
.coord_transform
[0],
1849 src_x0
, src_x1
, dst_x0
, dst_x1
, mirror_x
);
1850 brw_blorp_setup_coord_transform(¶ms
.wm_inputs
.coord_transform
[1],
1851 src_y0
, src_y1
, dst_y0
, dst_y1
, mirror_y
);
1853 if (brw
->gen
>= 8 && params
.src
.mt
->target
== GL_TEXTURE_3D
) {
1854 /* On gen8+ we use actual 3-D textures so we need to pass the layer
1855 * through to the sampler.
1857 params
.wm_inputs
.src_z
= params
.src
.layer
;
1859 /* On gen7 and earlier, we fake everything with 2-D textures */
1860 params
.wm_inputs
.src_z
= 0;
1863 if (params
.dst
.num_samples
<= 1 && dst_mt
->num_samples
> 1) {
1864 /* We must expand the rectangle we send through the rendering pipeline,
1865 * to account for the fact that we are mapping the destination region as
1866 * single-sampled when it is in fact multisampled. We must also align
1867 * it to a multiple of the multisampling pattern, because the
1868 * differences between multisampled and single-sampled surface formats
1869 * will mean that pixels are scrambled within the multisampling pattern.
1870 * TODO: what if this makes the coordinates too large?
1872 * Note: this only works if the destination surface uses the IMS layout.
1873 * If it's UMS, then we have no choice but to set up the rendering
1874 * pipeline as multisampled.
1876 assert(dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_IMS
);
1877 switch (dst_mt
->num_samples
) {
1879 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1880 params
.y0
= ROUND_DOWN_TO(params
.y0
, 4);
1881 params
.x1
= ALIGN(params
.x1
* 2, 4);
1882 params
.y1
= ALIGN(params
.y1
, 4);
1885 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1886 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1887 params
.x1
= ALIGN(params
.x1
* 2, 4);
1888 params
.y1
= ALIGN(params
.y1
* 2, 4);
1891 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1892 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1893 params
.x1
= ALIGN(params
.x1
* 4, 8);
1894 params
.y1
= ALIGN(params
.y1
* 2, 4);
1897 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1898 params
.y0
= ROUND_DOWN_TO(params
.y0
* 4, 8);
1899 params
.x1
= ALIGN(params
.x1
* 4, 8);
1900 params
.y1
= ALIGN(params
.y1
* 4, 8);
1903 unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
1905 wm_prog_key
.use_kill
= true;
1908 if (params
.dst
.map_stencil_as_y_tiled
) {
1909 /* We must modify the rectangle we send through the rendering pipeline
1910 * (and the size and x/y offset of the destination surface), to account
1911 * for the fact that we are mapping it as Y-tiled when it is in fact
1914 * Both Y tiling and W tiling can be understood as organizations of
1915 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1916 * is different, but the layout of the 32-byte sub-tiles within the 4k
1917 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1918 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide
1919 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1921 * Therefore, to account for the layout differences within the 32-byte
1922 * sub-tiles, we must expand the rectangle so the X coordinates of its
1923 * edges are multiples of 8 (the W sub-tile width), and its Y
1924 * coordinates of its edges are multiples of 4 (the W sub-tile height).
1925 * Then we need to scale the X and Y coordinates of the rectangle to
1926 * account for the differences in aspect ratio between the Y and W
1927 * sub-tiles. We need to modify the layer width and height similarly.
1929 * A correction needs to be applied when MSAA is in use: since
1930 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1931 * we need to align the Y coordinates to multiples of 8, so that when
1932 * they are divided by two they are still multiples of 4.
1934 * Note: Since the x/y offset of the surface will be applied using the
1935 * SURFACE_STATE command packet, it will be invisible to the swizzling
1936 * code in the shader; therefore it needs to be in a multiple of the
1937 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
1938 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1939 * buffer), and the miplevel alignment used for stencil buffers is 8
1940 * pixels horizontally and either 4 or 8 pixels vertically (see
1941 * intel_horizontal_texture_alignment_unit() and
1942 * intel_vertical_texture_alignment_unit()).
1944 * Note: Also, since the SURFACE_STATE command packet can only apply
1945 * offsets that are multiples of 4 pixels horizontally and 2 pixels
1946 * vertically, it is important that the offsets will be multiples of
1947 * these sizes after they are converted into Y-tiled coordinates.
1948 * Fortunately they will be, since we know from above that the offsets
1949 * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1950 * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1952 * TODO: what if this makes the coordinates (or the texture size) too
1955 const unsigned x_align
= 8, y_align
= params
.dst
.num_samples
!= 0 ? 8 : 4;
1956 params
.x0
= ROUND_DOWN_TO(params
.x0
, x_align
) * 2;
1957 params
.y0
= ROUND_DOWN_TO(params
.y0
, y_align
) / 2;
1958 params
.x1
= ALIGN(params
.x1
, x_align
) * 2;
1959 params
.y1
= ALIGN(params
.y1
, y_align
) / 2;
1960 params
.dst
.width
= ALIGN(params
.dst
.width
, x_align
) * 2;
1961 params
.dst
.height
= ALIGN(params
.dst
.height
, y_align
) / 2;
1962 params
.dst
.x_offset
*= 2;
1963 params
.dst
.y_offset
/= 2;
1964 wm_prog_key
.use_kill
= true;
1967 if (params
.src
.map_stencil_as_y_tiled
) {
1968 /* We must modify the size and x/y offset of the source surface to
1969 * account for the fact that we are mapping it as Y-tiled when it is in
1972 * See the comments above concerning x/y offset alignment for the
1973 * destination surface.
1975 * TODO: what if this makes the texture size too large?
1977 const unsigned x_align
= 8, y_align
= params
.src
.num_samples
!= 0 ? 8 : 4;
1978 params
.src
.width
= ALIGN(params
.src
.width
, x_align
) * 2;
1979 params
.src
.height
= ALIGN(params
.src
.height
, y_align
) / 2;
1980 params
.src
.x_offset
*= 2;
1981 params
.src
.y_offset
/= 2;
1984 brw_blorp_get_blit_kernel(brw
, ¶ms
, &wm_prog_key
);
1986 params
.src
.swizzle
= src_swizzle
;
1988 brw_blorp_exec(brw
, ¶ms
);
1990 intel_miptree_slice_set_needs_hiz_resolve(dst_mt
, dst_level
, dst_layer
);
1992 if (intel_miptree_is_lossless_compressed(brw
, dst_mt
))
1993 dst_mt
->fast_clear_state
= INTEL_FAST_CLEAR_STATE_UNRESOLVED
;