/*
 * Copyright © 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 #include "main/context.h"
25 #include "main/teximage.h"
26 #include "main/fbobject.h"
28 #include "compiler/nir/nir_builder.h"
30 #include "intel_fbo.h"
32 #include "brw_blorp.h"
33 #include "brw_context.h"
34 #include "brw_state.h"
35 #include "brw_meta_util.h"
37 #define FILE_DEBUG_FLAG DEBUG_BLORP
39 static struct intel_mipmap_tree
*
40 find_miptree(GLbitfield buffer_bit
, struct intel_renderbuffer
*irb
)
42 struct intel_mipmap_tree
*mt
= irb
->mt
;
43 if (buffer_bit
== GL_STENCIL_BUFFER_BIT
&& mt
->stencil_mt
)
49 blorp_get_texture_swizzle(const struct intel_renderbuffer
*irb
)
51 return irb
->Base
.Base
._BaseFormat
== GL_RGB
?
52 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ONE
) :
57 do_blorp_blit(struct brw_context
*brw
, GLbitfield buffer_bit
,
58 struct intel_renderbuffer
*src_irb
, mesa_format src_format
,
59 struct intel_renderbuffer
*dst_irb
, mesa_format dst_format
,
60 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
61 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
62 GLenum filter
, bool mirror_x
, bool mirror_y
)
64 const struct gl_context
*ctx
= &brw
->ctx
;
66 /* Find source/dst miptrees */
67 struct intel_mipmap_tree
*src_mt
= find_miptree(buffer_bit
, src_irb
);
68 struct intel_mipmap_tree
*dst_mt
= find_miptree(buffer_bit
, dst_irb
);
70 const bool do_srgb
= ctx
->Color
.sRGBEnabled
;
73 brw_blorp_blit_miptrees(brw
,
74 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
75 src_format
, blorp_get_texture_swizzle(src_irb
),
76 dst_mt
, dst_irb
->mt_level
, dst_irb
->mt_layer
,
78 srcX0
, srcY0
, srcX1
, srcY1
,
79 dstX0
, dstY0
, dstX1
, dstY1
,
80 filter
, mirror_x
, mirror_y
,
83 dst_irb
->need_downsample
= true;
87 try_blorp_blit(struct brw_context
*brw
,
88 const struct gl_framebuffer
*read_fb
,
89 const struct gl_framebuffer
*draw_fb
,
90 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
91 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
92 GLenum filter
, GLbitfield buffer_bit
)
94 struct gl_context
*ctx
= &brw
->ctx
;
96 /* Sync up the state of window system buffers. We need to do this before
97 * we go looking for the buffers.
99 intel_prepare_render(brw
);
101 bool mirror_x
, mirror_y
;
102 if (brw_meta_mirror_clip_and_scissor(ctx
, read_fb
, draw_fb
,
103 &srcX0
, &srcY0
, &srcX1
, &srcY1
,
104 &dstX0
, &dstY0
, &dstX1
, &dstY1
,
105 &mirror_x
, &mirror_y
))
109 struct intel_renderbuffer
*src_irb
;
110 struct intel_renderbuffer
*dst_irb
;
111 struct intel_mipmap_tree
*src_mt
;
112 struct intel_mipmap_tree
*dst_mt
;
113 switch (buffer_bit
) {
114 case GL_COLOR_BUFFER_BIT
:
115 src_irb
= intel_renderbuffer(read_fb
->_ColorReadBuffer
);
116 for (unsigned i
= 0; i
< draw_fb
->_NumColorDrawBuffers
; ++i
) {
117 dst_irb
= intel_renderbuffer(draw_fb
->_ColorDrawBuffers
[i
]);
119 do_blorp_blit(brw
, buffer_bit
,
120 src_irb
, src_irb
->Base
.Base
.Format
,
121 dst_irb
, dst_irb
->Base
.Base
.Format
,
122 srcX0
, srcY0
, srcX1
, srcY1
,
123 dstX0
, dstY0
, dstX1
, dstY1
,
124 filter
, mirror_x
, mirror_y
);
127 case GL_DEPTH_BUFFER_BIT
:
129 intel_renderbuffer(read_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
131 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
132 src_mt
= find_miptree(buffer_bit
, src_irb
);
133 dst_mt
= find_miptree(buffer_bit
, dst_irb
);
135 /* We can't handle format conversions between Z24 and other formats
136 * since we have to lie about the surface format. See the comments in
137 * brw_blorp_surface_info::set().
139 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
140 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
))
143 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
144 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
145 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
146 filter
, mirror_x
, mirror_y
);
148 case GL_STENCIL_BUFFER_BIT
:
150 intel_renderbuffer(read_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
152 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
153 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
154 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
155 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
156 filter
, mirror_x
, mirror_y
);
159 unreachable("not reached");
166 brw_blorp_copytexsubimage(struct brw_context
*brw
,
167 struct gl_renderbuffer
*src_rb
,
168 struct gl_texture_image
*dst_image
,
170 int srcX0
, int srcY0
,
171 int dstX0
, int dstY0
,
172 int width
, int height
)
174 struct gl_context
*ctx
= &brw
->ctx
;
175 struct intel_renderbuffer
*src_irb
= intel_renderbuffer(src_rb
);
176 struct intel_texture_image
*intel_image
= intel_texture_image(dst_image
);
178 /* No pixel transfer operations (zoom, bias, mapping), just a blit */
179 if (brw
->ctx
._ImageTransferState
)
182 /* Sync up the state of window system buffers. We need to do this before
183 * we go looking at the src renderbuffer's miptree.
185 intel_prepare_render(brw
);
187 struct intel_mipmap_tree
*src_mt
= src_irb
->mt
;
188 struct intel_mipmap_tree
*dst_mt
= intel_image
->mt
;
190 /* There is support for only up to eight samples. */
191 if (src_mt
->num_samples
> 8 || dst_mt
->num_samples
> 8)
194 /* BLORP is only supported from Gen6 onwards. */
198 if (_mesa_get_format_base_format(src_rb
->Format
) !=
199 _mesa_get_format_base_format(dst_image
->TexFormat
)) {
203 /* We can't handle format conversions between Z24 and other formats since
204 * we have to lie about the surface format. See the comments in
205 * brw_blorp_surface_info::set().
207 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
208 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
)) {
212 if (!brw
->format_supported_as_render_target
[dst_image
->TexFormat
])
215 /* Source clipping shouldn't be necessary, since copytexsubimage (in
216 * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
219 * Destination clipping shouldn't be necessary since the restrictions on
220 * glCopyTexSubImage prevent the user from specifying a destination rectangle
221 * that falls outside the bounds of the destination texture.
222 * See error_check_subtexture_dimensions().
225 int srcY1
= srcY0
+ height
;
226 int srcX1
= srcX0
+ width
;
227 int dstX1
= dstX0
+ width
;
228 int dstY1
= dstY0
+ height
;
230 /* Account for the fact that in the system framebuffer, the origin is at
233 bool mirror_y
= false;
234 if (_mesa_is_winsys_fbo(ctx
->ReadBuffer
)) {
235 GLint tmp
= src_rb
->Height
- srcY0
;
236 srcY0
= src_rb
->Height
- srcY1
;
241 /* Account for face selection and texture view MinLayer */
242 int dst_slice
= slice
+ dst_image
->TexObject
->MinLayer
+ dst_image
->Face
;
243 int dst_level
= dst_image
->Level
+ dst_image
->TexObject
->MinLevel
;
245 brw_blorp_blit_miptrees(brw
,
246 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
247 src_rb
->Format
, blorp_get_texture_swizzle(src_irb
),
248 dst_mt
, dst_level
, dst_slice
,
249 dst_image
->TexFormat
,
250 srcX0
, srcY0
, srcX1
, srcY1
,
251 dstX0
, dstY0
, dstX1
, dstY1
,
252 GL_NEAREST
, false, mirror_y
,
255 /* If we're copying to a packed depth stencil texture and the source
256 * framebuffer has separate stencil, we need to also copy the stencil data
259 src_rb
= ctx
->ReadBuffer
->Attachment
[BUFFER_STENCIL
].Renderbuffer
;
260 if (_mesa_get_format_bits(dst_image
->TexFormat
, GL_STENCIL_BITS
) > 0 &&
262 src_irb
= intel_renderbuffer(src_rb
);
263 src_mt
= src_irb
->mt
;
265 if (src_mt
->stencil_mt
)
266 src_mt
= src_mt
->stencil_mt
;
267 if (dst_mt
->stencil_mt
)
268 dst_mt
= dst_mt
->stencil_mt
;
270 if (src_mt
!= dst_mt
) {
271 brw_blorp_blit_miptrees(brw
,
272 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
274 blorp_get_texture_swizzle(src_irb
),
275 dst_mt
, dst_level
, dst_slice
,
277 srcX0
, srcY0
, srcX1
, srcY1
,
278 dstX0
, dstY0
, dstX1
, dstY1
,
279 GL_NEAREST
, false, mirror_y
,
289 brw_blorp_framebuffer(struct brw_context
*brw
,
290 struct gl_framebuffer
*readFb
,
291 struct gl_framebuffer
*drawFb
,
292 GLint srcX0
, GLint srcY0
, GLint srcX1
, GLint srcY1
,
293 GLint dstX0
, GLint dstY0
, GLint dstX1
, GLint dstY1
,
294 GLbitfield mask
, GLenum filter
)
296 /* BLORP is not supported before Gen6. */
300 static GLbitfield buffer_bits
[] = {
303 GL_STENCIL_BUFFER_BIT
,
306 for (unsigned int i
= 0; i
< ARRAY_SIZE(buffer_bits
); ++i
) {
307 if ((mask
& buffer_bits
[i
]) &&
308 try_blorp_blit(brw
, readFb
, drawFb
,
309 srcX0
, srcY0
, srcX1
, srcY1
,
310 dstX0
, dstY0
, dstX1
, dstY1
,
311 filter
, buffer_bits
[i
])) {
312 mask
&= ~buffer_bits
[i
];
/**
 * Enum to specify the order of arguments in a sampler message.
 *
 * The enumerators carry no explicit initializers, so they take consecutive
 * values starting at 0 in declaration order.
 */
enum sampler_message_arg
{
   SAMPLER_MESSAGE_ARG_U_FLOAT,
   SAMPLER_MESSAGE_ARG_V_FLOAT,
   SAMPLER_MESSAGE_ARG_U_INT,
   SAMPLER_MESSAGE_ARG_V_INT,
   SAMPLER_MESSAGE_ARG_R_INT,
   SAMPLER_MESSAGE_ARG_SI_INT,
   SAMPLER_MESSAGE_ARG_MCS_INT,
   SAMPLER_MESSAGE_ARG_ZERO_INT,
};
335 struct brw_blorp_blit_vars
{
336 /* Input values from brw_blorp_wm_inputs */
337 nir_variable
*v_discard_rect
;
338 nir_variable
*v_rect_grid
;
339 nir_variable
*v_coord_transform
;
340 nir_variable
*v_src_z
;
343 nir_variable
*frag_coord
;
346 nir_variable
*color_out
;
350 brw_blorp_blit_vars_init(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
351 const struct brw_blorp_blit_prog_key
*key
)
353 /* Blended and scaled blits never use pixel discard. */
354 assert(!key
->use_kill
|| !(key
->blend
&& key
->blit_scaled
));
356 #define LOAD_INPUT(name, type)\
357 v->v_##name = nir_variable_create(b->shader, nir_var_shader_in, \
359 v->v_##name->data.interpolation = INTERP_MODE_FLAT; \
360 v->v_##name->data.location = VARYING_SLOT_VAR0 + \
361 offsetof(struct brw_blorp_wm_inputs, name) / (4 * sizeof(float));
363 LOAD_INPUT(discard_rect
, glsl_vec4_type())
364 LOAD_INPUT(rect_grid
, glsl_vec4_type())
365 LOAD_INPUT(coord_transform
, glsl_vec4_type())
366 LOAD_INPUT(src_z
, glsl_uint_type())
370 v
->frag_coord
= nir_variable_create(b
->shader
, nir_var_shader_in
,
371 glsl_vec4_type(), "gl_FragCoord");
372 v
->frag_coord
->data
.location
= VARYING_SLOT_POS
;
373 v
->frag_coord
->data
.origin_upper_left
= true;
375 v
->color_out
= nir_variable_create(b
->shader
, nir_var_shader_out
,
376 glsl_vec4_type(), "gl_FragColor");
377 v
->color_out
->data
.location
= FRAG_RESULT_COLOR
;
381 blorp_blit_get_frag_coords(nir_builder
*b
,
382 const struct brw_blorp_blit_prog_key
*key
,
383 struct brw_blorp_blit_vars
*v
)
385 nir_ssa_def
*coord
= nir_f2i(b
, nir_load_var(b
, v
->frag_coord
));
387 if (key
->persample_msaa_dispatch
) {
388 return nir_vec3(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1),
389 nir_load_system_value(b
, nir_intrinsic_load_sample_id
, 0));
391 return nir_vec2(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1));
396 * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
400 blorp_blit_apply_transform(nir_builder
*b
, nir_ssa_def
*src_pos
,
401 struct brw_blorp_blit_vars
*v
)
403 nir_ssa_def
*coord_transform
= nir_load_var(b
, v
->v_coord_transform
);
405 nir_ssa_def
*offset
= nir_vec2(b
, nir_channel(b
, coord_transform
, 1),
406 nir_channel(b
, coord_transform
, 3));
407 nir_ssa_def
*mul
= nir_vec2(b
, nir_channel(b
, coord_transform
, 0),
408 nir_channel(b
, coord_transform
, 2));
410 return nir_ffma(b
, src_pos
, mul
, offset
);
414 blorp_nir_discard_if_outside_rect(nir_builder
*b
, nir_ssa_def
*pos
,
415 struct brw_blorp_blit_vars
*v
)
417 nir_ssa_def
*c0
, *c1
, *c2
, *c3
;
418 nir_ssa_def
*discard_rect
= nir_load_var(b
, v
->v_discard_rect
);
419 nir_ssa_def
*dst_x0
= nir_channel(b
, discard_rect
, 0);
420 nir_ssa_def
*dst_x1
= nir_channel(b
, discard_rect
, 1);
421 nir_ssa_def
*dst_y0
= nir_channel(b
, discard_rect
, 2);
422 nir_ssa_def
*dst_y1
= nir_channel(b
, discard_rect
, 3);
424 c0
= nir_ult(b
, nir_channel(b
, pos
, 0), dst_x0
);
425 c1
= nir_uge(b
, nir_channel(b
, pos
, 0), dst_x1
);
426 c2
= nir_ult(b
, nir_channel(b
, pos
, 1), dst_y0
);
427 c3
= nir_uge(b
, nir_channel(b
, pos
, 1), dst_y1
);
429 nir_ssa_def
*oob
= nir_ior(b
, nir_ior(b
, c0
, c1
), nir_ior(b
, c2
, c3
));
431 nir_intrinsic_instr
*discard
=
432 nir_intrinsic_instr_create(b
->shader
, nir_intrinsic_discard_if
);
433 discard
->src
[0] = nir_src_for_ssa(oob
);
434 nir_builder_instr_insert(b
, &discard
->instr
);
437 static nir_tex_instr
*
438 blorp_create_nir_tex_instr(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
439 nir_texop op
, nir_ssa_def
*pos
, unsigned num_srcs
,
440 enum brw_reg_type dst_type
)
442 nir_tex_instr
*tex
= nir_tex_instr_create(b
->shader
, num_srcs
);
447 case BRW_REGISTER_TYPE_F
:
448 tex
->dest_type
= nir_type_float
;
450 case BRW_REGISTER_TYPE_D
:
451 tex
->dest_type
= nir_type_int
;
453 case BRW_REGISTER_TYPE_UD
:
454 tex
->dest_type
= nir_type_uint
;
457 unreachable("Invalid texture return type");
460 tex
->is_array
= false;
461 tex
->is_shadow
= false;
463 /* Blorp only has one texture and it's bound at unit 0 */
466 tex
->texture_index
= 0;
467 tex
->sampler_index
= 0;
469 /* To properly handle 3-D and 2-D array textures, we pull the Z component
470 * from an input. TODO: This is a bit magic; we should probably make this
471 * more explicit in the future.
473 assert(pos
->num_components
>= 2);
474 pos
= nir_vec3(b
, nir_channel(b
, pos
, 0), nir_channel(b
, pos
, 1),
475 nir_load_var(b
, v
->v_src_z
));
477 tex
->src
[0].src_type
= nir_tex_src_coord
;
478 tex
->src
[0].src
= nir_src_for_ssa(pos
);
479 tex
->coord_components
= 3;
481 nir_ssa_dest_init(&tex
->instr
, &tex
->dest
, 4, 32, NULL
);
487 blorp_nir_tex(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
488 nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
491 blorp_create_nir_tex_instr(b
, v
, nir_texop_tex
, pos
, 2, dst_type
);
493 assert(pos
->num_components
== 2);
494 tex
->sampler_dim
= GLSL_SAMPLER_DIM_2D
;
495 tex
->src
[1].src_type
= nir_tex_src_lod
;
496 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
498 nir_builder_instr_insert(b
, &tex
->instr
);
500 return &tex
->dest
.ssa
;
504 blorp_nir_txf(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
505 nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
508 blorp_create_nir_tex_instr(b
, v
, nir_texop_txf
, pos
, 2, dst_type
);
510 tex
->sampler_dim
= GLSL_SAMPLER_DIM_3D
;
511 tex
->src
[1].src_type
= nir_tex_src_lod
;
512 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
514 nir_builder_instr_insert(b
, &tex
->instr
);
516 return &tex
->dest
.ssa
;
520 blorp_nir_txf_ms(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
521 nir_ssa_def
*pos
, nir_ssa_def
*mcs
, enum brw_reg_type dst_type
)
524 blorp_create_nir_tex_instr(b
, v
, nir_texop_txf_ms
, pos
,
525 mcs
!= NULL
? 3 : 2, dst_type
);
527 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
529 tex
->src
[1].src_type
= nir_tex_src_ms_index
;
530 if (pos
->num_components
== 2) {
531 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
533 assert(pos
->num_components
== 3);
534 tex
->src
[1].src
= nir_src_for_ssa(nir_channel(b
, pos
, 2));
538 tex
->src
[2].src_type
= nir_tex_src_ms_mcs
;
539 tex
->src
[2].src
= nir_src_for_ssa(mcs
);
542 nir_builder_instr_insert(b
, &tex
->instr
);
544 return &tex
->dest
.ssa
;
548 blorp_nir_txf_ms_mcs(nir_builder
*b
, struct brw_blorp_blit_vars
*v
, nir_ssa_def
*pos
)
551 blorp_create_nir_tex_instr(b
, v
, nir_texop_txf_ms_mcs
,
552 pos
, 1, BRW_REGISTER_TYPE_D
);
554 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
556 nir_builder_instr_insert(b
, &tex
->instr
);
558 return &tex
->dest
.ssa
;
562 nir_mask_shift_or(struct nir_builder
*b
, nir_ssa_def
*dst
, nir_ssa_def
*src
,
563 uint32_t src_mask
, int src_left_shift
)
565 nir_ssa_def
*masked
= nir_iand(b
, src
, nir_imm_int(b
, src_mask
));
567 nir_ssa_def
*shifted
;
568 if (src_left_shift
> 0) {
569 shifted
= nir_ishl(b
, masked
, nir_imm_int(b
, src_left_shift
));
570 } else if (src_left_shift
< 0) {
571 shifted
= nir_ushr(b
, masked
, nir_imm_int(b
, -src_left_shift
));
573 assert(src_left_shift
== 0);
577 return nir_ior(b
, dst
, shifted
);
581 * Emit code to compensate for the difference between Y and W tiling.
583 * This code modifies the X and Y coordinates according to the formula:
585 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
587 * (See brw_blorp_build_nir_shader).
589 static inline nir_ssa_def
*
590 blorp_nir_retile_y_to_w(nir_builder
*b
, nir_ssa_def
*pos
)
592 assert(pos
->num_components
== 2);
593 nir_ssa_def
*x_Y
= nir_channel(b
, pos
, 0);
594 nir_ssa_def
*y_Y
= nir_channel(b
, pos
, 1);
596 /* Given X and Y coordinates that describe an address using Y tiling,
597 * translate to the X and Y coordinates that describe the same address
600 * If we break down the low order bits of X and Y, using a
601 * single letter to represent each low-order bit:
603 * X = A << 7 | 0bBCDEFGH
604 * Y = J << 5 | 0bKLMNP (1)
606 * Then we can apply the Y tiling formula to see the memory offset being
609 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)
611 * If we apply the W detiling formula to this memory location, that the
612 * corresponding X' and Y' coordinates are:
614 * X' = A << 6 | 0bBCDPFH (3)
615 * Y' = J << 6 | 0bKLMNEG
617 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
618 * we need to make the following computation:
620 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)
621 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
623 nir_ssa_def
*x_W
= nir_imm_int(b
, 0);
624 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0xfffffff4, -1);
625 x_W
= nir_mask_shift_or(b
, x_W
, y_Y
, 0x1, 2);
626 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0x1, 0);
628 nir_ssa_def
*y_W
= nir_imm_int(b
, 0);
629 y_W
= nir_mask_shift_or(b
, y_W
, y_Y
, 0xfffffffe, 1);
630 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x8, -2);
631 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x2, -1);
633 return nir_vec2(b
, x_W
, y_W
);
637 * Emit code to compensate for the difference between Y and W tiling.
639 * This code modifies the X and Y coordinates according to the formula:
641 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
643 * (See brw_blorp_build_nir_shader).
645 static inline nir_ssa_def
*
646 blorp_nir_retile_w_to_y(nir_builder
*b
, nir_ssa_def
*pos
)
648 assert(pos
->num_components
== 2);
649 nir_ssa_def
*x_W
= nir_channel(b
, pos
, 0);
650 nir_ssa_def
*y_W
= nir_channel(b
, pos
, 1);
652 /* Applying the same logic as above, but in reverse, we obtain the
655 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
656 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
658 nir_ssa_def
*x_Y
= nir_imm_int(b
, 0);
659 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0xfffffffa, 1);
660 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x2, 2);
661 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x1, 1);
662 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0x1, 0);
664 nir_ssa_def
*y_Y
= nir_imm_int(b
, 0);
665 y_Y
= nir_mask_shift_or(b
, y_Y
, y_W
, 0xfffffffc, -1);
666 y_Y
= nir_mask_shift_or(b
, y_Y
, x_W
, 0x4, -2);
668 return nir_vec2(b
, x_Y
, y_Y
);
672 * Emit code to compensate for the difference between MSAA and non-MSAA
675 * This code modifies the X and Y coordinates according to the formula:
677 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
679 * (See brw_blorp_blit_program).
681 static inline nir_ssa_def
*
682 blorp_nir_encode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
683 unsigned num_samples
, enum isl_msaa_layout layout
)
685 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
688 case ISL_MSAA_LAYOUT_NONE
:
689 assert(pos
->num_components
== 2);
691 case ISL_MSAA_LAYOUT_ARRAY
:
692 /* No translation needed */
694 case ISL_MSAA_LAYOUT_INTERLEAVED
: {
695 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
696 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
697 nir_ssa_def
*s_in
= pos
->num_components
== 2 ? nir_imm_int(b
, 0) :
698 nir_channel(b
, pos
, 2);
700 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
701 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
702 switch (num_samples
) {
705 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
706 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
709 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
710 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
711 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
713 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 1);
714 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
715 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
716 if (num_samples
== 2) {
719 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
720 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
721 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
726 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
727 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
729 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
731 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
732 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
733 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
734 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
735 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
736 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
737 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
741 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
742 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
744 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10)
747 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
748 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
749 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
750 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
751 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 2);
752 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x8, -1);
753 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
754 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
758 unreachable("Invalid number of samples for IMS layout");
761 return nir_vec2(b
, x_out
, y_out
);
765 unreachable("Invalid MSAA layout");
770 * Emit code to compensate for the difference between MSAA and non-MSAA
773 * This code modifies the X and Y coordinates according to the formula:
775 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
777 * (See brw_blorp_blit_program).
779 static inline nir_ssa_def
*
780 blorp_nir_decode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
781 unsigned num_samples
, enum isl_msaa_layout layout
)
783 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
786 case ISL_MSAA_LAYOUT_NONE
:
787 /* No translation necessary, and S should already be zero. */
788 assert(pos
->num_components
== 2);
790 case ISL_MSAA_LAYOUT_ARRAY
:
791 /* No translation necessary. */
793 case ISL_MSAA_LAYOUT_INTERLEAVED
: {
794 assert(pos
->num_components
== 2);
796 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
797 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
799 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
800 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
801 nir_ssa_def
*s_out
= nir_imm_int(b
, 0);
802 switch (num_samples
) {
805 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
806 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
807 * S = (X & 0b10) >> 1
809 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
810 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
811 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
812 * S = (Y & 0b10) | (X & 0b10) >> 1
814 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffc, -1);
815 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
816 if (num_samples
== 2) {
818 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
820 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
821 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
822 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
823 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
828 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
829 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
830 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
831 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
833 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
834 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
835 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
836 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
837 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
838 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
839 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
843 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
844 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
845 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
846 * S = (Y & 0b100) << 1 | (X & 0b100) |
847 * (Y & 0b10) | (X & 0b10) >> 1
849 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
850 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
851 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffff8, -2);
852 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
853 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x4, 1);
854 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
855 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
856 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
860 unreachable("Invalid number of samples for IMS layout");
863 return nir_vec3(b
, x_out
, y_out
, s_out
);
867 unreachable("Invalid MSAA layout");
/**
 * Count the number of trailing 1 bits in the given value.  For example:
 *
 * count_trailing_one_bits(0) == 0
 * count_trailing_one_bits(7) == 3
 * count_trailing_one_bits(11) == 2
 *
 * A value with every bit set yields the full bit width of `unsigned`.
 */
static inline int count_trailing_one_bits(unsigned value)
{
#ifdef HAVE___BUILTIN_CTZ
   /* The trailing ones of value are the trailing zeros of ~value.
    * __builtin_ctz(0) is undefined, so the builtin is only used when ~value
    * has at least one bit set; the all-ones input falls through to the loop.
    */
   const unsigned inverted = ~value;
   if (inverted != 0)
      return __builtin_ctz(inverted);
#endif

   /* Portable path: strip set bits from the bottom until a zero bit (or the
    * end of the value) is reached.
    */
   int count = 0;
   while (value & 1u) {
      value >>= 1;
      ++count;
   }
   return count;
}
888 blorp_nir_manual_blend_average(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
889 nir_ssa_def
*pos
, unsigned tex_samples
,
890 enum isl_aux_usage tex_aux_usage
,
891 enum brw_reg_type dst_type
)
893 /* If non-null, this is the outer-most if statement */
894 nir_if
*outer_if
= NULL
;
896 nir_variable
*color
=
897 nir_local_variable_create(b
->impl
, glsl_vec4_type(), "color");
899 nir_ssa_def
*mcs
= NULL
;
900 if (tex_aux_usage
== ISL_AUX_USAGE_MCS
)
901 mcs
= blorp_nir_txf_ms_mcs(b
, v
, pos
);
903 /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
905 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
907 * This ensures that when all samples have the same value, no numerical
908 * precision is lost, since each addition operation always adds two equal
909 * values, and summing two equal floating point values does not lose
912 * We perform this computation by treating the texture_data array as a
913 * stack and performing the following operations:
915 * - push sample 0 onto stack
916 * - push sample 1 onto stack
917 * - add top two stack entries
918 * - push sample 2 onto stack
919 * - push sample 3 onto stack
920 * - add top two stack entries
921 * - add top two stack entries
922 * - divide top stack entry by 4
924 * Note that after pushing sample i onto the stack, the number of add
925 * operations we do is equal to the number of trailing 1 bits in i. This
926 * works provided the total number of samples is a power of two, which it
927 * always is for i965.
929 * For integer formats, we replace the add operations with average
930 * operations and skip the final division.
932 nir_ssa_def
*texture_data
[5];
933 unsigned stack_depth
= 0;
934 for (unsigned i
= 0; i
< tex_samples
; ++i
) {
935 assert(stack_depth
== _mesa_bitcount(i
)); /* Loop invariant */
937 /* Push sample i onto the stack */
938 assert(stack_depth
< ARRAY_SIZE(texture_data
));
940 nir_ssa_def
*ms_pos
= nir_vec3(b
, nir_channel(b
, pos
, 0),
941 nir_channel(b
, pos
, 1),
943 texture_data
[stack_depth
++] = blorp_nir_txf_ms(b
, v
, ms_pos
, mcs
, dst_type
);
945 if (i
== 0 && tex_aux_usage
== ISL_AUX_USAGE_MCS
) {
946 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
947 * suggests an optimization:
949 * "A simple optimization with probable large return in
950 * performance is to compare the MCS value to zero (indicating
951 * all samples are on sample slice 0), and sample only from
952 * sample slice 0 using ld2dss if MCS is zero."
954 * Note that in the case where the MCS value is zero, sampling from
955 * sample slice 0 using ld2dss and sampling from sample 0 using
956 * ld2dms are equivalent (since all samples are on sample slice 0).
957 * Since we have already sampled from sample 0, all we need to do is
958 * skip the remaining fetches and averaging if MCS is zero.
960 nir_ssa_def
*mcs_zero
=
961 nir_ieq(b
, nir_channel(b
, mcs
, 0), nir_imm_int(b
, 0));
962 if (tex_samples
== 16) {
963 mcs_zero
= nir_iand(b
, mcs_zero
,
964 nir_ieq(b
, nir_channel(b
, mcs
, 1), nir_imm_int(b
, 0)));
967 nir_if
*if_stmt
= nir_if_create(b
->shader
);
968 if_stmt
->condition
= nir_src_for_ssa(mcs_zero
);
969 nir_cf_node_insert(b
->cursor
, &if_stmt
->cf_node
);
971 b
->cursor
= nir_after_cf_list(&if_stmt
->then_list
);
972 nir_store_var(b
, color
, texture_data
[0], 0xf);
974 b
->cursor
= nir_after_cf_list(&if_stmt
->else_list
);
978 for (int j
= 0; j
< count_trailing_one_bits(i
); j
++) {
979 assert(stack_depth
>= 2);
982 assert(dst_type
== BRW_REGISTER_TYPE_F
);
983 texture_data
[stack_depth
- 1] =
984 nir_fadd(b
, texture_data
[stack_depth
- 1],
985 texture_data
[stack_depth
]);
989 /* We should have just 1 sample on the stack now. */
990 assert(stack_depth
== 1);
992 texture_data
[0] = nir_fmul(b
, texture_data
[0],
993 nir_imm_float(b
, 1.0 / tex_samples
));
995 nir_store_var(b
, color
, texture_data
[0], 0xf);
998 b
->cursor
= nir_after_cf_node(&outer_if
->cf_node
);
1000 return nir_load_var(b
, color
);
1003 static inline nir_ssa_def
*
1004 nir_imm_vec2(nir_builder
*build
, float x
, float y
)
1008 memset(&v
, 0, sizeof(v
));
1012 return nir_build_imm(build
, 4, 32, v
);
1015 static nir_ssa_def
*
1016 blorp_nir_manual_blend_bilinear(nir_builder
*b
, nir_ssa_def
*pos
,
1017 unsigned tex_samples
,
1018 const brw_blorp_blit_prog_key
*key
,
1019 struct brw_blorp_blit_vars
*v
)
1021 nir_ssa_def
*pos_xy
= nir_channels(b
, pos
, 0x3);
1022 nir_ssa_def
*rect_grid
= nir_load_var(b
, v
->v_rect_grid
);
1023 nir_ssa_def
*scale
= nir_imm_vec2(b
, key
->x_scale
, key
->y_scale
);
1025 /* Translate coordinates to lay out the samples in a rectangular grid
1026 * roughly corresponding to sample locations.
1028 pos_xy
= nir_fmul(b
, pos_xy
, scale
);
1029 /* Adjust coordinates so that integers represent pixel centers rather
1032 pos_xy
= nir_fadd(b
, pos_xy
, nir_imm_float(b
, -0.5));
1033 /* Clamp the X, Y texture coordinates to properly handle the sampling of
1034 * texels on texture edges.
1036 pos_xy
= nir_fmin(b
, nir_fmax(b
, pos_xy
, nir_imm_float(b
, 0.0)),
1037 nir_vec2(b
, nir_channel(b
, rect_grid
, 0),
1038 nir_channel(b
, rect_grid
, 1)));
1040 /* Store the fractional parts to be used as bilinear interpolation
1043 nir_ssa_def
*frac_xy
= nir_ffract(b
, pos_xy
);
1044 /* Round the float coordinates down to nearest integer */
1045 pos_xy
= nir_fdiv(b
, nir_ftrunc(b
, pos_xy
), scale
);
1047 nir_ssa_def
*tex_data
[4];
1048 for (unsigned i
= 0; i
< 4; ++i
) {
1049 float sample_off_x
= (float)(i
& 0x1) / key
->x_scale
;
1050 float sample_off_y
= (float)((i
>> 1) & 0x1) / key
->y_scale
;
1051 nir_ssa_def
*sample_off
= nir_imm_vec2(b
, sample_off_x
, sample_off_y
);
1053 nir_ssa_def
*sample_coords
= nir_fadd(b
, pos_xy
, sample_off
);
1054 nir_ssa_def
*sample_coords_int
= nir_f2i(b
, sample_coords
);
1056 /* The MCS value we fetch has to match up with the pixel that we're
1057 * sampling from. Since we sample from different pixels in each
1058 * iteration of this "for" loop, the call to mcs_fetch() should be
1059 * here inside the loop after computing the pixel coordinates.
1061 nir_ssa_def
*mcs
= NULL
;
1062 if (key
->tex_aux_usage
== ISL_AUX_USAGE_MCS
)
1063 mcs
= blorp_nir_txf_ms_mcs(b
, v
, sample_coords_int
);
1065 /* Compute sample index and map the sample index to a sample number.
1066 * Sample index layout shows the numbering of slots in a rectangular
1067 * grid of samples with in a pixel. Sample number layout shows the
1068 * rectangular grid of samples roughly corresponding to the real sample
1069 * locations with in a pixel.
1070 * In case of 4x MSAA, layout of sample indices matches the layout of
1078 * In case of 8x MSAA the two layouts don't match.
1079 * sample index layout : --------- sample number layout : ---------
1080 * | 0 | 1 | | 3 | 7 |
1081 * --------- ---------
1082 * | 2 | 3 | | 5 | 0 |
1083 * --------- ---------
1084 * | 4 | 5 | | 1 | 2 |
1085 * --------- ---------
1086 * | 6 | 7 | | 4 | 6 |
1087 * --------- ---------
1089 * Fortunately, this can be done fairly easily as:
1090 * S' = (0x17306425 >> (S * 4)) & 0xf
1092 * In the case of 16x MSAA the two layouts don't match.
1093 * Sample index layout: Sample number layout:
1094 * --------------------- ---------------------
1095 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 |
1096 * --------------------- ---------------------
1097 * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 |
1098 * --------------------- ---------------------
1099 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 |
1100 * --------------------- ---------------------
1101 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 |
1102 * --------------------- ---------------------
1104 * This is equivalent to
1105 * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf
1107 nir_ssa_def
*frac
= nir_ffract(b
, sample_coords
);
1108 nir_ssa_def
*sample
=
1109 nir_fdot2(b
, frac
, nir_imm_vec2(b
, key
->x_scale
,
1110 key
->x_scale
* key
->y_scale
));
1111 sample
= nir_f2i(b
, sample
);
1113 if (tex_samples
== 8) {
1114 sample
= nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0x64210573),
1115 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1116 nir_imm_int(b
, 0xf));
1117 } else if (tex_samples
== 16) {
1118 nir_ssa_def
*sample_low
=
1119 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xd31479af),
1120 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1121 nir_imm_int(b
, 0xf));
1122 nir_ssa_def
*sample_high
=
1123 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xe58b602c),
1124 nir_ishl(b
, nir_iadd(b
, sample
,
1125 nir_imm_int(b
, -8)),
1126 nir_imm_int(b
, 2))),
1127 nir_imm_int(b
, 0xf));
1129 sample
= nir_bcsel(b
, nir_ilt(b
, sample
, nir_imm_int(b
, 8)),
1130 sample_low
, sample_high
);
1132 nir_ssa_def
*pos_ms
= nir_vec3(b
, nir_channel(b
, sample_coords_int
, 0),
1133 nir_channel(b
, sample_coords_int
, 1),
1135 tex_data
[i
] = blorp_nir_txf_ms(b
, v
, pos_ms
, mcs
, key
->texture_data_type
);
1138 nir_ssa_def
*frac_x
= nir_channel(b
, frac_xy
, 0);
1139 nir_ssa_def
*frac_y
= nir_channel(b
, frac_xy
, 1);
1140 return nir_flrp(b
, nir_flrp(b
, tex_data
[0], tex_data
[1], frac_x
),
1141 nir_flrp(b
, tex_data
[2], tex_data
[3], frac_x
),
1146 * Generator for WM programs used in BLORP blits.
1148 * The bulk of the work done by the WM program is to wrap and unwrap the
1149 * coordinate transformations used by the hardware to store surfaces in
1150 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
1151 * sample index for a multisampled surface) to a memory offset by the
1152 * following formulas:
1154 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
1155 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
1157 * For a single-sampled surface, or for a multisampled surface using
1158 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
1161 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1162 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1163 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1164 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1166 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1167 * embeds the sample number into bit 1 of the X and Y coordinates:
1169 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1170 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1171 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
1172 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1173 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
1174 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1175 * S = (Y & 0b10) | (X & 0b10) >> 1
1177 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1178 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
1181 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1182 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
1183 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1184 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1185 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
1186 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1187 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1189 * For X tiling, tile() combines together the low-order bits of the X and Y
1190 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
1191 * bytes wide and 8 rows high:
1193 * tile(x_tiled, X, Y, S) = A
1194 * where A = tile_num << 12 | offset
1195 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
1196 * offset = (Y' & 0b111) << 9
1197 * | (X & 0b111111111)
1199 * Y' = Y + S * qpitch
1200 * detile(x_tiled, A) = (X, Y, S)
1201 * where X = X' / cpp
1204 * Y' = (tile_num / tile_pitch) << 3
1205 * | (A & 0b111000000000) >> 9
1206 * X' = (tile_num % tile_pitch) << 9
1207 * | (A & 0b111111111)
1209 * (In all tiling formulas, cpp is the number of bytes occupied by a single
1210 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
1211 * to fill the width of the surface, and qpitch is the spacing (in rows)
1212 * between array slices).
1214 * For Y tiling, tile() combines together the low-order bits of the X and Y
1215 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
1216 * bytes wide and 32 rows high:
1218 * tile(y_tiled, X, Y, S) = A
1219 * where A = tile_num << 12 | offset
1220 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
1221 * offset = (X' & 0b1110000) << 5
1222 * | (Y' & 0b11111) << 4
1225 * Y' = Y + S * qpitch
1226 * detile(y_tiled, A) = (X, Y, S)
1227 * where X = X' / cpp
1230 * Y' = (tile_num / tile_pitch) << 5
1231 * | (A & 0b111110000) >> 4
1232 * X' = (tile_num % tile_pitch) << 7
1233 * | (A & 0b111000000000) >> 5
1236 * For W tiling, tile() combines together the low-order bits of the X and Y
1237 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
1238 * bytes wide and 64 rows high (note that W tiling is only used for stencil
1239 * buffers, which always have cpp = 1 and S=0):
1241 * tile(w_tiled, X, Y, S) = A
1242 * where A = tile_num << 12 | offset
1243 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
1244 * offset = (X' & 0b111000) << 6
1245 * | (Y' & 0b111100) << 3
1246 * | (X' & 0b100) << 2
1247 * | (Y' & 0b10) << 2
1248 * | (X' & 0b10) << 1
1252 * Y' = Y + S * qpitch
1253 * detile(w_tiled, A) = (X, Y, S)
1254 * where X = X' / cpp = X'
1255 * Y = Y' % qpitch = Y'
1256 * S = Y / qpitch = 0
1257 * Y' = (tile_num / tile_pitch) << 6
1258 * | (A & 0b111100000) >> 3
1259 * | (A & 0b1000) >> 2
1261 * X' = (tile_num % tile_pitch) << 6
1262 * | (A & 0b111000000000) >> 6
1263 * | (A & 0b10000) >> 2
1264 * | (A & 0b100) >> 1
1267 * Finally, for a non-tiled surface, tile() simply combines together the X and
1268 * Y coordinates in the natural way:
1270 * tile(untiled, X, Y, S) = A
1271 * where A = Y * pitch + X'
1273 * Y' = Y + S * qpitch
1274 * detile(untiled, A) = (X, Y, S)
1275 * where X = X' / cpp
1281 * (In these formulas, pitch is the number of bytes occupied by a single row
1285 brw_blorp_build_nir_shader(struct brw_context
*brw
,
1286 const brw_blorp_blit_prog_key
*key
)
1288 nir_ssa_def
*src_pos
, *dst_pos
, *color
;
1291 if (key
->dst_tiled_w
&& key
->rt_samples
> 1) {
1292 /* If the destination image is W tiled and multisampled, then the thread
1293 * must be dispatched once per sample, not once per pixel. This is
1294 * necessary because after conversion between W and Y tiling, there's no
1295 * guarantee that all samples corresponding to a single pixel will still
1298 assert(key
->persample_msaa_dispatch
);
1302 /* We are blending, which means we won't have an opportunity to
1303 * translate the tiling and sample count for the texture surface. So
1304 * the surface state for the texture must be configured with the correct
1305 * tiling and sample count.
1307 assert(!key
->src_tiled_w
);
1308 assert(key
->tex_samples
== key
->src_samples
);
1309 assert(key
->tex_layout
== key
->src_layout
);
1310 assert(key
->tex_samples
> 0);
1313 if (key
->persample_msaa_dispatch
) {
1314 /* It only makes sense to do persample dispatch if the render target is
1315 * configured as multisampled.
1317 assert(key
->rt_samples
> 0);
1320 /* Make sure layout is consistent with sample count */
1321 assert((key
->tex_layout
== ISL_MSAA_LAYOUT_NONE
) ==
1322 (key
->tex_samples
<= 1));
1323 assert((key
->rt_layout
== ISL_MSAA_LAYOUT_NONE
) ==
1324 (key
->rt_samples
<= 1));
1325 assert((key
->src_layout
== ISL_MSAA_LAYOUT_NONE
) ==
1326 (key
->src_samples
<= 1));
1327 assert((key
->dst_layout
== ISL_MSAA_LAYOUT_NONE
) ==
1328 (key
->dst_samples
<= 1));
1331 nir_builder_init_simple_shader(&b
, NULL
, MESA_SHADER_FRAGMENT
, NULL
);
1333 struct brw_blorp_blit_vars v
;
1334 brw_blorp_blit_vars_init(&b
, &v
, key
);
1336 dst_pos
= blorp_blit_get_frag_coords(&b
, key
, &v
);
1338 /* Render target and texture hardware don't support W tiling until Gen8. */
1339 const bool rt_tiled_w
= false;
1340 const bool tex_tiled_w
= brw
->gen
>= 8 && key
->src_tiled_w
;
1342 /* The address that data will be written to is determined by the
1343 * coordinates supplied to the WM thread and the tiling and sample count of
1344 * the render target, according to the formula:
1346 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
1348 * If the actual tiling and sample count of the destination surface are not
1349 * the same as the configuration of the render target, then these
1350 * coordinates are wrong and we have to adjust them to compensate for the
1353 if (rt_tiled_w
!= key
->dst_tiled_w
||
1354 key
->rt_samples
!= key
->dst_samples
||
1355 key
->rt_layout
!= key
->dst_layout
) {
1356 dst_pos
= blorp_nir_encode_msaa(&b
, dst_pos
, key
->rt_samples
,
1358 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1359 if (rt_tiled_w
!= key
->dst_tiled_w
)
1360 dst_pos
= blorp_nir_retile_y_to_w(&b
, dst_pos
);
1361 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1362 dst_pos
= blorp_nir_decode_msaa(&b
, dst_pos
, key
->dst_samples
,
1366 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
1368 * That is: X, Y and S now contain the true coordinates and sample index of
1369 * the data that the WM thread should output.
1371 * If we need to kill pixels that are outside the destination rectangle,
1372 * now is the time to do it.
1374 if (key
->use_kill
) {
1375 assert(!(key
->blend
&& key
->blit_scaled
));
1376 blorp_nir_discard_if_outside_rect(&b
, dst_pos
, &v
);
1379 src_pos
= blorp_blit_apply_transform(&b
, nir_i2f(&b
, dst_pos
), &v
);
1380 if (dst_pos
->num_components
== 3) {
1381 /* The sample coordinate is an integer that we want left alone but
1382 * blorp_blit_apply_transform() blindly applies the transform to all
1383 * three coordinates. Grab the original sample index.
1385 src_pos
= nir_vec3(&b
, nir_channel(&b
, src_pos
, 0),
1386 nir_channel(&b
, src_pos
, 1),
1387 nir_channel(&b
, dst_pos
, 2));
1390 /* If the source image is not multisampled, then we want to fetch sample
1391 * number 0, because that's the only sample there is.
1393 if (key
->src_samples
== 0)
1394 src_pos
= nir_channels(&b
, src_pos
, 0x3);
1396 /* X, Y, and S are now the coordinates of the pixel in the source image
1397 * that we want to texture from. Exception: if we are blending, then S is
1398 * irrelevant, because we are going to fetch all samples.
1400 if (key
->blend
&& !key
->blit_scaled
) {
1401 /* Resolves (effecively) use texelFetch, so we need integers and we
1402 * don't care about the sample index if we got one.
1404 src_pos
= nir_f2i(&b
, nir_channels(&b
, src_pos
, 0x3));
1406 if (brw
->gen
== 6) {
1407 /* Because gen6 only supports 4x interleved MSAA, we can do all the
1408 * blending we need with a single linear-interpolated texture lookup
1409 * at the center of the sample. The texture coordinates to be odd
1410 * integers so that they correspond to the center of a 2x2 block
1411 * representing the four samples that maxe up a pixel. So we need
1412 * to multiply our X and Y coordinates each by 2 and then add 1.
1414 src_pos
= nir_ishl(&b
, src_pos
, nir_imm_int(&b
, 1));
1415 src_pos
= nir_iadd(&b
, src_pos
, nir_imm_int(&b
, 1));
1416 src_pos
= nir_i2f(&b
, src_pos
);
1417 color
= blorp_nir_tex(&b
, &v
, src_pos
, key
->texture_data_type
);
1419 /* Gen7+ hardware doesn't automaticaly blend. */
1420 color
= blorp_nir_manual_blend_average(&b
, &v
, src_pos
, key
->src_samples
,
1422 key
->texture_data_type
);
1424 } else if (key
->blend
&& key
->blit_scaled
) {
1425 assert(!key
->use_kill
);
1426 color
= blorp_nir_manual_blend_bilinear(&b
, src_pos
, key
->src_samples
, key
, &v
);
1428 if (key
->bilinear_filter
) {
1429 color
= blorp_nir_tex(&b
, &v
, src_pos
, key
->texture_data_type
);
1431 /* We're going to use texelFetch, so we need integers */
1432 if (src_pos
->num_components
== 2) {
1433 src_pos
= nir_f2i(&b
, src_pos
);
1435 assert(src_pos
->num_components
== 3);
1436 src_pos
= nir_vec3(&b
, nir_channel(&b
, nir_f2i(&b
, src_pos
), 0),
1437 nir_channel(&b
, nir_f2i(&b
, src_pos
), 1),
1438 nir_channel(&b
, src_pos
, 2));
1441 /* We aren't blending, which means we just want to fetch a single
1442 * sample from the source surface. The address that we want to fetch
1443 * from is related to the X, Y and S values according to the formula:
1445 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
1447 * If the actual tiling and sample count of the source surface are
1448 * not the same as the configuration of the texture, then we need to
1449 * adjust the coordinates to compensate for the difference.
1451 if (tex_tiled_w
!= key
->src_tiled_w
||
1452 key
->tex_samples
!= key
->src_samples
||
1453 key
->tex_layout
!= key
->src_layout
) {
1454 src_pos
= blorp_nir_encode_msaa(&b
, src_pos
, key
->src_samples
,
1456 /* Now (X, Y, S) = detile(src_tiling, offset) */
1457 if (tex_tiled_w
!= key
->src_tiled_w
)
1458 src_pos
= blorp_nir_retile_w_to_y(&b
, src_pos
);
1459 /* Now (X, Y, S) = detile(tex_tiling, offset) */
1460 src_pos
= blorp_nir_decode_msaa(&b
, src_pos
, key
->tex_samples
,
1464 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
1466 * In other words: X, Y, and S now contain values which, when passed to
1467 * the texturing unit, will cause data to be read from the correct
1468 * memory location. So we can fetch the texel now.
1470 if (key
->src_samples
== 0) {
1471 color
= blorp_nir_txf(&b
, &v
, src_pos
, key
->texture_data_type
);
1473 nir_ssa_def
*mcs
= NULL
;
1474 if (key
->tex_aux_usage
== ISL_AUX_USAGE_MCS
)
1475 mcs
= blorp_nir_txf_ms_mcs(&b
, &v
, src_pos
);
1477 color
= blorp_nir_txf_ms(&b
, &v
, src_pos
, mcs
, key
->texture_data_type
);
1482 nir_store_var(&b
, v
.color_out
, color
, 0xf);
1488 brw_blorp_get_blit_kernel(struct brw_context
*brw
,
1489 struct brw_blorp_params
*params
,
1490 const struct brw_blorp_blit_prog_key
*prog_key
)
1492 if (brw_search_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1493 prog_key
, sizeof(*prog_key
),
1494 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
))
1497 const unsigned *program
;
1498 unsigned program_size
;
1499 struct brw_blorp_prog_data prog_data
;
1501 /* Try and compile with NIR first. If that fails, fall back to the old
1502 * method of building shaders manually.
1504 nir_shader
*nir
= brw_blorp_build_nir_shader(brw
, prog_key
);
1505 struct brw_wm_prog_key wm_key
;
1506 brw_blorp_init_wm_prog_key(&wm_key
);
1507 wm_key
.tex
.compressed_multisample_layout_mask
=
1508 prog_key
->tex_aux_usage
== ISL_AUX_USAGE_MCS
;
1509 wm_key
.tex
.msaa_16
= prog_key
->tex_samples
== 16;
1510 wm_key
.multisample_fbo
= prog_key
->rt_samples
> 1;
1512 program
= brw_blorp_compile_nir_shader(brw
, nir
, &wm_key
, false,
1513 &prog_data
, &program_size
);
1515 brw_upload_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1516 prog_key
, sizeof(*prog_key
),
1517 program
, program_size
,
1518 &prog_data
, sizeof(prog_data
),
1519 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
);
1523 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform
*xform
,
1524 GLfloat src0
, GLfloat src1
,
1525 GLfloat dst0
, GLfloat dst1
,
1528 float scale
= (src1
- src0
) / (dst1
- dst0
);
1530 /* When not mirroring a coordinate (say, X), we need:
1531 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1533 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1535 * blorp program uses "round toward zero" to convert the
1536 * transformed floating point coordinates to integer coordinates,
1537 * whereas the behaviour we actually want is "round to nearest",
1538 * so 0.5 provides the necessary correction.
1540 xform
->multiplier
= scale
;
1541 xform
->offset
= src0
+ (-dst0
+ 0.5f
) * scale
;
1543 /* When mirroring X we need:
1544 * src_x - src_x0 = dst_x1 - dst_x - 0.5
1546 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
1548 xform
->multiplier
= -scale
;
1549 xform
->offset
= src0
+ (dst1
- 0.5f
) * scale
;
1553 static enum isl_msaa_layout
1554 get_isl_msaa_layout(unsigned samples
, enum intel_msaa_layout layout
)
1558 case INTEL_MSAA_LAYOUT_NONE
:
1559 return ISL_MSAA_LAYOUT_NONE
;
1560 case INTEL_MSAA_LAYOUT_IMS
:
1561 return ISL_MSAA_LAYOUT_INTERLEAVED
;
1562 case INTEL_MSAA_LAYOUT_UMS
:
1563 case INTEL_MSAA_LAYOUT_CMS
:
1564 return ISL_MSAA_LAYOUT_ARRAY
;
1566 unreachable("Invalid MSAA layout");
1569 return ISL_MSAA_LAYOUT_NONE
;
1574 * Convert an swizzle enumeration (i.e. SWIZZLE_X) to one of the Gen7.5+
1575 * "Shader Channel Select" enumerations (i.e. HSW_SCS_RED). The mappings are
1577 * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
1580 * SCS_RED, SCS_GREEN, SCS_BLUE, SCS_ALPHA, SCS_ZERO, SCS_ONE
1582 * which is simply adding 4 then modding by 8 (or anding with 7).
1584 * We then may need to apply workarounds for textureGather hardware bugs.
1586 static enum isl_channel_select
1587 swizzle_to_scs(GLenum swizzle
)
1589 return (enum isl_channel_select
)((swizzle
+ 4) & 7);
1593 surf_convert_to_single_slice(struct brw_context
*brw
,
1594 struct brw_blorp_surface_info
*info
)
1596 /* This only makes sense for a single level and array slice */
1597 assert(info
->view
.levels
== 1 && info
->view
.array_len
== 1);
1599 /* Just bail if we have nothing to do. */
1600 if (info
->surf
.dim
== ISL_SURF_DIM_2D
&&
1601 info
->view
.base_level
== 0 && info
->view
.base_array_layer
== 0 &&
1602 info
->surf
.levels
== 0 && info
->surf
.logical_level0_px
.array_len
== 0)
1605 uint32_t x_offset_sa
, y_offset_sa
;
1606 blorp_get_image_offset_sa(&brw
->isl_dev
, &info
->surf
, info
->view
.base_level
,
1607 info
->view
.base_array_layer
,
1608 &x_offset_sa
, &y_offset_sa
);
1610 isl_tiling_get_intratile_offset_sa(&brw
->isl_dev
, info
->surf
.tiling
,
1611 info
->view
.format
, info
->surf
.row_pitch
,
1612 x_offset_sa
, y_offset_sa
,
1614 &info
->tile_x_sa
, &info
->tile_y_sa
);
1616 /* TODO: Once this file gets converted to C, we shouls just use designated
1619 struct isl_surf_init_info init_info
= isl_surf_init_info();
1621 init_info
.dim
= ISL_SURF_DIM_2D
;
1622 init_info
.format
= ISL_FORMAT_R8_UINT
;
1624 minify(info
->surf
.logical_level0_px
.width
, info
->view
.base_level
);
1626 minify(info
->surf
.logical_level0_px
.height
, info
->view
.base_level
);
1627 init_info
.depth
= 1;
1628 init_info
.levels
= 1;
1629 init_info
.array_len
= 1;
1630 init_info
.samples
= info
->surf
.samples
;
1631 init_info
.min_pitch
= info
->surf
.row_pitch
;
1632 init_info
.usage
= info
->surf
.usage
;
1633 init_info
.tiling_flags
= 1 << info
->surf
.tiling
;
1635 isl_surf_init_s(&brw
->isl_dev
, &info
->surf
, &init_info
);
1636 assert(info
->surf
.row_pitch
== init_info
.min_pitch
);
1638 /* The view is also different now. */
1639 info
->view
.base_level
= 0;
1640 info
->view
.levels
= 1;
1641 info
->view
.base_array_layer
= 0;
1642 info
->view
.array_len
= 1;
1646 surf_fake_interleaved_msaa(struct brw_context
*brw
,
1647 struct brw_blorp_surface_info
*info
)
1649 assert(info
->surf
.msaa_layout
== ISL_MSAA_LAYOUT_INTERLEAVED
);
1651 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1652 surf_convert_to_single_slice(brw
, info
);
1654 info
->surf
.logical_level0_px
= info
->surf
.phys_level0_sa
;
1655 info
->surf
.samples
= 1;
1656 info
->surf
.msaa_layout
= ISL_MSAA_LAYOUT_NONE
;
1660 surf_retile_w_to_y(struct brw_context
*brw
,
1661 struct brw_blorp_surface_info
*info
)
1663 assert(info
->surf
.tiling
== ISL_TILING_W
);
1665 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1666 surf_convert_to_single_slice(brw
, info
);
1668 /* On gen7+, we don't have interleaved multisampling for color render
1669 * targets so we have to fake it.
1671 * TODO: Are we sure we don't also need to fake it on gen6?
1673 if (brw
->gen
> 6 && info
->surf
.msaa_layout
== ISL_MSAA_LAYOUT_INTERLEAVED
) {
1674 info
->surf
.logical_level0_px
= info
->surf
.phys_level0_sa
;
1675 info
->surf
.samples
= 1;
1676 info
->surf
.msaa_layout
= ISL_MSAA_LAYOUT_NONE
;
1679 if (brw
->gen
== 6) {
1680 /* Gen6 stencil buffers have a very large alignment coming in from the
1681 * miptree. It's out-of-bounds for what the surface state can handle.
1682 * Since we have a single layer and level, it doesn't really matter as
1683 * long as we don't pass a bogus value into isl_surf_fill_state().
1685 info
->surf
.image_alignment_el
= isl_extent3d(4, 2, 1);
1688 /* Now that we've converted everything to a simple 2-D surface with only
1689 * one miplevel, we can go about retiling it.
1691 const unsigned x_align
= 8, y_align
= info
->surf
.samples
!= 0 ? 8 : 4;
1692 info
->surf
.tiling
= ISL_TILING_Y0
;
1693 info
->surf
.logical_level0_px
.width
=
1694 ALIGN(info
->surf
.logical_level0_px
.width
, x_align
) * 2;
1695 info
->surf
.logical_level0_px
.height
=
1696 ALIGN(info
->surf
.logical_level0_px
.height
, y_align
) / 2;
1697 info
->tile_x_sa
*= 2;
1698 info
->tile_y_sa
/= 2;
1702 * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
1703 * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
1704 * the physical layer holding sample 0. So, for example, if
1705 * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
1709 brw_blorp_blit_miptrees(struct brw_context
*brw
,
1710 struct intel_mipmap_tree
*src_mt
,
1711 unsigned src_level
, unsigned src_layer
,
1712 mesa_format src_format
, int src_swizzle
,
1713 struct intel_mipmap_tree
*dst_mt
,
1714 unsigned dst_level
, unsigned dst_layer
,
1715 mesa_format dst_format
,
1716 float src_x0
, float src_y0
,
1717 float src_x1
, float src_y1
,
1718 float dst_x0
, float dst_y0
,
1719 float dst_x1
, float dst_y1
,
1720 GLenum filter
, bool mirror_x
, bool mirror_y
,
1721 bool decode_srgb
, bool encode_srgb
)
1723 /* Get ready to blit. This includes depth resolving the src and dst
1724 * buffers if necessary. Note: it's not necessary to do a color resolve on
1725 * the destination buffer because we use the standard render path to render
1726 * to destination color buffers, and the standard render path is
1729 intel_miptree_resolve_color(brw
, src_mt
, INTEL_MIPTREE_IGNORE_CCS_E
);
1730 intel_miptree_slice_resolve_depth(brw
, src_mt
, src_level
, src_layer
);
1731 intel_miptree_slice_resolve_depth(brw
, dst_mt
, dst_level
, dst_layer
);
1733 intel_miptree_prepare_mcs(brw
, dst_mt
);
1735 DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
1736 "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
1738 src_mt
->num_samples
, _mesa_get_format_name(src_mt
->format
), src_mt
,
1739 src_level
, src_layer
, src_x0
, src_y0
, src_x1
, src_y1
,
1740 dst_mt
->num_samples
, _mesa_get_format_name(dst_mt
->format
), dst_mt
,
1741 dst_level
, dst_layer
, dst_x0
, dst_y0
, dst_x1
, dst_y1
,
1742 mirror_x
, mirror_y
);
1744 if (!decode_srgb
&& _mesa_get_format_color_encoding(src_format
) == GL_SRGB
)
1745 src_format
= _mesa_get_srgb_format_linear(src_format
);
1747 if (!encode_srgb
&& _mesa_get_format_color_encoding(dst_format
) == GL_SRGB
)
1748 dst_format
= _mesa_get_srgb_format_linear(dst_format
);
1750 struct brw_blorp_params params
;
1751 brw_blorp_params_init(¶ms
);
1753 brw_blorp_surface_info_init(brw
, ¶ms
.src
, src_mt
, src_level
,
1754 src_layer
, src_format
, false);
1755 brw_blorp_surface_info_init(brw
, ¶ms
.dst
, dst_mt
, dst_level
,
1756 dst_layer
, dst_format
, true);
1758 /* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
1759 * texture, the above code configures the source format for L32_FLOAT or
1760 * I32_FLOAT, and the destination format for R32_FLOAT. On Sandy Bridge,
1761 * the SAMPLE message appears to handle multisampled L32_FLOAT and
1762 * I32_FLOAT textures incorrectly, resulting in blocky artifacts. So work
1763 * around the problem by using a source format of R32_FLOAT. This
1764 * shouldn't affect rendering correctness, since the destination format is
1765 * R32_FLOAT, so only the contents of the red channel matters.
1767 if (brw
->gen
== 6 &&
1768 params
.src
.surf
.samples
> 1 && params
.dst
.surf
.samples
<= 1 &&
1769 src_mt
->format
== dst_mt
->format
&&
1770 params
.dst
.view
.format
== ISL_FORMAT_R32_FLOAT
) {
1771 params
.src
.view
.format
= params
.dst
.view
.format
;
1774 struct brw_blorp_blit_prog_key wm_prog_key
;
1775 memset(&wm_prog_key
, 0, sizeof(wm_prog_key
));
1777 /* texture_data_type indicates the register type that should be used to
1778 * manipulate texture data.
1780 switch (_mesa_get_format_datatype(src_mt
->format
)) {
1781 case GL_UNSIGNED_NORMALIZED
:
1782 case GL_SIGNED_NORMALIZED
:
1784 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1786 case GL_UNSIGNED_INT
:
1787 if (src_mt
->format
== MESA_FORMAT_S_UINT8
) {
1788 /* We process stencil as though it's an unsigned normalized color */
1789 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1791 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_UD
;
1795 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_D
;
1798 unreachable("Unrecognized blorp format");
1801 /* Scaled blitting or not. */
1802 wm_prog_key
.blit_scaled
=
1803 ((dst_x1
- dst_x0
) == (src_x1
- src_x0
) &&
1804 (dst_y1
- dst_y0
) == (src_y1
- src_y0
)) ? false : true;
1806 /* Scaling factors used for bilinear filtering in multisample scaled
1809 if (src_mt
->num_samples
== 16)
1810 wm_prog_key
.x_scale
= 4.0f
;
1812 wm_prog_key
.x_scale
= 2.0f
;
1813 wm_prog_key
.y_scale
= src_mt
->num_samples
/ wm_prog_key
.x_scale
;
1815 if (filter
== GL_LINEAR
&&
1816 params
.src
.surf
.samples
<= 1 && params
.dst
.surf
.samples
<= 1)
1817 wm_prog_key
.bilinear_filter
= true;
1819 GLenum base_format
= _mesa_get_format_base_format(src_mt
->format
);
1820 if (base_format
!= GL_DEPTH_COMPONENT
&& /* TODO: what about depth/stencil? */
1821 base_format
!= GL_STENCIL_INDEX
&&
1822 !_mesa_is_format_integer(src_mt
->format
) &&
1823 src_mt
->num_samples
> 1 && dst_mt
->num_samples
<= 1) {
1824 /* We are downsampling a non-integer color buffer, so blend.
1826 * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1828 * "If the source formats are integer types or stencil values, a
1829 * single sample's value is selected for each pixel."
1831 * This implies we should not blend in that case.
1833 wm_prog_key
.blend
= true;
1836 /* src_samples and dst_samples are the true sample counts */
1837 wm_prog_key
.src_samples
= src_mt
->num_samples
;
1838 wm_prog_key
.dst_samples
= dst_mt
->num_samples
;
1840 wm_prog_key
.tex_aux_usage
= params
.src
.aux_usage
;
1842 /* src_layout and dst_layout indicate the true MSAA layout used by src and
1845 wm_prog_key
.src_layout
= get_isl_msaa_layout(src_mt
->num_samples
,
1846 src_mt
->msaa_layout
);
1847 wm_prog_key
.dst_layout
= get_isl_msaa_layout(dst_mt
->num_samples
,
1848 dst_mt
->msaa_layout
);
1850 /* Round floating point values to nearest integer to avoid "off by one texel"
1851 * kind of errors when blitting.
1853 params
.x0
= params
.wm_inputs
.discard_rect
.x0
= roundf(dst_x0
);
1854 params
.y0
= params
.wm_inputs
.discard_rect
.y0
= roundf(dst_y0
);
1855 params
.x1
= params
.wm_inputs
.discard_rect
.x1
= roundf(dst_x1
);
1856 params
.y1
= params
.wm_inputs
.discard_rect
.y1
= roundf(dst_y1
);
1858 params
.wm_inputs
.rect_grid
.x1
=
1859 minify(src_mt
->logical_width0
, src_level
) * wm_prog_key
.x_scale
- 1.0f
;
1860 params
.wm_inputs
.rect_grid
.y1
=
1861 minify(src_mt
->logical_height0
, src_level
) * wm_prog_key
.y_scale
- 1.0f
;
1863 brw_blorp_setup_coord_transform(¶ms
.wm_inputs
.coord_transform
[0],
1864 src_x0
, src_x1
, dst_x0
, dst_x1
, mirror_x
);
1865 brw_blorp_setup_coord_transform(¶ms
.wm_inputs
.coord_transform
[1],
1866 src_y0
, src_y1
, dst_y0
, dst_y1
, mirror_y
);
1868 /* For some texture types, we need to pass the layer through the sampler. */
1869 params
.wm_inputs
.src_z
= params
.src
.z_offset
;
1872 params
.dst
.surf
.msaa_layout
== ISL_MSAA_LAYOUT_INTERLEAVED
) {
1873 assert(params
.dst
.surf
.samples
> 1);
1875 /* We must expand the rectangle we send through the rendering pipeline,
1876 * to account for the fact that we are mapping the destination region as
1877 * single-sampled when it is in fact multisampled. We must also align
1878 * it to a multiple of the multisampling pattern, because the
1879 * differences between multisampled and single-sampled surface formats
1880 * will mean that pixels are scrambled within the multisampling pattern.
1881 * TODO: what if this makes the coordinates too large?
1883 * Note: this only works if the destination surface uses the IMS layout.
1884 * If it's UMS, then we have no choice but to set up the rendering
1885 * pipeline as multisampled.
1887 switch (params
.dst
.surf
.samples
) {
1889 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1890 params
.y0
= ROUND_DOWN_TO(params
.y0
, 4);
1891 params
.x1
= ALIGN(params
.x1
* 2, 4);
1892 params
.y1
= ALIGN(params
.y1
, 4);
1895 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1896 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1897 params
.x1
= ALIGN(params
.x1
* 2, 4);
1898 params
.y1
= ALIGN(params
.y1
* 2, 4);
1901 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1902 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1903 params
.x1
= ALIGN(params
.x1
* 4, 8);
1904 params
.y1
= ALIGN(params
.y1
* 2, 4);
1907 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1908 params
.y0
= ROUND_DOWN_TO(params
.y0
* 4, 8);
1909 params
.x1
= ALIGN(params
.x1
* 4, 8);
1910 params
.y1
= ALIGN(params
.y1
* 4, 8);
1913 unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
1916 surf_fake_interleaved_msaa(brw
, ¶ms
.dst
);
1918 wm_prog_key
.use_kill
= true;
1921 if (params
.dst
.surf
.tiling
== ISL_TILING_W
) {
1922 /* We must modify the rectangle we send through the rendering pipeline
1923 * (and the size and x/y offset of the destination surface), to account
1924 * for the fact that we are mapping it as Y-tiled when it is in fact
1927 * Both Y tiling and W tiling can be understood as organizations of
1928 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1929 * is different, but the layout of the 32-byte sub-tiles within the 4k
1930 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1931 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide
1932 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1934 * Therefore, to account for the layout differences within the 32-byte
1935 * sub-tiles, we must expand the rectangle so the X coordinates of its
1936 * edges are multiples of 8 (the W sub-tile width), and the Y
1937 * coordinates of its edges are multiples of 4 (the W sub-tile height).
1938 * Then we need to scale the X and Y coordinates of the rectangle to
1939 * account for the differences in aspect ratio between the Y and W
1940 * sub-tiles. We need to modify the layer width and height similarly.
1942 * A correction needs to be applied when MSAA is in use: since
1943 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1944 * we need to align the Y coordinates to multiples of 8, so that when
1945 * they are divided by two they are still multiples of 4.
1947 * Note: Since the x/y offset of the surface will be applied using the
1948 * SURFACE_STATE command packet, it will be invisible to the swizzling
1949 * code in the shader; therefore it needs to be in a multiple of the
1950 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
1951 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1952 * buffer), and the miplevel alignment used for stencil buffers is 8
1953 * pixels horizontally and either 4 or 8 pixels vertically (see
1954 * intel_horizontal_texture_alignment_unit() and
1955 * intel_vertical_texture_alignment_unit()).
1957 * Note: Also, since the SURFACE_STATE command packet can only apply
1958 * offsets that are multiples of 4 pixels horizontally and 2 pixels
1959 * vertically, it is important that the offsets will be multiples of
1960 * these sizes after they are converted into Y-tiled coordinates.
1961 * Fortunately they will be, since we know from above that the offsets
1962 * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1963 * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1965 * TODO: what if this makes the coordinates (or the texture size) too
1968 const unsigned x_align
= 8, y_align
= params
.dst
.surf
.samples
!= 0 ? 8 : 4;
1969 params
.x0
= ROUND_DOWN_TO(params
.x0
, x_align
) * 2;
1970 params
.y0
= ROUND_DOWN_TO(params
.y0
, y_align
) / 2;
1971 params
.x1
= ALIGN(params
.x1
, x_align
) * 2;
1972 params
.y1
= ALIGN(params
.y1
, y_align
) / 2;
1974 /* Retile the surface to Y-tiled */
1975 surf_retile_w_to_y(brw
, ¶ms
.dst
);
1977 wm_prog_key
.dst_tiled_w
= true;
1978 wm_prog_key
.use_kill
= true;
1980 if (params
.dst
.surf
.samples
> 1) {
1981 /* If the destination surface is a W-tiled multisampled stencil
1982 * buffer that we're mapping as Y tiled, then we need to arrange for
1983 * the WM program to run once per sample rather than once per pixel,
1984 * because the memory layout of related samples doesn't match between
1987 wm_prog_key
.persample_msaa_dispatch
= true;
1991 if (brw
->gen
< 8 && params
.src
.surf
.tiling
== ISL_TILING_W
) {
1992 /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled.
1993 * Broadwell adds support for sampling from stencil.
1995 * See the comments above concerning x/y offset alignment for the
1996 * destination surface.
1998 * TODO: what if this makes the texture size too large?
2000 surf_retile_w_to_y(brw
, ¶ms
.src
);
2002 wm_prog_key
.src_tiled_w
= true;
2005 /* tex_samples and rt_samples are the sample counts that are set up in
2008 wm_prog_key
.tex_samples
= params
.src
.surf
.samples
;
2009 wm_prog_key
.rt_samples
= params
.dst
.surf
.samples
;
2011 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
2012 * use to access the source and destination surfaces.
2014 wm_prog_key
.tex_layout
= params
.src
.surf
.msaa_layout
;
2015 wm_prog_key
.rt_layout
= params
.dst
.surf
.msaa_layout
;
2017 if (params
.src
.surf
.samples
> 0 && params
.dst
.surf
.samples
> 1) {
2018 /* We are blitting from a multisample buffer to a multisample buffer, so
2019 * we must preserve samples within a pixel. This means we have to
2020 * arrange for the WM program to run once per sample rather than once
2023 wm_prog_key
.persample_msaa_dispatch
= true;
2026 brw_blorp_get_blit_kernel(brw
, ¶ms
, &wm_prog_key
);
2028 for (unsigned i
= 0; i
< 4; i
++) {
2029 params
.src
.view
.channel_select
[i
] =
2030 swizzle_to_scs(GET_SWZ(src_swizzle
, i
));
2033 brw_blorp_exec(brw
, ¶ms
);
2035 intel_miptree_slice_set_needs_hiz_resolve(dst_mt
, dst_level
, dst_layer
);
2037 if (intel_miptree_is_lossless_compressed(brw
, dst_mt
))
2038 dst_mt
->fast_clear_state
= INTEL_FAST_CLEAR_STATE_UNRESOLVED
;