2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
33 #include "brw_eu_defines.h"
36 #include "util/ralloc.h"
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
46 gen6_resolve_implied_move(struct brw_codegen
*p
,
50 const struct gen_device_info
*devinfo
= p
->devinfo
;
54 if (src
->file
== BRW_MESSAGE_REGISTER_FILE
)
57 if (src
->file
!= BRW_ARCHITECTURE_REGISTER_FILE
|| src
->nr
!= BRW_ARF_NULL
) {
58 assert(devinfo
->gen
< 12);
59 brw_push_insn_state(p
);
60 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
61 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
62 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
63 brw_MOV(p
, retype(brw_message_reg(msg_reg_nr
), BRW_REGISTER_TYPE_UD
),
64 retype(*src
, BRW_REGISTER_TYPE_UD
));
65 brw_pop_insn_state(p
);
67 *src
= brw_message_reg(msg_reg_nr
);
71 gen7_convert_mrf_to_grf(struct brw_codegen
*p
, struct brw_reg
*reg
)
73 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74 * "The send with EOT should use register space R112-R127 for <src>. This is
75 * to enable loading of a new thread into the same slot while the message
76 * with EOT for current thread is pending dispatch."
78 * Since we're pretending to have 16 MRFs anyway, we may as well use the
79 * registers required for messages with EOT.
81 const struct gen_device_info
*devinfo
= p
->devinfo
;
82 if (devinfo
->gen
>= 7 && reg
->file
== BRW_MESSAGE_REGISTER_FILE
) {
83 reg
->file
= BRW_GENERAL_REGISTER_FILE
;
84 reg
->nr
+= GEN7_MRF_HACK_START
;
89 brw_set_dest(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg dest
)
91 const struct gen_device_info
*devinfo
= p
->devinfo
;
93 if (dest
.file
== BRW_MESSAGE_REGISTER_FILE
)
94 assert((dest
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
95 else if (dest
.file
== BRW_GENERAL_REGISTER_FILE
)
96 assert(dest
.nr
< 128);
98 /* The hardware has a restriction where if the destination is Byte,
99 * the instruction needs to have a stride of 2 (except for packed byte
100 * MOV). This seems to be required even if the destination is the NULL
103 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
104 dest
.nr
== BRW_ARF_NULL
&&
105 type_sz(dest
.type
) == 1) {
106 dest
.hstride
= BRW_HORIZONTAL_STRIDE_2
;
109 gen7_convert_mrf_to_grf(p
, &dest
);
111 if (devinfo
->gen
>= 12 &&
112 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
113 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
)) {
114 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
115 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
116 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
117 assert(dest
.subnr
== 0);
118 assert(brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
||
119 (dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
120 dest
.vstride
== dest
.width
+ 1));
121 assert(!dest
.negate
&& !dest
.abs
);
122 brw_inst_set_dst_reg_file(devinfo
, inst
, dest
.file
);
123 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
125 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
126 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
) {
127 assert(devinfo
->gen
< 12);
128 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
129 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
130 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
131 assert(dest
.subnr
% 16 == 0);
132 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
133 dest
.vstride
== dest
.width
+ 1);
134 assert(!dest
.negate
&& !dest
.abs
);
135 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
136 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
137 brw_inst_set_send_dst_reg_file(devinfo
, inst
, dest
.file
);
139 brw_inst_set_dst_file_type(devinfo
, inst
, dest
.file
, dest
.type
);
140 brw_inst_set_dst_address_mode(devinfo
, inst
, dest
.address_mode
);
142 if (dest
.address_mode
== BRW_ADDRESS_DIRECT
) {
143 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
145 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
146 brw_inst_set_dst_da1_subreg_nr(devinfo
, inst
, dest
.subnr
);
147 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
148 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
149 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
151 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
152 brw_inst_set_da16_writemask(devinfo
, inst
, dest
.writemask
);
153 if (dest
.file
== BRW_GENERAL_REGISTER_FILE
||
154 dest
.file
== BRW_MESSAGE_REGISTER_FILE
) {
155 assert(dest
.writemask
!= 0);
157 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
158 * Although Dst.HorzStride is a don't care for Align16, HW needs
159 * this to be programmed as "01".
161 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
164 brw_inst_set_dst_ia_subreg_nr(devinfo
, inst
, dest
.subnr
);
166 /* These are different sizes in align1 vs align16:
168 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
169 brw_inst_set_dst_ia1_addr_imm(devinfo
, inst
,
170 dest
.indirect_offset
);
171 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
172 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
173 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
175 brw_inst_set_dst_ia16_addr_imm(devinfo
, inst
,
176 dest
.indirect_offset
);
177 /* even ignored in da16, still need to set as '01' */
178 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
183 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
184 * or 16 (SIMD16), as that's normally correct. However, when dealing with
185 * small registers, it can be useful for us to automatically reduce it to
186 * match the register size.
188 if (p
->automatic_exec_sizes
) {
190 * In platforms that support fp64 we can emit instructions with a width
191 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
192 * these cases we need to make sure that these instructions have their
193 * exec sizes set properly when they are emitted and we can't rely on
194 * this code to fix it.
197 if (devinfo
->gen
>= 6)
198 fix_exec_size
= dest
.width
< BRW_EXECUTE_4
;
200 fix_exec_size
= dest
.width
< BRW_EXECUTE_8
;
203 brw_inst_set_exec_size(devinfo
, inst
, dest
.width
);
208 brw_set_src0(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
210 const struct gen_device_info
*devinfo
= p
->devinfo
;
212 if (reg
.file
== BRW_MESSAGE_REGISTER_FILE
)
213 assert((reg
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
214 else if (reg
.file
== BRW_GENERAL_REGISTER_FILE
)
215 assert(reg
.nr
< 128);
217 gen7_convert_mrf_to_grf(p
, ®
);
219 if (devinfo
->gen
>= 6 &&
220 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
221 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
||
222 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
223 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
)) {
224 /* Any source modifiers or regions will be ignored, since this just
225 * identifies the MRF/GRF to start reading the message contents from.
226 * Check for some likely failures.
230 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
233 if (devinfo
->gen
>= 12 &&
234 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
235 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
)) {
236 assert(reg
.file
!= BRW_IMMEDIATE_VALUE
);
237 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
238 assert(reg
.subnr
== 0);
239 assert(brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
||
240 (reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
241 reg
.vstride
== reg
.width
+ 1));
242 assert(!reg
.negate
&& !reg
.abs
);
243 brw_inst_set_send_src0_reg_file(devinfo
, inst
, reg
.file
);
244 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
246 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
247 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
) {
248 assert(reg
.file
== BRW_GENERAL_REGISTER_FILE
);
249 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
250 assert(reg
.subnr
% 16 == 0);
251 assert(reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
252 reg
.vstride
== reg
.width
+ 1);
253 assert(!reg
.negate
&& !reg
.abs
);
254 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
255 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
257 brw_inst_set_src0_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
258 brw_inst_set_src0_abs(devinfo
, inst
, reg
.abs
);
259 brw_inst_set_src0_negate(devinfo
, inst
, reg
.negate
);
260 brw_inst_set_src0_address_mode(devinfo
, inst
, reg
.address_mode
);
262 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
263 if (reg
.type
== BRW_REGISTER_TYPE_DF
||
264 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_DIM
)
265 brw_inst_set_imm_df(devinfo
, inst
, reg
.df
);
266 else if (reg
.type
== BRW_REGISTER_TYPE_UQ
||
267 reg
.type
== BRW_REGISTER_TYPE_Q
)
268 brw_inst_set_imm_uq(devinfo
, inst
, reg
.u64
);
270 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
272 if (devinfo
->gen
< 12 && type_sz(reg
.type
) < 8) {
273 brw_inst_set_src1_reg_file(devinfo
, inst
,
274 BRW_ARCHITECTURE_REGISTER_FILE
);
275 brw_inst_set_src1_reg_hw_type(devinfo
, inst
,
276 brw_inst_src0_reg_hw_type(devinfo
, inst
));
279 if (reg
.address_mode
== BRW_ADDRESS_DIRECT
) {
280 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
281 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
282 brw_inst_set_src0_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
284 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
287 brw_inst_set_src0_ia_subreg_nr(devinfo
, inst
, reg
.subnr
);
289 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
290 brw_inst_set_src0_ia1_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
292 brw_inst_set_src0_ia16_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
296 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
297 if (reg
.width
== BRW_WIDTH_1
&&
298 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
299 brw_inst_set_src0_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
300 brw_inst_set_src0_width(devinfo
, inst
, BRW_WIDTH_1
);
301 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
303 brw_inst_set_src0_hstride(devinfo
, inst
, reg
.hstride
);
304 brw_inst_set_src0_width(devinfo
, inst
, reg
.width
);
305 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
308 brw_inst_set_src0_da16_swiz_x(devinfo
, inst
,
309 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
310 brw_inst_set_src0_da16_swiz_y(devinfo
, inst
,
311 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
312 brw_inst_set_src0_da16_swiz_z(devinfo
, inst
,
313 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
314 brw_inst_set_src0_da16_swiz_w(devinfo
, inst
,
315 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
317 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
318 /* This is an oddity of the fact we're using the same
319 * descriptions for registers in align_16 as align_1:
321 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
322 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
323 reg
.type
== BRW_REGISTER_TYPE_DF
&&
324 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
327 * "For Align16 access mode, only encodings of 0000 and 0011
328 * are allowed. Other codes are reserved."
330 * Presumably the DevSNB behavior applies to IVB as well.
332 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
334 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
343 brw_set_src1(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
345 const struct gen_device_info
*devinfo
= p
->devinfo
;
347 if (reg
.file
== BRW_GENERAL_REGISTER_FILE
)
348 assert(reg
.nr
< 128);
350 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
351 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
||
352 (devinfo
->gen
>= 12 &&
353 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
354 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
))) {
355 assert(reg
.file
== BRW_GENERAL_REGISTER_FILE
||
356 reg
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
357 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
358 assert(reg
.subnr
== 0);
359 assert(brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
||
360 (reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
361 reg
.vstride
== reg
.width
+ 1));
362 assert(!reg
.negate
&& !reg
.abs
);
363 brw_inst_set_send_src1_reg_nr(devinfo
, inst
, reg
.nr
);
364 brw_inst_set_send_src1_reg_file(devinfo
, inst
, reg
.file
);
366 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
368 * "Accumulator registers may be accessed explicitly as src0
371 assert(reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
372 reg
.nr
!= BRW_ARF_ACCUMULATOR
);
374 gen7_convert_mrf_to_grf(p
, ®
);
375 assert(reg
.file
!= BRW_MESSAGE_REGISTER_FILE
);
377 brw_inst_set_src1_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
378 brw_inst_set_src1_abs(devinfo
, inst
, reg
.abs
);
379 brw_inst_set_src1_negate(devinfo
, inst
, reg
.negate
);
381 /* Only src1 can be immediate in two-argument instructions.
383 assert(brw_inst_src0_reg_file(devinfo
, inst
) != BRW_IMMEDIATE_VALUE
);
385 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
386 /* two-argument instructions can only use 32-bit immediates */
387 assert(type_sz(reg
.type
) < 8);
388 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
390 /* This is a hardware restriction, which may or may not be lifted
393 assert (reg
.address_mode
== BRW_ADDRESS_DIRECT
);
394 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
396 brw_inst_set_src1_da_reg_nr(devinfo
, inst
, reg
.nr
);
397 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
398 brw_inst_set_src1_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
400 brw_inst_set_src1_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
403 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
404 if (reg
.width
== BRW_WIDTH_1
&&
405 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
406 brw_inst_set_src1_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
407 brw_inst_set_src1_width(devinfo
, inst
, BRW_WIDTH_1
);
408 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
410 brw_inst_set_src1_hstride(devinfo
, inst
, reg
.hstride
);
411 brw_inst_set_src1_width(devinfo
, inst
, reg
.width
);
412 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
415 brw_inst_set_src1_da16_swiz_x(devinfo
, inst
,
416 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
417 brw_inst_set_src1_da16_swiz_y(devinfo
, inst
,
418 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
419 brw_inst_set_src1_da16_swiz_z(devinfo
, inst
,
420 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
421 brw_inst_set_src1_da16_swiz_w(devinfo
, inst
,
422 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
424 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
425 /* This is an oddity of the fact we're using the same
426 * descriptions for registers in align_16 as align_1:
428 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
429 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
430 reg
.type
== BRW_REGISTER_TYPE_DF
&&
431 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
434 * "For Align16 access mode, only encodings of 0000 and 0011
435 * are allowed. Other codes are reserved."
437 * Presumably the DevSNB behavior applies to IVB as well.
439 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
441 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
449 * Specify the descriptor and extended descriptor immediate for a SEND(C)
450 * message instruction.
453 brw_set_desc_ex(struct brw_codegen
*p
, brw_inst
*inst
,
454 unsigned desc
, unsigned ex_desc
)
456 const struct gen_device_info
*devinfo
= p
->devinfo
;
457 assert(brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
458 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
);
459 if (devinfo
->gen
< 12)
460 brw_inst_set_src1_file_type(devinfo
, inst
,
461 BRW_IMMEDIATE_VALUE
, BRW_REGISTER_TYPE_UD
);
462 brw_inst_set_send_desc(devinfo
, inst
, desc
);
463 if (devinfo
->gen
>= 9)
464 brw_inst_set_send_ex_desc(devinfo
, inst
, ex_desc
);
467 static void brw_set_math_message( struct brw_codegen
*p
,
470 unsigned integer_type
,
474 const struct gen_device_info
*devinfo
= p
->devinfo
;
476 unsigned response_length
;
478 /* Infer message length from the function */
480 case BRW_MATH_FUNCTION_POW
:
481 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
:
482 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER
:
483 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
491 /* Infer response length from the function */
493 case BRW_MATH_FUNCTION_SINCOS
:
494 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
502 brw_set_desc(p
, inst
, brw_message_desc(
503 devinfo
, msg_length
, response_length
, false));
505 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MATH
);
506 brw_inst_set_math_msg_function(devinfo
, inst
, function
);
507 brw_inst_set_math_msg_signed_int(devinfo
, inst
, integer_type
);
508 brw_inst_set_math_msg_precision(devinfo
, inst
, low_precision
);
509 brw_inst_set_math_msg_saturate(devinfo
, inst
, brw_inst_saturate(devinfo
, inst
));
510 brw_inst_set_math_msg_data_type(devinfo
, inst
, dataType
);
511 brw_inst_set_saturate(devinfo
, inst
, 0);
515 static void brw_set_ff_sync_message(struct brw_codegen
*p
,
518 unsigned response_length
,
521 const struct gen_device_info
*devinfo
= p
->devinfo
;
523 brw_set_desc(p
, insn
, brw_message_desc(
524 devinfo
, 1, response_length
, true));
526 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
527 brw_inst_set_eot(devinfo
, insn
, end_of_thread
);
528 brw_inst_set_urb_opcode(devinfo
, insn
, 1); /* FF_SYNC */
529 brw_inst_set_urb_allocate(devinfo
, insn
, allocate
);
530 /* The following fields are not used by FF_SYNC: */
531 brw_inst_set_urb_global_offset(devinfo
, insn
, 0);
532 brw_inst_set_urb_swizzle_control(devinfo
, insn
, 0);
533 brw_inst_set_urb_used(devinfo
, insn
, 0);
534 brw_inst_set_urb_complete(devinfo
, insn
, 0);
537 static void brw_set_urb_message( struct brw_codegen
*p
,
539 enum brw_urb_write_flags flags
,
541 unsigned response_length
,
543 unsigned swizzle_control
)
545 const struct gen_device_info
*devinfo
= p
->devinfo
;
547 assert(devinfo
->gen
< 7 || swizzle_control
!= BRW_URB_SWIZZLE_TRANSPOSE
);
548 assert(devinfo
->gen
< 7 || !(flags
& BRW_URB_WRITE_ALLOCATE
));
549 assert(devinfo
->gen
>= 7 || !(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
551 brw_set_desc(p
, insn
, brw_message_desc(
552 devinfo
, msg_length
, response_length
, true));
554 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
555 brw_inst_set_eot(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_EOT
));
557 if (flags
& BRW_URB_WRITE_OWORD
) {
558 assert(msg_length
== 2); /* header + one OWORD of data */
559 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_OWORD
);
561 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_HWORD
);
564 brw_inst_set_urb_global_offset(devinfo
, insn
, offset
);
565 brw_inst_set_urb_swizzle_control(devinfo
, insn
, swizzle_control
);
567 if (devinfo
->gen
< 8) {
568 brw_inst_set_urb_complete(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_COMPLETE
));
571 if (devinfo
->gen
< 7) {
572 brw_inst_set_urb_allocate(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_ALLOCATE
));
573 brw_inst_set_urb_used(devinfo
, insn
, !(flags
& BRW_URB_WRITE_UNUSED
));
575 brw_inst_set_urb_per_slot_offset(devinfo
, insn
,
576 !!(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
581 gen7_set_dp_scratch_message(struct brw_codegen
*p
,
585 bool invalidate_after_read
,
587 unsigned addr_offset
,
592 const struct gen_device_info
*devinfo
= p
->devinfo
;
593 assert(num_regs
== 1 || num_regs
== 2 || num_regs
== 4 ||
594 (devinfo
->gen
>= 8 && num_regs
== 8));
595 const unsigned block_size
= (devinfo
->gen
>= 8 ? _mesa_logbase2(num_regs
) :
598 brw_set_desc(p
, inst
, brw_message_desc(
599 devinfo
, mlen
, rlen
, header_present
));
601 brw_inst_set_sfid(devinfo
, inst
, GEN7_SFID_DATAPORT_DATA_CACHE
);
602 brw_inst_set_dp_category(devinfo
, inst
, 1); /* Scratch Block Read/Write msgs */
603 brw_inst_set_scratch_read_write(devinfo
, inst
, write
);
604 brw_inst_set_scratch_type(devinfo
, inst
, dword
);
605 brw_inst_set_scratch_invalidate_after_read(devinfo
, inst
, invalidate_after_read
);
606 brw_inst_set_scratch_block_size(devinfo
, inst
, block_size
);
607 brw_inst_set_scratch_addr_offset(devinfo
, inst
, addr_offset
);
611 brw_inst_set_state(const struct gen_device_info
*devinfo
,
613 const struct brw_insn_state
*state
)
615 brw_inst_set_exec_size(devinfo
, insn
, state
->exec_size
);
616 brw_inst_set_group(devinfo
, insn
, state
->group
);
617 brw_inst_set_compression(devinfo
, insn
, state
->compressed
);
618 brw_inst_set_access_mode(devinfo
, insn
, state
->access_mode
);
619 brw_inst_set_mask_control(devinfo
, insn
, state
->mask_control
);
620 if (devinfo
->gen
>= 12)
621 brw_inst_set_swsb(devinfo
, insn
, tgl_swsb_encode(state
->swsb
));
622 brw_inst_set_saturate(devinfo
, insn
, state
->saturate
);
623 brw_inst_set_pred_control(devinfo
, insn
, state
->predicate
);
624 brw_inst_set_pred_inv(devinfo
, insn
, state
->pred_inv
);
626 if (is_3src(devinfo
, brw_inst_opcode(devinfo
, insn
)) &&
627 state
->access_mode
== BRW_ALIGN_16
) {
628 brw_inst_set_3src_a16_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
629 if (devinfo
->gen
>= 7)
630 brw_inst_set_3src_a16_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
632 brw_inst_set_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
633 if (devinfo
->gen
>= 7)
634 brw_inst_set_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
637 if (devinfo
->gen
>= 6)
638 brw_inst_set_acc_wr_control(devinfo
, insn
, state
->acc_wr_control
);
641 #define next_insn brw_next_insn
643 brw_next_insn(struct brw_codegen
*p
, unsigned opcode
)
645 const struct gen_device_info
*devinfo
= p
->devinfo
;
648 if (p
->nr_insn
+ 1 > p
->store_size
) {
650 p
->store
= reralloc(p
->mem_ctx
, p
->store
, brw_inst
, p
->store_size
);
653 p
->next_insn_offset
+= 16;
654 insn
= &p
->store
[p
->nr_insn
++];
656 memset(insn
, 0, sizeof(*insn
));
657 brw_inst_set_opcode(devinfo
, insn
, opcode
);
659 /* Apply the default instruction state */
660 brw_inst_set_state(devinfo
, insn
, p
->current
);
666 brw_alu1(struct brw_codegen
*p
, unsigned opcode
,
667 struct brw_reg dest
, struct brw_reg src
)
669 brw_inst
*insn
= next_insn(p
, opcode
);
670 brw_set_dest(p
, insn
, dest
);
671 brw_set_src0(p
, insn
, src
);
676 brw_alu2(struct brw_codegen
*p
, unsigned opcode
,
677 struct brw_reg dest
, struct brw_reg src0
, struct brw_reg src1
)
679 /* 64-bit immediates are only supported on 1-src instructions */
680 assert(src0
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src0
.type
) <= 4);
681 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src1
.type
) <= 4);
683 brw_inst
*insn
= next_insn(p
, opcode
);
684 brw_set_dest(p
, insn
, dest
);
685 brw_set_src0(p
, insn
, src0
);
686 brw_set_src1(p
, insn
, src1
);
691 get_3src_subreg_nr(struct brw_reg reg
)
693 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
694 * use 32-bit units (components 0..7). Since they only support F/D/UD
695 * types, this doesn't lose any flexibility, but uses fewer bits.
697 return reg
.subnr
/ 4;
700 static enum gen10_align1_3src_vertical_stride
701 to_3src_align1_vstride(const struct gen_device_info
*devinfo
,
702 enum brw_vertical_stride vstride
)
705 case BRW_VERTICAL_STRIDE_0
:
706 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0
;
707 case BRW_VERTICAL_STRIDE_1
:
708 assert(devinfo
->gen
>= 12);
709 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1
;
710 case BRW_VERTICAL_STRIDE_2
:
711 assert(devinfo
->gen
< 12);
712 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2
;
713 case BRW_VERTICAL_STRIDE_4
:
714 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4
;
715 case BRW_VERTICAL_STRIDE_8
:
716 case BRW_VERTICAL_STRIDE_16
:
717 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8
;
719 unreachable("invalid vstride");
724 static enum gen10_align1_3src_src_horizontal_stride
725 to_3src_align1_hstride(enum brw_horizontal_stride hstride
)
728 case BRW_HORIZONTAL_STRIDE_0
:
729 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0
;
730 case BRW_HORIZONTAL_STRIDE_1
:
731 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1
;
732 case BRW_HORIZONTAL_STRIDE_2
:
733 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2
;
734 case BRW_HORIZONTAL_STRIDE_4
:
735 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4
;
737 unreachable("invalid hstride");
742 brw_alu3(struct brw_codegen
*p
, unsigned opcode
, struct brw_reg dest
,
743 struct brw_reg src0
, struct brw_reg src1
, struct brw_reg src2
)
745 const struct gen_device_info
*devinfo
= p
->devinfo
;
746 brw_inst
*inst
= next_insn(p
, opcode
);
748 gen7_convert_mrf_to_grf(p
, &dest
);
750 assert(dest
.nr
< 128);
752 if (devinfo
->gen
>= 10)
753 assert(!(src0
.file
== BRW_IMMEDIATE_VALUE
&&
754 src2
.file
== BRW_IMMEDIATE_VALUE
));
756 assert(src0
.file
== BRW_IMMEDIATE_VALUE
|| src0
.nr
< 128);
757 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
&& src1
.nr
< 128);
758 assert(src2
.file
== BRW_IMMEDIATE_VALUE
|| src2
.nr
< 128);
759 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
760 assert(src0
.address_mode
== BRW_ADDRESS_DIRECT
);
761 assert(src1
.address_mode
== BRW_ADDRESS_DIRECT
);
762 assert(src2
.address_mode
== BRW_ADDRESS_DIRECT
);
764 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
765 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
766 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
768 if (devinfo
->gen
>= 12) {
769 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
, dest
.file
);
770 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
772 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
773 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
774 BRW_ALIGN1_3SRC_ACCUMULATOR
);
775 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
777 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
778 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
);
779 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
782 brw_inst_set_3src_a1_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 8);
784 brw_inst_set_3src_a1_dst_hstride(devinfo
, inst
, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1
);
786 if (brw_reg_type_is_floating_point(dest
.type
)) {
787 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
788 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT
);
790 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
791 BRW_ALIGN1_3SRC_EXEC_TYPE_INT
);
794 brw_inst_set_3src_a1_dst_type(devinfo
, inst
, dest
.type
);
795 brw_inst_set_3src_a1_src0_type(devinfo
, inst
, src0
.type
);
796 brw_inst_set_3src_a1_src1_type(devinfo
, inst
, src1
.type
);
797 brw_inst_set_3src_a1_src2_type(devinfo
, inst
, src2
.type
);
799 if (src0
.file
== BRW_IMMEDIATE_VALUE
) {
800 brw_inst_set_3src_a1_src0_imm(devinfo
, inst
, src0
.ud
);
802 brw_inst_set_3src_a1_src0_vstride(
803 devinfo
, inst
, to_3src_align1_vstride(devinfo
, src0
.vstride
));
804 brw_inst_set_3src_a1_src0_hstride(devinfo
, inst
,
805 to_3src_align1_hstride(src0
.hstride
));
806 brw_inst_set_3src_a1_src0_subreg_nr(devinfo
, inst
, src0
.subnr
);
807 if (src0
.type
== BRW_REGISTER_TYPE_NF
) {
808 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
810 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
812 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
813 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
815 brw_inst_set_3src_a1_src1_vstride(
816 devinfo
, inst
, to_3src_align1_vstride(devinfo
, src1
.vstride
));
817 brw_inst_set_3src_a1_src1_hstride(devinfo
, inst
,
818 to_3src_align1_hstride(src1
.hstride
));
820 brw_inst_set_3src_a1_src1_subreg_nr(devinfo
, inst
, src1
.subnr
);
821 if (src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
822 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
824 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
826 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
827 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
829 if (src2
.file
== BRW_IMMEDIATE_VALUE
) {
830 brw_inst_set_3src_a1_src2_imm(devinfo
, inst
, src2
.ud
);
832 brw_inst_set_3src_a1_src2_hstride(devinfo
, inst
,
833 to_3src_align1_hstride(src2
.hstride
));
834 /* no vstride on src2 */
835 brw_inst_set_3src_a1_src2_subreg_nr(devinfo
, inst
, src2
.subnr
);
836 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
837 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
838 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
841 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
||
842 src0
.file
== BRW_IMMEDIATE_VALUE
||
843 (src0
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
844 src0
.type
== BRW_REGISTER_TYPE_NF
));
845 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
846 src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
847 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
||
848 src2
.file
== BRW_IMMEDIATE_VALUE
);
850 if (devinfo
->gen
>= 12) {
851 brw_inst_set_3src_a1_src0_reg_file(devinfo
, inst
, src0
.file
);
852 brw_inst_set_3src_a1_src1_reg_file(devinfo
, inst
, src1
.file
);
853 brw_inst_set_3src_a1_src2_reg_file(devinfo
, inst
, src2
.file
);
855 brw_inst_set_3src_a1_src0_reg_file(devinfo
, inst
,
856 src0
.file
== BRW_GENERAL_REGISTER_FILE
?
857 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
858 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
859 brw_inst_set_3src_a1_src1_reg_file(devinfo
, inst
,
860 src1
.file
== BRW_GENERAL_REGISTER_FILE
?
861 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
862 BRW_ALIGN1_3SRC_ACCUMULATOR
);
863 brw_inst_set_3src_a1_src2_reg_file(devinfo
, inst
,
864 src2
.file
== BRW_GENERAL_REGISTER_FILE
?
865 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
866 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
870 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
871 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
872 assert(dest
.type
== BRW_REGISTER_TYPE_F
||
873 dest
.type
== BRW_REGISTER_TYPE_DF
||
874 dest
.type
== BRW_REGISTER_TYPE_D
||
875 dest
.type
== BRW_REGISTER_TYPE_UD
||
876 (dest
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 8));
877 if (devinfo
->gen
== 6) {
878 brw_inst_set_3src_a16_dst_reg_file(devinfo
, inst
,
879 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
881 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
882 brw_inst_set_3src_a16_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
883 brw_inst_set_3src_a16_dst_writemask(devinfo
, inst
, dest
.writemask
);
885 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
);
886 brw_inst_set_3src_a16_src0_swizzle(devinfo
, inst
, src0
.swizzle
);
887 brw_inst_set_3src_a16_src0_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src0
));
888 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
889 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
890 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
891 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo
, inst
,
892 src0
.vstride
== BRW_VERTICAL_STRIDE_0
);
894 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
);
895 brw_inst_set_3src_a16_src1_swizzle(devinfo
, inst
, src1
.swizzle
);
896 brw_inst_set_3src_a16_src1_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src1
));
897 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
898 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
899 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
900 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo
, inst
,
901 src1
.vstride
== BRW_VERTICAL_STRIDE_0
);
903 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
);
904 brw_inst_set_3src_a16_src2_swizzle(devinfo
, inst
, src2
.swizzle
);
905 brw_inst_set_3src_a16_src2_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src2
));
906 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
907 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
908 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
909 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo
, inst
,
910 src2
.vstride
== BRW_VERTICAL_STRIDE_0
);
912 if (devinfo
->gen
>= 7) {
913 /* Set both the source and destination types based on dest.type,
914 * ignoring the source register types. The MAD and LRP emitters ensure
915 * that all four types are float. The BFE and BFI2 emitters, however,
916 * may send us mixed D and UD types and want us to ignore that and use
917 * the destination type.
919 brw_inst_set_3src_a16_src_type(devinfo
, inst
, dest
.type
);
920 brw_inst_set_3src_a16_dst_type(devinfo
, inst
, dest
.type
);
922 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
924 * "Three source instructions can use operands with mixed-mode
925 * precision. When SrcType field is set to :f or :hf it defines
926 * precision for source 0 only, and fields Src1Type and Src2Type
927 * define precision for other source operands:
929 * 0b = :f. Single precision Float (32-bit).
930 * 1b = :hf. Half precision Float (16-bit)."
932 if (src1
.type
== BRW_REGISTER_TYPE_HF
)
933 brw_inst_set_3src_a16_src1_type(devinfo
, inst
, 1);
935 if (src2
.type
== BRW_REGISTER_TYPE_HF
)
936 brw_inst_set_3src_a16_src2_type(devinfo
, inst
, 1);
/***********************************************************************
 * Convenience routines.
 */
/* Emit a unary ALU instruction brw_<OP>(p, dest, src0). */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}
/* Emit a binary ALU instruction brw_<OP>(p, dest, src0, src1). */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}
/* Emit a three-source ALU instruction brw_<OP>(p, dest, src0, src1, src2).
 *
 * In Align16 mode a scalar source (vstride == 0) must use an XXXX swizzle
 * to replicate its single channel, so force that here before emitting.
 */
#define ALU3(OP)                                                        \
brw_inst *brw_##OP(struct brw_codegen *p,                               \
                   struct brw_reg dest,                                 \
                   struct brw_reg src0,                                 \
                   struct brw_reg src1,                                 \
                   struct brw_reg src2)                                 \
{                                                                       \
   if (p->current->access_mode == BRW_ALIGN_16) {                       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src0.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src1.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src2.swizzle = BRW_SWIZZLE_XXXX;                               \
   }                                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);         \
}
/* Like ALU3, but for float-only three-source instructions: asserts that all
 * operand types are consistently F or consistently DF before emitting.
 */
#define ALU3F(OP)                                                       \
brw_inst *brw_##OP(struct brw_codegen *p,                               \
                   struct brw_reg dest,                                 \
                   struct brw_reg src0,                                 \
                   struct brw_reg src1,                                 \
                   struct brw_reg src2)                                 \
{                                                                       \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                           \
          dest.type == BRW_REGISTER_TYPE_DF);                           \
   if (dest.type == BRW_REGISTER_TYPE_F) {                              \
      assert(src0.type == BRW_REGISTER_TYPE_F);                         \
      assert(src1.type == BRW_REGISTER_TYPE_F);                         \
      assert(src2.type == BRW_REGISTER_TYPE_F);                         \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                        \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                        \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                        \
   }                                                                    \
                                                                        \
   if (p->current->access_mode == BRW_ALIGN_16) {                       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src0.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src1.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src2.swizzle = BRW_SWIZZLE_XXXX;                               \
   }                                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);         \
}
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                       \
void brw_##OP(struct brw_codegen *p,                                    \
              struct brw_reg dest,                                      \
              struct brw_reg src)                                       \
{                                                                       \
   const struct gen_device_info *devinfo = p->devinfo;                  \
   brw_inst *rnd, *add;                                                 \
   rnd = next_insn(p, BRW_OPCODE_##OP);                                 \
   brw_set_dest(p, rnd, dest);                                          \
   brw_set_src0(p, rnd, src);                                           \
                                                                        \
   if (devinfo->gen < 6) {                                              \
      /* turn on round-increments */                                    \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                    \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);    \
   }                                                                    \
}
1076 brw_MOV(struct brw_codegen
*p
, struct brw_reg dest
, struct brw_reg src0
)
1078 const struct gen_device_info
*devinfo
= p
->devinfo
;
1080 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1081 * To avoid the problems that causes, we use an <X,2,0> source region to
1082 * read each element twice.
1084 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
1085 brw_get_default_access_mode(p
) == BRW_ALIGN_1
&&
1086 dest
.type
== BRW_REGISTER_TYPE_DF
&&
1087 (src0
.type
== BRW_REGISTER_TYPE_F
||
1088 src0
.type
== BRW_REGISTER_TYPE_D
||
1089 src0
.type
== BRW_REGISTER_TYPE_UD
) &&
1090 !has_scalar_region(src0
)) {
1091 assert(src0
.vstride
== src0
.width
+ src0
.hstride
);
1092 src0
.vstride
= src0
.hstride
;
1093 src0
.width
= BRW_WIDTH_2
;
1094 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1097 return brw_alu1(p
, BRW_OPCODE_MOV
, dest
, src0
);
1101 brw_ADD(struct brw_codegen
*p
, struct brw_reg dest
,
1102 struct brw_reg src0
, struct brw_reg src1
)
1105 if (src0
.type
== BRW_REGISTER_TYPE_F
||
1106 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
1107 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
1108 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
1109 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
1112 if (src1
.type
== BRW_REGISTER_TYPE_F
||
1113 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
1114 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
1115 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
1116 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
1119 return brw_alu2(p
, BRW_OPCODE_ADD
, dest
, src0
, src1
);
1123 brw_AVG(struct brw_codegen
*p
, struct brw_reg dest
,
1124 struct brw_reg src0
, struct brw_reg src1
)
1126 assert(dest
.type
== src0
.type
);
1127 assert(src0
.type
== src1
.type
);
1128 switch (src0
.type
) {
1129 case BRW_REGISTER_TYPE_B
:
1130 case BRW_REGISTER_TYPE_UB
:
1131 case BRW_REGISTER_TYPE_W
:
1132 case BRW_REGISTER_TYPE_UW
:
1133 case BRW_REGISTER_TYPE_D
:
1134 case BRW_REGISTER_TYPE_UD
:
1137 unreachable("Bad type for brw_AVG");
1140 return brw_alu2(p
, BRW_OPCODE_AVG
, dest
, src0
, src1
);
1144 brw_MUL(struct brw_codegen
*p
, struct brw_reg dest
,
1145 struct brw_reg src0
, struct brw_reg src1
)
1148 if (src0
.type
== BRW_REGISTER_TYPE_D
||
1149 src0
.type
== BRW_REGISTER_TYPE_UD
||
1150 src1
.type
== BRW_REGISTER_TYPE_D
||
1151 src1
.type
== BRW_REGISTER_TYPE_UD
) {
1152 assert(dest
.type
!= BRW_REGISTER_TYPE_F
);
1155 if (src0
.type
== BRW_REGISTER_TYPE_F
||
1156 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
1157 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
1158 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
1159 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
1162 if (src1
.type
== BRW_REGISTER_TYPE_F
||
1163 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
1164 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
1165 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
1166 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
1169 assert(src0
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1170 src0
.nr
!= BRW_ARF_ACCUMULATOR
);
1171 assert(src1
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1172 src1
.nr
!= BRW_ARF_ACCUMULATOR
);
1174 return brw_alu2(p
, BRW_OPCODE_MUL
, dest
, src0
, src1
);
1178 brw_LINE(struct brw_codegen
*p
, struct brw_reg dest
,
1179 struct brw_reg src0
, struct brw_reg src1
)
1181 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1182 src0
.width
= BRW_WIDTH_1
;
1183 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1184 return brw_alu2(p
, BRW_OPCODE_LINE
, dest
, src0
, src1
);
1188 brw_PLN(struct brw_codegen
*p
, struct brw_reg dest
,
1189 struct brw_reg src0
, struct brw_reg src1
)
1191 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1192 src0
.width
= BRW_WIDTH_1
;
1193 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1194 src1
.vstride
= BRW_VERTICAL_STRIDE_8
;
1195 src1
.width
= BRW_WIDTH_8
;
1196 src1
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
1197 return brw_alu2(p
, BRW_OPCODE_PLN
, dest
, src0
, src1
);
1201 brw_F32TO16(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1203 const struct gen_device_info
*devinfo
= p
->devinfo
;
1204 const bool align16
= brw_get_default_access_mode(p
) == BRW_ALIGN_16
;
1205 /* The F32TO16 instruction doesn't support 32-bit destination types in
1206 * Align1 mode, and neither does the Gen8 implementation in terms of a
1207 * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
1208 * an undocumented feature.
1210 const bool needs_zero_fill
= (dst
.type
== BRW_REGISTER_TYPE_UD
&&
1211 (!align16
|| devinfo
->gen
>= 8));
1215 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
1217 assert(dst
.type
== BRW_REGISTER_TYPE_UD
||
1218 dst
.type
== BRW_REGISTER_TYPE_W
||
1219 dst
.type
== BRW_REGISTER_TYPE_UW
||
1220 dst
.type
== BRW_REGISTER_TYPE_HF
);
1223 brw_push_insn_state(p
);
1225 if (needs_zero_fill
) {
1226 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
1227 dst
= spread(retype(dst
, BRW_REGISTER_TYPE_W
), 2);
1230 if (devinfo
->gen
>= 8) {
1231 inst
= brw_MOV(p
, retype(dst
, BRW_REGISTER_TYPE_HF
), src
);
1233 assert(devinfo
->gen
== 7);
1234 inst
= brw_alu1(p
, BRW_OPCODE_F32TO16
, dst
, src
);
1237 if (needs_zero_fill
) {
1238 if (devinfo
->gen
< 12)
1239 brw_inst_set_no_dd_clear(devinfo
, inst
, true);
1240 brw_set_default_swsb(p
, tgl_swsb_null());
1241 inst
= brw_MOV(p
, suboffset(dst
, 1), brw_imm_w(0));
1242 if (devinfo
->gen
< 12)
1243 brw_inst_set_no_dd_check(devinfo
, inst
, true);
1246 brw_pop_insn_state(p
);
1251 brw_F16TO32(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1253 const struct gen_device_info
*devinfo
= p
->devinfo
;
1254 bool align16
= brw_get_default_access_mode(p
) == BRW_ALIGN_16
;
1257 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
1259 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1261 * Because this instruction does not have a 16-bit floating-point
1262 * type, the source data type must be Word (W). The destination type
1263 * must be F (Float).
1265 if (src
.type
== BRW_REGISTER_TYPE_UD
)
1266 src
= spread(retype(src
, BRW_REGISTER_TYPE_W
), 2);
1268 assert(src
.type
== BRW_REGISTER_TYPE_W
||
1269 src
.type
== BRW_REGISTER_TYPE_UW
||
1270 src
.type
== BRW_REGISTER_TYPE_HF
);
1273 if (devinfo
->gen
>= 8) {
1274 return brw_MOV(p
, dst
, retype(src
, BRW_REGISTER_TYPE_HF
));
1276 assert(devinfo
->gen
== 7);
1277 return brw_alu1(p
, BRW_OPCODE_F16TO32
, dst
, src
);
1282 void brw_NOP(struct brw_codegen
*p
)
1284 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_NOP
);
1285 memset(insn
, 0, sizeof(*insn
));
1286 brw_inst_set_opcode(p
->devinfo
, insn
, BRW_OPCODE_NOP
);
1289 void brw_SYNC(struct brw_codegen
*p
, enum tgl_sync_function func
)
1291 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SYNC
);
1292 brw_inst_set_cond_modifier(p
->devinfo
, insn
, func
);
/***********************************************************************
 * Comparisons, if/else/endif
 */
1300 brw_JMPI(struct brw_codegen
*p
, struct brw_reg index
,
1301 unsigned predicate_control
)
1303 const struct gen_device_info
*devinfo
= p
->devinfo
;
1304 struct brw_reg ip
= brw_ip_reg();
1305 brw_inst
*inst
= brw_alu2(p
, BRW_OPCODE_JMPI
, ip
, ip
, index
);
1307 brw_inst_set_exec_size(devinfo
, inst
, BRW_EXECUTE_1
);
1308 brw_inst_set_qtr_control(devinfo
, inst
, BRW_COMPRESSION_NONE
);
1309 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
1310 brw_inst_set_pred_control(devinfo
, inst
, predicate_control
);
1316 push_if_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1318 p
->if_stack
[p
->if_stack_depth
] = inst
- p
->store
;
1320 p
->if_stack_depth
++;
1321 if (p
->if_stack_array_size
<= p
->if_stack_depth
) {
1322 p
->if_stack_array_size
*= 2;
1323 p
->if_stack
= reralloc(p
->mem_ctx
, p
->if_stack
, int,
1324 p
->if_stack_array_size
);
1329 pop_if_stack(struct brw_codegen
*p
)
1331 p
->if_stack_depth
--;
1332 return &p
->store
[p
->if_stack
[p
->if_stack_depth
]];
1336 push_loop_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1338 if (p
->loop_stack_array_size
<= (p
->loop_stack_depth
+ 1)) {
1339 p
->loop_stack_array_size
*= 2;
1340 p
->loop_stack
= reralloc(p
->mem_ctx
, p
->loop_stack
, int,
1341 p
->loop_stack_array_size
);
1342 p
->if_depth_in_loop
= reralloc(p
->mem_ctx
, p
->if_depth_in_loop
, int,
1343 p
->loop_stack_array_size
);
1346 p
->loop_stack
[p
->loop_stack_depth
] = inst
- p
->store
;
1347 p
->loop_stack_depth
++;
1348 p
->if_depth_in_loop
[p
->loop_stack_depth
] = 0;
1352 get_inner_do_insn(struct brw_codegen
*p
)
1354 return &p
->store
[p
->loop_stack
[p
->loop_stack_depth
- 1]];
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
1371 brw_IF(struct brw_codegen
*p
, unsigned execute_size
)
1373 const struct gen_device_info
*devinfo
= p
->devinfo
;
1376 insn
= next_insn(p
, BRW_OPCODE_IF
);
1378 /* Override the defaults for this instruction:
1380 if (devinfo
->gen
< 6) {
1381 brw_set_dest(p
, insn
, brw_ip_reg());
1382 brw_set_src0(p
, insn
, brw_ip_reg());
1383 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1384 } else if (devinfo
->gen
== 6) {
1385 brw_set_dest(p
, insn
, brw_imm_w(0));
1386 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1387 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1388 brw_set_src1(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1389 } else if (devinfo
->gen
== 7) {
1390 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1391 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1392 brw_set_src1(p
, insn
, brw_imm_w(0));
1393 brw_inst_set_jip(devinfo
, insn
, 0);
1394 brw_inst_set_uip(devinfo
, insn
, 0);
1396 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1397 if (devinfo
->gen
< 12)
1398 brw_set_src0(p
, insn
, brw_imm_d(0));
1399 brw_inst_set_jip(devinfo
, insn
, 0);
1400 brw_inst_set_uip(devinfo
, insn
, 0);
1403 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1404 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1405 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NORMAL
);
1406 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1407 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1408 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1410 push_if_stack(p
, insn
);
1411 p
->if_depth_in_loop
[p
->loop_stack_depth
]++;
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
1419 gen6_IF(struct brw_codegen
*p
, enum brw_conditional_mod conditional
,
1420 struct brw_reg src0
, struct brw_reg src1
)
1422 const struct gen_device_info
*devinfo
= p
->devinfo
;
1425 insn
= next_insn(p
, BRW_OPCODE_IF
);
1427 brw_set_dest(p
, insn
, brw_imm_w(0));
1428 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1429 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1430 brw_set_src0(p
, insn
, src0
);
1431 brw_set_src1(p
, insn
, src1
);
1433 assert(brw_inst_qtr_control(devinfo
, insn
) == BRW_COMPRESSION_NONE
);
1434 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
1435 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
1437 push_if_stack(p
, insn
);
/*
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
1445 convert_IF_ELSE_to_ADD(struct brw_codegen
*p
,
1446 brw_inst
*if_inst
, brw_inst
*else_inst
)
1448 const struct gen_device_info
*devinfo
= p
->devinfo
;
1450 /* The next instruction (where the ENDIF would be, if it existed) */
1451 brw_inst
*next_inst
= &p
->store
[p
->nr_insn
];
1453 assert(p
->single_program_flow
);
1454 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1455 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1456 assert(brw_inst_exec_size(devinfo
, if_inst
) == BRW_EXECUTE_1
);
1458 /* Convert IF to an ADD instruction that moves the instruction pointer
1459 * to the first instruction of the ELSE block. If there is no ELSE
1460 * block, point to where ENDIF would be. Reverse the predicate.
1462 * There's no need to execute an ENDIF since we don't need to do any
1463 * stack operations, and if we're currently executing, we just want to
1464 * continue normally.
1466 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_ADD
);
1467 brw_inst_set_pred_inv(devinfo
, if_inst
, true);
1469 if (else_inst
!= NULL
) {
1470 /* Convert ELSE to an ADD instruction that points where the ENDIF
1473 brw_inst_set_opcode(devinfo
, else_inst
, BRW_OPCODE_ADD
);
1475 brw_inst_set_imm_ud(devinfo
, if_inst
, (else_inst
- if_inst
+ 1) * 16);
1476 brw_inst_set_imm_ud(devinfo
, else_inst
, (next_inst
- else_inst
) * 16);
1478 brw_inst_set_imm_ud(devinfo
, if_inst
, (next_inst
- if_inst
) * 16);
/*
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
1486 patch_IF_ELSE(struct brw_codegen
*p
,
1487 brw_inst
*if_inst
, brw_inst
*else_inst
, brw_inst
*endif_inst
)
1489 const struct gen_device_info
*devinfo
= p
->devinfo
;
1491 /* We shouldn't be patching IF and ELSE instructions in single program flow
1492 * mode when gen < 6, because in single program flow mode on those
1493 * platforms, we convert flow control instructions to conditional ADDs that
1494 * operate on IP (see brw_ENDIF).
1496 * However, on Gen6, writing to IP doesn't work in single program flow mode
1497 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1498 * not be updated by non-flow control instructions."). And on later
1499 * platforms, there is no significant benefit to converting control flow
1500 * instructions to conditional ADDs. So we do patch IF and ELSE
1501 * instructions in single program flow mode on those platforms.
1503 if (devinfo
->gen
< 6)
1504 assert(!p
->single_program_flow
);
1506 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1507 assert(endif_inst
!= NULL
);
1508 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1510 unsigned br
= brw_jump_scale(devinfo
);
1512 assert(brw_inst_opcode(devinfo
, endif_inst
) == BRW_OPCODE_ENDIF
);
1513 brw_inst_set_exec_size(devinfo
, endif_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1515 if (else_inst
== NULL
) {
1516 /* Patch IF -> ENDIF */
1517 if (devinfo
->gen
< 6) {
1518 /* Turn it into an IFF, which means no mask stack operations for
1519 * all-false and jumping past the ENDIF.
1521 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_IFF
);
1522 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1523 br
* (endif_inst
- if_inst
+ 1));
1524 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1525 } else if (devinfo
->gen
== 6) {
1526 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1527 brw_inst_set_gen6_jump_count(devinfo
, if_inst
, br
*(endif_inst
- if_inst
));
1529 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1530 brw_inst_set_jip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1533 brw_inst_set_exec_size(devinfo
, else_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1535 /* Patch IF -> ELSE */
1536 if (devinfo
->gen
< 6) {
1537 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1538 br
* (else_inst
- if_inst
));
1539 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1540 } else if (devinfo
->gen
== 6) {
1541 brw_inst_set_gen6_jump_count(devinfo
, if_inst
,
1542 br
* (else_inst
- if_inst
+ 1));
1545 /* Patch ELSE -> ENDIF */
1546 if (devinfo
->gen
< 6) {
1547 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1550 brw_inst_set_gen4_jump_count(devinfo
, else_inst
,
1551 br
* (endif_inst
- else_inst
+ 1));
1552 brw_inst_set_gen4_pop_count(devinfo
, else_inst
, 1);
1553 } else if (devinfo
->gen
== 6) {
1554 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1555 brw_inst_set_gen6_jump_count(devinfo
, else_inst
,
1556 br
* (endif_inst
- else_inst
));
1558 /* The IF instruction's JIP should point just past the ELSE */
1559 brw_inst_set_jip(devinfo
, if_inst
, br
* (else_inst
- if_inst
+ 1));
1560 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1561 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1562 brw_inst_set_jip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1563 if (devinfo
->gen
>= 8) {
1564 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1565 * should point to ENDIF.
1567 brw_inst_set_uip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1574 brw_ELSE(struct brw_codegen
*p
)
1576 const struct gen_device_info
*devinfo
= p
->devinfo
;
1579 insn
= next_insn(p
, BRW_OPCODE_ELSE
);
1581 if (devinfo
->gen
< 6) {
1582 brw_set_dest(p
, insn
, brw_ip_reg());
1583 brw_set_src0(p
, insn
, brw_ip_reg());
1584 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1585 } else if (devinfo
->gen
== 6) {
1586 brw_set_dest(p
, insn
, brw_imm_w(0));
1587 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1588 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1589 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1590 } else if (devinfo
->gen
== 7) {
1591 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1592 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1593 brw_set_src1(p
, insn
, brw_imm_w(0));
1594 brw_inst_set_jip(devinfo
, insn
, 0);
1595 brw_inst_set_uip(devinfo
, insn
, 0);
1597 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1598 if (devinfo
->gen
< 12)
1599 brw_set_src0(p
, insn
, brw_imm_d(0));
1600 brw_inst_set_jip(devinfo
, insn
, 0);
1601 brw_inst_set_uip(devinfo
, insn
, 0);
1604 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1605 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1606 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1607 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1609 push_if_stack(p
, insn
);
1613 brw_ENDIF(struct brw_codegen
*p
)
1615 const struct gen_device_info
*devinfo
= p
->devinfo
;
1616 brw_inst
*insn
= NULL
;
1617 brw_inst
*else_inst
= NULL
;
1618 brw_inst
*if_inst
= NULL
;
1620 bool emit_endif
= true;
1622 /* In single program flow mode, we can express IF and ELSE instructions
1623 * equivalently as ADD instructions that operate on IP. On platforms prior
1624 * to Gen6, flow control instructions cause an implied thread switch, so
1625 * this is a significant savings.
1627 * However, on Gen6, writing to IP doesn't work in single program flow mode
1628 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1629 * not be updated by non-flow control instructions."). And on later
1630 * platforms, there is no significant benefit to converting control flow
1631 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1634 if (devinfo
->gen
< 6 && p
->single_program_flow
)
1638 * A single next_insn() may change the base address of instruction store
1639 * memory(p->store), so call it first before referencing the instruction
1640 * store pointer from an index
1643 insn
= next_insn(p
, BRW_OPCODE_ENDIF
);
1645 /* Pop the IF and (optional) ELSE instructions from the stack */
1646 p
->if_depth_in_loop
[p
->loop_stack_depth
]--;
1647 tmp
= pop_if_stack(p
);
1648 if (brw_inst_opcode(devinfo
, tmp
) == BRW_OPCODE_ELSE
) {
1650 tmp
= pop_if_stack(p
);
1655 /* ENDIF is useless; don't bother emitting it. */
1656 convert_IF_ELSE_to_ADD(p
, if_inst
, else_inst
);
1660 if (devinfo
->gen
< 6) {
1661 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1662 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1663 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1664 } else if (devinfo
->gen
== 6) {
1665 brw_set_dest(p
, insn
, brw_imm_w(0));
1666 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1667 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1668 } else if (devinfo
->gen
== 7) {
1669 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1670 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1671 brw_set_src1(p
, insn
, brw_imm_w(0));
1673 brw_set_src0(p
, insn
, brw_imm_d(0));
1676 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1677 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1678 if (devinfo
->gen
< 6)
1679 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1681 /* Also pop item off the stack in the endif instruction: */
1682 if (devinfo
->gen
< 6) {
1683 brw_inst_set_gen4_jump_count(devinfo
, insn
, 0);
1684 brw_inst_set_gen4_pop_count(devinfo
, insn
, 1);
1685 } else if (devinfo
->gen
== 6) {
1686 brw_inst_set_gen6_jump_count(devinfo
, insn
, 2);
1688 brw_inst_set_jip(devinfo
, insn
, 2);
1690 patch_IF_ELSE(p
, if_inst
, else_inst
, insn
);
1694 brw_BREAK(struct brw_codegen
*p
)
1696 const struct gen_device_info
*devinfo
= p
->devinfo
;
1699 insn
= next_insn(p
, BRW_OPCODE_BREAK
);
1700 if (devinfo
->gen
>= 8) {
1701 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1702 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1703 } else if (devinfo
->gen
>= 6) {
1704 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1705 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1706 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1708 brw_set_dest(p
, insn
, brw_ip_reg());
1709 brw_set_src0(p
, insn
, brw_ip_reg());
1710 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1711 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1712 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1714 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1715 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1721 brw_CONT(struct brw_codegen
*p
)
1723 const struct gen_device_info
*devinfo
= p
->devinfo
;
1726 insn
= next_insn(p
, BRW_OPCODE_CONTINUE
);
1727 brw_set_dest(p
, insn
, brw_ip_reg());
1728 if (devinfo
->gen
>= 8) {
1729 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1731 brw_set_src0(p
, insn
, brw_ip_reg());
1732 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1735 if (devinfo
->gen
< 6) {
1736 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1737 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1739 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1740 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1745 gen6_HALT(struct brw_codegen
*p
)
1747 const struct gen_device_info
*devinfo
= p
->devinfo
;
1750 insn
= next_insn(p
, BRW_OPCODE_HALT
);
1751 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1752 if (devinfo
->gen
< 8) {
1753 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1754 brw_set_src1(p
, insn
, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1755 } else if (devinfo
->gen
< 12) {
1756 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1759 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1760 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
1781 brw_DO(struct brw_codegen
*p
, unsigned execute_size
)
1783 const struct gen_device_info
*devinfo
= p
->devinfo
;
1785 if (devinfo
->gen
>= 6 || p
->single_program_flow
) {
1786 push_loop_stack(p
, &p
->store
[p
->nr_insn
]);
1787 return &p
->store
[p
->nr_insn
];
1789 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_DO
);
1791 push_loop_stack(p
, insn
);
1793 /* Override the defaults for this instruction:
1795 brw_set_dest(p
, insn
, brw_null_reg());
1796 brw_set_src0(p
, insn
, brw_null_reg());
1797 brw_set_src1(p
, insn
, brw_null_reg());
1799 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1800 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1801 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
);
/* For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
1815 brw_patch_break_cont(struct brw_codegen
*p
, brw_inst
*while_inst
)
1817 const struct gen_device_info
*devinfo
= p
->devinfo
;
1818 brw_inst
*do_inst
= get_inner_do_insn(p
);
1820 unsigned br
= brw_jump_scale(devinfo
);
1822 assert(devinfo
->gen
< 6);
1824 for (inst
= while_inst
- 1; inst
!= do_inst
; inst
--) {
1825 /* If the jump count is != 0, that means that this instruction has already
1826 * been patched because it's part of a loop inside of the one we're
1829 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_BREAK
&&
1830 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1831 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
*((while_inst
- inst
) + 1));
1832 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_CONTINUE
&&
1833 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1834 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
* (while_inst
- inst
));
/**
 * Emit the WHILE instruction closing the innermost DO/WHILE loop.
 *
 * Gen6+ encodes a backwards JIP to the DO; earlier gens encode a gen4-style
 * jump count and additionally require patching the BREAK/CONT instructions
 * inside the loop body now that the loop end is known.  Returns the emitted
 * instruction.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         /* Gen12 dropped the immediate src0 of WHILE. */
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         /* do_insn - insn is negative: jump backwards to the loop top. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   } else {
      if (p->single_program_flow) {
         /* Single-program-flow: the loop degenerates to a plain backwards
          * branch, emitted as an ADD to the IP register (16 bytes per
          * instruction).
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* The WHILE inherits the DO's execution size. */
         brw_inst_set_exec_size(devinfo, insn,
                                brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* The loop end is now known, so fix up BREAK/CONT jump counts. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
/**
 * Patch a previously-emitted forward JMPI (at instruction index
 * jmp_insn_idx) so that it lands on the current end of the program.
 * The JMPI must have an immediate src1.
 */
void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
   unsigned jmpi = 1;

   /* Gen5+ scales JMPI jump counts by 2. */
   if (devinfo->gen >= 5)
      jmpi = 2;

   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);

   /* Distance from the instruction after the JMPI to the end of the store. */
   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
}
/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_codegen *p,
             struct brw_reg dest,
             unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}
1953 /***********************************************************************
1954 * Helpers for the various SEND message types:
/** Extended math function, float[8].
 *
 * Pre-gen6 only: extended math is a SEND to the shared math function unit.
 * The operand's region determines whether the message operates on scalar
 * or vector data.
 */
void gen4_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned precision)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   unsigned data_type;
   if (has_scalar_region(src)) {
      data_type = BRW_MATH_DATA_SCALAR;
   } else {
      data_type = BRW_MATH_DATA_VECTOR;
   }

   assert(devinfo->gen < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   brw_inst_set_pred_control(devinfo, insn, 0);
   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        src.type == BRW_REGISTER_TYPE_D,
                        precision,
                        data_type);
}
/**
 * Extended math function on gen6+, emitted as a native MATH instruction
 * rather than a SEND.  Asserts encode the per-generation register file,
 * stride and type restrictions documented in the hardware specs.
 */
void gen6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division operates on integer types only. */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      /* Other math functions take float (or half-float on gen9+). */
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2043 * Return the right surface index to access the thread scratch space using
2044 * stateless dataport messages.
2047 brw_scratch_surface_idx(const struct brw_codegen
*p
)
2049 /* The scratch space is thread-local so IA coherency is unnecessary. */
2050 if (p
->devinfo
->gen
>= 8)
2051 return GEN8_BTI_STATELESS_NON_COHERENT
;
2053 return BRW_BTI_STATELESS
;
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   /* On gen6+ the message offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On gen6+ the message offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   {
      /* Build the message header: a copy of g0 with g0.2 holding offset. */
      brw_push_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
/**
 * Read a block of registers from the scratch buffer using the gen7+
 * dataport scratch read message (HWORD granularity).
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch base offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Build the message header: g0 with the oword offset in element 2. */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
/**
 * Emit a render-target write (SENDC on gen6+ so the write is ordered with
 * respect to previous rendering).  Returns the emitted instruction so the
 * caller can further modify it.
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
                                  msg_type, last_render_target,
                                  0 /* send_commit_msg */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
/**
 * Emit a gen9+ render-target read message (used e.g. for framebuffer
 * fetch).  The message subtype encodes SIMD16 vs SIMD8 based on the
 * current default execution size.  Returns the emitted instruction.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   /* Subtype 0 is SIMD16, 1 is SIMD8 per the RT read message encoding. */
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_dp_read_desc(devinfo, binding_table_index,
                       per_sample << 5 | msg_subtype,
                       GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                       BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == -1 means the payload is already in place (no implied
    * move needed).
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */
   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Bump the state pointer in header dword 3 by whole groups of 16
          * samplers; g0.3 holds the original pointer.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute 16 * (sampler / 16) * 16 = (sampler & ~0xf) << 4 at
       * runtime, then add it to the base state pointer from g0.3.
       */
      brw_push_insn_state(p);
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
/**
 * Emit a SEND whose descriptor may come from a register.
 *
 * An immediate descriptor is OR'ed with desc_imm and encoded directly.
 * Otherwise the descriptor is OR'ed with desc_imm into address register
 * a0.0, and the SEND takes the descriptor indirectly (src1 = a0 before
 * gen12, the reg32-desc select bit on gen12+).
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->gen >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
/**
 * Emit a split SEND (SENDS, or SEND on gen12+) with two payload sources
 * and separate regular/extended descriptors, each of which may be an
 * immediate or come indirectly from an address register (a0.0 for the
 * descriptor, a0.2 for the extended descriptor).
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (ex_desc.ud & INTEL_MASK(15, 12)) == 0) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding, so
          * we may have fallen back to an indirect extended descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
/**
 * Emit a surface-access SEND whose binding table index may come from a
 * register: a non-immediate surface index is masked to 8 bits into a0.0
 * and the message is sent with an indirect descriptor.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
/**
 * Return true if the WHILE at while_offset jumps back to or before
 * start_offset, i.e. start_offset lies inside that WHILE's loop.
 */
static bool
while_jumps_before_offset(const struct gen_device_info *devinfo,
                          brw_inst *insn, int while_offset, int start_offset)
{
   /* Convert the encoded jump count back into a byte offset. */
   int scale = 16 / brw_jump_scale(devinfo);
   int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
                               : brw_inst_jip(devinfo, insn);
   assert(jip < 0);
   return while_offset + jip * scale <= start_offset;
}
/**
 * Starting after start_offset, find the byte offset of the instruction
 * ending the current control-flow block (ENDIF/ELSE/HALT at the same
 * nesting depth, or the WHILE of the enclosing loop).  Returns 0 if no
 * block end is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct gen_device_info *devinfo = p->devinfo;
   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         /* fallthrough */
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
      default:
         break;
      }
   }

   return 0;
}
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   assert(devinfo->gen >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gen6 control flow uses jump counts patched at emit time instead. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* With no block end in sight, jump to the next instruction. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      default:
         break;
      }
   }
}
2963 void brw_ff_sync(struct brw_codegen
*p
,
2964 struct brw_reg dest
,
2965 unsigned msg_reg_nr
,
2966 struct brw_reg src0
,
2968 unsigned response_length
,
2971 const struct gen_device_info
*devinfo
= p
->devinfo
;
2974 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2976 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2977 brw_set_dest(p
, insn
, dest
);
2978 brw_set_src0(p
, insn
, src0
);
2979 brw_set_src1(p
, insn
, brw_imm_d(0));
2981 if (devinfo
->gen
< 6)
2982 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2984 brw_set_ff_sync_message(p
,
2992 * Emit the SEND instruction necessary to generate stream output data on Gen6
2993 * (for transform feedback).
2995 * If send_commit_msg is true, this is the last piece of stream output data
2996 * from this thread, so send the data as a committed write. According to the
2997 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2999 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3000 * writes are complete by sending the final write as a committed write."
3003 brw_svb_write(struct brw_codegen
*p
,
3004 struct brw_reg dest
,
3005 unsigned msg_reg_nr
,
3006 struct brw_reg src0
,
3007 unsigned binding_table_index
,
3008 bool send_commit_msg
)
3010 const struct gen_device_info
*devinfo
= p
->devinfo
;
3011 const unsigned target_cache
=
3012 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
3013 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
3014 BRW_SFID_DATAPORT_WRITE
);
3017 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
3019 insn
= next_insn(p
, BRW_OPCODE_SEND
);
3020 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
3021 brw_set_dest(p
, insn
, dest
);
3022 brw_set_src0(p
, insn
, src0
);
3023 brw_set_desc(p
, insn
,
3024 brw_message_desc(devinfo
, 1, send_commit_msg
, true) |
3025 brw_dp_write_desc(devinfo
, binding_table_index
,
3026 0, /* msg_control: ignored */
3027 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE
,
3028 0, /* last_render_target: ignored */
3029 send_commit_msg
)); /* send_commit_msg */
/* Number of response registers a surface read/atomic message of the given
 * execution size and channel count will return. An exec_size of 0 denotes
 * SIMD4x2 mode, which packs the result into a single register.
 * The codegen context 'p' is unused; kept for interface compatibility.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}
3046 brw_untyped_atomic(struct brw_codegen
*p
,
3048 struct brw_reg payload
,
3049 struct brw_reg surface
,
3051 unsigned msg_length
,
3052 bool response_expected
,
3053 bool header_present
)
3055 const struct gen_device_info
*devinfo
= p
->devinfo
;
3056 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3057 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3058 GEN7_SFID_DATAPORT_DATA_CACHE
);
3059 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3060 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3061 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
3062 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
3063 has_simd4x2
? 0 : 8;
3064 const unsigned response_length
=
3065 brw_surface_payload_size(p
, response_expected
, exec_size
);
3066 const unsigned desc
=
3067 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
3068 brw_dp_untyped_atomic_desc(devinfo
, exec_size
, atomic_op
,
3070 /* Mask out unused components -- This is especially important in Align16
3071 * mode on generations that don't have native support for SIMD4x2 atomics,
3072 * because unused but enabled components will cause the dataport to perform
3073 * additional atomic operations on the addresses that happen to be in the
3074 * uninitialized Y, Z and W coordinates of the payload.
3076 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
3078 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(dst
, mask
),
3079 payload
, surface
, desc
);
3083 brw_untyped_surface_read(struct brw_codegen
*p
,
3085 struct brw_reg payload
,
3086 struct brw_reg surface
,
3087 unsigned msg_length
,
3088 unsigned num_channels
)
3090 const struct gen_device_info
*devinfo
= p
->devinfo
;
3091 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3092 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3093 GEN7_SFID_DATAPORT_DATA_CACHE
);
3094 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3095 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) : 0;
3096 const unsigned response_length
=
3097 brw_surface_payload_size(p
, num_channels
, exec_size
);
3098 const unsigned desc
=
3099 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3100 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, false);
3102 brw_send_indirect_surface_message(p
, sfid
, dst
, payload
, surface
, desc
);
3106 brw_untyped_surface_write(struct brw_codegen
*p
,
3107 struct brw_reg payload
,
3108 struct brw_reg surface
,
3109 unsigned msg_length
,
3110 unsigned num_channels
,
3111 bool header_present
)
3113 const struct gen_device_info
*devinfo
= p
->devinfo
;
3114 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3115 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3116 GEN7_SFID_DATAPORT_DATA_CACHE
);
3117 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3118 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3119 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
3120 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
3121 has_simd4x2
? 0 : 8;
3122 const unsigned desc
=
3123 brw_message_desc(devinfo
, msg_length
, 0, header_present
) |
3124 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, true);
3125 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3126 const unsigned mask
= !has_simd4x2
&& !align1
? WRITEMASK_X
: WRITEMASK_XYZW
;
3128 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(brw_null_reg(), mask
),
3129 payload
, surface
, desc
);
3133 brw_set_memory_fence_message(struct brw_codegen
*p
,
3134 struct brw_inst
*insn
,
3135 enum brw_message_target sfid
,
3139 const struct gen_device_info
*devinfo
= p
->devinfo
;
3141 brw_set_desc(p
, insn
, brw_message_desc(
3142 devinfo
, 1, (commit_enable
? 1 : 0), true));
3144 brw_inst_set_sfid(devinfo
, insn
, sfid
);
3147 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
3148 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_RC_MEMORY_FENCE
);
3150 case GEN7_SFID_DATAPORT_DATA_CACHE
:
3151 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_DC_MEMORY_FENCE
);
3154 unreachable("Not reached");
3158 brw_inst_set_dp_msg_control(devinfo
, insn
, 1 << 5);
3160 assert(devinfo
->gen
>= 11 || bti
== 0);
3161 brw_inst_set_binding_table_index(devinfo
, insn
, bti
);
3165 brw_memory_fence(struct brw_codegen
*p
,
3168 enum opcode send_op
,
3172 const struct gen_device_info
*devinfo
= p
->devinfo
;
3173 const bool commit_enable
= stall
||
3174 devinfo
->gen
>= 10 || /* HSD ES # 1404612949 */
3175 (devinfo
->gen
== 7 && !devinfo
->is_haswell
);
3176 struct brw_inst
*insn
;
3178 brw_push_insn_state(p
);
3179 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3180 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3181 dst
= retype(vec1(dst
), BRW_REGISTER_TYPE_UW
);
3182 src
= retype(vec1(src
), BRW_REGISTER_TYPE_UD
);
3184 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3185 * message doesn't write anything back.
3187 insn
= next_insn(p
, send_op
);
3188 brw_set_dest(p
, insn
, dst
);
3189 brw_set_src0(p
, insn
, src
);
3190 brw_set_memory_fence_message(p
, insn
, GEN7_SFID_DATAPORT_DATA_CACHE
,
3191 commit_enable
, bti
);
3193 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
) {
3194 /* IVB does typed surface access through the render cache, so we need to
3195 * flush it too. Use a different register so both flushes can be
3196 * pipelined by the hardware.
3198 insn
= next_insn(p
, send_op
);
3199 brw_set_dest(p
, insn
, offset(dst
, 1));
3200 brw_set_src0(p
, insn
, src
);
3201 brw_set_memory_fence_message(p
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
,
3202 commit_enable
, bti
);
3204 /* Now write the response of the second message into the response of the
3205 * first to trigger a pipeline stall -- This way future render and data
3206 * cache messages will be properly ordered with respect to past data and
3207 * render cache messages.
3209 brw_MOV(p
, dst
, offset(dst
, 1));
3213 brw_set_default_swsb(p
, tgl_swsb_sbid(TGL_SBID_DST
,
3214 brw_get_default_swsb(p
).sbid
));
3216 brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
), dst
);
3219 brw_pop_insn_state(p
);
3223 brw_pixel_interpolator_query(struct brw_codegen
*p
,
3224 struct brw_reg dest
,
3228 struct brw_reg data
,
3229 unsigned msg_length
,
3230 unsigned response_length
)
3232 const struct gen_device_info
*devinfo
= p
->devinfo
;
3233 const uint16_t exec_size
= brw_get_default_exec_size(p
);
3234 const unsigned slot_group
= brw_get_default_group(p
) / 16;
3235 const unsigned simd_mode
= (exec_size
== BRW_EXECUTE_16
);
3236 const unsigned desc
=
3237 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3238 brw_pixel_interp_desc(devinfo
, mode
, noperspective
, simd_mode
,
3241 /* brw_send_indirect_message will automatically use a direct send message
3242 * if data is actually immediate.
3244 brw_send_indirect_message(p
,
3245 GEN7_SFID_PIXEL_INTERPOLATOR
,
3254 brw_find_live_channel(struct brw_codegen
*p
, struct brw_reg dst
,
3255 struct brw_reg mask
)
3257 const struct gen_device_info
*devinfo
= p
->devinfo
;
3258 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
3259 const unsigned qtr_control
= brw_get_default_group(p
) / 8;
3262 assert(devinfo
->gen
>= 7);
3263 assert(mask
.type
== BRW_REGISTER_TYPE_UD
);
3265 brw_push_insn_state(p
);
3267 /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3268 * unnecessary bits in the instruction words, get the information we need
3269 * and reset the default flag register. This allows more instructions to be
3272 const unsigned flag_subreg
= p
->current
->flag_subreg
;
3273 brw_set_default_flag_reg(p
, 0, 0);
3275 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3276 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3278 if (devinfo
->gen
>= 8) {
3279 /* Getting the first active channel index is easy on Gen8: Just find
3280 * the first bit set in the execution mask. The register exists on
3281 * HSW already but it reads back as all ones when the current
3282 * instruction has execution masking disabled, so it's kind of
3285 struct brw_reg exec_mask
=
3286 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
);
3288 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3289 if (mask
.file
!= BRW_IMMEDIATE_VALUE
|| mask
.ud
!= 0xffffffff) {
3290 /* Unfortunately, ce0 does not take into account the thread
3291 * dispatch mask, which may be a problem in cases where it's not
3292 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3293 * some n). Combine ce0 with the given dispatch (or vector) mask
3294 * to mask off those channels which were never dispatched by the
3297 brw_SHR(p
, vec1(dst
), mask
, brw_imm_ud(qtr_control
* 8));
3298 brw_set_default_swsb(p
, tgl_swsb_regdist(1));
3299 brw_AND(p
, vec1(dst
), exec_mask
, vec1(dst
));
3300 exec_mask
= vec1(dst
);
3303 /* Quarter control has the effect of magically shifting the value of
3304 * ce0 so you'll get the first active channel relative to the
3305 * specified quarter control as result.
3307 inst
= brw_FBL(p
, vec1(dst
), exec_mask
);
3309 const struct brw_reg flag
= brw_flag_subreg(flag_subreg
);
3311 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3312 brw_MOV(p
, retype(flag
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
3314 /* Run enough instructions returning zero with execution masking and
3315 * a conditional modifier enabled in order to get the full execution
3316 * mask in f1.0. We could use a single 32-wide move here if it
3317 * weren't because of the hardware bug that causes channel enables to
3318 * be applied incorrectly to the second half of 32-wide instructions
3321 const unsigned lower_size
= MIN2(16, exec_size
);
3322 for (unsigned i
= 0; i
< exec_size
/ lower_size
; i
++) {
3323 inst
= brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
),
3325 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3326 brw_inst_set_group(devinfo
, inst
, lower_size
* i
+ 8 * qtr_control
);
3327 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_Z
);
3328 brw_inst_set_exec_size(devinfo
, inst
, cvt(lower_size
) - 1);
3329 brw_inst_set_flag_reg_nr(devinfo
, inst
, flag_subreg
/ 2);
3330 brw_inst_set_flag_subreg_nr(devinfo
, inst
, flag_subreg
% 2);
3333 /* Find the first bit set in the exec_size-wide portion of the flag
3334 * register that was updated by the last sequence of MOV
3337 const enum brw_reg_type type
= brw_int_type(exec_size
/ 8, false);
3338 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3339 brw_FBL(p
, vec1(dst
), byte_offset(retype(flag
, type
), qtr_control
));
3342 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3344 if (devinfo
->gen
>= 8 &&
3345 mask
.file
== BRW_IMMEDIATE_VALUE
&& mask
.ud
== 0xffffffff) {
3346 /* In SIMD4x2 mode the first active channel index is just the
3347 * negation of the first bit of the mask register. Note that ce0
3348 * doesn't take into account the dispatch mask, so the Gen7 path
3349 * should be used instead unless you have the guarantee that the
3350 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3353 inst
= brw_AND(p
, brw_writemask(dst
, WRITEMASK_X
),
3354 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
)),
3358 /* Overwrite the destination without and with execution masking to
3359 * find out which of the channels is active.
3361 brw_push_insn_state(p
);
3362 brw_set_default_exec_size(p
, BRW_EXECUTE_4
);
3363 brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3366 inst
= brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3368 brw_pop_insn_state(p
);
3369 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3373 brw_pop_insn_state(p
);
3377 brw_broadcast(struct brw_codegen
*p
,
3382 const struct gen_device_info
*devinfo
= p
->devinfo
;
3383 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3386 brw_push_insn_state(p
);
3387 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3388 brw_set_default_exec_size(p
, align1
? BRW_EXECUTE_1
: BRW_EXECUTE_4
);
3390 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
&&
3391 src
.address_mode
== BRW_ADDRESS_DIRECT
);
3392 assert(!src
.abs
&& !src
.negate
);
3393 assert(src
.type
== dst
.type
);
3395 if ((src
.vstride
== 0 && (src
.hstride
== 0 || !align1
)) ||
3396 idx
.file
== BRW_IMMEDIATE_VALUE
) {
3397 /* Trivial, the source is already uniform or the index is a constant.
3398 * We will typically not get here if the optimizer is doing its job, but
3399 * asserting would be mean.
3401 const unsigned i
= idx
.file
== BRW_IMMEDIATE_VALUE
? idx
.ud
: 0;
3403 (align1
? stride(suboffset(src
, i
), 0, 1, 0) :
3404 stride(suboffset(src
, 4 * i
), 0, 4, 1)));
3406 /* From the Haswell PRM section "Register Region Restrictions":
3408 * "The lower bits of the AddressImmediate must not overflow to
3409 * change the register address. The lower 5 bits of Address
3410 * Immediate when added to lower 5 bits of address register gives
3411 * the sub-register offset. The upper bits of Address Immediate
3412 * when added to upper bits of address register gives the register
3413 * address. Any overflow from sub-register offset is dropped."
3415 * Fortunately, for broadcast, we never have a sub-register offset so
3416 * this isn't an issue.
3418 assert(src
.subnr
== 0);
3421 const struct brw_reg addr
=
3422 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
3423 unsigned offset
= src
.nr
* REG_SIZE
+ src
.subnr
;
3424 /* Limit in bytes of the signed indirect addressing immediate. */
3425 const unsigned limit
= 512;
3427 brw_push_insn_state(p
);
3428 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3429 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
3431 /* Take into account the component size and horizontal stride. */
3432 assert(src
.vstride
== src
.hstride
+ src
.width
);
3433 brw_SHL(p
, addr
, vec1(idx
),
3434 brw_imm_ud(_mesa_logbase2(type_sz(src
.type
)) +
3437 /* We can only address up to limit bytes using the indirect
3438 * addressing immediate, account for the difference if the source
3439 * register is above this limit.
3441 if (offset
>= limit
) {
3442 brw_set_default_swsb(p
, tgl_swsb_regdist(1));
3443 brw_ADD(p
, addr
, addr
, brw_imm_ud(offset
- offset
% limit
));
3444 offset
= offset
% limit
;
3447 brw_pop_insn_state(p
);
3449 brw_set_default_swsb(p
, tgl_swsb_regdist(1));
3451 /* Use indirect addressing to fetch the specified component. */
3452 if (type_sz(src
.type
) > 4 &&
3453 (devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
))) {
3454 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3456 * "When source or destination datatype is 64b or operation is
3457 * integer DWord multiply, indirect addressing must not be
3460 * To work around both of this issue, we do two integer MOVs
3461 * insead of one 64-bit MOV. Because no double value should ever
3462 * cross a register boundary, it's safe to use the immediate
3463 * offset in the indirect here to handle adding 4 bytes to the
3464 * offset and avoid the extra ADD to the register file.
3466 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 0),
3467 retype(brw_vec1_indirect(addr
.subnr
, offset
),
3468 BRW_REGISTER_TYPE_D
));
3469 brw_set_default_swsb(p
, tgl_swsb_null());
3470 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 1),
3471 retype(brw_vec1_indirect(addr
.subnr
, offset
+ 4),
3472 BRW_REGISTER_TYPE_D
));
3475 retype(brw_vec1_indirect(addr
.subnr
, offset
), src
.type
));
3478 /* In SIMD4x2 mode the index can be either zero or one, replicate it
3479 * to all bits of a flag register,
3483 stride(brw_swizzle(idx
, BRW_SWIZZLE_XXXX
), 4, 4, 1));
3484 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NONE
);
3485 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_NZ
);
3486 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3488 /* and use predicated SEL to pick the right channel. */
3489 inst
= brw_SEL(p
, dst
,
3490 stride(suboffset(src
, 4), 4, 4, 1),
3491 stride(src
, 4, 4, 1));
3492 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NORMAL
);
3493 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3497 brw_pop_insn_state(p
);
3501 * This instruction is generated as a single-channel align1 instruction by
3502 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3504 * We can't use the typed atomic op in the FS because that has the execution
3505 * mask ANDed with the pixel mask, but we just want to write the one dword for
3508 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3509 * one u32. So we use the same untyped atomic write message as the pixel
3512 * The untyped atomic operation requires a BUFFER surface type with RAW
3513 * format, and is only accessible through the legacy DATA_CACHE dataport
3516 void brw_shader_time_add(struct brw_codegen
*p
,
3517 struct brw_reg payload
,
3518 uint32_t surf_index
)
3520 const struct gen_device_info
*devinfo
= p
->devinfo
;
3521 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3522 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3523 GEN7_SFID_DATAPORT_DATA_CACHE
);
3524 assert(devinfo
->gen
>= 7);
3526 brw_push_insn_state(p
);
3527 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3528 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3529 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
3530 brw_inst
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
3532 /* We use brw_vec1_reg and unmasked because we want to increment the given
3535 brw_set_dest(p
, send
, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
3537 brw_set_src0(p
, send
, brw_vec1_reg(payload
.file
,
3539 brw_set_desc(p
, send
, (brw_message_desc(devinfo
, 2, 0, false) |
3540 brw_dp_untyped_atomic_desc(devinfo
, 1, BRW_AOP_ADD
,
3543 brw_inst_set_sfid(devinfo
, send
, sfid
);
3544 brw_inst_set_binding_table_index(devinfo
, send
, surf_index
);
3546 brw_pop_insn_state(p
);
3551 * Emit the SEND message for a barrier
3554 brw_barrier(struct brw_codegen
*p
, struct brw_reg src
)
3556 const struct gen_device_info
*devinfo
= p
->devinfo
;
3557 struct brw_inst
*inst
;
3559 assert(devinfo
->gen
>= 7);
3561 brw_push_insn_state(p
);
3562 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3563 inst
= next_insn(p
, BRW_OPCODE_SEND
);
3564 brw_set_dest(p
, inst
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
));
3565 brw_set_src0(p
, inst
, src
);
3566 brw_set_src1(p
, inst
, brw_null_reg());
3567 brw_set_desc(p
, inst
, brw_message_desc(devinfo
, 1, 0, false));
3569 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MESSAGE_GATEWAY
);
3570 brw_inst_set_gateway_subfuncid(devinfo
, inst
,
3571 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG
);
3573 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
3574 brw_pop_insn_state(p
);
3579 * Emit the wait instruction for a barrier
3582 brw_WAIT(struct brw_codegen
*p
)
3584 const struct gen_device_info
*devinfo
= p
->devinfo
;
3585 struct brw_inst
*insn
;
3587 struct brw_reg src
= brw_notification_reg();
3589 insn
= next_insn(p
, BRW_OPCODE_WAIT
);
3590 brw_set_dest(p
, insn
, src
);
3591 brw_set_src0(p
, insn
, src
);
3592 brw_set_src1(p
, insn
, brw_null_reg());
3594 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
3595 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_DISABLE
);
3599 brw_float_controls_mode(struct brw_codegen
*p
,
3600 unsigned mode
, unsigned mask
)
3602 /* From the Skylake PRM, Volume 7, page 760:
3603 * "Implementation Restriction on Register Access: When the control
3604 * register is used as an explicit source and/or destination, hardware
3605 * does not ensure execution pipeline coherency. Software must set the
3606 * thread control field to ‘switch’ for an instruction that uses
3607 * control register as an explicit operand."
3609 * On Gen12+ this is implemented in terms of SWSB annotations instead.
3611 brw_set_default_swsb(p
, tgl_swsb_regdist(1));
3613 brw_inst
*inst
= brw_AND(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3615 brw_inst_set_exec_size(p
->devinfo
, inst
, BRW_EXECUTE_1
);
3616 if (p
->devinfo
->gen
< 12)
3617 brw_inst_set_thread_control(p
->devinfo
, inst
, BRW_THREAD_SWITCH
);
3620 brw_inst
*inst_or
= brw_OR(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3622 brw_inst_set_exec_size(p
->devinfo
, inst_or
, BRW_EXECUTE_1
);
3623 if (p
->devinfo
->gen
< 12)
3624 brw_inst_set_thread_control(p
->devinfo
, inst_or
, BRW_THREAD_SWITCH
);
3627 if (p
->devinfo
->gen
>= 12)
3628 brw_SYNC(p
, TGL_SYNC_NOP
);