2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
33 #include "brw_eu_defines.h"
36 #include "util/ralloc.h"
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
/* Prior to Sandybridge, SEND accepted a non-MRF source and performed the move
 * into the message register implicitly.  On Gen6+ that move must be emitted
 * explicitly before the SEND; this helper does so and rewrites *src to point
 * at the message register.
 *
 * NOTE(review): extraction dropped interior lines here (return type, the
 * "src"/"msg_reg_nr" parameters, the gen<6 early return, and closing braces).
 * Code tokens below are preserved verbatim — verify against upstream Mesa.
 */
gen6_resolve_implied_move(struct brw_codegen *p,
   const struct gen_device_info *devinfo = p->devinfo;
   /* Already a message register: nothing to resolve. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
   /* Anything but the ARF null register needs a real MOV into m<msg_reg_nr>. */
   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      /* Uncompressed SIMD8 MOV with masking off, so the whole payload
       * register is written regardless of the current execution state.
       */
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   /* Point the caller's source at the message register we just loaded. */
   *src = brw_message_reg(msg_reg_nr);
/* Gen7+ has no real MRFs; rewrite an MRF register reference into the GRF
 * range reserved for fake MRFs (GEN7_MRF_HACK_START..).
 *
 * From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
 * "The send with EOT should use register space R112-R127 for <src>. This is
 *  to enable loading of a new thread into the same slot while the message
 *  with EOT for current thread is pending dispatch."
 * Since we're pretending to have 16 MRFs anyway, we may as well use the
 * registers required for messages with EOT.
 *
 * NOTE(review): extraction dropped the return type, opening brace, and
 * closing braces of this function — tokens below are verbatim.
 */
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
/* Encode the destination operand "dest" into instruction "inst": register
 * file, number, subregister, strides, writemask and (when enabled) an
 * automatic execution-size fixup for narrow destinations.
 *
 * NOTE(review): extraction dropped interior lines throughout (opening/closing
 * braces, several "} else {" lines, and local declarations such as
 * "bool fix_exec_size"/"if (fix_exec_size)").  Code tokens are verbatim;
 * verify structure against upstream Mesa before compiling.
 */
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
   const struct gen_device_info *devinfo = p->devinfo;
   /* Sanity-check the destination register number against its file. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);
   /* The hardware requires a Byte-typed destination to use a stride of 2
    * (except packed-byte MOV), apparently even when the destination is the
    * NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   gen7_convert_mrf_to_grf(p, &dest);
   /* Gen12+ SEND/SENDC use the compact send destination encoding. */
   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Split-send destination encoding (pre-Gen12 only). */
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
      /* NOTE(review): the "} else {" for the general encoding path appears
       * to have been dropped by extraction before this point.
       */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* Stride 0 is illegal on a destination; promote to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * although Dst.HorzStride is a don't-care in Align16, hardware
             * needs it programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         /* Indirect addressing; subreg field sizes differ align1/align16. */
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* Even though ignored in da16, still must be set to '01'. */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
   /* Generators normally set exec_size to 8 or 16; for small registers it is
    * convenient to shrink it automatically to match the register width.
    */
   if (p->automatic_exec_sizes) {
      /* On fp64-capable platforms, width-4 instructions may legitimately
       * span two SIMD8 registers with exec_size 8/16; those must set their
       * own exec size and cannot rely on this fixup.
       */
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
         fix_exec_size = dest.width < BRW_EXECUTE_8;
      brw_inst_set_exec_size(devinfo, inst, dest.width);
/* Encode source operand 0 of "inst" from "reg": register file/number,
 * subregister, region (vstride/width/hstride or swizzle), modifiers, or an
 * immediate value, with special handling for the SEND family.
 *
 * NOTE(review): extraction dropped interior lines (braces, several "else"
 * lines, the close of the immediate branch).  Tokens below are verbatim;
 * verify structure against upstream Mesa.
 */
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
   const struct gen_device_info *devinfo = p->devinfo;
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);
   gen7_convert_mrf_to_grf(p, &reg);
   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Modifiers and regions are ignored for SEND source 0 — it only names
       * the MRF/GRF the message payload starts at.  Check likely mistakes.
       */
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Split-send src0 encoding. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      /* NOTE(review): the "} else {" for the general path appears dropped
       * before this point.
       */
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* 64-bit immediates (DF/Q/UQ, and DIM's double) use the wide
          * immediate fields; everything else is a 32-bit UD immediate.
          */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);
         /* Pre-Gen12, a <8-byte immediate in src0 also requires src1's file
          * and type fields to mirror it.
          */
         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
            /* Address-immediate field widths differ align1 vs align16. */
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst,
                                              reg.indirect_offset);
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst,
                                               reg.indirect_offset);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* Scalar region <0;1,0> for a width-1 source at exec size 1. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            /* Align16: encode the four channel swizzle selects. */
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* Oddity of sharing the align1 region descriptions in
                * align16: vstride 8 is encoded as 4.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
/* Encode source operand 1 of "inst" from "reg".  Mirrors brw_set_src0 but
 * src1 may be a 32-bit immediate and, pre-split-send, may never be an MRF.
 *
 * NOTE(review): extraction dropped interior lines (braces, "else" lines).
 * Tokens below are verbatim; verify structure against upstream Mesa.
 */
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
   const struct gen_device_info *devinfo = p->devinfo;
   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);
   /* Split sends (and Gen12+ SEND/SENDC) carry src1 in dedicated fields. */
   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5: accumulator
       * registers may be accessed explicitly as src0 only.
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);
      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);
      /* Only src1 can be immediate in two-argument instructions. */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* Two-argument instructions can only use 32-bit immediates. */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
         /* Hardware restriction (may or may not be lifted later): src1 has
          * no indirect addressing.
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* Scalar region <0;1,0> for a width-1 source at exec size 1. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            /* Align16: encode the four channel swizzle selects. */
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* Oddity of sharing the align1 region descriptions in
                * align16: vstride 8 is encoded as 4.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
448 * Specify the descriptor and extended descriptor immediate for a SEND(C)
449 * message instruction.
/* Specify the descriptor and extended descriptor immediates for a SEND(C)
 * message instruction.  Pre-Gen12 this also marks src1 as an immediate UD,
 * since the descriptor lives in the src1 slot there.
 *
 * NOTE(review): extraction dropped the return type, opening brace, and
 * closing brace — tokens below are verbatim.
 */
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
   const struct gen_device_info *devinfo = p->devinfo;
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
   if (devinfo->gen < 12)
      brw_inst_set_src1_file_type(devinfo, inst,
                                  BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
   brw_inst_set_send_desc(devinfo, inst, desc);
   /* The extended descriptor field only exists on Gen9+. */
   if (devinfo->gen >= 9)
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
/* Fill in the message descriptor and math-specific control fields for an
 * extended-math (BRW_SFID_MATH) send.
 *
 * NOTE(review): extraction dropped most of this function's parameter list,
 * both "switch (function)" headers and their msg_length/response_length
 * assignments and defaults — only the case labels survive.  Tokens below are
 * verbatim; verify against upstream Mesa.
 */
static void brw_set_math_message( struct brw_codegen *p,
                                  unsigned integer_type,
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned response_length;
   /* Infer message length from the function. */
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
   /* Infer response length from the function. */
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
   brw_set_desc(p, inst, brw_message_desc(
                            devinfo, msg_length, response_length, false));
   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   /* Saturation is performed by the message itself, not the instruction. */
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
/* Fill in the descriptor and URB control fields for an FF_SYNC message
 * (URB shared function, opcode 1).  Message length is always 1 (header).
 *
 * NOTE(review): extraction dropped part of the parameter list ("insn",
 * "allocate", "end_of_thread") and the braces — tokens below are verbatim.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    unsigned response_length,
   const struct gen_device_info *devinfo = p->devinfo;
   brw_set_desc(p, insn, brw_message_desc(
                            devinfo, 1, response_length, true));
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
/* Fill in the descriptor and URB-specific control fields for a URB write.
 * "flags" selects EOT/allocate/complete/per-slot-offset behavior; validity
 * of each flag is gen-dependent (asserted below).
 *
 * NOTE(review): extraction dropped parts of the parameter list ("insn",
 * "msg_length", "offset"), braces and "else" lines — tokens verbatim.
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 enum brw_urb_write_flags flags,
                                 unsigned response_length,
                                 unsigned swizzle_control )
   const struct gen_device_info *devinfo = p->devinfo;
   /* Gen-specific flag validity. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   brw_set_desc(p, insn, brw_message_desc(
                            devinfo, msg_length, response_length, true));
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));
   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
/* Fill in the descriptor fields for a Gen7+ data-port scratch block
 * read/write message.  num_regs must be 1/2/4 (or 8 on Gen8+); the encoded
 * block size field differs between Gen7 (lookup) and Gen8+ (log2).
 *
 * NOTE(review): extraction dropped several parameters ("inst", "write",
 * "dword", "num_regs", mlen/rlen/header_present), the Gen7 half of the
 * block_size expression and the braces — tokens below are verbatim.
 */
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            bool invalidate_after_read,
                            unsigned addr_offset,
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
   brw_set_desc(p, inst, brw_message_desc(
                            devinfo, mlen, rlen, header_present));
   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
/* Copy the generator's default instruction state (exec size, group,
 * compression, access mode, masking, predication, flag register, SWSB on
 * Gen12+, accumulator write control on Gen6+) into a freshly allocated
 * instruction.
 *
 * NOTE(review): extraction dropped the "brw_inst *insn" parameter, braces
 * and an "else" before the non-3src flag-register path — tokens verbatim.
 */
brw_inst_set_state(const struct gen_device_info *devinfo,
                   const struct brw_insn_state *state)
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Gen12+ software scoreboard dependency encoding. */
   if (devinfo->gen >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);
   /* 3-src align16 instructions keep their flag register in different
    * fields than everything else.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
#define next_insn brw_next_insn
/* Allocate the next instruction slot in the generator's store (growing the
 * ralloc'd array when full), zero it, set its opcode, and apply the current
 * default instruction state.
 *
 * NOTE(review): extraction dropped the return type, the "brw_inst *insn"
 * declaration, the store_size growth line before reralloc, braces and the
 * final "return insn;" — tokens below are verbatim.
 */
brw_next_insn(struct brw_codegen *p, unsigned opcode)
   const struct gen_device_info *devinfo = p->devinfo;
   /* Grow the instruction store when the next slot would overflow it. */
   if (p->nr_insn + 1 > p->store_size) {
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   /* Each instruction is 16 bytes. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);
   /* Apply the default instruction state. */
   brw_inst_set_state(devinfo, insn, p->current);
/* Emit a one-source ALU instruction: allocate the slot, then encode the
 * destination and single source operand.
 *
 * NOTE(review): extraction dropped the return type, braces and the
 * "return insn;" — tokens below are verbatim.
 */
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
/* Emit a two-source ALU instruction: allocate the slot, then encode the
 * destination and both source operands.
 *
 * NOTE(review): extraction dropped the return type, braces and the
 * "return insn;" — tokens below are verbatim.
 */
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
   /* 64-bit immediates are only supported on 1-src instructions. */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
/* Convert a byte subregister number to 3-src units.  Normally SubRegNum is
 * in bytes (0..31), but 3-src instructions use 32-bit units (components
 * 0..7); since they only support F/D/UD types this loses no flexibility
 * while using fewer bits.
 *
 * NOTE(review): extraction dropped the return type and braces — tokens
 * below are verbatim.
 */
get_3src_subreg_nr(struct brw_reg reg)
   return reg.subnr / 4;
/* Map a general vertical stride to the Gen10+ align1 3-src vertical-stride
 * encoding.  Stride 1 only exists on Gen12+, stride 2 only before Gen12;
 * strides 8 and 16 share the "8" encoding.
 *
 * NOTE(review): extraction dropped the braces and the "switch (vstride) {"
 * header and "default:" label around the cases — tokens below are verbatim.
 */
static enum gen10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct gen_device_info *devinfo,
                       enum brw_vertical_stride vstride)
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_1:
      assert(devinfo->gen >= 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case BRW_VERTICAL_STRIDE_2:
      assert(devinfo->gen < 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   unreachable("invalid vstride");
/* Map a general horizontal stride to the Gen10+ align1 3-src
 * horizontal-stride encoding (one-to-one for strides 0/1/2/4).
 *
 * NOTE(review): extraction dropped the braces and the "switch (hstride) {"
 * header and "default:" label — tokens below are verbatim.
 */
static enum gen10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
   case BRW_HORIZONTAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case BRW_HORIZONTAL_STRIDE_1:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case BRW_HORIZONTAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case BRW_HORIZONTAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   unreachable("invalid hstride");
/* Emit a three-source ALU instruction (MAD/LRP/BFE/...), encoding the
 * destination and all three sources in either the align1 (Gen10+ style) or
 * align16 3-src format depending on the instruction's access mode.
 *
 * NOTE(review): extraction dropped interior lines throughout (braces,
 * "else"/"} else {" lines, the final "return inst;").  Tokens below are
 * verbatim; verify structure against upstream Mesa before compiling.
 */
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);
   gen7_convert_mrf_to_grf(p, &dest);
   /* Register-number and addressing-mode sanity checks common to both
    * encodings.  src1 may never be an immediate in 3-src instructions.
    */
   assert(dest.nr < 128);
   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* --- Align1 3-src encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      if (devinfo->gen >= 12) {
         /* Gen12+ takes the register file directly. */
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* Pre-Gen12: ARF destination means the accumulator. */
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
      /* Execution type follows the destination type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
      brw_inst_set_3src_a1_src0_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      /* no vstride on src2 */
      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src0.hstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));
      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src2.hstride));
      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
      /* NF-typed src0 reads the accumulator. */
      if (src0.type == BRW_REGISTER_TYPE_NF) {
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      /* ARF src1 means the accumulator. */
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      /* Allowed register files per source in align1 3-src. */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);
      if (devinfo->gen >= 12) {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         /* Pre-Gen12 uses the dedicated align1 3-src file encodings. */
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      /* --- Align16 3-src encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
      /* Gen6's dst file field is a single MRF/GRF bit. */
      if (devinfo->gen == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);
      if (devinfo->gen >= 7) {
         /* Set both source and destination types from dest.type, ignoring
          * the source register types: MAD/LRP emitters guarantee all-float,
          * while BFE/BFI2 may mix D and UD and want the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
         /* From the Bspec (srcType): when SrcType is :f or :hf it covers
          * source 0 only; Src1Type/Src2Type give the precision of the other
          * sources (0b = :f single precision, 1b = :hf half precision).
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
932 /***********************************************************************
933 * Convenience routines.
936 brw_inst *brw_##OP(struct brw_codegen *p, \
937 struct brw_reg dest, \
938 struct brw_reg src0) \
940 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
944 brw_inst *brw_##OP(struct brw_codegen *p, \
945 struct brw_reg dest, \
946 struct brw_reg src0, \
947 struct brw_reg src1) \
949 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
953 brw_inst *brw_##OP(struct brw_codegen *p, \
954 struct brw_reg dest, \
955 struct brw_reg src0, \
956 struct brw_reg src1, \
957 struct brw_reg src2) \
959 if (p->current->access_mode == BRW_ALIGN_16) { \
960 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \
961 src0.swizzle = BRW_SWIZZLE_XXXX; \
962 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \
963 src1.swizzle = BRW_SWIZZLE_XXXX; \
964 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \
965 src2.swizzle = BRW_SWIZZLE_XXXX; \
967 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
971 brw_inst *brw_##OP(struct brw_codegen *p, \
972 struct brw_reg dest, \
973 struct brw_reg src0, \
974 struct brw_reg src1, \
975 struct brw_reg src2) \
977 assert(dest.type == BRW_REGISTER_TYPE_F || \
978 dest.type == BRW_REGISTER_TYPE_DF); \
979 if (dest.type == BRW_REGISTER_TYPE_F) { \
980 assert(src0.type == BRW_REGISTER_TYPE_F); \
981 assert(src1.type == BRW_REGISTER_TYPE_F); \
982 assert(src2.type == BRW_REGISTER_TYPE_F); \
983 } else if (dest.type == BRW_REGISTER_TYPE_DF) { \
984 assert(src0.type == BRW_REGISTER_TYPE_DF); \
985 assert(src1.type == BRW_REGISTER_TYPE_DF); \
986 assert(src2.type == BRW_REGISTER_TYPE_DF); \
989 if (p->current->access_mode == BRW_ALIGN_16) { \
990 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \
991 src0.swizzle = BRW_SWIZZLE_XXXX; \
992 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \
993 src1.swizzle = BRW_SWIZZLE_XXXX; \
994 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \
995 src2.swizzle = BRW_SWIZZLE_XXXX; \
997 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
1000 /* Rounding operations (other than RNDD) require two instructions - the first
1001 * stores a rounded value (possibly the wrong way) in the dest register, but
1002 * also sets a per-channel "increment bit" in the flag register. A predicated
1003 * add of 1.0 fixes dest to contain the desired result.
1005 * Sandybridge and later appear to round correctly without an ADD.
1008 void brw_##OP(struct brw_codegen *p, \
1009 struct brw_reg dest, \
1010 struct brw_reg src) \
1012 const struct gen_device_info *devinfo = p->devinfo; \
1013 brw_inst *rnd, *add; \
1014 rnd = next_insn(p, BRW_OPCODE_##OP); \
1015 brw_set_dest(p, rnd, dest); \
1016 brw_set_src0(p, rnd, src); \
1018 if (devinfo->gen < 6) { \
1019 /* turn on round-increments */ \
1020 brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
1021 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
1022 brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
/**
 * Emit a MOV instruction, applying the IVB/BYT F->DF conversion
 * workaround to the source region when required.
 */
brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      /* Only a packed source region (vstride == width + hstride) can be
       * rewritten this way without changing which elements are read.
       */
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}
/**
 * Emit an ADD instruction.
 *
 * The asserts reject mixing a float (or VF-immediate) operand with a
 * D/UD integer operand, which the hardware does not support for add.
 */
brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}
/**
 * Emit an AVG (integer average) instruction.
 *
 * AVG is integer-only; all three operands must share one of the
 * supported integer types.
 */
brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}
/**
 * Emit a MUL instruction.
 *
 * Enforces the hardware restrictions on operand types: an integer
 * multiply may not have a float destination, float/VF sources may not
 * be mixed with D/UD sources, and neither source may be the accumulator.
 */
brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* The accumulator is not a valid source for MUL. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1166 brw_LINE(struct brw_codegen
*p
, struct brw_reg dest
,
1167 struct brw_reg src0
, struct brw_reg src1
)
1169 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1170 src0
.width
= BRW_WIDTH_1
;
1171 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1172 return brw_alu2(p
, BRW_OPCODE_LINE
, dest
, src0
, src1
);
1176 brw_PLN(struct brw_codegen
*p
, struct brw_reg dest
,
1177 struct brw_reg src0
, struct brw_reg src1
)
1179 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1180 src0
.width
= BRW_WIDTH_1
;
1181 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1182 src1
.vstride
= BRW_VERTICAL_STRIDE_8
;
1183 src1
.width
= BRW_WIDTH_8
;
1184 src1
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
1185 return brw_alu2(p
, BRW_OPCODE_PLN
, dest
, src0
, src1
);
/**
 * Emit a float-to-half conversion (F32TO16 on Gen7, a converting MOV on
 * Gen8+), optionally zero-filling the high 16 bits of a UD destination.
 */
void
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View the UD destination as strided words so the conversion result
       * and the zero fill can be written to alternating 16-bit halves.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      /* Gen8+ dropped F32TO16; a converting MOV to HF is equivalent. */
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The dependency-control overrides let the convert and the zero
       * fill write interleaved halves of the same register without a
       * false dependency stall (not needed/valid on Gen12+).
       */
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
}
/**
 * Emit a half-to-float conversion (F16TO32 on Gen7, a converting MOV
 * from HF on Gen8+).
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      /* Gen8+ dropped F16TO32; a converting MOV from HF is equivalent. */
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1269 void brw_NOP(struct brw_codegen
*p
)
1271 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_NOP
);
1272 memset(insn
, 0, sizeof(*insn
));
1273 brw_inst_set_opcode(p
->devinfo
, insn
, BRW_OPCODE_NOP
);
1276 void brw_SYNC(struct brw_codegen
*p
, enum tgl_sync_function func
)
1278 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SYNC
);
1279 brw_inst_set_cond_modifier(p
->devinfo
, insn
, func
);
1282 /***********************************************************************
1283 * Comparisons, if/else/endif
/**
 * Emit a JMPI (jump indexed) instruction: ip = ip + index.
 *
 * JMPI is always SIMD1 and runs with the execution mask disabled.
 */
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}
/**
 * Push an IF/ELSE instruction onto the if-stack, growing the stack
 * array when it fills up.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Store an index rather than a pointer: p->store may be reallocated
    * as more instructions are emitted.
    */
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}
1316 pop_if_stack(struct brw_codegen
*p
)
1318 p
->if_stack_depth
--;
1319 return &p
->store
[p
->if_stack
[p
->if_stack_depth
]];
/**
 * Push a DO instruction onto the loop stack, growing both the loop
 * stack and the parallel per-loop if-depth array when needed.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Grow before writing: if_depth_in_loop[loop_stack_depth + 1] is
    * written below, so one extra slot must exist.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   /* Store an index, not a pointer: p->store may be reallocated. */
   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* The new innermost loop starts with no open IFs inside it. */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1339 get_inner_do_insn(struct brw_codegen
*p
)
1341 return &p
->store
[p
->loop_stack
[p
->loop_stack_depth
- 1]];
1344 /* EU takes the value from the flag register and pushes it onto some
1345 * sort of a stack (presumably merging with any flag value already on
1346 * the stack). Within an if block, the flags at the top of the stack
1347 * control execution on each channel of the unit, eg. on each of the
1348 * 16 pixel values in our wm programs.
1350 * When the matching 'else' instruction is reached (presumably by
1351 * countdown of the instruction count patched in by our ELSE/ENDIF
1352 * functions), the relevant flags are inverted.
1354 * When the matching 'endif' instruction is reached, the flags are
1355 * popped off. If the stack is now empty, normal execution resumes.
/**
 * Emit an IF instruction and push it on the if-stack so the matching
 * ELSE/ENDIF can patch its jump targets later.
 *
 * The operand encoding of IF differs per generation, hence the chain of
 * gen checks below.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction: */
   if (devinfo->gen < 6) {
      /* Pre-gen6: IF operates on IP with a jump-count immediate. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields, placeholder zeros patched at ENDIF time. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   /* Track IF nesting within the current loop for BREAK/CONT pop counts. */
   p->if_depth_in_loop[p->loop_stack_depth]++;

   return insn;
}
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier). It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Jump count is patched later by patch_IF_ELSE via the if-stack. */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);

   return insn;
}
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Jump offsets are in bytes; each instruction is 16 bytes here. */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* br scales instruction-index deltas into the hardware's jump units. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
/**
 * Emit an ELSE instruction and push it on the if-stack; its jump
 * targets are filled in by patch_IF_ELSE when the ENDIF is emitted.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Operand encoding differs per generation, as with brw_IF. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
/**
 * Emit an ENDIF, pop the matching IF (and optional ELSE) off the
 * if-stack, and patch their jump targets. In pre-Gen6 SPF mode no
 * ENDIF is emitted; the IF/ELSE are rewritten as predicated ADDs.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand encoding differs per generation, as with brw_IF. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
/**
 * Emit a BREAK instruction. Pre-Gen6 it operates on IP and carries a
 * pop count for the IFs currently open inside the loop; jump targets
 * are patched later (see brw_patch_break_cont / brw_set_uip_jip).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pop the mask-stack entries of every IF still open in this loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
/**
 * Emit a CONTINUE instruction; like BREAK, its jump target is patched
 * later, and the pre-Gen6 form carries an if pop count.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop the mask-stack entries of every IF still open in this loop. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
/**
 * Emit a HALT instruction (Gen6+ encoding); its UIP/JIP are patched
 * later by the caller.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->gen < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
/* The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop. We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO. WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No DO instruction needed; record the loop start (the next
       * instruction to be emitted) on the loop stack.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction of the current (innermost) loop.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   /* Walk backward from the WHILE to the loop's DO, patching any
    * still-unpatched BREAK/CONTINUE found in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE (out of the loop). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself (next iteration). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
/**
 * Emit the WHILE that closes the innermost loop opened by brw_DO, and
 * pop the loop stack. Pre-Gen6 this also patches the loop's pending
 * BREAK/CONTINUE instructions.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Per-generation operand encoding; JIP is the (negative) distance
       * back to the loop start.
       */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* SPF mode: the loop back-edge is just an ADD to IP. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);
      }
      brw_patch_break_cont(p, insn);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1891 void brw_land_fwd_jump(struct brw_codegen
*p
, int jmp_insn_idx
)
1893 const struct gen_device_info
*devinfo
= p
->devinfo
;
1894 brw_inst
*jmp_insn
= &p
->store
[jmp_insn_idx
];
1897 if (devinfo
->gen
>= 5)
1900 assert(brw_inst_opcode(devinfo
, jmp_insn
) == BRW_OPCODE_JMPI
);
1901 assert(brw_inst_src1_reg_file(devinfo
, jmp_insn
) == BRW_IMMEDIATE_VALUE
);
1903 brw_inst_set_gen4_jump_count(devinfo
, jmp_insn
,
1904 jmpi
* (p
->nr_insn
- jmp_insn_idx
- 1));
1907 /* To integrate with the above, it makes sense that the comparison
1908 * instruction should populate the flag register. It might be simpler
1909 * just to use the flag reg for most WM tasks?
1911 void brw_CMP(struct brw_codegen
*p
,
1912 struct brw_reg dest
,
1913 unsigned conditional
,
1914 struct brw_reg src0
,
1915 struct brw_reg src1
)
1917 const struct gen_device_info
*devinfo
= p
->devinfo
;
1918 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_CMP
);
1920 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
1921 brw_set_dest(p
, insn
, dest
);
1922 brw_set_src0(p
, insn
, src0
);
1923 brw_set_src1(p
, insn
, src1
);
1925 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1927 * "Any CMP instruction with a null destination must use a {switch}."
1929 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1930 * mentioned on their work-arounds pages.
1932 if (devinfo
->gen
== 7) {
1933 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
1934 dest
.nr
== BRW_ARF_NULL
) {
1935 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1940 /***********************************************************************
1941 * Helpers for the various SEND message types:
1944 /** Extended math function, float[8].
1946 void gen4_math(struct brw_codegen
*p
,
1947 struct brw_reg dest
,
1949 unsigned msg_reg_nr
,
1951 unsigned precision
)
1953 const struct gen_device_info
*devinfo
= p
->devinfo
;
1954 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
1956 if (has_scalar_region(src
)) {
1957 data_type
= BRW_MATH_DATA_SCALAR
;
1959 data_type
= BRW_MATH_DATA_VECTOR
;
1962 assert(devinfo
->gen
< 6);
1964 /* Example code doesn't set predicate_control for send
1967 brw_inst_set_pred_control(devinfo
, insn
, 0);
1968 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
1970 brw_set_dest(p
, insn
, dest
);
1971 brw_set_src0(p
, insn
, src
);
1972 brw_set_math_message(p
,
1975 src
.type
== BRW_REGISTER_TYPE_D
,
1980 void gen6_math(struct brw_codegen
*p
,
1981 struct brw_reg dest
,
1983 struct brw_reg src0
,
1984 struct brw_reg src1
)
1986 const struct gen_device_info
*devinfo
= p
->devinfo
;
1987 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_MATH
);
1989 assert(devinfo
->gen
>= 6);
1991 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
1992 (devinfo
->gen
>= 7 && dest
.file
== BRW_MESSAGE_REGISTER_FILE
));
1994 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1995 if (devinfo
->gen
== 6) {
1996 assert(src0
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1997 assert(src1
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
2000 if (function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
||
2001 function
== BRW_MATH_FUNCTION_INT_DIV_REMAINDER
||
2002 function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
) {
2003 assert(src0
.type
!= BRW_REGISTER_TYPE_F
);
2004 assert(src1
.type
!= BRW_REGISTER_TYPE_F
);
2005 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
2006 (devinfo
->gen
>= 8 && src1
.file
== BRW_IMMEDIATE_VALUE
));
2008 assert(src0
.type
== BRW_REGISTER_TYPE_F
||
2009 (src0
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 9));
2010 assert(src1
.type
== BRW_REGISTER_TYPE_F
||
2011 (src1
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 9));
2014 /* Source modifiers are ignored for extended math instructions on Gen6. */
2015 if (devinfo
->gen
== 6) {
2016 assert(!src0
.negate
);
2018 assert(!src1
.negate
);
2022 brw_inst_set_math_function(devinfo
, insn
, function
);
2024 brw_set_dest(p
, insn
, dest
);
2025 brw_set_src0(p
, insn
, src0
);
2026 brw_set_src1(p
, insn
, src1
);
2030 * Return the right surface index to access the thread scratch space using
2031 * stateless dataport messages.
2034 brw_scratch_surface_idx(const struct brw_codegen
*p
)
2036 /* The scratch space is thread-local so IA coherency is unnecessary. */
2037 if (p
->devinfo
->gen
>= 8)
2038 return GEN8_BTI_STATELESS_NON_COHERENT
;
2040 return BRW_BTI_STATELESS
;
2044 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2045 * using a constant offset per channel.
2047 * The offset must be aligned to oword size (16 bytes). Used for
2048 * register spilling.
2050 void brw_oword_block_write_scratch(struct brw_codegen
*p
,
2055 const struct gen_device_info
*devinfo
= p
->devinfo
;
2056 const unsigned target_cache
=
2057 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2058 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2059 BRW_SFID_DATAPORT_WRITE
);
2062 if (devinfo
->gen
>= 6)
2065 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2067 const unsigned mlen
= 1 + num_regs
;
2069 /* Set up the message header. This is g0, with g0.2 filled with
2070 * the offset. We don't want to leave our offset around in g0 or
2071 * it'll screw up texture samples, so set it up inside the message
2075 brw_push_insn_state(p
);
2076 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2077 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2078 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2080 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2082 /* set message header global offset field (reg 0, element 2) */
2083 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2085 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2087 2), BRW_REGISTER_TYPE_UD
),
2088 brw_imm_ud(offset
));
2090 brw_pop_insn_state(p
);
2094 struct brw_reg dest
;
2095 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2096 int send_commit_msg
;
2097 struct brw_reg src_header
= retype(brw_vec8_grf(0, 0),
2098 BRW_REGISTER_TYPE_UW
);
2100 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2101 brw_inst_set_compression(devinfo
, insn
, false);
2103 if (brw_inst_exec_size(devinfo
, insn
) >= 16)
2104 src_header
= vec16(src_header
);
2106 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
2107 if (devinfo
->gen
< 6)
2108 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2110 /* Until gen6, writes followed by reads from the same location
2111 * are not guaranteed to be ordered unless write_commit is set.
2112 * If set, then a no-op write is issued to the destination
2113 * register to set a dependency, and a read from the destination
2114 * can be used to ensure the ordering.
2116 * For gen6, only writes between different threads need ordering
2117 * protection. Our use of DP writes is all about register
2118 * spilling within a thread.
2120 if (devinfo
->gen
>= 6) {
2121 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2122 send_commit_msg
= 0;
2125 send_commit_msg
= 1;
2128 brw_set_dest(p
, insn
, dest
);
2129 if (devinfo
->gen
>= 6) {
2130 brw_set_src0(p
, insn
, mrf
);
2132 brw_set_src0(p
, insn
, brw_null_reg());
2135 if (devinfo
->gen
>= 6)
2136 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2138 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2140 brw_set_desc(p
, insn
,
2141 brw_message_desc(devinfo
, mlen
, send_commit_msg
, true) |
2142 brw_dp_write_desc(devinfo
, brw_scratch_surface_idx(p
),
2143 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2144 msg_type
, 0, /* not a render target */
2151 * Read a block of owords (half a GRF each) from the scratch buffer
2152 * using a constant index per channel.
2154 * Offset must be aligned to oword size (16 bytes). Used for register
2158 brw_oword_block_read_scratch(struct brw_codegen
*p
,
2159 struct brw_reg dest
,
2164 const struct gen_device_info
*devinfo
= p
->devinfo
;
2166 if (devinfo
->gen
>= 6)
2169 if (p
->devinfo
->gen
>= 7) {
2170 /* On gen 7 and above, we no longer have message registers and we can
2171 * send from any register we want. By using the destination register
2172 * for the message, we guarantee that the implied message write won't
2173 * accidentally overwrite anything. This has been a problem because
2174 * the MRF registers and source for the final FB write are both fixed
2177 mrf
= retype(dest
, BRW_REGISTER_TYPE_UD
);
2179 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2181 dest
= retype(dest
, BRW_REGISTER_TYPE_UW
);
2183 const unsigned rlen
= num_regs
;
2184 const unsigned target_cache
=
2185 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2186 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2187 BRW_SFID_DATAPORT_READ
);
2190 brw_push_insn_state(p
);
2191 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2192 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2193 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2195 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2197 /* set message header global offset field (reg 0, element 2) */
2198 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2199 brw_MOV(p
, get_element_ud(mrf
, 2), brw_imm_ud(offset
));
2201 brw_pop_insn_state(p
);
2205 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2207 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2208 assert(brw_inst_pred_control(devinfo
, insn
) == 0);
2209 brw_inst_set_compression(devinfo
, insn
, false);
2211 brw_set_dest(p
, insn
, dest
); /* UW? */
2212 if (devinfo
->gen
>= 6) {
2213 brw_set_src0(p
, insn
, mrf
);
2215 brw_set_src0(p
, insn
, brw_null_reg());
2216 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2219 brw_set_desc(p
, insn
,
2220 brw_message_desc(devinfo
, 1, rlen
, true) |
2221 brw_dp_read_desc(devinfo
, brw_scratch_surface_idx(p
),
2222 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2223 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2224 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2229 gen7_block_read_scratch(struct brw_codegen
*p
,
2230 struct brw_reg dest
,
2234 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2235 assert(brw_inst_pred_control(p
->devinfo
, insn
) == BRW_PREDICATE_NONE
);
2237 brw_set_dest(p
, insn
, retype(dest
, BRW_REGISTER_TYPE_UW
));
2239 /* The HW requires that the header is present; this is to get the g0.5
2242 brw_set_src0(p
, insn
, brw_vec8_grf(0, 0));
2244 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2245 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2246 * is 32 bytes, which happens to be the size of a register.
2249 assert(offset
< (1 << 12));
2251 gen7_set_dp_scratch_message(p
, insn
,
2252 false, /* scratch read */
2254 false, /* invalidate after read */
2257 1, /* mlen: just g0 */
2258 num_regs
, /* rlen */
2259 true); /* header present */
2263 * Read float[4] vectors from the data port constant cache.
2264 * Location (in buffer) should be a multiple of 16.
2265 * Used for fetching shader constants.
2267 void brw_oword_block_read(struct brw_codegen
*p
,
2268 struct brw_reg dest
,
2271 uint32_t bind_table_index
)
2273 const struct gen_device_info
*devinfo
= p
->devinfo
;
2274 const unsigned target_cache
=
2275 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE
:
2276 BRW_SFID_DATAPORT_READ
);
2277 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
2279 /* On newer hardware, offset is in units of owords. */
2280 if (devinfo
->gen
>= 6)
2283 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2285 brw_push_insn_state(p
);
2286 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2287 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2288 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2290 brw_push_insn_state(p
);
2291 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2292 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2294 /* set message header global offset field (reg 0, element 2) */
2295 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2297 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2299 2), BRW_REGISTER_TYPE_UD
),
2300 brw_imm_ud(offset
));
2301 brw_pop_insn_state(p
);
2303 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2305 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2307 /* cast dest to a uword[8] vector */
2308 dest
= retype(vec8(dest
), BRW_REGISTER_TYPE_UW
);
2310 brw_set_dest(p
, insn
, dest
);
2311 if (devinfo
->gen
>= 6) {
2312 brw_set_src0(p
, insn
, mrf
);
2314 brw_set_src0(p
, insn
, brw_null_reg());
2315 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2318 brw_set_desc(p
, insn
,
2319 brw_message_desc(devinfo
, 1, DIV_ROUND_UP(exec_size
, 8), true) |
2320 brw_dp_read_desc(devinfo
, bind_table_index
,
2321 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size
),
2322 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2323 BRW_DATAPORT_READ_TARGET_DATA_CACHE
));
2325 brw_pop_insn_state(p
);
2329 brw_fb_WRITE(struct brw_codegen
*p
,
2330 struct brw_reg payload
,
2331 struct brw_reg implied_header
,
2332 unsigned msg_control
,
2333 unsigned binding_table_index
,
2334 unsigned msg_length
,
2335 unsigned response_length
,
2337 bool last_render_target
,
2338 bool header_present
)
2340 const struct gen_device_info
*devinfo
= p
->devinfo
;
2341 const unsigned target_cache
=
2342 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2343 BRW_SFID_DATAPORT_WRITE
);
2346 struct brw_reg dest
, src0
;
2348 if (brw_get_default_exec_size(p
) >= BRW_EXECUTE_16
)
2349 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2351 dest
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2353 if (devinfo
->gen
>= 6) {
2354 insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2356 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2358 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2359 brw_inst_set_compression(devinfo
, insn
, false);
2361 if (devinfo
->gen
>= 6) {
2362 /* headerless version, just submit color payload */
2365 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2367 assert(payload
.file
== BRW_MESSAGE_REGISTER_FILE
);
2368 brw_inst_set_base_mrf(devinfo
, insn
, payload
.nr
);
2369 src0
= implied_header
;
2371 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2374 brw_set_dest(p
, insn
, dest
);
2375 brw_set_src0(p
, insn
, src0
);
2376 brw_set_desc(p
, insn
,
2377 brw_message_desc(devinfo
, msg_length
, response_length
,
2379 brw_dp_write_desc(devinfo
, binding_table_index
, msg_control
,
2380 msg_type
, last_render_target
,
2381 0 /* send_commit_msg */));
2382 brw_inst_set_eot(devinfo
, insn
, eot
);
2388 gen9_fb_READ(struct brw_codegen
*p
,
2390 struct brw_reg payload
,
2391 unsigned binding_table_index
,
2392 unsigned msg_length
,
2393 unsigned response_length
,
2396 const struct gen_device_info
*devinfo
= p
->devinfo
;
2397 assert(devinfo
->gen
>= 9);
2398 const unsigned msg_subtype
=
2399 brw_get_default_exec_size(p
) == BRW_EXECUTE_16
? 0 : 1;
2400 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2402 brw_inst_set_sfid(devinfo
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
);
2403 brw_set_dest(p
, insn
, dst
);
2404 brw_set_src0(p
, insn
, payload
);
2407 brw_message_desc(devinfo
, msg_length
, response_length
, true) |
2408 brw_dp_read_desc(devinfo
, binding_table_index
,
2409 per_sample
<< 5 | msg_subtype
,
2410 GEN9_DATAPORT_RC_RENDER_TARGET_READ
,
2411 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2412 brw_inst_set_rt_slot_group(devinfo
, insn
, brw_get_default_group(p
) / 16);
2418 * Texture sample instruction.
2419 * Note: the msg_type plus msg_length values determine exactly what kind
2420 * of sampling operation is performed. See volume 4, page 161 of docs.
2422 void brw_SAMPLE(struct brw_codegen
*p
,
2423 struct brw_reg dest
,
2424 unsigned msg_reg_nr
,
2425 struct brw_reg src0
,
2426 unsigned binding_table_index
,
2429 unsigned response_length
,
2430 unsigned msg_length
,
2431 unsigned header_present
,
2433 unsigned return_format
)
2435 const struct gen_device_info
*devinfo
= p
->devinfo
;
2438 if (msg_reg_nr
!= -1)
2439 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2441 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2442 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_SAMPLER
);
2443 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
); /* XXX */
2445 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2447 * "Instruction compression is not allowed for this instruction (that
2448 * is, send). The hardware behavior is undefined if this instruction is
2449 * set as compressed. However, compress control can be set to "SecHalf"
2450 * to affect the EMask generation."
2452 * No similar wording is found in later PRMs, but there are examples
2453 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2454 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2455 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2457 brw_inst_set_compression(devinfo
, insn
, false);
2459 if (devinfo
->gen
< 6)
2460 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2462 brw_set_dest(p
, insn
, dest
);
2463 brw_set_src0(p
, insn
, src0
);
2464 brw_set_desc(p
, insn
,
2465 brw_message_desc(devinfo
, msg_length
, response_length
,
2467 brw_sampler_desc(devinfo
, binding_table_index
, sampler
,
2468 msg_type
, simd_mode
, return_format
));
2471 /* Adjust the message header's sampler state pointer to
2472 * select the correct group of 16 samplers.
2474 void brw_adjust_sampler_state_pointer(struct brw_codegen
*p
,
2475 struct brw_reg header
,
2476 struct brw_reg sampler_index
)
2478 /* The "Sampler Index" field can only store values between 0 and 15.
2479 * However, we can add an offset to the "Sampler State Pointer"
2480 * field, effectively selecting a different set of 16 samplers.
2482 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2483 * offset, and each sampler state is only 16-bytes, so we can't
2484 * exclusively use the offset - we have to use both.
2487 const struct gen_device_info
*devinfo
= p
->devinfo
;
2489 if (sampler_index
.file
== BRW_IMMEDIATE_VALUE
) {
2490 const int sampler_state_size
= 16; /* 16 bytes */
2491 uint32_t sampler
= sampler_index
.ud
;
2493 if (sampler
>= 16) {
2494 assert(devinfo
->is_haswell
|| devinfo
->gen
>= 8);
2496 get_element_ud(header
, 3),
2497 get_element_ud(brw_vec8_grf(0, 0), 3),
2498 brw_imm_ud(16 * (sampler
/ 16) * sampler_state_size
));
2501 /* Non-const sampler array indexing case */
2502 if (devinfo
->gen
< 8 && !devinfo
->is_haswell
) {
2506 struct brw_reg temp
= get_element_ud(header
, 3);
2508 brw_AND(p
, temp
, get_element_ud(sampler_index
, 0), brw_imm_ud(0x0f0));
2509 brw_SHL(p
, temp
, temp
, brw_imm_ud(4));
2511 get_element_ud(header
, 3),
2512 get_element_ud(brw_vec8_grf(0, 0), 3),
2517 /* All these variables are pretty confusing - we might be better off
2518 * using bitmasks and macros for this, in the old style. Or perhaps
2519 * just having the caller instantiate the fields in dword3 itself.
2521 void brw_urb_WRITE(struct brw_codegen
*p
,
2522 struct brw_reg dest
,
2523 unsigned msg_reg_nr
,
2524 struct brw_reg src0
,
2525 enum brw_urb_write_flags flags
,
2526 unsigned msg_length
,
2527 unsigned response_length
,
2531 const struct gen_device_info
*devinfo
= p
->devinfo
;
2534 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2536 if (devinfo
->gen
>= 7 && !(flags
& BRW_URB_WRITE_USE_CHANNEL_MASKS
)) {
2537 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2538 brw_push_insn_state(p
);
2539 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2540 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2541 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2542 brw_OR(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, msg_reg_nr
, 5),
2543 BRW_REGISTER_TYPE_UD
),
2544 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD
),
2545 brw_imm_ud(0xff00));
2546 brw_pop_insn_state(p
);
2549 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2551 assert(msg_length
< BRW_MAX_MRF(devinfo
->gen
));
2553 brw_set_dest(p
, insn
, dest
);
2554 brw_set_src0(p
, insn
, src0
);
2555 brw_set_src1(p
, insn
, brw_imm_d(0));
2557 if (devinfo
->gen
< 6)
2558 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2560 brw_set_urb_message(p
,
2570 brw_send_indirect_message(struct brw_codegen
*p
,
2573 struct brw_reg payload
,
2574 struct brw_reg desc
,
2578 const struct gen_device_info
*devinfo
= p
->devinfo
;
2579 struct brw_inst
*send
;
2581 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2583 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2585 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2586 send
= next_insn(p
, BRW_OPCODE_SEND
);
2587 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2588 brw_set_desc(p
, send
, desc
.ud
| desc_imm
);
2590 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2592 brw_push_insn_state(p
);
2593 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2594 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2595 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2596 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2598 /* Load the indirect descriptor to an address register using OR so the
2599 * caller can specify additional descriptor bits with the desc_imm
2602 brw_OR(p
, addr
, desc
, brw_imm_ud(desc_imm
));
2604 brw_pop_insn_state(p
);
2606 send
= next_insn(p
, BRW_OPCODE_SEND
);
2607 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2609 if (devinfo
->gen
>= 12)
2610 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, true);
2612 brw_set_src1(p
, send
, addr
);
2615 brw_set_dest(p
, send
, dst
);
2616 brw_inst_set_sfid(devinfo
, send
, sfid
);
2617 brw_inst_set_eot(devinfo
, send
, eot
);
2621 brw_send_indirect_split_message(struct brw_codegen
*p
,
2624 struct brw_reg payload0
,
2625 struct brw_reg payload1
,
2626 struct brw_reg desc
,
2628 struct brw_reg ex_desc
,
2629 unsigned ex_desc_imm
,
2632 const struct gen_device_info
*devinfo
= p
->devinfo
;
2633 struct brw_inst
*send
;
2635 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2637 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2639 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2640 desc
.ud
|= desc_imm
;
2642 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2644 brw_push_insn_state(p
);
2645 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2646 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2647 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2648 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2650 /* Load the indirect descriptor to an address register using OR so the
2651 * caller can specify additional descriptor bits with the desc_imm
2654 brw_OR(p
, addr
, desc
, brw_imm_ud(desc_imm
));
2656 brw_pop_insn_state(p
);
2660 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
&&
2661 (ex_desc
.ud
& INTEL_MASK(15, 12)) == 0) {
2662 ex_desc
.ud
|= ex_desc_imm
;
2664 struct brw_reg addr
= retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD
);
2666 brw_push_insn_state(p
);
2667 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2668 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2669 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2670 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2672 /* Load the indirect extended descriptor to an address register using OR
2673 * so the caller can specify additional descriptor bits with the
2674 * desc_imm immediate.
2676 * Even though the instruction dispatcher always pulls the SFID and EOT
2677 * fields from the instruction itself, actual external unit which
2678 * processes the message gets the SFID and EOT from the extended
2679 * descriptor which comes from the address register. If we don't OR
2680 * those two bits in, the external unit may get confused and hang.
2682 unsigned imm_part
= ex_desc_imm
| sfid
| eot
<< 5;
2684 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
) {
2685 /* ex_desc bits 15:12 don't exist in the instruction encoding, so
2686 * we may have fallen back to an indirect extended descriptor.
2688 brw_MOV(p
, addr
, brw_imm_ud(ex_desc
.ud
| imm_part
));
2690 brw_OR(p
, addr
, ex_desc
, brw_imm_ud(imm_part
));
2693 brw_pop_insn_state(p
);
2697 send
= next_insn(p
, devinfo
->gen
>= 12 ? BRW_OPCODE_SEND
: BRW_OPCODE_SENDS
);
2698 brw_set_dest(p
, send
, dst
);
2699 brw_set_src0(p
, send
, retype(payload0
, BRW_REGISTER_TYPE_UD
));
2700 brw_set_src1(p
, send
, retype(payload1
, BRW_REGISTER_TYPE_UD
));
2702 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2703 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, 0);
2704 brw_inst_set_send_desc(devinfo
, send
, desc
.ud
);
2706 assert(desc
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
2707 assert(desc
.nr
== BRW_ARF_ADDRESS
);
2708 assert(desc
.subnr
== 0);
2709 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, 1);
2712 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
) {
2713 brw_inst_set_send_sel_reg32_ex_desc(devinfo
, send
, 0);
2714 brw_inst_set_sends_ex_desc(devinfo
, send
, ex_desc
.ud
);
2716 assert(ex_desc
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
2717 assert(ex_desc
.nr
== BRW_ARF_ADDRESS
);
2718 assert((ex_desc
.subnr
& 0x3) == 0);
2719 brw_inst_set_send_sel_reg32_ex_desc(devinfo
, send
, 1);
2720 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo
, send
, ex_desc
.subnr
>> 2);
2723 brw_inst_set_sfid(devinfo
, send
, sfid
);
2724 brw_inst_set_eot(devinfo
, send
, eot
);
2728 brw_send_indirect_surface_message(struct brw_codegen
*p
,
2731 struct brw_reg payload
,
2732 struct brw_reg surface
,
2735 if (surface
.file
!= BRW_IMMEDIATE_VALUE
) {
2736 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2738 brw_push_insn_state(p
);
2739 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2740 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2741 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2742 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2744 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2745 * some surface array is accessed out of bounds.
2748 suboffset(vec1(retype(surface
, BRW_REGISTER_TYPE_UD
)),
2749 BRW_GET_SWZ(surface
.swizzle
, 0)),
2752 brw_pop_insn_state(p
);
2757 brw_send_indirect_message(p
, sfid
, dst
, payload
, surface
, desc_imm
, false);
2761 while_jumps_before_offset(const struct gen_device_info
*devinfo
,
2762 brw_inst
*insn
, int while_offset
, int start_offset
)
2764 int scale
= 16 / brw_jump_scale(devinfo
);
2765 int jip
= devinfo
->gen
== 6 ? brw_inst_gen6_jump_count(devinfo
, insn
)
2766 : brw_inst_jip(devinfo
, insn
);
2768 return while_offset
+ jip
* scale
<= start_offset
;
2773 brw_find_next_block_end(struct brw_codegen
*p
, int start_offset
)
2776 void *store
= p
->store
;
2777 const struct gen_device_info
*devinfo
= p
->devinfo
;
2781 for (offset
= next_offset(devinfo
, store
, start_offset
);
2782 offset
< p
->next_insn_offset
;
2783 offset
= next_offset(devinfo
, store
, offset
)) {
2784 brw_inst
*insn
= store
+ offset
;
2786 switch (brw_inst_opcode(devinfo
, insn
)) {
2790 case BRW_OPCODE_ENDIF
:
2795 case BRW_OPCODE_WHILE
:
2796 /* If the while doesn't jump before our instruction, it's the end
2797 * of a sibling do...while loop. Ignore it.
2799 if (!while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2802 case BRW_OPCODE_ELSE
:
2803 case BRW_OPCODE_HALT
:
2814 /* There is no DO instruction on gen6, so to find the end of the loop
2815 * we have to see if the loop is jumping back before our start
2819 brw_find_loop_end(struct brw_codegen
*p
, int start_offset
)
2821 const struct gen_device_info
*devinfo
= p
->devinfo
;
2823 void *store
= p
->store
;
2825 assert(devinfo
->gen
>= 6);
2827 /* Always start after the instruction (such as a WHILE) we're trying to fix
2830 for (offset
= next_offset(devinfo
, store
, start_offset
);
2831 offset
< p
->next_insn_offset
;
2832 offset
= next_offset(devinfo
, store
, offset
)) {
2833 brw_inst
*insn
= store
+ offset
;
2835 if (brw_inst_opcode(devinfo
, insn
) == BRW_OPCODE_WHILE
) {
2836 if (while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2840 assert(!"not reached");
2841 return start_offset
;
2844 /* After program generation, go back and update the UIP and JIP of
2845 * BREAK, CONT, and HALT instructions to their correct locations.
2848 brw_set_uip_jip(struct brw_codegen
*p
, int start_offset
)
2850 const struct gen_device_info
*devinfo
= p
->devinfo
;
2852 int br
= brw_jump_scale(devinfo
);
2853 int scale
= 16 / br
;
2854 void *store
= p
->store
;
2856 if (devinfo
->gen
< 6)
2859 for (offset
= start_offset
; offset
< p
->next_insn_offset
; offset
+= 16) {
2860 brw_inst
*insn
= store
+ offset
;
2861 assert(brw_inst_cmpt_control(devinfo
, insn
) == 0);
2863 int block_end_offset
= brw_find_next_block_end(p
, offset
);
2864 switch (brw_inst_opcode(devinfo
, insn
)) {
2865 case BRW_OPCODE_BREAK
:
2866 assert(block_end_offset
!= 0);
2867 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2868 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2869 brw_inst_set_uip(devinfo
, insn
,
2870 (brw_find_loop_end(p
, offset
) - offset
+
2871 (devinfo
->gen
== 6 ? 16 : 0)) / scale
);
2873 case BRW_OPCODE_CONTINUE
:
2874 assert(block_end_offset
!= 0);
2875 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2876 brw_inst_set_uip(devinfo
, insn
,
2877 (brw_find_loop_end(p
, offset
) - offset
) / scale
);
2879 assert(brw_inst_uip(devinfo
, insn
) != 0);
2880 assert(brw_inst_jip(devinfo
, insn
) != 0);
2883 case BRW_OPCODE_ENDIF
: {
2884 int32_t jump
= (block_end_offset
== 0) ?
2885 1 * br
: (block_end_offset
- offset
) / scale
;
2886 if (devinfo
->gen
>= 7)
2887 brw_inst_set_jip(devinfo
, insn
, jump
);
2889 brw_inst_set_gen6_jump_count(devinfo
, insn
, jump
);
2893 case BRW_OPCODE_HALT
:
2894 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2896 * "In case of the halt instruction not inside any conditional
2897 * code block, the value of <JIP> and <UIP> should be the
2898 * same. In case of the halt instruction inside conditional code
2899 * block, the <UIP> should be the end of the program, and the
2900 * <JIP> should be end of the most inner conditional code block."
2902 * The uip will have already been set by whoever set up the
2905 if (block_end_offset
== 0) {
2906 brw_inst_set_jip(devinfo
, insn
, brw_inst_uip(devinfo
, insn
));
2908 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2910 assert(brw_inst_uip(devinfo
, insn
) != 0);
2911 assert(brw_inst_jip(devinfo
, insn
) != 0);
2920 void brw_ff_sync(struct brw_codegen
*p
,
2921 struct brw_reg dest
,
2922 unsigned msg_reg_nr
,
2923 struct brw_reg src0
,
2925 unsigned response_length
,
2928 const struct gen_device_info
*devinfo
= p
->devinfo
;
2931 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2933 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2934 brw_set_dest(p
, insn
, dest
);
2935 brw_set_src0(p
, insn
, src0
);
2936 brw_set_src1(p
, insn
, brw_imm_d(0));
2938 if (devinfo
->gen
< 6)
2939 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2941 brw_set_ff_sync_message(p
,
2949 * Emit the SEND instruction necessary to generate stream output data on Gen6
2950 * (for transform feedback).
2952 * If send_commit_msg is true, this is the last piece of stream output data
2953 * from this thread, so send the data as a committed write. According to the
2954 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2956 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2957 * writes are complete by sending the final write as a committed write."
2960 brw_svb_write(struct brw_codegen
*p
,
2961 struct brw_reg dest
,
2962 unsigned msg_reg_nr
,
2963 struct brw_reg src0
,
2964 unsigned binding_table_index
,
2965 bool send_commit_msg
)
2967 const struct gen_device_info
*devinfo
= p
->devinfo
;
2968 const unsigned target_cache
=
2969 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2970 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2971 BRW_SFID_DATAPORT_WRITE
);
2974 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2976 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2977 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2978 brw_set_dest(p
, insn
, dest
);
2979 brw_set_src0(p
, insn
, src0
);
2980 brw_set_desc(p
, insn
,
2981 brw_message_desc(devinfo
, 1, send_commit_msg
, true) |
2982 brw_dp_write_desc(devinfo
, binding_table_index
,
2983 0, /* msg_control: ignored */
2984 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE
,
2985 0, /* last_render_target: ignored */
2986 send_commit_msg
)); /* send_commit_msg */
/* Number of response registers for a surface read/atomic of the given
 * execution size; exec_size == 0 denotes a SIMD4x2 message.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   /* A SIMD4x2 message packs its whole payload into one register. */
   if (exec_size == 0)
      return 1; /* SIMD4x2 */

   /* SIMD8 needs one register per channel, SIMD16 needs two. */
   return (exec_size <= 8 ? 1 : 2) * num_channels;
}
3003 brw_untyped_atomic(struct brw_codegen
*p
,
3005 struct brw_reg payload
,
3006 struct brw_reg surface
,
3008 unsigned msg_length
,
3009 bool response_expected
,
3010 bool header_present
)
3012 const struct gen_device_info
*devinfo
= p
->devinfo
;
3013 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3014 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3015 GEN7_SFID_DATAPORT_DATA_CACHE
);
3016 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3017 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3018 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
3019 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
3020 has_simd4x2
? 0 : 8;
3021 const unsigned response_length
=
3022 brw_surface_payload_size(p
, response_expected
, exec_size
);
3023 const unsigned desc
=
3024 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
3025 brw_dp_untyped_atomic_desc(devinfo
, exec_size
, atomic_op
,
3027 /* Mask out unused components -- This is especially important in Align16
3028 * mode on generations that don't have native support for SIMD4x2 atomics,
3029 * because unused but enabled components will cause the dataport to perform
3030 * additional atomic operations on the addresses that happen to be in the
3031 * uninitialized Y, Z and W coordinates of the payload.
3033 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
3035 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(dst
, mask
),
3036 payload
, surface
, desc
);
3040 brw_untyped_surface_read(struct brw_codegen
*p
,
3042 struct brw_reg payload
,
3043 struct brw_reg surface
,
3044 unsigned msg_length
,
3045 unsigned num_channels
)
3047 const struct gen_device_info
*devinfo
= p
->devinfo
;
3048 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3049 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3050 GEN7_SFID_DATAPORT_DATA_CACHE
);
3051 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3052 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) : 0;
3053 const unsigned response_length
=
3054 brw_surface_payload_size(p
, num_channels
, exec_size
);
3055 const unsigned desc
=
3056 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3057 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, false);
3059 brw_send_indirect_surface_message(p
, sfid
, dst
, payload
, surface
, desc
);
3063 brw_untyped_surface_write(struct brw_codegen
*p
,
3064 struct brw_reg payload
,
3065 struct brw_reg surface
,
3066 unsigned msg_length
,
3067 unsigned num_channels
,
3068 bool header_present
)
3070 const struct gen_device_info
*devinfo
= p
->devinfo
;
3071 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3072 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3073 GEN7_SFID_DATAPORT_DATA_CACHE
);
3074 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3075 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3076 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
3077 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
3078 has_simd4x2
? 0 : 8;
3079 const unsigned desc
=
3080 brw_message_desc(devinfo
, msg_length
, 0, header_present
) |
3081 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, true);
3082 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3083 const unsigned mask
= !has_simd4x2
&& !align1
? WRITEMASK_X
: WRITEMASK_XYZW
;
3085 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(brw_null_reg(), mask
),
3086 payload
, surface
, desc
);
3090 brw_set_memory_fence_message(struct brw_codegen
*p
,
3091 struct brw_inst
*insn
,
3092 enum brw_message_target sfid
,
3096 const struct gen_device_info
*devinfo
= p
->devinfo
;
3098 brw_set_desc(p
, insn
, brw_message_desc(
3099 devinfo
, 1, (commit_enable
? 1 : 0), true));
3101 brw_inst_set_sfid(devinfo
, insn
, sfid
);
3104 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
3105 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_RC_MEMORY_FENCE
);
3107 case GEN7_SFID_DATAPORT_DATA_CACHE
:
3108 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_DC_MEMORY_FENCE
);
3111 unreachable("Not reached");
3115 brw_inst_set_dp_msg_control(devinfo
, insn
, 1 << 5);
3117 assert(devinfo
->gen
>= 11 || bti
== 0);
3118 brw_inst_set_binding_table_index(devinfo
, insn
, bti
);
3122 brw_memory_fence(struct brw_codegen
*p
,
3125 enum opcode send_op
,
3129 const struct gen_device_info
*devinfo
= p
->devinfo
;
3130 const bool commit_enable
= stall
||
3131 devinfo
->gen
>= 10 || /* HSD ES # 1404612949 */
3132 (devinfo
->gen
== 7 && !devinfo
->is_haswell
);
3133 struct brw_inst
*insn
;
3135 brw_push_insn_state(p
);
3136 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3137 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3138 dst
= retype(vec1(dst
), BRW_REGISTER_TYPE_UW
);
3139 src
= retype(vec1(src
), BRW_REGISTER_TYPE_UD
);
3141 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3142 * message doesn't write anything back.
3144 insn
= next_insn(p
, send_op
);
3145 brw_set_dest(p
, insn
, dst
);
3146 brw_set_src0(p
, insn
, src
);
3147 brw_set_memory_fence_message(p
, insn
, GEN7_SFID_DATAPORT_DATA_CACHE
,
3148 commit_enable
, bti
);
3150 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
) {
3151 /* IVB does typed surface access through the render cache, so we need to
3152 * flush it too. Use a different register so both flushes can be
3153 * pipelined by the hardware.
3155 insn
= next_insn(p
, send_op
);
3156 brw_set_dest(p
, insn
, offset(dst
, 1));
3157 brw_set_src0(p
, insn
, src
);
3158 brw_set_memory_fence_message(p
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
,
3159 commit_enable
, bti
);
3161 /* Now write the response of the second message into the response of the
3162 * first to trigger a pipeline stall -- This way future render and data
3163 * cache messages will be properly ordered with respect to past data and
3164 * render cache messages.
3166 brw_MOV(p
, dst
, offset(dst
, 1));
3170 brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
), dst
);
3172 brw_pop_insn_state(p
);
3176 brw_pixel_interpolator_query(struct brw_codegen
*p
,
3177 struct brw_reg dest
,
3181 struct brw_reg data
,
3182 unsigned msg_length
,
3183 unsigned response_length
)
3185 const struct gen_device_info
*devinfo
= p
->devinfo
;
3186 const uint16_t exec_size
= brw_get_default_exec_size(p
);
3187 const unsigned slot_group
= brw_get_default_group(p
) / 16;
3188 const unsigned simd_mode
= (exec_size
== BRW_EXECUTE_16
);
3189 const unsigned desc
=
3190 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3191 brw_pixel_interp_desc(devinfo
, mode
, noperspective
, simd_mode
,
3194 /* brw_send_indirect_message will automatically use a direct send message
3195 * if data is actually immediate.
3197 brw_send_indirect_message(p
,
3198 GEN7_SFID_PIXEL_INTERPOLATOR
,
3207 brw_find_live_channel(struct brw_codegen
*p
, struct brw_reg dst
,
3208 struct brw_reg mask
)
3210 const struct gen_device_info
*devinfo
= p
->devinfo
;
3211 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
3212 const unsigned qtr_control
= brw_get_default_group(p
) / 8;
3215 assert(devinfo
->gen
>= 7);
3216 assert(mask
.type
== BRW_REGISTER_TYPE_UD
);
3218 brw_push_insn_state(p
);
3220 /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3221 * unnecessary bits in the instruction words, get the information we need
3222 * and reset the default flag register. This allows more instructions to be
3225 const unsigned flag_subreg
= p
->current
->flag_subreg
;
3226 brw_set_default_flag_reg(p
, 0, 0);
3228 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3229 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3231 if (devinfo
->gen
>= 8) {
3232 /* Getting the first active channel index is easy on Gen8: Just find
3233 * the first bit set in the execution mask. The register exists on
3234 * HSW already but it reads back as all ones when the current
3235 * instruction has execution masking disabled, so it's kind of
3238 struct brw_reg exec_mask
=
3239 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
);
3241 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3242 if (mask
.file
!= BRW_IMMEDIATE_VALUE
|| mask
.ud
!= 0xffffffff) {
3243 /* Unfortunately, ce0 does not take into account the thread
3244 * dispatch mask, which may be a problem in cases where it's not
3245 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3246 * some n). Combine ce0 with the given dispatch (or vector) mask
3247 * to mask off those channels which were never dispatched by the
3250 brw_SHR(p
, vec1(dst
), mask
, brw_imm_ud(qtr_control
* 8));
3251 brw_AND(p
, vec1(dst
), exec_mask
, vec1(dst
));
3252 exec_mask
= vec1(dst
);
3255 /* Quarter control has the effect of magically shifting the value of
3256 * ce0 so you'll get the first active channel relative to the
3257 * specified quarter control as result.
3259 inst
= brw_FBL(p
, vec1(dst
), exec_mask
);
3261 const struct brw_reg flag
= brw_flag_subreg(flag_subreg
);
3263 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3264 brw_MOV(p
, retype(flag
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
3266 /* Run enough instructions returning zero with execution masking and
3267 * a conditional modifier enabled in order to get the full execution
3268 * mask in f1.0. We could use a single 32-wide move here if it
3269 * weren't because of the hardware bug that causes channel enables to
3270 * be applied incorrectly to the second half of 32-wide instructions
3273 const unsigned lower_size
= MIN2(16, exec_size
);
3274 for (unsigned i
= 0; i
< exec_size
/ lower_size
; i
++) {
3275 inst
= brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
),
3277 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3278 brw_inst_set_group(devinfo
, inst
, lower_size
* i
+ 8 * qtr_control
);
3279 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_Z
);
3280 brw_inst_set_exec_size(devinfo
, inst
, cvt(lower_size
) - 1);
3281 brw_inst_set_flag_reg_nr(devinfo
, inst
, flag_subreg
/ 2);
3282 brw_inst_set_flag_subreg_nr(devinfo
, inst
, flag_subreg
% 2);
3285 /* Find the first bit set in the exec_size-wide portion of the flag
3286 * register that was updated by the last sequence of MOV
3289 const enum brw_reg_type type
= brw_int_type(exec_size
/ 8, false);
3290 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3291 brw_FBL(p
, vec1(dst
), byte_offset(retype(flag
, type
), qtr_control
));
3294 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3296 if (devinfo
->gen
>= 8 &&
3297 mask
.file
== BRW_IMMEDIATE_VALUE
&& mask
.ud
== 0xffffffff) {
3298 /* In SIMD4x2 mode the first active channel index is just the
3299 * negation of the first bit of the mask register. Note that ce0
3300 * doesn't take into account the dispatch mask, so the Gen7 path
3301 * should be used instead unless you have the guarantee that the
3302 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3305 inst
= brw_AND(p
, brw_writemask(dst
, WRITEMASK_X
),
3306 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
)),
3310 /* Overwrite the destination without and with execution masking to
3311 * find out which of the channels is active.
3313 brw_push_insn_state(p
);
3314 brw_set_default_exec_size(p
, BRW_EXECUTE_4
);
3315 brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3318 inst
= brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3320 brw_pop_insn_state(p
);
3321 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3325 brw_pop_insn_state(p
);
3329 brw_broadcast(struct brw_codegen
*p
,
3334 const struct gen_device_info
*devinfo
= p
->devinfo
;
3335 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3338 brw_push_insn_state(p
);
3339 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3340 brw_set_default_exec_size(p
, align1
? BRW_EXECUTE_1
: BRW_EXECUTE_4
);
3342 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
&&
3343 src
.address_mode
== BRW_ADDRESS_DIRECT
);
3344 assert(!src
.abs
&& !src
.negate
);
3345 assert(src
.type
== dst
.type
);
3347 if ((src
.vstride
== 0 && (src
.hstride
== 0 || !align1
)) ||
3348 idx
.file
== BRW_IMMEDIATE_VALUE
) {
3349 /* Trivial, the source is already uniform or the index is a constant.
3350 * We will typically not get here if the optimizer is doing its job, but
3351 * asserting would be mean.
3353 const unsigned i
= idx
.file
== BRW_IMMEDIATE_VALUE
? idx
.ud
: 0;
3355 (align1
? stride(suboffset(src
, i
), 0, 1, 0) :
3356 stride(suboffset(src
, 4 * i
), 0, 4, 1)));
3358 /* From the Haswell PRM section "Register Region Restrictions":
3360 * "The lower bits of the AddressImmediate must not overflow to
3361 * change the register address. The lower 5 bits of Address
3362 * Immediate when added to lower 5 bits of address register gives
3363 * the sub-register offset. The upper bits of Address Immediate
3364 * when added to upper bits of address register gives the register
3365 * address. Any overflow from sub-register offset is dropped."
3367 * Fortunately, for broadcast, we never have a sub-register offset so
3368 * this isn't an issue.
3370 assert(src
.subnr
== 0);
3373 const struct brw_reg addr
=
3374 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
3375 unsigned offset
= src
.nr
* REG_SIZE
+ src
.subnr
;
3376 /* Limit in bytes of the signed indirect addressing immediate. */
3377 const unsigned limit
= 512;
3379 brw_push_insn_state(p
);
3380 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3381 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
3383 /* Take into account the component size and horizontal stride. */
3384 assert(src
.vstride
== src
.hstride
+ src
.width
);
3385 brw_SHL(p
, addr
, vec1(idx
),
3386 brw_imm_ud(_mesa_logbase2(type_sz(src
.type
)) +
3389 /* We can only address up to limit bytes using the indirect
3390 * addressing immediate, account for the difference if the source
3391 * register is above this limit.
3393 if (offset
>= limit
) {
3394 brw_ADD(p
, addr
, addr
, brw_imm_ud(offset
- offset
% limit
));
3395 offset
= offset
% limit
;
3398 brw_pop_insn_state(p
);
3400 /* Use indirect addressing to fetch the specified component. */
3401 if (type_sz(src
.type
) > 4 &&
3402 (devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
))) {
3403 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3405 * "When source or destination datatype is 64b or operation is
3406 * integer DWord multiply, indirect addressing must not be
3409 * To work around both of this issue, we do two integer MOVs
3410 * insead of one 64-bit MOV. Because no double value should ever
3411 * cross a register boundary, it's safe to use the immediate
3412 * offset in the indirect here to handle adding 4 bytes to the
3413 * offset and avoid the extra ADD to the register file.
3415 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 0),
3416 retype(brw_vec1_indirect(addr
.subnr
, offset
),
3417 BRW_REGISTER_TYPE_D
));
3418 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 1),
3419 retype(brw_vec1_indirect(addr
.subnr
, offset
+ 4),
3420 BRW_REGISTER_TYPE_D
));
3423 retype(brw_vec1_indirect(addr
.subnr
, offset
), src
.type
));
3426 /* In SIMD4x2 mode the index can be either zero or one, replicate it
3427 * to all bits of a flag register,
3431 stride(brw_swizzle(idx
, BRW_SWIZZLE_XXXX
), 4, 4, 1));
3432 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NONE
);
3433 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_NZ
);
3434 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3436 /* and use predicated SEL to pick the right channel. */
3437 inst
= brw_SEL(p
, dst
,
3438 stride(suboffset(src
, 4), 4, 4, 1),
3439 stride(src
, 4, 4, 1));
3440 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NORMAL
);
3441 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3445 brw_pop_insn_state(p
);
3449 * This instruction is generated as a single-channel align1 instruction by
3450 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3452 * We can't use the typed atomic op in the FS because that has the execution
3453 * mask ANDed with the pixel mask, but we just want to write the one dword for
3456 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3457 * one u32. So we use the same untyped atomic write message as the pixel
3460 * The untyped atomic operation requires a BUFFER surface type with RAW
3461 * format, and is only accessible through the legacy DATA_CACHE dataport
3464 void brw_shader_time_add(struct brw_codegen
*p
,
3465 struct brw_reg payload
,
3466 uint32_t surf_index
)
3468 const struct gen_device_info
*devinfo
= p
->devinfo
;
3469 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3470 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3471 GEN7_SFID_DATAPORT_DATA_CACHE
);
3472 assert(devinfo
->gen
>= 7);
3474 brw_push_insn_state(p
);
3475 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3476 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3477 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
3478 brw_inst
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
3480 /* We use brw_vec1_reg and unmasked because we want to increment the given
3483 brw_set_dest(p
, send
, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
3485 brw_set_src0(p
, send
, brw_vec1_reg(payload
.file
,
3487 brw_set_desc(p
, send
, (brw_message_desc(devinfo
, 2, 0, false) |
3488 brw_dp_untyped_atomic_desc(devinfo
, 1, BRW_AOP_ADD
,
3491 brw_inst_set_sfid(devinfo
, send
, sfid
);
3492 brw_inst_set_binding_table_index(devinfo
, send
, surf_index
);
3494 brw_pop_insn_state(p
);
3499 * Emit the SEND message for a barrier
3502 brw_barrier(struct brw_codegen
*p
, struct brw_reg src
)
3504 const struct gen_device_info
*devinfo
= p
->devinfo
;
3505 struct brw_inst
*inst
;
3507 assert(devinfo
->gen
>= 7);
3509 brw_push_insn_state(p
);
3510 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3511 inst
= next_insn(p
, BRW_OPCODE_SEND
);
3512 brw_set_dest(p
, inst
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
));
3513 brw_set_src0(p
, inst
, src
);
3514 brw_set_src1(p
, inst
, brw_null_reg());
3515 brw_set_desc(p
, inst
, brw_message_desc(devinfo
, 1, 0, false));
3517 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MESSAGE_GATEWAY
);
3518 brw_inst_set_gateway_notify(devinfo
, inst
, 1);
3519 brw_inst_set_gateway_subfuncid(devinfo
, inst
,
3520 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG
);
3522 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
3523 brw_pop_insn_state(p
);
3528 * Emit the wait instruction for a barrier
3531 brw_WAIT(struct brw_codegen
*p
)
3533 const struct gen_device_info
*devinfo
= p
->devinfo
;
3534 struct brw_inst
*insn
;
3536 struct brw_reg src
= brw_notification_reg();
3538 insn
= next_insn(p
, BRW_OPCODE_WAIT
);
3539 brw_set_dest(p
, insn
, src
);
3540 brw_set_src0(p
, insn
, src
);
3541 brw_set_src1(p
, insn
, brw_null_reg());
3543 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
3544 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_DISABLE
);
3548 brw_float_controls_mode(struct brw_codegen
*p
,
3549 unsigned mode
, unsigned mask
)
3551 brw_inst
*inst
= brw_AND(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3553 brw_inst_set_exec_size(p
->devinfo
, inst
, BRW_EXECUTE_1
);
3555 /* From the Skylake PRM, Volume 7, page 760:
3556 * "Implementation Restriction on Register Access: When the control
3557 * register is used as an explicit source and/or destination, hardware
3558 * does not ensure execution pipeline coherency. Software must set the
3559 * thread control field to ‘switch’ for an instruction that uses
3560 * control register as an explicit operand."
3562 if (p
->devinfo
->gen
< 12)
3563 brw_inst_set_thread_control(p
->devinfo
, inst
, BRW_THREAD_SWITCH
);
3566 brw_inst
*inst_or
= brw_OR(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3568 brw_inst_set_exec_size(p
->devinfo
, inst_or
, BRW_EXECUTE_1
);
3569 if (p
->devinfo
->gen
< 12)
3570 brw_inst_set_thread_control(p
->devinfo
, inst_or
, BRW_THREAD_SWITCH
);