2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
33 #include "brw_eu_defines.h"
36 #include "util/ralloc.h"
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
46 gen6_resolve_implied_move(struct brw_codegen
*p
,
50 const struct gen_device_info
*devinfo
= p
->devinfo
;
54 if (src
->file
== BRW_MESSAGE_REGISTER_FILE
)
57 if (src
->file
!= BRW_ARCHITECTURE_REGISTER_FILE
|| src
->nr
!= BRW_ARF_NULL
) {
58 brw_push_insn_state(p
);
59 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
60 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
61 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
62 brw_MOV(p
, retype(brw_message_reg(msg_reg_nr
), BRW_REGISTER_TYPE_UD
),
63 retype(*src
, BRW_REGISTER_TYPE_UD
));
64 brw_pop_insn_state(p
);
66 *src
= brw_message_reg(msg_reg_nr
);
70 gen7_convert_mrf_to_grf(struct brw_codegen
*p
, struct brw_reg
*reg
)
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
80 const struct gen_device_info
*devinfo
= p
->devinfo
;
81 if (devinfo
->gen
>= 7 && reg
->file
== BRW_MESSAGE_REGISTER_FILE
) {
82 reg
->file
= BRW_GENERAL_REGISTER_FILE
;
83 reg
->nr
+= GEN7_MRF_HACK_START
;
88 brw_set_dest(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg dest
)
90 const struct gen_device_info
*devinfo
= p
->devinfo
;
92 if (dest
.file
== BRW_MESSAGE_REGISTER_FILE
)
93 assert((dest
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
94 else if (dest
.file
== BRW_GENERAL_REGISTER_FILE
)
95 assert(dest
.nr
< 128);
97 /* The hardware has a restriction where if the destination is Byte,
98 * the instruction needs to have a stride of 2 (except for packed byte
99 * MOV). This seems to be required even if the destination is the NULL
102 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
103 dest
.nr
== BRW_ARF_NULL
&&
104 type_sz(dest
.type
) == 1) {
105 dest
.hstride
= BRW_HORIZONTAL_STRIDE_2
;
108 gen7_convert_mrf_to_grf(p
, &dest
);
110 if (devinfo
->gen
>= 12 &&
111 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
112 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
)) {
113 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
114 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
115 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
116 assert(dest
.subnr
== 0);
117 assert(brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
||
118 (dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
119 dest
.vstride
== dest
.width
+ 1));
120 assert(!dest
.negate
&& !dest
.abs
);
121 brw_inst_set_dst_reg_file(devinfo
, inst
, dest
.file
);
122 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
124 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
125 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
) {
126 assert(devinfo
->gen
< 12);
127 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
128 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
129 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
130 assert(dest
.subnr
% 16 == 0);
131 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
132 dest
.vstride
== dest
.width
+ 1);
133 assert(!dest
.negate
&& !dest
.abs
);
134 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
135 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
136 brw_inst_set_send_dst_reg_file(devinfo
, inst
, dest
.file
);
138 brw_inst_set_dst_file_type(devinfo
, inst
, dest
.file
, dest
.type
);
139 brw_inst_set_dst_address_mode(devinfo
, inst
, dest
.address_mode
);
141 if (dest
.address_mode
== BRW_ADDRESS_DIRECT
) {
142 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
144 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
145 brw_inst_set_dst_da1_subreg_nr(devinfo
, inst
, dest
.subnr
);
146 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
147 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
148 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
150 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
151 brw_inst_set_da16_writemask(devinfo
, inst
, dest
.writemask
);
152 if (dest
.file
== BRW_GENERAL_REGISTER_FILE
||
153 dest
.file
== BRW_MESSAGE_REGISTER_FILE
) {
154 assert(dest
.writemask
!= 0);
156 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
157 * Although Dst.HorzStride is a don't care for Align16, HW needs
158 * this to be programmed as "01".
160 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
163 brw_inst_set_dst_ia_subreg_nr(devinfo
, inst
, dest
.subnr
);
165 /* These are different sizes in align1 vs align16:
167 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
168 brw_inst_set_dst_ia1_addr_imm(devinfo
, inst
,
169 dest
.indirect_offset
);
170 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
171 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
172 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
174 brw_inst_set_dst_ia16_addr_imm(devinfo
, inst
,
175 dest
.indirect_offset
);
176 /* even ignored in da16, still need to set as '01' */
177 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
182 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
183 * or 16 (SIMD16), as that's normally correct. However, when dealing with
184 * small registers, it can be useful for us to automatically reduce it to
185 * match the register size.
187 if (p
->automatic_exec_sizes
) {
189 * In platforms that support fp64 we can emit instructions with a width
190 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
191 * these cases we need to make sure that these instructions have their
192 * exec sizes set properly when they are emitted and we can't rely on
193 * this code to fix it.
196 if (devinfo
->gen
>= 6)
197 fix_exec_size
= dest
.width
< BRW_EXECUTE_4
;
199 fix_exec_size
= dest
.width
< BRW_EXECUTE_8
;
202 brw_inst_set_exec_size(devinfo
, inst
, dest
.width
);
207 brw_set_src0(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
209 const struct gen_device_info
*devinfo
= p
->devinfo
;
211 if (reg
.file
== BRW_MESSAGE_REGISTER_FILE
)
212 assert((reg
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
213 else if (reg
.file
== BRW_GENERAL_REGISTER_FILE
)
214 assert(reg
.nr
< 128);
216 gen7_convert_mrf_to_grf(p
, ®
);
218 if (devinfo
->gen
>= 6 &&
219 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
220 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
||
221 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
222 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
)) {
223 /* Any source modifiers or regions will be ignored, since this just
224 * identifies the MRF/GRF to start reading the message contents from.
225 * Check for some likely failures.
229 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
232 if (devinfo
->gen
>= 12 &&
233 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
234 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
)) {
235 assert(reg
.file
!= BRW_IMMEDIATE_VALUE
);
236 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
237 assert(reg
.subnr
== 0);
238 assert(brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
||
239 (reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
240 reg
.vstride
== reg
.width
+ 1));
241 assert(!reg
.negate
&& !reg
.abs
);
242 brw_inst_set_send_src0_reg_file(devinfo
, inst
, reg
.file
);
243 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
245 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
246 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
) {
247 assert(reg
.file
== BRW_GENERAL_REGISTER_FILE
);
248 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
249 assert(reg
.subnr
% 16 == 0);
250 assert(reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
251 reg
.vstride
== reg
.width
+ 1);
252 assert(!reg
.negate
&& !reg
.abs
);
253 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
254 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
256 brw_inst_set_src0_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
257 brw_inst_set_src0_abs(devinfo
, inst
, reg
.abs
);
258 brw_inst_set_src0_negate(devinfo
, inst
, reg
.negate
);
259 brw_inst_set_src0_address_mode(devinfo
, inst
, reg
.address_mode
);
261 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
262 if (reg
.type
== BRW_REGISTER_TYPE_DF
||
263 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_DIM
)
264 brw_inst_set_imm_df(devinfo
, inst
, reg
.df
);
265 else if (reg
.type
== BRW_REGISTER_TYPE_UQ
||
266 reg
.type
== BRW_REGISTER_TYPE_Q
)
267 brw_inst_set_imm_uq(devinfo
, inst
, reg
.u64
);
269 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
271 if (devinfo
->gen
< 12 && type_sz(reg
.type
) < 8) {
272 brw_inst_set_src1_reg_file(devinfo
, inst
,
273 BRW_ARCHITECTURE_REGISTER_FILE
);
274 brw_inst_set_src1_reg_hw_type(devinfo
, inst
,
275 brw_inst_src0_reg_hw_type(devinfo
, inst
));
278 if (reg
.address_mode
== BRW_ADDRESS_DIRECT
) {
279 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
280 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
281 brw_inst_set_src0_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
283 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
286 brw_inst_set_src0_ia_subreg_nr(devinfo
, inst
, reg
.subnr
);
288 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
289 brw_inst_set_src0_ia1_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
291 brw_inst_set_src0_ia16_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
295 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
296 if (reg
.width
== BRW_WIDTH_1
&&
297 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
298 brw_inst_set_src0_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
299 brw_inst_set_src0_width(devinfo
, inst
, BRW_WIDTH_1
);
300 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
302 brw_inst_set_src0_hstride(devinfo
, inst
, reg
.hstride
);
303 brw_inst_set_src0_width(devinfo
, inst
, reg
.width
);
304 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
307 brw_inst_set_src0_da16_swiz_x(devinfo
, inst
,
308 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
309 brw_inst_set_src0_da16_swiz_y(devinfo
, inst
,
310 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
311 brw_inst_set_src0_da16_swiz_z(devinfo
, inst
,
312 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
313 brw_inst_set_src0_da16_swiz_w(devinfo
, inst
,
314 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
316 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
317 /* This is an oddity of the fact we're using the same
318 * descriptions for registers in align_16 as align_1:
320 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
321 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
322 reg
.type
== BRW_REGISTER_TYPE_DF
&&
323 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
326 * "For Align16 access mode, only encodings of 0000 and 0011
327 * are allowed. Other codes are reserved."
329 * Presumably the DevSNB behavior applies to IVB as well.
331 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
333 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
342 brw_set_src1(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
344 const struct gen_device_info
*devinfo
= p
->devinfo
;
346 if (reg
.file
== BRW_GENERAL_REGISTER_FILE
)
347 assert(reg
.nr
< 128);
349 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
350 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
||
351 (devinfo
->gen
>= 12 &&
352 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
353 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
))) {
354 assert(reg
.file
== BRW_GENERAL_REGISTER_FILE
||
355 reg
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
356 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
357 assert(reg
.subnr
== 0);
358 assert(brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
||
359 (reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
360 reg
.vstride
== reg
.width
+ 1));
361 assert(!reg
.negate
&& !reg
.abs
);
362 brw_inst_set_send_src1_reg_nr(devinfo
, inst
, reg
.nr
);
363 brw_inst_set_send_src1_reg_file(devinfo
, inst
, reg
.file
);
365 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
367 * "Accumulator registers may be accessed explicitly as src0
370 assert(reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
371 reg
.nr
!= BRW_ARF_ACCUMULATOR
);
373 gen7_convert_mrf_to_grf(p
, ®
);
374 assert(reg
.file
!= BRW_MESSAGE_REGISTER_FILE
);
376 brw_inst_set_src1_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
377 brw_inst_set_src1_abs(devinfo
, inst
, reg
.abs
);
378 brw_inst_set_src1_negate(devinfo
, inst
, reg
.negate
);
380 /* Only src1 can be immediate in two-argument instructions.
382 assert(brw_inst_src0_reg_file(devinfo
, inst
) != BRW_IMMEDIATE_VALUE
);
384 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
385 /* two-argument instructions can only use 32-bit immediates */
386 assert(type_sz(reg
.type
) < 8);
387 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
389 /* This is a hardware restriction, which may or may not be lifted
392 assert (reg
.address_mode
== BRW_ADDRESS_DIRECT
);
393 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
395 brw_inst_set_src1_da_reg_nr(devinfo
, inst
, reg
.nr
);
396 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
397 brw_inst_set_src1_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
399 brw_inst_set_src1_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
402 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
403 if (reg
.width
== BRW_WIDTH_1
&&
404 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
405 brw_inst_set_src1_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
406 brw_inst_set_src1_width(devinfo
, inst
, BRW_WIDTH_1
);
407 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
409 brw_inst_set_src1_hstride(devinfo
, inst
, reg
.hstride
);
410 brw_inst_set_src1_width(devinfo
, inst
, reg
.width
);
411 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
414 brw_inst_set_src1_da16_swiz_x(devinfo
, inst
,
415 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
416 brw_inst_set_src1_da16_swiz_y(devinfo
, inst
,
417 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
418 brw_inst_set_src1_da16_swiz_z(devinfo
, inst
,
419 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
420 brw_inst_set_src1_da16_swiz_w(devinfo
, inst
,
421 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
423 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
424 /* This is an oddity of the fact we're using the same
425 * descriptions for registers in align_16 as align_1:
427 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
428 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
429 reg
.type
== BRW_REGISTER_TYPE_DF
&&
430 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
433 * "For Align16 access mode, only encodings of 0000 and 0011
434 * are allowed. Other codes are reserved."
436 * Presumably the DevSNB behavior applies to IVB as well.
438 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
440 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
448 * Specify the descriptor and extended descriptor immediate for a SEND(C)
449 * message instruction.
452 brw_set_desc_ex(struct brw_codegen
*p
, brw_inst
*inst
,
453 unsigned desc
, unsigned ex_desc
)
455 const struct gen_device_info
*devinfo
= p
->devinfo
;
456 assert(brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
457 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
);
458 if (devinfo
->gen
< 12)
459 brw_inst_set_src1_file_type(devinfo
, inst
,
460 BRW_IMMEDIATE_VALUE
, BRW_REGISTER_TYPE_UD
);
461 brw_inst_set_send_desc(devinfo
, inst
, desc
);
462 if (devinfo
->gen
>= 9)
463 brw_inst_set_send_ex_desc(devinfo
, inst
, ex_desc
);
466 static void brw_set_math_message( struct brw_codegen
*p
,
469 unsigned integer_type
,
473 const struct gen_device_info
*devinfo
= p
->devinfo
;
475 unsigned response_length
;
477 /* Infer message length from the function */
479 case BRW_MATH_FUNCTION_POW
:
480 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
:
481 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER
:
482 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
490 /* Infer response length from the function */
492 case BRW_MATH_FUNCTION_SINCOS
:
493 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
501 brw_set_desc(p
, inst
, brw_message_desc(
502 devinfo
, msg_length
, response_length
, false));
504 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MATH
);
505 brw_inst_set_math_msg_function(devinfo
, inst
, function
);
506 brw_inst_set_math_msg_signed_int(devinfo
, inst
, integer_type
);
507 brw_inst_set_math_msg_precision(devinfo
, inst
, low_precision
);
508 brw_inst_set_math_msg_saturate(devinfo
, inst
, brw_inst_saturate(devinfo
, inst
));
509 brw_inst_set_math_msg_data_type(devinfo
, inst
, dataType
);
510 brw_inst_set_saturate(devinfo
, inst
, 0);
514 static void brw_set_ff_sync_message(struct brw_codegen
*p
,
517 unsigned response_length
,
520 const struct gen_device_info
*devinfo
= p
->devinfo
;
522 brw_set_desc(p
, insn
, brw_message_desc(
523 devinfo
, 1, response_length
, true));
525 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
526 brw_inst_set_eot(devinfo
, insn
, end_of_thread
);
527 brw_inst_set_urb_opcode(devinfo
, insn
, 1); /* FF_SYNC */
528 brw_inst_set_urb_allocate(devinfo
, insn
, allocate
);
529 /* The following fields are not used by FF_SYNC: */
530 brw_inst_set_urb_global_offset(devinfo
, insn
, 0);
531 brw_inst_set_urb_swizzle_control(devinfo
, insn
, 0);
532 brw_inst_set_urb_used(devinfo
, insn
, 0);
533 brw_inst_set_urb_complete(devinfo
, insn
, 0);
536 static void brw_set_urb_message( struct brw_codegen
*p
,
538 enum brw_urb_write_flags flags
,
540 unsigned response_length
,
542 unsigned swizzle_control
)
544 const struct gen_device_info
*devinfo
= p
->devinfo
;
546 assert(devinfo
->gen
< 7 || swizzle_control
!= BRW_URB_SWIZZLE_TRANSPOSE
);
547 assert(devinfo
->gen
< 7 || !(flags
& BRW_URB_WRITE_ALLOCATE
));
548 assert(devinfo
->gen
>= 7 || !(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
550 brw_set_desc(p
, insn
, brw_message_desc(
551 devinfo
, msg_length
, response_length
, true));
553 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
554 brw_inst_set_eot(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_EOT
));
556 if (flags
& BRW_URB_WRITE_OWORD
) {
557 assert(msg_length
== 2); /* header + one OWORD of data */
558 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_OWORD
);
560 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_HWORD
);
563 brw_inst_set_urb_global_offset(devinfo
, insn
, offset
);
564 brw_inst_set_urb_swizzle_control(devinfo
, insn
, swizzle_control
);
566 if (devinfo
->gen
< 8) {
567 brw_inst_set_urb_complete(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_COMPLETE
));
570 if (devinfo
->gen
< 7) {
571 brw_inst_set_urb_allocate(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_ALLOCATE
));
572 brw_inst_set_urb_used(devinfo
, insn
, !(flags
& BRW_URB_WRITE_UNUSED
));
574 brw_inst_set_urb_per_slot_offset(devinfo
, insn
,
575 !!(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
580 gen7_set_dp_scratch_message(struct brw_codegen
*p
,
584 bool invalidate_after_read
,
586 unsigned addr_offset
,
591 const struct gen_device_info
*devinfo
= p
->devinfo
;
592 assert(num_regs
== 1 || num_regs
== 2 || num_regs
== 4 ||
593 (devinfo
->gen
>= 8 && num_regs
== 8));
594 const unsigned block_size
= (devinfo
->gen
>= 8 ? _mesa_logbase2(num_regs
) :
597 brw_set_desc(p
, inst
, brw_message_desc(
598 devinfo
, mlen
, rlen
, header_present
));
600 brw_inst_set_sfid(devinfo
, inst
, GEN7_SFID_DATAPORT_DATA_CACHE
);
601 brw_inst_set_dp_category(devinfo
, inst
, 1); /* Scratch Block Read/Write msgs */
602 brw_inst_set_scratch_read_write(devinfo
, inst
, write
);
603 brw_inst_set_scratch_type(devinfo
, inst
, dword
);
604 brw_inst_set_scratch_invalidate_after_read(devinfo
, inst
, invalidate_after_read
);
605 brw_inst_set_scratch_block_size(devinfo
, inst
, block_size
);
606 brw_inst_set_scratch_addr_offset(devinfo
, inst
, addr_offset
);
610 brw_inst_set_state(const struct gen_device_info
*devinfo
,
612 const struct brw_insn_state
*state
)
614 brw_inst_set_exec_size(devinfo
, insn
, state
->exec_size
);
615 brw_inst_set_group(devinfo
, insn
, state
->group
);
616 brw_inst_set_compression(devinfo
, insn
, state
->compressed
);
617 brw_inst_set_access_mode(devinfo
, insn
, state
->access_mode
);
618 brw_inst_set_mask_control(devinfo
, insn
, state
->mask_control
);
619 brw_inst_set_saturate(devinfo
, insn
, state
->saturate
);
620 brw_inst_set_pred_control(devinfo
, insn
, state
->predicate
);
621 brw_inst_set_pred_inv(devinfo
, insn
, state
->pred_inv
);
623 if (is_3src(devinfo
, brw_inst_opcode(devinfo
, insn
)) &&
624 state
->access_mode
== BRW_ALIGN_16
) {
625 brw_inst_set_3src_a16_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
626 if (devinfo
->gen
>= 7)
627 brw_inst_set_3src_a16_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
629 brw_inst_set_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
630 if (devinfo
->gen
>= 7)
631 brw_inst_set_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
634 if (devinfo
->gen
>= 6)
635 brw_inst_set_acc_wr_control(devinfo
, insn
, state
->acc_wr_control
);
638 #define next_insn brw_next_insn
640 brw_next_insn(struct brw_codegen
*p
, unsigned opcode
)
642 const struct gen_device_info
*devinfo
= p
->devinfo
;
645 if (p
->nr_insn
+ 1 > p
->store_size
) {
647 p
->store
= reralloc(p
->mem_ctx
, p
->store
, brw_inst
, p
->store_size
);
650 p
->next_insn_offset
+= 16;
651 insn
= &p
->store
[p
->nr_insn
++];
653 memset(insn
, 0, sizeof(*insn
));
654 brw_inst_set_opcode(devinfo
, insn
, opcode
);
656 /* Apply the default instruction state */
657 brw_inst_set_state(devinfo
, insn
, p
->current
);
663 brw_alu1(struct brw_codegen
*p
, unsigned opcode
,
664 struct brw_reg dest
, struct brw_reg src
)
666 brw_inst
*insn
= next_insn(p
, opcode
);
667 brw_set_dest(p
, insn
, dest
);
668 brw_set_src0(p
, insn
, src
);
673 brw_alu2(struct brw_codegen
*p
, unsigned opcode
,
674 struct brw_reg dest
, struct brw_reg src0
, struct brw_reg src1
)
676 /* 64-bit immediates are only supported on 1-src instructions */
677 assert(src0
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src0
.type
) <= 4);
678 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src1
.type
) <= 4);
680 brw_inst
*insn
= next_insn(p
, opcode
);
681 brw_set_dest(p
, insn
, dest
);
682 brw_set_src0(p
, insn
, src0
);
683 brw_set_src1(p
, insn
, src1
);
688 get_3src_subreg_nr(struct brw_reg reg
)
690 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
691 * use 32-bit units (components 0..7). Since they only support F/D/UD
692 * types, this doesn't lose any flexibility, but uses fewer bits.
694 return reg
.subnr
/ 4;
697 static enum gen10_align1_3src_vertical_stride
698 to_3src_align1_vstride(const struct gen_device_info
*devinfo
,
699 enum brw_vertical_stride vstride
)
702 case BRW_VERTICAL_STRIDE_0
:
703 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0
;
704 case BRW_VERTICAL_STRIDE_1
:
705 assert(devinfo
->gen
>= 12);
706 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1
;
707 case BRW_VERTICAL_STRIDE_2
:
708 assert(devinfo
->gen
< 12);
709 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2
;
710 case BRW_VERTICAL_STRIDE_4
:
711 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4
;
712 case BRW_VERTICAL_STRIDE_8
:
713 case BRW_VERTICAL_STRIDE_16
:
714 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8
;
716 unreachable("invalid vstride");
721 static enum gen10_align1_3src_src_horizontal_stride
722 to_3src_align1_hstride(enum brw_horizontal_stride hstride
)
725 case BRW_HORIZONTAL_STRIDE_0
:
726 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0
;
727 case BRW_HORIZONTAL_STRIDE_1
:
728 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1
;
729 case BRW_HORIZONTAL_STRIDE_2
:
730 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2
;
731 case BRW_HORIZONTAL_STRIDE_4
:
732 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4
;
734 unreachable("invalid hstride");
739 brw_alu3(struct brw_codegen
*p
, unsigned opcode
, struct brw_reg dest
,
740 struct brw_reg src0
, struct brw_reg src1
, struct brw_reg src2
)
742 const struct gen_device_info
*devinfo
= p
->devinfo
;
743 brw_inst
*inst
= next_insn(p
, opcode
);
745 gen7_convert_mrf_to_grf(p
, &dest
);
747 assert(dest
.nr
< 128);
748 assert(src0
.file
== BRW_IMMEDIATE_VALUE
|| src0
.nr
< 128);
749 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
&& src1
.nr
< 128);
750 assert(src2
.file
== BRW_IMMEDIATE_VALUE
|| src2
.nr
< 128);
751 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
752 assert(src0
.address_mode
== BRW_ADDRESS_DIRECT
);
753 assert(src1
.address_mode
== BRW_ADDRESS_DIRECT
);
754 assert(src2
.address_mode
== BRW_ADDRESS_DIRECT
);
756 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
757 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
758 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
760 if (devinfo
->gen
>= 12) {
761 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
, dest
.file
);
762 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
764 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
765 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
766 BRW_ALIGN1_3SRC_ACCUMULATOR
);
767 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
769 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
770 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
);
771 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
774 brw_inst_set_3src_a1_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 8);
776 brw_inst_set_3src_a1_dst_hstride(devinfo
, inst
, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1
);
778 if (brw_reg_type_is_floating_point(dest
.type
)) {
779 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
780 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT
);
782 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
783 BRW_ALIGN1_3SRC_EXEC_TYPE_INT
);
786 brw_inst_set_3src_a1_dst_type(devinfo
, inst
, dest
.type
);
787 brw_inst_set_3src_a1_src0_type(devinfo
, inst
, src0
.type
);
788 brw_inst_set_3src_a1_src1_type(devinfo
, inst
, src1
.type
);
789 brw_inst_set_3src_a1_src2_type(devinfo
, inst
, src2
.type
);
791 brw_inst_set_3src_a1_src0_vstride(
792 devinfo
, inst
, to_3src_align1_vstride(devinfo
, src0
.vstride
));
793 brw_inst_set_3src_a1_src1_vstride(
794 devinfo
, inst
, to_3src_align1_vstride(devinfo
, src1
.vstride
));
795 /* no vstride on src2 */
797 brw_inst_set_3src_a1_src0_hstride(devinfo
, inst
,
798 to_3src_align1_hstride(src0
.hstride
));
799 brw_inst_set_3src_a1_src1_hstride(devinfo
, inst
,
800 to_3src_align1_hstride(src1
.hstride
));
801 brw_inst_set_3src_a1_src2_hstride(devinfo
, inst
,
802 to_3src_align1_hstride(src2
.hstride
));
804 brw_inst_set_3src_a1_src0_subreg_nr(devinfo
, inst
, src0
.subnr
);
805 if (src0
.type
== BRW_REGISTER_TYPE_NF
) {
806 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
808 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
810 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
811 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
813 brw_inst_set_3src_a1_src1_subreg_nr(devinfo
, inst
, src1
.subnr
);
814 if (src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
815 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
817 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
819 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
820 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
822 brw_inst_set_3src_a1_src2_subreg_nr(devinfo
, inst
, src2
.subnr
);
823 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
824 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
825 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
827 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
||
828 src0
.file
== BRW_IMMEDIATE_VALUE
||
829 (src0
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
830 src0
.type
== BRW_REGISTER_TYPE_NF
));
831 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
832 src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
833 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
||
834 src2
.file
== BRW_IMMEDIATE_VALUE
);
836 if (devinfo
->gen
>= 12) {
837 brw_inst_set_3src_a1_src0_reg_file(devinfo
, inst
, src0
.file
);
838 brw_inst_set_3src_a1_src1_reg_file(devinfo
, inst
, src1
.file
);
839 brw_inst_set_3src_a1_src2_reg_file(devinfo
, inst
, src2
.file
);
841 brw_inst_set_3src_a1_src0_reg_file(devinfo
, inst
,
842 src0
.file
== BRW_GENERAL_REGISTER_FILE
?
843 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
844 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
845 brw_inst_set_3src_a1_src1_reg_file(devinfo
, inst
,
846 src1
.file
== BRW_GENERAL_REGISTER_FILE
?
847 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
848 BRW_ALIGN1_3SRC_ACCUMULATOR
);
849 brw_inst_set_3src_a1_src2_reg_file(devinfo
, inst
,
850 src2
.file
== BRW_GENERAL_REGISTER_FILE
?
851 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
852 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
856 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
857 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
858 assert(dest
.type
== BRW_REGISTER_TYPE_F
||
859 dest
.type
== BRW_REGISTER_TYPE_DF
||
860 dest
.type
== BRW_REGISTER_TYPE_D
||
861 dest
.type
== BRW_REGISTER_TYPE_UD
||
862 (dest
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 8));
863 if (devinfo
->gen
== 6) {
864 brw_inst_set_3src_a16_dst_reg_file(devinfo
, inst
,
865 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
867 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
868 brw_inst_set_3src_a16_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
869 brw_inst_set_3src_a16_dst_writemask(devinfo
, inst
, dest
.writemask
);
871 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
);
872 brw_inst_set_3src_a16_src0_swizzle(devinfo
, inst
, src0
.swizzle
);
873 brw_inst_set_3src_a16_src0_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src0
));
874 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
875 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
876 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
877 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo
, inst
,
878 src0
.vstride
== BRW_VERTICAL_STRIDE_0
);
880 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
);
881 brw_inst_set_3src_a16_src1_swizzle(devinfo
, inst
, src1
.swizzle
);
882 brw_inst_set_3src_a16_src1_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src1
));
883 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
884 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
885 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
886 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo
, inst
,
887 src1
.vstride
== BRW_VERTICAL_STRIDE_0
);
889 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
);
890 brw_inst_set_3src_a16_src2_swizzle(devinfo
, inst
, src2
.swizzle
);
891 brw_inst_set_3src_a16_src2_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src2
));
892 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
893 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
894 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
895 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo
, inst
,
896 src2
.vstride
== BRW_VERTICAL_STRIDE_0
);
898 if (devinfo
->gen
>= 7) {
899 /* Set both the source and destination types based on dest.type,
900 * ignoring the source register types. The MAD and LRP emitters ensure
901 * that all four types are float. The BFE and BFI2 emitters, however,
902 * may send us mixed D and UD types and want us to ignore that and use
903 * the destination type.
905 brw_inst_set_3src_a16_src_type(devinfo
, inst
, dest
.type
);
906 brw_inst_set_3src_a16_dst_type(devinfo
, inst
, dest
.type
);
908 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
910 * "Three source instructions can use operands with mixed-mode
911 * precision. When SrcType field is set to :f or :hf it defines
912 * precision for source 0 only, and fields Src1Type and Src2Type
913 * define precision for other source operands:
915 * 0b = :f. Single precision Float (32-bit).
916 * 1b = :hf. Half precision Float (16-bit)."
918 if (src1
.type
== BRW_REGISTER_TYPE_HF
)
919 brw_inst_set_3src_a16_src1_type(devinfo
, inst
, 1);
921 if (src2
.type
== BRW_REGISTER_TYPE_HF
)
922 brw_inst_set_3src_a16_src2_type(devinfo
, inst
, 1);
/***********************************************************************
 * Convenience routines.
 */

/* Emit a unary ALU instruction for opcode BRW_OPCODE_##OP. */
#define ALU1(OP)                                              \
brw_inst *brw_##OP(struct brw_codegen *p,                     \
          struct brw_reg dest,                                \
          struct brw_reg src0)                                \
{                                                             \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);           \
}
/* Emit a binary ALU instruction for opcode BRW_OPCODE_##OP. */
#define ALU2(OP)                                              \
brw_inst *brw_##OP(struct brw_codegen *p,                     \
          struct brw_reg dest,                                \
          struct brw_reg src0,                                \
          struct brw_reg src1)                                \
{                                                             \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);     \
}
/* Emit a three-source ALU instruction.  In Align16 mode a <0;...> (scalar)
 * vertical stride is expressed by forcing an XXXX swizzle on that source.
 */
#define ALU3(OP)                                              \
brw_inst *brw_##OP(struct brw_codegen *p,                     \
          struct brw_reg dest,                                \
          struct brw_reg src0,                                \
          struct brw_reg src1,                                \
          struct brw_reg src2)                                \
{                                                             \
   if (p->current->access_mode == BRW_ALIGN_16) {             \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)              \
         src0.swizzle = BRW_SWIZZLE_XXXX;                     \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)              \
         src1.swizzle = BRW_SWIZZLE_XXXX;                     \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)              \
         src2.swizzle = BRW_SWIZZLE_XXXX;                     \
   }                                                          \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
/* Like ALU3, but restricted to floating-point operands: all four operands
 * must be uniformly F or uniformly DF (checked with asserts).
 */
#define ALU3F(OP)                                             \
brw_inst *brw_##OP(struct brw_codegen *p,                     \
          struct brw_reg dest,                                \
          struct brw_reg src0,                                \
          struct brw_reg src1,                                \
          struct brw_reg src2)                                \
{                                                             \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                 \
          dest.type == BRW_REGISTER_TYPE_DF);                 \
   if (dest.type == BRW_REGISTER_TYPE_F) {                    \
      assert(src0.type == BRW_REGISTER_TYPE_F);               \
      assert(src1.type == BRW_REGISTER_TYPE_F);               \
      assert(src2.type == BRW_REGISTER_TYPE_F);               \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {            \
      assert(src0.type == BRW_REGISTER_TYPE_DF);              \
      assert(src1.type == BRW_REGISTER_TYPE_DF);              \
      assert(src2.type == BRW_REGISTER_TYPE_DF);              \
   }                                                          \
                                                              \
   if (p->current->access_mode == BRW_ALIGN_16) {             \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)              \
         src0.swizzle = BRW_SWIZZLE_XXXX;                     \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)              \
         src1.swizzle = BRW_SWIZZLE_XXXX;                     \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)              \
         src2.swizzle = BRW_SWIZZLE_XXXX;                     \
   }                                                          \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                             \
void brw_##OP(struct brw_codegen *p,                          \
              struct brw_reg dest,                            \
              struct brw_reg src)                             \
{                                                             \
   const struct gen_device_info *devinfo = p->devinfo;        \
   brw_inst *rnd, *add;                                       \
   rnd = next_insn(p, BRW_OPCODE_##OP);                       \
   brw_set_dest(p, rnd, dest);                                \
   brw_set_src0(p, rnd, src);                                 \
                                                              \
   if (devinfo->gen < 6) {                                    \
      /* turn on round-increments */                          \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));          \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
   }                                                          \
}
1062 brw_MOV(struct brw_codegen
*p
, struct brw_reg dest
, struct brw_reg src0
)
1064 const struct gen_device_info
*devinfo
= p
->devinfo
;
1066 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1067 * To avoid the problems that causes, we use an <X,2,0> source region to
1068 * read each element twice.
1070 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
1071 brw_get_default_access_mode(p
) == BRW_ALIGN_1
&&
1072 dest
.type
== BRW_REGISTER_TYPE_DF
&&
1073 (src0
.type
== BRW_REGISTER_TYPE_F
||
1074 src0
.type
== BRW_REGISTER_TYPE_D
||
1075 src0
.type
== BRW_REGISTER_TYPE_UD
) &&
1076 !has_scalar_region(src0
)) {
1077 assert(src0
.vstride
== src0
.width
+ src0
.hstride
);
1078 src0
.vstride
= src0
.hstride
;
1079 src0
.width
= BRW_WIDTH_2
;
1080 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1083 return brw_alu1(p
, BRW_OPCODE_MOV
, dest
, src0
);
1087 brw_ADD(struct brw_codegen
*p
, struct brw_reg dest
,
1088 struct brw_reg src0
, struct brw_reg src1
)
1091 if (src0
.type
== BRW_REGISTER_TYPE_F
||
1092 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
1093 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
1094 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
1095 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
1098 if (src1
.type
== BRW_REGISTER_TYPE_F
||
1099 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
1100 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
1101 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
1102 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
1105 return brw_alu2(p
, BRW_OPCODE_ADD
, dest
, src0
, src1
);
1109 brw_AVG(struct brw_codegen
*p
, struct brw_reg dest
,
1110 struct brw_reg src0
, struct brw_reg src1
)
1112 assert(dest
.type
== src0
.type
);
1113 assert(src0
.type
== src1
.type
);
1114 switch (src0
.type
) {
1115 case BRW_REGISTER_TYPE_B
:
1116 case BRW_REGISTER_TYPE_UB
:
1117 case BRW_REGISTER_TYPE_W
:
1118 case BRW_REGISTER_TYPE_UW
:
1119 case BRW_REGISTER_TYPE_D
:
1120 case BRW_REGISTER_TYPE_UD
:
1123 unreachable("Bad type for brw_AVG");
1126 return brw_alu2(p
, BRW_OPCODE_AVG
, dest
, src0
, src1
);
1130 brw_MUL(struct brw_codegen
*p
, struct brw_reg dest
,
1131 struct brw_reg src0
, struct brw_reg src1
)
1134 if (src0
.type
== BRW_REGISTER_TYPE_D
||
1135 src0
.type
== BRW_REGISTER_TYPE_UD
||
1136 src1
.type
== BRW_REGISTER_TYPE_D
||
1137 src1
.type
== BRW_REGISTER_TYPE_UD
) {
1138 assert(dest
.type
!= BRW_REGISTER_TYPE_F
);
1141 if (src0
.type
== BRW_REGISTER_TYPE_F
||
1142 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
1143 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
1144 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
1145 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
1148 if (src1
.type
== BRW_REGISTER_TYPE_F
||
1149 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
1150 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
1151 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
1152 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
1155 assert(src0
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1156 src0
.nr
!= BRW_ARF_ACCUMULATOR
);
1157 assert(src1
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1158 src1
.nr
!= BRW_ARF_ACCUMULATOR
);
1160 return brw_alu2(p
, BRW_OPCODE_MUL
, dest
, src0
, src1
);
1164 brw_LINE(struct brw_codegen
*p
, struct brw_reg dest
,
1165 struct brw_reg src0
, struct brw_reg src1
)
1167 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1168 src0
.width
= BRW_WIDTH_1
;
1169 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1170 return brw_alu2(p
, BRW_OPCODE_LINE
, dest
, src0
, src1
);
1174 brw_PLN(struct brw_codegen
*p
, struct brw_reg dest
,
1175 struct brw_reg src0
, struct brw_reg src1
)
1177 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1178 src0
.width
= BRW_WIDTH_1
;
1179 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1180 src1
.vstride
= BRW_VERTICAL_STRIDE_8
;
1181 src1
.width
= BRW_WIDTH_8
;
1182 src1
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
1183 return brw_alu2(p
, BRW_OPCODE_PLN
, dest
, src0
, src1
);
1187 brw_F32TO16(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1189 const struct gen_device_info
*devinfo
= p
->devinfo
;
1190 const bool align16
= brw_get_default_access_mode(p
) == BRW_ALIGN_16
;
1191 /* The F32TO16 instruction doesn't support 32-bit destination types in
1192 * Align1 mode, and neither does the Gen8 implementation in terms of a
1193 * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
1194 * an undocumented feature.
1196 const bool needs_zero_fill
= (dst
.type
== BRW_REGISTER_TYPE_UD
&&
1197 (!align16
|| devinfo
->gen
>= 8));
1201 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
1203 assert(dst
.type
== BRW_REGISTER_TYPE_UD
||
1204 dst
.type
== BRW_REGISTER_TYPE_W
||
1205 dst
.type
== BRW_REGISTER_TYPE_UW
||
1206 dst
.type
== BRW_REGISTER_TYPE_HF
);
1209 brw_push_insn_state(p
);
1211 if (needs_zero_fill
) {
1212 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
1213 dst
= spread(retype(dst
, BRW_REGISTER_TYPE_W
), 2);
1216 if (devinfo
->gen
>= 8) {
1217 inst
= brw_MOV(p
, retype(dst
, BRW_REGISTER_TYPE_HF
), src
);
1219 assert(devinfo
->gen
== 7);
1220 inst
= brw_alu1(p
, BRW_OPCODE_F32TO16
, dst
, src
);
1223 if (needs_zero_fill
) {
1224 if (devinfo
->gen
< 12)
1225 brw_inst_set_no_dd_clear(devinfo
, inst
, true);
1226 inst
= brw_MOV(p
, suboffset(dst
, 1), brw_imm_w(0));
1227 if (devinfo
->gen
< 12)
1228 brw_inst_set_no_dd_check(devinfo
, inst
, true);
1231 brw_pop_insn_state(p
);
1236 brw_F16TO32(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1238 const struct gen_device_info
*devinfo
= p
->devinfo
;
1239 bool align16
= brw_get_default_access_mode(p
) == BRW_ALIGN_16
;
1242 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
1244 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1246 * Because this instruction does not have a 16-bit floating-point
1247 * type, the source data type must be Word (W). The destination type
1248 * must be F (Float).
1250 if (src
.type
== BRW_REGISTER_TYPE_UD
)
1251 src
= spread(retype(src
, BRW_REGISTER_TYPE_W
), 2);
1253 assert(src
.type
== BRW_REGISTER_TYPE_W
||
1254 src
.type
== BRW_REGISTER_TYPE_UW
||
1255 src
.type
== BRW_REGISTER_TYPE_HF
);
1258 if (devinfo
->gen
>= 8) {
1259 return brw_MOV(p
, dst
, retype(src
, BRW_REGISTER_TYPE_HF
));
1261 assert(devinfo
->gen
== 7);
1262 return brw_alu1(p
, BRW_OPCODE_F16TO32
, dst
, src
);
1267 void brw_NOP(struct brw_codegen
*p
)
1269 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_NOP
);
1270 memset(insn
, 0, sizeof(*insn
));
1271 brw_inst_set_opcode(p
->devinfo
, insn
, BRW_OPCODE_NOP
);
1278 /***********************************************************************
1279 * Comparisons, if/else/endif
1283 brw_JMPI(struct brw_codegen
*p
, struct brw_reg index
,
1284 unsigned predicate_control
)
1286 const struct gen_device_info
*devinfo
= p
->devinfo
;
1287 struct brw_reg ip
= brw_ip_reg();
1288 brw_inst
*inst
= brw_alu2(p
, BRW_OPCODE_JMPI
, ip
, ip
, index
);
1290 brw_inst_set_exec_size(devinfo
, inst
, BRW_EXECUTE_1
);
1291 brw_inst_set_qtr_control(devinfo
, inst
, BRW_COMPRESSION_NONE
);
1292 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
1293 brw_inst_set_pred_control(devinfo
, inst
, predicate_control
);
1299 push_if_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1301 p
->if_stack
[p
->if_stack_depth
] = inst
- p
->store
;
1303 p
->if_stack_depth
++;
1304 if (p
->if_stack_array_size
<= p
->if_stack_depth
) {
1305 p
->if_stack_array_size
*= 2;
1306 p
->if_stack
= reralloc(p
->mem_ctx
, p
->if_stack
, int,
1307 p
->if_stack_array_size
);
1312 pop_if_stack(struct brw_codegen
*p
)
1314 p
->if_stack_depth
--;
1315 return &p
->store
[p
->if_stack
[p
->if_stack_depth
]];
1319 push_loop_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1321 if (p
->loop_stack_array_size
<= (p
->loop_stack_depth
+ 1)) {
1322 p
->loop_stack_array_size
*= 2;
1323 p
->loop_stack
= reralloc(p
->mem_ctx
, p
->loop_stack
, int,
1324 p
->loop_stack_array_size
);
1325 p
->if_depth_in_loop
= reralloc(p
->mem_ctx
, p
->if_depth_in_loop
, int,
1326 p
->loop_stack_array_size
);
1329 p
->loop_stack
[p
->loop_stack_depth
] = inst
- p
->store
;
1330 p
->loop_stack_depth
++;
1331 p
->if_depth_in_loop
[p
->loop_stack_depth
] = 0;
1335 get_inner_do_insn(struct brw_codegen
*p
)
1337 return &p
->store
[p
->loop_stack
[p
->loop_stack_depth
- 1]];
1340 /* EU takes the value from the flag register and pushes it onto some
1341 * sort of a stack (presumably merging with any flag value already on
1342 * the stack). Within an if block, the flags at the top of the stack
1343 * control execution on each channel of the unit, eg. on each of the
1344 * 16 pixel values in our wm programs.
1346 * When the matching 'else' instruction is reached (presumably by
1347 * countdown of the instruction count patched in by our ELSE/ENDIF
1348 * functions), the relevant flags are inverted.
1350 * When the matching 'endif' instruction is reached, the flags are
1351 * popped off. If the stack is now empty, normal execution resumes.
1354 brw_IF(struct brw_codegen
*p
, unsigned execute_size
)
1356 const struct gen_device_info
*devinfo
= p
->devinfo
;
1359 insn
= next_insn(p
, BRW_OPCODE_IF
);
1361 /* Override the defaults for this instruction:
1363 if (devinfo
->gen
< 6) {
1364 brw_set_dest(p
, insn
, brw_ip_reg());
1365 brw_set_src0(p
, insn
, brw_ip_reg());
1366 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1367 } else if (devinfo
->gen
== 6) {
1368 brw_set_dest(p
, insn
, brw_imm_w(0));
1369 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1370 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1371 brw_set_src1(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1372 } else if (devinfo
->gen
== 7) {
1373 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1374 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1375 brw_set_src1(p
, insn
, brw_imm_w(0));
1376 brw_inst_set_jip(devinfo
, insn
, 0);
1377 brw_inst_set_uip(devinfo
, insn
, 0);
1379 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1380 if (devinfo
->gen
< 12)
1381 brw_set_src0(p
, insn
, brw_imm_d(0));
1382 brw_inst_set_jip(devinfo
, insn
, 0);
1383 brw_inst_set_uip(devinfo
, insn
, 0);
1386 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1387 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1388 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NORMAL
);
1389 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1390 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1391 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1393 push_if_stack(p
, insn
);
1394 p
->if_depth_in_loop
[p
->loop_stack_depth
]++;
1398 /* This function is only used for gen6-style IF instructions with an
1399 * embedded comparison (conditional modifier). It is not used on gen7.
1402 gen6_IF(struct brw_codegen
*p
, enum brw_conditional_mod conditional
,
1403 struct brw_reg src0
, struct brw_reg src1
)
1405 const struct gen_device_info
*devinfo
= p
->devinfo
;
1408 insn
= next_insn(p
, BRW_OPCODE_IF
);
1410 brw_set_dest(p
, insn
, brw_imm_w(0));
1411 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1412 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1413 brw_set_src0(p
, insn
, src0
);
1414 brw_set_src1(p
, insn
, src1
);
1416 assert(brw_inst_qtr_control(devinfo
, insn
) == BRW_COMPRESSION_NONE
);
1417 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
1418 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
1420 push_if_stack(p
, insn
);
1425 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1428 convert_IF_ELSE_to_ADD(struct brw_codegen
*p
,
1429 brw_inst
*if_inst
, brw_inst
*else_inst
)
1431 const struct gen_device_info
*devinfo
= p
->devinfo
;
1433 /* The next instruction (where the ENDIF would be, if it existed) */
1434 brw_inst
*next_inst
= &p
->store
[p
->nr_insn
];
1436 assert(p
->single_program_flow
);
1437 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1438 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1439 assert(brw_inst_exec_size(devinfo
, if_inst
) == BRW_EXECUTE_1
);
1441 /* Convert IF to an ADD instruction that moves the instruction pointer
1442 * to the first instruction of the ELSE block. If there is no ELSE
1443 * block, point to where ENDIF would be. Reverse the predicate.
1445 * There's no need to execute an ENDIF since we don't need to do any
1446 * stack operations, and if we're currently executing, we just want to
1447 * continue normally.
1449 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_ADD
);
1450 brw_inst_set_pred_inv(devinfo
, if_inst
, true);
1452 if (else_inst
!= NULL
) {
1453 /* Convert ELSE to an ADD instruction that points where the ENDIF
1456 brw_inst_set_opcode(devinfo
, else_inst
, BRW_OPCODE_ADD
);
1458 brw_inst_set_imm_ud(devinfo
, if_inst
, (else_inst
- if_inst
+ 1) * 16);
1459 brw_inst_set_imm_ud(devinfo
, else_inst
, (next_inst
- else_inst
) * 16);
1461 brw_inst_set_imm_ud(devinfo
, if_inst
, (next_inst
- if_inst
) * 16);
1466 * Patch IF and ELSE instructions with appropriate jump targets.
1469 patch_IF_ELSE(struct brw_codegen
*p
,
1470 brw_inst
*if_inst
, brw_inst
*else_inst
, brw_inst
*endif_inst
)
1472 const struct gen_device_info
*devinfo
= p
->devinfo
;
1474 /* We shouldn't be patching IF and ELSE instructions in single program flow
1475 * mode when gen < 6, because in single program flow mode on those
1476 * platforms, we convert flow control instructions to conditional ADDs that
1477 * operate on IP (see brw_ENDIF).
1479 * However, on Gen6, writing to IP doesn't work in single program flow mode
1480 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1481 * not be updated by non-flow control instructions."). And on later
1482 * platforms, there is no significant benefit to converting control flow
1483 * instructions to conditional ADDs. So we do patch IF and ELSE
1484 * instructions in single program flow mode on those platforms.
1486 if (devinfo
->gen
< 6)
1487 assert(!p
->single_program_flow
);
1489 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1490 assert(endif_inst
!= NULL
);
1491 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1493 unsigned br
= brw_jump_scale(devinfo
);
1495 assert(brw_inst_opcode(devinfo
, endif_inst
) == BRW_OPCODE_ENDIF
);
1496 brw_inst_set_exec_size(devinfo
, endif_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1498 if (else_inst
== NULL
) {
1499 /* Patch IF -> ENDIF */
1500 if (devinfo
->gen
< 6) {
1501 /* Turn it into an IFF, which means no mask stack operations for
1502 * all-false and jumping past the ENDIF.
1504 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_IFF
);
1505 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1506 br
* (endif_inst
- if_inst
+ 1));
1507 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1508 } else if (devinfo
->gen
== 6) {
1509 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1510 brw_inst_set_gen6_jump_count(devinfo
, if_inst
, br
*(endif_inst
- if_inst
));
1512 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1513 brw_inst_set_jip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1516 brw_inst_set_exec_size(devinfo
, else_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1518 /* Patch IF -> ELSE */
1519 if (devinfo
->gen
< 6) {
1520 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1521 br
* (else_inst
- if_inst
));
1522 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1523 } else if (devinfo
->gen
== 6) {
1524 brw_inst_set_gen6_jump_count(devinfo
, if_inst
,
1525 br
* (else_inst
- if_inst
+ 1));
1528 /* Patch ELSE -> ENDIF */
1529 if (devinfo
->gen
< 6) {
1530 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1533 brw_inst_set_gen4_jump_count(devinfo
, else_inst
,
1534 br
* (endif_inst
- else_inst
+ 1));
1535 brw_inst_set_gen4_pop_count(devinfo
, else_inst
, 1);
1536 } else if (devinfo
->gen
== 6) {
1537 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1538 brw_inst_set_gen6_jump_count(devinfo
, else_inst
,
1539 br
* (endif_inst
- else_inst
));
1541 /* The IF instruction's JIP should point just past the ELSE */
1542 brw_inst_set_jip(devinfo
, if_inst
, br
* (else_inst
- if_inst
+ 1));
1543 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1544 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1545 brw_inst_set_jip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1546 if (devinfo
->gen
>= 8) {
1547 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1548 * should point to ENDIF.
1550 brw_inst_set_uip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1557 brw_ELSE(struct brw_codegen
*p
)
1559 const struct gen_device_info
*devinfo
= p
->devinfo
;
1562 insn
= next_insn(p
, BRW_OPCODE_ELSE
);
1564 if (devinfo
->gen
< 6) {
1565 brw_set_dest(p
, insn
, brw_ip_reg());
1566 brw_set_src0(p
, insn
, brw_ip_reg());
1567 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1568 } else if (devinfo
->gen
== 6) {
1569 brw_set_dest(p
, insn
, brw_imm_w(0));
1570 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1571 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1572 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1573 } else if (devinfo
->gen
== 7) {
1574 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1575 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1576 brw_set_src1(p
, insn
, brw_imm_w(0));
1577 brw_inst_set_jip(devinfo
, insn
, 0);
1578 brw_inst_set_uip(devinfo
, insn
, 0);
1580 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1581 if (devinfo
->gen
< 12)
1582 brw_set_src0(p
, insn
, brw_imm_d(0));
1583 brw_inst_set_jip(devinfo
, insn
, 0);
1584 brw_inst_set_uip(devinfo
, insn
, 0);
1587 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1588 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1589 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1590 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1592 push_if_stack(p
, insn
);
1596 brw_ENDIF(struct brw_codegen
*p
)
1598 const struct gen_device_info
*devinfo
= p
->devinfo
;
1599 brw_inst
*insn
= NULL
;
1600 brw_inst
*else_inst
= NULL
;
1601 brw_inst
*if_inst
= NULL
;
1603 bool emit_endif
= true;
1605 /* In single program flow mode, we can express IF and ELSE instructions
1606 * equivalently as ADD instructions that operate on IP. On platforms prior
1607 * to Gen6, flow control instructions cause an implied thread switch, so
1608 * this is a significant savings.
1610 * However, on Gen6, writing to IP doesn't work in single program flow mode
1611 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1612 * not be updated by non-flow control instructions."). And on later
1613 * platforms, there is no significant benefit to converting control flow
1614 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1617 if (devinfo
->gen
< 6 && p
->single_program_flow
)
1621 * A single next_insn() may change the base address of instruction store
1622 * memory(p->store), so call it first before referencing the instruction
1623 * store pointer from an index
1626 insn
= next_insn(p
, BRW_OPCODE_ENDIF
);
1628 /* Pop the IF and (optional) ELSE instructions from the stack */
1629 p
->if_depth_in_loop
[p
->loop_stack_depth
]--;
1630 tmp
= pop_if_stack(p
);
1631 if (brw_inst_opcode(devinfo
, tmp
) == BRW_OPCODE_ELSE
) {
1633 tmp
= pop_if_stack(p
);
1638 /* ENDIF is useless; don't bother emitting it. */
1639 convert_IF_ELSE_to_ADD(p
, if_inst
, else_inst
);
1643 if (devinfo
->gen
< 6) {
1644 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1645 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1646 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1647 } else if (devinfo
->gen
== 6) {
1648 brw_set_dest(p
, insn
, brw_imm_w(0));
1649 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1650 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1651 } else if (devinfo
->gen
== 7) {
1652 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1653 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1654 brw_set_src1(p
, insn
, brw_imm_w(0));
1656 brw_set_src0(p
, insn
, brw_imm_d(0));
1659 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1660 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1661 if (devinfo
->gen
< 6)
1662 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1664 /* Also pop item off the stack in the endif instruction: */
1665 if (devinfo
->gen
< 6) {
1666 brw_inst_set_gen4_jump_count(devinfo
, insn
, 0);
1667 brw_inst_set_gen4_pop_count(devinfo
, insn
, 1);
1668 } else if (devinfo
->gen
== 6) {
1669 brw_inst_set_gen6_jump_count(devinfo
, insn
, 2);
1671 brw_inst_set_jip(devinfo
, insn
, 2);
1673 patch_IF_ELSE(p
, if_inst
, else_inst
, insn
);
1677 brw_BREAK(struct brw_codegen
*p
)
1679 const struct gen_device_info
*devinfo
= p
->devinfo
;
1682 insn
= next_insn(p
, BRW_OPCODE_BREAK
);
1683 if (devinfo
->gen
>= 8) {
1684 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1685 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1686 } else if (devinfo
->gen
>= 6) {
1687 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1688 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1689 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1691 brw_set_dest(p
, insn
, brw_ip_reg());
1692 brw_set_src0(p
, insn
, brw_ip_reg());
1693 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1694 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1695 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1697 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1698 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1704 brw_CONT(struct brw_codegen
*p
)
1706 const struct gen_device_info
*devinfo
= p
->devinfo
;
1709 insn
= next_insn(p
, BRW_OPCODE_CONTINUE
);
1710 brw_set_dest(p
, insn
, brw_ip_reg());
1711 if (devinfo
->gen
>= 8) {
1712 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1714 brw_set_src0(p
, insn
, brw_ip_reg());
1715 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1718 if (devinfo
->gen
< 6) {
1719 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1720 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1722 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1723 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1728 gen6_HALT(struct brw_codegen
*p
)
1730 const struct gen_device_info
*devinfo
= p
->devinfo
;
1733 insn
= next_insn(p
, BRW_OPCODE_HALT
);
1734 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1735 if (devinfo
->gen
< 8) {
1736 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1737 brw_set_src1(p
, insn
, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1738 } else if (devinfo
->gen
< 12) {
1739 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1742 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1743 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1749 * The DO/WHILE is just an unterminated loop -- break or continue are
1750 * used for control within the loop. We have a few ways they can be
1753 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1754 * jip and no DO instruction.
1756 * For non-uniform control flow pre-gen6, there's a DO instruction to
1757 * push the mask, and a WHILE to jump back, and BREAK to get out and
1760 * For gen6, there's no more mask stack, so no need for DO. WHILE
1761 * just points back to the first instruction of the loop.
1764 brw_DO(struct brw_codegen
*p
, unsigned execute_size
)
1766 const struct gen_device_info
*devinfo
= p
->devinfo
;
1768 if (devinfo
->gen
>= 6 || p
->single_program_flow
) {
1769 push_loop_stack(p
, &p
->store
[p
->nr_insn
]);
1770 return &p
->store
[p
->nr_insn
];
1772 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_DO
);
1774 push_loop_stack(p
, insn
);
1776 /* Override the defaults for this instruction:
1778 brw_set_dest(p
, insn
, brw_null_reg());
1779 brw_set_src0(p
, insn
, brw_null_reg());
1780 brw_set_src1(p
, insn
, brw_null_reg());
1782 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1783 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1784 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
);
1791 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1794 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1795 * nesting, since it can always just point to the end of the block/current loop.
1798 brw_patch_break_cont(struct brw_codegen
*p
, brw_inst
*while_inst
)
1800 const struct gen_device_info
*devinfo
= p
->devinfo
;
1801 brw_inst
*do_inst
= get_inner_do_insn(p
);
1803 unsigned br
= brw_jump_scale(devinfo
);
1805 assert(devinfo
->gen
< 6);
1807 for (inst
= while_inst
- 1; inst
!= do_inst
; inst
--) {
1808 /* If the jump count is != 0, that means that this instruction has already
1809 * been patched because it's part of a loop inside of the one we're
1812 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_BREAK
&&
1813 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1814 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
*((while_inst
- inst
) + 1));
1815 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_CONTINUE
&&
1816 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1817 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
* (while_inst
- inst
));
1823 brw_WHILE(struct brw_codegen
*p
)
1825 const struct gen_device_info
*devinfo
= p
->devinfo
;
1826 brw_inst
*insn
, *do_insn
;
1827 unsigned br
= brw_jump_scale(devinfo
);
1829 if (devinfo
->gen
>= 6) {
1830 insn
= next_insn(p
, BRW_OPCODE_WHILE
);
1831 do_insn
= get_inner_do_insn(p
);
1833 if (devinfo
->gen
>= 8) {
1834 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1835 if (devinfo
->gen
< 12)
1836 brw_set_src0(p
, insn
, brw_imm_d(0));
1837 brw_inst_set_jip(devinfo
, insn
, br
* (do_insn
- insn
));
1838 } else if (devinfo
->gen
== 7) {
1839 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1840 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1841 brw_set_src1(p
, insn
, brw_imm_w(0));
1842 brw_inst_set_jip(devinfo
, insn
, br
* (do_insn
- insn
));
1844 brw_set_dest(p
, insn
, brw_imm_w(0));
1845 brw_inst_set_gen6_jump_count(devinfo
, insn
, br
* (do_insn
- insn
));
1846 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1847 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1850 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1853 if (p
->single_program_flow
) {
1854 insn
= next_insn(p
, BRW_OPCODE_ADD
);
1855 do_insn
= get_inner_do_insn(p
);
1857 brw_set_dest(p
, insn
, brw_ip_reg());
1858 brw_set_src0(p
, insn
, brw_ip_reg());
1859 brw_set_src1(p
, insn
, brw_imm_d((do_insn
- insn
) * 16));
1860 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
1862 insn
= next_insn(p
, BRW_OPCODE_WHILE
);
1863 do_insn
= get_inner_do_insn(p
);
1865 assert(brw_inst_opcode(devinfo
, do_insn
) == BRW_OPCODE_DO
);
1867 brw_set_dest(p
, insn
, brw_ip_reg());
1868 brw_set_src0(p
, insn
, brw_ip_reg());
1869 brw_set_src1(p
, insn
, brw_imm_d(0));
1871 brw_inst_set_exec_size(devinfo
, insn
, brw_inst_exec_size(devinfo
, do_insn
));
1872 brw_inst_set_gen4_jump_count(devinfo
, insn
, br
* (do_insn
- insn
+ 1));
1873 brw_inst_set_gen4_pop_count(devinfo
, insn
, 0);
1875 brw_patch_break_cont(p
, insn
);
1878 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1880 p
->loop_stack_depth
--;
1887 void brw_land_fwd_jump(struct brw_codegen
*p
, int jmp_insn_idx
)
1889 const struct gen_device_info
*devinfo
= p
->devinfo
;
1890 brw_inst
*jmp_insn
= &p
->store
[jmp_insn_idx
];
1893 if (devinfo
->gen
>= 5)
1896 assert(brw_inst_opcode(devinfo
, jmp_insn
) == BRW_OPCODE_JMPI
);
1897 assert(brw_inst_src1_reg_file(devinfo
, jmp_insn
) == BRW_IMMEDIATE_VALUE
);
1899 brw_inst_set_gen4_jump_count(devinfo
, jmp_insn
,
1900 jmpi
* (p
->nr_insn
- jmp_insn_idx
- 1));
1903 /* To integrate with the above, it makes sense that the comparison
1904 * instruction should populate the flag register. It might be simpler
1905 * just to use the flag reg for most WM tasks?
1907 void brw_CMP(struct brw_codegen
*p
,
1908 struct brw_reg dest
,
1909 unsigned conditional
,
1910 struct brw_reg src0
,
1911 struct brw_reg src1
)
1913 const struct gen_device_info
*devinfo
= p
->devinfo
;
1914 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_CMP
);
1916 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
1917 brw_set_dest(p
, insn
, dest
);
1918 brw_set_src0(p
, insn
, src0
);
1919 brw_set_src1(p
, insn
, src1
);
1921 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1923 * "Any CMP instruction with a null destination must use a {switch}."
1925 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1926 * mentioned on their work-arounds pages.
1928 if (devinfo
->gen
== 7) {
1929 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
1930 dest
.nr
== BRW_ARF_NULL
) {
1931 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1936 /***********************************************************************
1937 * Helpers for the various SEND message types:
1940 /** Extended math function, float[8].
1942 void gen4_math(struct brw_codegen
*p
,
1943 struct brw_reg dest
,
1945 unsigned msg_reg_nr
,
1947 unsigned precision
)
1949 const struct gen_device_info
*devinfo
= p
->devinfo
;
1950 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
1952 if (has_scalar_region(src
)) {
1953 data_type
= BRW_MATH_DATA_SCALAR
;
1955 data_type
= BRW_MATH_DATA_VECTOR
;
1958 assert(devinfo
->gen
< 6);
1960 /* Example code doesn't set predicate_control for send
1963 brw_inst_set_pred_control(devinfo
, insn
, 0);
1964 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
1966 brw_set_dest(p
, insn
, dest
);
1967 brw_set_src0(p
, insn
, src
);
1968 brw_set_math_message(p
,
1971 src
.type
== BRW_REGISTER_TYPE_D
,
1976 void gen6_math(struct brw_codegen
*p
,
1977 struct brw_reg dest
,
1979 struct brw_reg src0
,
1980 struct brw_reg src1
)
1982 const struct gen_device_info
*devinfo
= p
->devinfo
;
1983 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_MATH
);
1985 assert(devinfo
->gen
>= 6);
1987 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
1988 (devinfo
->gen
>= 7 && dest
.file
== BRW_MESSAGE_REGISTER_FILE
));
1990 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1991 if (devinfo
->gen
== 6) {
1992 assert(src0
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1993 assert(src1
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1996 if (function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
||
1997 function
== BRW_MATH_FUNCTION_INT_DIV_REMAINDER
||
1998 function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
) {
1999 assert(src0
.type
!= BRW_REGISTER_TYPE_F
);
2000 assert(src1
.type
!= BRW_REGISTER_TYPE_F
);
2001 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
2002 (devinfo
->gen
>= 8 && src1
.file
== BRW_IMMEDIATE_VALUE
));
2004 assert(src0
.type
== BRW_REGISTER_TYPE_F
||
2005 (src0
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 9));
2006 assert(src1
.type
== BRW_REGISTER_TYPE_F
||
2007 (src1
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 9));
2010 /* Source modifiers are ignored for extended math instructions on Gen6. */
2011 if (devinfo
->gen
== 6) {
2012 assert(!src0
.negate
);
2014 assert(!src1
.negate
);
2018 brw_inst_set_math_function(devinfo
, insn
, function
);
2020 brw_set_dest(p
, insn
, dest
);
2021 brw_set_src0(p
, insn
, src0
);
2022 brw_set_src1(p
, insn
, src1
);
2026 * Return the right surface index to access the thread scratch space using
2027 * stateless dataport messages.
2030 brw_scratch_surface_idx(const struct brw_codegen
*p
)
2032 /* The scratch space is thread-local so IA coherency is unnecessary. */
2033 if (p
->devinfo
->gen
>= 8)
2034 return GEN8_BTI_STATELESS_NON_COHERENT
;
2036 return BRW_BTI_STATELESS
;
2040 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2041 * using a constant offset per channel.
2043 * The offset must be aligned to oword size (16 bytes). Used for
2044 * register spilling.
2046 void brw_oword_block_write_scratch(struct brw_codegen
*p
,
2051 const struct gen_device_info
*devinfo
= p
->devinfo
;
2052 const unsigned target_cache
=
2053 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2054 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2055 BRW_SFID_DATAPORT_WRITE
);
2058 if (devinfo
->gen
>= 6)
2061 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2063 const unsigned mlen
= 1 + num_regs
;
2065 /* Set up the message header. This is g0, with g0.2 filled with
2066 * the offset. We don't want to leave our offset around in g0 or
2067 * it'll screw up texture samples, so set it up inside the message
2071 brw_push_insn_state(p
);
2072 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2073 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2074 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2076 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2078 /* set message header global offset field (reg 0, element 2) */
2079 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2081 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2083 2), BRW_REGISTER_TYPE_UD
),
2084 brw_imm_ud(offset
));
2086 brw_pop_insn_state(p
);
2090 struct brw_reg dest
;
2091 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2092 int send_commit_msg
;
2093 struct brw_reg src_header
= retype(brw_vec8_grf(0, 0),
2094 BRW_REGISTER_TYPE_UW
);
2096 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2097 brw_inst_set_compression(devinfo
, insn
, false);
2099 if (brw_inst_exec_size(devinfo
, insn
) >= 16)
2100 src_header
= vec16(src_header
);
2102 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
2103 if (devinfo
->gen
< 6)
2104 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2106 /* Until gen6, writes followed by reads from the same location
2107 * are not guaranteed to be ordered unless write_commit is set.
2108 * If set, then a no-op write is issued to the destination
2109 * register to set a dependency, and a read from the destination
2110 * can be used to ensure the ordering.
2112 * For gen6, only writes between different threads need ordering
2113 * protection. Our use of DP writes is all about register
2114 * spilling within a thread.
2116 if (devinfo
->gen
>= 6) {
2117 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2118 send_commit_msg
= 0;
2121 send_commit_msg
= 1;
2124 brw_set_dest(p
, insn
, dest
);
2125 if (devinfo
->gen
>= 6) {
2126 brw_set_src0(p
, insn
, mrf
);
2128 brw_set_src0(p
, insn
, brw_null_reg());
2131 if (devinfo
->gen
>= 6)
2132 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2134 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2136 brw_set_desc(p
, insn
,
2137 brw_message_desc(devinfo
, mlen
, send_commit_msg
, true) |
2138 brw_dp_write_desc(devinfo
, brw_scratch_surface_idx(p
),
2139 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2140 msg_type
, 0, /* not a render target */
2147 * Read a block of owords (half a GRF each) from the scratch buffer
2148 * using a constant index per channel.
2150 * Offset must be aligned to oword size (16 bytes). Used for register
2154 brw_oword_block_read_scratch(struct brw_codegen
*p
,
2155 struct brw_reg dest
,
2160 const struct gen_device_info
*devinfo
= p
->devinfo
;
2162 if (devinfo
->gen
>= 6)
2165 if (p
->devinfo
->gen
>= 7) {
2166 /* On gen 7 and above, we no longer have message registers and we can
2167 * send from any register we want. By using the destination register
2168 * for the message, we guarantee that the implied message write won't
2169 * accidentally overwrite anything. This has been a problem because
2170 * the MRF registers and source for the final FB write are both fixed
2173 mrf
= retype(dest
, BRW_REGISTER_TYPE_UD
);
2175 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2177 dest
= retype(dest
, BRW_REGISTER_TYPE_UW
);
2179 const unsigned rlen
= num_regs
;
2180 const unsigned target_cache
=
2181 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2182 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2183 BRW_SFID_DATAPORT_READ
);
2186 brw_push_insn_state(p
);
2187 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2188 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2189 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2191 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2193 /* set message header global offset field (reg 0, element 2) */
2194 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2195 brw_MOV(p
, get_element_ud(mrf
, 2), brw_imm_ud(offset
));
2197 brw_pop_insn_state(p
);
2201 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2203 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2204 assert(brw_inst_pred_control(devinfo
, insn
) == 0);
2205 brw_inst_set_compression(devinfo
, insn
, false);
2207 brw_set_dest(p
, insn
, dest
); /* UW? */
2208 if (devinfo
->gen
>= 6) {
2209 brw_set_src0(p
, insn
, mrf
);
2211 brw_set_src0(p
, insn
, brw_null_reg());
2212 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2215 brw_set_desc(p
, insn
,
2216 brw_message_desc(devinfo
, 1, rlen
, true) |
2217 brw_dp_read_desc(devinfo
, brw_scratch_surface_idx(p
),
2218 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2219 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2220 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2225 gen7_block_read_scratch(struct brw_codegen
*p
,
2226 struct brw_reg dest
,
2230 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2231 assert(brw_inst_pred_control(p
->devinfo
, insn
) == BRW_PREDICATE_NONE
);
2233 brw_set_dest(p
, insn
, retype(dest
, BRW_REGISTER_TYPE_UW
));
2235 /* The HW requires that the header is present; this is to get the g0.5
2238 brw_set_src0(p
, insn
, brw_vec8_grf(0, 0));
2240 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2241 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2242 * is 32 bytes, which happens to be the size of a register.
2245 assert(offset
< (1 << 12));
2247 gen7_set_dp_scratch_message(p
, insn
,
2248 false, /* scratch read */
2250 false, /* invalidate after read */
2253 1, /* mlen: just g0 */
2254 num_regs
, /* rlen */
2255 true); /* header present */
2259 * Read float[4] vectors from the data port constant cache.
2260 * Location (in buffer) should be a multiple of 16.
2261 * Used for fetching shader constants.
2263 void brw_oword_block_read(struct brw_codegen
*p
,
2264 struct brw_reg dest
,
2267 uint32_t bind_table_index
)
2269 const struct gen_device_info
*devinfo
= p
->devinfo
;
2270 const unsigned target_cache
=
2271 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE
:
2272 BRW_SFID_DATAPORT_READ
);
2273 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
2275 /* On newer hardware, offset is in units of owords. */
2276 if (devinfo
->gen
>= 6)
2279 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2281 brw_push_insn_state(p
);
2282 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2283 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2284 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2286 brw_push_insn_state(p
);
2287 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2288 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2290 /* set message header global offset field (reg 0, element 2) */
2291 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2293 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2295 2), BRW_REGISTER_TYPE_UD
),
2296 brw_imm_ud(offset
));
2297 brw_pop_insn_state(p
);
2299 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2301 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2303 /* cast dest to a uword[8] vector */
2304 dest
= retype(vec8(dest
), BRW_REGISTER_TYPE_UW
);
2306 brw_set_dest(p
, insn
, dest
);
2307 if (devinfo
->gen
>= 6) {
2308 brw_set_src0(p
, insn
, mrf
);
2310 brw_set_src0(p
, insn
, brw_null_reg());
2311 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2314 brw_set_desc(p
, insn
,
2315 brw_message_desc(devinfo
, 1, DIV_ROUND_UP(exec_size
, 8), true) |
2316 brw_dp_read_desc(devinfo
, bind_table_index
,
2317 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size
),
2318 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2319 BRW_DATAPORT_READ_TARGET_DATA_CACHE
));
2321 brw_pop_insn_state(p
);
2325 brw_fb_WRITE(struct brw_codegen
*p
,
2326 struct brw_reg payload
,
2327 struct brw_reg implied_header
,
2328 unsigned msg_control
,
2329 unsigned binding_table_index
,
2330 unsigned msg_length
,
2331 unsigned response_length
,
2333 bool last_render_target
,
2334 bool header_present
)
2336 const struct gen_device_info
*devinfo
= p
->devinfo
;
2337 const unsigned target_cache
=
2338 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2339 BRW_SFID_DATAPORT_WRITE
);
2342 struct brw_reg dest
, src0
;
2344 if (brw_get_default_exec_size(p
) >= BRW_EXECUTE_16
)
2345 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2347 dest
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2349 if (devinfo
->gen
>= 6) {
2350 insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2352 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2354 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2355 brw_inst_set_compression(devinfo
, insn
, false);
2357 if (devinfo
->gen
>= 6) {
2358 /* headerless version, just submit color payload */
2361 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2363 assert(payload
.file
== BRW_MESSAGE_REGISTER_FILE
);
2364 brw_inst_set_base_mrf(devinfo
, insn
, payload
.nr
);
2365 src0
= implied_header
;
2367 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2370 brw_set_dest(p
, insn
, dest
);
2371 brw_set_src0(p
, insn
, src0
);
2372 brw_set_desc(p
, insn
,
2373 brw_message_desc(devinfo
, msg_length
, response_length
,
2375 brw_dp_write_desc(devinfo
, binding_table_index
, msg_control
,
2376 msg_type
, last_render_target
,
2377 0 /* send_commit_msg */));
2378 brw_inst_set_eot(devinfo
, insn
, eot
);
2384 gen9_fb_READ(struct brw_codegen
*p
,
2386 struct brw_reg payload
,
2387 unsigned binding_table_index
,
2388 unsigned msg_length
,
2389 unsigned response_length
,
2392 const struct gen_device_info
*devinfo
= p
->devinfo
;
2393 assert(devinfo
->gen
>= 9);
2394 const unsigned msg_subtype
=
2395 brw_get_default_exec_size(p
) == BRW_EXECUTE_16
? 0 : 1;
2396 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2398 brw_inst_set_sfid(devinfo
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
);
2399 brw_set_dest(p
, insn
, dst
);
2400 brw_set_src0(p
, insn
, payload
);
2403 brw_message_desc(devinfo
, msg_length
, response_length
, true) |
2404 brw_dp_read_desc(devinfo
, binding_table_index
,
2405 per_sample
<< 5 | msg_subtype
,
2406 GEN9_DATAPORT_RC_RENDER_TARGET_READ
,
2407 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2408 brw_inst_set_rt_slot_group(devinfo
, insn
, brw_get_default_group(p
) / 16);
2414 * Texture sample instruction.
2415 * Note: the msg_type plus msg_length values determine exactly what kind
2416 * of sampling operation is performed. See volume 4, page 161 of docs.
2418 void brw_SAMPLE(struct brw_codegen
*p
,
2419 struct brw_reg dest
,
2420 unsigned msg_reg_nr
,
2421 struct brw_reg src0
,
2422 unsigned binding_table_index
,
2425 unsigned response_length
,
2426 unsigned msg_length
,
2427 unsigned header_present
,
2429 unsigned return_format
)
2431 const struct gen_device_info
*devinfo
= p
->devinfo
;
2434 if (msg_reg_nr
!= -1)
2435 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2437 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2438 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_SAMPLER
);
2439 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
); /* XXX */
2441 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2443 * "Instruction compression is not allowed for this instruction (that
2444 * is, send). The hardware behavior is undefined if this instruction is
2445 * set as compressed. However, compress control can be set to "SecHalf"
2446 * to affect the EMask generation."
2448 * No similar wording is found in later PRMs, but there are examples
2449 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2450 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2451 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2453 brw_inst_set_compression(devinfo
, insn
, false);
2455 if (devinfo
->gen
< 6)
2456 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2458 brw_set_dest(p
, insn
, dest
);
2459 brw_set_src0(p
, insn
, src0
);
2460 brw_set_desc(p
, insn
,
2461 brw_message_desc(devinfo
, msg_length
, response_length
,
2463 brw_sampler_desc(devinfo
, binding_table_index
, sampler
,
2464 msg_type
, simd_mode
, return_format
));
2467 /* Adjust the message header's sampler state pointer to
2468 * select the correct group of 16 samplers.
2470 void brw_adjust_sampler_state_pointer(struct brw_codegen
*p
,
2471 struct brw_reg header
,
2472 struct brw_reg sampler_index
)
2474 /* The "Sampler Index" field can only store values between 0 and 15.
2475 * However, we can add an offset to the "Sampler State Pointer"
2476 * field, effectively selecting a different set of 16 samplers.
2478 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2479 * offset, and each sampler state is only 16-bytes, so we can't
2480 * exclusively use the offset - we have to use both.
2483 const struct gen_device_info
*devinfo
= p
->devinfo
;
2485 if (sampler_index
.file
== BRW_IMMEDIATE_VALUE
) {
2486 const int sampler_state_size
= 16; /* 16 bytes */
2487 uint32_t sampler
= sampler_index
.ud
;
2489 if (sampler
>= 16) {
2490 assert(devinfo
->is_haswell
|| devinfo
->gen
>= 8);
2492 get_element_ud(header
, 3),
2493 get_element_ud(brw_vec8_grf(0, 0), 3),
2494 brw_imm_ud(16 * (sampler
/ 16) * sampler_state_size
));
2497 /* Non-const sampler array indexing case */
2498 if (devinfo
->gen
< 8 && !devinfo
->is_haswell
) {
2502 struct brw_reg temp
= get_element_ud(header
, 3);
2504 brw_AND(p
, temp
, get_element_ud(sampler_index
, 0), brw_imm_ud(0x0f0));
2505 brw_SHL(p
, temp
, temp
, brw_imm_ud(4));
2507 get_element_ud(header
, 3),
2508 get_element_ud(brw_vec8_grf(0, 0), 3),
2513 /* All these variables are pretty confusing - we might be better off
2514 * using bitmasks and macros for this, in the old style. Or perhaps
2515 * just having the caller instantiate the fields in dword3 itself.
2517 void brw_urb_WRITE(struct brw_codegen
*p
,
2518 struct brw_reg dest
,
2519 unsigned msg_reg_nr
,
2520 struct brw_reg src0
,
2521 enum brw_urb_write_flags flags
,
2522 unsigned msg_length
,
2523 unsigned response_length
,
2527 const struct gen_device_info
*devinfo
= p
->devinfo
;
2530 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2532 if (devinfo
->gen
>= 7 && !(flags
& BRW_URB_WRITE_USE_CHANNEL_MASKS
)) {
2533 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2534 brw_push_insn_state(p
);
2535 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2536 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2537 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2538 brw_OR(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, msg_reg_nr
, 5),
2539 BRW_REGISTER_TYPE_UD
),
2540 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD
),
2541 brw_imm_ud(0xff00));
2542 brw_pop_insn_state(p
);
2545 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2547 assert(msg_length
< BRW_MAX_MRF(devinfo
->gen
));
2549 brw_set_dest(p
, insn
, dest
);
2550 brw_set_src0(p
, insn
, src0
);
2551 brw_set_src1(p
, insn
, brw_imm_d(0));
2553 if (devinfo
->gen
< 6)
2554 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2556 brw_set_urb_message(p
,
2566 brw_send_indirect_message(struct brw_codegen
*p
,
2569 struct brw_reg payload
,
2570 struct brw_reg desc
,
2574 const struct gen_device_info
*devinfo
= p
->devinfo
;
2575 struct brw_inst
*send
;
2577 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2579 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2581 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2582 send
= next_insn(p
, BRW_OPCODE_SEND
);
2583 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2584 brw_set_desc(p
, send
, desc
.ud
| desc_imm
);
2586 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2588 brw_push_insn_state(p
);
2589 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2590 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2591 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2592 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2594 /* Load the indirect descriptor to an address register using OR so the
2595 * caller can specify additional descriptor bits with the desc_imm
2598 brw_OR(p
, addr
, desc
, brw_imm_ud(desc_imm
));
2600 brw_pop_insn_state(p
);
2602 send
= next_insn(p
, BRW_OPCODE_SEND
);
2603 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2605 if (devinfo
->gen
>= 12)
2606 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, true);
2608 brw_set_src1(p
, send
, addr
);
2611 brw_set_dest(p
, send
, dst
);
2612 brw_inst_set_sfid(devinfo
, send
, sfid
);
2613 brw_inst_set_eot(devinfo
, send
, eot
);
2617 brw_send_indirect_split_message(struct brw_codegen
*p
,
2620 struct brw_reg payload0
,
2621 struct brw_reg payload1
,
2622 struct brw_reg desc
,
2624 struct brw_reg ex_desc
,
2625 unsigned ex_desc_imm
,
2628 const struct gen_device_info
*devinfo
= p
->devinfo
;
2629 struct brw_inst
*send
;
2631 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2633 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2635 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2636 desc
.ud
|= desc_imm
;
2638 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2640 brw_push_insn_state(p
);
2641 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2642 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2643 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2644 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2646 /* Load the indirect descriptor to an address register using OR so the
2647 * caller can specify additional descriptor bits with the desc_imm
2650 brw_OR(p
, addr
, desc
, brw_imm_ud(desc_imm
));
2652 brw_pop_insn_state(p
);
2656 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
&&
2657 (ex_desc
.ud
& INTEL_MASK(15, 12)) == 0) {
2658 ex_desc
.ud
|= ex_desc_imm
;
2660 struct brw_reg addr
= retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD
);
2662 brw_push_insn_state(p
);
2663 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2664 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2665 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2666 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2668 /* Load the indirect extended descriptor to an address register using OR
2669 * so the caller can specify additional descriptor bits with the
2670 * desc_imm immediate.
2672 * Even though the instruction dispatcher always pulls the SFID and EOT
2673 * fields from the instruction itself, actual external unit which
2674 * processes the message gets the SFID and EOT from the extended
2675 * descriptor which comes from the address register. If we don't OR
2676 * those two bits in, the external unit may get confused and hang.
2678 unsigned imm_part
= ex_desc_imm
| sfid
| eot
<< 5;
2680 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
) {
2681 /* ex_desc bits 15:12 don't exist in the instruction encoding, so
2682 * we may have fallen back to an indirect extended descriptor.
2684 brw_MOV(p
, addr
, brw_imm_ud(ex_desc
.ud
| imm_part
));
2686 brw_OR(p
, addr
, ex_desc
, brw_imm_ud(imm_part
));
2689 brw_pop_insn_state(p
);
2693 send
= next_insn(p
, devinfo
->gen
>= 12 ? BRW_OPCODE_SEND
: BRW_OPCODE_SENDS
);
2694 brw_set_dest(p
, send
, dst
);
2695 brw_set_src0(p
, send
, retype(payload0
, BRW_REGISTER_TYPE_UD
));
2696 brw_set_src1(p
, send
, retype(payload1
, BRW_REGISTER_TYPE_UD
));
2698 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2699 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, 0);
2700 brw_inst_set_send_desc(devinfo
, send
, desc
.ud
);
2702 assert(desc
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
2703 assert(desc
.nr
== BRW_ARF_ADDRESS
);
2704 assert(desc
.subnr
== 0);
2705 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, 1);
2708 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
) {
2709 brw_inst_set_send_sel_reg32_ex_desc(devinfo
, send
, 0);
2710 brw_inst_set_sends_ex_desc(devinfo
, send
, ex_desc
.ud
);
2712 assert(ex_desc
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
2713 assert(ex_desc
.nr
== BRW_ARF_ADDRESS
);
2714 assert((ex_desc
.subnr
& 0x3) == 0);
2715 brw_inst_set_send_sel_reg32_ex_desc(devinfo
, send
, 1);
2716 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo
, send
, ex_desc
.subnr
>> 2);
2719 brw_inst_set_sfid(devinfo
, send
, sfid
);
2720 brw_inst_set_eot(devinfo
, send
, eot
);
2724 brw_send_indirect_surface_message(struct brw_codegen
*p
,
2727 struct brw_reg payload
,
2728 struct brw_reg surface
,
2731 if (surface
.file
!= BRW_IMMEDIATE_VALUE
) {
2732 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2734 brw_push_insn_state(p
);
2735 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2736 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2737 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2738 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2740 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2741 * some surface array is accessed out of bounds.
2744 suboffset(vec1(retype(surface
, BRW_REGISTER_TYPE_UD
)),
2745 BRW_GET_SWZ(surface
.swizzle
, 0)),
2748 brw_pop_insn_state(p
);
2753 brw_send_indirect_message(p
, sfid
, dst
, payload
, surface
, desc_imm
, false);
2757 while_jumps_before_offset(const struct gen_device_info
*devinfo
,
2758 brw_inst
*insn
, int while_offset
, int start_offset
)
2760 int scale
= 16 / brw_jump_scale(devinfo
);
2761 int jip
= devinfo
->gen
== 6 ? brw_inst_gen6_jump_count(devinfo
, insn
)
2762 : brw_inst_jip(devinfo
, insn
);
2764 return while_offset
+ jip
* scale
<= start_offset
;
2769 brw_find_next_block_end(struct brw_codegen
*p
, int start_offset
)
2772 void *store
= p
->store
;
2773 const struct gen_device_info
*devinfo
= p
->devinfo
;
2777 for (offset
= next_offset(devinfo
, store
, start_offset
);
2778 offset
< p
->next_insn_offset
;
2779 offset
= next_offset(devinfo
, store
, offset
)) {
2780 brw_inst
*insn
= store
+ offset
;
2782 switch (brw_inst_opcode(devinfo
, insn
)) {
2786 case BRW_OPCODE_ENDIF
:
2791 case BRW_OPCODE_WHILE
:
2792 /* If the while doesn't jump before our instruction, it's the end
2793 * of a sibling do...while loop. Ignore it.
2795 if (!while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2798 case BRW_OPCODE_ELSE
:
2799 case BRW_OPCODE_HALT
:
2810 /* There is no DO instruction on gen6, so to find the end of the loop
2811 * we have to see if the loop is jumping back before our start
2815 brw_find_loop_end(struct brw_codegen
*p
, int start_offset
)
2817 const struct gen_device_info
*devinfo
= p
->devinfo
;
2819 void *store
= p
->store
;
2821 assert(devinfo
->gen
>= 6);
2823 /* Always start after the instruction (such as a WHILE) we're trying to fix
2826 for (offset
= next_offset(devinfo
, store
, start_offset
);
2827 offset
< p
->next_insn_offset
;
2828 offset
= next_offset(devinfo
, store
, offset
)) {
2829 brw_inst
*insn
= store
+ offset
;
2831 if (brw_inst_opcode(devinfo
, insn
) == BRW_OPCODE_WHILE
) {
2832 if (while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2836 assert(!"not reached");
2837 return start_offset
;
2840 /* After program generation, go back and update the UIP and JIP of
2841 * BREAK, CONT, and HALT instructions to their correct locations.
2844 brw_set_uip_jip(struct brw_codegen
*p
, int start_offset
)
2846 const struct gen_device_info
*devinfo
= p
->devinfo
;
2848 int br
= brw_jump_scale(devinfo
);
2849 int scale
= 16 / br
;
2850 void *store
= p
->store
;
2852 if (devinfo
->gen
< 6)
2855 for (offset
= start_offset
; offset
< p
->next_insn_offset
; offset
+= 16) {
2856 brw_inst
*insn
= store
+ offset
;
2857 assert(brw_inst_cmpt_control(devinfo
, insn
) == 0);
2859 int block_end_offset
= brw_find_next_block_end(p
, offset
);
2860 switch (brw_inst_opcode(devinfo
, insn
)) {
2861 case BRW_OPCODE_BREAK
:
2862 assert(block_end_offset
!= 0);
2863 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2864 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2865 brw_inst_set_uip(devinfo
, insn
,
2866 (brw_find_loop_end(p
, offset
) - offset
+
2867 (devinfo
->gen
== 6 ? 16 : 0)) / scale
);
2869 case BRW_OPCODE_CONTINUE
:
2870 assert(block_end_offset
!= 0);
2871 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2872 brw_inst_set_uip(devinfo
, insn
,
2873 (brw_find_loop_end(p
, offset
) - offset
) / scale
);
2875 assert(brw_inst_uip(devinfo
, insn
) != 0);
2876 assert(brw_inst_jip(devinfo
, insn
) != 0);
2879 case BRW_OPCODE_ENDIF
: {
2880 int32_t jump
= (block_end_offset
== 0) ?
2881 1 * br
: (block_end_offset
- offset
) / scale
;
2882 if (devinfo
->gen
>= 7)
2883 brw_inst_set_jip(devinfo
, insn
, jump
);
2885 brw_inst_set_gen6_jump_count(devinfo
, insn
, jump
);
2889 case BRW_OPCODE_HALT
:
2890 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2892 * "In case of the halt instruction not inside any conditional
2893 * code block, the value of <JIP> and <UIP> should be the
2894 * same. In case of the halt instruction inside conditional code
2895 * block, the <UIP> should be the end of the program, and the
2896 * <JIP> should be end of the most inner conditional code block."
2898 * The uip will have already been set by whoever set up the
2901 if (block_end_offset
== 0) {
2902 brw_inst_set_jip(devinfo
, insn
, brw_inst_uip(devinfo
, insn
));
2904 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2906 assert(brw_inst_uip(devinfo
, insn
) != 0);
2907 assert(brw_inst_jip(devinfo
, insn
) != 0);
2916 void brw_ff_sync(struct brw_codegen
*p
,
2917 struct brw_reg dest
,
2918 unsigned msg_reg_nr
,
2919 struct brw_reg src0
,
2921 unsigned response_length
,
2924 const struct gen_device_info
*devinfo
= p
->devinfo
;
2927 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2929 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2930 brw_set_dest(p
, insn
, dest
);
2931 brw_set_src0(p
, insn
, src0
);
2932 brw_set_src1(p
, insn
, brw_imm_d(0));
2934 if (devinfo
->gen
< 6)
2935 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2937 brw_set_ff_sync_message(p
,
2945 * Emit the SEND instruction necessary to generate stream output data on Gen6
2946 * (for transform feedback).
2948 * If send_commit_msg is true, this is the last piece of stream output data
2949 * from this thread, so send the data as a committed write. According to the
2950 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2952 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2953 * writes are complete by sending the final write as a committed write."
2956 brw_svb_write(struct brw_codegen
*p
,
2957 struct brw_reg dest
,
2958 unsigned msg_reg_nr
,
2959 struct brw_reg src0
,
2960 unsigned binding_table_index
,
2961 bool send_commit_msg
)
2963 const struct gen_device_info
*devinfo
= p
->devinfo
;
2964 const unsigned target_cache
=
2965 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2966 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2967 BRW_SFID_DATAPORT_WRITE
);
2970 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2972 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2973 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2974 brw_set_dest(p
, insn
, dest
);
2975 brw_set_src0(p
, insn
, src0
);
2976 brw_set_desc(p
, insn
,
2977 brw_message_desc(devinfo
, 1, send_commit_msg
, true) |
2978 brw_dp_write_desc(devinfo
, binding_table_index
,
2979 0, /* msg_control: ignored */
2980 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE
,
2981 0, /* last_render_target: ignored */
2982 send_commit_msg
)); /* send_commit_msg */
/**
 * Number of response registers needed for a surface message returning
 * \p num_channels channels at the given execution size.
 *
 * \param exec_size  0 selects SIMD4x2 (a single register regardless of the
 *                   channel count); <= 8 needs one register per channel;
 *                   larger sizes need two registers per channel.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}
2999 brw_untyped_atomic(struct brw_codegen
*p
,
3001 struct brw_reg payload
,
3002 struct brw_reg surface
,
3004 unsigned msg_length
,
3005 bool response_expected
,
3006 bool header_present
)
3008 const struct gen_device_info
*devinfo
= p
->devinfo
;
3009 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3010 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3011 GEN7_SFID_DATAPORT_DATA_CACHE
);
3012 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3013 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3014 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
3015 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
3016 has_simd4x2
? 0 : 8;
3017 const unsigned response_length
=
3018 brw_surface_payload_size(p
, response_expected
, exec_size
);
3019 const unsigned desc
=
3020 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
3021 brw_dp_untyped_atomic_desc(devinfo
, exec_size
, atomic_op
,
3023 /* Mask out unused components -- This is especially important in Align16
3024 * mode on generations that don't have native support for SIMD4x2 atomics,
3025 * because unused but enabled components will cause the dataport to perform
3026 * additional atomic operations on the addresses that happen to be in the
3027 * uninitialized Y, Z and W coordinates of the payload.
3029 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
3031 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(dst
, mask
),
3032 payload
, surface
, desc
);
3036 brw_untyped_surface_read(struct brw_codegen
*p
,
3038 struct brw_reg payload
,
3039 struct brw_reg surface
,
3040 unsigned msg_length
,
3041 unsigned num_channels
)
3043 const struct gen_device_info
*devinfo
= p
->devinfo
;
3044 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3045 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3046 GEN7_SFID_DATAPORT_DATA_CACHE
);
3047 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3048 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) : 0;
3049 const unsigned response_length
=
3050 brw_surface_payload_size(p
, num_channels
, exec_size
);
3051 const unsigned desc
=
3052 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3053 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, false);
3055 brw_send_indirect_surface_message(p
, sfid
, dst
, payload
, surface
, desc
);
3059 brw_untyped_surface_write(struct brw_codegen
*p
,
3060 struct brw_reg payload
,
3061 struct brw_reg surface
,
3062 unsigned msg_length
,
3063 unsigned num_channels
,
3064 bool header_present
)
3066 const struct gen_device_info
*devinfo
= p
->devinfo
;
3067 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3068 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3069 GEN7_SFID_DATAPORT_DATA_CACHE
);
3070 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3071 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3072 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
3073 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
3074 has_simd4x2
? 0 : 8;
3075 const unsigned desc
=
3076 brw_message_desc(devinfo
, msg_length
, 0, header_present
) |
3077 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, true);
3078 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3079 const unsigned mask
= !has_simd4x2
&& !align1
? WRITEMASK_X
: WRITEMASK_XYZW
;
3081 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(brw_null_reg(), mask
),
3082 payload
, surface
, desc
);
3086 brw_set_memory_fence_message(struct brw_codegen
*p
,
3087 struct brw_inst
*insn
,
3088 enum brw_message_target sfid
,
3092 const struct gen_device_info
*devinfo
= p
->devinfo
;
3094 brw_set_desc(p
, insn
, brw_message_desc(
3095 devinfo
, 1, (commit_enable
? 1 : 0), true));
3097 brw_inst_set_sfid(devinfo
, insn
, sfid
);
3100 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
3101 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_RC_MEMORY_FENCE
);
3103 case GEN7_SFID_DATAPORT_DATA_CACHE
:
3104 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_DC_MEMORY_FENCE
);
3107 unreachable("Not reached");
3111 brw_inst_set_dp_msg_control(devinfo
, insn
, 1 << 5);
3113 assert(devinfo
->gen
>= 11 || bti
== 0);
3114 brw_inst_set_binding_table_index(devinfo
, insn
, bti
);
3118 brw_memory_fence(struct brw_codegen
*p
,
3121 enum opcode send_op
,
3125 const struct gen_device_info
*devinfo
= p
->devinfo
;
3126 const bool commit_enable
= stall
||
3127 devinfo
->gen
>= 10 || /* HSD ES # 1404612949 */
3128 (devinfo
->gen
== 7 && !devinfo
->is_haswell
);
3129 struct brw_inst
*insn
;
3131 brw_push_insn_state(p
);
3132 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3133 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3134 dst
= retype(vec1(dst
), BRW_REGISTER_TYPE_UW
);
3135 src
= retype(vec1(src
), BRW_REGISTER_TYPE_UD
);
3137 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3138 * message doesn't write anything back.
3140 insn
= next_insn(p
, send_op
);
3141 brw_set_dest(p
, insn
, dst
);
3142 brw_set_src0(p
, insn
, src
);
3143 brw_set_memory_fence_message(p
, insn
, GEN7_SFID_DATAPORT_DATA_CACHE
,
3144 commit_enable
, bti
);
3146 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
) {
3147 /* IVB does typed surface access through the render cache, so we need to
3148 * flush it too. Use a different register so both flushes can be
3149 * pipelined by the hardware.
3151 insn
= next_insn(p
, send_op
);
3152 brw_set_dest(p
, insn
, offset(dst
, 1));
3153 brw_set_src0(p
, insn
, src
);
3154 brw_set_memory_fence_message(p
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
,
3155 commit_enable
, bti
);
3157 /* Now write the response of the second message into the response of the
3158 * first to trigger a pipeline stall -- This way future render and data
3159 * cache messages will be properly ordered with respect to past data and
3160 * render cache messages.
3162 brw_MOV(p
, dst
, offset(dst
, 1));
3166 brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
), dst
);
3168 brw_pop_insn_state(p
);
3172 brw_pixel_interpolator_query(struct brw_codegen
*p
,
3173 struct brw_reg dest
,
3177 struct brw_reg data
,
3178 unsigned msg_length
,
3179 unsigned response_length
)
3181 const struct gen_device_info
*devinfo
= p
->devinfo
;
3182 const uint16_t exec_size
= brw_get_default_exec_size(p
);
3183 const unsigned slot_group
= brw_get_default_group(p
) / 16;
3184 const unsigned simd_mode
= (exec_size
== BRW_EXECUTE_16
);
3185 const unsigned desc
=
3186 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3187 brw_pixel_interp_desc(devinfo
, mode
, noperspective
, simd_mode
,
3190 /* brw_send_indirect_message will automatically use a direct send message
3191 * if data is actually immediate.
3193 brw_send_indirect_message(p
,
3194 GEN7_SFID_PIXEL_INTERPOLATOR
,
3203 brw_find_live_channel(struct brw_codegen
*p
, struct brw_reg dst
,
3204 struct brw_reg mask
)
3206 const struct gen_device_info
*devinfo
= p
->devinfo
;
3207 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
3208 const unsigned qtr_control
= brw_get_default_group(p
) / 8;
3211 assert(devinfo
->gen
>= 7);
3212 assert(mask
.type
== BRW_REGISTER_TYPE_UD
);
3214 brw_push_insn_state(p
);
3216 /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3217 * unnecessary bits in the instruction words, get the information we need
3218 * and reset the default flag register. This allows more instructions to be
3221 const unsigned flag_subreg
= p
->current
->flag_subreg
;
3222 brw_set_default_flag_reg(p
, 0, 0);
3224 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3225 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3227 if (devinfo
->gen
>= 8) {
3228 /* Getting the first active channel index is easy on Gen8: Just find
3229 * the first bit set in the execution mask. The register exists on
3230 * HSW already but it reads back as all ones when the current
3231 * instruction has execution masking disabled, so it's kind of
3234 struct brw_reg exec_mask
=
3235 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
);
3237 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3238 if (mask
.file
!= BRW_IMMEDIATE_VALUE
|| mask
.ud
!= 0xffffffff) {
3239 /* Unfortunately, ce0 does not take into account the thread
3240 * dispatch mask, which may be a problem in cases where it's not
3241 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3242 * some n). Combine ce0 with the given dispatch (or vector) mask
3243 * to mask off those channels which were never dispatched by the
3246 brw_SHR(p
, vec1(dst
), mask
, brw_imm_ud(qtr_control
* 8));
3247 brw_AND(p
, vec1(dst
), exec_mask
, vec1(dst
));
3248 exec_mask
= vec1(dst
);
3251 /* Quarter control has the effect of magically shifting the value of
3252 * ce0 so you'll get the first active channel relative to the
3253 * specified quarter control as result.
3255 inst
= brw_FBL(p
, vec1(dst
), exec_mask
);
3257 const struct brw_reg flag
= brw_flag_subreg(flag_subreg
);
3259 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3260 brw_MOV(p
, retype(flag
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
3262 /* Run enough instructions returning zero with execution masking and
3263 * a conditional modifier enabled in order to get the full execution
3264 * mask in f1.0. We could use a single 32-wide move here if it
3265 * weren't because of the hardware bug that causes channel enables to
3266 * be applied incorrectly to the second half of 32-wide instructions
3269 const unsigned lower_size
= MIN2(16, exec_size
);
3270 for (unsigned i
= 0; i
< exec_size
/ lower_size
; i
++) {
3271 inst
= brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
),
3273 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3274 brw_inst_set_group(devinfo
, inst
, lower_size
* i
+ 8 * qtr_control
);
3275 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_Z
);
3276 brw_inst_set_exec_size(devinfo
, inst
, cvt(lower_size
) - 1);
3277 brw_inst_set_flag_reg_nr(devinfo
, inst
, flag_subreg
/ 2);
3278 brw_inst_set_flag_subreg_nr(devinfo
, inst
, flag_subreg
% 2);
3281 /* Find the first bit set in the exec_size-wide portion of the flag
3282 * register that was updated by the last sequence of MOV
3285 const enum brw_reg_type type
= brw_int_type(exec_size
/ 8, false);
3286 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3287 brw_FBL(p
, vec1(dst
), byte_offset(retype(flag
, type
), qtr_control
));
3290 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3292 if (devinfo
->gen
>= 8 &&
3293 mask
.file
== BRW_IMMEDIATE_VALUE
&& mask
.ud
== 0xffffffff) {
3294 /* In SIMD4x2 mode the first active channel index is just the
3295 * negation of the first bit of the mask register. Note that ce0
3296 * doesn't take into account the dispatch mask, so the Gen7 path
3297 * should be used instead unless you have the guarantee that the
3298 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3301 inst
= brw_AND(p
, brw_writemask(dst
, WRITEMASK_X
),
3302 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
)),
3306 /* Overwrite the destination without and with execution masking to
3307 * find out which of the channels is active.
3309 brw_push_insn_state(p
);
3310 brw_set_default_exec_size(p
, BRW_EXECUTE_4
);
3311 brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3314 inst
= brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3316 brw_pop_insn_state(p
);
3317 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3321 brw_pop_insn_state(p
);
3325 brw_broadcast(struct brw_codegen
*p
,
3330 const struct gen_device_info
*devinfo
= p
->devinfo
;
3331 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3334 brw_push_insn_state(p
);
3335 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3336 brw_set_default_exec_size(p
, align1
? BRW_EXECUTE_1
: BRW_EXECUTE_4
);
3338 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
&&
3339 src
.address_mode
== BRW_ADDRESS_DIRECT
);
3340 assert(!src
.abs
&& !src
.negate
);
3341 assert(src
.type
== dst
.type
);
3343 if ((src
.vstride
== 0 && (src
.hstride
== 0 || !align1
)) ||
3344 idx
.file
== BRW_IMMEDIATE_VALUE
) {
3345 /* Trivial, the source is already uniform or the index is a constant.
3346 * We will typically not get here if the optimizer is doing its job, but
3347 * asserting would be mean.
3349 const unsigned i
= idx
.file
== BRW_IMMEDIATE_VALUE
? idx
.ud
: 0;
3351 (align1
? stride(suboffset(src
, i
), 0, 1, 0) :
3352 stride(suboffset(src
, 4 * i
), 0, 4, 1)));
3354 /* From the Haswell PRM section "Register Region Restrictions":
3356 * "The lower bits of the AddressImmediate must not overflow to
3357 * change the register address. The lower 5 bits of Address
3358 * Immediate when added to lower 5 bits of address register gives
3359 * the sub-register offset. The upper bits of Address Immediate
3360 * when added to upper bits of address register gives the register
3361 * address. Any overflow from sub-register offset is dropped."
3363 * Fortunately, for broadcast, we never have a sub-register offset so
3364 * this isn't an issue.
3366 assert(src
.subnr
== 0);
3369 const struct brw_reg addr
=
3370 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
3371 unsigned offset
= src
.nr
* REG_SIZE
+ src
.subnr
;
3372 /* Limit in bytes of the signed indirect addressing immediate. */
3373 const unsigned limit
= 512;
3375 brw_push_insn_state(p
);
3376 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3377 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
3379 /* Take into account the component size and horizontal stride. */
3380 assert(src
.vstride
== src
.hstride
+ src
.width
);
3381 brw_SHL(p
, addr
, vec1(idx
),
3382 brw_imm_ud(_mesa_logbase2(type_sz(src
.type
)) +
3385 /* We can only address up to limit bytes using the indirect
3386 * addressing immediate, account for the difference if the source
3387 * register is above this limit.
3389 if (offset
>= limit
) {
3390 brw_ADD(p
, addr
, addr
, brw_imm_ud(offset
- offset
% limit
));
3391 offset
= offset
% limit
;
3394 brw_pop_insn_state(p
);
3396 /* Use indirect addressing to fetch the specified component. */
3397 if (type_sz(src
.type
) > 4 &&
3398 (devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
))) {
3399 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3401 * "When source or destination datatype is 64b or operation is
3402 * integer DWord multiply, indirect addressing must not be
3405 * To work around both of this issue, we do two integer MOVs
3406 * insead of one 64-bit MOV. Because no double value should ever
3407 * cross a register boundary, it's safe to use the immediate
3408 * offset in the indirect here to handle adding 4 bytes to the
3409 * offset and avoid the extra ADD to the register file.
3411 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 0),
3412 retype(brw_vec1_indirect(addr
.subnr
, offset
),
3413 BRW_REGISTER_TYPE_D
));
3414 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 1),
3415 retype(brw_vec1_indirect(addr
.subnr
, offset
+ 4),
3416 BRW_REGISTER_TYPE_D
));
3419 retype(brw_vec1_indirect(addr
.subnr
, offset
), src
.type
));
3422 /* In SIMD4x2 mode the index can be either zero or one, replicate it
3423 * to all bits of a flag register,
3427 stride(brw_swizzle(idx
, BRW_SWIZZLE_XXXX
), 4, 4, 1));
3428 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NONE
);
3429 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_NZ
);
3430 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3432 /* and use predicated SEL to pick the right channel. */
3433 inst
= brw_SEL(p
, dst
,
3434 stride(suboffset(src
, 4), 4, 4, 1),
3435 stride(src
, 4, 4, 1));
3436 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NORMAL
);
3437 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3441 brw_pop_insn_state(p
);
3445 * This instruction is generated as a single-channel align1 instruction by
3446 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3448 * We can't use the typed atomic op in the FS because that has the execution
3449 * mask ANDed with the pixel mask, but we just want to write the one dword for
3452 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3453 * one u32. So we use the same untyped atomic write message as the pixel
3456 * The untyped atomic operation requires a BUFFER surface type with RAW
3457 * format, and is only accessible through the legacy DATA_CACHE dataport
3460 void brw_shader_time_add(struct brw_codegen
*p
,
3461 struct brw_reg payload
,
3462 uint32_t surf_index
)
3464 const struct gen_device_info
*devinfo
= p
->devinfo
;
3465 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3466 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3467 GEN7_SFID_DATAPORT_DATA_CACHE
);
3468 assert(devinfo
->gen
>= 7);
3470 brw_push_insn_state(p
);
3471 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3472 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3473 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
3474 brw_inst
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
3476 /* We use brw_vec1_reg and unmasked because we want to increment the given
3479 brw_set_dest(p
, send
, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
3481 brw_set_src0(p
, send
, brw_vec1_reg(payload
.file
,
3483 brw_set_desc(p
, send
, (brw_message_desc(devinfo
, 2, 0, false) |
3484 brw_dp_untyped_atomic_desc(devinfo
, 1, BRW_AOP_ADD
,
3487 brw_inst_set_sfid(devinfo
, send
, sfid
);
3488 brw_inst_set_binding_table_index(devinfo
, send
, surf_index
);
3490 brw_pop_insn_state(p
);
3495 * Emit the SEND message for a barrier
3498 brw_barrier(struct brw_codegen
*p
, struct brw_reg src
)
3500 const struct gen_device_info
*devinfo
= p
->devinfo
;
3501 struct brw_inst
*inst
;
3503 assert(devinfo
->gen
>= 7);
3505 brw_push_insn_state(p
);
3506 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3507 inst
= next_insn(p
, BRW_OPCODE_SEND
);
3508 brw_set_dest(p
, inst
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
));
3509 brw_set_src0(p
, inst
, src
);
3510 brw_set_src1(p
, inst
, brw_null_reg());
3511 brw_set_desc(p
, inst
, brw_message_desc(devinfo
, 1, 0, false));
3513 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MESSAGE_GATEWAY
);
3514 brw_inst_set_gateway_notify(devinfo
, inst
, 1);
3515 brw_inst_set_gateway_subfuncid(devinfo
, inst
,
3516 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG
);
3518 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
3519 brw_pop_insn_state(p
);
3524 * Emit the wait instruction for a barrier
3527 brw_WAIT(struct brw_codegen
*p
)
3529 const struct gen_device_info
*devinfo
= p
->devinfo
;
3530 struct brw_inst
*insn
;
3532 struct brw_reg src
= brw_notification_reg();
3534 insn
= next_insn(p
, BRW_OPCODE_WAIT
);
3535 brw_set_dest(p
, insn
, src
);
3536 brw_set_src0(p
, insn
, src
);
3537 brw_set_src1(p
, insn
, brw_null_reg());
3539 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
3540 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_DISABLE
);
3544 brw_float_controls_mode(struct brw_codegen
*p
,
3545 unsigned mode
, unsigned mask
)
3547 brw_inst
*inst
= brw_AND(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3549 brw_inst_set_exec_size(p
->devinfo
, inst
, BRW_EXECUTE_1
);
3551 /* From the Skylake PRM, Volume 7, page 760:
3552 * "Implementation Restriction on Register Access: When the control
3553 * register is used as an explicit source and/or destination, hardware
3554 * does not ensure execution pipeline coherency. Software must set the
3555 * thread control field to ‘switch’ for an instruction that uses
3556 * control register as an explicit operand."
3558 brw_inst_set_thread_control(p
->devinfo
, inst
, BRW_THREAD_SWITCH
);
3561 brw_inst
*inst_or
= brw_OR(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3563 brw_inst_set_exec_size(p
->devinfo
, inst_or
, BRW_EXECUTE_1
);
3564 brw_inst_set_thread_control(p
->devinfo
, inst_or
, BRW_THREAD_SWITCH
);