/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include "brw_eu_defines.h"

#include "util/ralloc.h"
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
46 gen6_resolve_implied_move(struct brw_codegen
*p
,
50 const struct gen_device_info
*devinfo
= p
->devinfo
;
54 if (src
->file
== BRW_MESSAGE_REGISTER_FILE
)
57 if (src
->file
!= BRW_ARCHITECTURE_REGISTER_FILE
|| src
->nr
!= BRW_ARF_NULL
) {
58 brw_push_insn_state(p
);
59 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
60 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
61 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
62 brw_MOV(p
, retype(brw_message_reg(msg_reg_nr
), BRW_REGISTER_TYPE_UD
),
63 retype(*src
, BRW_REGISTER_TYPE_UD
));
64 brw_pop_insn_state(p
);
66 *src
= brw_message_reg(msg_reg_nr
);
70 gen7_convert_mrf_to_grf(struct brw_codegen
*p
, struct brw_reg
*reg
)
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
80 const struct gen_device_info
*devinfo
= p
->devinfo
;
81 if (devinfo
->gen
>= 7 && reg
->file
== BRW_MESSAGE_REGISTER_FILE
) {
82 reg
->file
= BRW_GENERAL_REGISTER_FILE
;
83 reg
->nr
+= GEN7_MRF_HACK_START
;
88 brw_set_dest(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg dest
)
90 const struct gen_device_info
*devinfo
= p
->devinfo
;
92 if (dest
.file
== BRW_MESSAGE_REGISTER_FILE
)
93 assert((dest
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
94 else if (dest
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
)
95 assert(dest
.nr
< 128);
97 gen7_convert_mrf_to_grf(p
, &dest
);
99 brw_inst_set_dst_file_type(devinfo
, inst
, dest
.file
, dest
.type
);
100 brw_inst_set_dst_address_mode(devinfo
, inst
, dest
.address_mode
);
102 if (dest
.address_mode
== BRW_ADDRESS_DIRECT
) {
103 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
105 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
106 brw_inst_set_dst_da1_subreg_nr(devinfo
, inst
, dest
.subnr
);
107 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
108 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
109 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
111 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
112 brw_inst_set_da16_writemask(devinfo
, inst
, dest
.writemask
);
113 if (dest
.file
== BRW_GENERAL_REGISTER_FILE
||
114 dest
.file
== BRW_MESSAGE_REGISTER_FILE
) {
115 assert(dest
.writemask
!= 0);
117 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
118 * Although Dst.HorzStride is a don't care for Align16, HW needs
119 * this to be programmed as "01".
121 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
124 brw_inst_set_dst_ia_subreg_nr(devinfo
, inst
, dest
.subnr
);
126 /* These are different sizes in align1 vs align16:
128 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
129 brw_inst_set_dst_ia1_addr_imm(devinfo
, inst
,
130 dest
.indirect_offset
);
131 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
132 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
133 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
135 brw_inst_set_dst_ia16_addr_imm(devinfo
, inst
,
136 dest
.indirect_offset
);
137 /* even ignored in da16, still need to set as '01' */
138 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
142 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
143 * or 16 (SIMD16), as that's normally correct. However, when dealing with
144 * small registers, it can be useful for us to automatically reduce it to
145 * match the register size.
147 if (p
->automatic_exec_sizes
) {
149 * In platforms that support fp64 we can emit instructions with a width
150 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
151 * these cases we need to make sure that these instructions have their
152 * exec sizes set properly when they are emitted and we can't rely on
153 * this code to fix it.
156 if (devinfo
->gen
>= 6)
157 fix_exec_size
= dest
.width
< BRW_EXECUTE_4
;
159 fix_exec_size
= dest
.width
< BRW_EXECUTE_8
;
162 brw_inst_set_exec_size(devinfo
, inst
, dest
.width
);
167 brw_set_src0(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
169 const struct gen_device_info
*devinfo
= p
->devinfo
;
171 if (reg
.file
== BRW_MESSAGE_REGISTER_FILE
)
172 assert((reg
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
173 else if (reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
)
174 assert(reg
.nr
< 128);
176 gen7_convert_mrf_to_grf(p
, ®
);
178 if (devinfo
->gen
>= 6 && (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
179 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
)) {
180 /* Any source modifiers or regions will be ignored, since this just
181 * identifies the MRF/GRF to start reading the message contents from.
182 * Check for some likely failures.
186 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
189 brw_inst_set_src0_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
190 brw_inst_set_src0_abs(devinfo
, inst
, reg
.abs
);
191 brw_inst_set_src0_negate(devinfo
, inst
, reg
.negate
);
192 brw_inst_set_src0_address_mode(devinfo
, inst
, reg
.address_mode
);
194 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
195 if (reg
.type
== BRW_REGISTER_TYPE_DF
||
196 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_DIM
)
197 brw_inst_set_imm_df(devinfo
, inst
, reg
.df
);
198 else if (reg
.type
== BRW_REGISTER_TYPE_UQ
||
199 reg
.type
== BRW_REGISTER_TYPE_Q
)
200 brw_inst_set_imm_uq(devinfo
, inst
, reg
.u64
);
202 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
204 if (type_sz(reg
.type
) < 8) {
205 brw_inst_set_src1_reg_file(devinfo
, inst
,
206 BRW_ARCHITECTURE_REGISTER_FILE
);
207 brw_inst_set_src1_reg_hw_type(devinfo
, inst
,
208 brw_inst_src0_reg_hw_type(devinfo
, inst
));
211 if (reg
.address_mode
== BRW_ADDRESS_DIRECT
) {
212 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
213 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
214 brw_inst_set_src0_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
216 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
219 brw_inst_set_src0_ia_subreg_nr(devinfo
, inst
, reg
.subnr
);
221 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
222 brw_inst_set_src0_ia1_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
224 brw_inst_set_src0_ia16_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
228 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
229 if (reg
.width
== BRW_WIDTH_1
&&
230 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
231 brw_inst_set_src0_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
232 brw_inst_set_src0_width(devinfo
, inst
, BRW_WIDTH_1
);
233 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
235 brw_inst_set_src0_hstride(devinfo
, inst
, reg
.hstride
);
236 brw_inst_set_src0_width(devinfo
, inst
, reg
.width
);
237 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
240 brw_inst_set_src0_da16_swiz_x(devinfo
, inst
,
241 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
242 brw_inst_set_src0_da16_swiz_y(devinfo
, inst
,
243 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
244 brw_inst_set_src0_da16_swiz_z(devinfo
, inst
,
245 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
246 brw_inst_set_src0_da16_swiz_w(devinfo
, inst
,
247 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
249 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
250 /* This is an oddity of the fact we're using the same
251 * descriptions for registers in align_16 as align_1:
253 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
254 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
255 reg
.type
== BRW_REGISTER_TYPE_DF
&&
256 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
259 * "For Align16 access mode, only encodings of 0000 and 0011
260 * are allowed. Other codes are reserved."
262 * Presumably the DevSNB behavior applies to IVB as well.
264 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
266 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
274 brw_set_src1(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
276 const struct gen_device_info
*devinfo
= p
->devinfo
;
278 if (reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
)
279 assert(reg
.nr
< 128);
281 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
283 * "Accumulator registers may be accessed explicitly as src0
286 assert(reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
287 reg
.nr
!= BRW_ARF_ACCUMULATOR
);
289 gen7_convert_mrf_to_grf(p
, ®
);
290 assert(reg
.file
!= BRW_MESSAGE_REGISTER_FILE
);
292 brw_inst_set_src1_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
293 brw_inst_set_src1_abs(devinfo
, inst
, reg
.abs
);
294 brw_inst_set_src1_negate(devinfo
, inst
, reg
.negate
);
296 /* Only src1 can be immediate in two-argument instructions.
298 assert(brw_inst_src0_reg_file(devinfo
, inst
) != BRW_IMMEDIATE_VALUE
);
300 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
301 /* two-argument instructions can only use 32-bit immediates */
302 assert(type_sz(reg
.type
) < 8);
303 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
305 /* This is a hardware restriction, which may or may not be lifted
308 assert (reg
.address_mode
== BRW_ADDRESS_DIRECT
);
309 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
311 brw_inst_set_src1_da_reg_nr(devinfo
, inst
, reg
.nr
);
312 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
313 brw_inst_set_src1_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
315 brw_inst_set_src1_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
318 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
319 if (reg
.width
== BRW_WIDTH_1
&&
320 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
321 brw_inst_set_src1_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
322 brw_inst_set_src1_width(devinfo
, inst
, BRW_WIDTH_1
);
323 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
325 brw_inst_set_src1_hstride(devinfo
, inst
, reg
.hstride
);
326 brw_inst_set_src1_width(devinfo
, inst
, reg
.width
);
327 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
330 brw_inst_set_src1_da16_swiz_x(devinfo
, inst
,
331 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
332 brw_inst_set_src1_da16_swiz_y(devinfo
, inst
,
333 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
334 brw_inst_set_src1_da16_swiz_z(devinfo
, inst
,
335 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
336 brw_inst_set_src1_da16_swiz_w(devinfo
, inst
,
337 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
339 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
340 /* This is an oddity of the fact we're using the same
341 * descriptions for registers in align_16 as align_1:
343 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
344 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
345 reg
.type
== BRW_REGISTER_TYPE_DF
&&
346 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
349 * "For Align16 access mode, only encodings of 0000 and 0011
350 * are allowed. Other codes are reserved."
352 * Presumably the DevSNB behavior applies to IVB as well.
354 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
356 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
363 * Specify the descriptor and extended descriptor immediate for a SEND(C)
364 * message instruction.
367 brw_set_desc_ex(struct brw_codegen
*p
, brw_inst
*inst
,
368 unsigned desc
, unsigned ex_desc
)
370 const struct gen_device_info
*devinfo
= p
->devinfo
;
371 assert(brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
372 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
);
373 brw_inst_set_src1_file_type(devinfo
, inst
,
374 BRW_IMMEDIATE_VALUE
, BRW_REGISTER_TYPE_UD
);
375 brw_inst_set_send_desc(devinfo
, inst
, desc
);
376 if (devinfo
->gen
>= 9)
377 brw_inst_set_send_ex_desc(devinfo
, inst
, ex_desc
);
380 static void brw_set_math_message( struct brw_codegen
*p
,
383 unsigned integer_type
,
387 const struct gen_device_info
*devinfo
= p
->devinfo
;
389 unsigned response_length
;
391 /* Infer message length from the function */
393 case BRW_MATH_FUNCTION_POW
:
394 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
:
395 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER
:
396 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
404 /* Infer response length from the function */
406 case BRW_MATH_FUNCTION_SINCOS
:
407 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
415 brw_set_desc(p
, inst
, brw_message_desc(
416 devinfo
, msg_length
, response_length
, false));
418 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MATH
);
419 brw_inst_set_math_msg_function(devinfo
, inst
, function
);
420 brw_inst_set_math_msg_signed_int(devinfo
, inst
, integer_type
);
421 brw_inst_set_math_msg_precision(devinfo
, inst
, low_precision
);
422 brw_inst_set_math_msg_saturate(devinfo
, inst
, brw_inst_saturate(devinfo
, inst
));
423 brw_inst_set_math_msg_data_type(devinfo
, inst
, dataType
);
424 brw_inst_set_saturate(devinfo
, inst
, 0);
428 static void brw_set_ff_sync_message(struct brw_codegen
*p
,
431 unsigned response_length
,
434 const struct gen_device_info
*devinfo
= p
->devinfo
;
436 brw_set_desc(p
, insn
, brw_message_desc(
437 devinfo
, 1, response_length
, true));
439 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
440 brw_inst_set_eot(devinfo
, insn
, end_of_thread
);
441 brw_inst_set_urb_opcode(devinfo
, insn
, 1); /* FF_SYNC */
442 brw_inst_set_urb_allocate(devinfo
, insn
, allocate
);
443 /* The following fields are not used by FF_SYNC: */
444 brw_inst_set_urb_global_offset(devinfo
, insn
, 0);
445 brw_inst_set_urb_swizzle_control(devinfo
, insn
, 0);
446 brw_inst_set_urb_used(devinfo
, insn
, 0);
447 brw_inst_set_urb_complete(devinfo
, insn
, 0);
450 static void brw_set_urb_message( struct brw_codegen
*p
,
452 enum brw_urb_write_flags flags
,
454 unsigned response_length
,
456 unsigned swizzle_control
)
458 const struct gen_device_info
*devinfo
= p
->devinfo
;
460 assert(devinfo
->gen
< 7 || swizzle_control
!= BRW_URB_SWIZZLE_TRANSPOSE
);
461 assert(devinfo
->gen
< 7 || !(flags
& BRW_URB_WRITE_ALLOCATE
));
462 assert(devinfo
->gen
>= 7 || !(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
464 brw_set_desc(p
, insn
, brw_message_desc(
465 devinfo
, msg_length
, response_length
, true));
467 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
468 brw_inst_set_eot(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_EOT
));
470 if (flags
& BRW_URB_WRITE_OWORD
) {
471 assert(msg_length
== 2); /* header + one OWORD of data */
472 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_OWORD
);
474 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_HWORD
);
477 brw_inst_set_urb_global_offset(devinfo
, insn
, offset
);
478 brw_inst_set_urb_swizzle_control(devinfo
, insn
, swizzle_control
);
480 if (devinfo
->gen
< 8) {
481 brw_inst_set_urb_complete(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_COMPLETE
));
484 if (devinfo
->gen
< 7) {
485 brw_inst_set_urb_allocate(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_ALLOCATE
));
486 brw_inst_set_urb_used(devinfo
, insn
, !(flags
& BRW_URB_WRITE_UNUSED
));
488 brw_inst_set_urb_per_slot_offset(devinfo
, insn
,
489 !!(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
494 gen7_set_dp_scratch_message(struct brw_codegen
*p
,
498 bool invalidate_after_read
,
500 unsigned addr_offset
,
505 const struct gen_device_info
*devinfo
= p
->devinfo
;
506 assert(num_regs
== 1 || num_regs
== 2 || num_regs
== 4 ||
507 (devinfo
->gen
>= 8 && num_regs
== 8));
508 const unsigned block_size
= (devinfo
->gen
>= 8 ? _mesa_logbase2(num_regs
) :
511 brw_set_desc(p
, inst
, brw_message_desc(
512 devinfo
, mlen
, rlen
, header_present
));
514 brw_inst_set_sfid(devinfo
, inst
, GEN7_SFID_DATAPORT_DATA_CACHE
);
515 brw_inst_set_dp_category(devinfo
, inst
, 1); /* Scratch Block Read/Write msgs */
516 brw_inst_set_scratch_read_write(devinfo
, inst
, write
);
517 brw_inst_set_scratch_type(devinfo
, inst
, dword
);
518 brw_inst_set_scratch_invalidate_after_read(devinfo
, inst
, invalidate_after_read
);
519 brw_inst_set_scratch_block_size(devinfo
, inst
, block_size
);
520 brw_inst_set_scratch_addr_offset(devinfo
, inst
, addr_offset
);
524 brw_inst_set_state(const struct gen_device_info
*devinfo
,
526 const struct brw_insn_state
*state
)
528 brw_inst_set_exec_size(devinfo
, insn
, state
->exec_size
);
529 brw_inst_set_group(devinfo
, insn
, state
->group
);
530 brw_inst_set_compression(devinfo
, insn
, state
->compressed
);
531 brw_inst_set_access_mode(devinfo
, insn
, state
->access_mode
);
532 brw_inst_set_mask_control(devinfo
, insn
, state
->mask_control
);
533 brw_inst_set_saturate(devinfo
, insn
, state
->saturate
);
534 brw_inst_set_pred_control(devinfo
, insn
, state
->predicate
);
535 brw_inst_set_pred_inv(devinfo
, insn
, state
->pred_inv
);
537 if (is_3src(devinfo
, brw_inst_opcode(devinfo
, insn
)) &&
538 state
->access_mode
== BRW_ALIGN_16
) {
539 brw_inst_set_3src_a16_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
540 if (devinfo
->gen
>= 7)
541 brw_inst_set_3src_a16_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
543 brw_inst_set_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
544 if (devinfo
->gen
>= 7)
545 brw_inst_set_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
548 if (devinfo
->gen
>= 6)
549 brw_inst_set_acc_wr_control(devinfo
, insn
, state
->acc_wr_control
);
552 #define next_insn brw_next_insn
554 brw_next_insn(struct brw_codegen
*p
, unsigned opcode
)
556 const struct gen_device_info
*devinfo
= p
->devinfo
;
559 if (p
->nr_insn
+ 1 > p
->store_size
) {
561 p
->store
= reralloc(p
->mem_ctx
, p
->store
, brw_inst
, p
->store_size
);
564 p
->next_insn_offset
+= 16;
565 insn
= &p
->store
[p
->nr_insn
++];
567 memset(insn
, 0, sizeof(*insn
));
568 brw_inst_set_opcode(devinfo
, insn
, opcode
);
570 /* Apply the default instruction state */
571 brw_inst_set_state(devinfo
, insn
, p
->current
);
577 brw_alu1(struct brw_codegen
*p
, unsigned opcode
,
578 struct brw_reg dest
, struct brw_reg src
)
580 brw_inst
*insn
= next_insn(p
, opcode
);
581 brw_set_dest(p
, insn
, dest
);
582 brw_set_src0(p
, insn
, src
);
587 brw_alu2(struct brw_codegen
*p
, unsigned opcode
,
588 struct brw_reg dest
, struct brw_reg src0
, struct brw_reg src1
)
590 /* 64-bit immediates are only supported on 1-src instructions */
591 assert(src0
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src0
.type
) <= 4);
592 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src1
.type
) <= 4);
594 brw_inst
*insn
= next_insn(p
, opcode
);
595 brw_set_dest(p
, insn
, dest
);
596 brw_set_src0(p
, insn
, src0
);
597 brw_set_src1(p
, insn
, src1
);
602 get_3src_subreg_nr(struct brw_reg reg
)
604 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
605 * use 32-bit units (components 0..7). Since they only support F/D/UD
606 * types, this doesn't lose any flexibility, but uses fewer bits.
608 return reg
.subnr
/ 4;
611 static enum gen10_align1_3src_vertical_stride
612 to_3src_align1_vstride(enum brw_vertical_stride vstride
)
615 case BRW_VERTICAL_STRIDE_0
:
616 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0
;
617 case BRW_VERTICAL_STRIDE_2
:
618 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2
;
619 case BRW_VERTICAL_STRIDE_4
:
620 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4
;
621 case BRW_VERTICAL_STRIDE_8
:
622 case BRW_VERTICAL_STRIDE_16
:
623 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8
;
625 unreachable("invalid vstride");
630 static enum gen10_align1_3src_src_horizontal_stride
631 to_3src_align1_hstride(enum brw_horizontal_stride hstride
)
634 case BRW_HORIZONTAL_STRIDE_0
:
635 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0
;
636 case BRW_HORIZONTAL_STRIDE_1
:
637 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1
;
638 case BRW_HORIZONTAL_STRIDE_2
:
639 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2
;
640 case BRW_HORIZONTAL_STRIDE_4
:
641 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4
;
643 unreachable("invalid hstride");
648 brw_alu3(struct brw_codegen
*p
, unsigned opcode
, struct brw_reg dest
,
649 struct brw_reg src0
, struct brw_reg src1
, struct brw_reg src2
)
651 const struct gen_device_info
*devinfo
= p
->devinfo
;
652 brw_inst
*inst
= next_insn(p
, opcode
);
654 gen7_convert_mrf_to_grf(p
, &dest
);
656 assert(dest
.nr
< 128);
657 assert(src0
.nr
< 128);
658 assert(src1
.nr
< 128);
659 assert(src2
.nr
< 128);
660 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
661 assert(src0
.address_mode
== BRW_ADDRESS_DIRECT
);
662 assert(src1
.address_mode
== BRW_ADDRESS_DIRECT
);
663 assert(src2
.address_mode
== BRW_ADDRESS_DIRECT
);
665 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
666 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
667 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
669 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
670 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
671 BRW_ALIGN1_3SRC_ACCUMULATOR
);
672 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
674 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
675 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
);
676 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
678 brw_inst_set_3src_a1_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 8);
680 brw_inst_set_3src_a1_dst_hstride(devinfo
, inst
, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1
);
682 if (brw_reg_type_is_floating_point(dest
.type
)) {
683 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
684 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT
);
686 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
687 BRW_ALIGN1_3SRC_EXEC_TYPE_INT
);
690 brw_inst_set_3src_a1_dst_type(devinfo
, inst
, dest
.type
);
691 brw_inst_set_3src_a1_src0_type(devinfo
, inst
, src0
.type
);
692 brw_inst_set_3src_a1_src1_type(devinfo
, inst
, src1
.type
);
693 brw_inst_set_3src_a1_src2_type(devinfo
, inst
, src2
.type
);
695 brw_inst_set_3src_a1_src0_vstride(devinfo
, inst
,
696 to_3src_align1_vstride(src0
.vstride
));
697 brw_inst_set_3src_a1_src1_vstride(devinfo
, inst
,
698 to_3src_align1_vstride(src1
.vstride
));
699 /* no vstride on src2 */
701 brw_inst_set_3src_a1_src0_hstride(devinfo
, inst
,
702 to_3src_align1_hstride(src0
.hstride
));
703 brw_inst_set_3src_a1_src1_hstride(devinfo
, inst
,
704 to_3src_align1_hstride(src1
.hstride
));
705 brw_inst_set_3src_a1_src2_hstride(devinfo
, inst
,
706 to_3src_align1_hstride(src2
.hstride
));
708 brw_inst_set_3src_a1_src0_subreg_nr(devinfo
, inst
, src0
.subnr
);
709 if (src0
.type
== BRW_REGISTER_TYPE_NF
) {
710 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
712 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
714 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
715 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
717 brw_inst_set_3src_a1_src1_subreg_nr(devinfo
, inst
, src1
.subnr
);
718 if (src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
719 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
721 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
723 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
724 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
726 brw_inst_set_3src_a1_src2_subreg_nr(devinfo
, inst
, src2
.subnr
);
727 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
728 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
729 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
731 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
||
732 src0
.file
== BRW_IMMEDIATE_VALUE
||
733 (src0
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
734 src0
.type
== BRW_REGISTER_TYPE_NF
));
735 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
736 src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
737 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
||
738 src2
.file
== BRW_IMMEDIATE_VALUE
);
740 brw_inst_set_3src_a1_src0_reg_file(devinfo
, inst
,
741 src0
.file
== BRW_GENERAL_REGISTER_FILE
?
742 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
743 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
744 brw_inst_set_3src_a1_src1_reg_file(devinfo
, inst
,
745 src1
.file
== BRW_GENERAL_REGISTER_FILE
?
746 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
747 BRW_ALIGN1_3SRC_ACCUMULATOR
);
748 brw_inst_set_3src_a1_src2_reg_file(devinfo
, inst
,
749 src2
.file
== BRW_GENERAL_REGISTER_FILE
?
750 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
751 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
753 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
754 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
755 assert(dest
.type
== BRW_REGISTER_TYPE_F
||
756 dest
.type
== BRW_REGISTER_TYPE_DF
||
757 dest
.type
== BRW_REGISTER_TYPE_D
||
758 dest
.type
== BRW_REGISTER_TYPE_UD
);
759 if (devinfo
->gen
== 6) {
760 brw_inst_set_3src_a16_dst_reg_file(devinfo
, inst
,
761 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
763 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
764 brw_inst_set_3src_a16_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
765 brw_inst_set_3src_a16_dst_writemask(devinfo
, inst
, dest
.writemask
);
767 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
);
768 brw_inst_set_3src_a16_src0_swizzle(devinfo
, inst
, src0
.swizzle
);
769 brw_inst_set_3src_a16_src0_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src0
));
770 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
771 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
772 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
773 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo
, inst
,
774 src0
.vstride
== BRW_VERTICAL_STRIDE_0
);
776 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
);
777 brw_inst_set_3src_a16_src1_swizzle(devinfo
, inst
, src1
.swizzle
);
778 brw_inst_set_3src_a16_src1_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src1
));
779 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
780 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
781 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
782 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo
, inst
,
783 src1
.vstride
== BRW_VERTICAL_STRIDE_0
);
785 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
);
786 brw_inst_set_3src_a16_src2_swizzle(devinfo
, inst
, src2
.swizzle
);
787 brw_inst_set_3src_a16_src2_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src2
));
788 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
789 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
790 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
791 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo
, inst
,
792 src2
.vstride
== BRW_VERTICAL_STRIDE_0
);
794 if (devinfo
->gen
>= 7) {
795 /* Set both the source and destination types based on dest.type,
796 * ignoring the source register types. The MAD and LRP emitters ensure
797 * that all four types are float. The BFE and BFI2 emitters, however,
798 * may send us mixed D and UD types and want us to ignore that and use
799 * the destination type.
801 brw_inst_set_3src_a16_src_type(devinfo
, inst
, dest
.type
);
802 brw_inst_set_3src_a16_dst_type(devinfo
, inst
, dest
.type
);
810 /***********************************************************************
811 * Convenience routines.
814 brw_inst *brw_##OP(struct brw_codegen *p, \
815 struct brw_reg dest, \
816 struct brw_reg src0) \
818 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
822 brw_inst *brw_##OP(struct brw_codegen *p, \
823 struct brw_reg dest, \
824 struct brw_reg src0, \
825 struct brw_reg src1) \
827 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
831 brw_inst *brw_##OP(struct brw_codegen *p, \
832 struct brw_reg dest, \
833 struct brw_reg src0, \
834 struct brw_reg src1, \
835 struct brw_reg src2) \
837 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
841 brw_inst *brw_##OP(struct brw_codegen *p, \
842 struct brw_reg dest, \
843 struct brw_reg src0, \
844 struct brw_reg src1, \
845 struct brw_reg src2) \
847 assert(dest.type == BRW_REGISTER_TYPE_F || \
848 dest.type == BRW_REGISTER_TYPE_DF); \
849 if (dest.type == BRW_REGISTER_TYPE_F) { \
850 assert(src0.type == BRW_REGISTER_TYPE_F); \
851 assert(src1.type == BRW_REGISTER_TYPE_F); \
852 assert(src2.type == BRW_REGISTER_TYPE_F); \
853 } else if (dest.type == BRW_REGISTER_TYPE_DF) { \
854 assert(src0.type == BRW_REGISTER_TYPE_DF); \
855 assert(src1.type == BRW_REGISTER_TYPE_DF); \
856 assert(src2.type == BRW_REGISTER_TYPE_DF); \
858 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
861 /* Rounding operations (other than RNDD) require two instructions - the first
862 * stores a rounded value (possibly the wrong way) in the dest register, but
863 * also sets a per-channel "increment bit" in the flag register. A predicated
864 * add of 1.0 fixes dest to contain the desired result.
866 * Sandybridge and later appear to round correctly without an ADD.
869 void brw_##OP(struct brw_codegen *p, \
870 struct brw_reg dest, \
871 struct brw_reg src) \
873 const struct gen_device_info *devinfo = p->devinfo; \
874 brw_inst *rnd, *add; \
875 rnd = next_insn(p, BRW_OPCODE_##OP); \
876 brw_set_dest(p, rnd, dest); \
877 brw_set_src0(p, rnd, src); \
879 if (devinfo->gen < 6) { \
880 /* turn on round-increments */ \
881 brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
882 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
883 brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
923 brw_MOV(struct brw_codegen
*p
, struct brw_reg dest
, struct brw_reg src0
)
925 const struct gen_device_info
*devinfo
= p
->devinfo
;
927 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
928 * To avoid the problems that causes, we use a <1,2,0> source region to read
929 * each element twice.
931 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
932 brw_get_default_access_mode(p
) == BRW_ALIGN_1
&&
933 dest
.type
== BRW_REGISTER_TYPE_DF
&&
934 (src0
.type
== BRW_REGISTER_TYPE_F
||
935 src0
.type
== BRW_REGISTER_TYPE_D
||
936 src0
.type
== BRW_REGISTER_TYPE_UD
) &&
937 !has_scalar_region(src0
)) {
938 assert(src0
.vstride
== BRW_VERTICAL_STRIDE_4
&&
939 src0
.width
== BRW_WIDTH_4
&&
940 src0
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
942 src0
.vstride
= BRW_VERTICAL_STRIDE_1
;
943 src0
.width
= BRW_WIDTH_2
;
944 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
947 return brw_alu1(p
, BRW_OPCODE_MOV
, dest
, src0
);
951 brw_ADD(struct brw_codegen
*p
, struct brw_reg dest
,
952 struct brw_reg src0
, struct brw_reg src1
)
955 if (src0
.type
== BRW_REGISTER_TYPE_F
||
956 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
957 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
958 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
959 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
962 if (src1
.type
== BRW_REGISTER_TYPE_F
||
963 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
964 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
965 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
966 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
969 return brw_alu2(p
, BRW_OPCODE_ADD
, dest
, src0
, src1
);
973 brw_AVG(struct brw_codegen
*p
, struct brw_reg dest
,
974 struct brw_reg src0
, struct brw_reg src1
)
976 assert(dest
.type
== src0
.type
);
977 assert(src0
.type
== src1
.type
);
979 case BRW_REGISTER_TYPE_B
:
980 case BRW_REGISTER_TYPE_UB
:
981 case BRW_REGISTER_TYPE_W
:
982 case BRW_REGISTER_TYPE_UW
:
983 case BRW_REGISTER_TYPE_D
:
984 case BRW_REGISTER_TYPE_UD
:
987 unreachable("Bad type for brw_AVG");
990 return brw_alu2(p
, BRW_OPCODE_AVG
, dest
, src0
, src1
);
994 brw_MUL(struct brw_codegen
*p
, struct brw_reg dest
,
995 struct brw_reg src0
, struct brw_reg src1
)
998 if (src0
.type
== BRW_REGISTER_TYPE_D
||
999 src0
.type
== BRW_REGISTER_TYPE_UD
||
1000 src1
.type
== BRW_REGISTER_TYPE_D
||
1001 src1
.type
== BRW_REGISTER_TYPE_UD
) {
1002 assert(dest
.type
!= BRW_REGISTER_TYPE_F
);
1005 if (src0
.type
== BRW_REGISTER_TYPE_F
||
1006 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
1007 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
1008 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
1009 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
1012 if (src1
.type
== BRW_REGISTER_TYPE_F
||
1013 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
1014 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
1015 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
1016 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
1019 assert(src0
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1020 src0
.nr
!= BRW_ARF_ACCUMULATOR
);
1021 assert(src1
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1022 src1
.nr
!= BRW_ARF_ACCUMULATOR
);
1024 return brw_alu2(p
, BRW_OPCODE_MUL
, dest
, src0
, src1
);
1028 brw_LINE(struct brw_codegen
*p
, struct brw_reg dest
,
1029 struct brw_reg src0
, struct brw_reg src1
)
1031 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1032 src0
.width
= BRW_WIDTH_1
;
1033 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1034 return brw_alu2(p
, BRW_OPCODE_LINE
, dest
, src0
, src1
);
1038 brw_PLN(struct brw_codegen
*p
, struct brw_reg dest
,
1039 struct brw_reg src0
, struct brw_reg src1
)
1041 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1042 src0
.width
= BRW_WIDTH_1
;
1043 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1044 src1
.vstride
= BRW_VERTICAL_STRIDE_8
;
1045 src1
.width
= BRW_WIDTH_8
;
1046 src1
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
1047 return brw_alu2(p
, BRW_OPCODE_PLN
, dest
, src0
, src1
);
1051 brw_F32TO16(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1053 const struct gen_device_info
*devinfo
= p
->devinfo
;
1054 const bool align16
= brw_get_default_access_mode(p
) == BRW_ALIGN_16
;
1055 /* The F32TO16 instruction doesn't support 32-bit destination types in
1056 * Align1 mode, and neither does the Gen8 implementation in terms of a
1057 * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
1058 * an undocumented feature.
1060 const bool needs_zero_fill
= (dst
.type
== BRW_REGISTER_TYPE_UD
&&
1061 (!align16
|| devinfo
->gen
>= 8));
1065 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
1067 assert(dst
.type
== BRW_REGISTER_TYPE_UD
||
1068 dst
.type
== BRW_REGISTER_TYPE_W
||
1069 dst
.type
== BRW_REGISTER_TYPE_UW
||
1070 dst
.type
== BRW_REGISTER_TYPE_HF
);
1073 brw_push_insn_state(p
);
1075 if (needs_zero_fill
) {
1076 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
1077 dst
= spread(retype(dst
, BRW_REGISTER_TYPE_W
), 2);
1080 if (devinfo
->gen
>= 8) {
1081 inst
= brw_MOV(p
, retype(dst
, BRW_REGISTER_TYPE_HF
), src
);
1083 assert(devinfo
->gen
== 7);
1084 inst
= brw_alu1(p
, BRW_OPCODE_F32TO16
, dst
, src
);
1087 if (needs_zero_fill
) {
1088 brw_inst_set_no_dd_clear(devinfo
, inst
, true);
1089 inst
= brw_MOV(p
, suboffset(dst
, 1), brw_imm_w(0));
1090 brw_inst_set_no_dd_check(devinfo
, inst
, true);
1093 brw_pop_insn_state(p
);
1098 brw_F16TO32(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1100 const struct gen_device_info
*devinfo
= p
->devinfo
;
1101 bool align16
= brw_get_default_access_mode(p
) == BRW_ALIGN_16
;
1104 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
1106 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1108 * Because this instruction does not have a 16-bit floating-point
1109 * type, the source data type must be Word (W). The destination type
1110 * must be F (Float).
1112 if (src
.type
== BRW_REGISTER_TYPE_UD
)
1113 src
= spread(retype(src
, BRW_REGISTER_TYPE_W
), 2);
1115 assert(src
.type
== BRW_REGISTER_TYPE_W
||
1116 src
.type
== BRW_REGISTER_TYPE_UW
||
1117 src
.type
== BRW_REGISTER_TYPE_HF
);
1120 if (devinfo
->gen
>= 8) {
1121 return brw_MOV(p
, dst
, retype(src
, BRW_REGISTER_TYPE_HF
));
1123 assert(devinfo
->gen
== 7);
1124 return brw_alu1(p
, BRW_OPCODE_F16TO32
, dst
, src
);
1129 void brw_NOP(struct brw_codegen
*p
)
1131 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_NOP
);
1132 memset(insn
, 0, sizeof(*insn
));
1133 brw_inst_set_opcode(p
->devinfo
, insn
, BRW_OPCODE_NOP
);
1140 /***********************************************************************
1141 * Comparisons, if/else/endif
1145 brw_JMPI(struct brw_codegen
*p
, struct brw_reg index
,
1146 unsigned predicate_control
)
1148 const struct gen_device_info
*devinfo
= p
->devinfo
;
1149 struct brw_reg ip
= brw_ip_reg();
1150 brw_inst
*inst
= brw_alu2(p
, BRW_OPCODE_JMPI
, ip
, ip
, index
);
1152 brw_inst_set_exec_size(devinfo
, inst
, BRW_EXECUTE_1
);
1153 brw_inst_set_qtr_control(devinfo
, inst
, BRW_COMPRESSION_NONE
);
1154 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
1155 brw_inst_set_pred_control(devinfo
, inst
, predicate_control
);
1161 push_if_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1163 p
->if_stack
[p
->if_stack_depth
] = inst
- p
->store
;
1165 p
->if_stack_depth
++;
1166 if (p
->if_stack_array_size
<= p
->if_stack_depth
) {
1167 p
->if_stack_array_size
*= 2;
1168 p
->if_stack
= reralloc(p
->mem_ctx
, p
->if_stack
, int,
1169 p
->if_stack_array_size
);
1174 pop_if_stack(struct brw_codegen
*p
)
1176 p
->if_stack_depth
--;
1177 return &p
->store
[p
->if_stack
[p
->if_stack_depth
]];
1181 push_loop_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1183 if (p
->loop_stack_array_size
<= (p
->loop_stack_depth
+ 1)) {
1184 p
->loop_stack_array_size
*= 2;
1185 p
->loop_stack
= reralloc(p
->mem_ctx
, p
->loop_stack
, int,
1186 p
->loop_stack_array_size
);
1187 p
->if_depth_in_loop
= reralloc(p
->mem_ctx
, p
->if_depth_in_loop
, int,
1188 p
->loop_stack_array_size
);
1191 p
->loop_stack
[p
->loop_stack_depth
] = inst
- p
->store
;
1192 p
->loop_stack_depth
++;
1193 p
->if_depth_in_loop
[p
->loop_stack_depth
] = 0;
1197 get_inner_do_insn(struct brw_codegen
*p
)
1199 return &p
->store
[p
->loop_stack
[p
->loop_stack_depth
- 1]];
1202 /* EU takes the value from the flag register and pushes it onto some
1203 * sort of a stack (presumably merging with any flag value already on
1204 * the stack). Within an if block, the flags at the top of the stack
1205 * control execution on each channel of the unit, eg. on each of the
1206 * 16 pixel values in our wm programs.
1208 * When the matching 'else' instruction is reached (presumably by
1209 * countdown of the instruction count patched in by our ELSE/ENDIF
1210 * functions), the relevant flags are inverted.
1212 * When the matching 'endif' instruction is reached, the flags are
1213 * popped off. If the stack is now empty, normal execution resumes.
1216 brw_IF(struct brw_codegen
*p
, unsigned execute_size
)
1218 const struct gen_device_info
*devinfo
= p
->devinfo
;
1221 insn
= next_insn(p
, BRW_OPCODE_IF
);
1223 /* Override the defaults for this instruction:
1225 if (devinfo
->gen
< 6) {
1226 brw_set_dest(p
, insn
, brw_ip_reg());
1227 brw_set_src0(p
, insn
, brw_ip_reg());
1228 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1229 } else if (devinfo
->gen
== 6) {
1230 brw_set_dest(p
, insn
, brw_imm_w(0));
1231 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1232 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1233 brw_set_src1(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1234 } else if (devinfo
->gen
== 7) {
1235 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1236 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1237 brw_set_src1(p
, insn
, brw_imm_w(0));
1238 brw_inst_set_jip(devinfo
, insn
, 0);
1239 brw_inst_set_uip(devinfo
, insn
, 0);
1241 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1242 brw_set_src0(p
, insn
, brw_imm_d(0));
1243 brw_inst_set_jip(devinfo
, insn
, 0);
1244 brw_inst_set_uip(devinfo
, insn
, 0);
1247 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1248 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1249 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NORMAL
);
1250 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1251 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1252 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1254 push_if_stack(p
, insn
);
1255 p
->if_depth_in_loop
[p
->loop_stack_depth
]++;
1259 /* This function is only used for gen6-style IF instructions with an
1260 * embedded comparison (conditional modifier). It is not used on gen7.
1263 gen6_IF(struct brw_codegen
*p
, enum brw_conditional_mod conditional
,
1264 struct brw_reg src0
, struct brw_reg src1
)
1266 const struct gen_device_info
*devinfo
= p
->devinfo
;
1269 insn
= next_insn(p
, BRW_OPCODE_IF
);
1271 brw_set_dest(p
, insn
, brw_imm_w(0));
1272 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1273 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1274 brw_set_src0(p
, insn
, src0
);
1275 brw_set_src1(p
, insn
, src1
);
1277 assert(brw_inst_qtr_control(devinfo
, insn
) == BRW_COMPRESSION_NONE
);
1278 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
1279 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
1281 push_if_stack(p
, insn
);
1286 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1289 convert_IF_ELSE_to_ADD(struct brw_codegen
*p
,
1290 brw_inst
*if_inst
, brw_inst
*else_inst
)
1292 const struct gen_device_info
*devinfo
= p
->devinfo
;
1294 /* The next instruction (where the ENDIF would be, if it existed) */
1295 brw_inst
*next_inst
= &p
->store
[p
->nr_insn
];
1297 assert(p
->single_program_flow
);
1298 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1299 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1300 assert(brw_inst_exec_size(devinfo
, if_inst
) == BRW_EXECUTE_1
);
1302 /* Convert IF to an ADD instruction that moves the instruction pointer
1303 * to the first instruction of the ELSE block. If there is no ELSE
1304 * block, point to where ENDIF would be. Reverse the predicate.
1306 * There's no need to execute an ENDIF since we don't need to do any
1307 * stack operations, and if we're currently executing, we just want to
1308 * continue normally.
1310 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_ADD
);
1311 brw_inst_set_pred_inv(devinfo
, if_inst
, true);
1313 if (else_inst
!= NULL
) {
1314 /* Convert ELSE to an ADD instruction that points where the ENDIF
1317 brw_inst_set_opcode(devinfo
, else_inst
, BRW_OPCODE_ADD
);
1319 brw_inst_set_imm_ud(devinfo
, if_inst
, (else_inst
- if_inst
+ 1) * 16);
1320 brw_inst_set_imm_ud(devinfo
, else_inst
, (next_inst
- else_inst
) * 16);
1322 brw_inst_set_imm_ud(devinfo
, if_inst
, (next_inst
- if_inst
) * 16);
1327 * Patch IF and ELSE instructions with appropriate jump targets.
1330 patch_IF_ELSE(struct brw_codegen
*p
,
1331 brw_inst
*if_inst
, brw_inst
*else_inst
, brw_inst
*endif_inst
)
1333 const struct gen_device_info
*devinfo
= p
->devinfo
;
1335 /* We shouldn't be patching IF and ELSE instructions in single program flow
1336 * mode when gen < 6, because in single program flow mode on those
1337 * platforms, we convert flow control instructions to conditional ADDs that
1338 * operate on IP (see brw_ENDIF).
1340 * However, on Gen6, writing to IP doesn't work in single program flow mode
1341 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1342 * not be updated by non-flow control instructions."). And on later
1343 * platforms, there is no significant benefit to converting control flow
1344 * instructions to conditional ADDs. So we do patch IF and ELSE
1345 * instructions in single program flow mode on those platforms.
1347 if (devinfo
->gen
< 6)
1348 assert(!p
->single_program_flow
);
1350 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1351 assert(endif_inst
!= NULL
);
1352 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1354 unsigned br
= brw_jump_scale(devinfo
);
1356 assert(brw_inst_opcode(devinfo
, endif_inst
) == BRW_OPCODE_ENDIF
);
1357 brw_inst_set_exec_size(devinfo
, endif_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1359 if (else_inst
== NULL
) {
1360 /* Patch IF -> ENDIF */
1361 if (devinfo
->gen
< 6) {
1362 /* Turn it into an IFF, which means no mask stack operations for
1363 * all-false and jumping past the ENDIF.
1365 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_IFF
);
1366 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1367 br
* (endif_inst
- if_inst
+ 1));
1368 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1369 } else if (devinfo
->gen
== 6) {
1370 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1371 brw_inst_set_gen6_jump_count(devinfo
, if_inst
, br
*(endif_inst
- if_inst
));
1373 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1374 brw_inst_set_jip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1377 brw_inst_set_exec_size(devinfo
, else_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1379 /* Patch IF -> ELSE */
1380 if (devinfo
->gen
< 6) {
1381 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1382 br
* (else_inst
- if_inst
));
1383 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1384 } else if (devinfo
->gen
== 6) {
1385 brw_inst_set_gen6_jump_count(devinfo
, if_inst
,
1386 br
* (else_inst
- if_inst
+ 1));
1389 /* Patch ELSE -> ENDIF */
1390 if (devinfo
->gen
< 6) {
1391 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1394 brw_inst_set_gen4_jump_count(devinfo
, else_inst
,
1395 br
* (endif_inst
- else_inst
+ 1));
1396 brw_inst_set_gen4_pop_count(devinfo
, else_inst
, 1);
1397 } else if (devinfo
->gen
== 6) {
1398 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1399 brw_inst_set_gen6_jump_count(devinfo
, else_inst
,
1400 br
* (endif_inst
- else_inst
));
1402 /* The IF instruction's JIP should point just past the ELSE */
1403 brw_inst_set_jip(devinfo
, if_inst
, br
* (else_inst
- if_inst
+ 1));
1404 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1405 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1406 brw_inst_set_jip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1407 if (devinfo
->gen
>= 8) {
1408 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1409 * should point to ENDIF.
1411 brw_inst_set_uip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1418 brw_ELSE(struct brw_codegen
*p
)
1420 const struct gen_device_info
*devinfo
= p
->devinfo
;
1423 insn
= next_insn(p
, BRW_OPCODE_ELSE
);
1425 if (devinfo
->gen
< 6) {
1426 brw_set_dest(p
, insn
, brw_ip_reg());
1427 brw_set_src0(p
, insn
, brw_ip_reg());
1428 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1429 } else if (devinfo
->gen
== 6) {
1430 brw_set_dest(p
, insn
, brw_imm_w(0));
1431 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1432 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1433 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1434 } else if (devinfo
->gen
== 7) {
1435 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1436 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1437 brw_set_src1(p
, insn
, brw_imm_w(0));
1438 brw_inst_set_jip(devinfo
, insn
, 0);
1439 brw_inst_set_uip(devinfo
, insn
, 0);
1441 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1442 brw_set_src0(p
, insn
, brw_imm_d(0));
1443 brw_inst_set_jip(devinfo
, insn
, 0);
1444 brw_inst_set_uip(devinfo
, insn
, 0);
1447 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1448 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1449 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1450 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1452 push_if_stack(p
, insn
);
1456 brw_ENDIF(struct brw_codegen
*p
)
1458 const struct gen_device_info
*devinfo
= p
->devinfo
;
1459 brw_inst
*insn
= NULL
;
1460 brw_inst
*else_inst
= NULL
;
1461 brw_inst
*if_inst
= NULL
;
1463 bool emit_endif
= true;
1465 /* In single program flow mode, we can express IF and ELSE instructions
1466 * equivalently as ADD instructions that operate on IP. On platforms prior
1467 * to Gen6, flow control instructions cause an implied thread switch, so
1468 * this is a significant savings.
1470 * However, on Gen6, writing to IP doesn't work in single program flow mode
1471 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1472 * not be updated by non-flow control instructions."). And on later
1473 * platforms, there is no significant benefit to converting control flow
1474 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1477 if (devinfo
->gen
< 6 && p
->single_program_flow
)
1481 * A single next_insn() may change the base address of instruction store
1482 * memory(p->store), so call it first before referencing the instruction
1483 * store pointer from an index
1486 insn
= next_insn(p
, BRW_OPCODE_ENDIF
);
1488 /* Pop the IF and (optional) ELSE instructions from the stack */
1489 p
->if_depth_in_loop
[p
->loop_stack_depth
]--;
1490 tmp
= pop_if_stack(p
);
1491 if (brw_inst_opcode(devinfo
, tmp
) == BRW_OPCODE_ELSE
) {
1493 tmp
= pop_if_stack(p
);
1498 /* ENDIF is useless; don't bother emitting it. */
1499 convert_IF_ELSE_to_ADD(p
, if_inst
, else_inst
);
1503 if (devinfo
->gen
< 6) {
1504 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1505 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1506 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1507 } else if (devinfo
->gen
== 6) {
1508 brw_set_dest(p
, insn
, brw_imm_w(0));
1509 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1510 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1511 } else if (devinfo
->gen
== 7) {
1512 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1513 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1514 brw_set_src1(p
, insn
, brw_imm_w(0));
1516 brw_set_src0(p
, insn
, brw_imm_d(0));
1519 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1520 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1521 if (devinfo
->gen
< 6)
1522 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1524 /* Also pop item off the stack in the endif instruction: */
1525 if (devinfo
->gen
< 6) {
1526 brw_inst_set_gen4_jump_count(devinfo
, insn
, 0);
1527 brw_inst_set_gen4_pop_count(devinfo
, insn
, 1);
1528 } else if (devinfo
->gen
== 6) {
1529 brw_inst_set_gen6_jump_count(devinfo
, insn
, 2);
1531 brw_inst_set_jip(devinfo
, insn
, 2);
1533 patch_IF_ELSE(p
, if_inst
, else_inst
, insn
);
1537 brw_BREAK(struct brw_codegen
*p
)
1539 const struct gen_device_info
*devinfo
= p
->devinfo
;
1542 insn
= next_insn(p
, BRW_OPCODE_BREAK
);
1543 if (devinfo
->gen
>= 8) {
1544 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1545 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1546 } else if (devinfo
->gen
>= 6) {
1547 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1548 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1549 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1551 brw_set_dest(p
, insn
, brw_ip_reg());
1552 brw_set_src0(p
, insn
, brw_ip_reg());
1553 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1554 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1555 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1557 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1558 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1564 brw_CONT(struct brw_codegen
*p
)
1566 const struct gen_device_info
*devinfo
= p
->devinfo
;
1569 insn
= next_insn(p
, BRW_OPCODE_CONTINUE
);
1570 brw_set_dest(p
, insn
, brw_ip_reg());
1571 if (devinfo
->gen
>= 8) {
1572 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1574 brw_set_src0(p
, insn
, brw_ip_reg());
1575 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1578 if (devinfo
->gen
< 6) {
1579 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1580 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1582 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1583 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1588 gen6_HALT(struct brw_codegen
*p
)
1590 const struct gen_device_info
*devinfo
= p
->devinfo
;
1593 insn
= next_insn(p
, BRW_OPCODE_HALT
);
1594 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1595 if (devinfo
->gen
>= 8) {
1596 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1598 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1599 brw_set_src1(p
, insn
, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1602 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1603 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1609 * The DO/WHILE is just an unterminated loop -- break or continue are
1610 * used for control within the loop. We have a few ways they can be
1613 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1614 * jip and no DO instruction.
1616 * For non-uniform control flow pre-gen6, there's a DO instruction to
1617 * push the mask, and a WHILE to jump back, and BREAK to get out and
1620 * For gen6, there's no more mask stack, so no need for DO. WHILE
1621 * just points back to the first instruction of the loop.
1624 brw_DO(struct brw_codegen
*p
, unsigned execute_size
)
1626 const struct gen_device_info
*devinfo
= p
->devinfo
;
1628 if (devinfo
->gen
>= 6 || p
->single_program_flow
) {
1629 push_loop_stack(p
, &p
->store
[p
->nr_insn
]);
1630 return &p
->store
[p
->nr_insn
];
1632 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_DO
);
1634 push_loop_stack(p
, insn
);
1636 /* Override the defaults for this instruction:
1638 brw_set_dest(p
, insn
, brw_null_reg());
1639 brw_set_src0(p
, insn
, brw_null_reg());
1640 brw_set_src1(p
, insn
, brw_null_reg());
1642 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1643 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1644 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
);
1651 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1654 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1655 * nesting, since it can always just point to the end of the block/current loop.
1658 brw_patch_break_cont(struct brw_codegen
*p
, brw_inst
*while_inst
)
1660 const struct gen_device_info
*devinfo
= p
->devinfo
;
1661 brw_inst
*do_inst
= get_inner_do_insn(p
);
1663 unsigned br
= brw_jump_scale(devinfo
);
1665 assert(devinfo
->gen
< 6);
1667 for (inst
= while_inst
- 1; inst
!= do_inst
; inst
--) {
1668 /* If the jump count is != 0, that means that this instruction has already
1669 * been patched because it's part of a loop inside of the one we're
1672 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_BREAK
&&
1673 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1674 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
*((while_inst
- inst
) + 1));
1675 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_CONTINUE
&&
1676 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1677 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
* (while_inst
- inst
));
1683 brw_WHILE(struct brw_codegen
*p
)
1685 const struct gen_device_info
*devinfo
= p
->devinfo
;
1686 brw_inst
*insn
, *do_insn
;
1687 unsigned br
= brw_jump_scale(devinfo
);
1689 if (devinfo
->gen
>= 6) {
1690 insn
= next_insn(p
, BRW_OPCODE_WHILE
);
1691 do_insn
= get_inner_do_insn(p
);
1693 if (devinfo
->gen
>= 8) {
1694 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1695 brw_set_src0(p
, insn
, brw_imm_d(0));
1696 brw_inst_set_jip(devinfo
, insn
, br
* (do_insn
- insn
));
1697 } else if (devinfo
->gen
== 7) {
1698 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1699 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1700 brw_set_src1(p
, insn
, brw_imm_w(0));
1701 brw_inst_set_jip(devinfo
, insn
, br
* (do_insn
- insn
));
1703 brw_set_dest(p
, insn
, brw_imm_w(0));
1704 brw_inst_set_gen6_jump_count(devinfo
, insn
, br
* (do_insn
- insn
));
1705 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1706 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1709 brw_inst_set_exec_size(devinfo
, insn
, brw_get_default_exec_size(p
));
1712 if (p
->single_program_flow
) {
1713 insn
= next_insn(p
, BRW_OPCODE_ADD
);
1714 do_insn
= get_inner_do_insn(p
);
1716 brw_set_dest(p
, insn
, brw_ip_reg());
1717 brw_set_src0(p
, insn
, brw_ip_reg());
1718 brw_set_src1(p
, insn
, brw_imm_d((do_insn
- insn
) * 16));
1719 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
1721 insn
= next_insn(p
, BRW_OPCODE_WHILE
);
1722 do_insn
= get_inner_do_insn(p
);
1724 assert(brw_inst_opcode(devinfo
, do_insn
) == BRW_OPCODE_DO
);
1726 brw_set_dest(p
, insn
, brw_ip_reg());
1727 brw_set_src0(p
, insn
, brw_ip_reg());
1728 brw_set_src1(p
, insn
, brw_imm_d(0));
1730 brw_inst_set_exec_size(devinfo
, insn
, brw_inst_exec_size(devinfo
, do_insn
));
1731 brw_inst_set_gen4_jump_count(devinfo
, insn
, br
* (do_insn
- insn
+ 1));
1732 brw_inst_set_gen4_pop_count(devinfo
, insn
, 0);
1734 brw_patch_break_cont(p
, insn
);
1737 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1739 p
->loop_stack_depth
--;
1746 void brw_land_fwd_jump(struct brw_codegen
*p
, int jmp_insn_idx
)
1748 const struct gen_device_info
*devinfo
= p
->devinfo
;
1749 brw_inst
*jmp_insn
= &p
->store
[jmp_insn_idx
];
1752 if (devinfo
->gen
>= 5)
1755 assert(brw_inst_opcode(devinfo
, jmp_insn
) == BRW_OPCODE_JMPI
);
1756 assert(brw_inst_src1_reg_file(devinfo
, jmp_insn
) == BRW_IMMEDIATE_VALUE
);
1758 brw_inst_set_gen4_jump_count(devinfo
, jmp_insn
,
1759 jmpi
* (p
->nr_insn
- jmp_insn_idx
- 1));
1762 /* To integrate with the above, it makes sense that the comparison
1763 * instruction should populate the flag register. It might be simpler
1764 * just to use the flag reg for most WM tasks?
1766 void brw_CMP(struct brw_codegen
*p
,
1767 struct brw_reg dest
,
1768 unsigned conditional
,
1769 struct brw_reg src0
,
1770 struct brw_reg src1
)
1772 const struct gen_device_info
*devinfo
= p
->devinfo
;
1773 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_CMP
);
1775 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
1776 brw_set_dest(p
, insn
, dest
);
1777 brw_set_src0(p
, insn
, src0
);
1778 brw_set_src1(p
, insn
, src1
);
1780 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1782 * "Any CMP instruction with a null destination must use a {switch}."
1784 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1785 * mentioned on their work-arounds pages.
1787 if (devinfo
->gen
== 7) {
1788 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
1789 dest
.nr
== BRW_ARF_NULL
) {
1790 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1795 /***********************************************************************
1796 * Helpers for the various SEND message types:
1799 /** Extended math function, float[8].
1801 void gen4_math(struct brw_codegen
*p
,
1802 struct brw_reg dest
,
1804 unsigned msg_reg_nr
,
1806 unsigned precision
)
1808 const struct gen_device_info
*devinfo
= p
->devinfo
;
1809 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
1811 if (has_scalar_region(src
)) {
1812 data_type
= BRW_MATH_DATA_SCALAR
;
1814 data_type
= BRW_MATH_DATA_VECTOR
;
1817 assert(devinfo
->gen
< 6);
1819 /* Example code doesn't set predicate_control for send
1822 brw_inst_set_pred_control(devinfo
, insn
, 0);
1823 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
1825 brw_set_dest(p
, insn
, dest
);
1826 brw_set_src0(p
, insn
, src
);
1827 brw_set_math_message(p
,
1830 src
.type
== BRW_REGISTER_TYPE_D
,
1835 void gen6_math(struct brw_codegen
*p
,
1836 struct brw_reg dest
,
1838 struct brw_reg src0
,
1839 struct brw_reg src1
)
1841 const struct gen_device_info
*devinfo
= p
->devinfo
;
1842 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_MATH
);
1844 assert(devinfo
->gen
>= 6);
1846 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
1847 (devinfo
->gen
>= 7 && dest
.file
== BRW_MESSAGE_REGISTER_FILE
));
1849 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1850 if (devinfo
->gen
== 6) {
1851 assert(src0
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1852 assert(src1
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1855 if (function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
||
1856 function
== BRW_MATH_FUNCTION_INT_DIV_REMAINDER
||
1857 function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
) {
1858 assert(src0
.type
!= BRW_REGISTER_TYPE_F
);
1859 assert(src1
.type
!= BRW_REGISTER_TYPE_F
);
1860 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
1861 (devinfo
->gen
>= 8 && src1
.file
== BRW_IMMEDIATE_VALUE
));
1863 assert(src0
.type
== BRW_REGISTER_TYPE_F
);
1864 assert(src1
.type
== BRW_REGISTER_TYPE_F
);
1867 /* Source modifiers are ignored for extended math instructions on Gen6. */
1868 if (devinfo
->gen
== 6) {
1869 assert(!src0
.negate
);
1871 assert(!src1
.negate
);
1875 brw_inst_set_math_function(devinfo
, insn
, function
);
1877 brw_set_dest(p
, insn
, dest
);
1878 brw_set_src0(p
, insn
, src0
);
1879 brw_set_src1(p
, insn
, src1
);
1883 * Return the right surface index to access the thread scratch space using
1884 * stateless dataport messages.
1887 brw_scratch_surface_idx(const struct brw_codegen
*p
)
1889 /* The scratch space is thread-local so IA coherency is unnecessary. */
1890 if (p
->devinfo
->gen
>= 8)
1891 return GEN8_BTI_STATELESS_NON_COHERENT
;
1893 return BRW_BTI_STATELESS
;
1897 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1898 * using a constant offset per channel.
1900 * The offset must be aligned to oword size (16 bytes). Used for
1901 * register spilling.
1903 void brw_oword_block_write_scratch(struct brw_codegen
*p
,
1908 const struct gen_device_info
*devinfo
= p
->devinfo
;
1909 const unsigned target_cache
=
1910 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
1911 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
1912 BRW_SFID_DATAPORT_WRITE
);
1915 if (devinfo
->gen
>= 6)
1918 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
1920 const unsigned mlen
= 1 + num_regs
;
1922 /* Set up the message header. This is g0, with g0.2 filled with
1923 * the offset. We don't want to leave our offset around in g0 or
1924 * it'll screw up texture samples, so set it up inside the message
1928 brw_push_insn_state(p
);
1929 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
1930 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
1931 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
1933 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
1935 /* set message header global offset field (reg 0, element 2) */
1936 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
1938 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
1940 2), BRW_REGISTER_TYPE_UD
),
1941 brw_imm_ud(offset
));
1943 brw_pop_insn_state(p
);
1947 struct brw_reg dest
;
1948 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
1949 int send_commit_msg
;
1950 struct brw_reg src_header
= retype(brw_vec8_grf(0, 0),
1951 BRW_REGISTER_TYPE_UW
);
1953 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
1954 brw_inst_set_compression(devinfo
, insn
, false);
1956 if (brw_inst_exec_size(devinfo
, insn
) >= 16)
1957 src_header
= vec16(src_header
);
1959 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
1960 if (devinfo
->gen
< 6)
1961 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
1963 /* Until gen6, writes followed by reads from the same location
1964 * are not guaranteed to be ordered unless write_commit is set.
1965 * If set, then a no-op write is issued to the destination
1966 * register to set a dependency, and a read from the destination
1967 * can be used to ensure the ordering.
1969 * For gen6, only writes between different threads need ordering
1970 * protection. Our use of DP writes is all about register
1971 * spilling within a thread.
1973 if (devinfo
->gen
>= 6) {
1974 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
1975 send_commit_msg
= 0;
1978 send_commit_msg
= 1;
1981 brw_set_dest(p
, insn
, dest
);
1982 if (devinfo
->gen
>= 6) {
1983 brw_set_src0(p
, insn
, mrf
);
1985 brw_set_src0(p
, insn
, brw_null_reg());
1988 if (devinfo
->gen
>= 6)
1989 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
1991 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
1993 brw_set_desc(p
, insn
,
1994 brw_message_desc(devinfo
, mlen
, send_commit_msg
, true) |
1995 brw_dp_write_desc(devinfo
, brw_scratch_surface_idx(p
),
1996 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
1997 msg_type
, 0, /* not a render target */
2004 * Read a block of owords (half a GRF each) from the scratch buffer
2005 * using a constant index per channel.
2007 * Offset must be aligned to oword size (16 bytes). Used for register
2011 brw_oword_block_read_scratch(struct brw_codegen
*p
,
2012 struct brw_reg dest
,
2017 const struct gen_device_info
*devinfo
= p
->devinfo
;
2019 if (devinfo
->gen
>= 6)
2022 if (p
->devinfo
->gen
>= 7) {
2023 /* On gen 7 and above, we no longer have message registers and we can
2024 * send from any register we want. By using the destination register
2025 * for the message, we guarantee that the implied message write won't
2026 * accidentally overwrite anything. This has been a problem because
2027 * the MRF registers and source for the final FB write are both fixed
2030 mrf
= retype(dest
, BRW_REGISTER_TYPE_UD
);
2032 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2034 dest
= retype(dest
, BRW_REGISTER_TYPE_UW
);
2036 const unsigned rlen
= num_regs
;
2037 const unsigned target_cache
=
2038 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2039 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2040 BRW_SFID_DATAPORT_READ
);
2043 brw_push_insn_state(p
);
2044 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2045 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2046 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2048 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2050 /* set message header global offset field (reg 0, element 2) */
2051 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2052 brw_MOV(p
, get_element_ud(mrf
, 2), brw_imm_ud(offset
));
2054 brw_pop_insn_state(p
);
2058 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2060 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2061 assert(brw_inst_pred_control(devinfo
, insn
) == 0);
2062 brw_inst_set_compression(devinfo
, insn
, false);
2064 brw_set_dest(p
, insn
, dest
); /* UW? */
2065 if (devinfo
->gen
>= 6) {
2066 brw_set_src0(p
, insn
, mrf
);
2068 brw_set_src0(p
, insn
, brw_null_reg());
2069 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2072 brw_set_desc(p
, insn
,
2073 brw_message_desc(devinfo
, 1, rlen
, true) |
2074 brw_dp_read_desc(devinfo
, brw_scratch_surface_idx(p
),
2075 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2076 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2077 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2082 gen7_block_read_scratch(struct brw_codegen
*p
,
2083 struct brw_reg dest
,
2087 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2088 assert(brw_inst_pred_control(p
->devinfo
, insn
) == BRW_PREDICATE_NONE
);
2090 brw_set_dest(p
, insn
, retype(dest
, BRW_REGISTER_TYPE_UW
));
2092 /* The HW requires that the header is present; this is to get the g0.5
2095 brw_set_src0(p
, insn
, brw_vec8_grf(0, 0));
2097 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2098 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2099 * is 32 bytes, which happens to be the size of a register.
2102 assert(offset
< (1 << 12));
2104 gen7_set_dp_scratch_message(p
, insn
,
2105 false, /* scratch read */
2107 false, /* invalidate after read */
2110 1, /* mlen: just g0 */
2111 num_regs
, /* rlen */
2112 true); /* header present */
2116 * Read float[4] vectors from the data port constant cache.
2117 * Location (in buffer) should be a multiple of 16.
2118 * Used for fetching shader constants.
2120 void brw_oword_block_read(struct brw_codegen
*p
,
2121 struct brw_reg dest
,
2124 uint32_t bind_table_index
)
2126 const struct gen_device_info
*devinfo
= p
->devinfo
;
2127 const unsigned target_cache
=
2128 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE
:
2129 BRW_SFID_DATAPORT_READ
);
2130 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
2132 /* On newer hardware, offset is in units of owords. */
2133 if (devinfo
->gen
>= 6)
2136 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2138 brw_push_insn_state(p
);
2139 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2140 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2141 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2143 brw_push_insn_state(p
);
2144 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2145 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2147 /* set message header global offset field (reg 0, element 2) */
2148 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2150 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2152 2), BRW_REGISTER_TYPE_UD
),
2153 brw_imm_ud(offset
));
2154 brw_pop_insn_state(p
);
2156 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2158 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2160 /* cast dest to a uword[8] vector */
2161 dest
= retype(vec8(dest
), BRW_REGISTER_TYPE_UW
);
2163 brw_set_dest(p
, insn
, dest
);
2164 if (devinfo
->gen
>= 6) {
2165 brw_set_src0(p
, insn
, mrf
);
2167 brw_set_src0(p
, insn
, brw_null_reg());
2168 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2171 brw_set_desc(p
, insn
,
2172 brw_message_desc(devinfo
, 1, DIV_ROUND_UP(exec_size
, 8), true) |
2173 brw_dp_read_desc(devinfo
, bind_table_index
,
2174 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size
),
2175 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2176 BRW_DATAPORT_READ_TARGET_DATA_CACHE
));
2178 brw_pop_insn_state(p
);
2182 brw_fb_WRITE(struct brw_codegen
*p
,
2183 struct brw_reg payload
,
2184 struct brw_reg implied_header
,
2185 unsigned msg_control
,
2186 unsigned binding_table_index
,
2187 unsigned msg_length
,
2188 unsigned response_length
,
2190 bool last_render_target
,
2191 bool header_present
)
2193 const struct gen_device_info
*devinfo
= p
->devinfo
;
2194 const unsigned target_cache
=
2195 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2196 BRW_SFID_DATAPORT_WRITE
);
2199 struct brw_reg dest
, src0
;
2201 if (brw_get_default_exec_size(p
) >= BRW_EXECUTE_16
)
2202 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2204 dest
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2206 if (devinfo
->gen
>= 6) {
2207 insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2209 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2211 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2212 brw_inst_set_compression(devinfo
, insn
, false);
2214 if (devinfo
->gen
>= 6) {
2215 /* headerless version, just submit color payload */
2218 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2220 assert(payload
.file
== BRW_MESSAGE_REGISTER_FILE
);
2221 brw_inst_set_base_mrf(devinfo
, insn
, payload
.nr
);
2222 src0
= implied_header
;
2224 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2227 brw_set_dest(p
, insn
, dest
);
2228 brw_set_src0(p
, insn
, src0
);
2229 brw_set_desc(p
, insn
,
2230 brw_message_desc(devinfo
, msg_length
, response_length
,
2232 brw_dp_write_desc(devinfo
, binding_table_index
, msg_control
,
2233 msg_type
, last_render_target
,
2234 0 /* send_commit_msg */));
2235 brw_inst_set_eot(devinfo
, insn
, eot
);
2241 gen9_fb_READ(struct brw_codegen
*p
,
2243 struct brw_reg payload
,
2244 unsigned binding_table_index
,
2245 unsigned msg_length
,
2246 unsigned response_length
,
2249 const struct gen_device_info
*devinfo
= p
->devinfo
;
2250 assert(devinfo
->gen
>= 9);
2251 const unsigned msg_subtype
=
2252 brw_get_default_exec_size(p
) == BRW_EXECUTE_16
? 0 : 1;
2253 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2255 brw_inst_set_sfid(devinfo
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
);
2256 brw_set_dest(p
, insn
, dst
);
2257 brw_set_src0(p
, insn
, payload
);
2260 brw_message_desc(devinfo
, msg_length
, response_length
, true) |
2261 brw_dp_read_desc(devinfo
, binding_table_index
,
2262 per_sample
<< 5 | msg_subtype
,
2263 GEN9_DATAPORT_RC_RENDER_TARGET_READ
,
2264 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2265 brw_inst_set_rt_slot_group(devinfo
, insn
, brw_get_default_group(p
) / 16);
2271 * Texture sample instruction.
2272 * Note: the msg_type plus msg_length values determine exactly what kind
2273 * of sampling operation is performed. See volume 4, page 161 of docs.
2275 void brw_SAMPLE(struct brw_codegen
*p
,
2276 struct brw_reg dest
,
2277 unsigned msg_reg_nr
,
2278 struct brw_reg src0
,
2279 unsigned binding_table_index
,
2282 unsigned response_length
,
2283 unsigned msg_length
,
2284 unsigned header_present
,
2286 unsigned return_format
)
2288 const struct gen_device_info
*devinfo
= p
->devinfo
;
2291 if (msg_reg_nr
!= -1)
2292 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2294 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2295 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_SAMPLER
);
2296 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
); /* XXX */
2298 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2300 * "Instruction compression is not allowed for this instruction (that
2301 * is, send). The hardware behavior is undefined if this instruction is
2302 * set as compressed. However, compress control can be set to "SecHalf"
2303 * to affect the EMask generation."
2305 * No similar wording is found in later PRMs, but there are examples
2306 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2307 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2308 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2310 brw_inst_set_compression(devinfo
, insn
, false);
2312 if (devinfo
->gen
< 6)
2313 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2315 brw_set_dest(p
, insn
, dest
);
2316 brw_set_src0(p
, insn
, src0
);
2317 brw_set_desc(p
, insn
,
2318 brw_message_desc(devinfo
, msg_length
, response_length
,
2320 brw_sampler_desc(devinfo
, binding_table_index
, sampler
,
2321 msg_type
, simd_mode
, return_format
));
2324 /* Adjust the message header's sampler state pointer to
2325 * select the correct group of 16 samplers.
2327 void brw_adjust_sampler_state_pointer(struct brw_codegen
*p
,
2328 struct brw_reg header
,
2329 struct brw_reg sampler_index
)
2331 /* The "Sampler Index" field can only store values between 0 and 15.
2332 * However, we can add an offset to the "Sampler State Pointer"
2333 * field, effectively selecting a different set of 16 samplers.
2335 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2336 * offset, and each sampler state is only 16-bytes, so we can't
2337 * exclusively use the offset - we have to use both.
2340 const struct gen_device_info
*devinfo
= p
->devinfo
;
2342 if (sampler_index
.file
== BRW_IMMEDIATE_VALUE
) {
2343 const int sampler_state_size
= 16; /* 16 bytes */
2344 uint32_t sampler
= sampler_index
.ud
;
2346 if (sampler
>= 16) {
2347 assert(devinfo
->is_haswell
|| devinfo
->gen
>= 8);
2349 get_element_ud(header
, 3),
2350 get_element_ud(brw_vec8_grf(0, 0), 3),
2351 brw_imm_ud(16 * (sampler
/ 16) * sampler_state_size
));
2354 /* Non-const sampler array indexing case */
2355 if (devinfo
->gen
< 8 && !devinfo
->is_haswell
) {
2359 struct brw_reg temp
= get_element_ud(header
, 3);
2361 brw_AND(p
, temp
, get_element_ud(sampler_index
, 0), brw_imm_ud(0x0f0));
2362 brw_SHL(p
, temp
, temp
, brw_imm_ud(4));
2364 get_element_ud(header
, 3),
2365 get_element_ud(brw_vec8_grf(0, 0), 3),
2370 /* All these variables are pretty confusing - we might be better off
2371 * using bitmasks and macros for this, in the old style. Or perhaps
2372 * just having the caller instantiate the fields in dword3 itself.
2374 void brw_urb_WRITE(struct brw_codegen
*p
,
2375 struct brw_reg dest
,
2376 unsigned msg_reg_nr
,
2377 struct brw_reg src0
,
2378 enum brw_urb_write_flags flags
,
2379 unsigned msg_length
,
2380 unsigned response_length
,
2384 const struct gen_device_info
*devinfo
= p
->devinfo
;
2387 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2389 if (devinfo
->gen
>= 7 && !(flags
& BRW_URB_WRITE_USE_CHANNEL_MASKS
)) {
2390 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2391 brw_push_insn_state(p
);
2392 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2393 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2394 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2395 brw_OR(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, msg_reg_nr
, 5),
2396 BRW_REGISTER_TYPE_UD
),
2397 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD
),
2398 brw_imm_ud(0xff00));
2399 brw_pop_insn_state(p
);
2402 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2404 assert(msg_length
< BRW_MAX_MRF(devinfo
->gen
));
2406 brw_set_dest(p
, insn
, dest
);
2407 brw_set_src0(p
, insn
, src0
);
2408 brw_set_src1(p
, insn
, brw_imm_d(0));
2410 if (devinfo
->gen
< 6)
2411 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2413 brw_set_urb_message(p
,
2423 brw_send_indirect_message(struct brw_codegen
*p
,
2426 struct brw_reg payload
,
2427 struct brw_reg desc
,
2430 const struct gen_device_info
*devinfo
= p
->devinfo
;
2431 struct brw_inst
*send
;
2433 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2435 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2437 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2438 send
= next_insn(p
, BRW_OPCODE_SEND
);
2439 brw_set_desc(p
, send
, desc
.ud
| desc_imm
);
2442 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2444 brw_push_insn_state(p
);
2445 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2446 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2447 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2448 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2450 /* Load the indirect descriptor to an address register using OR so the
2451 * caller can specify additional descriptor bits with the desc_imm
2454 brw_OR(p
, addr
, desc
, brw_imm_ud(desc_imm
));
2456 brw_pop_insn_state(p
);
2458 send
= next_insn(p
, BRW_OPCODE_SEND
);
2459 brw_set_src1(p
, send
, addr
);
2462 if (dst
.width
< BRW_EXECUTE_8
)
2463 brw_inst_set_exec_size(devinfo
, send
, dst
.width
);
2465 brw_set_dest(p
, send
, dst
);
2466 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2467 brw_inst_set_sfid(devinfo
, send
, sfid
);
2471 brw_send_indirect_surface_message(struct brw_codegen
*p
,
2474 struct brw_reg payload
,
2475 struct brw_reg surface
,
2478 if (surface
.file
!= BRW_IMMEDIATE_VALUE
) {
2479 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2481 brw_push_insn_state(p
);
2482 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2483 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2484 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2485 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2487 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2488 * some surface array is accessed out of bounds.
2491 suboffset(vec1(retype(surface
, BRW_REGISTER_TYPE_UD
)),
2492 BRW_GET_SWZ(surface
.swizzle
, 0)),
2495 brw_pop_insn_state(p
);
2500 brw_send_indirect_message(p
, sfid
, dst
, payload
, surface
, desc_imm
);
2504 while_jumps_before_offset(const struct gen_device_info
*devinfo
,
2505 brw_inst
*insn
, int while_offset
, int start_offset
)
2507 int scale
= 16 / brw_jump_scale(devinfo
);
2508 int jip
= devinfo
->gen
== 6 ? brw_inst_gen6_jump_count(devinfo
, insn
)
2509 : brw_inst_jip(devinfo
, insn
);
2511 return while_offset
+ jip
* scale
<= start_offset
;
2516 brw_find_next_block_end(struct brw_codegen
*p
, int start_offset
)
2519 void *store
= p
->store
;
2520 const struct gen_device_info
*devinfo
= p
->devinfo
;
2524 for (offset
= next_offset(devinfo
, store
, start_offset
);
2525 offset
< p
->next_insn_offset
;
2526 offset
= next_offset(devinfo
, store
, offset
)) {
2527 brw_inst
*insn
= store
+ offset
;
2529 switch (brw_inst_opcode(devinfo
, insn
)) {
2533 case BRW_OPCODE_ENDIF
:
2538 case BRW_OPCODE_WHILE
:
2539 /* If the while doesn't jump before our instruction, it's the end
2540 * of a sibling do...while loop. Ignore it.
2542 if (!while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2545 case BRW_OPCODE_ELSE
:
2546 case BRW_OPCODE_HALT
:
2555 /* There is no DO instruction on gen6, so to find the end of the loop
2556 * we have to see if the loop is jumping back before our start
2560 brw_find_loop_end(struct brw_codegen
*p
, int start_offset
)
2562 const struct gen_device_info
*devinfo
= p
->devinfo
;
2564 void *store
= p
->store
;
2566 assert(devinfo
->gen
>= 6);
2568 /* Always start after the instruction (such as a WHILE) we're trying to fix
2571 for (offset
= next_offset(devinfo
, store
, start_offset
);
2572 offset
< p
->next_insn_offset
;
2573 offset
= next_offset(devinfo
, store
, offset
)) {
2574 brw_inst
*insn
= store
+ offset
;
2576 if (brw_inst_opcode(devinfo
, insn
) == BRW_OPCODE_WHILE
) {
2577 if (while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2581 assert(!"not reached");
2582 return start_offset
;
2585 /* After program generation, go back and update the UIP and JIP of
2586 * BREAK, CONT, and HALT instructions to their correct locations.
2589 brw_set_uip_jip(struct brw_codegen
*p
, int start_offset
)
2591 const struct gen_device_info
*devinfo
= p
->devinfo
;
2593 int br
= brw_jump_scale(devinfo
);
2594 int scale
= 16 / br
;
2595 void *store
= p
->store
;
2597 if (devinfo
->gen
< 6)
2600 for (offset
= start_offset
; offset
< p
->next_insn_offset
; offset
+= 16) {
2601 brw_inst
*insn
= store
+ offset
;
2602 assert(brw_inst_cmpt_control(devinfo
, insn
) == 0);
2604 int block_end_offset
= brw_find_next_block_end(p
, offset
);
2605 switch (brw_inst_opcode(devinfo
, insn
)) {
2606 case BRW_OPCODE_BREAK
:
2607 assert(block_end_offset
!= 0);
2608 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2609 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2610 brw_inst_set_uip(devinfo
, insn
,
2611 (brw_find_loop_end(p
, offset
) - offset
+
2612 (devinfo
->gen
== 6 ? 16 : 0)) / scale
);
2614 case BRW_OPCODE_CONTINUE
:
2615 assert(block_end_offset
!= 0);
2616 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2617 brw_inst_set_uip(devinfo
, insn
,
2618 (brw_find_loop_end(p
, offset
) - offset
) / scale
);
2620 assert(brw_inst_uip(devinfo
, insn
) != 0);
2621 assert(brw_inst_jip(devinfo
, insn
) != 0);
2624 case BRW_OPCODE_ENDIF
: {
2625 int32_t jump
= (block_end_offset
== 0) ?
2626 1 * br
: (block_end_offset
- offset
) / scale
;
2627 if (devinfo
->gen
>= 7)
2628 brw_inst_set_jip(devinfo
, insn
, jump
);
2630 brw_inst_set_gen6_jump_count(devinfo
, insn
, jump
);
2634 case BRW_OPCODE_HALT
:
2635 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2637 * "In case of the halt instruction not inside any conditional
2638 * code block, the value of <JIP> and <UIP> should be the
2639 * same. In case of the halt instruction inside conditional code
2640 * block, the <UIP> should be the end of the program, and the
2641 * <JIP> should be end of the most inner conditional code block."
2643 * The uip will have already been set by whoever set up the
2646 if (block_end_offset
== 0) {
2647 brw_inst_set_jip(devinfo
, insn
, brw_inst_uip(devinfo
, insn
));
2649 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2651 assert(brw_inst_uip(devinfo
, insn
) != 0);
2652 assert(brw_inst_jip(devinfo
, insn
) != 0);
2658 void brw_ff_sync(struct brw_codegen
*p
,
2659 struct brw_reg dest
,
2660 unsigned msg_reg_nr
,
2661 struct brw_reg src0
,
2663 unsigned response_length
,
2666 const struct gen_device_info
*devinfo
= p
->devinfo
;
2669 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2671 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2672 brw_set_dest(p
, insn
, dest
);
2673 brw_set_src0(p
, insn
, src0
);
2674 brw_set_src1(p
, insn
, brw_imm_d(0));
2676 if (devinfo
->gen
< 6)
2677 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2679 brw_set_ff_sync_message(p
,
2687 * Emit the SEND instruction necessary to generate stream output data on Gen6
2688 * (for transform feedback).
2690 * If send_commit_msg is true, this is the last piece of stream output data
2691 * from this thread, so send the data as a committed write. According to the
2692 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2694 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2695 * writes are complete by sending the final write as a committed write."
2698 brw_svb_write(struct brw_codegen
*p
,
2699 struct brw_reg dest
,
2700 unsigned msg_reg_nr
,
2701 struct brw_reg src0
,
2702 unsigned binding_table_index
,
2703 bool send_commit_msg
)
2705 const struct gen_device_info
*devinfo
= p
->devinfo
;
2706 const unsigned target_cache
=
2707 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2708 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2709 BRW_SFID_DATAPORT_WRITE
);
2712 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2714 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2715 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2716 brw_set_dest(p
, insn
, dest
);
2717 brw_set_src0(p
, insn
, src0
);
2718 brw_set_desc(p
, insn
,
2719 brw_message_desc(devinfo
, 1, send_commit_msg
, true) |
2720 brw_dp_write_desc(devinfo
, binding_table_index
,
2721 0, /* msg_control: ignored */
2722 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE
,
2723 0, /* last_render_target: ignored */
2724 send_commit_msg
)); /* send_commit_msg */
2728 brw_surface_payload_size(struct brw_codegen
*p
,
2729 unsigned num_channels
,
2733 if (has_simd4x2
&& brw_get_default_access_mode(p
) == BRW_ALIGN_16
)
2735 else if (has_simd16
&& brw_get_default_exec_size(p
) == BRW_EXECUTE_16
)
2736 return 2 * num_channels
;
2738 return num_channels
;
2742 brw_dp_untyped_atomic_desc(struct brw_codegen
*p
,
2744 bool response_expected
)
2746 const struct gen_device_info
*devinfo
= p
->devinfo
;
2747 unsigned msg_control
=
2748 atomic_op
| /* Atomic Operation Type: BRW_AOP_* */
2749 (response_expected
? 1 << 5 : 0); /* Return data expected */
2752 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
2753 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
2754 if (brw_get_default_exec_size(p
) != BRW_EXECUTE_16
)
2755 msg_control
|= 1 << 4; /* SIMD8 mode */
2757 msg_type
= HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
;
2759 msg_type
= HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2
;
2762 if (brw_get_default_exec_size(p
) != BRW_EXECUTE_16
)
2763 msg_control
|= 1 << 4; /* SIMD8 mode */
2765 msg_type
= GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP
;
2768 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
2772 brw_untyped_atomic(struct brw_codegen
*p
,
2774 struct brw_reg payload
,
2775 struct brw_reg surface
,
2777 unsigned msg_length
,
2778 bool response_expected
,
2779 bool header_present
)
2781 const struct gen_device_info
*devinfo
= p
->devinfo
;
2782 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
2783 HSW_SFID_DATAPORT_DATA_CACHE_1
:
2784 GEN7_SFID_DATAPORT_DATA_CACHE
);
2785 const unsigned response_length
= brw_surface_payload_size(
2786 p
, response_expected
, devinfo
->gen
>= 8 || devinfo
->is_haswell
, true);
2787 const unsigned desc
=
2788 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
2789 brw_dp_untyped_atomic_desc(p
, atomic_op
, response_expected
);
2790 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
2791 /* Mask out unused components -- This is especially important in Align16
2792 * mode on generations that don't have native support for SIMD4x2 atomics,
2793 * because unused but enabled components will cause the dataport to perform
2794 * additional atomic operations on the addresses that happen to be in the
2795 * uninitialized Y, Z and W coordinates of the payload.
2797 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
2799 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(dst
, mask
),
2800 payload
, surface
, desc
);
2804 brw_dp_untyped_atomic_float_desc(struct brw_codegen
*p
,
2806 bool response_expected
)
2808 const struct gen_device_info
*devinfo
= p
->devinfo
;
2809 const unsigned msg_type
= GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP
;
2810 unsigned msg_control
=
2811 atomic_op
| /* Atomic Operation Type: BRW_AOP_F* */
2812 (response_expected
? 1 << 5 : 0); /* Return data expected */
2814 assert(devinfo
->gen
>= 9);
2815 assert(brw_get_default_access_mode(p
) == BRW_ALIGN_1
);
2817 if (brw_get_default_exec_size(p
) != BRW_EXECUTE_16
)
2818 msg_control
|= 1 << 4; /* SIMD8 mode */
2820 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
2824 brw_untyped_atomic_float(struct brw_codegen
*p
,
2826 struct brw_reg payload
,
2827 struct brw_reg surface
,
2829 unsigned msg_length
,
2830 bool response_expected
,
2831 bool header_present
)
2833 const struct gen_device_info
*devinfo
= p
->devinfo
;
2835 assert(devinfo
->gen
>= 9);
2836 assert(brw_get_default_access_mode(p
) == BRW_ALIGN_1
);
2838 const unsigned sfid
= HSW_SFID_DATAPORT_DATA_CACHE_1
;
2839 const unsigned response_length
= brw_surface_payload_size(
2840 p
, response_expected
, true, true);
2841 const unsigned desc
=
2842 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
2843 brw_dp_untyped_atomic_float_desc(p
, atomic_op
, response_expected
);
2845 brw_send_indirect_surface_message(p
, sfid
,
2846 brw_writemask(dst
, WRITEMASK_XYZW
),
2847 payload
, surface
, desc
);
2851 brw_dp_untyped_surface_read_desc(struct brw_codegen
*p
,
2852 unsigned num_channels
)
2854 const struct gen_device_info
*devinfo
= p
->devinfo
;
2855 const unsigned msg_type
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
2856 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ
:
2857 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ
);
2858 /* Set mask of 32-bit channels to drop. */
2859 unsigned msg_control
= 0xf & (0xf << num_channels
);
2861 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
2862 if (brw_get_default_exec_size(p
) == BRW_EXECUTE_16
)
2863 msg_control
|= 1 << 4; /* SIMD16 mode */
2865 msg_control
|= 2 << 4; /* SIMD8 mode */
2868 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
2872 brw_untyped_surface_read(struct brw_codegen
*p
,
2874 struct brw_reg payload
,
2875 struct brw_reg surface
,
2876 unsigned msg_length
,
2877 unsigned num_channels
)
2879 const struct gen_device_info
*devinfo
= p
->devinfo
;
2880 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
2881 HSW_SFID_DATAPORT_DATA_CACHE_1
:
2882 GEN7_SFID_DATAPORT_DATA_CACHE
);
2883 const unsigned response_length
=
2884 brw_surface_payload_size(p
, num_channels
, true, true);
2885 const unsigned desc
=
2886 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
2887 brw_dp_untyped_surface_read_desc(p
, num_channels
);
2889 brw_send_indirect_surface_message(p
, sfid
, dst
, payload
, surface
, desc
);
2893 brw_dp_untyped_surface_write_desc(struct brw_codegen
*p
,
2894 unsigned num_channels
)
2896 const struct gen_device_info
*devinfo
= p
->devinfo
;
2897 const unsigned msg_type
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
2898 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE
:
2899 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE
);
2900 /* Set mask of 32-bit channels to drop. */
2901 unsigned msg_control
= 0xf & (0xf << num_channels
);
2903 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
2904 if (brw_get_default_exec_size(p
) == BRW_EXECUTE_16
)
2905 msg_control
|= 1 << 4; /* SIMD16 mode */
2907 msg_control
|= 2 << 4; /* SIMD8 mode */
2909 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
)
2910 msg_control
|= 0 << 4; /* SIMD4x2 mode */
2912 msg_control
|= 2 << 4; /* SIMD8 mode */
2915 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
2919 brw_untyped_surface_write(struct brw_codegen
*p
,
2920 struct brw_reg payload
,
2921 struct brw_reg surface
,
2922 unsigned msg_length
,
2923 unsigned num_channels
,
2924 bool header_present
)
2926 const struct gen_device_info
*devinfo
= p
->devinfo
;
2927 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
2928 HSW_SFID_DATAPORT_DATA_CACHE_1
:
2929 GEN7_SFID_DATAPORT_DATA_CACHE
);
2930 const unsigned desc
=
2931 brw_message_desc(devinfo
, msg_length
, 0, header_present
) |
2932 brw_dp_untyped_surface_write_desc(p
, num_channels
);
2933 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
2934 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2935 const unsigned mask
= devinfo
->gen
== 7 && !devinfo
->is_haswell
&& !align1
?
2936 WRITEMASK_X
: WRITEMASK_XYZW
;
2938 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(brw_null_reg(), mask
),
2939 payload
, surface
, desc
);
2943 brw_byte_scattered_data_element_from_bit_size(unsigned bit_size
)
2947 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_BYTE
;
2949 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_WORD
;
2951 return GEN7_BYTE_SCATTERED_DATA_ELEMENT_DWORD
;
2953 unreachable("Unsupported bit_size for byte scattered messages");
2958 brw_dp_byte_scattered_desc(struct brw_codegen
*p
, unsigned bit_size
,
2961 const struct gen_device_info
*devinfo
= p
->devinfo
;
2962 unsigned msg_control
=
2963 brw_byte_scattered_data_element_from_bit_size(bit_size
) << 2;
2965 if (brw_get_default_exec_size(p
) == BRW_EXECUTE_16
)
2966 msg_control
|= 1; /* SIMD16 mode */
2968 msg_control
|= 0; /* SIMD8 mode */
2970 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
2974 brw_byte_scattered_read(struct brw_codegen
*p
,
2976 struct brw_reg payload
,
2977 struct brw_reg surface
,
2978 unsigned msg_length
,
2981 const struct gen_device_info
*devinfo
= p
->devinfo
;
2982 assert(devinfo
->gen
> 7 || devinfo
->is_haswell
);
2983 assert(brw_get_default_access_mode(p
) == BRW_ALIGN_1
);
2984 const unsigned response_length
=
2985 brw_surface_payload_size(p
, 1, true, true);
2986 const unsigned desc
=
2987 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
2988 brw_dp_byte_scattered_desc(p
, bit_size
,
2989 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ
);
2991 brw_send_indirect_surface_message(p
, GEN7_SFID_DATAPORT_DATA_CACHE
,
2992 dst
, payload
, surface
, desc
);
2996 brw_byte_scattered_write(struct brw_codegen
*p
,
2997 struct brw_reg payload
,
2998 struct brw_reg surface
,
2999 unsigned msg_length
,
3001 bool header_present
)
3003 const struct gen_device_info
*devinfo
= p
->devinfo
;
3004 assert(devinfo
->gen
> 7 || devinfo
->is_haswell
);
3005 assert(brw_get_default_access_mode(p
) == BRW_ALIGN_1
);
3006 const unsigned desc
=
3007 brw_message_desc(devinfo
, msg_length
, 0, header_present
) |
3008 brw_dp_byte_scattered_desc(p
, bit_size
,
3009 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE
);
3011 brw_send_indirect_surface_message(p
, GEN7_SFID_DATAPORT_DATA_CACHE
,
3012 brw_writemask(brw_null_reg(),
3014 payload
, surface
, desc
);
3018 brw_dp_typed_atomic_desc(struct brw_codegen
*p
,
3020 bool response_expected
)
3022 const struct gen_device_info
*devinfo
= p
->devinfo
;
3023 unsigned msg_control
=
3024 atomic_op
| /* Atomic Operation Type: BRW_AOP_* */
3025 (response_expected
? 1 << 5 : 0); /* Return data expected */
3028 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
3029 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3030 if ((brw_get_default_group(p
) / 8) % 2 == 1)
3031 msg_control
|= 1 << 4; /* Use high 8 slots of the sample mask */
3033 msg_type
= HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP
;
3035 msg_type
= HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2
;
3039 if ((brw_get_default_group(p
) / 8) % 2 == 1)
3040 msg_control
|= 1 << 4; /* Use high 8 slots of the sample mask */
3042 msg_type
= GEN7_DATAPORT_RC_TYPED_ATOMIC_OP
;
3045 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
3049 brw_typed_atomic(struct brw_codegen
*p
,
3051 struct brw_reg payload
,
3052 struct brw_reg surface
,
3054 unsigned msg_length
,
3055 bool response_expected
,
3056 bool header_present
) {
3057 const struct gen_device_info
*devinfo
= p
->devinfo
;
3058 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3059 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3060 GEN6_SFID_DATAPORT_RENDER_CACHE
);
3061 const unsigned response_length
= brw_surface_payload_size(
3062 p
, response_expected
, devinfo
->gen
>= 8 || devinfo
->is_haswell
, false);
3063 const unsigned desc
=
3064 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
3065 brw_dp_typed_atomic_desc(p
, atomic_op
, response_expected
);
3066 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3067 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3068 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
3070 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(dst
, mask
),
3071 payload
, surface
, desc
);
3075 brw_dp_typed_surface_read_desc(struct brw_codegen
*p
,
3076 unsigned num_channels
)
3078 const struct gen_device_info
*devinfo
= p
->devinfo
;
3079 /* Set mask of unused channels. */
3080 unsigned msg_control
= 0xf & (0xf << num_channels
);
3083 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
3084 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3085 if ((brw_get_default_group(p
) / 8) % 2 == 1)
3086 msg_control
|= 2 << 4; /* Use high 8 slots of the sample mask */
3088 msg_control
|= 1 << 4; /* Use low 8 slots of the sample mask */
3091 msg_type
= HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ
;
3093 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3094 if ((brw_get_default_group(p
) / 8) % 2 == 1)
3095 msg_control
|= 1 << 5; /* Use high 8 slots of the sample mask */
3098 msg_type
= GEN7_DATAPORT_RC_TYPED_SURFACE_READ
;
3101 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
3105 brw_typed_surface_read(struct brw_codegen
*p
,
3107 struct brw_reg payload
,
3108 struct brw_reg surface
,
3109 unsigned msg_length
,
3110 unsigned num_channels
,
3111 bool header_present
)
3113 const struct gen_device_info
*devinfo
= p
->devinfo
;
3114 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3115 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3116 GEN6_SFID_DATAPORT_RENDER_CACHE
);
3117 const unsigned response_length
= brw_surface_payload_size(
3118 p
, num_channels
, devinfo
->gen
>= 8 || devinfo
->is_haswell
, false);
3119 const unsigned desc
=
3120 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
3121 brw_dp_typed_surface_read_desc(p
, num_channels
);
3123 brw_send_indirect_surface_message(p
, sfid
, dst
, payload
, surface
, desc
);
3127 brw_dp_typed_surface_write_desc(struct brw_codegen
*p
,
3128 unsigned num_channels
)
3130 const struct gen_device_info
*devinfo
= p
->devinfo
;
3131 /* Set mask of unused channels. */
3132 unsigned msg_control
= 0xf & (0xf << num_channels
);
3135 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
3136 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3137 if ((brw_get_default_group(p
) / 8) % 2 == 1)
3138 msg_control
|= 2 << 4; /* Use high 8 slots of the sample mask */
3140 msg_control
|= 1 << 4; /* Use low 8 slots of the sample mask */
3143 msg_type
= HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE
;
3146 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3147 if ((brw_get_default_group(p
) / 8) % 2 == 1)
3148 msg_control
|= 1 << 5; /* Use high 8 slots of the sample mask */
3151 msg_type
= GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE
;
3154 return brw_dp_surface_desc(devinfo
, msg_type
, msg_control
);
3158 brw_typed_surface_write(struct brw_codegen
*p
,
3159 struct brw_reg payload
,
3160 struct brw_reg surface
,
3161 unsigned msg_length
,
3162 unsigned num_channels
,
3163 bool header_present
)
3165 const struct gen_device_info
*devinfo
= p
->devinfo
;
3166 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3167 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3168 GEN6_SFID_DATAPORT_RENDER_CACHE
);
3169 const unsigned desc
=
3170 brw_message_desc(devinfo
, msg_length
, 0, header_present
) |
3171 brw_dp_typed_surface_write_desc(p
, num_channels
);
3172 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3173 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3174 const unsigned mask
= (devinfo
->gen
== 7 && !devinfo
->is_haswell
&& !align1
?
3175 WRITEMASK_X
: WRITEMASK_XYZW
);
3177 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(brw_null_reg(), mask
),
3178 payload
, surface
, desc
);
3182 brw_set_memory_fence_message(struct brw_codegen
*p
,
3183 struct brw_inst
*insn
,
3184 enum brw_message_target sfid
,
3187 const struct gen_device_info
*devinfo
= p
->devinfo
;
3189 brw_set_desc(p
, insn
, brw_message_desc(
3190 devinfo
, 1, (commit_enable
? 1 : 0), true));
3192 brw_inst_set_sfid(devinfo
, insn
, sfid
);
3195 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
3196 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_RC_MEMORY_FENCE
);
3198 case GEN7_SFID_DATAPORT_DATA_CACHE
:
3199 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_DC_MEMORY_FENCE
);
3202 unreachable("Not reached");
3206 brw_inst_set_dp_msg_control(devinfo
, insn
, 1 << 5);
3210 brw_memory_fence(struct brw_codegen
*p
,
3212 enum opcode send_op
)
3214 const struct gen_device_info
*devinfo
= p
->devinfo
;
3215 const bool commit_enable
=
3216 devinfo
->gen
>= 10 || /* HSD ES # 1404612949 */
3217 (devinfo
->gen
== 7 && !devinfo
->is_haswell
);
3218 struct brw_inst
*insn
;
3220 brw_push_insn_state(p
);
3221 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3222 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3225 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3226 * message doesn't write anything back.
3228 insn
= next_insn(p
, send_op
);
3229 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
3230 brw_set_dest(p
, insn
, dst
);
3231 brw_set_src0(p
, insn
, dst
);
3232 brw_set_memory_fence_message(p
, insn
, GEN7_SFID_DATAPORT_DATA_CACHE
,
3235 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
) {
3236 /* IVB does typed surface access through the render cache, so we need to
3237 * flush it too. Use a different register so both flushes can be
3238 * pipelined by the hardware.
3240 insn
= next_insn(p
, send_op
);
3241 brw_set_dest(p
, insn
, offset(dst
, 1));
3242 brw_set_src0(p
, insn
, offset(dst
, 1));
3243 brw_set_memory_fence_message(p
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
,
3246 /* Now write the response of the second message into the response of the
3247 * first to trigger a pipeline stall -- This way future render and data
3248 * cache messages will be properly ordered with respect to past data and
3249 * render cache messages.
3251 brw_MOV(p
, dst
, offset(dst
, 1));
3254 brw_pop_insn_state(p
);
3258 brw_pixel_interpolator_query(struct brw_codegen
*p
,
3259 struct brw_reg dest
,
3263 struct brw_reg data
,
3264 unsigned msg_length
,
3265 unsigned response_length
)
3267 const struct gen_device_info
*devinfo
= p
->devinfo
;
3268 const uint16_t exec_size
= brw_get_default_exec_size(p
);
3269 const unsigned slot_group
= brw_get_default_group(p
) / 16;
3270 const unsigned simd_mode
= (exec_size
== BRW_EXECUTE_16
);
3271 const unsigned desc
=
3272 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3273 brw_pixel_interp_desc(devinfo
, mode
, noperspective
, simd_mode
,
3276 /* brw_send_indirect_message will automatically use a direct send message
3277 * if data is actually immediate.
3279 brw_send_indirect_message(p
,
3280 GEN7_SFID_PIXEL_INTERPOLATOR
,
3288 brw_find_live_channel(struct brw_codegen
*p
, struct brw_reg dst
,
3289 struct brw_reg mask
)
3291 const struct gen_device_info
*devinfo
= p
->devinfo
;
3292 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
3293 const unsigned qtr_control
= brw_get_default_group(p
) / 8;
3296 assert(devinfo
->gen
>= 7);
3297 assert(mask
.type
== BRW_REGISTER_TYPE_UD
);
3299 brw_push_insn_state(p
);
3301 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3302 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3304 if (devinfo
->gen
>= 8) {
3305 /* Getting the first active channel index is easy on Gen8: Just find
3306 * the first bit set in the execution mask. The register exists on
3307 * HSW already but it reads back as all ones when the current
3308 * instruction has execution masking disabled, so it's kind of
3311 struct brw_reg exec_mask
=
3312 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
);
3314 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3315 if (mask
.file
!= BRW_IMMEDIATE_VALUE
|| mask
.ud
!= 0xffffffff) {
3316 /* Unfortunately, ce0 does not take into account the thread
3317 * dispatch mask, which may be a problem in cases where it's not
3318 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3319 * some n). Combine ce0 with the given dispatch (or vector) mask
3320 * to mask off those channels which were never dispatched by the
3323 brw_SHR(p
, vec1(dst
), mask
, brw_imm_ud(qtr_control
* 8));
3324 brw_AND(p
, vec1(dst
), exec_mask
, vec1(dst
));
3325 exec_mask
= vec1(dst
);
3328 /* Quarter control has the effect of magically shifting the value of
3329 * ce0 so you'll get the first active channel relative to the
3330 * specified quarter control as result.
3332 inst
= brw_FBL(p
, vec1(dst
), exec_mask
);
3334 const struct brw_reg flag
= brw_flag_reg(p
->current
->flag_subreg
/ 2,
3335 p
->current
->flag_subreg
% 2);
3337 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3338 brw_MOV(p
, retype(flag
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
3340 /* Run enough instructions returning zero with execution masking and
3341 * a conditional modifier enabled in order to get the full execution
3342 * mask in f1.0. We could use a single 32-wide move here if it
3343 * weren't because of the hardware bug that causes channel enables to
3344 * be applied incorrectly to the second half of 32-wide instructions
3347 const unsigned lower_size
= MIN2(16, exec_size
);
3348 for (unsigned i
= 0; i
< exec_size
/ lower_size
; i
++) {
3349 inst
= brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
),
3351 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3352 brw_inst_set_group(devinfo
, inst
, lower_size
* i
+ 8 * qtr_control
);
3353 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_Z
);
3354 brw_inst_set_exec_size(devinfo
, inst
, cvt(lower_size
) - 1);
3357 /* Find the first bit set in the exec_size-wide portion of the flag
3358 * register that was updated by the last sequence of MOV
3361 const enum brw_reg_type type
= brw_int_type(exec_size
/ 8, false);
3362 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3363 brw_FBL(p
, vec1(dst
), byte_offset(retype(flag
, type
), qtr_control
));
3366 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3368 if (devinfo
->gen
>= 8 &&
3369 mask
.file
== BRW_IMMEDIATE_VALUE
&& mask
.ud
== 0xffffffff) {
3370 /* In SIMD4x2 mode the first active channel index is just the
3371 * negation of the first bit of the mask register. Note that ce0
3372 * doesn't take into account the dispatch mask, so the Gen7 path
3373 * should be used instead unless you have the guarantee that the
3374 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3377 inst
= brw_AND(p
, brw_writemask(dst
, WRITEMASK_X
),
3378 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
)),
3382 /* Overwrite the destination without and with execution masking to
3383 * find out which of the channels is active.
3385 brw_push_insn_state(p
);
3386 brw_set_default_exec_size(p
, BRW_EXECUTE_4
);
3387 brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3390 inst
= brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3392 brw_pop_insn_state(p
);
3393 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3397 brw_pop_insn_state(p
);
3401 brw_broadcast(struct brw_codegen
*p
,
3406 const struct gen_device_info
*devinfo
= p
->devinfo
;
3407 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3410 brw_push_insn_state(p
);
3411 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3412 brw_set_default_exec_size(p
, align1
? BRW_EXECUTE_1
: BRW_EXECUTE_4
);
3414 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
&&
3415 src
.address_mode
== BRW_ADDRESS_DIRECT
);
3416 assert(!src
.abs
&& !src
.negate
);
3417 assert(src
.type
== dst
.type
);
3419 if ((src
.vstride
== 0 && (src
.hstride
== 0 || !align1
)) ||
3420 idx
.file
== BRW_IMMEDIATE_VALUE
) {
3421 /* Trivial, the source is already uniform or the index is a constant.
3422 * We will typically not get here if the optimizer is doing its job, but
3423 * asserting would be mean.
3425 const unsigned i
= idx
.file
== BRW_IMMEDIATE_VALUE
? idx
.ud
: 0;
3427 (align1
? stride(suboffset(src
, i
), 0, 1, 0) :
3428 stride(suboffset(src
, 4 * i
), 0, 4, 1)));
3430 /* From the Haswell PRM section "Register Region Restrictions":
3432 * "The lower bits of the AddressImmediate must not overflow to
3433 * change the register address. The lower 5 bits of Address
3434 * Immediate when added to lower 5 bits of address register gives
3435 * the sub-register offset. The upper bits of Address Immediate
3436 * when added to upper bits of address register gives the register
3437 * address. Any overflow from sub-register offset is dropped."
3439 * Fortunately, for broadcast, we never have a sub-register offset so
3440 * this isn't an issue.
3442 assert(src
.subnr
== 0);
3445 const struct brw_reg addr
=
3446 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
3447 unsigned offset
= src
.nr
* REG_SIZE
+ src
.subnr
;
3448 /* Limit in bytes of the signed indirect addressing immediate. */
3449 const unsigned limit
= 512;
3451 brw_push_insn_state(p
);
3452 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3453 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
3455 /* Take into account the component size and horizontal stride. */
3456 assert(src
.vstride
== src
.hstride
+ src
.width
);
3457 brw_SHL(p
, addr
, vec1(idx
),
3458 brw_imm_ud(_mesa_logbase2(type_sz(src
.type
)) +
3461 /* We can only address up to limit bytes using the indirect
3462 * addressing immediate, account for the difference if the source
3463 * register is above this limit.
3465 if (offset
>= limit
) {
3466 brw_ADD(p
, addr
, addr
, brw_imm_ud(offset
- offset
% limit
));
3467 offset
= offset
% limit
;
3470 brw_pop_insn_state(p
);
3472 /* Use indirect addressing to fetch the specified component. */
3473 if (type_sz(src
.type
) > 4 &&
3474 (devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
))) {
3475 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3477 * "When source or destination datatype is 64b or operation is
3478 * integer DWord multiply, indirect addressing must not be
3481 * To work around both of this issue, we do two integer MOVs
3482 * insead of one 64-bit MOV. Because no double value should ever
3483 * cross a register boundary, it's safe to use the immediate
3484 * offset in the indirect here to handle adding 4 bytes to the
3485 * offset and avoid the extra ADD to the register file.
3487 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 0),
3488 retype(brw_vec1_indirect(addr
.subnr
, offset
),
3489 BRW_REGISTER_TYPE_D
));
3490 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 1),
3491 retype(brw_vec1_indirect(addr
.subnr
, offset
+ 4),
3492 BRW_REGISTER_TYPE_D
));
3495 retype(brw_vec1_indirect(addr
.subnr
, offset
), src
.type
));
3498 /* In SIMD4x2 mode the index can be either zero or one, replicate it
3499 * to all bits of a flag register,
3503 stride(brw_swizzle(idx
, BRW_SWIZZLE_XXXX
), 4, 4, 1));
3504 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NONE
);
3505 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_NZ
);
3506 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3508 /* and use predicated SEL to pick the right channel. */
3509 inst
= brw_SEL(p
, dst
,
3510 stride(suboffset(src
, 4), 4, 4, 1),
3511 stride(src
, 4, 4, 1));
3512 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NORMAL
);
3513 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3517 brw_pop_insn_state(p
);
3521 * This instruction is generated as a single-channel align1 instruction by
3522 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3524 * We can't use the typed atomic op in the FS because that has the execution
3525 * mask ANDed with the pixel mask, but we just want to write the one dword for
3528 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3529 * one u32. So we use the same untyped atomic write message as the pixel
3532 * The untyped atomic operation requires a BUFFER surface type with RAW
3533 * format, and is only accessible through the legacy DATA_CACHE dataport
3536 void brw_shader_time_add(struct brw_codegen
*p
,
3537 struct brw_reg payload
,
3538 uint32_t surf_index
)
3540 const struct gen_device_info
*devinfo
= p
->devinfo
;
3541 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3542 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3543 GEN7_SFID_DATAPORT_DATA_CACHE
);
3544 assert(devinfo
->gen
>= 7);
3546 brw_push_insn_state(p
);
3547 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3548 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3549 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
3550 brw_inst
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
3552 /* We use brw_vec1_reg and unmasked because we want to increment the given
3555 brw_set_dest(p
, send
, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
3557 brw_set_src0(p
, send
, brw_vec1_reg(payload
.file
,
3559 brw_set_desc(p
, send
, (brw_message_desc(devinfo
, 2, 0, false) |
3560 brw_dp_untyped_atomic_desc(p
, BRW_AOP_ADD
, false)));
3562 brw_inst_set_sfid(devinfo
, send
, sfid
);
3563 brw_inst_set_binding_table_index(devinfo
, send
, surf_index
);
3565 brw_pop_insn_state(p
);
3570 * Emit the SEND message for a barrier
3573 brw_barrier(struct brw_codegen
*p
, struct brw_reg src
)
3575 const struct gen_device_info
*devinfo
= p
->devinfo
;
3576 struct brw_inst
*inst
;
3578 assert(devinfo
->gen
>= 7);
3580 brw_push_insn_state(p
);
3581 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3582 inst
= next_insn(p
, BRW_OPCODE_SEND
);
3583 brw_set_dest(p
, inst
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
));
3584 brw_set_src0(p
, inst
, src
);
3585 brw_set_src1(p
, inst
, brw_null_reg());
3586 brw_set_desc(p
, inst
, brw_message_desc(devinfo
, 1, 0, false));
3588 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MESSAGE_GATEWAY
);
3589 brw_inst_set_gateway_notify(devinfo
, inst
, 1);
3590 brw_inst_set_gateway_subfuncid(devinfo
, inst
,
3591 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG
);
3593 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
3594 brw_pop_insn_state(p
);
3599 * Emit the wait instruction for a barrier
3602 brw_WAIT(struct brw_codegen
*p
)
3604 const struct gen_device_info
*devinfo
= p
->devinfo
;
3605 struct brw_inst
*insn
;
3607 struct brw_reg src
= brw_notification_reg();
3609 insn
= next_insn(p
, BRW_OPCODE_WAIT
);
3610 brw_set_dest(p
, insn
, src
);
3611 brw_set_src0(p
, insn
, src
);
3612 brw_set_src1(p
, insn
, brw_null_reg());
3614 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
3615 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_DISABLE
);
3619 * Changes the floating point rounding mode updating the control register
3620 * field defined at cr0.0[5-6] bits. This function supports the changes to
3621 * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
3622 * Only RTNE and RTZ rounding are enabled at nir.
3625 brw_rounding_mode(struct brw_codegen
*p
,
3626 enum brw_rnd_mode mode
)
3628 const unsigned bits
= mode
<< BRW_CR0_RND_MODE_SHIFT
;
3630 if (bits
!= BRW_CR0_RND_MODE_MASK
) {
3631 brw_inst
*inst
= brw_AND(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3632 brw_imm_ud(~BRW_CR0_RND_MODE_MASK
));
3633 brw_inst_set_exec_size(p
->devinfo
, inst
, BRW_EXECUTE_1
);
3635 /* From the Skylake PRM, Volume 7, page 760:
3636 * "Implementation Restriction on Register Access: When the control
3637 * register is used as an explicit source and/or destination, hardware
3638 * does not ensure execution pipeline coherency. Software must set the
3639 * thread control field to ‘switch’ for an instruction that uses
3640 * control register as an explicit operand."
3642 brw_inst_set_thread_control(p
->devinfo
, inst
, BRW_THREAD_SWITCH
);
3646 brw_inst
*inst
= brw_OR(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3648 brw_inst_set_exec_size(p
->devinfo
, inst
, BRW_EXECUTE_1
);
3649 brw_inst_set_thread_control(p
->devinfo
, inst
, BRW_THREAD_SWITCH
);