/**********************************************************************
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 **********************************************************************/

/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */
33 #include "brw_eu_defines.h"
36 #include "util/ralloc.h"
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
46 gen6_resolve_implied_move(struct brw_codegen
*p
,
50 const struct gen_device_info
*devinfo
= p
->devinfo
;
54 if (src
->file
== BRW_MESSAGE_REGISTER_FILE
)
57 if (src
->file
!= BRW_ARCHITECTURE_REGISTER_FILE
|| src
->nr
!= BRW_ARF_NULL
) {
58 brw_push_insn_state(p
);
59 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
60 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
61 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
62 brw_MOV(p
, retype(brw_message_reg(msg_reg_nr
), BRW_REGISTER_TYPE_UD
),
63 retype(*src
, BRW_REGISTER_TYPE_UD
));
64 brw_pop_insn_state(p
);
66 *src
= brw_message_reg(msg_reg_nr
);
70 gen7_convert_mrf_to_grf(struct brw_codegen
*p
, struct brw_reg
*reg
)
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
80 const struct gen_device_info
*devinfo
= p
->devinfo
;
81 if (devinfo
->gen
>= 7 && reg
->file
== BRW_MESSAGE_REGISTER_FILE
) {
82 reg
->file
= BRW_GENERAL_REGISTER_FILE
;
83 reg
->nr
+= GEN7_MRF_HACK_START
;
88 brw_set_dest(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg dest
)
90 const struct gen_device_info
*devinfo
= p
->devinfo
;
92 if (dest
.file
== BRW_MESSAGE_REGISTER_FILE
)
93 assert((dest
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
94 else if (dest
.file
== BRW_GENERAL_REGISTER_FILE
)
95 assert(dest
.nr
< 128);
97 /* The hardware has a restriction where if the destination is Byte,
98 * the instruction needs to have a stride of 2 (except for packed byte
99 * MOV). This seems to be required even if the destination is the NULL
102 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
103 dest
.nr
== BRW_ARF_NULL
&&
104 type_sz(dest
.type
) == 1) {
105 dest
.hstride
= BRW_HORIZONTAL_STRIDE_2
;
108 gen7_convert_mrf_to_grf(p
, &dest
);
110 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
111 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
) {
112 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
113 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
114 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
115 assert(dest
.subnr
% 16 == 0);
116 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
117 dest
.vstride
== dest
.width
+ 1);
118 assert(!dest
.negate
&& !dest
.abs
);
119 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
120 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
121 brw_inst_set_send_dst_reg_file(devinfo
, inst
, dest
.file
);
123 brw_inst_set_dst_file_type(devinfo
, inst
, dest
.file
, dest
.type
);
124 brw_inst_set_dst_address_mode(devinfo
, inst
, dest
.address_mode
);
126 if (dest
.address_mode
== BRW_ADDRESS_DIRECT
) {
127 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
129 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
130 brw_inst_set_dst_da1_subreg_nr(devinfo
, inst
, dest
.subnr
);
131 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
132 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
133 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
135 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
136 brw_inst_set_da16_writemask(devinfo
, inst
, dest
.writemask
);
137 if (dest
.file
== BRW_GENERAL_REGISTER_FILE
||
138 dest
.file
== BRW_MESSAGE_REGISTER_FILE
) {
139 assert(dest
.writemask
!= 0);
141 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
142 * Although Dst.HorzStride is a don't care for Align16, HW needs
143 * this to be programmed as "01".
145 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
148 brw_inst_set_dst_ia_subreg_nr(devinfo
, inst
, dest
.subnr
);
150 /* These are different sizes in align1 vs align16:
152 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
153 brw_inst_set_dst_ia1_addr_imm(devinfo
, inst
,
154 dest
.indirect_offset
);
155 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
156 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
157 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
159 brw_inst_set_dst_ia16_addr_imm(devinfo
, inst
,
160 dest
.indirect_offset
);
161 /* even ignored in da16, still need to set as '01' */
162 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
167 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
168 * or 16 (SIMD16), as that's normally correct. However, when dealing with
169 * small registers, it can be useful for us to automatically reduce it to
170 * match the register size.
172 if (p
->automatic_exec_sizes
) {
174 * In platforms that support fp64 we can emit instructions with a width
175 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
176 * these cases we need to make sure that these instructions have their
177 * exec sizes set properly when they are emitted and we can't rely on
178 * this code to fix it.
181 if (devinfo
->gen
>= 6)
182 fix_exec_size
= dest
.width
< BRW_EXECUTE_4
;
184 fix_exec_size
= dest
.width
< BRW_EXECUTE_8
;
187 brw_inst_set_exec_size(devinfo
, inst
, dest
.width
);
192 brw_set_src0(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
194 const struct gen_device_info
*devinfo
= p
->devinfo
;
196 if (reg
.file
== BRW_MESSAGE_REGISTER_FILE
)
197 assert((reg
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
198 else if (reg
.file
== BRW_GENERAL_REGISTER_FILE
)
199 assert(reg
.nr
< 128);
201 gen7_convert_mrf_to_grf(p
, ®
);
203 if (devinfo
->gen
>= 6 &&
204 (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
205 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
||
206 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
207 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
)) {
208 /* Any source modifiers or regions will be ignored, since this just
209 * identifies the MRF/GRF to start reading the message contents from.
210 * Check for some likely failures.
214 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
217 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
218 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
) {
219 assert(reg
.file
== BRW_GENERAL_REGISTER_FILE
);
220 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
221 assert(reg
.subnr
% 16 == 0);
222 assert(reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
223 reg
.vstride
== reg
.width
+ 1);
224 assert(!reg
.negate
&& !reg
.abs
);
225 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
226 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
228 brw_inst_set_src0_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
229 brw_inst_set_src0_abs(devinfo
, inst
, reg
.abs
);
230 brw_inst_set_src0_negate(devinfo
, inst
, reg
.negate
);
231 brw_inst_set_src0_address_mode(devinfo
, inst
, reg
.address_mode
);
233 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
234 if (reg
.type
== BRW_REGISTER_TYPE_DF
||
235 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_DIM
)
236 brw_inst_set_imm_df(devinfo
, inst
, reg
.df
);
237 else if (reg
.type
== BRW_REGISTER_TYPE_UQ
||
238 reg
.type
== BRW_REGISTER_TYPE_Q
)
239 brw_inst_set_imm_uq(devinfo
, inst
, reg
.u64
);
241 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
243 if (devinfo
->gen
< 12 && type_sz(reg
.type
) < 8) {
244 brw_inst_set_src1_reg_file(devinfo
, inst
,
245 BRW_ARCHITECTURE_REGISTER_FILE
);
246 brw_inst_set_src1_reg_hw_type(devinfo
, inst
,
247 brw_inst_src0_reg_hw_type(devinfo
, inst
));
250 if (reg
.address_mode
== BRW_ADDRESS_DIRECT
) {
251 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
252 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
253 brw_inst_set_src0_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
255 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
258 brw_inst_set_src0_ia_subreg_nr(devinfo
, inst
, reg
.subnr
);
260 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
261 brw_inst_set_src0_ia1_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
263 brw_inst_set_src0_ia16_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
267 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
268 if (reg
.width
== BRW_WIDTH_1
&&
269 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
270 brw_inst_set_src0_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
271 brw_inst_set_src0_width(devinfo
, inst
, BRW_WIDTH_1
);
272 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
274 brw_inst_set_src0_hstride(devinfo
, inst
, reg
.hstride
);
275 brw_inst_set_src0_width(devinfo
, inst
, reg
.width
);
276 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
279 brw_inst_set_src0_da16_swiz_x(devinfo
, inst
,
280 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
281 brw_inst_set_src0_da16_swiz_y(devinfo
, inst
,
282 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
283 brw_inst_set_src0_da16_swiz_z(devinfo
, inst
,
284 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
285 brw_inst_set_src0_da16_swiz_w(devinfo
, inst
,
286 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
288 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
289 /* This is an oddity of the fact we're using the same
290 * descriptions for registers in align_16 as align_1:
292 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
293 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
294 reg
.type
== BRW_REGISTER_TYPE_DF
&&
295 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
298 * "For Align16 access mode, only encodings of 0000 and 0011
299 * are allowed. Other codes are reserved."
301 * Presumably the DevSNB behavior applies to IVB as well.
303 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
305 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
314 brw_set_src1(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
316 const struct gen_device_info
*devinfo
= p
->devinfo
;
318 if (reg
.file
== BRW_GENERAL_REGISTER_FILE
)
319 assert(reg
.nr
< 128);
321 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDS
||
322 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDSC
) {
323 assert(reg
.file
== BRW_GENERAL_REGISTER_FILE
||
324 reg
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
325 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
326 assert(reg
.subnr
== 0);
327 assert(reg
.hstride
== BRW_HORIZONTAL_STRIDE_1
&&
328 reg
.vstride
== reg
.width
+ 1);
329 assert(!reg
.negate
&& !reg
.abs
);
330 brw_inst_set_send_src1_reg_nr(devinfo
, inst
, reg
.nr
);
331 brw_inst_set_send_src1_reg_file(devinfo
, inst
, reg
.file
);
333 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
335 * "Accumulator registers may be accessed explicitly as src0
338 assert(reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
339 reg
.nr
!= BRW_ARF_ACCUMULATOR
);
341 gen7_convert_mrf_to_grf(p
, ®
);
342 assert(reg
.file
!= BRW_MESSAGE_REGISTER_FILE
);
344 brw_inst_set_src1_file_type(devinfo
, inst
, reg
.file
, reg
.type
);
345 brw_inst_set_src1_abs(devinfo
, inst
, reg
.abs
);
346 brw_inst_set_src1_negate(devinfo
, inst
, reg
.negate
);
348 /* Only src1 can be immediate in two-argument instructions.
350 assert(brw_inst_src0_reg_file(devinfo
, inst
) != BRW_IMMEDIATE_VALUE
);
352 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
353 /* two-argument instructions can only use 32-bit immediates */
354 assert(type_sz(reg
.type
) < 8);
355 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
357 /* This is a hardware restriction, which may or may not be lifted
360 assert (reg
.address_mode
== BRW_ADDRESS_DIRECT
);
361 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
363 brw_inst_set_src1_da_reg_nr(devinfo
, inst
, reg
.nr
);
364 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
365 brw_inst_set_src1_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
367 brw_inst_set_src1_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
370 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
371 if (reg
.width
== BRW_WIDTH_1
&&
372 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
373 brw_inst_set_src1_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
374 brw_inst_set_src1_width(devinfo
, inst
, BRW_WIDTH_1
);
375 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
377 brw_inst_set_src1_hstride(devinfo
, inst
, reg
.hstride
);
378 brw_inst_set_src1_width(devinfo
, inst
, reg
.width
);
379 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
382 brw_inst_set_src1_da16_swiz_x(devinfo
, inst
,
383 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
384 brw_inst_set_src1_da16_swiz_y(devinfo
, inst
,
385 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
386 brw_inst_set_src1_da16_swiz_z(devinfo
, inst
,
387 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
388 brw_inst_set_src1_da16_swiz_w(devinfo
, inst
,
389 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
391 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
392 /* This is an oddity of the fact we're using the same
393 * descriptions for registers in align_16 as align_1:
395 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
396 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
397 reg
.type
== BRW_REGISTER_TYPE_DF
&&
398 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
401 * "For Align16 access mode, only encodings of 0000 and 0011
402 * are allowed. Other codes are reserved."
404 * Presumably the DevSNB behavior applies to IVB as well.
406 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
408 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
416 * Specify the descriptor and extended descriptor immediate for a SEND(C)
417 * message instruction.
420 brw_set_desc_ex(struct brw_codegen
*p
, brw_inst
*inst
,
421 unsigned desc
, unsigned ex_desc
)
423 const struct gen_device_info
*devinfo
= p
->devinfo
;
424 assert(brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
425 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
);
426 brw_inst_set_src1_file_type(devinfo
, inst
,
427 BRW_IMMEDIATE_VALUE
, BRW_REGISTER_TYPE_UD
);
428 brw_inst_set_send_desc(devinfo
, inst
, desc
);
429 if (devinfo
->gen
>= 9)
430 brw_inst_set_send_ex_desc(devinfo
, inst
, ex_desc
);
433 static void brw_set_math_message( struct brw_codegen
*p
,
436 unsigned integer_type
,
440 const struct gen_device_info
*devinfo
= p
->devinfo
;
442 unsigned response_length
;
444 /* Infer message length from the function */
446 case BRW_MATH_FUNCTION_POW
:
447 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
:
448 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER
:
449 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
457 /* Infer response length from the function */
459 case BRW_MATH_FUNCTION_SINCOS
:
460 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
468 brw_set_desc(p
, inst
, brw_message_desc(
469 devinfo
, msg_length
, response_length
, false));
471 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MATH
);
472 brw_inst_set_math_msg_function(devinfo
, inst
, function
);
473 brw_inst_set_math_msg_signed_int(devinfo
, inst
, integer_type
);
474 brw_inst_set_math_msg_precision(devinfo
, inst
, low_precision
);
475 brw_inst_set_math_msg_saturate(devinfo
, inst
, brw_inst_saturate(devinfo
, inst
));
476 brw_inst_set_math_msg_data_type(devinfo
, inst
, dataType
);
477 brw_inst_set_saturate(devinfo
, inst
, 0);
481 static void brw_set_ff_sync_message(struct brw_codegen
*p
,
484 unsigned response_length
,
487 const struct gen_device_info
*devinfo
= p
->devinfo
;
489 brw_set_desc(p
, insn
, brw_message_desc(
490 devinfo
, 1, response_length
, true));
492 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
493 brw_inst_set_eot(devinfo
, insn
, end_of_thread
);
494 brw_inst_set_urb_opcode(devinfo
, insn
, 1); /* FF_SYNC */
495 brw_inst_set_urb_allocate(devinfo
, insn
, allocate
);
496 /* The following fields are not used by FF_SYNC: */
497 brw_inst_set_urb_global_offset(devinfo
, insn
, 0);
498 brw_inst_set_urb_swizzle_control(devinfo
, insn
, 0);
499 brw_inst_set_urb_used(devinfo
, insn
, 0);
500 brw_inst_set_urb_complete(devinfo
, insn
, 0);
503 static void brw_set_urb_message( struct brw_codegen
*p
,
505 enum brw_urb_write_flags flags
,
507 unsigned response_length
,
509 unsigned swizzle_control
)
511 const struct gen_device_info
*devinfo
= p
->devinfo
;
513 assert(devinfo
->gen
< 7 || swizzle_control
!= BRW_URB_SWIZZLE_TRANSPOSE
);
514 assert(devinfo
->gen
< 7 || !(flags
& BRW_URB_WRITE_ALLOCATE
));
515 assert(devinfo
->gen
>= 7 || !(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
517 brw_set_desc(p
, insn
, brw_message_desc(
518 devinfo
, msg_length
, response_length
, true));
520 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_URB
);
521 brw_inst_set_eot(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_EOT
));
523 if (flags
& BRW_URB_WRITE_OWORD
) {
524 assert(msg_length
== 2); /* header + one OWORD of data */
525 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_OWORD
);
527 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_HWORD
);
530 brw_inst_set_urb_global_offset(devinfo
, insn
, offset
);
531 brw_inst_set_urb_swizzle_control(devinfo
, insn
, swizzle_control
);
533 if (devinfo
->gen
< 8) {
534 brw_inst_set_urb_complete(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_COMPLETE
));
537 if (devinfo
->gen
< 7) {
538 brw_inst_set_urb_allocate(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_ALLOCATE
));
539 brw_inst_set_urb_used(devinfo
, insn
, !(flags
& BRW_URB_WRITE_UNUSED
));
541 brw_inst_set_urb_per_slot_offset(devinfo
, insn
,
542 !!(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
547 gen7_set_dp_scratch_message(struct brw_codegen
*p
,
551 bool invalidate_after_read
,
553 unsigned addr_offset
,
558 const struct gen_device_info
*devinfo
= p
->devinfo
;
559 assert(num_regs
== 1 || num_regs
== 2 || num_regs
== 4 ||
560 (devinfo
->gen
>= 8 && num_regs
== 8));
561 const unsigned block_size
= (devinfo
->gen
>= 8 ? _mesa_logbase2(num_regs
) :
564 brw_set_desc(p
, inst
, brw_message_desc(
565 devinfo
, mlen
, rlen
, header_present
));
567 brw_inst_set_sfid(devinfo
, inst
, GEN7_SFID_DATAPORT_DATA_CACHE
);
568 brw_inst_set_dp_category(devinfo
, inst
, 1); /* Scratch Block Read/Write msgs */
569 brw_inst_set_scratch_read_write(devinfo
, inst
, write
);
570 brw_inst_set_scratch_type(devinfo
, inst
, dword
);
571 brw_inst_set_scratch_invalidate_after_read(devinfo
, inst
, invalidate_after_read
);
572 brw_inst_set_scratch_block_size(devinfo
, inst
, block_size
);
573 brw_inst_set_scratch_addr_offset(devinfo
, inst
, addr_offset
);
577 brw_inst_set_state(const struct gen_device_info
*devinfo
,
579 const struct brw_insn_state
*state
)
581 brw_inst_set_exec_size(devinfo
, insn
, state
->exec_size
);
582 brw_inst_set_group(devinfo
, insn
, state
->group
);
583 brw_inst_set_compression(devinfo
, insn
, state
->compressed
);
584 brw_inst_set_access_mode(devinfo
, insn
, state
->access_mode
);
585 brw_inst_set_mask_control(devinfo
, insn
, state
->mask_control
);
586 brw_inst_set_saturate(devinfo
, insn
, state
->saturate
);
587 brw_inst_set_pred_control(devinfo
, insn
, state
->predicate
);
588 brw_inst_set_pred_inv(devinfo
, insn
, state
->pred_inv
);
590 if (is_3src(devinfo
, brw_inst_opcode(devinfo
, insn
)) &&
591 state
->access_mode
== BRW_ALIGN_16
) {
592 brw_inst_set_3src_a16_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
593 if (devinfo
->gen
>= 7)
594 brw_inst_set_3src_a16_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
596 brw_inst_set_flag_subreg_nr(devinfo
, insn
, state
->flag_subreg
% 2);
597 if (devinfo
->gen
>= 7)
598 brw_inst_set_flag_reg_nr(devinfo
, insn
, state
->flag_subreg
/ 2);
601 if (devinfo
->gen
>= 6)
602 brw_inst_set_acc_wr_control(devinfo
, insn
, state
->acc_wr_control
);
605 #define next_insn brw_next_insn
607 brw_next_insn(struct brw_codegen
*p
, unsigned opcode
)
609 const struct gen_device_info
*devinfo
= p
->devinfo
;
612 if (p
->nr_insn
+ 1 > p
->store_size
) {
614 p
->store
= reralloc(p
->mem_ctx
, p
->store
, brw_inst
, p
->store_size
);
617 p
->next_insn_offset
+= 16;
618 insn
= &p
->store
[p
->nr_insn
++];
620 memset(insn
, 0, sizeof(*insn
));
621 brw_inst_set_opcode(devinfo
, insn
, opcode
);
623 /* Apply the default instruction state */
624 brw_inst_set_state(devinfo
, insn
, p
->current
);
630 brw_alu1(struct brw_codegen
*p
, unsigned opcode
,
631 struct brw_reg dest
, struct brw_reg src
)
633 brw_inst
*insn
= next_insn(p
, opcode
);
634 brw_set_dest(p
, insn
, dest
);
635 brw_set_src0(p
, insn
, src
);
640 brw_alu2(struct brw_codegen
*p
, unsigned opcode
,
641 struct brw_reg dest
, struct brw_reg src0
, struct brw_reg src1
)
643 /* 64-bit immediates are only supported on 1-src instructions */
644 assert(src0
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src0
.type
) <= 4);
645 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src1
.type
) <= 4);
647 brw_inst
*insn
= next_insn(p
, opcode
);
648 brw_set_dest(p
, insn
, dest
);
649 brw_set_src0(p
, insn
, src0
);
650 brw_set_src1(p
, insn
, src1
);
655 get_3src_subreg_nr(struct brw_reg reg
)
657 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
658 * use 32-bit units (components 0..7). Since they only support F/D/UD
659 * types, this doesn't lose any flexibility, but uses fewer bits.
661 return reg
.subnr
/ 4;
664 static enum gen10_align1_3src_vertical_stride
665 to_3src_align1_vstride(const struct gen_device_info
*devinfo
,
666 enum brw_vertical_stride vstride
)
669 case BRW_VERTICAL_STRIDE_0
:
670 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0
;
671 case BRW_VERTICAL_STRIDE_1
:
672 assert(devinfo
->gen
>= 12);
673 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1
;
674 case BRW_VERTICAL_STRIDE_2
:
675 assert(devinfo
->gen
< 12);
676 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2
;
677 case BRW_VERTICAL_STRIDE_4
:
678 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4
;
679 case BRW_VERTICAL_STRIDE_8
:
680 case BRW_VERTICAL_STRIDE_16
:
681 return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8
;
683 unreachable("invalid vstride");
688 static enum gen10_align1_3src_src_horizontal_stride
689 to_3src_align1_hstride(enum brw_horizontal_stride hstride
)
692 case BRW_HORIZONTAL_STRIDE_0
:
693 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0
;
694 case BRW_HORIZONTAL_STRIDE_1
:
695 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1
;
696 case BRW_HORIZONTAL_STRIDE_2
:
697 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2
;
698 case BRW_HORIZONTAL_STRIDE_4
:
699 return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4
;
701 unreachable("invalid hstride");
706 brw_alu3(struct brw_codegen
*p
, unsigned opcode
, struct brw_reg dest
,
707 struct brw_reg src0
, struct brw_reg src1
, struct brw_reg src2
)
709 const struct gen_device_info
*devinfo
= p
->devinfo
;
710 brw_inst
*inst
= next_insn(p
, opcode
);
712 gen7_convert_mrf_to_grf(p
, &dest
);
714 assert(dest
.nr
< 128);
715 assert(src0
.file
== BRW_IMMEDIATE_VALUE
|| src0
.nr
< 128);
716 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
&& src1
.nr
< 128);
717 assert(src2
.file
== BRW_IMMEDIATE_VALUE
|| src2
.nr
< 128);
718 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
719 assert(src0
.address_mode
== BRW_ADDRESS_DIRECT
);
720 assert(src1
.address_mode
== BRW_ADDRESS_DIRECT
);
721 assert(src2
.address_mode
== BRW_ADDRESS_DIRECT
);
723 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
724 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
725 dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
727 if (devinfo
->gen
>= 12) {
728 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
, dest
.file
);
729 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
731 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
732 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
733 BRW_ALIGN1_3SRC_ACCUMULATOR
);
734 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
736 brw_inst_set_3src_a1_dst_reg_file(devinfo
, inst
,
737 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
);
738 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
741 brw_inst_set_3src_a1_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 8);
743 brw_inst_set_3src_a1_dst_hstride(devinfo
, inst
, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1
);
745 if (brw_reg_type_is_floating_point(dest
.type
)) {
746 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
747 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT
);
749 brw_inst_set_3src_a1_exec_type(devinfo
, inst
,
750 BRW_ALIGN1_3SRC_EXEC_TYPE_INT
);
753 brw_inst_set_3src_a1_dst_type(devinfo
, inst
, dest
.type
);
754 brw_inst_set_3src_a1_src0_type(devinfo
, inst
, src0
.type
);
755 brw_inst_set_3src_a1_src1_type(devinfo
, inst
, src1
.type
);
756 brw_inst_set_3src_a1_src2_type(devinfo
, inst
, src2
.type
);
758 brw_inst_set_3src_a1_src0_vstride(
759 devinfo
, inst
, to_3src_align1_vstride(devinfo
, src0
.vstride
));
760 brw_inst_set_3src_a1_src1_vstride(
761 devinfo
, inst
, to_3src_align1_vstride(devinfo
, src1
.vstride
));
762 /* no vstride on src2 */
764 brw_inst_set_3src_a1_src0_hstride(devinfo
, inst
,
765 to_3src_align1_hstride(src0
.hstride
));
766 brw_inst_set_3src_a1_src1_hstride(devinfo
, inst
,
767 to_3src_align1_hstride(src1
.hstride
));
768 brw_inst_set_3src_a1_src2_hstride(devinfo
, inst
,
769 to_3src_align1_hstride(src2
.hstride
));
771 brw_inst_set_3src_a1_src0_subreg_nr(devinfo
, inst
, src0
.subnr
);
772 if (src0
.type
== BRW_REGISTER_TYPE_NF
) {
773 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
775 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
777 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
778 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
780 brw_inst_set_3src_a1_src1_subreg_nr(devinfo
, inst
, src1
.subnr
);
781 if (src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
) {
782 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, BRW_ARF_ACCUMULATOR
);
784 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
786 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
787 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
789 brw_inst_set_3src_a1_src2_subreg_nr(devinfo
, inst
, src2
.subnr
);
790 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
791 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
792 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
794 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
||
795 src0
.file
== BRW_IMMEDIATE_VALUE
||
796 (src0
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
797 src0
.type
== BRW_REGISTER_TYPE_NF
));
798 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
799 src1
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
800 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
||
801 src2
.file
== BRW_IMMEDIATE_VALUE
);
803 if (devinfo
->gen
>= 12) {
804 brw_inst_set_3src_a1_src0_reg_file(devinfo
, inst
, src0
.file
);
805 brw_inst_set_3src_a1_src1_reg_file(devinfo
, inst
, src1
.file
);
806 brw_inst_set_3src_a1_src2_reg_file(devinfo
, inst
, src2
.file
);
808 brw_inst_set_3src_a1_src0_reg_file(devinfo
, inst
,
809 src0
.file
== BRW_GENERAL_REGISTER_FILE
?
810 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
811 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
812 brw_inst_set_3src_a1_src1_reg_file(devinfo
, inst
,
813 src1
.file
== BRW_GENERAL_REGISTER_FILE
?
814 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
815 BRW_ALIGN1_3SRC_ACCUMULATOR
);
816 brw_inst_set_3src_a1_src2_reg_file(devinfo
, inst
,
817 src2
.file
== BRW_GENERAL_REGISTER_FILE
?
818 BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE
:
819 BRW_ALIGN1_3SRC_IMMEDIATE_VALUE
);
823 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
824 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
825 assert(dest
.type
== BRW_REGISTER_TYPE_F
||
826 dest
.type
== BRW_REGISTER_TYPE_DF
||
827 dest
.type
== BRW_REGISTER_TYPE_D
||
828 dest
.type
== BRW_REGISTER_TYPE_UD
||
829 (dest
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 8));
830 if (devinfo
->gen
== 6) {
831 brw_inst_set_3src_a16_dst_reg_file(devinfo
, inst
,
832 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
834 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
835 brw_inst_set_3src_a16_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
836 brw_inst_set_3src_a16_dst_writemask(devinfo
, inst
, dest
.writemask
);
838 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
);
839 brw_inst_set_3src_a16_src0_swizzle(devinfo
, inst
, src0
.swizzle
);
840 brw_inst_set_3src_a16_src0_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src0
));
841 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
842 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
843 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
844 brw_inst_set_3src_a16_src0_rep_ctrl(devinfo
, inst
,
845 src0
.vstride
== BRW_VERTICAL_STRIDE_0
);
847 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
);
848 brw_inst_set_3src_a16_src1_swizzle(devinfo
, inst
, src1
.swizzle
);
849 brw_inst_set_3src_a16_src1_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src1
));
850 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
851 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
852 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
853 brw_inst_set_3src_a16_src1_rep_ctrl(devinfo
, inst
,
854 src1
.vstride
== BRW_VERTICAL_STRIDE_0
);
856 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
);
857 brw_inst_set_3src_a16_src2_swizzle(devinfo
, inst
, src2
.swizzle
);
858 brw_inst_set_3src_a16_src2_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src2
));
859 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
860 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
861 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
862 brw_inst_set_3src_a16_src2_rep_ctrl(devinfo
, inst
,
863 src2
.vstride
== BRW_VERTICAL_STRIDE_0
);
865 if (devinfo
->gen
>= 7) {
866 /* Set both the source and destination types based on dest.type,
867 * ignoring the source register types. The MAD and LRP emitters ensure
868 * that all four types are float. The BFE and BFI2 emitters, however,
869 * may send us mixed D and UD types and want us to ignore that and use
870 * the destination type.
872 brw_inst_set_3src_a16_src_type(devinfo
, inst
, dest
.type
);
873 brw_inst_set_3src_a16_dst_type(devinfo
, inst
, dest
.type
);
875 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
877 * "Three source instructions can use operands with mixed-mode
878 * precision. When SrcType field is set to :f or :hf it defines
879 * precision for source 0 only, and fields Src1Type and Src2Type
880 * define precision for other source operands:
882 * 0b = :f. Single precision Float (32-bit).
883 * 1b = :hf. Half precision Float (16-bit)."
885 if (src1
.type
== BRW_REGISTER_TYPE_HF
)
886 brw_inst_set_3src_a16_src1_type(devinfo
, inst
, 1);
888 if (src2
.type
== BRW_REGISTER_TYPE_HF
)
889 brw_inst_set_3src_a16_src2_type(devinfo
, inst
, 1);
897 /***********************************************************************
898 * Convenience routines.
901 brw_inst *brw_##OP(struct brw_codegen *p, \
902 struct brw_reg dest, \
903 struct brw_reg src0) \
905 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
909 brw_inst *brw_##OP(struct brw_codegen *p, \
910 struct brw_reg dest, \
911 struct brw_reg src0, \
912 struct brw_reg src1) \
914 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
918 brw_inst *brw_##OP(struct brw_codegen *p, \
919 struct brw_reg dest, \
920 struct brw_reg src0, \
921 struct brw_reg src1, \
922 struct brw_reg src2) \
924 if (p->current->access_mode == BRW_ALIGN_16) { \
925 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \
926 src0.swizzle = BRW_SWIZZLE_XXXX; \
927 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \
928 src1.swizzle = BRW_SWIZZLE_XXXX; \
929 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \
930 src2.swizzle = BRW_SWIZZLE_XXXX; \
932 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
936 brw_inst *brw_##OP(struct brw_codegen *p, \
937 struct brw_reg dest, \
938 struct brw_reg src0, \
939 struct brw_reg src1, \
940 struct brw_reg src2) \
942 assert(dest.type == BRW_REGISTER_TYPE_F || \
943 dest.type == BRW_REGISTER_TYPE_DF); \
944 if (dest.type == BRW_REGISTER_TYPE_F) { \
945 assert(src0.type == BRW_REGISTER_TYPE_F); \
946 assert(src1.type == BRW_REGISTER_TYPE_F); \
947 assert(src2.type == BRW_REGISTER_TYPE_F); \
948 } else if (dest.type == BRW_REGISTER_TYPE_DF) { \
949 assert(src0.type == BRW_REGISTER_TYPE_DF); \
950 assert(src1.type == BRW_REGISTER_TYPE_DF); \
951 assert(src2.type == BRW_REGISTER_TYPE_DF); \
954 if (p->current->access_mode == BRW_ALIGN_16) { \
955 if (src0.vstride == BRW_VERTICAL_STRIDE_0) \
956 src0.swizzle = BRW_SWIZZLE_XXXX; \
957 if (src1.vstride == BRW_VERTICAL_STRIDE_0) \
958 src1.swizzle = BRW_SWIZZLE_XXXX; \
959 if (src2.vstride == BRW_VERTICAL_STRIDE_0) \
960 src2.swizzle = BRW_SWIZZLE_XXXX; \
962 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
965 /* Rounding operations (other than RNDD) require two instructions - the first
966 * stores a rounded value (possibly the wrong way) in the dest register, but
967 * also sets a per-channel "increment bit" in the flag register. A predicated
968 * add of 1.0 fixes dest to contain the desired result.
970 * Sandybridge and later appear to round correctly without an ADD.
973 void brw_##OP(struct brw_codegen *p, \
974 struct brw_reg dest, \
975 struct brw_reg src) \
977 const struct gen_device_info *devinfo = p->devinfo; \
978 brw_inst *rnd, *add; \
979 rnd = next_insn(p, BRW_OPCODE_##OP); \
980 brw_set_dest(p, rnd, dest); \
981 brw_set_src0(p, rnd, src); \
983 if (devinfo->gen < 6) { \
984 /* turn on round-increments */ \
985 brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R); \
986 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
987 brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL); \
/**
 * Emit a MOV instruction.
 *
 * Mostly a thin wrapper around brw_alu1(), but applies a source-region
 * workaround for F/D/UD -> DF conversions on Ivybridge/Baytrail.
 */
brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      /* Only packed regions (vstride == width + hstride) are rewritten. */
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}
/**
 * Emit an ADD instruction.
 *
 * Asserts that a float (or immediate VF) source is not mixed with a 32-bit
 * integer (D/UD) source on the other operand.
 */
brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* A float source forbids a D/UD source on the other operand. */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}
/**
 * Emit an AVG (integer average) instruction.
 *
 * The switch admits only the integer types B/UB/W/UW/D/UD; dest and both
 * sources must all share the same type.
 */
brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      /* Any non-integer type is invalid for AVG. */
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}
/**
 * Emit a MUL instruction.
 *
 * Enforces type-mixing restrictions (no 32-bit integer source with a float
 * destination, no float source mixed with a D/UD source) and forbids the
 * accumulator as a source.
 */
brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* A 32-bit integer source may not produce a float destination. */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   /* A float source forbids a D/UD source on the other operand. */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* The accumulator is not a valid source for MUL. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1131 brw_LINE(struct brw_codegen
*p
, struct brw_reg dest
,
1132 struct brw_reg src0
, struct brw_reg src1
)
1134 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1135 src0
.width
= BRW_WIDTH_1
;
1136 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1137 return brw_alu2(p
, BRW_OPCODE_LINE
, dest
, src0
, src1
);
1141 brw_PLN(struct brw_codegen
*p
, struct brw_reg dest
,
1142 struct brw_reg src0
, struct brw_reg src1
)
1144 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1145 src0
.width
= BRW_WIDTH_1
;
1146 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1147 src1
.vstride
= BRW_VERTICAL_STRIDE_8
;
1148 src1
.width
= BRW_WIDTH_8
;
1149 src1
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
1150 return brw_alu2(p
, BRW_OPCODE_PLN
, dest
, src0
, src1
);
/**
 * Emit a float(32) -> half-float(16) conversion.
 *
 * On Gen8+ this is a converting MOV to HF; on Gen7 it is the dedicated
 * F32TO16 instruction. When the destination is UD, the upper 16 bits of
 * each channel are explicitly zero-filled where the hardware does not do
 * so itself.
 */
void brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      /* View the UD destination as every other W so the second MOV below
       * can write zeros into the high halves.
       */
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The dependency-control flags let the zero-fill MOV issue without a
       * false dependency on the conversion above.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
}
/**
 * Emit a half-float(16) -> float(32) conversion.
 *
 * On Gen8+ this is a converting MOV from HF; on Gen7 it is the dedicated
 * F16TO32 instruction.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1232 void brw_NOP(struct brw_codegen
*p
)
1234 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_NOP
);
1235 memset(insn
, 0, sizeof(*insn
));
1236 brw_inst_set_opcode(p
->devinfo
, insn
, BRW_OPCODE_NOP
);
1243 /***********************************************************************
1244 * Comparisons, if/else/endif
/**
 * Emit an (optionally predicated) JMPI: ip := ip + index.
 *
 * JMPI is a scalar, unmasked jump; exec size 1 with the execution mask
 * disabled.
 */
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}
/**
 * Push an IF/ELSE instruction onto the if-stack, growing the stack
 * storage when it fills up.
 *
 * The stack stores indices into p->store rather than pointers, because
 * next_insn() may reallocate the instruction store.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}
1277 pop_if_stack(struct brw_codegen
*p
)
1279 p
->if_stack_depth
--;
1280 return &p
->store
[p
->if_stack
[p
->if_stack_depth
]];
/**
 * Push a DO instruction onto the loop stack, growing both the loop stack
 * and the per-loop if-depth array when needed.
 *
 * Indices into p->store are stored instead of pointers, since the
 * instruction store may be reallocated.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* A freshly entered loop has no enclosing IFs yet. */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1300 get_inner_do_insn(struct brw_codegen
*p
)
1302 return &p
->store
[p
->loop_stack
[p
->loop_stack_depth
- 1]];
1305 /* EU takes the value from the flag register and pushes it onto some
1306 * sort of a stack (presumably merging with any flag value already on
1307 * the stack). Within an if block, the flags at the top of the stack
1308 * control execution on each channel of the unit, eg. on each of the
1309 * 16 pixel values in our wm programs.
1311 * When the matching 'else' instruction is reached (presumably by
1312 * countdown of the instruction count patched in by our ELSE/ENDIF
1313 * functions), the relevant flags are inverted.
1315 * When the matching 'endif' instruction is reached, the flags are
1316 * popped off. If the stack is now empty, normal execution resumes.
/**
 * Emit an IF instruction with per-generation operand encoding, push it on
 * the if-stack, and bump the enclosing loop's if-depth counter.
 *
 * The jump offsets (gen4 jump count, or JIP/UIP on gen7+) are left as 0
 * here and patched later by patch_IF_ELSE().
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control forces a thread switch unless we are in
    * single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1363 /* This function is only used for gen6-style IF instructions with an
1364 * embedded comparison (conditional modifier). It is not used on gen7.
/**
 * Emit a Gen6-style IF with an embedded comparison (conditional modifier)
 * and push it onto the if-stack. Not used on gen7+.
 *
 * The jump count is left 0 and patched later by patch_IF_ELSE().
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1390 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs that
 * operate directly on the instruction pointer (jump distances are in
 * 16-byte units, hence the "* 16").
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1431 * Patch IF and ELSE instructions with appropriate jump targets.
/**
 * Patch IF and ELSE instructions with appropriate jump targets once the
 * matching ENDIF position is known. Jump distances are scaled by
 * brw_jump_scale() for the target generation.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
/**
 * Emit an ELSE instruction with per-generation operand encoding and push
 * it onto the if-stack. Jump targets are left 0 and patched later by
 * patch_IF_ELSE().
 */
brw_inst *
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control forces a thread switch outside SPF mode. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   return insn;
}
/**
 * Close the innermost IF/ELSE block: pop the IF (and optional ELSE) off
 * the if-stack, emit an ENDIF (or convert the pair to ADDs in pre-gen6
 * SPF mode), and patch the jump targets.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
/**
 * Emit a BREAK instruction. On pre-gen6 the instruction also pops the
 * enclosing IF nesting (gen4 pop count); jump targets are patched later
 * (see brw_patch_break_cont / brw_set_uip_jip).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
/**
 * Emit a CONTINUE instruction. On pre-gen6 the instruction also carries
 * the enclosing IF nesting as a gen4 pop count; jump targets are patched
 * later.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
/**
 * Emit a HALT instruction (gen6+ encoding). The UIP/JIP targets are
 * written as 0 here and updated later by the caller.
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->gen < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1714 * The DO/WHILE is just an unterminated loop -- break or continue are
1715 * used for control within the loop. We have a few ways they can be
1718 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1719 * jip and no DO instruction.
1721 * For non-uniform control flow pre-gen6, there's a DO instruction to
1722 * push the mask, and a WHILE to jump back, and BREAK to get out and
1725 * For gen6, there's no more mask stack, so no need for DO. WHILE
1726 * just points back to the first instruction of the loop.
/**
 * Open a loop. On gen6+ (and in SPF mode) no DO instruction exists, so
 * only the loop stack is updated and the position of the loop's first
 * instruction is returned; on earlier gens an actual DO is emitted to
 * push the execution mask.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1756 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1759 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1760 * nesting, since it can always just point to the end of the block/current loop.
/**
 * Pre-gen6 only: walk back from the WHILE to the matching DO and patch
 * every still-unpatched BREAK/CONTINUE in between to jump relative to
 * the WHILE.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->gen < 6);

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself. */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
/**
 * Close the innermost loop with a backward-jumping WHILE (or, in pre-gen6
 * SPF mode, an IP-relative ADD), patch pre-gen6 BREAK/CONTINUE targets,
 * and pop the loop stack.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->gen < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         /* Backward jump: do_insn precedes insn, so JIP is negative. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* SPF mode: a plain scalar ADD on IP jumps back to the loop top. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
/**
 * Patch an earlier immediate JMPI (at index jmp_insn_idx in p->store) to
 * land on the current end of the instruction stream.
 */
void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
   unsigned jmpi = 1;

   /* Gen5+ counts jump distances in halves of a 128-bit instruction. */
   if (devinfo->gen >= 5)
      jmpi = 2;

   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);

   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
}
1868 /* To integrate with the above, it makes sense that the comparison
1869 * instruction should populate the flag register. It might be simpler
1870 * just to use the flag reg for most WM tasks?
/**
 * Emit a CMP instruction that writes the given conditional's result to the
 * flag register (and dest).
 */
void brw_CMP(struct brw_codegen *p,
             struct brw_reg dest,
             unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}
1901 /***********************************************************************
1902 * Helpers for the various SEND message types:
1905 /** Extended math function, float[8].
1907 void gen4_math(struct brw_codegen
*p
,
1908 struct brw_reg dest
,
1910 unsigned msg_reg_nr
,
1912 unsigned precision
)
1914 const struct gen_device_info
*devinfo
= p
->devinfo
;
1915 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
1917 if (has_scalar_region(src
)) {
1918 data_type
= BRW_MATH_DATA_SCALAR
;
1920 data_type
= BRW_MATH_DATA_VECTOR
;
1923 assert(devinfo
->gen
< 6);
1925 /* Example code doesn't set predicate_control for send
1928 brw_inst_set_pred_control(devinfo
, insn
, 0);
1929 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
1931 brw_set_dest(p
, insn
, dest
);
1932 brw_set_src0(p
, insn
, src
);
1933 brw_set_math_message(p
,
1936 src
.type
== BRW_REGISTER_TYPE_D
,
1941 void gen6_math(struct brw_codegen
*p
,
1942 struct brw_reg dest
,
1944 struct brw_reg src0
,
1945 struct brw_reg src1
)
1947 const struct gen_device_info
*devinfo
= p
->devinfo
;
1948 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_MATH
);
1950 assert(devinfo
->gen
>= 6);
1952 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
1953 (devinfo
->gen
>= 7 && dest
.file
== BRW_MESSAGE_REGISTER_FILE
));
1955 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1956 if (devinfo
->gen
== 6) {
1957 assert(src0
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1958 assert(src1
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1961 if (function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
||
1962 function
== BRW_MATH_FUNCTION_INT_DIV_REMAINDER
||
1963 function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
) {
1964 assert(src0
.type
!= BRW_REGISTER_TYPE_F
);
1965 assert(src1
.type
!= BRW_REGISTER_TYPE_F
);
1966 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
1967 (devinfo
->gen
>= 8 && src1
.file
== BRW_IMMEDIATE_VALUE
));
1969 assert(src0
.type
== BRW_REGISTER_TYPE_F
||
1970 (src0
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 9));
1971 assert(src1
.type
== BRW_REGISTER_TYPE_F
||
1972 (src1
.type
== BRW_REGISTER_TYPE_HF
&& devinfo
->gen
>= 9));
1975 /* Source modifiers are ignored for extended math instructions on Gen6. */
1976 if (devinfo
->gen
== 6) {
1977 assert(!src0
.negate
);
1979 assert(!src1
.negate
);
1983 brw_inst_set_math_function(devinfo
, insn
, function
);
1985 brw_set_dest(p
, insn
, dest
);
1986 brw_set_src0(p
, insn
, src0
);
1987 brw_set_src1(p
, insn
, src1
);
1991 * Return the right surface index to access the thread scratch space using
1992 * stateless dataport messages.
1995 brw_scratch_surface_idx(const struct brw_codegen
*p
)
1997 /* The scratch space is thread-local so IA coherency is unnecessary. */
1998 if (p
->devinfo
->gen
>= 8)
1999 return GEN8_BTI_STATELESS_NON_COHERENT
;
2001 return BRW_BTI_STATELESS
;
2005 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2006 * using a constant offset per channel.
2008 * The offset must be aligned to oword size (16 bytes). Used for
2009 * register spilling.
2011 void brw_oword_block_write_scratch(struct brw_codegen
*p
,
2016 const struct gen_device_info
*devinfo
= p
->devinfo
;
2017 const unsigned target_cache
=
2018 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2019 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2020 BRW_SFID_DATAPORT_WRITE
);
2023 if (devinfo
->gen
>= 6)
2026 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2028 const unsigned mlen
= 1 + num_regs
;
2030 /* Set up the message header. This is g0, with g0.2 filled with
2031 * the offset. We don't want to leave our offset around in g0 or
2032 * it'll screw up texture samples, so set it up inside the message
2036 brw_push_insn_state(p
);
2037 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2038 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2039 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2041 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2043 /* set message header global offset field (reg 0, element 2) */
2044 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2046 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2048 2), BRW_REGISTER_TYPE_UD
),
2049 brw_imm_ud(offset
));
2051 brw_pop_insn_state(p
);
2055 struct brw_reg dest
;
2056 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2057 int send_commit_msg
;
2058 struct brw_reg src_header
= retype(brw_vec8_grf(0, 0),
2059 BRW_REGISTER_TYPE_UW
);
2061 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2062 brw_inst_set_compression(devinfo
, insn
, false);
2064 if (brw_inst_exec_size(devinfo
, insn
) >= 16)
2065 src_header
= vec16(src_header
);
2067 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
2068 if (devinfo
->gen
< 6)
2069 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2071 /* Until gen6, writes followed by reads from the same location
2072 * are not guaranteed to be ordered unless write_commit is set.
2073 * If set, then a no-op write is issued to the destination
2074 * register to set a dependency, and a read from the destination
2075 * can be used to ensure the ordering.
2077 * For gen6, only writes between different threads need ordering
2078 * protection. Our use of DP writes is all about register
2079 * spilling within a thread.
2081 if (devinfo
->gen
>= 6) {
2082 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2083 send_commit_msg
= 0;
2086 send_commit_msg
= 1;
2089 brw_set_dest(p
, insn
, dest
);
2090 if (devinfo
->gen
>= 6) {
2091 brw_set_src0(p
, insn
, mrf
);
2093 brw_set_src0(p
, insn
, brw_null_reg());
2096 if (devinfo
->gen
>= 6)
2097 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2099 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2101 brw_set_desc(p
, insn
,
2102 brw_message_desc(devinfo
, mlen
, send_commit_msg
, true) |
2103 brw_dp_write_desc(devinfo
, brw_scratch_surface_idx(p
),
2104 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2105 msg_type
, 0, /* not a render target */
2112 * Read a block of owords (half a GRF each) from the scratch buffer
2113 * using a constant index per channel.
2115 * Offset must be aligned to oword size (16 bytes). Used for register
2119 brw_oword_block_read_scratch(struct brw_codegen
*p
,
2120 struct brw_reg dest
,
2125 const struct gen_device_info
*devinfo
= p
->devinfo
;
2127 if (devinfo
->gen
>= 6)
2130 if (p
->devinfo
->gen
>= 7) {
2131 /* On gen 7 and above, we no longer have message registers and we can
2132 * send from any register we want. By using the destination register
2133 * for the message, we guarantee that the implied message write won't
2134 * accidentally overwrite anything. This has been a problem because
2135 * the MRF registers and source for the final FB write are both fixed
2138 mrf
= retype(dest
, BRW_REGISTER_TYPE_UD
);
2140 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2142 dest
= retype(dest
, BRW_REGISTER_TYPE_UW
);
2144 const unsigned rlen
= num_regs
;
2145 const unsigned target_cache
=
2146 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2147 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2148 BRW_SFID_DATAPORT_READ
);
2151 brw_push_insn_state(p
);
2152 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2153 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2154 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2156 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2158 /* set message header global offset field (reg 0, element 2) */
2159 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2160 brw_MOV(p
, get_element_ud(mrf
, 2), brw_imm_ud(offset
));
2162 brw_pop_insn_state(p
);
2166 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2168 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2169 assert(brw_inst_pred_control(devinfo
, insn
) == 0);
2170 brw_inst_set_compression(devinfo
, insn
, false);
2172 brw_set_dest(p
, insn
, dest
); /* UW? */
2173 if (devinfo
->gen
>= 6) {
2174 brw_set_src0(p
, insn
, mrf
);
2176 brw_set_src0(p
, insn
, brw_null_reg());
2177 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2180 brw_set_desc(p
, insn
,
2181 brw_message_desc(devinfo
, 1, rlen
, true) |
2182 brw_dp_read_desc(devinfo
, brw_scratch_surface_idx(p
),
2183 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2184 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2185 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2190 gen7_block_read_scratch(struct brw_codegen
*p
,
2191 struct brw_reg dest
,
2195 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2196 assert(brw_inst_pred_control(p
->devinfo
, insn
) == BRW_PREDICATE_NONE
);
2198 brw_set_dest(p
, insn
, retype(dest
, BRW_REGISTER_TYPE_UW
));
2200 /* The HW requires that the header is present; this is to get the g0.5
2203 brw_set_src0(p
, insn
, brw_vec8_grf(0, 0));
2205 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2206 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2207 * is 32 bytes, which happens to be the size of a register.
2210 assert(offset
< (1 << 12));
2212 gen7_set_dp_scratch_message(p
, insn
,
2213 false, /* scratch read */
2215 false, /* invalidate after read */
2218 1, /* mlen: just g0 */
2219 num_regs
, /* rlen */
2220 true); /* header present */
2224 * Read float[4] vectors from the data port constant cache.
2225 * Location (in buffer) should be a multiple of 16.
2226 * Used for fetching shader constants.
2228 void brw_oword_block_read(struct brw_codegen
*p
,
2229 struct brw_reg dest
,
2232 uint32_t bind_table_index
)
2234 const struct gen_device_info
*devinfo
= p
->devinfo
;
2235 const unsigned target_cache
=
2236 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE
:
2237 BRW_SFID_DATAPORT_READ
);
2238 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
2240 /* On newer hardware, offset is in units of owords. */
2241 if (devinfo
->gen
>= 6)
2244 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2246 brw_push_insn_state(p
);
2247 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2248 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2249 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2251 brw_push_insn_state(p
);
2252 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2253 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2255 /* set message header global offset field (reg 0, element 2) */
2256 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2258 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2260 2), BRW_REGISTER_TYPE_UD
),
2261 brw_imm_ud(offset
));
2262 brw_pop_insn_state(p
);
2264 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2266 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2268 /* cast dest to a uword[8] vector */
2269 dest
= retype(vec8(dest
), BRW_REGISTER_TYPE_UW
);
2271 brw_set_dest(p
, insn
, dest
);
2272 if (devinfo
->gen
>= 6) {
2273 brw_set_src0(p
, insn
, mrf
);
2275 brw_set_src0(p
, insn
, brw_null_reg());
2276 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2279 brw_set_desc(p
, insn
,
2280 brw_message_desc(devinfo
, 1, DIV_ROUND_UP(exec_size
, 8), true) |
2281 brw_dp_read_desc(devinfo
, bind_table_index
,
2282 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size
),
2283 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2284 BRW_DATAPORT_READ_TARGET_DATA_CACHE
));
2286 brw_pop_insn_state(p
);
2290 brw_fb_WRITE(struct brw_codegen
*p
,
2291 struct brw_reg payload
,
2292 struct brw_reg implied_header
,
2293 unsigned msg_control
,
2294 unsigned binding_table_index
,
2295 unsigned msg_length
,
2296 unsigned response_length
,
2298 bool last_render_target
,
2299 bool header_present
)
2301 const struct gen_device_info
*devinfo
= p
->devinfo
;
2302 const unsigned target_cache
=
2303 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2304 BRW_SFID_DATAPORT_WRITE
);
2307 struct brw_reg dest
, src0
;
2309 if (brw_get_default_exec_size(p
) >= BRW_EXECUTE_16
)
2310 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2312 dest
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2314 if (devinfo
->gen
>= 6) {
2315 insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2317 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2319 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2320 brw_inst_set_compression(devinfo
, insn
, false);
2322 if (devinfo
->gen
>= 6) {
2323 /* headerless version, just submit color payload */
2326 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2328 assert(payload
.file
== BRW_MESSAGE_REGISTER_FILE
);
2329 brw_inst_set_base_mrf(devinfo
, insn
, payload
.nr
);
2330 src0
= implied_header
;
2332 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2335 brw_set_dest(p
, insn
, dest
);
2336 brw_set_src0(p
, insn
, src0
);
2337 brw_set_desc(p
, insn
,
2338 brw_message_desc(devinfo
, msg_length
, response_length
,
2340 brw_dp_write_desc(devinfo
, binding_table_index
, msg_control
,
2341 msg_type
, last_render_target
,
2342 0 /* send_commit_msg */));
2343 brw_inst_set_eot(devinfo
, insn
, eot
);
2349 gen9_fb_READ(struct brw_codegen
*p
,
2351 struct brw_reg payload
,
2352 unsigned binding_table_index
,
2353 unsigned msg_length
,
2354 unsigned response_length
,
2357 const struct gen_device_info
*devinfo
= p
->devinfo
;
2358 assert(devinfo
->gen
>= 9);
2359 const unsigned msg_subtype
=
2360 brw_get_default_exec_size(p
) == BRW_EXECUTE_16
? 0 : 1;
2361 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2363 brw_inst_set_sfid(devinfo
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
);
2364 brw_set_dest(p
, insn
, dst
);
2365 brw_set_src0(p
, insn
, payload
);
2368 brw_message_desc(devinfo
, msg_length
, response_length
, true) |
2369 brw_dp_read_desc(devinfo
, binding_table_index
,
2370 per_sample
<< 5 | msg_subtype
,
2371 GEN9_DATAPORT_RC_RENDER_TARGET_READ
,
2372 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
));
2373 brw_inst_set_rt_slot_group(devinfo
, insn
, brw_get_default_group(p
) / 16);
2379 * Texture sample instruction.
2380 * Note: the msg_type plus msg_length values determine exactly what kind
2381 * of sampling operation is performed. See volume 4, page 161 of docs.
2383 void brw_SAMPLE(struct brw_codegen
*p
,
2384 struct brw_reg dest
,
2385 unsigned msg_reg_nr
,
2386 struct brw_reg src0
,
2387 unsigned binding_table_index
,
2390 unsigned response_length
,
2391 unsigned msg_length
,
2392 unsigned header_present
,
2394 unsigned return_format
)
2396 const struct gen_device_info
*devinfo
= p
->devinfo
;
2399 if (msg_reg_nr
!= -1)
2400 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2402 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2403 brw_inst_set_sfid(devinfo
, insn
, BRW_SFID_SAMPLER
);
2404 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
); /* XXX */
2406 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2408 * "Instruction compression is not allowed for this instruction (that
2409 * is, send). The hardware behavior is undefined if this instruction is
2410 * set as compressed. However, compress control can be set to "SecHalf"
2411 * to affect the EMask generation."
2413 * No similar wording is found in later PRMs, but there are examples
2414 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2415 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2416 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2418 brw_inst_set_compression(devinfo
, insn
, false);
2420 if (devinfo
->gen
< 6)
2421 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2423 brw_set_dest(p
, insn
, dest
);
2424 brw_set_src0(p
, insn
, src0
);
2425 brw_set_desc(p
, insn
,
2426 brw_message_desc(devinfo
, msg_length
, response_length
,
2428 brw_sampler_desc(devinfo
, binding_table_index
, sampler
,
2429 msg_type
, simd_mode
, return_format
));
2432 /* Adjust the message header's sampler state pointer to
2433 * select the correct group of 16 samplers.
2435 void brw_adjust_sampler_state_pointer(struct brw_codegen
*p
,
2436 struct brw_reg header
,
2437 struct brw_reg sampler_index
)
2439 /* The "Sampler Index" field can only store values between 0 and 15.
2440 * However, we can add an offset to the "Sampler State Pointer"
2441 * field, effectively selecting a different set of 16 samplers.
2443 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2444 * offset, and each sampler state is only 16-bytes, so we can't
2445 * exclusively use the offset - we have to use both.
2448 const struct gen_device_info
*devinfo
= p
->devinfo
;
2450 if (sampler_index
.file
== BRW_IMMEDIATE_VALUE
) {
2451 const int sampler_state_size
= 16; /* 16 bytes */
2452 uint32_t sampler
= sampler_index
.ud
;
2454 if (sampler
>= 16) {
2455 assert(devinfo
->is_haswell
|| devinfo
->gen
>= 8);
2457 get_element_ud(header
, 3),
2458 get_element_ud(brw_vec8_grf(0, 0), 3),
2459 brw_imm_ud(16 * (sampler
/ 16) * sampler_state_size
));
2462 /* Non-const sampler array indexing case */
2463 if (devinfo
->gen
< 8 && !devinfo
->is_haswell
) {
2467 struct brw_reg temp
= get_element_ud(header
, 3);
2469 brw_AND(p
, temp
, get_element_ud(sampler_index
, 0), brw_imm_ud(0x0f0));
2470 brw_SHL(p
, temp
, temp
, brw_imm_ud(4));
2472 get_element_ud(header
, 3),
2473 get_element_ud(brw_vec8_grf(0, 0), 3),
2478 /* All these variables are pretty confusing - we might be better off
2479 * using bitmasks and macros for this, in the old style. Or perhaps
2480 * just having the caller instantiate the fields in dword3 itself.
2482 void brw_urb_WRITE(struct brw_codegen
*p
,
2483 struct brw_reg dest
,
2484 unsigned msg_reg_nr
,
2485 struct brw_reg src0
,
2486 enum brw_urb_write_flags flags
,
2487 unsigned msg_length
,
2488 unsigned response_length
,
2492 const struct gen_device_info
*devinfo
= p
->devinfo
;
2495 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2497 if (devinfo
->gen
>= 7 && !(flags
& BRW_URB_WRITE_USE_CHANNEL_MASKS
)) {
2498 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2499 brw_push_insn_state(p
);
2500 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2501 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2502 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2503 brw_OR(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, msg_reg_nr
, 5),
2504 BRW_REGISTER_TYPE_UD
),
2505 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD
),
2506 brw_imm_ud(0xff00));
2507 brw_pop_insn_state(p
);
2510 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2512 assert(msg_length
< BRW_MAX_MRF(devinfo
->gen
));
2514 brw_set_dest(p
, insn
, dest
);
2515 brw_set_src0(p
, insn
, src0
);
2516 brw_set_src1(p
, insn
, brw_imm_d(0));
2518 if (devinfo
->gen
< 6)
2519 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2521 brw_set_urb_message(p
,
2531 brw_send_indirect_message(struct brw_codegen
*p
,
2534 struct brw_reg payload
,
2535 struct brw_reg desc
,
2539 const struct gen_device_info
*devinfo
= p
->devinfo
;
2540 struct brw_inst
*send
;
2542 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2544 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2546 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2547 send
= next_insn(p
, BRW_OPCODE_SEND
);
2548 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2549 brw_set_desc(p
, send
, desc
.ud
| desc_imm
);
2551 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2553 brw_push_insn_state(p
);
2554 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2555 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2556 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2557 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2559 /* Load the indirect descriptor to an address register using OR so the
2560 * caller can specify additional descriptor bits with the desc_imm
2563 brw_OR(p
, addr
, desc
, brw_imm_ud(desc_imm
));
2565 brw_pop_insn_state(p
);
2567 send
= next_insn(p
, BRW_OPCODE_SEND
);
2568 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2569 brw_set_src1(p
, send
, addr
);
2572 brw_set_dest(p
, send
, dst
);
2573 brw_inst_set_sfid(devinfo
, send
, sfid
);
2574 brw_inst_set_eot(devinfo
, send
, eot
);
2578 brw_send_indirect_split_message(struct brw_codegen
*p
,
2581 struct brw_reg payload0
,
2582 struct brw_reg payload1
,
2583 struct brw_reg desc
,
2585 struct brw_reg ex_desc
,
2586 unsigned ex_desc_imm
,
2589 const struct gen_device_info
*devinfo
= p
->devinfo
;
2590 struct brw_inst
*send
;
2592 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2594 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2596 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2597 desc
.ud
|= desc_imm
;
2599 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2601 brw_push_insn_state(p
);
2602 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2603 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2604 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2605 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2607 /* Load the indirect descriptor to an address register using OR so the
2608 * caller can specify additional descriptor bits with the desc_imm
2611 brw_OR(p
, addr
, desc
, brw_imm_ud(desc_imm
));
2613 brw_pop_insn_state(p
);
2617 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
&&
2618 (ex_desc
.ud
& INTEL_MASK(15, 12)) == 0) {
2619 ex_desc
.ud
|= ex_desc_imm
;
2621 struct brw_reg addr
= retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD
);
2623 brw_push_insn_state(p
);
2624 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2625 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2626 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2627 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2629 /* Load the indirect extended descriptor to an address register using OR
2630 * so the caller can specify additional descriptor bits with the
2631 * desc_imm immediate.
2633 * Even though the instruction dispatcher always pulls the SFID and EOT
2634 * fields from the instruction itself, actual external unit which
2635 * processes the message gets the SFID and EOT from the extended
2636 * descriptor which comes from the address register. If we don't OR
2637 * those two bits in, the external unit may get confused and hang.
2639 unsigned imm_part
= ex_desc_imm
| sfid
| eot
<< 5;
2641 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
) {
2642 /* ex_desc bits 15:12 don't exist in the instruction encoding, so
2643 * we may have fallen back to an indirect extended descriptor.
2645 brw_MOV(p
, addr
, brw_imm_ud(ex_desc
.ud
| imm_part
));
2647 brw_OR(p
, addr
, ex_desc
, brw_imm_ud(imm_part
));
2650 brw_pop_insn_state(p
);
2654 send
= next_insn(p
, BRW_OPCODE_SENDS
);
2655 brw_set_dest(p
, send
, dst
);
2656 brw_set_src0(p
, send
, retype(payload0
, BRW_REGISTER_TYPE_UD
));
2657 brw_set_src1(p
, send
, retype(payload1
, BRW_REGISTER_TYPE_UD
));
2659 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2660 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, 0);
2661 brw_inst_set_send_desc(devinfo
, send
, desc
.ud
);
2663 assert(desc
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
2664 assert(desc
.nr
== BRW_ARF_ADDRESS
);
2665 assert(desc
.subnr
== 0);
2666 brw_inst_set_send_sel_reg32_desc(devinfo
, send
, 1);
2669 if (ex_desc
.file
== BRW_IMMEDIATE_VALUE
) {
2670 brw_inst_set_send_sel_reg32_ex_desc(devinfo
, send
, 0);
2671 brw_inst_set_sends_ex_desc(devinfo
, send
, ex_desc
.ud
);
2673 assert(ex_desc
.file
== BRW_ARCHITECTURE_REGISTER_FILE
);
2674 assert(ex_desc
.nr
== BRW_ARF_ADDRESS
);
2675 assert((ex_desc
.subnr
& 0x3) == 0);
2676 brw_inst_set_send_sel_reg32_ex_desc(devinfo
, send
, 1);
2677 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo
, send
, ex_desc
.subnr
>> 2);
2680 brw_inst_set_sfid(devinfo
, send
, sfid
);
2681 brw_inst_set_eot(devinfo
, send
, eot
);
2685 brw_send_indirect_surface_message(struct brw_codegen
*p
,
2688 struct brw_reg payload
,
2689 struct brw_reg surface
,
2692 if (surface
.file
!= BRW_IMMEDIATE_VALUE
) {
2693 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2695 brw_push_insn_state(p
);
2696 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2697 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2698 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
2699 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2701 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2702 * some surface array is accessed out of bounds.
2705 suboffset(vec1(retype(surface
, BRW_REGISTER_TYPE_UD
)),
2706 BRW_GET_SWZ(surface
.swizzle
, 0)),
2709 brw_pop_insn_state(p
);
2714 brw_send_indirect_message(p
, sfid
, dst
, payload
, surface
, desc_imm
, false);
2718 while_jumps_before_offset(const struct gen_device_info
*devinfo
,
2719 brw_inst
*insn
, int while_offset
, int start_offset
)
2721 int scale
= 16 / brw_jump_scale(devinfo
);
2722 int jip
= devinfo
->gen
== 6 ? brw_inst_gen6_jump_count(devinfo
, insn
)
2723 : brw_inst_jip(devinfo
, insn
);
2725 return while_offset
+ jip
* scale
<= start_offset
;
2730 brw_find_next_block_end(struct brw_codegen
*p
, int start_offset
)
2733 void *store
= p
->store
;
2734 const struct gen_device_info
*devinfo
= p
->devinfo
;
2738 for (offset
= next_offset(devinfo
, store
, start_offset
);
2739 offset
< p
->next_insn_offset
;
2740 offset
= next_offset(devinfo
, store
, offset
)) {
2741 brw_inst
*insn
= store
+ offset
;
2743 switch (brw_inst_opcode(devinfo
, insn
)) {
2747 case BRW_OPCODE_ENDIF
:
2752 case BRW_OPCODE_WHILE
:
2753 /* If the while doesn't jump before our instruction, it's the end
2754 * of a sibling do...while loop. Ignore it.
2756 if (!while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2759 case BRW_OPCODE_ELSE
:
2760 case BRW_OPCODE_HALT
:
2771 /* There is no DO instruction on gen6, so to find the end of the loop
2772 * we have to see if the loop is jumping back before our start
2776 brw_find_loop_end(struct brw_codegen
*p
, int start_offset
)
2778 const struct gen_device_info
*devinfo
= p
->devinfo
;
2780 void *store
= p
->store
;
2782 assert(devinfo
->gen
>= 6);
2784 /* Always start after the instruction (such as a WHILE) we're trying to fix
2787 for (offset
= next_offset(devinfo
, store
, start_offset
);
2788 offset
< p
->next_insn_offset
;
2789 offset
= next_offset(devinfo
, store
, offset
)) {
2790 brw_inst
*insn
= store
+ offset
;
2792 if (brw_inst_opcode(devinfo
, insn
) == BRW_OPCODE_WHILE
) {
2793 if (while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2797 assert(!"not reached");
2798 return start_offset
;
2801 /* After program generation, go back and update the UIP and JIP of
2802 * BREAK, CONT, and HALT instructions to their correct locations.
2805 brw_set_uip_jip(struct brw_codegen
*p
, int start_offset
)
2807 const struct gen_device_info
*devinfo
= p
->devinfo
;
2809 int br
= brw_jump_scale(devinfo
);
2810 int scale
= 16 / br
;
2811 void *store
= p
->store
;
2813 if (devinfo
->gen
< 6)
2816 for (offset
= start_offset
; offset
< p
->next_insn_offset
; offset
+= 16) {
2817 brw_inst
*insn
= store
+ offset
;
2818 assert(brw_inst_cmpt_control(devinfo
, insn
) == 0);
2820 int block_end_offset
= brw_find_next_block_end(p
, offset
);
2821 switch (brw_inst_opcode(devinfo
, insn
)) {
2822 case BRW_OPCODE_BREAK
:
2823 assert(block_end_offset
!= 0);
2824 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2825 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2826 brw_inst_set_uip(devinfo
, insn
,
2827 (brw_find_loop_end(p
, offset
) - offset
+
2828 (devinfo
->gen
== 6 ? 16 : 0)) / scale
);
2830 case BRW_OPCODE_CONTINUE
:
2831 assert(block_end_offset
!= 0);
2832 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2833 brw_inst_set_uip(devinfo
, insn
,
2834 (brw_find_loop_end(p
, offset
) - offset
) / scale
);
2836 assert(brw_inst_uip(devinfo
, insn
) != 0);
2837 assert(brw_inst_jip(devinfo
, insn
) != 0);
2840 case BRW_OPCODE_ENDIF
: {
2841 int32_t jump
= (block_end_offset
== 0) ?
2842 1 * br
: (block_end_offset
- offset
) / scale
;
2843 if (devinfo
->gen
>= 7)
2844 brw_inst_set_jip(devinfo
, insn
, jump
);
2846 brw_inst_set_gen6_jump_count(devinfo
, insn
, jump
);
2850 case BRW_OPCODE_HALT
:
2851 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2853 * "In case of the halt instruction not inside any conditional
2854 * code block, the value of <JIP> and <UIP> should be the
2855 * same. In case of the halt instruction inside conditional code
2856 * block, the <UIP> should be the end of the program, and the
2857 * <JIP> should be end of the most inner conditional code block."
2859 * The uip will have already been set by whoever set up the
2862 if (block_end_offset
== 0) {
2863 brw_inst_set_jip(devinfo
, insn
, brw_inst_uip(devinfo
, insn
));
2865 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2867 assert(brw_inst_uip(devinfo
, insn
) != 0);
2868 assert(brw_inst_jip(devinfo
, insn
) != 0);
2877 void brw_ff_sync(struct brw_codegen
*p
,
2878 struct brw_reg dest
,
2879 unsigned msg_reg_nr
,
2880 struct brw_reg src0
,
2882 unsigned response_length
,
2885 const struct gen_device_info
*devinfo
= p
->devinfo
;
2888 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2890 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2891 brw_set_dest(p
, insn
, dest
);
2892 brw_set_src0(p
, insn
, src0
);
2893 brw_set_src1(p
, insn
, brw_imm_d(0));
2895 if (devinfo
->gen
< 6)
2896 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2898 brw_set_ff_sync_message(p
,
2906 * Emit the SEND instruction necessary to generate stream output data on Gen6
2907 * (for transform feedback).
2909 * If send_commit_msg is true, this is the last piece of stream output data
2910 * from this thread, so send the data as a committed write. According to the
2911 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2913 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2914 * writes are complete by sending the final write as a committed write."
2917 brw_svb_write(struct brw_codegen
*p
,
2918 struct brw_reg dest
,
2919 unsigned msg_reg_nr
,
2920 struct brw_reg src0
,
2921 unsigned binding_table_index
,
2922 bool send_commit_msg
)
2924 const struct gen_device_info
*devinfo
= p
->devinfo
;
2925 const unsigned target_cache
=
2926 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2927 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2928 BRW_SFID_DATAPORT_WRITE
);
2931 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2933 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2934 brw_inst_set_sfid(devinfo
, insn
, target_cache
);
2935 brw_set_dest(p
, insn
, dest
);
2936 brw_set_src0(p
, insn
, src0
);
2937 brw_set_desc(p
, insn
,
2938 brw_message_desc(devinfo
, 1, send_commit_msg
, true) |
2939 brw_dp_write_desc(devinfo
, binding_table_index
,
2940 0, /* msg_control: ignored */
2941 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE
,
2942 0, /* last_render_target: ignored */
2943 send_commit_msg
)); /* send_commit_msg */
/* Number of response registers for a surface message: one register per
 * channel for SIMD8 and below, two for SIMD16; exec_size == 0 denotes
 * SIMD4x2, which packs everything into a single register.
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}
2960 brw_untyped_atomic(struct brw_codegen
*p
,
2962 struct brw_reg payload
,
2963 struct brw_reg surface
,
2965 unsigned msg_length
,
2966 bool response_expected
,
2967 bool header_present
)
2969 const struct gen_device_info
*devinfo
= p
->devinfo
;
2970 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
2971 HSW_SFID_DATAPORT_DATA_CACHE_1
:
2972 GEN7_SFID_DATAPORT_DATA_CACHE
);
2973 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
2974 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
2975 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
2976 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
2977 has_simd4x2
? 0 : 8;
2978 const unsigned response_length
=
2979 brw_surface_payload_size(p
, response_expected
, exec_size
);
2980 const unsigned desc
=
2981 brw_message_desc(devinfo
, msg_length
, response_length
, header_present
) |
2982 brw_dp_untyped_atomic_desc(devinfo
, exec_size
, atomic_op
,
2984 /* Mask out unused components -- This is especially important in Align16
2985 * mode on generations that don't have native support for SIMD4x2 atomics,
2986 * because unused but enabled components will cause the dataport to perform
2987 * additional atomic operations on the addresses that happen to be in the
2988 * uninitialized Y, Z and W coordinates of the payload.
2990 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
2992 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(dst
, mask
),
2993 payload
, surface
, desc
);
2997 brw_untyped_surface_read(struct brw_codegen
*p
,
2999 struct brw_reg payload
,
3000 struct brw_reg surface
,
3001 unsigned msg_length
,
3002 unsigned num_channels
)
3004 const struct gen_device_info
*devinfo
= p
->devinfo
;
3005 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3006 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3007 GEN7_SFID_DATAPORT_DATA_CACHE
);
3008 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3009 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) : 0;
3010 const unsigned response_length
=
3011 brw_surface_payload_size(p
, num_channels
, exec_size
);
3012 const unsigned desc
=
3013 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3014 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, false);
3016 brw_send_indirect_surface_message(p
, sfid
, dst
, payload
, surface
, desc
);
3020 brw_untyped_surface_write(struct brw_codegen
*p
,
3021 struct brw_reg payload
,
3022 struct brw_reg surface
,
3023 unsigned msg_length
,
3024 unsigned num_channels
,
3025 bool header_present
)
3027 const struct gen_device_info
*devinfo
= p
->devinfo
;
3028 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3029 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3030 GEN7_SFID_DATAPORT_DATA_CACHE
);
3031 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3032 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3033 const bool has_simd4x2
= devinfo
->gen
>= 8 || devinfo
->is_haswell
;
3034 const unsigned exec_size
= align1
? 1 << brw_get_default_exec_size(p
) :
3035 has_simd4x2
? 0 : 8;
3036 const unsigned desc
=
3037 brw_message_desc(devinfo
, msg_length
, 0, header_present
) |
3038 brw_dp_untyped_surface_rw_desc(devinfo
, exec_size
, num_channels
, true);
3039 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3040 const unsigned mask
= !has_simd4x2
&& !align1
? WRITEMASK_X
: WRITEMASK_XYZW
;
3042 brw_send_indirect_surface_message(p
, sfid
, brw_writemask(brw_null_reg(), mask
),
3043 payload
, surface
, desc
);
3047 brw_set_memory_fence_message(struct brw_codegen
*p
,
3048 struct brw_inst
*insn
,
3049 enum brw_message_target sfid
,
3053 const struct gen_device_info
*devinfo
= p
->devinfo
;
3055 brw_set_desc(p
, insn
, brw_message_desc(
3056 devinfo
, 1, (commit_enable
? 1 : 0), true));
3058 brw_inst_set_sfid(devinfo
, insn
, sfid
);
3061 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
3062 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_RC_MEMORY_FENCE
);
3064 case GEN7_SFID_DATAPORT_DATA_CACHE
:
3065 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_DC_MEMORY_FENCE
);
3068 unreachable("Not reached");
3072 brw_inst_set_dp_msg_control(devinfo
, insn
, 1 << 5);
3074 assert(devinfo
->gen
>= 11 || bti
== 0);
3075 brw_inst_set_binding_table_index(devinfo
, insn
, bti
);
3079 brw_memory_fence(struct brw_codegen
*p
,
3082 enum opcode send_op
,
3086 const struct gen_device_info
*devinfo
= p
->devinfo
;
3087 const bool commit_enable
= stall
||
3088 devinfo
->gen
>= 10 || /* HSD ES # 1404612949 */
3089 (devinfo
->gen
== 7 && !devinfo
->is_haswell
);
3090 struct brw_inst
*insn
;
3092 brw_push_insn_state(p
);
3093 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3094 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3095 dst
= retype(vec1(dst
), BRW_REGISTER_TYPE_UW
);
3096 src
= retype(vec1(src
), BRW_REGISTER_TYPE_UD
);
3098 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3099 * message doesn't write anything back.
3101 insn
= next_insn(p
, send_op
);
3102 brw_set_dest(p
, insn
, dst
);
3103 brw_set_src0(p
, insn
, src
);
3104 brw_set_memory_fence_message(p
, insn
, GEN7_SFID_DATAPORT_DATA_CACHE
,
3105 commit_enable
, bti
);
3107 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
) {
3108 /* IVB does typed surface access through the render cache, so we need to
3109 * flush it too. Use a different register so both flushes can be
3110 * pipelined by the hardware.
3112 insn
= next_insn(p
, send_op
);
3113 brw_set_dest(p
, insn
, offset(dst
, 1));
3114 brw_set_src0(p
, insn
, src
);
3115 brw_set_memory_fence_message(p
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
,
3116 commit_enable
, bti
);
3118 /* Now write the response of the second message into the response of the
3119 * first to trigger a pipeline stall -- This way future render and data
3120 * cache messages will be properly ordered with respect to past data and
3121 * render cache messages.
3123 brw_MOV(p
, dst
, offset(dst
, 1));
3127 brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
), dst
);
3129 brw_pop_insn_state(p
);
3133 brw_pixel_interpolator_query(struct brw_codegen
*p
,
3134 struct brw_reg dest
,
3138 struct brw_reg data
,
3139 unsigned msg_length
,
3140 unsigned response_length
)
3142 const struct gen_device_info
*devinfo
= p
->devinfo
;
3143 const uint16_t exec_size
= brw_get_default_exec_size(p
);
3144 const unsigned slot_group
= brw_get_default_group(p
) / 16;
3145 const unsigned simd_mode
= (exec_size
== BRW_EXECUTE_16
);
3146 const unsigned desc
=
3147 brw_message_desc(devinfo
, msg_length
, response_length
, false) |
3148 brw_pixel_interp_desc(devinfo
, mode
, noperspective
, simd_mode
,
3151 /* brw_send_indirect_message will automatically use a direct send message
3152 * if data is actually immediate.
3154 brw_send_indirect_message(p
,
3155 GEN7_SFID_PIXEL_INTERPOLATOR
,
3164 brw_find_live_channel(struct brw_codegen
*p
, struct brw_reg dst
,
3165 struct brw_reg mask
)
3167 const struct gen_device_info
*devinfo
= p
->devinfo
;
3168 const unsigned exec_size
= 1 << brw_get_default_exec_size(p
);
3169 const unsigned qtr_control
= brw_get_default_group(p
) / 8;
3172 assert(devinfo
->gen
>= 7);
3173 assert(mask
.type
== BRW_REGISTER_TYPE_UD
);
3175 brw_push_insn_state(p
);
3177 /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3178 * unnecessary bits in the instruction words, get the information we need
3179 * and reset the default flag register. This allows more instructions to be
3182 const unsigned flag_subreg
= p
->current
->flag_subreg
;
3183 brw_set_default_flag_reg(p
, 0, 0);
3185 if (brw_get_default_access_mode(p
) == BRW_ALIGN_1
) {
3186 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3188 if (devinfo
->gen
>= 8) {
3189 /* Getting the first active channel index is easy on Gen8: Just find
3190 * the first bit set in the execution mask. The register exists on
3191 * HSW already but it reads back as all ones when the current
3192 * instruction has execution masking disabled, so it's kind of
3195 struct brw_reg exec_mask
=
3196 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
);
3198 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3199 if (mask
.file
!= BRW_IMMEDIATE_VALUE
|| mask
.ud
!= 0xffffffff) {
3200 /* Unfortunately, ce0 does not take into account the thread
3201 * dispatch mask, which may be a problem in cases where it's not
3202 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3203 * some n). Combine ce0 with the given dispatch (or vector) mask
3204 * to mask off those channels which were never dispatched by the
3207 brw_SHR(p
, vec1(dst
), mask
, brw_imm_ud(qtr_control
* 8));
3208 brw_AND(p
, vec1(dst
), exec_mask
, vec1(dst
));
3209 exec_mask
= vec1(dst
);
3212 /* Quarter control has the effect of magically shifting the value of
3213 * ce0 so you'll get the first active channel relative to the
3214 * specified quarter control as result.
3216 inst
= brw_FBL(p
, vec1(dst
), exec_mask
);
3218 const struct brw_reg flag
= brw_flag_subreg(flag_subreg
);
3220 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3221 brw_MOV(p
, retype(flag
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
3223 /* Run enough instructions returning zero with execution masking and
3224 * a conditional modifier enabled in order to get the full execution
3225 * mask in f1.0. We could use a single 32-wide move here if it
3226 * weren't because of the hardware bug that causes channel enables to
3227 * be applied incorrectly to the second half of 32-wide instructions
3230 const unsigned lower_size
= MIN2(16, exec_size
);
3231 for (unsigned i
= 0; i
< exec_size
/ lower_size
; i
++) {
3232 inst
= brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
),
3234 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3235 brw_inst_set_group(devinfo
, inst
, lower_size
* i
+ 8 * qtr_control
);
3236 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_Z
);
3237 brw_inst_set_exec_size(devinfo
, inst
, cvt(lower_size
) - 1);
3238 brw_inst_set_flag_reg_nr(devinfo
, inst
, flag_subreg
/ 2);
3239 brw_inst_set_flag_subreg_nr(devinfo
, inst
, flag_subreg
% 2);
3242 /* Find the first bit set in the exec_size-wide portion of the flag
3243 * register that was updated by the last sequence of MOV
3246 const enum brw_reg_type type
= brw_int_type(exec_size
/ 8, false);
3247 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3248 brw_FBL(p
, vec1(dst
), byte_offset(retype(flag
, type
), qtr_control
));
3251 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3253 if (devinfo
->gen
>= 8 &&
3254 mask
.file
== BRW_IMMEDIATE_VALUE
&& mask
.ud
== 0xffffffff) {
3255 /* In SIMD4x2 mode the first active channel index is just the
3256 * negation of the first bit of the mask register. Note that ce0
3257 * doesn't take into account the dispatch mask, so the Gen7 path
3258 * should be used instead unless you have the guarantee that the
3259 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3262 inst
= brw_AND(p
, brw_writemask(dst
, WRITEMASK_X
),
3263 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
)),
3267 /* Overwrite the destination without and with execution masking to
3268 * find out which of the channels is active.
3270 brw_push_insn_state(p
);
3271 brw_set_default_exec_size(p
, BRW_EXECUTE_4
);
3272 brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3275 inst
= brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3277 brw_pop_insn_state(p
);
3278 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3282 brw_pop_insn_state(p
);
3286 brw_broadcast(struct brw_codegen
*p
,
3291 const struct gen_device_info
*devinfo
= p
->devinfo
;
3292 const bool align1
= brw_get_default_access_mode(p
) == BRW_ALIGN_1
;
3295 brw_push_insn_state(p
);
3296 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3297 brw_set_default_exec_size(p
, align1
? BRW_EXECUTE_1
: BRW_EXECUTE_4
);
3299 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
&&
3300 src
.address_mode
== BRW_ADDRESS_DIRECT
);
3301 assert(!src
.abs
&& !src
.negate
);
3302 assert(src
.type
== dst
.type
);
3304 if ((src
.vstride
== 0 && (src
.hstride
== 0 || !align1
)) ||
3305 idx
.file
== BRW_IMMEDIATE_VALUE
) {
3306 /* Trivial, the source is already uniform or the index is a constant.
3307 * We will typically not get here if the optimizer is doing its job, but
3308 * asserting would be mean.
3310 const unsigned i
= idx
.file
== BRW_IMMEDIATE_VALUE
? idx
.ud
: 0;
3312 (align1
? stride(suboffset(src
, i
), 0, 1, 0) :
3313 stride(suboffset(src
, 4 * i
), 0, 4, 1)));
3315 /* From the Haswell PRM section "Register Region Restrictions":
3317 * "The lower bits of the AddressImmediate must not overflow to
3318 * change the register address. The lower 5 bits of Address
3319 * Immediate when added to lower 5 bits of address register gives
3320 * the sub-register offset. The upper bits of Address Immediate
3321 * when added to upper bits of address register gives the register
3322 * address. Any overflow from sub-register offset is dropped."
3324 * Fortunately, for broadcast, we never have a sub-register offset so
3325 * this isn't an issue.
3327 assert(src
.subnr
== 0);
3330 const struct brw_reg addr
=
3331 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
3332 unsigned offset
= src
.nr
* REG_SIZE
+ src
.subnr
;
3333 /* Limit in bytes of the signed indirect addressing immediate. */
3334 const unsigned limit
= 512;
3336 brw_push_insn_state(p
);
3337 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3338 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
3340 /* Take into account the component size and horizontal stride. */
3341 assert(src
.vstride
== src
.hstride
+ src
.width
);
3342 brw_SHL(p
, addr
, vec1(idx
),
3343 brw_imm_ud(_mesa_logbase2(type_sz(src
.type
)) +
3346 /* We can only address up to limit bytes using the indirect
3347 * addressing immediate, account for the difference if the source
3348 * register is above this limit.
3350 if (offset
>= limit
) {
3351 brw_ADD(p
, addr
, addr
, brw_imm_ud(offset
- offset
% limit
));
3352 offset
= offset
% limit
;
3355 brw_pop_insn_state(p
);
3357 /* Use indirect addressing to fetch the specified component. */
3358 if (type_sz(src
.type
) > 4 &&
3359 (devinfo
->is_cherryview
|| gen_device_info_is_9lp(devinfo
))) {
3360 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3362 * "When source or destination datatype is 64b or operation is
3363 * integer DWord multiply, indirect addressing must not be
3366 * To work around both of this issue, we do two integer MOVs
3367 * insead of one 64-bit MOV. Because no double value should ever
3368 * cross a register boundary, it's safe to use the immediate
3369 * offset in the indirect here to handle adding 4 bytes to the
3370 * offset and avoid the extra ADD to the register file.
3372 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 0),
3373 retype(brw_vec1_indirect(addr
.subnr
, offset
),
3374 BRW_REGISTER_TYPE_D
));
3375 brw_MOV(p
, subscript(dst
, BRW_REGISTER_TYPE_D
, 1),
3376 retype(brw_vec1_indirect(addr
.subnr
, offset
+ 4),
3377 BRW_REGISTER_TYPE_D
));
3380 retype(brw_vec1_indirect(addr
.subnr
, offset
), src
.type
));
3383 /* In SIMD4x2 mode the index can be either zero or one, replicate it
3384 * to all bits of a flag register,
3388 stride(brw_swizzle(idx
, BRW_SWIZZLE_XXXX
), 4, 4, 1));
3389 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NONE
);
3390 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_NZ
);
3391 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3393 /* and use predicated SEL to pick the right channel. */
3394 inst
= brw_SEL(p
, dst
,
3395 stride(suboffset(src
, 4), 4, 4, 1),
3396 stride(src
, 4, 4, 1));
3397 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NORMAL
);
3398 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3402 brw_pop_insn_state(p
);
3406 * This instruction is generated as a single-channel align1 instruction by
3407 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3409 * We can't use the typed atomic op in the FS because that has the execution
3410 * mask ANDed with the pixel mask, but we just want to write the one dword for
3413 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3414 * one u32. So we use the same untyped atomic write message as the pixel
3417 * The untyped atomic operation requires a BUFFER surface type with RAW
3418 * format, and is only accessible through the legacy DATA_CACHE dataport
3421 void brw_shader_time_add(struct brw_codegen
*p
,
3422 struct brw_reg payload
,
3423 uint32_t surf_index
)
3425 const struct gen_device_info
*devinfo
= p
->devinfo
;
3426 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3427 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3428 GEN7_SFID_DATAPORT_DATA_CACHE
);
3429 assert(devinfo
->gen
>= 7);
3431 brw_push_insn_state(p
);
3432 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3433 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3434 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
3435 brw_inst
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
3437 /* We use brw_vec1_reg and unmasked because we want to increment the given
3440 brw_set_dest(p
, send
, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
3442 brw_set_src0(p
, send
, brw_vec1_reg(payload
.file
,
3444 brw_set_desc(p
, send
, (brw_message_desc(devinfo
, 2, 0, false) |
3445 brw_dp_untyped_atomic_desc(devinfo
, 1, BRW_AOP_ADD
,
3448 brw_inst_set_sfid(devinfo
, send
, sfid
);
3449 brw_inst_set_binding_table_index(devinfo
, send
, surf_index
);
3451 brw_pop_insn_state(p
);
3456 * Emit the SEND message for a barrier
3459 brw_barrier(struct brw_codegen
*p
, struct brw_reg src
)
3461 const struct gen_device_info
*devinfo
= p
->devinfo
;
3462 struct brw_inst
*inst
;
3464 assert(devinfo
->gen
>= 7);
3466 brw_push_insn_state(p
);
3467 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3468 inst
= next_insn(p
, BRW_OPCODE_SEND
);
3469 brw_set_dest(p
, inst
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
));
3470 brw_set_src0(p
, inst
, src
);
3471 brw_set_src1(p
, inst
, brw_null_reg());
3472 brw_set_desc(p
, inst
, brw_message_desc(devinfo
, 1, 0, false));
3474 brw_inst_set_sfid(devinfo
, inst
, BRW_SFID_MESSAGE_GATEWAY
);
3475 brw_inst_set_gateway_notify(devinfo
, inst
, 1);
3476 brw_inst_set_gateway_subfuncid(devinfo
, inst
,
3477 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG
);
3479 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
3480 brw_pop_insn_state(p
);
3485 * Emit the wait instruction for a barrier
3488 brw_WAIT(struct brw_codegen
*p
)
3490 const struct gen_device_info
*devinfo
= p
->devinfo
;
3491 struct brw_inst
*insn
;
3493 struct brw_reg src
= brw_notification_reg();
3495 insn
= next_insn(p
, BRW_OPCODE_WAIT
);
3496 brw_set_dest(p
, insn
, src
);
3497 brw_set_src0(p
, insn
, src
);
3498 brw_set_src1(p
, insn
, brw_null_reg());
3500 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
3501 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_DISABLE
);
3505 brw_float_controls_mode(struct brw_codegen
*p
,
3506 unsigned mode
, unsigned mask
)
3508 brw_inst
*inst
= brw_AND(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3510 brw_inst_set_exec_size(p
->devinfo
, inst
, BRW_EXECUTE_1
);
3512 /* From the Skylake PRM, Volume 7, page 760:
3513 * "Implementation Restriction on Register Access: When the control
3514 * register is used as an explicit source and/or destination, hardware
3515 * does not ensure execution pipeline coherency. Software must set the
3516 * thread control field to ‘switch’ for an instruction that uses
3517 * control register as an explicit operand."
3519 brw_inst_set_thread_control(p
->devinfo
, inst
, BRW_THREAD_SWITCH
);
3522 brw_inst
*inst_or
= brw_OR(p
, brw_cr0_reg(0), brw_cr0_reg(0),
3524 brw_inst_set_exec_size(p
->devinfo
, inst_or
, BRW_EXECUTE_1
);
3525 brw_inst_set_thread_control(p
->devinfo
, inst_or
, BRW_THREAD_SWITCH
);