2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keithw@vmware.com>
33 #include "brw_eu_defines.h"
36 #include "util/ralloc.h"
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
46 gen6_resolve_implied_move(struct brw_codegen
*p
,
50 const struct gen_device_info
*devinfo
= p
->devinfo
;
54 if (src
->file
== BRW_MESSAGE_REGISTER_FILE
)
57 if (src
->file
!= BRW_ARCHITECTURE_REGISTER_FILE
|| src
->nr
!= BRW_ARF_NULL
) {
58 brw_push_insn_state(p
);
59 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
60 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
61 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
62 brw_MOV(p
, retype(brw_message_reg(msg_reg_nr
), BRW_REGISTER_TYPE_UD
),
63 retype(*src
, BRW_REGISTER_TYPE_UD
));
64 brw_pop_insn_state(p
);
66 *src
= brw_message_reg(msg_reg_nr
);
70 gen7_convert_mrf_to_grf(struct brw_codegen
*p
, struct brw_reg
*reg
)
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
80 const struct gen_device_info
*devinfo
= p
->devinfo
;
81 if (devinfo
->gen
>= 7 && reg
->file
== BRW_MESSAGE_REGISTER_FILE
) {
82 reg
->file
= BRW_GENERAL_REGISTER_FILE
;
83 reg
->nr
+= GEN7_MRF_HACK_START
;
88 * Convert a brw_reg_type enumeration value into the hardware representation.
90 * The hardware encoding may depend on whether the value is an immediate.
93 brw_reg_type_to_hw_type(const struct gen_device_info
*devinfo
,
94 enum brw_reg_type type
, enum brw_reg_file file
)
96 if (file
== BRW_IMMEDIATE_VALUE
) {
97 static const int imm_hw_types
[] = {
98 [BRW_REGISTER_TYPE_UD
] = BRW_HW_REG_TYPE_UD
,
99 [BRW_REGISTER_TYPE_D
] = BRW_HW_REG_TYPE_D
,
100 [BRW_REGISTER_TYPE_UW
] = BRW_HW_REG_TYPE_UW
,
101 [BRW_REGISTER_TYPE_W
] = BRW_HW_REG_TYPE_W
,
102 [BRW_REGISTER_TYPE_F
] = BRW_HW_REG_TYPE_F
,
103 [BRW_REGISTER_TYPE_UB
] = -1,
104 [BRW_REGISTER_TYPE_B
] = -1,
105 [BRW_REGISTER_TYPE_UV
] = BRW_HW_REG_IMM_TYPE_UV
,
106 [BRW_REGISTER_TYPE_VF
] = BRW_HW_REG_IMM_TYPE_VF
,
107 [BRW_REGISTER_TYPE_V
] = BRW_HW_REG_IMM_TYPE_V
,
108 [BRW_REGISTER_TYPE_DF
] = GEN8_HW_REG_IMM_TYPE_DF
,
109 [BRW_REGISTER_TYPE_HF
] = GEN8_HW_REG_IMM_TYPE_HF
,
110 [BRW_REGISTER_TYPE_UQ
] = GEN8_HW_REG_TYPE_UQ
,
111 [BRW_REGISTER_TYPE_Q
] = GEN8_HW_REG_TYPE_Q
,
113 assert(type
< ARRAY_SIZE(imm_hw_types
));
114 assert(imm_hw_types
[type
] != -1);
115 assert(devinfo
->gen
>= 8 || type
< BRW_REGISTER_TYPE_DF
);
116 return imm_hw_types
[type
];
118 /* Non-immediate registers */
119 static const int hw_types
[] = {
120 [BRW_REGISTER_TYPE_UD
] = BRW_HW_REG_TYPE_UD
,
121 [BRW_REGISTER_TYPE_D
] = BRW_HW_REG_TYPE_D
,
122 [BRW_REGISTER_TYPE_UW
] = BRW_HW_REG_TYPE_UW
,
123 [BRW_REGISTER_TYPE_W
] = BRW_HW_REG_TYPE_W
,
124 [BRW_REGISTER_TYPE_UB
] = BRW_HW_REG_NON_IMM_TYPE_UB
,
125 [BRW_REGISTER_TYPE_B
] = BRW_HW_REG_NON_IMM_TYPE_B
,
126 [BRW_REGISTER_TYPE_F
] = BRW_HW_REG_TYPE_F
,
127 [BRW_REGISTER_TYPE_UV
] = -1,
128 [BRW_REGISTER_TYPE_VF
] = -1,
129 [BRW_REGISTER_TYPE_V
] = -1,
130 [BRW_REGISTER_TYPE_DF
] = GEN7_HW_REG_NON_IMM_TYPE_DF
,
131 [BRW_REGISTER_TYPE_HF
] = GEN8_HW_REG_NON_IMM_TYPE_HF
,
132 [BRW_REGISTER_TYPE_UQ
] = GEN8_HW_REG_TYPE_UQ
,
133 [BRW_REGISTER_TYPE_Q
] = GEN8_HW_REG_TYPE_Q
,
135 assert(type
< ARRAY_SIZE(hw_types
));
136 assert(hw_types
[type
] != -1);
137 assert(devinfo
->gen
>= 7 || type
< BRW_REGISTER_TYPE_DF
);
138 assert(devinfo
->gen
>= 8 || type
< BRW_REGISTER_TYPE_Q
);
139 return hw_types
[type
];
144 * Return the element size given a hardware register type and file.
146 * The hardware encoding may depend on whether the value is an immediate.
149 brw_hw_reg_type_to_size(const struct gen_device_info
*devinfo
,
150 unsigned type
, enum brw_reg_file file
)
152 if (file
== BRW_IMMEDIATE_VALUE
) {
153 static const unsigned imm_hw_sizes
[] = {
154 [BRW_HW_REG_TYPE_UD
] = 4,
155 [BRW_HW_REG_TYPE_D
] = 4,
156 [BRW_HW_REG_TYPE_UW
] = 2,
157 [BRW_HW_REG_TYPE_W
] = 2,
158 [BRW_HW_REG_IMM_TYPE_UV
] = 2,
159 [BRW_HW_REG_IMM_TYPE_VF
] = 4,
160 [BRW_HW_REG_IMM_TYPE_V
] = 2,
161 [BRW_HW_REG_TYPE_F
] = 4,
162 [GEN8_HW_REG_TYPE_UQ
] = 8,
163 [GEN8_HW_REG_TYPE_Q
] = 8,
164 [GEN8_HW_REG_IMM_TYPE_DF
] = 8,
165 [GEN8_HW_REG_IMM_TYPE_HF
] = 2,
167 assert(type
< ARRAY_SIZE(imm_hw_sizes
));
168 assert(devinfo
->gen
>= 6 || type
!= BRW_HW_REG_IMM_TYPE_UV
);
169 assert(devinfo
->gen
>= 8 || type
<= BRW_HW_REG_TYPE_F
);
170 return imm_hw_sizes
[type
];
172 /* Non-immediate registers */
173 static const unsigned hw_sizes
[] = {
174 [BRW_HW_REG_TYPE_UD
] = 4,
175 [BRW_HW_REG_TYPE_D
] = 4,
176 [BRW_HW_REG_TYPE_UW
] = 2,
177 [BRW_HW_REG_TYPE_W
] = 2,
178 [BRW_HW_REG_NON_IMM_TYPE_UB
] = 1,
179 [BRW_HW_REG_NON_IMM_TYPE_B
] = 1,
180 [GEN7_HW_REG_NON_IMM_TYPE_DF
] = 8,
181 [BRW_HW_REG_TYPE_F
] = 4,
182 [GEN8_HW_REG_TYPE_UQ
] = 8,
183 [GEN8_HW_REG_TYPE_Q
] = 8,
184 [GEN8_HW_REG_NON_IMM_TYPE_HF
] = 2,
186 assert(type
< ARRAY_SIZE(hw_sizes
));
187 assert(devinfo
->gen
>= 7 ||
188 (type
< GEN7_HW_REG_NON_IMM_TYPE_DF
|| type
== BRW_HW_REG_TYPE_F
));
189 assert(devinfo
->gen
>= 8 || type
<= BRW_HW_REG_TYPE_F
);
190 return hw_sizes
[type
];
195 brw_set_dest(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg dest
)
197 const struct gen_device_info
*devinfo
= p
->devinfo
;
199 if (dest
.file
== BRW_MESSAGE_REGISTER_FILE
)
200 assert((dest
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
201 else if (dest
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
)
202 assert(dest
.nr
< 128);
204 gen7_convert_mrf_to_grf(p
, &dest
);
206 brw_inst_set_dst_reg_file(devinfo
, inst
, dest
.file
);
207 brw_inst_set_dst_reg_type(devinfo
, inst
,
208 brw_reg_type_to_hw_type(devinfo
, dest
.type
,
210 brw_inst_set_dst_address_mode(devinfo
, inst
, dest
.address_mode
);
212 if (dest
.address_mode
== BRW_ADDRESS_DIRECT
) {
213 brw_inst_set_dst_da_reg_nr(devinfo
, inst
, dest
.nr
);
215 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
216 brw_inst_set_dst_da1_subreg_nr(devinfo
, inst
, dest
.subnr
);
217 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
218 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
219 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
221 brw_inst_set_dst_da16_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
222 brw_inst_set_da16_writemask(devinfo
, inst
, dest
.writemask
);
223 if (dest
.file
== BRW_GENERAL_REGISTER_FILE
||
224 dest
.file
== BRW_MESSAGE_REGISTER_FILE
) {
225 assert(dest
.writemask
!= 0);
227 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
228 * Although Dst.HorzStride is a don't care for Align16, HW needs
229 * this to be programmed as "01".
231 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
234 brw_inst_set_dst_ia_subreg_nr(devinfo
, inst
, dest
.subnr
);
236 /* These are different sizes in align1 vs align16:
238 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
239 brw_inst_set_dst_ia1_addr_imm(devinfo
, inst
,
240 dest
.indirect_offset
);
241 if (dest
.hstride
== BRW_HORIZONTAL_STRIDE_0
)
242 dest
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
243 brw_inst_set_dst_hstride(devinfo
, inst
, dest
.hstride
);
245 brw_inst_set_dst_ia16_addr_imm(devinfo
, inst
,
246 dest
.indirect_offset
);
247 /* even ignored in da16, still need to set as '01' */
248 brw_inst_set_dst_hstride(devinfo
, inst
, 1);
252 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
253 * or 16 (SIMD16), as that's normally correct. However, when dealing with
254 * small registers, we automatically reduce it to match the register size.
256 * In platforms that support fp64 we can emit instructions with a width of
257 * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
258 * cases we need to make sure that these instructions have their exec sizes
259 * set properly when they are emitted and we can't rely on this code to fix
263 if (devinfo
->gen
>= 6)
264 fix_exec_size
= dest
.width
< BRW_EXECUTE_4
;
266 fix_exec_size
= dest
.width
< BRW_EXECUTE_8
;
269 brw_inst_set_exec_size(devinfo
, inst
, dest
.width
);
273 validate_reg(const struct gen_device_info
*devinfo
,
274 brw_inst
*inst
, struct brw_reg reg
)
276 const int hstride_for_reg
[] = {0, 1, 2, 4};
277 const int vstride_for_reg
[] = {0, 1, 2, 4, 8, 16, 32};
278 const int width_for_reg
[] = {1, 2, 4, 8, 16};
279 const int execsize_for_reg
[] = {1, 2, 4, 8, 16, 32};
280 int width
, hstride
, vstride
, execsize
;
282 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
283 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
284 * mean the destination has to be 128-bit aligned and the
285 * destination horiz stride has to be a word.
287 if (reg
.type
== BRW_REGISTER_TYPE_V
) {
288 unsigned UNUSED elem_size
= brw_element_size(devinfo
, inst
, dst
);
289 assert(hstride_for_reg
[brw_inst_dst_hstride(devinfo
, inst
)] *
296 if (reg
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
297 reg
.file
== BRW_ARF_NULL
)
300 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
302 * "Swizzling is not allowed when an accumulator is used as an implicit
303 * source or an explicit source in an instruction."
305 if (reg
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
306 reg
.nr
== BRW_ARF_ACCUMULATOR
)
307 assert(reg
.swizzle
== BRW_SWIZZLE_XYZW
);
309 assert(reg
.hstride
< ARRAY_SIZE(hstride_for_reg
));
310 hstride
= hstride_for_reg
[reg
.hstride
];
312 if (reg
.vstride
== 0xf) {
315 assert(reg
.vstride
>= 0 && reg
.vstride
< ARRAY_SIZE(vstride_for_reg
));
316 vstride
= vstride_for_reg
[reg
.vstride
];
319 assert(reg
.width
>= 0 && reg
.width
< ARRAY_SIZE(width_for_reg
));
320 width
= width_for_reg
[reg
.width
];
322 assert(brw_inst_exec_size(devinfo
, inst
) >= 0 &&
323 brw_inst_exec_size(devinfo
, inst
) < ARRAY_SIZE(execsize_for_reg
));
324 execsize
= execsize_for_reg
[brw_inst_exec_size(devinfo
, inst
)];
326 /* Restrictions from 3.3.10: Register Region Restrictions. */
328 assert(execsize
>= width
);
331 if (execsize
== width
&& hstride
!= 0) {
332 assert(vstride
== -1 || vstride
== width
* hstride
);
336 if (execsize
== width
&& hstride
== 0) {
337 /* no restriction on vstride. */
342 assert(hstride
== 0);
346 if (execsize
== 1 && width
== 1) {
347 assert(hstride
== 0);
348 assert(vstride
== 0);
352 if (vstride
== 0 && hstride
== 0) {
356 /* 10. Check destination issues. */
/**
 * Whether \p imm fits the 13-bit compacted-immediate encoding: the low 12
 * bits are stored verbatim and a single additional bit is replicated through
 * the top 20 bits.
 *
 * BUG FIX: the low 12 bits must be masked off before testing the top 20 bits
 * (per the comment "We get the low 12 bits as-is"); without the mask, values
 * such as 0x7ff were wrongly rejected.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   /* We get the low 12 bits as-is. */
   imm &= ~0xfff;

   /* We get one bit replicated through the top 20 bits. */
   return imm == 0 || imm == 0xfffff000;
}
370 brw_set_src0(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
372 const struct gen_device_info
*devinfo
= p
->devinfo
;
374 if (reg
.file
== BRW_MESSAGE_REGISTER_FILE
)
375 assert((reg
.nr
& ~BRW_MRF_COMPR4
) < BRW_MAX_MRF(devinfo
->gen
));
376 else if (reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
)
377 assert(reg
.nr
< 128);
379 gen7_convert_mrf_to_grf(p
, ®
);
381 if (devinfo
->gen
>= 6 && (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SEND
||
382 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_SENDC
)) {
383 /* Any source modifiers or regions will be ignored, since this just
384 * identifies the MRF/GRF to start reading the message contents from.
385 * Check for some likely failures.
389 assert(reg
.address_mode
== BRW_ADDRESS_DIRECT
);
392 validate_reg(devinfo
, inst
, reg
);
394 brw_inst_set_src0_reg_file(devinfo
, inst
, reg
.file
);
395 brw_inst_set_src0_reg_type(devinfo
, inst
,
396 brw_reg_type_to_hw_type(devinfo
, reg
.type
, reg
.file
));
397 brw_inst_set_src0_abs(devinfo
, inst
, reg
.abs
);
398 brw_inst_set_src0_negate(devinfo
, inst
, reg
.negate
);
399 brw_inst_set_src0_address_mode(devinfo
, inst
, reg
.address_mode
);
401 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
402 if (reg
.type
== BRW_REGISTER_TYPE_DF
||
403 brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_DIM
)
404 brw_inst_set_imm_df(devinfo
, inst
, reg
.df
);
405 else if (reg
.type
== BRW_REGISTER_TYPE_UQ
||
406 reg
.type
== BRW_REGISTER_TYPE_Q
)
407 brw_inst_set_imm_uq(devinfo
, inst
, reg
.u64
);
409 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
411 /* The Bspec's section titled "Non-present Operands" claims that if src0
412 * is an immediate that src1's type must be the same as that of src0.
414 * The SNB+ DataTypeIndex instruction compaction tables contain mappings
415 * that do not follow this rule. E.g., from the IVB/HSW table:
417 * DataTypeIndex 18-Bit Mapping Mapped Meaning
418 * 3 001000001011111101 r:f | i:vf | a:ud | <1> | dir |
420 * And from the SNB table:
422 * DataTypeIndex 18-Bit Mapping Mapped Meaning
423 * 8 001000000111101100 a:w | i:w | a:ud | <1> | dir |
425 * Neither of these cause warnings from the simulator when used,
426 * compacted or otherwise. In fact, all compaction mappings that have an
427 * immediate in src0 use a:ud for src1.
429 * The GM45 instruction compaction tables do not contain mapped meanings
430 * so it's not clear whether it has the restriction. We'll assume it was
431 * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
433 * Don't do any of this for 64-bit immediates, since the src1 fields
434 * overlap with the immediate and setting them would overwrite the
437 if (type_sz(reg
.type
) < 8) {
438 brw_inst_set_src1_reg_file(devinfo
, inst
,
439 BRW_ARCHITECTURE_REGISTER_FILE
);
440 if (devinfo
->gen
< 6) {
441 brw_inst_set_src1_reg_type(devinfo
, inst
,
442 brw_inst_src0_reg_type(devinfo
, inst
));
444 brw_inst_set_src1_reg_type(devinfo
, inst
, BRW_HW_REG_TYPE_UD
);
448 /* Compacted instructions only have 12-bits (plus 1 for the other 20)
449 * for immediate values. Presumably the hardware engineers realized
450 * that the only useful floating-point value that could be represented
451 * in this format is 0.0, which can also be represented as a VF-typed
452 * immediate, so they gave us the previously mentioned mapping on IVB+.
454 * Strangely, we do have a mapping for imm:f in src1, so we don't need
457 * If we see a 0.0:F, change the type to VF so that it can be compacted.
459 if (brw_inst_imm_ud(devinfo
, inst
) == 0x0 &&
460 brw_inst_src0_reg_type(devinfo
, inst
) == BRW_HW_REG_TYPE_F
&&
461 brw_inst_dst_reg_type(devinfo
, inst
) != GEN7_HW_REG_NON_IMM_TYPE_DF
) {
462 brw_inst_set_src0_reg_type(devinfo
, inst
, BRW_HW_REG_IMM_TYPE_VF
);
465 /* There are no mappings for dst:d | i:d, so if the immediate is suitable
466 * set the types to :UD so the instruction can be compacted.
468 if (is_compactable_immediate(brw_inst_imm_ud(devinfo
, inst
)) &&
469 brw_inst_cond_modifier(devinfo
, inst
) == BRW_CONDITIONAL_NONE
&&
470 brw_inst_src0_reg_type(devinfo
, inst
) == BRW_HW_REG_TYPE_D
&&
471 brw_inst_dst_reg_type(devinfo
, inst
) == BRW_HW_REG_TYPE_D
) {
472 brw_inst_set_src0_reg_type(devinfo
, inst
, BRW_HW_REG_TYPE_UD
);
473 brw_inst_set_dst_reg_type(devinfo
, inst
, BRW_HW_REG_TYPE_UD
);
476 if (reg
.address_mode
== BRW_ADDRESS_DIRECT
) {
477 brw_inst_set_src0_da_reg_nr(devinfo
, inst
, reg
.nr
);
478 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
479 brw_inst_set_src0_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
481 brw_inst_set_src0_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
484 brw_inst_set_src0_ia_subreg_nr(devinfo
, inst
, reg
.subnr
);
486 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
487 brw_inst_set_src0_ia1_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
489 brw_inst_set_src0_ia16_addr_imm(devinfo
, inst
, reg
.indirect_offset
);
493 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
494 if (reg
.width
== BRW_WIDTH_1
&&
495 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
496 brw_inst_set_src0_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
497 brw_inst_set_src0_width(devinfo
, inst
, BRW_WIDTH_1
);
498 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
500 brw_inst_set_src0_hstride(devinfo
, inst
, reg
.hstride
);
501 brw_inst_set_src0_width(devinfo
, inst
, reg
.width
);
502 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
505 brw_inst_set_src0_da16_swiz_x(devinfo
, inst
,
506 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
507 brw_inst_set_src0_da16_swiz_y(devinfo
, inst
,
508 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
509 brw_inst_set_src0_da16_swiz_z(devinfo
, inst
,
510 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
511 brw_inst_set_src0_da16_swiz_w(devinfo
, inst
,
512 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
514 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
515 /* This is an oddity of the fact we're using the same
516 * descriptions for registers in align_16 as align_1:
518 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
519 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
520 reg
.type
== BRW_REGISTER_TYPE_DF
&&
521 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
524 * "For Align16 access mode, only encodings of 0000 and 0011
525 * are allowed. Other codes are reserved."
527 * Presumably the DevSNB behavior applies to IVB as well.
529 brw_inst_set_src0_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
531 brw_inst_set_src0_vstride(devinfo
, inst
, reg
.vstride
);
539 brw_set_src1(struct brw_codegen
*p
, brw_inst
*inst
, struct brw_reg reg
)
541 const struct gen_device_info
*devinfo
= p
->devinfo
;
543 if (reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
)
544 assert(reg
.nr
< 128);
546 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
548 * "Accumulator registers may be accessed explicitly as src0
551 assert(reg
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
552 reg
.nr
!= BRW_ARF_ACCUMULATOR
);
554 gen7_convert_mrf_to_grf(p
, ®
);
555 assert(reg
.file
!= BRW_MESSAGE_REGISTER_FILE
);
557 validate_reg(devinfo
, inst
, reg
);
559 brw_inst_set_src1_reg_file(devinfo
, inst
, reg
.file
);
560 brw_inst_set_src1_reg_type(devinfo
, inst
,
561 brw_reg_type_to_hw_type(devinfo
, reg
.type
, reg
.file
));
562 brw_inst_set_src1_abs(devinfo
, inst
, reg
.abs
);
563 brw_inst_set_src1_negate(devinfo
, inst
, reg
.negate
);
565 /* Only src1 can be immediate in two-argument instructions.
567 assert(brw_inst_src0_reg_file(devinfo
, inst
) != BRW_IMMEDIATE_VALUE
);
569 if (reg
.file
== BRW_IMMEDIATE_VALUE
) {
570 /* two-argument instructions can only use 32-bit immediates */
571 assert(type_sz(reg
.type
) < 8);
572 brw_inst_set_imm_ud(devinfo
, inst
, reg
.ud
);
574 /* This is a hardware restriction, which may or may not be lifted
577 assert (reg
.address_mode
== BRW_ADDRESS_DIRECT
);
578 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
580 brw_inst_set_src1_da_reg_nr(devinfo
, inst
, reg
.nr
);
581 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
582 brw_inst_set_src1_da1_subreg_nr(devinfo
, inst
, reg
.subnr
);
584 brw_inst_set_src1_da16_subreg_nr(devinfo
, inst
, reg
.subnr
/ 16);
587 if (brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_1
) {
588 if (reg
.width
== BRW_WIDTH_1
&&
589 brw_inst_exec_size(devinfo
, inst
) == BRW_EXECUTE_1
) {
590 brw_inst_set_src1_hstride(devinfo
, inst
, BRW_HORIZONTAL_STRIDE_0
);
591 brw_inst_set_src1_width(devinfo
, inst
, BRW_WIDTH_1
);
592 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_0
);
594 brw_inst_set_src1_hstride(devinfo
, inst
, reg
.hstride
);
595 brw_inst_set_src1_width(devinfo
, inst
, reg
.width
);
596 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
599 brw_inst_set_src1_da16_swiz_x(devinfo
, inst
,
600 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_X
));
601 brw_inst_set_src1_da16_swiz_y(devinfo
, inst
,
602 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Y
));
603 brw_inst_set_src1_da16_swiz_z(devinfo
, inst
,
604 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_Z
));
605 brw_inst_set_src1_da16_swiz_w(devinfo
, inst
,
606 BRW_GET_SWZ(reg
.swizzle
, BRW_CHANNEL_W
));
608 if (reg
.vstride
== BRW_VERTICAL_STRIDE_8
) {
609 /* This is an oddity of the fact we're using the same
610 * descriptions for registers in align_16 as align_1:
612 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
613 } else if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
614 reg
.type
== BRW_REGISTER_TYPE_DF
&&
615 reg
.vstride
== BRW_VERTICAL_STRIDE_2
) {
618 * "For Align16 access mode, only encodings of 0000 and 0011
619 * are allowed. Other codes are reserved."
621 * Presumably the DevSNB behavior applies to IVB as well.
623 brw_inst_set_src1_vstride(devinfo
, inst
, BRW_VERTICAL_STRIDE_4
);
625 brw_inst_set_src1_vstride(devinfo
, inst
, reg
.vstride
);
632 * Set the Message Descriptor and Extended Message Descriptor fields
635 * \note This zeroes out the Function Control bits, so it must be called
636 * \b before filling out any message-specific data. Callers can
637 * choose not to fill in irrelevant bits; they will be zero.
640 brw_set_message_descriptor(struct brw_codegen
*p
,
642 enum brw_message_target sfid
,
644 unsigned response_length
,
648 const struct gen_device_info
*devinfo
= p
->devinfo
;
650 brw_set_src1(p
, inst
, brw_imm_d(0));
652 /* For indirect sends, `inst` will not be the SEND/SENDC instruction
653 * itself; instead, it will be a MOV/OR into the address register.
655 * In this case, we avoid setting the extended message descriptor bits,
656 * since they go on the later SEND/SENDC instead and if set here would
657 * instead clobber the conditionalmod bits.
659 unsigned opcode
= brw_inst_opcode(devinfo
, inst
);
660 if (opcode
== BRW_OPCODE_SEND
|| opcode
== BRW_OPCODE_SENDC
) {
661 brw_inst_set_sfid(devinfo
, inst
, sfid
);
664 brw_inst_set_mlen(devinfo
, inst
, msg_length
);
665 brw_inst_set_rlen(devinfo
, inst
, response_length
);
666 brw_inst_set_eot(devinfo
, inst
, end_of_thread
);
668 if (devinfo
->gen
>= 5) {
669 brw_inst_set_header_present(devinfo
, inst
, header_present
);
673 static void brw_set_math_message( struct brw_codegen
*p
,
676 unsigned integer_type
,
680 const struct gen_device_info
*devinfo
= p
->devinfo
;
682 unsigned response_length
;
684 /* Infer message length from the function */
686 case BRW_MATH_FUNCTION_POW
:
687 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
:
688 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER
:
689 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
697 /* Infer response length from the function */
699 case BRW_MATH_FUNCTION_SINCOS
:
700 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
:
709 brw_set_message_descriptor(p
, inst
, BRW_SFID_MATH
,
710 msg_length
, response_length
, false, false);
711 brw_inst_set_math_msg_function(devinfo
, inst
, function
);
712 brw_inst_set_math_msg_signed_int(devinfo
, inst
, integer_type
);
713 brw_inst_set_math_msg_precision(devinfo
, inst
, low_precision
);
714 brw_inst_set_math_msg_saturate(devinfo
, inst
, brw_inst_saturate(devinfo
, inst
));
715 brw_inst_set_math_msg_data_type(devinfo
, inst
, dataType
);
716 brw_inst_set_saturate(devinfo
, inst
, 0);
720 static void brw_set_ff_sync_message(struct brw_codegen
*p
,
723 unsigned response_length
,
726 const struct gen_device_info
*devinfo
= p
->devinfo
;
728 brw_set_message_descriptor(p
, insn
, BRW_SFID_URB
,
729 1, response_length
, true, end_of_thread
);
730 brw_inst_set_urb_opcode(devinfo
, insn
, 1); /* FF_SYNC */
731 brw_inst_set_urb_allocate(devinfo
, insn
, allocate
);
732 /* The following fields are not used by FF_SYNC: */
733 brw_inst_set_urb_global_offset(devinfo
, insn
, 0);
734 brw_inst_set_urb_swizzle_control(devinfo
, insn
, 0);
735 brw_inst_set_urb_used(devinfo
, insn
, 0);
736 brw_inst_set_urb_complete(devinfo
, insn
, 0);
739 static void brw_set_urb_message( struct brw_codegen
*p
,
741 enum brw_urb_write_flags flags
,
743 unsigned response_length
,
745 unsigned swizzle_control
)
747 const struct gen_device_info
*devinfo
= p
->devinfo
;
749 assert(devinfo
->gen
< 7 || swizzle_control
!= BRW_URB_SWIZZLE_TRANSPOSE
);
750 assert(devinfo
->gen
< 7 || !(flags
& BRW_URB_WRITE_ALLOCATE
));
751 assert(devinfo
->gen
>= 7 || !(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
753 brw_set_message_descriptor(p
, insn
, BRW_SFID_URB
,
754 msg_length
, response_length
, true,
755 flags
& BRW_URB_WRITE_EOT
);
757 if (flags
& BRW_URB_WRITE_OWORD
) {
758 assert(msg_length
== 2); /* header + one OWORD of data */
759 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_OWORD
);
761 brw_inst_set_urb_opcode(devinfo
, insn
, BRW_URB_OPCODE_WRITE_HWORD
);
764 brw_inst_set_urb_global_offset(devinfo
, insn
, offset
);
765 brw_inst_set_urb_swizzle_control(devinfo
, insn
, swizzle_control
);
767 if (devinfo
->gen
< 8) {
768 brw_inst_set_urb_complete(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_COMPLETE
));
771 if (devinfo
->gen
< 7) {
772 brw_inst_set_urb_allocate(devinfo
, insn
, !!(flags
& BRW_URB_WRITE_ALLOCATE
));
773 brw_inst_set_urb_used(devinfo
, insn
, !(flags
& BRW_URB_WRITE_UNUSED
));
775 brw_inst_set_urb_per_slot_offset(devinfo
, insn
,
776 !!(flags
& BRW_URB_WRITE_PER_SLOT_OFFSET
));
781 brw_set_dp_write_message(struct brw_codegen
*p
,
783 unsigned binding_table_index
,
784 unsigned msg_control
,
786 unsigned target_cache
,
789 unsigned last_render_target
,
790 unsigned response_length
,
791 unsigned end_of_thread
,
792 unsigned send_commit_msg
)
794 const struct gen_device_info
*devinfo
= p
->devinfo
;
795 const unsigned sfid
= (devinfo
->gen
>= 6 ? target_cache
:
796 BRW_SFID_DATAPORT_WRITE
);
798 brw_set_message_descriptor(p
, insn
, sfid
, msg_length
, response_length
,
799 header_present
, end_of_thread
);
801 brw_inst_set_binding_table_index(devinfo
, insn
, binding_table_index
);
802 brw_inst_set_dp_write_msg_type(devinfo
, insn
, msg_type
);
803 brw_inst_set_dp_write_msg_control(devinfo
, insn
, msg_control
);
804 brw_inst_set_rt_last(devinfo
, insn
, last_render_target
);
805 if (devinfo
->gen
< 7) {
806 brw_inst_set_dp_write_commit(devinfo
, insn
, send_commit_msg
);
811 brw_set_dp_read_message(struct brw_codegen
*p
,
813 unsigned binding_table_index
,
814 unsigned msg_control
,
816 unsigned target_cache
,
819 unsigned response_length
)
821 const struct gen_device_info
*devinfo
= p
->devinfo
;
822 const unsigned sfid
= (devinfo
->gen
>= 6 ? target_cache
:
823 BRW_SFID_DATAPORT_READ
);
825 brw_set_message_descriptor(p
, insn
, sfid
, msg_length
, response_length
,
826 header_present
, false);
828 brw_inst_set_binding_table_index(devinfo
, insn
, binding_table_index
);
829 brw_inst_set_dp_read_msg_type(devinfo
, insn
, msg_type
);
830 brw_inst_set_dp_read_msg_control(devinfo
, insn
, msg_control
);
831 if (devinfo
->gen
< 6)
832 brw_inst_set_dp_read_target_cache(devinfo
, insn
, target_cache
);
836 brw_set_sampler_message(struct brw_codegen
*p
,
838 unsigned binding_table_index
,
841 unsigned response_length
,
843 unsigned header_present
,
845 unsigned return_format
)
847 const struct gen_device_info
*devinfo
= p
->devinfo
;
849 brw_set_message_descriptor(p
, inst
, BRW_SFID_SAMPLER
, msg_length
,
850 response_length
, header_present
, false);
852 brw_inst_set_binding_table_index(devinfo
, inst
, binding_table_index
);
853 brw_inst_set_sampler(devinfo
, inst
, sampler
);
854 brw_inst_set_sampler_msg_type(devinfo
, inst
, msg_type
);
855 if (devinfo
->gen
>= 5) {
856 brw_inst_set_sampler_simd_mode(devinfo
, inst
, simd_mode
);
857 } else if (devinfo
->gen
== 4 && !devinfo
->is_g4x
) {
858 brw_inst_set_sampler_return_format(devinfo
, inst
, return_format
);
863 gen7_set_dp_scratch_message(struct brw_codegen
*p
,
867 bool invalidate_after_read
,
869 unsigned addr_offset
,
874 const struct gen_device_info
*devinfo
= p
->devinfo
;
875 assert(num_regs
== 1 || num_regs
== 2 || num_regs
== 4 ||
876 (devinfo
->gen
>= 8 && num_regs
== 8));
877 const unsigned block_size
= (devinfo
->gen
>= 8 ? _mesa_logbase2(num_regs
) :
880 brw_set_message_descriptor(p
, inst
, GEN7_SFID_DATAPORT_DATA_CACHE
,
881 mlen
, rlen
, header_present
, false);
882 brw_inst_set_dp_category(devinfo
, inst
, 1); /* Scratch Block Read/Write msgs */
883 brw_inst_set_scratch_read_write(devinfo
, inst
, write
);
884 brw_inst_set_scratch_type(devinfo
, inst
, dword
);
885 brw_inst_set_scratch_invalidate_after_read(devinfo
, inst
, invalidate_after_read
);
886 brw_inst_set_scratch_block_size(devinfo
, inst
, block_size
);
887 brw_inst_set_scratch_addr_offset(devinfo
, inst
, addr_offset
);
890 #define next_insn brw_next_insn
892 brw_next_insn(struct brw_codegen
*p
, unsigned opcode
)
894 const struct gen_device_info
*devinfo
= p
->devinfo
;
897 if (p
->nr_insn
+ 1 > p
->store_size
) {
899 p
->store
= reralloc(p
->mem_ctx
, p
->store
, brw_inst
, p
->store_size
);
902 p
->next_insn_offset
+= 16;
903 insn
= &p
->store
[p
->nr_insn
++];
904 memcpy(insn
, p
->current
, sizeof(*insn
));
906 brw_inst_set_opcode(devinfo
, insn
, opcode
);
911 brw_alu1(struct brw_codegen
*p
, unsigned opcode
,
912 struct brw_reg dest
, struct brw_reg src
)
914 brw_inst
*insn
= next_insn(p
, opcode
);
915 brw_set_dest(p
, insn
, dest
);
916 brw_set_src0(p
, insn
, src
);
921 brw_alu2(struct brw_codegen
*p
, unsigned opcode
,
922 struct brw_reg dest
, struct brw_reg src0
, struct brw_reg src1
)
924 /* 64-bit immediates are only supported on 1-src instructions */
925 assert(src0
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src0
.type
) <= 4);
926 assert(src1
.file
!= BRW_IMMEDIATE_VALUE
|| type_sz(src1
.type
) <= 4);
928 brw_inst
*insn
= next_insn(p
, opcode
);
929 brw_set_dest(p
, insn
, dest
);
930 brw_set_src0(p
, insn
, src0
);
931 brw_set_src1(p
, insn
, src1
);
936 get_3src_subreg_nr(struct brw_reg reg
)
938 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
939 * use 32-bit units (components 0..7). Since they only support F/D/UD
940 * types, this doesn't lose any flexibility, but uses fewer bits.
942 return reg
.subnr
/ 4;
946 brw_alu3(struct brw_codegen
*p
, unsigned opcode
, struct brw_reg dest
,
947 struct brw_reg src0
, struct brw_reg src1
, struct brw_reg src2
)
949 const struct gen_device_info
*devinfo
= p
->devinfo
;
950 brw_inst
*inst
= next_insn(p
, opcode
);
952 gen7_convert_mrf_to_grf(p
, &dest
);
954 assert(brw_inst_access_mode(devinfo
, inst
) == BRW_ALIGN_16
);
956 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
957 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
958 assert(dest
.nr
< 128);
959 assert(dest
.address_mode
== BRW_ADDRESS_DIRECT
);
960 assert(dest
.type
== BRW_REGISTER_TYPE_F
||
961 dest
.type
== BRW_REGISTER_TYPE_DF
||
962 dest
.type
== BRW_REGISTER_TYPE_D
||
963 dest
.type
== BRW_REGISTER_TYPE_UD
);
964 if (devinfo
->gen
== 6) {
965 brw_inst_set_3src_dst_reg_file(devinfo
, inst
,
966 dest
.file
== BRW_MESSAGE_REGISTER_FILE
);
968 brw_inst_set_3src_dst_reg_nr(devinfo
, inst
, dest
.nr
);
969 brw_inst_set_3src_dst_subreg_nr(devinfo
, inst
, dest
.subnr
/ 16);
970 brw_inst_set_3src_dst_writemask(devinfo
, inst
, dest
.writemask
);
972 assert(src0
.file
== BRW_GENERAL_REGISTER_FILE
);
973 assert(src0
.address_mode
== BRW_ADDRESS_DIRECT
);
974 assert(src0
.nr
< 128);
975 brw_inst_set_3src_src0_swizzle(devinfo
, inst
, src0
.swizzle
);
976 brw_inst_set_3src_src0_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src0
));
977 brw_inst_set_3src_src0_reg_nr(devinfo
, inst
, src0
.nr
);
978 brw_inst_set_3src_src0_abs(devinfo
, inst
, src0
.abs
);
979 brw_inst_set_3src_src0_negate(devinfo
, inst
, src0
.negate
);
980 brw_inst_set_3src_src0_rep_ctrl(devinfo
, inst
,
981 src0
.vstride
== BRW_VERTICAL_STRIDE_0
);
983 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
);
984 assert(src1
.address_mode
== BRW_ADDRESS_DIRECT
);
985 assert(src1
.nr
< 128);
986 brw_inst_set_3src_src1_swizzle(devinfo
, inst
, src1
.swizzle
);
987 brw_inst_set_3src_src1_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src1
));
988 brw_inst_set_3src_src1_reg_nr(devinfo
, inst
, src1
.nr
);
989 brw_inst_set_3src_src1_abs(devinfo
, inst
, src1
.abs
);
990 brw_inst_set_3src_src1_negate(devinfo
, inst
, src1
.negate
);
991 brw_inst_set_3src_src1_rep_ctrl(devinfo
, inst
,
992 src1
.vstride
== BRW_VERTICAL_STRIDE_0
);
994 assert(src2
.file
== BRW_GENERAL_REGISTER_FILE
);
995 assert(src2
.address_mode
== BRW_ADDRESS_DIRECT
);
996 assert(src2
.nr
< 128);
997 brw_inst_set_3src_src2_swizzle(devinfo
, inst
, src2
.swizzle
);
998 brw_inst_set_3src_src2_subreg_nr(devinfo
, inst
, get_3src_subreg_nr(src2
));
999 brw_inst_set_3src_src2_reg_nr(devinfo
, inst
, src2
.nr
);
1000 brw_inst_set_3src_src2_abs(devinfo
, inst
, src2
.abs
);
1001 brw_inst_set_3src_src2_negate(devinfo
, inst
, src2
.negate
);
1002 brw_inst_set_3src_src2_rep_ctrl(devinfo
, inst
,
1003 src2
.vstride
== BRW_VERTICAL_STRIDE_0
);
1005 if (devinfo
->gen
>= 7) {
1006 /* Set both the source and destination types based on dest.type,
1007 * ignoring the source register types. The MAD and LRP emitters ensure
1008 * that all four types are float. The BFE and BFI2 emitters, however,
1009 * may send us mixed D and UD types and want us to ignore that and use
1010 * the destination type.
1012 switch (dest
.type
) {
1013 case BRW_REGISTER_TYPE_F
:
1014 brw_inst_set_3src_src_type(devinfo
, inst
, BRW_3SRC_TYPE_F
);
1015 brw_inst_set_3src_dst_type(devinfo
, inst
, BRW_3SRC_TYPE_F
);
1017 case BRW_REGISTER_TYPE_DF
:
1018 brw_inst_set_3src_src_type(devinfo
, inst
, BRW_3SRC_TYPE_DF
);
1019 brw_inst_set_3src_dst_type(devinfo
, inst
, BRW_3SRC_TYPE_DF
);
1021 case BRW_REGISTER_TYPE_D
:
1022 brw_inst_set_3src_src_type(devinfo
, inst
, BRW_3SRC_TYPE_D
);
1023 brw_inst_set_3src_dst_type(devinfo
, inst
, BRW_3SRC_TYPE_D
);
1025 case BRW_REGISTER_TYPE_UD
:
1026 brw_inst_set_3src_src_type(devinfo
, inst
, BRW_3SRC_TYPE_UD
);
1027 brw_inst_set_3src_dst_type(devinfo
, inst
, BRW_3SRC_TYPE_UD
);
1030 unreachable("not reached");
/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                          \
brw_inst *brw_##OP(struct brw_codegen *p,                 \
                   struct brw_reg dest,                   \
                   struct brw_reg src0)                   \
{                                                         \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);       \
}
#define ALU2(OP)                                          \
brw_inst *brw_##OP(struct brw_codegen *p,                 \
                   struct brw_reg dest,                   \
                   struct brw_reg src0,                   \
                   struct brw_reg src1)                   \
{                                                         \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
}
#define ALU3(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
/* Float-only 3-src variant: asserts that all operand types agree with the
 * (F or DF) destination type before delegating to brw_alu3().
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                     \
void brw_##OP(struct brw_codegen *p,                                  \
              struct brw_reg dest,                                    \
              struct brw_reg src)                                     \
{                                                                     \
   const struct gen_device_info *devinfo = p->devinfo;                \
   brw_inst *rnd, *add;                                               \
   rnd = next_insn(p, BRW_OPCODE_##OP);                               \
   brw_set_dest(p, rnd, dest);                                        \
   brw_set_src0(p, rnd, src);                                         \
                                                                      \
   if (devinfo->gen < 6) {                                            \
      /* turn on round-increments */                                  \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);    \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                  \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);  \
   }                                                                  \
}
1150 brw_MOV(struct brw_codegen
*p
, struct brw_reg dest
, struct brw_reg src0
)
1152 const struct gen_device_info
*devinfo
= p
->devinfo
;
1154 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1155 * To avoid the problems that causes, we use a <1,2,0> source region to read
1156 * each element twice.
1158 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
&&
1159 brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
&&
1160 dest
.type
== BRW_REGISTER_TYPE_DF
&&
1161 (src0
.type
== BRW_REGISTER_TYPE_F
||
1162 src0
.type
== BRW_REGISTER_TYPE_D
||
1163 src0
.type
== BRW_REGISTER_TYPE_UD
) &&
1164 !has_scalar_region(src0
)) {
1165 assert(src0
.vstride
== BRW_VERTICAL_STRIDE_4
&&
1166 src0
.width
== BRW_WIDTH_4
&&
1167 src0
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
1169 src0
.vstride
= BRW_VERTICAL_STRIDE_1
;
1170 src0
.width
= BRW_WIDTH_2
;
1171 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1174 return brw_alu1(p
, BRW_OPCODE_MOV
, dest
, src0
);
1178 brw_ADD(struct brw_codegen
*p
, struct brw_reg dest
,
1179 struct brw_reg src0
, struct brw_reg src1
)
1182 if (src0
.type
== BRW_REGISTER_TYPE_F
||
1183 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
1184 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
1185 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
1186 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
1189 if (src1
.type
== BRW_REGISTER_TYPE_F
||
1190 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
1191 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
1192 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
1193 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
1196 return brw_alu2(p
, BRW_OPCODE_ADD
, dest
, src0
, src1
);
1200 brw_AVG(struct brw_codegen
*p
, struct brw_reg dest
,
1201 struct brw_reg src0
, struct brw_reg src1
)
1203 assert(dest
.type
== src0
.type
);
1204 assert(src0
.type
== src1
.type
);
1205 switch (src0
.type
) {
1206 case BRW_REGISTER_TYPE_B
:
1207 case BRW_REGISTER_TYPE_UB
:
1208 case BRW_REGISTER_TYPE_W
:
1209 case BRW_REGISTER_TYPE_UW
:
1210 case BRW_REGISTER_TYPE_D
:
1211 case BRW_REGISTER_TYPE_UD
:
1214 unreachable("Bad type for brw_AVG");
1217 return brw_alu2(p
, BRW_OPCODE_AVG
, dest
, src0
, src1
);
1221 brw_MUL(struct brw_codegen
*p
, struct brw_reg dest
,
1222 struct brw_reg src0
, struct brw_reg src1
)
1225 if (src0
.type
== BRW_REGISTER_TYPE_D
||
1226 src0
.type
== BRW_REGISTER_TYPE_UD
||
1227 src1
.type
== BRW_REGISTER_TYPE_D
||
1228 src1
.type
== BRW_REGISTER_TYPE_UD
) {
1229 assert(dest
.type
!= BRW_REGISTER_TYPE_F
);
1232 if (src0
.type
== BRW_REGISTER_TYPE_F
||
1233 (src0
.file
== BRW_IMMEDIATE_VALUE
&&
1234 src0
.type
== BRW_REGISTER_TYPE_VF
)) {
1235 assert(src1
.type
!= BRW_REGISTER_TYPE_UD
);
1236 assert(src1
.type
!= BRW_REGISTER_TYPE_D
);
1239 if (src1
.type
== BRW_REGISTER_TYPE_F
||
1240 (src1
.file
== BRW_IMMEDIATE_VALUE
&&
1241 src1
.type
== BRW_REGISTER_TYPE_VF
)) {
1242 assert(src0
.type
!= BRW_REGISTER_TYPE_UD
);
1243 assert(src0
.type
!= BRW_REGISTER_TYPE_D
);
1246 assert(src0
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1247 src0
.nr
!= BRW_ARF_ACCUMULATOR
);
1248 assert(src1
.file
!= BRW_ARCHITECTURE_REGISTER_FILE
||
1249 src1
.nr
!= BRW_ARF_ACCUMULATOR
);
1251 return brw_alu2(p
, BRW_OPCODE_MUL
, dest
, src0
, src1
);
1255 brw_LINE(struct brw_codegen
*p
, struct brw_reg dest
,
1256 struct brw_reg src0
, struct brw_reg src1
)
1258 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1259 src0
.width
= BRW_WIDTH_1
;
1260 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1261 return brw_alu2(p
, BRW_OPCODE_LINE
, dest
, src0
, src1
);
1265 brw_PLN(struct brw_codegen
*p
, struct brw_reg dest
,
1266 struct brw_reg src0
, struct brw_reg src1
)
1268 src0
.vstride
= BRW_VERTICAL_STRIDE_0
;
1269 src0
.width
= BRW_WIDTH_1
;
1270 src0
.hstride
= BRW_HORIZONTAL_STRIDE_0
;
1271 src1
.vstride
= BRW_VERTICAL_STRIDE_8
;
1272 src1
.width
= BRW_WIDTH_8
;
1273 src1
.hstride
= BRW_HORIZONTAL_STRIDE_1
;
1274 return brw_alu2(p
, BRW_OPCODE_PLN
, dest
, src0
, src1
);
1278 brw_F32TO16(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1280 const struct gen_device_info
*devinfo
= p
->devinfo
;
1281 const bool align16
= brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_16
;
1282 /* The F32TO16 instruction doesn't support 32-bit destination types in
1283 * Align1 mode, and neither does the Gen8 implementation in terms of a
1284 * converting MOV. Gen7 does zero out the high 16 bits in Align16 mode as
1285 * an undocumented feature.
1287 const bool needs_zero_fill
= (dst
.type
== BRW_REGISTER_TYPE_UD
&&
1288 (!align16
|| devinfo
->gen
>= 8));
1292 assert(dst
.type
== BRW_REGISTER_TYPE_UD
);
1294 assert(dst
.type
== BRW_REGISTER_TYPE_UD
||
1295 dst
.type
== BRW_REGISTER_TYPE_W
||
1296 dst
.type
== BRW_REGISTER_TYPE_UW
||
1297 dst
.type
== BRW_REGISTER_TYPE_HF
);
1300 brw_push_insn_state(p
);
1302 if (needs_zero_fill
) {
1303 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
1304 dst
= spread(retype(dst
, BRW_REGISTER_TYPE_W
), 2);
1307 if (devinfo
->gen
>= 8) {
1308 inst
= brw_MOV(p
, retype(dst
, BRW_REGISTER_TYPE_HF
), src
);
1310 assert(devinfo
->gen
== 7);
1311 inst
= brw_alu1(p
, BRW_OPCODE_F32TO16
, dst
, src
);
1314 if (needs_zero_fill
) {
1315 brw_inst_set_no_dd_clear(devinfo
, inst
, true);
1316 inst
= brw_MOV(p
, suboffset(dst
, 1), brw_imm_w(0));
1317 brw_inst_set_no_dd_check(devinfo
, inst
, true);
1320 brw_pop_insn_state(p
);
1325 brw_F16TO32(struct brw_codegen
*p
, struct brw_reg dst
, struct brw_reg src
)
1327 const struct gen_device_info
*devinfo
= p
->devinfo
;
1328 bool align16
= brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_16
;
1331 assert(src
.type
== BRW_REGISTER_TYPE_UD
);
1333 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1335 * Because this instruction does not have a 16-bit floating-point
1336 * type, the source data type must be Word (W). The destination type
1337 * must be F (Float).
1339 if (src
.type
== BRW_REGISTER_TYPE_UD
)
1340 src
= spread(retype(src
, BRW_REGISTER_TYPE_W
), 2);
1342 assert(src
.type
== BRW_REGISTER_TYPE_W
||
1343 src
.type
== BRW_REGISTER_TYPE_UW
||
1344 src
.type
== BRW_REGISTER_TYPE_HF
);
1347 if (devinfo
->gen
>= 8) {
1348 return brw_MOV(p
, dst
, retype(src
, BRW_REGISTER_TYPE_HF
));
1350 assert(devinfo
->gen
== 7);
1351 return brw_alu1(p
, BRW_OPCODE_F16TO32
, dst
, src
);
1356 void brw_NOP(struct brw_codegen
*p
)
1358 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_NOP
);
1359 memset(insn
, 0, sizeof(*insn
));
1360 brw_inst_set_opcode(p
->devinfo
, insn
, BRW_OPCODE_NOP
);
1367 /***********************************************************************
1368 * Comparisons, if/else/endif
1372 brw_JMPI(struct brw_codegen
*p
, struct brw_reg index
,
1373 unsigned predicate_control
)
1375 const struct gen_device_info
*devinfo
= p
->devinfo
;
1376 struct brw_reg ip
= brw_ip_reg();
1377 brw_inst
*inst
= brw_alu2(p
, BRW_OPCODE_JMPI
, ip
, ip
, index
);
1379 brw_inst_set_exec_size(devinfo
, inst
, BRW_EXECUTE_2
);
1380 brw_inst_set_qtr_control(devinfo
, inst
, BRW_COMPRESSION_NONE
);
1381 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
1382 brw_inst_set_pred_control(devinfo
, inst
, predicate_control
);
1388 push_if_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1390 p
->if_stack
[p
->if_stack_depth
] = inst
- p
->store
;
1392 p
->if_stack_depth
++;
1393 if (p
->if_stack_array_size
<= p
->if_stack_depth
) {
1394 p
->if_stack_array_size
*= 2;
1395 p
->if_stack
= reralloc(p
->mem_ctx
, p
->if_stack
, int,
1396 p
->if_stack_array_size
);
1401 pop_if_stack(struct brw_codegen
*p
)
1403 p
->if_stack_depth
--;
1404 return &p
->store
[p
->if_stack
[p
->if_stack_depth
]];
1408 push_loop_stack(struct brw_codegen
*p
, brw_inst
*inst
)
1410 if (p
->loop_stack_array_size
<= (p
->loop_stack_depth
+ 1)) {
1411 p
->loop_stack_array_size
*= 2;
1412 p
->loop_stack
= reralloc(p
->mem_ctx
, p
->loop_stack
, int,
1413 p
->loop_stack_array_size
);
1414 p
->if_depth_in_loop
= reralloc(p
->mem_ctx
, p
->if_depth_in_loop
, int,
1415 p
->loop_stack_array_size
);
1418 p
->loop_stack
[p
->loop_stack_depth
] = inst
- p
->store
;
1419 p
->loop_stack_depth
++;
1420 p
->if_depth_in_loop
[p
->loop_stack_depth
] = 0;
1424 get_inner_do_insn(struct brw_codegen
*p
)
1426 return &p
->store
[p
->loop_stack
[p
->loop_stack_depth
- 1]];
1429 /* EU takes the value from the flag register and pushes it onto some
1430 * sort of a stack (presumably merging with any flag value already on
1431 * the stack). Within an if block, the flags at the top of the stack
1432 * control execution on each channel of the unit, eg. on each of the
1433 * 16 pixel values in our wm programs.
1435 * When the matching 'else' instruction is reached (presumably by
1436 * countdown of the instruction count patched in by our ELSE/ENDIF
1437 * functions), the relevant flags are inverted.
1439 * When the matching 'endif' instruction is reached, the flags are
1440 * popped off. If the stack is now empty, normal execution resumes.
1443 brw_IF(struct brw_codegen
*p
, unsigned execute_size
)
1445 const struct gen_device_info
*devinfo
= p
->devinfo
;
1448 insn
= next_insn(p
, BRW_OPCODE_IF
);
1450 /* Override the defaults for this instruction:
1452 if (devinfo
->gen
< 6) {
1453 brw_set_dest(p
, insn
, brw_ip_reg());
1454 brw_set_src0(p
, insn
, brw_ip_reg());
1455 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1456 } else if (devinfo
->gen
== 6) {
1457 brw_set_dest(p
, insn
, brw_imm_w(0));
1458 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1459 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1460 brw_set_src1(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1461 } else if (devinfo
->gen
== 7) {
1462 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1463 brw_set_src0(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1464 brw_set_src1(p
, insn
, brw_imm_w(0));
1465 brw_inst_set_jip(devinfo
, insn
, 0);
1466 brw_inst_set_uip(devinfo
, insn
, 0);
1468 brw_set_dest(p
, insn
, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
)));
1469 brw_set_src0(p
, insn
, brw_imm_d(0));
1470 brw_inst_set_jip(devinfo
, insn
, 0);
1471 brw_inst_set_uip(devinfo
, insn
, 0);
1474 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1475 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1476 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NORMAL
);
1477 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1478 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1479 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1481 push_if_stack(p
, insn
);
1482 p
->if_depth_in_loop
[p
->loop_stack_depth
]++;
1486 /* This function is only used for gen6-style IF instructions with an
1487 * embedded comparison (conditional modifier). It is not used on gen7.
1490 gen6_IF(struct brw_codegen
*p
, enum brw_conditional_mod conditional
,
1491 struct brw_reg src0
, struct brw_reg src1
)
1493 const struct gen_device_info
*devinfo
= p
->devinfo
;
1496 insn
= next_insn(p
, BRW_OPCODE_IF
);
1498 brw_set_dest(p
, insn
, brw_imm_w(0));
1499 brw_inst_set_exec_size(devinfo
, insn
,
1500 brw_inst_exec_size(devinfo
, p
->current
));
1501 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1502 brw_set_src0(p
, insn
, src0
);
1503 brw_set_src1(p
, insn
, src1
);
1505 assert(brw_inst_qtr_control(devinfo
, insn
) == BRW_COMPRESSION_NONE
);
1506 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
1507 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
1509 push_if_stack(p
, insn
);
1514 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1517 convert_IF_ELSE_to_ADD(struct brw_codegen
*p
,
1518 brw_inst
*if_inst
, brw_inst
*else_inst
)
1520 const struct gen_device_info
*devinfo
= p
->devinfo
;
1522 /* The next instruction (where the ENDIF would be, if it existed) */
1523 brw_inst
*next_inst
= &p
->store
[p
->nr_insn
];
1525 assert(p
->single_program_flow
);
1526 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1527 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1528 assert(brw_inst_exec_size(devinfo
, if_inst
) == BRW_EXECUTE_1
);
1530 /* Convert IF to an ADD instruction that moves the instruction pointer
1531 * to the first instruction of the ELSE block. If there is no ELSE
1532 * block, point to where ENDIF would be. Reverse the predicate.
1534 * There's no need to execute an ENDIF since we don't need to do any
1535 * stack operations, and if we're currently executing, we just want to
1536 * continue normally.
1538 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_ADD
);
1539 brw_inst_set_pred_inv(devinfo
, if_inst
, true);
1541 if (else_inst
!= NULL
) {
1542 /* Convert ELSE to an ADD instruction that points where the ENDIF
1545 brw_inst_set_opcode(devinfo
, else_inst
, BRW_OPCODE_ADD
);
1547 brw_inst_set_imm_ud(devinfo
, if_inst
, (else_inst
- if_inst
+ 1) * 16);
1548 brw_inst_set_imm_ud(devinfo
, else_inst
, (next_inst
- else_inst
) * 16);
1550 brw_inst_set_imm_ud(devinfo
, if_inst
, (next_inst
- if_inst
) * 16);
1555 * Patch IF and ELSE instructions with appropriate jump targets.
1558 patch_IF_ELSE(struct brw_codegen
*p
,
1559 brw_inst
*if_inst
, brw_inst
*else_inst
, brw_inst
*endif_inst
)
1561 const struct gen_device_info
*devinfo
= p
->devinfo
;
1563 /* We shouldn't be patching IF and ELSE instructions in single program flow
1564 * mode when gen < 6, because in single program flow mode on those
1565 * platforms, we convert flow control instructions to conditional ADDs that
1566 * operate on IP (see brw_ENDIF).
1568 * However, on Gen6, writing to IP doesn't work in single program flow mode
1569 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1570 * not be updated by non-flow control instructions."). And on later
1571 * platforms, there is no significant benefit to converting control flow
1572 * instructions to conditional ADDs. So we do patch IF and ELSE
1573 * instructions in single program flow mode on those platforms.
1575 if (devinfo
->gen
< 6)
1576 assert(!p
->single_program_flow
);
1578 assert(if_inst
!= NULL
&& brw_inst_opcode(devinfo
, if_inst
) == BRW_OPCODE_IF
);
1579 assert(endif_inst
!= NULL
);
1580 assert(else_inst
== NULL
|| brw_inst_opcode(devinfo
, else_inst
) == BRW_OPCODE_ELSE
);
1582 unsigned br
= brw_jump_scale(devinfo
);
1584 assert(brw_inst_opcode(devinfo
, endif_inst
) == BRW_OPCODE_ENDIF
);
1585 brw_inst_set_exec_size(devinfo
, endif_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1587 if (else_inst
== NULL
) {
1588 /* Patch IF -> ENDIF */
1589 if (devinfo
->gen
< 6) {
1590 /* Turn it into an IFF, which means no mask stack operations for
1591 * all-false and jumping past the ENDIF.
1593 brw_inst_set_opcode(devinfo
, if_inst
, BRW_OPCODE_IFF
);
1594 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1595 br
* (endif_inst
- if_inst
+ 1));
1596 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1597 } else if (devinfo
->gen
== 6) {
1598 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1599 brw_inst_set_gen6_jump_count(devinfo
, if_inst
, br
*(endif_inst
- if_inst
));
1601 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1602 brw_inst_set_jip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1605 brw_inst_set_exec_size(devinfo
, else_inst
, brw_inst_exec_size(devinfo
, if_inst
));
1607 /* Patch IF -> ELSE */
1608 if (devinfo
->gen
< 6) {
1609 brw_inst_set_gen4_jump_count(devinfo
, if_inst
,
1610 br
* (else_inst
- if_inst
));
1611 brw_inst_set_gen4_pop_count(devinfo
, if_inst
, 0);
1612 } else if (devinfo
->gen
== 6) {
1613 brw_inst_set_gen6_jump_count(devinfo
, if_inst
,
1614 br
* (else_inst
- if_inst
+ 1));
1617 /* Patch ELSE -> ENDIF */
1618 if (devinfo
->gen
< 6) {
1619 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1622 brw_inst_set_gen4_jump_count(devinfo
, else_inst
,
1623 br
* (endif_inst
- else_inst
+ 1));
1624 brw_inst_set_gen4_pop_count(devinfo
, else_inst
, 1);
1625 } else if (devinfo
->gen
== 6) {
1626 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1627 brw_inst_set_gen6_jump_count(devinfo
, else_inst
,
1628 br
* (endif_inst
- else_inst
));
1630 /* The IF instruction's JIP should point just past the ELSE */
1631 brw_inst_set_jip(devinfo
, if_inst
, br
* (else_inst
- if_inst
+ 1));
1632 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1633 brw_inst_set_uip(devinfo
, if_inst
, br
* (endif_inst
- if_inst
));
1634 brw_inst_set_jip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1635 if (devinfo
->gen
>= 8) {
1636 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1637 * should point to ENDIF.
1639 brw_inst_set_uip(devinfo
, else_inst
, br
* (endif_inst
- else_inst
));
1646 brw_ELSE(struct brw_codegen
*p
)
1648 const struct gen_device_info
*devinfo
= p
->devinfo
;
1651 insn
= next_insn(p
, BRW_OPCODE_ELSE
);
1653 if (devinfo
->gen
< 6) {
1654 brw_set_dest(p
, insn
, brw_ip_reg());
1655 brw_set_src0(p
, insn
, brw_ip_reg());
1656 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1657 } else if (devinfo
->gen
== 6) {
1658 brw_set_dest(p
, insn
, brw_imm_w(0));
1659 brw_inst_set_gen6_jump_count(devinfo
, insn
, 0);
1660 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1661 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1662 } else if (devinfo
->gen
== 7) {
1663 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1664 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1665 brw_set_src1(p
, insn
, brw_imm_w(0));
1666 brw_inst_set_jip(devinfo
, insn
, 0);
1667 brw_inst_set_uip(devinfo
, insn
, 0);
1669 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1670 brw_set_src0(p
, insn
, brw_imm_d(0));
1671 brw_inst_set_jip(devinfo
, insn
, 0);
1672 brw_inst_set_uip(devinfo
, insn
, 0);
1675 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1676 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1677 if (!p
->single_program_flow
&& devinfo
->gen
< 6)
1678 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1680 push_if_stack(p
, insn
);
1684 brw_ENDIF(struct brw_codegen
*p
)
1686 const struct gen_device_info
*devinfo
= p
->devinfo
;
1687 brw_inst
*insn
= NULL
;
1688 brw_inst
*else_inst
= NULL
;
1689 brw_inst
*if_inst
= NULL
;
1691 bool emit_endif
= true;
1693 /* In single program flow mode, we can express IF and ELSE instructions
1694 * equivalently as ADD instructions that operate on IP. On platforms prior
1695 * to Gen6, flow control instructions cause an implied thread switch, so
1696 * this is a significant savings.
1698 * However, on Gen6, writing to IP doesn't work in single program flow mode
1699 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1700 * not be updated by non-flow control instructions."). And on later
1701 * platforms, there is no significant benefit to converting control flow
1702 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1705 if (devinfo
->gen
< 6 && p
->single_program_flow
)
1709 * A single next_insn() may change the base address of instruction store
1710 * memory(p->store), so call it first before referencing the instruction
1711 * store pointer from an index
1714 insn
= next_insn(p
, BRW_OPCODE_ENDIF
);
1716 /* Pop the IF and (optional) ELSE instructions from the stack */
1717 p
->if_depth_in_loop
[p
->loop_stack_depth
]--;
1718 tmp
= pop_if_stack(p
);
1719 if (brw_inst_opcode(devinfo
, tmp
) == BRW_OPCODE_ELSE
) {
1721 tmp
= pop_if_stack(p
);
1726 /* ENDIF is useless; don't bother emitting it. */
1727 convert_IF_ELSE_to_ADD(p
, if_inst
, else_inst
);
1731 if (devinfo
->gen
< 6) {
1732 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1733 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1734 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1735 } else if (devinfo
->gen
== 6) {
1736 brw_set_dest(p
, insn
, brw_imm_w(0));
1737 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1738 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1739 } else if (devinfo
->gen
== 7) {
1740 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1741 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1742 brw_set_src1(p
, insn
, brw_imm_w(0));
1744 brw_set_src0(p
, insn
, brw_imm_d(0));
1747 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1748 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_ENABLE
);
1749 if (devinfo
->gen
< 6)
1750 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
1752 /* Also pop item off the stack in the endif instruction: */
1753 if (devinfo
->gen
< 6) {
1754 brw_inst_set_gen4_jump_count(devinfo
, insn
, 0);
1755 brw_inst_set_gen4_pop_count(devinfo
, insn
, 1);
1756 } else if (devinfo
->gen
== 6) {
1757 brw_inst_set_gen6_jump_count(devinfo
, insn
, 2);
1759 brw_inst_set_jip(devinfo
, insn
, 2);
1761 patch_IF_ELSE(p
, if_inst
, else_inst
, insn
);
1765 brw_BREAK(struct brw_codegen
*p
)
1767 const struct gen_device_info
*devinfo
= p
->devinfo
;
1770 insn
= next_insn(p
, BRW_OPCODE_BREAK
);
1771 if (devinfo
->gen
>= 8) {
1772 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1773 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1774 } else if (devinfo
->gen
>= 6) {
1775 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1776 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1777 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1779 brw_set_dest(p
, insn
, brw_ip_reg());
1780 brw_set_src0(p
, insn
, brw_ip_reg());
1781 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1782 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1783 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1785 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1786 brw_inst_set_exec_size(devinfo
, insn
,
1787 brw_inst_exec_size(devinfo
, p
->current
));
1793 brw_CONT(struct brw_codegen
*p
)
1795 const struct gen_device_info
*devinfo
= p
->devinfo
;
1798 insn
= next_insn(p
, BRW_OPCODE_CONTINUE
);
1799 brw_set_dest(p
, insn
, brw_ip_reg());
1800 if (devinfo
->gen
>= 8) {
1801 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1803 brw_set_src0(p
, insn
, brw_ip_reg());
1804 brw_set_src1(p
, insn
, brw_imm_d(0x0));
1807 if (devinfo
->gen
< 6) {
1808 brw_inst_set_gen4_pop_count(devinfo
, insn
,
1809 p
->if_depth_in_loop
[p
->loop_stack_depth
]);
1811 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1812 brw_inst_set_exec_size(devinfo
, insn
,
1813 brw_inst_exec_size(devinfo
, p
->current
));
1818 gen6_HALT(struct brw_codegen
*p
)
1820 const struct gen_device_info
*devinfo
= p
->devinfo
;
1823 insn
= next_insn(p
, BRW_OPCODE_HALT
);
1824 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1825 if (devinfo
->gen
>= 8) {
1826 brw_set_src0(p
, insn
, brw_imm_d(0x0));
1828 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1829 brw_set_src1(p
, insn
, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1832 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1833 brw_inst_set_exec_size(devinfo
, insn
,
1834 brw_inst_exec_size(devinfo
, p
->current
));
1840 * The DO/WHILE is just an unterminated loop -- break or continue are
1841 * used for control within the loop. We have a few ways they can be
1844 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1845 * jip and no DO instruction.
1847 * For non-uniform control flow pre-gen6, there's a DO instruction to
1848 * push the mask, and a WHILE to jump back, and BREAK to get out and
1851 * For gen6, there's no more mask stack, so no need for DO. WHILE
1852 * just points back to the first instruction of the loop.
1855 brw_DO(struct brw_codegen
*p
, unsigned execute_size
)
1857 const struct gen_device_info
*devinfo
= p
->devinfo
;
1859 if (devinfo
->gen
>= 6 || p
->single_program_flow
) {
1860 push_loop_stack(p
, &p
->store
[p
->nr_insn
]);
1861 return &p
->store
[p
->nr_insn
];
1863 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_DO
);
1865 push_loop_stack(p
, insn
);
1867 /* Override the defaults for this instruction:
1869 brw_set_dest(p
, insn
, brw_null_reg());
1870 brw_set_src0(p
, insn
, brw_null_reg());
1871 brw_set_src1(p
, insn
, brw_null_reg());
1873 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1874 brw_inst_set_exec_size(devinfo
, insn
, execute_size
);
1875 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
);
1882 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1885 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1886 * nesting, since it can always just point to the end of the block/current loop.
1889 brw_patch_break_cont(struct brw_codegen
*p
, brw_inst
*while_inst
)
1891 const struct gen_device_info
*devinfo
= p
->devinfo
;
1892 brw_inst
*do_inst
= get_inner_do_insn(p
);
1894 unsigned br
= brw_jump_scale(devinfo
);
1896 assert(devinfo
->gen
< 6);
1898 for (inst
= while_inst
- 1; inst
!= do_inst
; inst
--) {
1899 /* If the jump count is != 0, that means that this instruction has already
1900 * been patched because it's part of a loop inside of the one we're
1903 if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_BREAK
&&
1904 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1905 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
*((while_inst
- inst
) + 1));
1906 } else if (brw_inst_opcode(devinfo
, inst
) == BRW_OPCODE_CONTINUE
&&
1907 brw_inst_gen4_jump_count(devinfo
, inst
) == 0) {
1908 brw_inst_set_gen4_jump_count(devinfo
, inst
, br
* (while_inst
- inst
));
1914 brw_WHILE(struct brw_codegen
*p
)
1916 const struct gen_device_info
*devinfo
= p
->devinfo
;
1917 brw_inst
*insn
, *do_insn
;
1918 unsigned br
= brw_jump_scale(devinfo
);
1920 if (devinfo
->gen
>= 6) {
1921 insn
= next_insn(p
, BRW_OPCODE_WHILE
);
1922 do_insn
= get_inner_do_insn(p
);
1924 if (devinfo
->gen
>= 8) {
1925 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1926 brw_set_src0(p
, insn
, brw_imm_d(0));
1927 brw_inst_set_jip(devinfo
, insn
, br
* (do_insn
- insn
));
1928 } else if (devinfo
->gen
== 7) {
1929 brw_set_dest(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1930 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1931 brw_set_src1(p
, insn
, brw_imm_w(0));
1932 brw_inst_set_jip(devinfo
, insn
, br
* (do_insn
- insn
));
1934 brw_set_dest(p
, insn
, brw_imm_w(0));
1935 brw_inst_set_gen6_jump_count(devinfo
, insn
, br
* (do_insn
- insn
));
1936 brw_set_src0(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1937 brw_set_src1(p
, insn
, retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
1940 brw_inst_set_exec_size(devinfo
, insn
,
1941 brw_inst_exec_size(devinfo
, p
->current
));
1944 if (p
->single_program_flow
) {
1945 insn
= next_insn(p
, BRW_OPCODE_ADD
);
1946 do_insn
= get_inner_do_insn(p
);
1948 brw_set_dest(p
, insn
, brw_ip_reg());
1949 brw_set_src0(p
, insn
, brw_ip_reg());
1950 brw_set_src1(p
, insn
, brw_imm_d((do_insn
- insn
) * 16));
1951 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
1953 insn
= next_insn(p
, BRW_OPCODE_WHILE
);
1954 do_insn
= get_inner_do_insn(p
);
1956 assert(brw_inst_opcode(devinfo
, do_insn
) == BRW_OPCODE_DO
);
1958 brw_set_dest(p
, insn
, brw_ip_reg());
1959 brw_set_src0(p
, insn
, brw_ip_reg());
1960 brw_set_src1(p
, insn
, brw_imm_d(0));
1962 brw_inst_set_exec_size(devinfo
, insn
, brw_inst_exec_size(devinfo
, do_insn
));
1963 brw_inst_set_gen4_jump_count(devinfo
, insn
, br
* (do_insn
- insn
+ 1));
1964 brw_inst_set_gen4_pop_count(devinfo
, insn
, 0);
1966 brw_patch_break_cont(p
, insn
);
1969 brw_inst_set_qtr_control(devinfo
, insn
, BRW_COMPRESSION_NONE
);
1971 p
->loop_stack_depth
--;
1978 void brw_land_fwd_jump(struct brw_codegen
*p
, int jmp_insn_idx
)
1980 const struct gen_device_info
*devinfo
= p
->devinfo
;
1981 brw_inst
*jmp_insn
= &p
->store
[jmp_insn_idx
];
1984 if (devinfo
->gen
>= 5)
1987 assert(brw_inst_opcode(devinfo
, jmp_insn
) == BRW_OPCODE_JMPI
);
1988 assert(brw_inst_src1_reg_file(devinfo
, jmp_insn
) == BRW_IMMEDIATE_VALUE
);
1990 brw_inst_set_gen4_jump_count(devinfo
, jmp_insn
,
1991 jmpi
* (p
->nr_insn
- jmp_insn_idx
- 1));
1994 /* To integrate with the above, it makes sense that the comparison
1995 * instruction should populate the flag register. It might be simpler
1996 * just to use the flag reg for most WM tasks?
1998 void brw_CMP(struct brw_codegen
*p
,
1999 struct brw_reg dest
,
2000 unsigned conditional
,
2001 struct brw_reg src0
,
2002 struct brw_reg src1
)
2004 const struct gen_device_info
*devinfo
= p
->devinfo
;
2005 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_CMP
);
2007 brw_inst_set_cond_modifier(devinfo
, insn
, conditional
);
2008 brw_set_dest(p
, insn
, dest
);
2009 brw_set_src0(p
, insn
, src0
);
2010 brw_set_src1(p
, insn
, src1
);
2012 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
2014 * "Any CMP instruction with a null destination must use a {switch}."
2016 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
2017 * mentioned on their work-arounds pages.
2019 if (devinfo
->gen
== 7) {
2020 if (dest
.file
== BRW_ARCHITECTURE_REGISTER_FILE
&&
2021 dest
.nr
== BRW_ARF_NULL
) {
2022 brw_inst_set_thread_control(devinfo
, insn
, BRW_THREAD_SWITCH
);
2027 /***********************************************************************
2028 * Helpers for the various SEND message types:
2031 /** Extended math function, float[8].
2033 void gen4_math(struct brw_codegen
*p
,
2034 struct brw_reg dest
,
2036 unsigned msg_reg_nr
,
2038 unsigned precision
)
2040 const struct gen_device_info
*devinfo
= p
->devinfo
;
2041 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2043 if (has_scalar_region(src
)) {
2044 data_type
= BRW_MATH_DATA_SCALAR
;
2046 data_type
= BRW_MATH_DATA_VECTOR
;
2049 assert(devinfo
->gen
< 6);
2051 /* Example code doesn't set predicate_control for send
2054 brw_inst_set_pred_control(devinfo
, insn
, 0);
2055 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2057 brw_set_dest(p
, insn
, dest
);
2058 brw_set_src0(p
, insn
, src
);
2059 brw_set_math_message(p
,
2062 src
.type
== BRW_REGISTER_TYPE_D
,
2067 void gen6_math(struct brw_codegen
*p
,
2068 struct brw_reg dest
,
2070 struct brw_reg src0
,
2071 struct brw_reg src1
)
2073 const struct gen_device_info
*devinfo
= p
->devinfo
;
2074 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_MATH
);
2076 assert(devinfo
->gen
>= 6);
2078 assert(dest
.file
== BRW_GENERAL_REGISTER_FILE
||
2079 (devinfo
->gen
>= 7 && dest
.file
== BRW_MESSAGE_REGISTER_FILE
));
2081 assert(dest
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
2082 if (devinfo
->gen
== 6) {
2083 assert(src0
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
2084 assert(src1
.hstride
== BRW_HORIZONTAL_STRIDE_1
);
2087 if (function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT
||
2088 function
== BRW_MATH_FUNCTION_INT_DIV_REMAINDER
||
2089 function
== BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER
) {
2090 assert(src0
.type
!= BRW_REGISTER_TYPE_F
);
2091 assert(src1
.type
!= BRW_REGISTER_TYPE_F
);
2092 assert(src1
.file
== BRW_GENERAL_REGISTER_FILE
||
2093 (devinfo
->gen
>= 8 && src1
.file
== BRW_IMMEDIATE_VALUE
));
2095 assert(src0
.type
== BRW_REGISTER_TYPE_F
);
2096 assert(src1
.type
== BRW_REGISTER_TYPE_F
);
2099 /* Source modifiers are ignored for extended math instructions on Gen6. */
2100 if (devinfo
->gen
== 6) {
2101 assert(!src0
.negate
);
2103 assert(!src1
.negate
);
2107 brw_inst_set_math_function(devinfo
, insn
, function
);
2109 brw_set_dest(p
, insn
, dest
);
2110 brw_set_src0(p
, insn
, src0
);
2111 brw_set_src1(p
, insn
, src1
);
2115 * Return the right surface index to access the thread scratch space using
2116 * stateless dataport messages.
2119 brw_scratch_surface_idx(const struct brw_codegen
*p
)
2121 /* The scratch space is thread-local so IA coherency is unnecessary. */
2122 if (p
->devinfo
->gen
>= 8)
2123 return GEN8_BTI_STATELESS_NON_COHERENT
;
2125 return BRW_BTI_STATELESS
;
2129 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2130 * using a constant offset per channel.
2132 * The offset must be aligned to oword size (16 bytes). Used for
2133 * register spilling.
2135 void brw_oword_block_write_scratch(struct brw_codegen
*p
,
2140 const struct gen_device_info
*devinfo
= p
->devinfo
;
2141 const unsigned target_cache
=
2142 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2143 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2144 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
);
2147 if (devinfo
->gen
>= 6)
2150 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2152 const unsigned mlen
= 1 + num_regs
;
2154 /* Set up the message header. This is g0, with g0.2 filled with
2155 * the offset. We don't want to leave our offset around in g0 or
2156 * it'll screw up texture samples, so set it up inside the message
2160 brw_push_insn_state(p
);
2161 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2162 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2163 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2165 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2167 /* set message header global offset field (reg 0, element 2) */
2169 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2171 2), BRW_REGISTER_TYPE_UD
),
2172 brw_imm_ud(offset
));
2174 brw_pop_insn_state(p
);
2178 struct brw_reg dest
;
2179 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2180 int send_commit_msg
;
2181 struct brw_reg src_header
= retype(brw_vec8_grf(0, 0),
2182 BRW_REGISTER_TYPE_UW
);
2184 brw_inst_set_compression(devinfo
, insn
, false);
2186 if (brw_inst_exec_size(devinfo
, insn
) >= 16)
2187 src_header
= vec16(src_header
);
2189 assert(brw_inst_pred_control(devinfo
, insn
) == BRW_PREDICATE_NONE
);
2190 if (devinfo
->gen
< 6)
2191 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2193 /* Until gen6, writes followed by reads from the same location
2194 * are not guaranteed to be ordered unless write_commit is set.
2195 * If set, then a no-op write is issued to the destination
2196 * register to set a dependency, and a read from the destination
2197 * can be used to ensure the ordering.
2199 * For gen6, only writes between different threads need ordering
2200 * protection. Our use of DP writes is all about register
2201 * spilling within a thread.
2203 if (devinfo
->gen
>= 6) {
2204 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2205 send_commit_msg
= 0;
2208 send_commit_msg
= 1;
2211 brw_set_dest(p
, insn
, dest
);
2212 if (devinfo
->gen
>= 6) {
2213 brw_set_src0(p
, insn
, mrf
);
2215 brw_set_src0(p
, insn
, brw_null_reg());
2218 if (devinfo
->gen
>= 6)
2219 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2221 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE
;
2223 brw_set_dp_write_message(p
,
2225 brw_scratch_surface_idx(p
),
2226 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2230 true, /* header_present */
2231 0, /* not a render target */
2232 send_commit_msg
, /* response_length */
2240 * Read a block of owords (half a GRF each) from the scratch buffer
2241 * using a constant index per channel.
2243 * Offset must be aligned to oword size (16 bytes). Used for register
2247 brw_oword_block_read_scratch(struct brw_codegen
*p
,
2248 struct brw_reg dest
,
2253 const struct gen_device_info
*devinfo
= p
->devinfo
;
2255 if (devinfo
->gen
>= 6)
2258 if (p
->devinfo
->gen
>= 7) {
2259 /* On gen 7 and above, we no longer have message registers and we can
2260 * send from any register we want. By using the destination register
2261 * for the message, we guarantee that the implied message write won't
2262 * accidentally overwrite anything. This has been a problem because
2263 * the MRF registers and source for the final FB write are both fixed
2266 mrf
= retype(dest
, BRW_REGISTER_TYPE_UD
);
2268 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2270 dest
= retype(dest
, BRW_REGISTER_TYPE_UW
);
2272 const unsigned rlen
= num_regs
;
2273 const unsigned target_cache
=
2274 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2275 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2276 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
);
2279 brw_push_insn_state(p
);
2280 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2281 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2282 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2284 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2286 /* set message header global offset field (reg 0, element 2) */
2287 brw_MOV(p
, get_element_ud(mrf
, 2), brw_imm_ud(offset
));
2289 brw_pop_insn_state(p
);
2293 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2295 assert(brw_inst_pred_control(devinfo
, insn
) == 0);
2296 brw_inst_set_compression(devinfo
, insn
, false);
2298 brw_set_dest(p
, insn
, dest
); /* UW? */
2299 if (devinfo
->gen
>= 6) {
2300 brw_set_src0(p
, insn
, mrf
);
2302 brw_set_src0(p
, insn
, brw_null_reg());
2303 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2306 brw_set_dp_read_message(p
,
2308 brw_scratch_surface_idx(p
),
2309 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs
* 8),
2310 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
, /* msg_type */
2313 true, /* header_present */
2319 gen7_block_read_scratch(struct brw_codegen
*p
,
2320 struct brw_reg dest
,
2324 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2325 assert(brw_inst_pred_control(p
->devinfo
, insn
) == BRW_PREDICATE_NONE
);
2327 brw_set_dest(p
, insn
, retype(dest
, BRW_REGISTER_TYPE_UW
));
2329 /* The HW requires that the header is present; this is to get the g0.5
2332 brw_set_src0(p
, insn
, brw_vec8_grf(0, 0));
2334 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2335 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2336 * is 32 bytes, which happens to be the size of a register.
2339 assert(offset
< (1 << 12));
2341 gen7_set_dp_scratch_message(p
, insn
,
2342 false, /* scratch read */
2344 false, /* invalidate after read */
2347 1, /* mlen: just g0 */
2348 num_regs
, /* rlen */
2349 true); /* header present */
2353 * Read float[4] vectors from the data port constant cache.
2354 * Location (in buffer) should be a multiple of 16.
2355 * Used for fetching shader constants.
2357 void brw_oword_block_read(struct brw_codegen
*p
,
2358 struct brw_reg dest
,
2361 uint32_t bind_table_index
)
2363 const struct gen_device_info
*devinfo
= p
->devinfo
;
2364 const unsigned target_cache
=
2365 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE
:
2366 BRW_DATAPORT_READ_TARGET_DATA_CACHE
);
2367 const unsigned exec_size
= 1 << brw_inst_exec_size(devinfo
, p
->current
);
2369 /* On newer hardware, offset is in units of owords. */
2370 if (devinfo
->gen
>= 6)
2373 mrf
= retype(mrf
, BRW_REGISTER_TYPE_UD
);
2375 brw_push_insn_state(p
);
2376 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2377 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
2378 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2380 brw_push_insn_state(p
);
2381 brw_set_default_exec_size(p
, BRW_EXECUTE_8
);
2382 brw_MOV(p
, mrf
, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD
));
2384 /* set message header global offset field (reg 0, element 2) */
2386 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
,
2388 2), BRW_REGISTER_TYPE_UD
),
2389 brw_imm_ud(offset
));
2390 brw_pop_insn_state(p
);
2392 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SEND
);
2394 /* cast dest to a uword[8] vector */
2395 dest
= retype(vec8(dest
), BRW_REGISTER_TYPE_UW
);
2397 brw_set_dest(p
, insn
, dest
);
2398 if (devinfo
->gen
>= 6) {
2399 brw_set_src0(p
, insn
, mrf
);
2401 brw_set_src0(p
, insn
, brw_null_reg());
2402 brw_inst_set_base_mrf(devinfo
, insn
, mrf
.nr
);
2405 brw_set_dp_read_message(p
, insn
, bind_table_index
,
2406 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size
),
2407 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ
,
2410 true, /* header_present */
2411 DIV_ROUND_UP(exec_size
, 8)); /* response_length */
2413 brw_pop_insn_state(p
);
2417 void brw_fb_WRITE(struct brw_codegen
*p
,
2418 struct brw_reg payload
,
2419 struct brw_reg implied_header
,
2420 unsigned msg_control
,
2421 unsigned binding_table_index
,
2422 unsigned msg_length
,
2423 unsigned response_length
,
2425 bool last_render_target
,
2426 bool header_present
)
2428 const struct gen_device_info
*devinfo
= p
->devinfo
;
2429 const unsigned target_cache
=
2430 (devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2431 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
);
2434 struct brw_reg dest
, src0
;
2436 if (brw_inst_exec_size(devinfo
, p
->current
) >= BRW_EXECUTE_16
)
2437 dest
= retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2439 dest
= retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW
);
2441 if (devinfo
->gen
>= 6) {
2442 insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2444 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2446 brw_inst_set_compression(devinfo
, insn
, false);
2448 if (devinfo
->gen
>= 6) {
2449 /* headerless version, just submit color payload */
2452 msg_type
= GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2454 assert(payload
.file
== BRW_MESSAGE_REGISTER_FILE
);
2455 brw_inst_set_base_mrf(devinfo
, insn
, payload
.nr
);
2456 src0
= implied_header
;
2458 msg_type
= BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
;
2461 brw_set_dest(p
, insn
, dest
);
2462 brw_set_src0(p
, insn
, src0
);
2463 brw_set_dp_write_message(p
,
2465 binding_table_index
,
2474 0 /* send_commit_msg */);
2478 gen9_fb_READ(struct brw_codegen
*p
,
2480 struct brw_reg payload
,
2481 unsigned binding_table_index
,
2482 unsigned msg_length
,
2483 unsigned response_length
,
2486 const struct gen_device_info
*devinfo
= p
->devinfo
;
2487 assert(devinfo
->gen
>= 9);
2488 const unsigned msg_subtype
=
2489 brw_inst_exec_size(devinfo
, p
->current
) == BRW_EXECUTE_16
? 0 : 1;
2490 brw_inst
*insn
= next_insn(p
, BRW_OPCODE_SENDC
);
2492 brw_set_dest(p
, insn
, dst
);
2493 brw_set_src0(p
, insn
, payload
);
2494 brw_set_dp_read_message(p
, insn
, binding_table_index
,
2495 per_sample
<< 5 | msg_subtype
,
2496 GEN9_DATAPORT_RC_RENDER_TARGET_READ
,
2497 GEN6_SFID_DATAPORT_RENDER_CACHE
,
2498 msg_length
, true /* header_present */,
2500 brw_inst_set_rt_slot_group(devinfo
, insn
,
2501 brw_inst_qtr_control(devinfo
, p
->current
) / 2);
2507 * Texture sample instruction.
2508 * Note: the msg_type plus msg_length values determine exactly what kind
2509 * of sampling operation is performed. See volume 4, page 161 of docs.
2511 void brw_SAMPLE(struct brw_codegen
*p
,
2512 struct brw_reg dest
,
2513 unsigned msg_reg_nr
,
2514 struct brw_reg src0
,
2515 unsigned binding_table_index
,
2518 unsigned response_length
,
2519 unsigned msg_length
,
2520 unsigned header_present
,
2522 unsigned return_format
)
2524 const struct gen_device_info
*devinfo
= p
->devinfo
;
2527 if (msg_reg_nr
!= -1)
2528 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2530 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2531 brw_inst_set_pred_control(devinfo
, insn
, BRW_PREDICATE_NONE
); /* XXX */
2533 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2535 * "Instruction compression is not allowed for this instruction (that
2536 * is, send). The hardware behavior is undefined if this instruction is
2537 * set as compressed. However, compress control can be set to "SecHalf"
2538 * to affect the EMask generation."
2540 * No similar wording is found in later PRMs, but there are examples
2541 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2542 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2543 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2545 brw_inst_set_compression(devinfo
, insn
, false);
2547 if (devinfo
->gen
< 6)
2548 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2550 brw_set_dest(p
, insn
, dest
);
2551 brw_set_src0(p
, insn
, src0
);
2552 brw_set_sampler_message(p
, insn
,
2553 binding_table_index
,
2563 /* Adjust the message header's sampler state pointer to
2564 * select the correct group of 16 samplers.
2566 void brw_adjust_sampler_state_pointer(struct brw_codegen
*p
,
2567 struct brw_reg header
,
2568 struct brw_reg sampler_index
)
2570 /* The "Sampler Index" field can only store values between 0 and 15.
2571 * However, we can add an offset to the "Sampler State Pointer"
2572 * field, effectively selecting a different set of 16 samplers.
2574 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2575 * offset, and each sampler state is only 16-bytes, so we can't
2576 * exclusively use the offset - we have to use both.
2579 const struct gen_device_info
*devinfo
= p
->devinfo
;
2581 if (sampler_index
.file
== BRW_IMMEDIATE_VALUE
) {
2582 const int sampler_state_size
= 16; /* 16 bytes */
2583 uint32_t sampler
= sampler_index
.ud
;
2585 if (sampler
>= 16) {
2586 assert(devinfo
->is_haswell
|| devinfo
->gen
>= 8);
2588 get_element_ud(header
, 3),
2589 get_element_ud(brw_vec8_grf(0, 0), 3),
2590 brw_imm_ud(16 * (sampler
/ 16) * sampler_state_size
));
2593 /* Non-const sampler array indexing case */
2594 if (devinfo
->gen
< 8 && !devinfo
->is_haswell
) {
2598 struct brw_reg temp
= get_element_ud(header
, 3);
2600 brw_AND(p
, temp
, get_element_ud(sampler_index
, 0), brw_imm_ud(0x0f0));
2601 brw_SHL(p
, temp
, temp
, brw_imm_ud(4));
2603 get_element_ud(header
, 3),
2604 get_element_ud(brw_vec8_grf(0, 0), 3),
2609 /* All these variables are pretty confusing - we might be better off
2610 * using bitmasks and macros for this, in the old style. Or perhaps
2611 * just having the caller instantiate the fields in dword3 itself.
2613 void brw_urb_WRITE(struct brw_codegen
*p
,
2614 struct brw_reg dest
,
2615 unsigned msg_reg_nr
,
2616 struct brw_reg src0
,
2617 enum brw_urb_write_flags flags
,
2618 unsigned msg_length
,
2619 unsigned response_length
,
2623 const struct gen_device_info
*devinfo
= p
->devinfo
;
2626 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2628 if (devinfo
->gen
>= 7 && !(flags
& BRW_URB_WRITE_USE_CHANNEL_MASKS
)) {
2629 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2630 brw_push_insn_state(p
);
2631 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2632 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2633 brw_OR(p
, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE
, msg_reg_nr
, 5),
2634 BRW_REGISTER_TYPE_UD
),
2635 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD
),
2636 brw_imm_ud(0xff00));
2637 brw_pop_insn_state(p
);
2640 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2642 assert(msg_length
< BRW_MAX_MRF(devinfo
->gen
));
2644 brw_set_dest(p
, insn
, dest
);
2645 brw_set_src0(p
, insn
, src0
);
2646 brw_set_src1(p
, insn
, brw_imm_d(0));
2648 if (devinfo
->gen
< 6)
2649 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2651 brw_set_urb_message(p
,
2661 brw_send_indirect_message(struct brw_codegen
*p
,
2664 struct brw_reg payload
,
2665 struct brw_reg desc
)
2667 const struct gen_device_info
*devinfo
= p
->devinfo
;
2668 struct brw_inst
*send
;
2671 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
2673 assert(desc
.type
== BRW_REGISTER_TYPE_UD
);
2675 /* We hold on to the setup instruction (the SEND in the direct case, the OR
2676 * in the indirect case) by its index in the instruction store. The
2677 * pointer returned by next_insn() may become invalid if emitting the SEND
2678 * in the indirect case reallocs the store.
2681 if (desc
.file
== BRW_IMMEDIATE_VALUE
) {
2683 send
= next_insn(p
, BRW_OPCODE_SEND
);
2684 brw_set_src1(p
, send
, desc
);
2687 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2689 brw_push_insn_state(p
);
2690 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2691 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2692 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2694 /* Load the indirect descriptor to an address register using OR so the
2695 * caller can specify additional descriptor bits with the usual
2696 * brw_set_*_message() helper functions.
2699 brw_OR(p
, addr
, desc
, brw_imm_ud(0));
2701 brw_pop_insn_state(p
);
2703 send
= next_insn(p
, BRW_OPCODE_SEND
);
2704 brw_set_src1(p
, send
, addr
);
2707 if (dst
.width
< BRW_EXECUTE_8
)
2708 brw_inst_set_exec_size(devinfo
, send
, dst
.width
);
2710 brw_set_dest(p
, send
, dst
);
2711 brw_set_src0(p
, send
, retype(payload
, BRW_REGISTER_TYPE_UD
));
2712 brw_inst_set_sfid(devinfo
, send
, sfid
);
2714 return &p
->store
[setup
];
2717 static struct brw_inst
*
2718 brw_send_indirect_surface_message(struct brw_codegen
*p
,
2721 struct brw_reg payload
,
2722 struct brw_reg surface
,
2723 unsigned message_len
,
2724 unsigned response_len
,
2725 bool header_present
)
2727 const struct gen_device_info
*devinfo
= p
->devinfo
;
2728 struct brw_inst
*insn
;
2730 if (surface
.file
!= BRW_IMMEDIATE_VALUE
) {
2731 struct brw_reg addr
= retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
2733 brw_push_insn_state(p
);
2734 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
2735 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
2736 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
2738 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2739 * some surface array is accessed out of bounds.
2741 insn
= brw_AND(p
, addr
,
2742 suboffset(vec1(retype(surface
, BRW_REGISTER_TYPE_UD
)),
2743 BRW_GET_SWZ(surface
.swizzle
, 0)),
2746 brw_pop_insn_state(p
);
2751 insn
= brw_send_indirect_message(p
, sfid
, dst
, payload
, surface
);
2752 brw_inst_set_mlen(devinfo
, insn
, message_len
);
2753 brw_inst_set_rlen(devinfo
, insn
, response_len
);
2754 brw_inst_set_header_present(devinfo
, insn
, header_present
);
2760 while_jumps_before_offset(const struct gen_device_info
*devinfo
,
2761 brw_inst
*insn
, int while_offset
, int start_offset
)
2763 int scale
= 16 / brw_jump_scale(devinfo
);
2764 int jip
= devinfo
->gen
== 6 ? brw_inst_gen6_jump_count(devinfo
, insn
)
2765 : brw_inst_jip(devinfo
, insn
);
2767 return while_offset
+ jip
* scale
<= start_offset
;
2772 brw_find_next_block_end(struct brw_codegen
*p
, int start_offset
)
2775 void *store
= p
->store
;
2776 const struct gen_device_info
*devinfo
= p
->devinfo
;
2780 for (offset
= next_offset(devinfo
, store
, start_offset
);
2781 offset
< p
->next_insn_offset
;
2782 offset
= next_offset(devinfo
, store
, offset
)) {
2783 brw_inst
*insn
= store
+ offset
;
2785 switch (brw_inst_opcode(devinfo
, insn
)) {
2789 case BRW_OPCODE_ENDIF
:
2794 case BRW_OPCODE_WHILE
:
2795 /* If the while doesn't jump before our instruction, it's the end
2796 * of a sibling do...while loop. Ignore it.
2798 if (!while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2801 case BRW_OPCODE_ELSE
:
2802 case BRW_OPCODE_HALT
:
2811 /* There is no DO instruction on gen6, so to find the end of the loop
2812 * we have to see if the loop is jumping back before our start
2816 brw_find_loop_end(struct brw_codegen
*p
, int start_offset
)
2818 const struct gen_device_info
*devinfo
= p
->devinfo
;
2820 void *store
= p
->store
;
2822 assert(devinfo
->gen
>= 6);
2824 /* Always start after the instruction (such as a WHILE) we're trying to fix
2827 for (offset
= next_offset(devinfo
, store
, start_offset
);
2828 offset
< p
->next_insn_offset
;
2829 offset
= next_offset(devinfo
, store
, offset
)) {
2830 brw_inst
*insn
= store
+ offset
;
2832 if (brw_inst_opcode(devinfo
, insn
) == BRW_OPCODE_WHILE
) {
2833 if (while_jumps_before_offset(devinfo
, insn
, offset
, start_offset
))
2837 assert(!"not reached");
2838 return start_offset
;
2841 /* After program generation, go back and update the UIP and JIP of
2842 * BREAK, CONT, and HALT instructions to their correct locations.
2845 brw_set_uip_jip(struct brw_codegen
*p
, int start_offset
)
2847 const struct gen_device_info
*devinfo
= p
->devinfo
;
2849 int br
= brw_jump_scale(devinfo
);
2850 int scale
= 16 / br
;
2851 void *store
= p
->store
;
2853 if (devinfo
->gen
< 6)
2856 for (offset
= start_offset
; offset
< p
->next_insn_offset
; offset
+= 16) {
2857 brw_inst
*insn
= store
+ offset
;
2858 assert(brw_inst_cmpt_control(devinfo
, insn
) == 0);
2860 int block_end_offset
= brw_find_next_block_end(p
, offset
);
2861 switch (brw_inst_opcode(devinfo
, insn
)) {
2862 case BRW_OPCODE_BREAK
:
2863 assert(block_end_offset
!= 0);
2864 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2865 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2866 brw_inst_set_uip(devinfo
, insn
,
2867 (brw_find_loop_end(p
, offset
) - offset
+
2868 (devinfo
->gen
== 6 ? 16 : 0)) / scale
);
2870 case BRW_OPCODE_CONTINUE
:
2871 assert(block_end_offset
!= 0);
2872 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2873 brw_inst_set_uip(devinfo
, insn
,
2874 (brw_find_loop_end(p
, offset
) - offset
) / scale
);
2876 assert(brw_inst_uip(devinfo
, insn
) != 0);
2877 assert(brw_inst_jip(devinfo
, insn
) != 0);
2880 case BRW_OPCODE_ENDIF
: {
2881 int32_t jump
= (block_end_offset
== 0) ?
2882 1 * br
: (block_end_offset
- offset
) / scale
;
2883 if (devinfo
->gen
>= 7)
2884 brw_inst_set_jip(devinfo
, insn
, jump
);
2886 brw_inst_set_gen6_jump_count(devinfo
, insn
, jump
);
2890 case BRW_OPCODE_HALT
:
2891 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2893 * "In case of the halt instruction not inside any conditional
2894 * code block, the value of <JIP> and <UIP> should be the
2895 * same. In case of the halt instruction inside conditional code
2896 * block, the <UIP> should be the end of the program, and the
2897 * <JIP> should be end of the most inner conditional code block."
2899 * The uip will have already been set by whoever set up the
2902 if (block_end_offset
== 0) {
2903 brw_inst_set_jip(devinfo
, insn
, brw_inst_uip(devinfo
, insn
));
2905 brw_inst_set_jip(devinfo
, insn
, (block_end_offset
- offset
) / scale
);
2907 assert(brw_inst_uip(devinfo
, insn
) != 0);
2908 assert(brw_inst_jip(devinfo
, insn
) != 0);
2914 void brw_ff_sync(struct brw_codegen
*p
,
2915 struct brw_reg dest
,
2916 unsigned msg_reg_nr
,
2917 struct brw_reg src0
,
2919 unsigned response_length
,
2922 const struct gen_device_info
*devinfo
= p
->devinfo
;
2925 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2927 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2928 brw_set_dest(p
, insn
, dest
);
2929 brw_set_src0(p
, insn
, src0
);
2930 brw_set_src1(p
, insn
, brw_imm_d(0));
2932 if (devinfo
->gen
< 6)
2933 brw_inst_set_base_mrf(devinfo
, insn
, msg_reg_nr
);
2935 brw_set_ff_sync_message(p
,
2943 * Emit the SEND instruction necessary to generate stream output data on Gen6
2944 * (for transform feedback).
2946 * If send_commit_msg is true, this is the last piece of stream output data
2947 * from this thread, so send the data as a committed write. According to the
2948 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2950 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2951 * writes are complete by sending the final write as a committed write."
2954 brw_svb_write(struct brw_codegen
*p
,
2955 struct brw_reg dest
,
2956 unsigned msg_reg_nr
,
2957 struct brw_reg src0
,
2958 unsigned binding_table_index
,
2959 bool send_commit_msg
)
2961 const struct gen_device_info
*devinfo
= p
->devinfo
;
2962 const unsigned target_cache
=
2963 (devinfo
->gen
>= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE
:
2964 devinfo
->gen
>= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE
:
2965 BRW_DATAPORT_READ_TARGET_RENDER_CACHE
);
2968 gen6_resolve_implied_move(p
, &src0
, msg_reg_nr
);
2970 insn
= next_insn(p
, BRW_OPCODE_SEND
);
2971 brw_set_dest(p
, insn
, dest
);
2972 brw_set_src0(p
, insn
, src0
);
2973 brw_set_src1(p
, insn
, brw_imm_d(0));
2974 brw_set_dp_write_message(p
, insn
,
2975 binding_table_index
,
2976 0, /* msg_control: ignored */
2977 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE
,
2980 true, /* header_present */
2981 0, /* last_render_target: ignored */
2982 send_commit_msg
, /* response_length */
2983 0, /* end_of_thread */
2984 send_commit_msg
); /* send_commit_msg */
2988 brw_surface_payload_size(struct brw_codegen
*p
,
2989 unsigned num_channels
,
2994 brw_inst_access_mode(p
->devinfo
, p
->current
) == BRW_ALIGN_16
)
2996 else if (has_simd16
&&
2997 brw_inst_exec_size(p
->devinfo
, p
->current
) == BRW_EXECUTE_16
)
2998 return 2 * num_channels
;
3000 return num_channels
;
3004 brw_set_dp_untyped_atomic_message(struct brw_codegen
*p
,
3007 bool response_expected
)
3009 const struct gen_device_info
*devinfo
= p
->devinfo
;
3010 unsigned msg_control
=
3011 atomic_op
| /* Atomic Operation Type: BRW_AOP_* */
3012 (response_expected
? 1 << 5 : 0); /* Return data expected */
3014 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
3015 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3016 if (brw_inst_exec_size(devinfo
, p
->current
) != BRW_EXECUTE_16
)
3017 msg_control
|= 1 << 4; /* SIMD8 mode */
3019 brw_inst_set_dp_msg_type(devinfo
, insn
,
3020 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
);
3022 brw_inst_set_dp_msg_type(devinfo
, insn
,
3023 HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2
);
3026 brw_inst_set_dp_msg_type(devinfo
, insn
,
3027 GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP
);
3029 if (brw_inst_exec_size(devinfo
, p
->current
) != BRW_EXECUTE_16
)
3030 msg_control
|= 1 << 4; /* SIMD8 mode */
3033 brw_inst_set_dp_msg_control(devinfo
, insn
, msg_control
);
3037 brw_untyped_atomic(struct brw_codegen
*p
,
3039 struct brw_reg payload
,
3040 struct brw_reg surface
,
3042 unsigned msg_length
,
3043 bool response_expected
)
3045 const struct gen_device_info
*devinfo
= p
->devinfo
;
3046 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3047 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3048 GEN7_SFID_DATAPORT_DATA_CACHE
);
3049 const bool align1
= brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
;
3050 /* Mask out unused components -- This is especially important in Align16
3051 * mode on generations that don't have native support for SIMD4x2 atomics,
3052 * because unused but enabled components will cause the dataport to perform
3053 * additional atomic operations on the addresses that happen to be in the
3054 * uninitialized Y, Z and W coordinates of the payload.
3056 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
3057 struct brw_inst
*insn
= brw_send_indirect_surface_message(
3058 p
, sfid
, brw_writemask(dst
, mask
), payload
, surface
, msg_length
,
3059 brw_surface_payload_size(p
, response_expected
,
3060 devinfo
->gen
>= 8 || devinfo
->is_haswell
, true),
3063 brw_set_dp_untyped_atomic_message(
3064 p
, insn
, atomic_op
, response_expected
);
3068 brw_set_dp_untyped_surface_read_message(struct brw_codegen
*p
,
3069 struct brw_inst
*insn
,
3070 unsigned num_channels
)
3072 const struct gen_device_info
*devinfo
= p
->devinfo
;
3073 /* Set mask of 32-bit channels to drop. */
3074 unsigned msg_control
= 0xf & (0xf << num_channels
);
3076 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3077 if (brw_inst_exec_size(devinfo
, p
->current
) == BRW_EXECUTE_16
)
3078 msg_control
|= 1 << 4; /* SIMD16 mode */
3080 msg_control
|= 2 << 4; /* SIMD8 mode */
3083 brw_inst_set_dp_msg_type(devinfo
, insn
,
3084 (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3085 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ
:
3086 GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ
));
3087 brw_inst_set_dp_msg_control(devinfo
, insn
, msg_control
);
3091 brw_untyped_surface_read(struct brw_codegen
*p
,
3093 struct brw_reg payload
,
3094 struct brw_reg surface
,
3095 unsigned msg_length
,
3096 unsigned num_channels
)
3098 const struct gen_device_info
*devinfo
= p
->devinfo
;
3099 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3100 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3101 GEN7_SFID_DATAPORT_DATA_CACHE
);
3102 struct brw_inst
*insn
= brw_send_indirect_surface_message(
3103 p
, sfid
, dst
, payload
, surface
, msg_length
,
3104 brw_surface_payload_size(p
, num_channels
, true, true),
3107 brw_set_dp_untyped_surface_read_message(
3108 p
, insn
, num_channels
);
3112 brw_set_dp_untyped_surface_write_message(struct brw_codegen
*p
,
3113 struct brw_inst
*insn
,
3114 unsigned num_channels
)
3116 const struct gen_device_info
*devinfo
= p
->devinfo
;
3117 /* Set mask of 32-bit channels to drop. */
3118 unsigned msg_control
= 0xf & (0xf << num_channels
);
3120 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3121 if (brw_inst_exec_size(devinfo
, p
->current
) == BRW_EXECUTE_16
)
3122 msg_control
|= 1 << 4; /* SIMD16 mode */
3124 msg_control
|= 2 << 4; /* SIMD8 mode */
3126 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
)
3127 msg_control
|= 0 << 4; /* SIMD4x2 mode */
3129 msg_control
|= 2 << 4; /* SIMD8 mode */
3132 brw_inst_set_dp_msg_type(devinfo
, insn
,
3133 devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3134 HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE
:
3135 GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE
);
3136 brw_inst_set_dp_msg_control(devinfo
, insn
, msg_control
);
3140 brw_untyped_surface_write(struct brw_codegen
*p
,
3141 struct brw_reg payload
,
3142 struct brw_reg surface
,
3143 unsigned msg_length
,
3144 unsigned num_channels
)
3146 const struct gen_device_info
*devinfo
= p
->devinfo
;
3147 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3148 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3149 GEN7_SFID_DATAPORT_DATA_CACHE
);
3150 const bool align1
= brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
;
3151 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3152 const unsigned mask
= devinfo
->gen
== 7 && !devinfo
->is_haswell
&& !align1
?
3153 WRITEMASK_X
: WRITEMASK_XYZW
;
3154 struct brw_inst
*insn
= brw_send_indirect_surface_message(
3155 p
, sfid
, brw_writemask(brw_null_reg(), mask
),
3156 payload
, surface
, msg_length
, 0, align1
);
3158 brw_set_dp_untyped_surface_write_message(
3159 p
, insn
, num_channels
);
3163 brw_set_dp_typed_atomic_message(struct brw_codegen
*p
,
3164 struct brw_inst
*insn
,
3166 bool response_expected
)
3168 const struct gen_device_info
*devinfo
= p
->devinfo
;
3169 unsigned msg_control
=
3170 atomic_op
| /* Atomic Operation Type: BRW_AOP_* */
3171 (response_expected
? 1 << 5 : 0); /* Return data expected */
3173 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
3174 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3175 if (brw_inst_qtr_control(devinfo
, p
->current
) % 2 == 1)
3176 msg_control
|= 1 << 4; /* Use high 8 slots of the sample mask */
3178 brw_inst_set_dp_msg_type(devinfo
, insn
,
3179 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP
);
3181 brw_inst_set_dp_msg_type(devinfo
, insn
,
3182 HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2
);
3186 brw_inst_set_dp_msg_type(devinfo
, insn
,
3187 GEN7_DATAPORT_RC_TYPED_ATOMIC_OP
);
3189 if (brw_inst_qtr_control(devinfo
, p
->current
) % 2 == 1)
3190 msg_control
|= 1 << 4; /* Use high 8 slots of the sample mask */
3193 brw_inst_set_dp_msg_control(devinfo
, insn
, msg_control
);
3197 brw_typed_atomic(struct brw_codegen
*p
,
3199 struct brw_reg payload
,
3200 struct brw_reg surface
,
3202 unsigned msg_length
,
3203 bool response_expected
) {
3204 const struct gen_device_info
*devinfo
= p
->devinfo
;
3205 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3206 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3207 GEN6_SFID_DATAPORT_RENDER_CACHE
);
3208 const bool align1
= (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
);
3209 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3210 const unsigned mask
= align1
? WRITEMASK_XYZW
: WRITEMASK_X
;
3211 struct brw_inst
*insn
= brw_send_indirect_surface_message(
3212 p
, sfid
, brw_writemask(dst
, mask
), payload
, surface
, msg_length
,
3213 brw_surface_payload_size(p
, response_expected
,
3214 devinfo
->gen
>= 8 || devinfo
->is_haswell
, false),
3217 brw_set_dp_typed_atomic_message(
3218 p
, insn
, atomic_op
, response_expected
);
3222 brw_set_dp_typed_surface_read_message(struct brw_codegen
*p
,
3223 struct brw_inst
*insn
,
3224 unsigned num_channels
)
3226 const struct gen_device_info
*devinfo
= p
->devinfo
;
3227 /* Set mask of unused channels. */
3228 unsigned msg_control
= 0xf & (0xf << num_channels
);
3230 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
3231 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3232 if (brw_inst_qtr_control(devinfo
, p
->current
) % 2 == 1)
3233 msg_control
|= 2 << 4; /* Use high 8 slots of the sample mask */
3235 msg_control
|= 1 << 4; /* Use low 8 slots of the sample mask */
3238 brw_inst_set_dp_msg_type(devinfo
, insn
,
3239 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ
);
3241 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3242 if (brw_inst_qtr_control(devinfo
, p
->current
) % 2 == 1)
3243 msg_control
|= 1 << 5; /* Use high 8 slots of the sample mask */
3246 brw_inst_set_dp_msg_type(devinfo
, insn
,
3247 GEN7_DATAPORT_RC_TYPED_SURFACE_READ
);
3250 brw_inst_set_dp_msg_control(devinfo
, insn
, msg_control
);
3254 brw_typed_surface_read(struct brw_codegen
*p
,
3256 struct brw_reg payload
,
3257 struct brw_reg surface
,
3258 unsigned msg_length
,
3259 unsigned num_channels
)
3261 const struct gen_device_info
*devinfo
= p
->devinfo
;
3262 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3263 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3264 GEN6_SFID_DATAPORT_RENDER_CACHE
);
3265 struct brw_inst
*insn
= brw_send_indirect_surface_message(
3266 p
, sfid
, dst
, payload
, surface
, msg_length
,
3267 brw_surface_payload_size(p
, num_channels
,
3268 devinfo
->gen
>= 8 || devinfo
->is_haswell
, false),
3271 brw_set_dp_typed_surface_read_message(
3272 p
, insn
, num_channels
);
3276 brw_set_dp_typed_surface_write_message(struct brw_codegen
*p
,
3277 struct brw_inst
*insn
,
3278 unsigned num_channels
)
3280 const struct gen_device_info
*devinfo
= p
->devinfo
;
3281 /* Set mask of unused channels. */
3282 unsigned msg_control
= 0xf & (0xf << num_channels
);
3284 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
3285 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3286 if (brw_inst_qtr_control(devinfo
, p
->current
) % 2 == 1)
3287 msg_control
|= 2 << 4; /* Use high 8 slots of the sample mask */
3289 msg_control
|= 1 << 4; /* Use low 8 slots of the sample mask */
3292 brw_inst_set_dp_msg_type(devinfo
, insn
,
3293 HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE
);
3296 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3297 if (brw_inst_qtr_control(devinfo
, p
->current
) % 2 == 1)
3298 msg_control
|= 1 << 5; /* Use high 8 slots of the sample mask */
3301 brw_inst_set_dp_msg_type(devinfo
, insn
,
3302 GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE
);
3305 brw_inst_set_dp_msg_control(devinfo
, insn
, msg_control
);
3309 brw_typed_surface_write(struct brw_codegen
*p
,
3310 struct brw_reg payload
,
3311 struct brw_reg surface
,
3312 unsigned msg_length
,
3313 unsigned num_channels
)
3315 const struct gen_device_info
*devinfo
= p
->devinfo
;
3316 const unsigned sfid
= (devinfo
->gen
>= 8 || devinfo
->is_haswell
?
3317 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3318 GEN6_SFID_DATAPORT_RENDER_CACHE
);
3319 const bool align1
= (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
);
3320 /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3321 const unsigned mask
= (devinfo
->gen
== 7 && !devinfo
->is_haswell
&& !align1
?
3322 WRITEMASK_X
: WRITEMASK_XYZW
);
3323 struct brw_inst
*insn
= brw_send_indirect_surface_message(
3324 p
, sfid
, brw_writemask(brw_null_reg(), mask
),
3325 payload
, surface
, msg_length
, 0, true);
3327 brw_set_dp_typed_surface_write_message(
3328 p
, insn
, num_channels
);
3332 brw_set_memory_fence_message(struct brw_codegen
*p
,
3333 struct brw_inst
*insn
,
3334 enum brw_message_target sfid
,
3337 const struct gen_device_info
*devinfo
= p
->devinfo
;
3339 brw_set_message_descriptor(p
, insn
, sfid
,
3340 1 /* message length */,
3341 (commit_enable
? 1 : 0) /* response length */,
3342 true /* header present */,
3346 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
3347 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_RC_MEMORY_FENCE
);
3349 case GEN7_SFID_DATAPORT_DATA_CACHE
:
3350 brw_inst_set_dp_msg_type(devinfo
, insn
, GEN7_DATAPORT_DC_MEMORY_FENCE
);
3353 unreachable("Not reached");
3357 brw_inst_set_dp_msg_control(devinfo
, insn
, 1 << 5);
3361 brw_memory_fence(struct brw_codegen
*p
,
3364 const struct gen_device_info
*devinfo
= p
->devinfo
;
3365 const bool commit_enable
= devinfo
->gen
== 7 && !devinfo
->is_haswell
;
3366 struct brw_inst
*insn
;
3368 brw_push_insn_state(p
);
3369 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3370 brw_set_default_exec_size(p
, BRW_EXECUTE_1
);
3373 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3374 * message doesn't write anything back.
3376 insn
= next_insn(p
, BRW_OPCODE_SEND
);
3377 dst
= retype(dst
, BRW_REGISTER_TYPE_UW
);
3378 brw_set_dest(p
, insn
, dst
);
3379 brw_set_src0(p
, insn
, dst
);
3380 brw_set_memory_fence_message(p
, insn
, GEN7_SFID_DATAPORT_DATA_CACHE
,
3383 if (devinfo
->gen
== 7 && !devinfo
->is_haswell
) {
3384 /* IVB does typed surface access through the render cache, so we need to
3385 * flush it too. Use a different register so both flushes can be
3386 * pipelined by the hardware.
3388 insn
= next_insn(p
, BRW_OPCODE_SEND
);
3389 brw_set_dest(p
, insn
, offset(dst
, 1));
3390 brw_set_src0(p
, insn
, offset(dst
, 1));
3391 brw_set_memory_fence_message(p
, insn
, GEN6_SFID_DATAPORT_RENDER_CACHE
,
3394 /* Now write the response of the second message into the response of the
3395 * first to trigger a pipeline stall -- This way future render and data
3396 * cache messages will be properly ordered with respect to past data and
3397 * render cache messages.
3399 brw_MOV(p
, dst
, offset(dst
, 1));
3402 brw_pop_insn_state(p
);
3406 brw_pixel_interpolator_query(struct brw_codegen
*p
,
3407 struct brw_reg dest
,
3411 struct brw_reg data
,
3412 unsigned msg_length
,
3413 unsigned response_length
)
3415 const struct gen_device_info
*devinfo
= p
->devinfo
;
3416 struct brw_inst
*insn
;
3417 const uint16_t exec_size
= brw_inst_exec_size(devinfo
, p
->current
);
3419 /* brw_send_indirect_message will automatically use a direct send message
3420 * if data is actually immediate.
3422 insn
= brw_send_indirect_message(p
,
3423 GEN7_SFID_PIXEL_INTERPOLATOR
,
3427 brw_inst_set_mlen(devinfo
, insn
, msg_length
);
3428 brw_inst_set_rlen(devinfo
, insn
, response_length
);
3430 brw_inst_set_pi_simd_mode(devinfo
, insn
, exec_size
== BRW_EXECUTE_16
);
3431 brw_inst_set_pi_slot_group(devinfo
, insn
, 0); /* zero unless 32/64px dispatch */
3432 brw_inst_set_pi_nopersp(devinfo
, insn
, noperspective
);
3433 brw_inst_set_pi_message_type(devinfo
, insn
, mode
);
3437 brw_find_live_channel(struct brw_codegen
*p
, struct brw_reg dst
,
3438 struct brw_reg mask
)
3440 const struct gen_device_info
*devinfo
= p
->devinfo
;
3441 const unsigned exec_size
= 1 << brw_inst_exec_size(devinfo
, p
->current
);
3442 const unsigned qtr_control
= brw_inst_qtr_control(devinfo
, p
->current
);
3445 assert(devinfo
->gen
>= 7);
3446 assert(mask
.type
== BRW_REGISTER_TYPE_UD
);
3448 brw_push_insn_state(p
);
3450 if (brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
) {
3451 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3453 if (devinfo
->gen
>= 8) {
3454 /* Getting the first active channel index is easy on Gen8: Just find
3455 * the first bit set in the execution mask. The register exists on
3456 * HSW already but it reads back as all ones when the current
3457 * instruction has execution masking disabled, so it's kind of
3460 struct brw_reg exec_mask
=
3461 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
);
3463 if (mask
.file
!= BRW_IMMEDIATE_VALUE
|| mask
.ud
!= 0xffffffff) {
3464 /* Unfortunately, ce0 does not take into account the thread
3465 * dispatch mask, which may be a problem in cases where it's not
3466 * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3467 * some n). Combine ce0 with the given dispatch (or vector) mask
3468 * to mask off those channels which were never dispatched by the
3471 brw_SHR(p
, vec1(dst
), mask
, brw_imm_ud(qtr_control
* 8));
3472 brw_AND(p
, vec1(dst
), exec_mask
, vec1(dst
));
3473 exec_mask
= vec1(dst
);
3476 /* Quarter control has the effect of magically shifting the value of
3477 * ce0 so you'll get the first active channel relative to the
3478 * specified quarter control as result.
3480 inst
= brw_FBL(p
, vec1(dst
), exec_mask
);
3482 const struct brw_reg flag
= brw_flag_reg(1, 0);
3484 brw_MOV(p
, retype(flag
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
3486 /* Run enough instructions returning zero with execution masking and
3487 * a conditional modifier enabled in order to get the full execution
3488 * mask in f1.0. We could use a single 32-wide move here if it
3489 * weren't because of the hardware bug that causes channel enables to
3490 * be applied incorrectly to the second half of 32-wide instructions
3493 const unsigned lower_size
= MIN2(16, exec_size
);
3494 for (unsigned i
= 0; i
< exec_size
/ lower_size
; i
++) {
3495 inst
= brw_MOV(p
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
),
3497 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3498 brw_inst_set_group(devinfo
, inst
, lower_size
* i
+ 8 * qtr_control
);
3499 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_Z
);
3500 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3501 brw_inst_set_exec_size(devinfo
, inst
, cvt(lower_size
) - 1);
3504 /* Find the first bit set in the exec_size-wide portion of the flag
3505 * register that was updated by the last sequence of MOV
3508 const enum brw_reg_type type
= brw_int_type(exec_size
/ 8, false);
3509 brw_FBL(p
, vec1(dst
), byte_offset(retype(flag
, type
), qtr_control
));
3512 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3514 if (devinfo
->gen
>= 8 &&
3515 mask
.file
== BRW_IMMEDIATE_VALUE
&& mask
.ud
== 0xffffffff) {
3516 /* In SIMD4x2 mode the first active channel index is just the
3517 * negation of the first bit of the mask register. Note that ce0
3518 * doesn't take into account the dispatch mask, so the Gen7 path
3519 * should be used instead unless you have the guarantee that the
3520 * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3523 inst
= brw_AND(p
, brw_writemask(dst
, WRITEMASK_X
),
3524 negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD
)),
3528 /* Overwrite the destination without and with execution masking to
3529 * find out which of the channels is active.
3531 brw_push_insn_state(p
);
3532 brw_set_default_exec_size(p
, BRW_EXECUTE_4
);
3533 brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3536 inst
= brw_MOV(p
, brw_writemask(vec4(dst
), WRITEMASK_X
),
3538 brw_pop_insn_state(p
);
3539 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_ENABLE
);
3543 brw_pop_insn_state(p
);
3547 brw_broadcast(struct brw_codegen
*p
,
3552 const struct gen_device_info
*devinfo
= p
->devinfo
;
3553 const bool align1
= brw_inst_access_mode(devinfo
, p
->current
) == BRW_ALIGN_1
;
3556 brw_push_insn_state(p
);
3557 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3558 brw_set_default_exec_size(p
, align1
? BRW_EXECUTE_1
: BRW_EXECUTE_4
);
3560 assert(src
.file
== BRW_GENERAL_REGISTER_FILE
&&
3561 src
.address_mode
== BRW_ADDRESS_DIRECT
);
3563 if ((src
.vstride
== 0 && (src
.hstride
== 0 || !align1
)) ||
3564 idx
.file
== BRW_IMMEDIATE_VALUE
) {
3565 /* Trivial, the source is already uniform or the index is a constant.
3566 * We will typically not get here if the optimizer is doing its job, but
3567 * asserting would be mean.
3569 const unsigned i
= idx
.file
== BRW_IMMEDIATE_VALUE
? idx
.ud
: 0;
3571 (align1
? stride(suboffset(src
, i
), 0, 1, 0) :
3572 stride(suboffset(src
, 4 * i
), 0, 4, 1)));
3575 const struct brw_reg addr
=
3576 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD
);
3577 const unsigned offset
= src
.nr
* REG_SIZE
+ src
.subnr
;
3578 /* Limit in bytes of the signed indirect addressing immediate. */
3579 const unsigned limit
= 512;
3581 brw_push_insn_state(p
);
3582 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3583 brw_set_default_predicate_control(p
, BRW_PREDICATE_NONE
);
3585 /* Take into account the component size and horizontal stride. */
3586 assert(src
.vstride
== src
.hstride
+ src
.width
);
3587 brw_SHL(p
, addr
, vec1(idx
),
3588 brw_imm_ud(_mesa_logbase2(type_sz(src
.type
)) +
3591 /* We can only address up to limit bytes using the indirect
3592 * addressing immediate, account for the difference if the source
3593 * register is above this limit.
3595 if (offset
>= limit
)
3596 brw_ADD(p
, addr
, addr
, brw_imm_ud(offset
- offset
% limit
));
3598 brw_pop_insn_state(p
);
3600 /* Use indirect addressing to fetch the specified component. */
3602 retype(brw_vec1_indirect(addr
.subnr
, offset
% limit
),
3605 /* In SIMD4x2 mode the index can be either zero or one, replicate it
3606 * to all bits of a flag register,
3610 stride(brw_swizzle(idx
, BRW_SWIZZLE_XXXX
), 4, 4, 1));
3611 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NONE
);
3612 brw_inst_set_cond_modifier(devinfo
, inst
, BRW_CONDITIONAL_NZ
);
3613 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3615 /* and use predicated SEL to pick the right channel. */
3616 inst
= brw_SEL(p
, dst
,
3617 stride(suboffset(src
, 4), 4, 4, 1),
3618 stride(src
, 4, 4, 1));
3619 brw_inst_set_pred_control(devinfo
, inst
, BRW_PREDICATE_NORMAL
);
3620 brw_inst_set_flag_reg_nr(devinfo
, inst
, 1);
3624 brw_pop_insn_state(p
);
3628 * This instruction is generated as a single-channel align1 instruction by
3629 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3631 * We can't use the typed atomic op in the FS because that has the execution
3632 * mask ANDed with the pixel mask, but we just want to write the one dword for
3635 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3636 * one u32. So we use the same untyped atomic write message as the pixel
3639 * The untyped atomic operation requires a BUFFER surface type with RAW
3640 * format, and is only accessible through the legacy DATA_CACHE dataport
3643 void brw_shader_time_add(struct brw_codegen
*p
,
3644 struct brw_reg payload
,
3645 uint32_t surf_index
)
3647 const unsigned sfid
= (p
->devinfo
->gen
>= 8 || p
->devinfo
->is_haswell
?
3648 HSW_SFID_DATAPORT_DATA_CACHE_1
:
3649 GEN7_SFID_DATAPORT_DATA_CACHE
);
3650 assert(p
->devinfo
->gen
>= 7);
3652 brw_push_insn_state(p
);
3653 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3654 brw_set_default_mask_control(p
, BRW_MASK_DISABLE
);
3655 brw_set_default_compression_control(p
, BRW_COMPRESSION_NONE
);
3656 brw_inst
*send
= brw_next_insn(p
, BRW_OPCODE_SEND
);
3658 /* We use brw_vec1_reg and unmasked because we want to increment the given
3661 brw_set_dest(p
, send
, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE
,
3663 brw_set_src0(p
, send
, brw_vec1_reg(payload
.file
,
3665 brw_set_src1(p
, send
, brw_imm_ud(0));
3666 brw_set_message_descriptor(p
, send
, sfid
, 2, 0, false, false);
3667 brw_inst_set_binding_table_index(p
->devinfo
, send
, surf_index
);
3668 brw_set_dp_untyped_atomic_message(p
, send
, BRW_AOP_ADD
, false);
3670 brw_pop_insn_state(p
);
3675 * Emit the SEND message for a barrier
3678 brw_barrier(struct brw_codegen
*p
, struct brw_reg src
)
3680 const struct gen_device_info
*devinfo
= p
->devinfo
;
3681 struct brw_inst
*inst
;
3683 assert(devinfo
->gen
>= 7);
3685 brw_push_insn_state(p
);
3686 brw_set_default_access_mode(p
, BRW_ALIGN_1
);
3687 inst
= next_insn(p
, BRW_OPCODE_SEND
);
3688 brw_set_dest(p
, inst
, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW
));
3689 brw_set_src0(p
, inst
, src
);
3690 brw_set_src1(p
, inst
, brw_null_reg());
3692 brw_set_message_descriptor(p
, inst
, BRW_SFID_MESSAGE_GATEWAY
,
3694 0 /* response_length */,
3695 false /* header_present */,
3696 false /* end_of_thread */);
3698 brw_inst_set_gateway_notify(devinfo
, inst
, 1);
3699 brw_inst_set_gateway_subfuncid(devinfo
, inst
,
3700 BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG
);
3702 brw_inst_set_mask_control(devinfo
, inst
, BRW_MASK_DISABLE
);
3703 brw_pop_insn_state(p
);
3708 * Emit the wait instruction for a barrier
3711 brw_WAIT(struct brw_codegen
*p
)
3713 const struct gen_device_info
*devinfo
= p
->devinfo
;
3714 struct brw_inst
*insn
;
3716 struct brw_reg src
= brw_notification_reg();
3718 insn
= next_insn(p
, BRW_OPCODE_WAIT
);
3719 brw_set_dest(p
, insn
, src
);
3720 brw_set_src0(p
, insn
, src
);
3721 brw_set_src1(p
, insn
, brw_null_reg());
3723 brw_inst_set_exec_size(devinfo
, insn
, BRW_EXECUTE_1
);
3724 brw_inst_set_mask_control(devinfo
, insn
, BRW_MASK_DISABLE
);